In [1]:
import pandas as pd 

In [3]:
df = pd.read_csv('Fraud_Analysis_Dataset.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1


In [5]:
print("Dataset Shape (rows, columns):", df.shape)

Dataset Shape (rows, columns): (11142, 10)


In [7]:
print("\nColumn Names:")


Column Names:


In [9]:
df.columns.tolist()

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud']

In [11]:
print("\nColumn Names:", df.columns.tolist())


Column Names: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud']


In [15]:
print("\nMissing Values:\n", df.isnull().sum())


Missing Values:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64


In [17]:
df['amount'] = df['amount'].fillna(df['amount'].median())
print("Missing values after filling:\n", df.isnull().sum())

Missing values after filling:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64


In [19]:
df = df.drop_duplicates()
print("Number of rows after removing duplicates:", len(df))

Number of rows after removing duplicates: 11142


In [21]:
df['nameOrig'] = df['nameOrig'].str.upper()
df['nameDest'] = df['nameDest'].str.upper()
print("First few rows after standardization:\n", df[['nameOrig', 'nameDest']].head())

First few rows after standardization:
       nameOrig     nameDest
0  C1305486145   C553264065
1   C840083671    C38997010
2  C1420196421   C972765878
3  C2101527076  C1007251739
4   C137533655  C1848415041


In [23]:
# Cell 7: Validate balances
df['balance_diff_orig'] = df['oldbalanceOrg'] + df['amount'] - df['newbalanceOrig']
df['is_suspicious_orig'] = df['balance_diff_orig'].abs() > 0.01
df['balance_diff_dest'] = df['oldbalanceDest'] + df['amount'] - df['newbalanceDest']
df['is_suspicious_dest'] = df['balance_diff_dest'].abs() > 0.01
print("Suspicious balance transactions:\n", df[df['is_suspicious_orig'] | df['is_suspicious_dest']][['step', 'type', 'amount', 'balance_diff_orig', 'balance_diff_dest']].head())

Suspicious balance transactions:
    step      type   amount  balance_diff_orig  balance_diff_dest
0     1  TRANSFER    181.0              362.0              181.0
1     1  CASH_OUT    181.0              362.0            21363.0
2     1  TRANSFER   2806.0             5612.0             2806.0
3     1  CASH_OUT   2806.0             5612.0            29008.0
4     1  TRANSFER  20128.0            40256.0            20128.0


In [27]:
Q1 = df['amount'].quantile(0.25)
Q3 = df['amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df['is_outlier'] = (df['amount'] < lower_bound) | (df['amount'] > upper_bound)
print("Outliers:\n", df[df['is_outlier']][['step', 'type', 'amount', 'isFraud']].head())

Outliers:
     step      type      amount  isFraud
6      1  CASH_OUT   416001.33        1
7      1  TRANSFER  1277212.77        1
8      1  CASH_OUT  1277212.77        1
16     2  TRANSFER  1096187.24        1
17     2  CASH_OUT  1096187.24        1


In [29]:

print("Fraud rate by transaction type:\n", df.groupby('type')['isFraud'].mean().sort_values(ascending=False))
print("\nLarge fraudulent transactions:\n", df[(df['amount'] > df['amount'].quantile(0.95)) & (df['isFraud'] == 1)][['step', 'amount', 'nameOrig', 'nameDest']])

Fraud rate by transaction type:
 type
TRANSFER    0.385246
CASH_OUT    0.308926
CASH_IN     0.000000
DEBIT       0.000000
PAYMENT     0.000000
Name: isFraud, dtype: float64

Large fraudulent transactions:
       step      amount     nameOrig     nameDest
7        1  1277212.77  C1334405552   C431687661
8        1  1277212.77   C467632528   C716083600
16       2  1096187.24  C1093223281  C2063275841
17       2  1096187.24    C77163673   C644345897
18       2   963532.14  C1440057381   C268086000
...    ...         ...          ...          ...
1123    94  2169679.91   C395839623   C925758982
1124    94  1454592.61   C708686257   C191950817
1125    94  1454592.61   C824268591   C819390198
1128    94  2393539.65   C211205523  C1430994246
1129    94  2393539.65  C1767333284   C610460069

[386 rows x 4 columns]


In [31]:

df.to_csv('cleaned_fraud_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_fraud_dataset.csv'")

Cleaned dataset saved as 'cleaned_fraud_dataset.csv'


In [34]:
df_cleaned = pd.read_csv('cleaned_fraud_dataset.csv')

In [36]:
print("Dataset Shape:", df_cleaned.shape)

Dataset Shape: (11142, 15)


In [38]:
print("Columns:", df_cleaned.columns.tolist())

Columns: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'balance_diff_orig', 'is_suspicious_orig', 'balance_diff_dest', 'is_suspicious_dest', 'is_outlier']


In [42]:
print("\nMissing Values:\n", df_cleaned.isnull().sum())


Missing Values:
 step                  0
type                  0
amount                0
nameOrig              0
oldbalanceOrg         0
newbalanceOrig        0
nameDest              0
oldbalanceDest        0
newbalanceDest        0
isFraud               0
balance_diff_orig     0
is_suspicious_orig    0
balance_diff_dest     0
is_suspicious_dest    0
is_outlier            0
dtype: int64


In [44]:
print("\nData Types:\n", df_cleaned.dtypes)
assert df_cleaned['amount'].dtype in ['float64', 'int64'], "Amount column has incorrect type!"


Data Types:
 step                    int64
type                   object
amount                float64
nameOrig               object
oldbalanceOrg         float64
newbalanceOrig        float64
nameDest               object
oldbalanceDest        float64
newbalanceDest        float64
isFraud                 int64
balance_diff_orig     float64
is_suspicious_orig       bool
balance_diff_dest     float64
is_suspicious_dest       bool
is_outlier               bool
dtype: object


In [46]:
balance_issues = df_cleaned[(df_cleaned['balance_diff_orig'].abs() > 0.01) | (df_cleaned['balance_diff_dest'].abs() > 0.01)]
print("\nNumber of balance discrepancies:", len(balance_issues))
print("Sample balance issues:\n", balance_issues[['step', 'type', 'amount', 'balance_diff_orig', 'balance_diff_dest']].head())


Number of balance discrepancies: 11142
Sample balance issues:
    step      type   amount  balance_diff_orig  balance_diff_dest
0     1  TRANSFER    181.0              362.0              181.0
1     1  CASH_OUT    181.0              362.0            21363.0
2     1  TRANSFER   2806.0             5612.0             2806.0
3     1  CASH_OUT   2806.0             5612.0            29008.0
4     1  TRANSFER  20128.0            40256.0            20128.0


In [48]:
fraud_count = df_cleaned['isFraud'].sum()
total_rows = len(df_cleaned)
print(f"\nFraud Rate: {fraud_count/total_rows:.2%} ({fraud_count} out of {total_rows})")
assert fraud_count > 0, "No fraudulent transactions found!"


Fraud Rate: 10.25% (1142 out of 11142)


In [50]:
outlier_count = df_cleaned['is_outlier'].sum()
print(f"\nNumber of outlier transactions: {outlier_count} ({outlier_count/total_rows:.2%})")


Number of outlier transactions: 1172 (10.52%)


In [52]:
print("\nValidation Complete!")
print("Data is ready if: no missing values, reasonable fraud rate (e.g., 0.1%-5%), and acceptable balance discrepancies.")


Validation Complete!
Data is ready if: no missing values, reasonable fraud rate (e.g., 0.1%-5%), and acceptable balance discrepancies.


In [61]:
# Cell 12: Reconfirm dataset
print("Dataset Shape:", df.shape)
print("Fraud Distribution:\n", df['isFraud'].value_counts())
print("Current Fraud Rate:", df['isFraud'].mean() * 100, "%")

Dataset Shape: (11142, 15)
Fraud Distribution:
 isFraud
0    10000
1     1142
Name: count, dtype: int64
Current Fraud Rate: 10.249506372285047 %


In [63]:
# Downsample to ~1.14% fraud rate with available data
n_non_fraud = len(df[df['isFraud'] == 0])  # Use all available non-fraud
n_fraud = int((n_non_fraud / 0.9886) * 0.0114)  # Calculate fraud to get ~1.14%
df_non_fraud = df[df['isFraud'] == 0].sample(n=n_non_fraud, random_state=42)
df_fraud = df[df['isFraud'] == 1].sample(n=min(n_fraud, len(df[df['isFraud'] == 1])), random_state=42)
df_balanced = pd.concat([df_non_fraud, df_fraud])
print("New Dataset Shape:", df_balanced.shape)
print("New Fraud Rate:", df_balanced['isFraud'].mean() * 100, "%")
df_balanced.to_csv('balanced_fraud_dataset.csv', index=False)
print("Balanced dataset saved as 'balanced_fraud_dataset.csv'")

New Dataset Shape: (10115, 15)
New Fraud Rate: 1.1369253583786456 %
Balanced dataset saved as 'balanced_fraud_dataset.csv'


In [65]:
print("Dataset Shape:", df.shape)
print("Fraud Distribution:\n", df['isFraud'].value_counts())
print("Current Fraud Rate:", df['isFraud'].mean() * 100, "%")

Dataset Shape: (11142, 15)
Fraud Distribution:
 isFraud
0    10000
1     1142
Name: count, dtype: int64
Current Fraud Rate: 10.249506372285047 %


In [67]:
df_cleaned = pd.read_csv('balanced_fraud_dataset.csv')

In [69]:
df_cleaned.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,balance_diff_orig,is_suspicious_orig,balance_diff_dest,is_suspicious_dest,is_outlier
0,6,PAYMENT,33410.82,C1022823162,379355.65,345944.83,M1725760738,0.0,0.0,0,66821.64,True,33410.82,True,False
1,4,DEBIT,1141.74,C308221618,240677.0,239535.26,C1182461167,29379.28,30521.02,0,2283.48,True,0.0,False,False
2,1,PAYMENT,1146.72,C151515831,10248.0,9101.28,M1130160902,0.0,0.0,0,2293.44,True,1146.72,True,False
3,4,PAYMENT,1080.65,C1129874021,35125.0,34044.35,M1871079355,0.0,0.0,0,2161.3,True,1080.65,True,False
4,4,CASH_IN,382513.35,C1178953630,4084533.87,4467047.21,C914293025,501046.14,189527.57,0,0.01,False,694031.92,True,True


In [71]:
df = df.drop_duplicates()

In [73]:
#  Verify current dataset size after duplicates
print("Current Dataset Shape:", df.shape)
print("Non-Fraudulent Transactions:", len(df[df['isFraud'] == 0]))
print("Fraudulent Transactions:", len(df[df['isFraud'] == 1]))
print("Current Fraud Rate:", df['isFraud'].mean() * 100, "%")

Current Dataset Shape: (11142, 15)
Non-Fraudulent Transactions: 10000
Fraudulent Transactions: 1142
Current Fraud Rate: 10.249506372285047 %


In [75]:
# Downsample to ~1.14% fraud rate with available data
n_non_fraud = len(df[df['isFraud'] == 0])  # Use all available non-fraud
n_fraud = int((n_non_fraud / 0.9886) * 0.0114)  # Calculate fraud to get ~1.14%
df_non_fraud = df[df['isFraud'] == 0].sample(n=n_non_fraud, random_state=42)
df_fraud = df[df['isFraud'] == 1].sample(n=min(n_fraud, len(df[df['isFraud'] == 1])), random_state=42)
df_balanced = pd.concat([df_non_fraud, df_fraud])
print("New Dataset Shape:", df_balanced.shape)
print("New Fraud Rate:", df_balanced['isFraud'].mean() * 100, "%")
print("Number of Fraud Cases:", df_balanced['isFraud'].sum())
df_balanced.to_csv('balanced_fraud_dataset.csv', index=False)
print("Balanced dataset saved as 'balanced_fraud_dataset.csv'")

New Dataset Shape: (10115, 15)
New Fraud Rate: 1.1369253583786456 %
Number of Fraud Cases: 115
Balanced dataset saved as 'balanced_fraud_dataset.csv'


In [77]:
df_balanced = pd.read_csv('balanced_fraud_dataset.csv')

In [79]:
df_balanced['is_merchant'] = df_balanced['nameDest'].str.startswith('M')

In [81]:
df_balanced['is_zero_orig'] = (df_balanced['newbalanceOrig'] == 0) & (df_balanced['oldbalanceOrg'] > 0)
df_balanced['is_large_trans'] = (df_balanced['amount'] > df_balanced['amount'].quantile(0.95))

In [83]:
df_balanced['transfer_to_cashout'] = (
    (df_balanced['type'] == 'TRANSFER') & 
    (df_balanced['nameDest'].shift(-1).str.startswith('C')) &  # Next recipient is a customer
    (df_balanced['type'].shift(-1) == 'CASH_OUT') & 
    (df_balanced['amount'] > 0)  # Ensure valid transaction
).astype(int)

In [85]:
print("Sample with New Features:\n", df_balanced[['step', 'type', 'amount', 'nameDest', 'is_merchant', 'is_zero_orig', 'is_large_trans', 'transfer_to_cashout', 'isFraud']].head(10))
df_balanced.to_csv('balanced_fraud_dataset.csv', index=False)  # Save updated dataset
print("Updated dataset saved as 'balanced_fraud_dataset.csv'")

Sample with New Features:
    step      type     amount     nameDest  is_merchant  is_zero_orig  \
0     6   PAYMENT   33410.82  M1725760738         True         False   
1     4     DEBIT    1141.74  C1182461167        False         False   
2     1   PAYMENT    1146.72  M1130160902         True         False   
3     4   PAYMENT    1080.65  M1871079355         True         False   
4     4   CASH_IN  382513.35   C914293025        False         False   
5     6   PAYMENT     379.90  M1931130011         True         False   
6     1   PAYMENT    6131.21   M515273883         True         False   
7     5   PAYMENT    2588.64   M542202332         True         False   
8     6  CASH_OUT  118961.45   C194746285        False         False   
9     1   CASH_IN   13289.87  C1225616405        False         False   

   is_large_trans  transfer_to_cashout  isFraud  
0           False                    0        0  
1           False                    0        0  
2           False             

In [87]:
#  Validate the updated balanced dataset
df_cleaned = pd.read_csv('balanced_fraud_dataset.csv')
print("Dataset Shape:", df_cleaned.shape)
print("Fraud Rate:", df_cleaned['isFraud'].mean() * 100, "%")
print("Number of Fraud Cases:", df_cleaned['isFraud'].sum())
balance_issues = df_cleaned[(df_cleaned['balance_diff_orig'].abs() > 0.01) | (df_cleaned['balance_diff_dest'].abs() > 0.01)]
print("Number of balance discrepancies:", len(balance_issues))
print("Validation Complete! Data is ready if fraud rate is ~1.14% and discrepancies align with merchant data.")

Dataset Shape: (10115, 19)
Fraud Rate: 1.1369253583786456 %
Number of Fraud Cases: 115
Number of balance discrepancies: 10115
Validation Complete! Data is ready if fraud rate is ~1.14% and discrepancies align with merchant data.


In [89]:
df_balanced = df_balanced.sort_values('step')