In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
# Install a blanket 'ignore' filter: no category or message pattern is given,
# so every warning is silenced from this point on.
# NOTE(review): this also hides any warnings raised by pandas / scikit-learn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [23]:
# Pull the raw CSV into memory, then echo its (rows, columns) shape as a
# quick confirmation that the load succeeded.
df = pd.read_csv(filepath_or_buffer="../data/data.csv")
print(f"Dataset loaded with shape: {df.shape}")

Dataset loaded with shape: (6362620, 11)


In [24]:
# Work on an independent deep copy so the frame loaded from disk (`df`) is
# left untouched by the column additions and drops applied to `df_copy`.
df_copy = df.copy(deep=True)

In [25]:
# Position of each step inside a repeating 24-step cycle (step modulo 24).
# If one step is one hour, this is the hour of day.
df_copy['time_period'] = df_copy['step'].mod(24)

In [26]:
# Number of complete 24-step blocks elapsed (floor division by 24), i.e. a
# zero-based day index under the assumption that one step equals one hour.
df_copy['day'] = df_copy['step'].floordiv(24)

In [27]:
# log(1 + amount): compresses large amounts onto a log scale while mapping
# a zero amount to 0 instead of -inf. This reduces the spread of the values;
# it does not by itself guarantee a normal distribution.
df_copy['log_amount'] = df_copy['amount'].pipe(np.log1p)

In [28]:
# Quantile-based (equal-frequency) bucket index for each amount. q=10 asks
# for deciles; duplicates='drop' merges repeated bin edges, so fewer than 10
# buckets can result. labels=False returns integer bucket codes.
df_copy['amount_bin'] = pd.qcut(
    x=df_copy['amount'],
    q=10,
    labels=False,
    duplicates='drop',
)

In [29]:
# Signed change in the origin balance (new minus old); negative when the
# balance went down.
df_copy['orig_balance_diff'] = df_copy['newbalanceOrig'].sub(df_copy['oldbalanceOrg'])

# 1 when the origin balance is exactly zero after the transaction, else 0.
df_copy['orig_zero_after_transaction'] = df_copy['newbalanceOrig'].eq(0).astype(int)

# Transaction amount relative to the origin's prior balance. The +1 in the
# denominator prevents division by zero when the prior balance is 0.
df_copy['orig_transaction_to_balance_ratio'] = df_copy['amount'].div(
    df_copy['oldbalanceOrg'].add(1)
)

In [30]:
# Signed change in the destination balance (new minus old); positive when
# the balance went up.
df_copy['dest_balance_diff'] = df_copy['newbalanceDest'].sub(df_copy['oldbalanceDest'])

# 1 when the destination balance is exactly zero after the transaction, else 0.
df_copy['dest_zero_after_transaction'] = df_copy['newbalanceDest'].eq(0).astype(int)

# Transaction amount relative to the destination's prior balance. The +1 in
# the denominator prevents division by zero when the prior balance is 0.
df_copy['dest_transaction_to_balance_ratio'] = df_copy['amount'].div(
    df_copy['oldbalanceDest'].add(1)
)

In [31]:
 # Check if the amount matches the balance difference
# Origin side: 1 when the balance decrease (old - new) equals the transaction
# amount to within 0.01, else 0. Only computed if all three inputs exist.
if {'amount', 'oldbalanceOrg', 'newbalanceOrig'}.issubset(df_copy.columns):
    df_copy['orig_amount_matches_balance_diff'] = (
        (df_copy['oldbalanceOrg'] - df_copy['newbalanceOrig'] - df_copy['amount'])
        .abs()
        .lt(0.01)
        .astype(int)
    )
    print("Added feature to detect if origin amount matches balance difference")

# Destination side: 1 when the balance increase (new - old) equals the
# transaction amount to within 0.01, else 0.
if {'amount', 'oldbalanceDest', 'newbalanceDest'}.issubset(df_copy.columns):
    df_copy['dest_amount_matches_balance_diff'] = (
        (df_copy['newbalanceDest'] - df_copy['oldbalanceDest'] - df_copy['amount'])
        .abs()
        .lt(0.01)
        .astype(int)
    )
    print("Added feature to detect if destination amount matches balance difference")

Added feature to detect if origin amount matches balance difference
Added feature to detect if destination amount matches balance difference


In [33]:
# Frequency-encode the origin account: replace each 'nameOrig' value with the
# number of rows in which it appears, then discard the raw identifier.
#
# The guard makes the cell safe to re-execute. Once 'nameOrig' has been
# dropped, indexing it again would raise KeyError; on a re-run the cell is now
# a no-op and the previously computed 'nameOrig_freq' is kept.
#
# NOTE(review): the counts are taken over the whole frame. If a train/test
# split happens later, this encoding sees test rows too — confirm that is
# intended.
if 'nameOrig' in df_copy.columns:
    frequency = df_copy['nameOrig'].value_counts()
    df_copy['nameOrig_freq'] = df_copy['nameOrig'].map(frequency)
    # Rebind instead of inplace=True so the drop is an explicit new frame.
    df_copy = df_copy.drop(columns=['nameOrig'])


In [None]:
# Frequency-encode the destination account: replace each 'nameDest' value with
# the number of rows in which it appears, then discard the raw identifier.
#
# The guard makes the cell safe to re-execute. Once 'nameDest' has been
# dropped, indexing it again would raise KeyError; on a re-run the cell is now
# a no-op and the previously computed 'nameDest_freq' is kept.
#
# NOTE(review): the counts are taken over the whole frame. If a train/test
# split happens later, this encoding sees test rows too — confirm that is
# intended.
if 'nameDest' in df_copy.columns:
    frequency_dest = df_copy['nameDest'].value_counts()
    df_copy['nameDest_freq'] = df_copy['nameDest'].map(frequency_dest)
    # Rebind instead of inplace=True so the drop is an explicit new frame.
    df_copy = df_copy.drop(columns=['nameDest'])

In [40]:
# Remove the raw step / amount / balance columns; the engineered features
# built from them remain in the frame.
#
# errors='ignore' skips any listed column that is already absent, so
# re-running this cell is a no-op instead of raising KeyError. The result is
# rebound to df_copy rather than mutated with inplace=True.
df_copy = df_copy.drop(
    columns=['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'],
    errors='ignore',
)

In [41]:
# Show how many rows have an origin balance change that matches the amount
# (flag 1) versus how many do not (flag 0).
df_copy.loc[:, 'orig_amount_matches_balance_diff'].value_counts()

orig_amount_matches_balance_diff
0    5066426
1    1296194
Name: count, dtype: int64

In [42]:
# Preview the first five rows of the engineered feature frame.
df_copy.head(n=5)

Unnamed: 0,type,isFraud,isFlaggedFraud,time_period,day,log_amount,amount_bin,orig_balance_diff,orig_zero_after_transaction,orig_transaction_to_balance_ratio,dest_balance_diff,dest_zero_after_transaction,dest_transaction_to_balance_ratio,orig_amount_matches_balance_diff,dest_amount_matches_balance_diff,nameOrig_freq,nameDest_freq
0,PAYMENT,0,0,1,0,9.194276,1,-9839.64,0,0.057834,0.0,1,9839.64,1,0,1,1
1,PAYMENT,0,0,1,0,7.531166,0,-1864.28,0,0.087731,0.0,1,1864.28,1,0,1,1
2,TRANSFER,1,0,1,0,5.204007,0,-181.0,1,0.994505,0.0,1,181.0,1,0,1,44
3,CASH_OUT,1,0,1,0,5.204007,0,-181.0,1,0.994505,-21182.0,1,0.008545,1,0,1,41
4,PAYMENT,0,0,1,0,9.364703,2,-11668.14,0,0.280788,0.0,1,11668.14,1,0,1,1


isFlaggedFraud
0    6362604
1         16
Name: count, dtype: int64