In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import json
import gc
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [4]:
trx_df = pd.read_csv('assets/transactions_data.csv')
trx_df

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors
0,7475327,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,59935,Beulah,ND,58523.0,5499,
1,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,
2,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,27092,Vista,CA,92084.0,4829,
3,7475331,2010-01-01 00:05:00,430,2860,$200.00,Swipe Transaction,27092,Crown Point,IN,46307.0,4829,
4,7475332,2010-01-01 00:06:00,848,3915,$46.41,Swipe Transaction,13051,Harwood,MD,20776.0,5813,
...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,$1.11,Chip Transaction,86438,West Covina,CA,91792.0,5499,
13305911,23761869,2019-10-31 23:56:00,1766,2066,$12.80,Online Transaction,39261,ONLINE,,,5815,
13305912,23761870,2019-10-31 23:57:00,199,1031,$40.44,Swipe Transaction,2925,Allen,TX,75002.0,4900,
13305913,23761873,2019-10-31 23:58:00,1986,5443,$4.00,Chip Transaction,46284,Daly City,CA,94014.0,5411,


In [5]:
def santize_df(amount_str):
    """convert amount from string to float"""
    if isinstance(amount_str, str):
        return float(amount_str.replace('$', ''))
    return amount_str

In [6]:
trx_df = trx_df[trx_df.errors.isna()]
display(trx_df)
trx_df = trx_df.drop(columns=['merchant_city', 'merchant_state', 'mcc', 'errors', 'date'], axis=1)
display(trx_df)
trx_df['amount'] = trx_df['amount'].apply(santize_df)
display(trx_df)

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors
0,7475327,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,59935,Beulah,ND,58523.0,5499,
1,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,
2,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,27092,Vista,CA,92084.0,4829,
3,7475331,2010-01-01 00:05:00,430,2860,$200.00,Swipe Transaction,27092,Crown Point,IN,46307.0,4829,
4,7475332,2010-01-01 00:06:00,848,3915,$46.41,Swipe Transaction,13051,Harwood,MD,20776.0,5813,
...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,$1.11,Chip Transaction,86438,West Covina,CA,91792.0,5499,
13305911,23761869,2019-10-31 23:56:00,1766,2066,$12.80,Online Transaction,39261,ONLINE,,,5815,
13305912,23761870,2019-10-31 23:57:00,199,1031,$40.44,Swipe Transaction,2925,Allen,TX,75002.0,4900,
13305913,23761873,2019-10-31 23:58:00,1986,5443,$4.00,Chip Transaction,46284,Daly City,CA,94014.0,5411,


Unnamed: 0,id,client_id,card_id,amount,use_chip,merchant_id,zip
0,7475327,1556,2972,$-77.00,Swipe Transaction,59935,58523.0
1,7475328,561,4575,$14.57,Swipe Transaction,67570,52722.0
2,7475329,1129,102,$80.00,Swipe Transaction,27092,92084.0
3,7475331,430,2860,$200.00,Swipe Transaction,27092,46307.0
4,7475332,848,3915,$46.41,Swipe Transaction,13051,20776.0
...,...,...,...,...,...,...,...
13305910,23761868,1718,2379,$1.11,Chip Transaction,86438,91792.0
13305911,23761869,1766,2066,$12.80,Online Transaction,39261,
13305912,23761870,199,1031,$40.44,Swipe Transaction,2925,75002.0
13305913,23761873,1986,5443,$4.00,Chip Transaction,46284,94014.0


Unnamed: 0,id,client_id,card_id,amount,use_chip,merchant_id,zip
0,7475327,1556,2972,-77.00,Swipe Transaction,59935,58523.0
1,7475328,561,4575,14.57,Swipe Transaction,67570,52722.0
2,7475329,1129,102,80.00,Swipe Transaction,27092,92084.0
3,7475331,430,2860,200.00,Swipe Transaction,27092,46307.0
4,7475332,848,3915,46.41,Swipe Transaction,13051,20776.0
...,...,...,...,...,...,...,...
13305910,23761868,1718,2379,1.11,Chip Transaction,86438,91792.0
13305911,23761869,1766,2066,12.80,Online Transaction,39261,
13305912,23761870,199,1031,40.44,Swipe Transaction,2925,75002.0
13305913,23761873,1986,5443,4.00,Chip Transaction,46284,94014.0


In [7]:
with open('assets/train_fraud_labels.json', 'r') as f:
    fraud_labels_json = json.load(fp=f)

In [8]:
display(fraud_labels_json)
fraud_labels_dict = fraud_labels_json.get('target', {})
fraud_labels_series = pd.Series(fraud_labels_dict, name='is_fraud')
display(fraud_labels_series)
fraud_labels_series.index = fraud_labels_series.index.astype(int)
display(fraud_labels_series)

{'target': {'10649266': 'No',
  '23410063': 'No',
  '9316588': 'No',
  '12478022': 'No',
  '9558530': 'No',
  '12532830': 'No',
  '19526714': 'No',
  '9906964': 'No',
  '13224888': 'No',
  '13749094': 'No',
  '12303776': 'No',
  '19480376': 'No',
  '11716050': 'No',
  '20025400': 'No',
  '7661688': 'No',
  '16662807': 'No',
  '21419778': 'No',
  '18011186': 'No',
  '23289598': 'No',
  '11644547': 'No',
  '23235120': 'No',
  '19748218': 'No',
  '8720720': 'No',
  '18335831': 'No',
  '18936727': 'No',
  '15223870': 'No',
  '12370203': 'No',
  '17126661': 'No',
  '22270430': 'No',
  '18790248': 'No',
  '20143410': 'No',
  '9497252': 'No',
  '17619208': 'No',
  '11052664': 'No',
  '14670204': 'No',
  '17681877': 'No',
  '22485981': 'No',
  '22332853': 'No',
  '16628447': 'No',
  '7766832': 'No',
  '7614276': 'No',
  '14069486': 'No',
  '13755628': 'No',
  '17306332': 'No',
  '19822702': 'No',
  '19118845': 'No',
  '12799754': 'No',
  '17368331': 'No',
  '23652500': 'No',
  '14024256': 'No'

10649266    No
23410063    No
9316588     No
12478022    No
9558530     No
            ..
14064699    No
7676538     No
15131030    No
17244732    No
15151926    No
Name: is_fraud, Length: 8914963, dtype: object

10649266    No
23410063    No
9316588     No
12478022    No
9558530     No
            ..
14064699    No
7676538     No
15131030    No
17244732    No
15151926    No
Name: is_fraud, Length: 8914963, dtype: object

In [9]:
merged_df = pd.merge(trx_df, fraud_labels_series, how='left', left_on='id', right_index=True)
del trx_df
gc.collect()
display(merged_df)
merged_df.fillna({'is_fraud':'No'}, inplace=True)
display(merged_df)

Unnamed: 0,id,client_id,card_id,amount,use_chip,merchant_id,zip,is_fraud
0,7475327,1556,2972,-77.00,Swipe Transaction,59935,58523.0,No
1,7475328,561,4575,14.57,Swipe Transaction,67570,52722.0,No
2,7475329,1129,102,80.00,Swipe Transaction,27092,92084.0,No
3,7475331,430,2860,200.00,Swipe Transaction,27092,46307.0,
4,7475332,848,3915,46.41,Swipe Transaction,13051,20776.0,No
...,...,...,...,...,...,...,...,...
13305910,23761868,1718,2379,1.11,Chip Transaction,86438,91792.0,No
13305911,23761869,1766,2066,12.80,Online Transaction,39261,,No
13305912,23761870,199,1031,40.44,Swipe Transaction,2925,75002.0,No
13305913,23761873,1986,5443,4.00,Chip Transaction,46284,94014.0,


Unnamed: 0,id,client_id,card_id,amount,use_chip,merchant_id,zip,is_fraud
0,7475327,1556,2972,-77.00,Swipe Transaction,59935,58523.0,No
1,7475328,561,4575,14.57,Swipe Transaction,67570,52722.0,No
2,7475329,1129,102,80.00,Swipe Transaction,27092,92084.0,No
3,7475331,430,2860,200.00,Swipe Transaction,27092,46307.0,No
4,7475332,848,3915,46.41,Swipe Transaction,13051,20776.0,No
...,...,...,...,...,...,...,...,...
13305910,23761868,1718,2379,1.11,Chip Transaction,86438,91792.0,No
13305911,23761869,1766,2066,12.80,Online Transaction,39261,,No
13305912,23761870,199,1031,40.44,Swipe Transaction,2925,75002.0,No
13305913,23761873,1986,5443,4.00,Chip Transaction,46284,94014.0,No


In [10]:
merged_df['is_fraud'] = merged_df['is_fraud'].map({'Yes': 1, 'No': 0})
display(merged_df)

Unnamed: 0,id,client_id,card_id,amount,use_chip,merchant_id,zip,is_fraud
0,7475327,1556,2972,-77.00,Swipe Transaction,59935,58523.0,0
1,7475328,561,4575,14.57,Swipe Transaction,67570,52722.0,0
2,7475329,1129,102,80.00,Swipe Transaction,27092,92084.0,0
3,7475331,430,2860,200.00,Swipe Transaction,27092,46307.0,0
4,7475332,848,3915,46.41,Swipe Transaction,13051,20776.0,0
...,...,...,...,...,...,...,...,...
13305910,23761868,1718,2379,1.11,Chip Transaction,86438,91792.0,0
13305911,23761869,1766,2066,12.80,Online Transaction,39261,,0
13305912,23761870,199,1031,40.44,Swipe Transaction,2925,75002.0,0
13305913,23761873,1986,5443,4.00,Chip Transaction,46284,94014.0,0


In [11]:
card_df = pd.read_csv('assets/cards_data.csv')
display(card_df)

Unnamed: 0,id,client_id,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web
0,4524,825,Visa,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,2731,825,Visa,Debit,4956965974959986,12/2020,393,YES,2,$21968,04/2014,2014,No
2,3701,825,Visa,Debit,4582313478255491,02/2024,719,YES,2,$46414,07/2003,2004,No
3,42,825,Visa,Credit,4879494103069057,08/2024,693,NO,1,$12400,01/2003,2012,No
4,4659,825,Mastercard,Debit (Prepaid),5722874738736011,03/2009,75,YES,1,$28,09/2008,2009,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6141,5361,185,Amex,Credit,300609782832003,01/2024,663,YES,1,$6900,11/2000,2013,No
6142,2711,185,Visa,Credit,4718517475996018,01/2021,492,YES,2,$5700,04/2012,2012,No
6143,1305,1007,Mastercard,Credit,5929512204765914,08/2020,237,NO,2,$9200,02/2012,2012,No
6144,743,1110,Mastercard,Debit,5589768928167462,01/2020,630,YES,1,$28074,01/2020,2020,No


In [12]:
card_df = card_df.drop(columns=['client_id', 'acct_open_date', 'cvv', 'card_number', 'expires'], axis=1)
card_df

Unnamed: 0,id,card_brand,card_type,has_chip,num_cards_issued,credit_limit,year_pin_last_changed,card_on_dark_web
0,4524,Visa,Debit,YES,2,$24295,2008,No
1,2731,Visa,Debit,YES,2,$21968,2014,No
2,3701,Visa,Debit,YES,2,$46414,2004,No
3,42,Visa,Credit,NO,1,$12400,2012,No
4,4659,Mastercard,Debit (Prepaid),YES,1,$28,2009,No
...,...,...,...,...,...,...,...,...
6141,5361,Amex,Credit,YES,1,$6900,2013,No
6142,2711,Visa,Credit,YES,2,$5700,2012,No
6143,1305,Mastercard,Credit,NO,2,$9200,2012,No
6144,743,Mastercard,Debit,YES,1,$28074,2020,No


In [13]:
card_df['credit_limit'] = card_df['credit_limit'].apply(santize_df)
card_df

Unnamed: 0,id,card_brand,card_type,has_chip,num_cards_issued,credit_limit,year_pin_last_changed,card_on_dark_web
0,4524,Visa,Debit,YES,2,24295.0,2008,No
1,2731,Visa,Debit,YES,2,21968.0,2014,No
2,3701,Visa,Debit,YES,2,46414.0,2004,No
3,42,Visa,Credit,NO,1,12400.0,2012,No
4,4659,Mastercard,Debit (Prepaid),YES,1,28.0,2009,No
...,...,...,...,...,...,...,...,...
6141,5361,Amex,Credit,YES,1,6900.0,2013,No
6142,2711,Visa,Credit,YES,2,5700.0,2012,No
6143,1305,Mastercard,Credit,NO,2,9200.0,2012,No
6144,743,Mastercard,Debit,YES,1,28074.0,2020,No


In [14]:
merged_df = pd.merge(merged_df, card_df, left_on='card_id', right_on='id', how='inner')
del card_df
gc.collect()
merged_df

Unnamed: 0,id_x,client_id,card_id,amount,use_chip,merchant_id,zip,is_fraud,id_y,card_brand,card_type,has_chip,num_cards_issued,credit_limit,year_pin_last_changed,card_on_dark_web
0,7475327,1556,2972,-77.00,Swipe Transaction,59935,58523.0,0,2972,Mastercard,Debit (Prepaid),YES,2,55.0,2008,No
1,7475328,561,4575,14.57,Swipe Transaction,67570,52722.0,0,4575,Mastercard,Credit,YES,1,9100.0,2015,No
2,7475329,1129,102,80.00,Swipe Transaction,27092,92084.0,0,102,Mastercard,Debit,YES,1,14802.0,2008,No
3,7475331,430,2860,200.00,Swipe Transaction,27092,46307.0,0,2860,Mastercard,Debit,NO,2,37634.0,2006,No
4,7475332,848,3915,46.41,Swipe Transaction,13051,20776.0,0,3915,Visa,Debit,YES,1,19113.0,2014,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13094517,23761868,1718,2379,1.11,Chip Transaction,86438,91792.0,0,2379,Mastercard,Debit,YES,1,26743.0,2019,No
13094518,23761869,1766,2066,12.80,Online Transaction,39261,,0,2066,Mastercard,Debit,YES,1,5141.0,2012,No
13094519,23761870,199,1031,40.44,Swipe Transaction,2925,75002.0,0,1031,Mastercard,Debit,YES,1,17686.0,2007,No
13094520,23761873,1986,5443,4.00,Chip Transaction,46284,94014.0,0,5443,Visa,Debit,YES,2,14036.0,2010,No


In [15]:
merged_df = merged_df.drop(columns=['id_y', 'card_id'], axis=1)
merged_df

Unnamed: 0,id_x,client_id,amount,use_chip,merchant_id,zip,is_fraud,card_brand,card_type,has_chip,num_cards_issued,credit_limit,year_pin_last_changed,card_on_dark_web
0,7475327,1556,-77.00,Swipe Transaction,59935,58523.0,0,Mastercard,Debit (Prepaid),YES,2,55.0,2008,No
1,7475328,561,14.57,Swipe Transaction,67570,52722.0,0,Mastercard,Credit,YES,1,9100.0,2015,No
2,7475329,1129,80.00,Swipe Transaction,27092,92084.0,0,Mastercard,Debit,YES,1,14802.0,2008,No
3,7475331,430,200.00,Swipe Transaction,27092,46307.0,0,Mastercard,Debit,NO,2,37634.0,2006,No
4,7475332,848,46.41,Swipe Transaction,13051,20776.0,0,Visa,Debit,YES,1,19113.0,2014,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13094517,23761868,1718,1.11,Chip Transaction,86438,91792.0,0,Mastercard,Debit,YES,1,26743.0,2019,No
13094518,23761869,1766,12.80,Online Transaction,39261,,0,Mastercard,Debit,YES,1,5141.0,2012,No
13094519,23761870,199,40.44,Swipe Transaction,2925,75002.0,0,Mastercard,Debit,YES,1,17686.0,2007,No
13094520,23761873,1986,4.00,Chip Transaction,46284,94014.0,0,Visa,Debit,YES,2,14036.0,2010,No


In [16]:
categorical_cols = merged_df.select_dtypes(include=['object']).columns
display(categorical_cols)
df = merged_df.copy()


Index(['use_chip', 'card_brand', 'card_type', 'has_chip', 'card_on_dark_web'], dtype='object')

In [17]:
print("Cardinality Analysis:")
print("-" * 50)
total_dummies = 0
for col in categorical_cols:
    n_unique = df[col].nunique()
    total_dummies += n_unique
    memory_estimate = (len(df) * n_unique * 1) / (1024**3)  # GB estimate
    print(f"{col}: {n_unique:,} unique values (~{memory_estimate:.2f} GB)")

print(f"\nTotal dummy columns: {total_dummies:,}")
print(f"Estimated memory needed: {(len(df) * total_dummies * 1) / (1024**3):.2f} GB")

Cardinality Analysis:
--------------------------------------------------
use_chip: 3 unique values (~0.04 GB)
card_brand: 4 unique values (~0.05 GB)
card_type: 3 unique values (~0.04 GB)
has_chip: 2 unique values (~0.02 GB)
card_on_dark_web: 1 unique values (~0.01 GB)

Total dummy columns: 13
Estimated memory needed: 0.16 GB


In [18]:
print("Starting one-by-one encoding...")
print(f"Initial memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Process each categorical column individually
for i, col in enumerate(categorical_cols, 1):
    print(f"\n[{i}/{len(categorical_cols)}] Processing {col}...")
    print(f"  Unique values: {df[col].nunique()}")
    
    # Create dummies for this column only
    dummies = pd.get_dummies(df[col], prefix=col, dtype=np.uint8)
    
    # Drop original column
    df = df.drop(columns=[col])
    
    # Add dummy columns
    df = pd.concat([df, dummies], axis=1)
    
    # Force cleanup
    del dummies
    gc.collect()
    
    print(f"  Current memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

print("\n✓ Encoding complete!")
print(f"Final shape: {df.shape}")
print(f"Final memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

Starting one-by-one encoding...
Initial memory: 4.29 GB

[1/5] Processing use_chip...
  Unique values: 3
  Current memory: 3.53 GB

[2/5] Processing card_brand...
  Unique values: 4
  Current memory: 2.89 GB

[3/5] Processing card_type...
  Unique values: 3
  Current memory: 2.25 GB

[4/5] Processing has_chip...
  Unique values: 2
  Current memory: 1.65 GB

[5/5] Processing card_on_dark_web...
  Unique values: 1
  Current memory: 1.04 GB

✓ Encoding complete!
Final shape: (13094522, 22)
Final memory: 1.04 GB


In [19]:
display(df)

Unnamed: 0,id_x,client_id,amount,merchant_id,zip,is_fraud,num_cards_issued,credit_limit,year_pin_last_changed,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction,card_brand_Amex,card_brand_Discover,card_brand_Mastercard,card_brand_Visa,card_type_Credit,card_type_Debit,card_type_Debit (Prepaid),has_chip_NO,has_chip_YES,card_on_dark_web_No
0,7475327,1556,-77.00,59935,58523.0,0,2,55.0,2008,0,0,1,0,0,1,0,0,0,1,0,1,1
1,7475328,561,14.57,67570,52722.0,0,1,9100.0,2015,0,0,1,0,0,1,0,1,0,0,0,1,1
2,7475329,1129,80.00,27092,92084.0,0,1,14802.0,2008,0,0,1,0,0,1,0,0,1,0,0,1,1
3,7475331,430,200.00,27092,46307.0,0,2,37634.0,2006,0,0,1,0,0,1,0,0,1,0,1,0,1
4,7475332,848,46.41,13051,20776.0,0,1,19113.0,2014,0,0,1,0,0,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13094517,23761868,1718,1.11,86438,91792.0,0,1,26743.0,2019,1,0,0,0,0,1,0,0,1,0,0,1,1
13094518,23761869,1766,12.80,39261,,0,1,5141.0,2012,0,1,0,0,0,1,0,0,1,0,0,1,1
13094519,23761870,199,40.44,2925,75002.0,0,1,17686.0,2007,0,0,1,0,0,1,0,0,1,0,0,1,1
13094520,23761873,1986,4.00,46284,94014.0,0,2,14036.0,2010,1,0,0,0,0,0,1,0,1,0,0,1,1


In [20]:
df = df.dropna().drop(['client_id', 'id_x'], axis=1)
display(df)

Unnamed: 0,amount,merchant_id,zip,is_fraud,num_cards_issued,credit_limit,year_pin_last_changed,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction,card_brand_Amex,card_brand_Discover,card_brand_Mastercard,card_brand_Visa,card_type_Credit,card_type_Debit,card_type_Debit (Prepaid),has_chip_NO,has_chip_YES,card_on_dark_web_No
0,-77.00,59935,58523.0,0,2,55.0,2008,0,0,1,0,0,1,0,0,0,1,0,1,1
1,14.57,67570,52722.0,0,1,9100.0,2015,0,0,1,0,0,1,0,1,0,0,0,1,1
2,80.00,27092,92084.0,0,1,14802.0,2008,0,0,1,0,0,1,0,0,1,0,0,1,1
3,200.00,27092,46307.0,0,2,37634.0,2006,0,0,1,0,0,1,0,0,1,0,1,0,1
4,46.41,13051,20776.0,0,1,19113.0,2014,0,0,1,0,0,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13094516,165.09,30286,28590.0,0,1,17993.0,2012,1,0,0,0,0,1,0,0,1,0,0,1,1
13094517,1.11,86438,91792.0,0,1,26743.0,2019,1,0,0,0,0,1,0,0,1,0,0,1,1
13094519,40.44,2925,75002.0,0,1,17686.0,2007,0,0,1,0,0,1,0,0,1,0,0,1,1
13094520,4.00,46284,94014.0,0,2,14036.0,2010,1,0,0,0,0,0,1,0,1,0,0,1,1


In [21]:
df.columns

Index(['amount', 'merchant_id', 'zip', 'is_fraud', 'num_cards_issued',
       'credit_limit', 'year_pin_last_changed', 'use_chip_Chip Transaction',
       'use_chip_Online Transaction', 'use_chip_Swipe Transaction',
       'card_brand_Amex', 'card_brand_Discover', 'card_brand_Mastercard',
       'card_brand_Visa', 'card_type_Credit', 'card_type_Debit',
       'card_type_Debit (Prepaid)', 'has_chip_NO', 'has_chip_YES',
       'card_on_dark_web_No'],
      dtype='object')

In [22]:
# extreme inbalance in data distribution
df['is_fraud'].value_counts()

is_fraud
0    11477397
1        1191
Name: count, dtype: int64

In [23]:
df_fraud = df[df.is_fraud == 1]
display(df_fraud)
df_non_fraud = df[df.is_fraud == 0]
display(df_non_fraud)

Unnamed: 0,amount,merchant_id,zip,is_fraud,num_cards_issued,credit_limit,year_pin_last_changed,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction,card_brand_Amex,card_brand_Discover,card_brand_Mastercard,card_brand_Visa,card_type_Credit,card_type_Debit,card_type_Debit (Prepaid),has_chip_NO,has_chip_YES,card_on_dark_web_No
2499866,93.85,22936,94126.0,1,1,30400.0,2011,0,0,1,0,0,1,0,1,0,0,0,1,1
2504224,14.44,18403,94131.0,1,1,30400.0,2011,0,0,1,0,0,1,0,1,0,0,0,1,1
2514134,5.61,99370,39503.0,1,1,9294.0,2011,0,0,1,0,0,1,0,0,1,0,1,0,1
2514176,99.50,27092,39482.0,1,1,9294.0,2011,0,0,1,0,0,1,0,0,1,0,1,0,1
2514783,13.45,6813,21146.0,1,1,16600.0,2014,0,0,1,0,0,1,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11838376,31.68,87720,43302.0,1,1,7702.0,2010,0,0,1,0,0,1,0,0,1,0,0,1,1
11839064,6.08,87720,43302.0,1,1,9294.0,2011,0,0,1,0,0,1,0,0,1,0,1,0,1
11839829,47.67,70776,43302.0,1,2,1627.0,2013,1,0,0,0,0,1,0,0,1,0,0,1,1
11847005,5.66,95457,43551.0,1,2,1627.0,2013,1,0,0,0,0,1,0,0,1,0,0,1,1


Unnamed: 0,amount,merchant_id,zip,is_fraud,num_cards_issued,credit_limit,year_pin_last_changed,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction,card_brand_Amex,card_brand_Discover,card_brand_Mastercard,card_brand_Visa,card_type_Credit,card_type_Debit,card_type_Debit (Prepaid),has_chip_NO,has_chip_YES,card_on_dark_web_No
0,-77.00,59935,58523.0,0,2,55.0,2008,0,0,1,0,0,1,0,0,0,1,0,1,1
1,14.57,67570,52722.0,0,1,9100.0,2015,0,0,1,0,0,1,0,1,0,0,0,1,1
2,80.00,27092,92084.0,0,1,14802.0,2008,0,0,1,0,0,1,0,0,1,0,0,1,1
3,200.00,27092,46307.0,0,2,37634.0,2006,0,0,1,0,0,1,0,0,1,0,1,0,1
4,46.41,13051,20776.0,0,1,19113.0,2014,0,0,1,0,0,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13094516,165.09,30286,28590.0,0,1,17993.0,2012,1,0,0,0,0,1,0,0,1,0,0,1,1
13094517,1.11,86438,91792.0,0,1,26743.0,2019,1,0,0,0,0,1,0,0,1,0,0,1,1
13094519,40.44,2925,75002.0,0,1,17686.0,2007,0,0,1,0,0,1,0,0,1,0,0,1,1
13094520,4.00,46284,94014.0,0,2,14036.0,2010,1,0,0,0,0,0,1,0,1,0,0,1,1


In [24]:
# taking equal sample sizes from both classes
# validation and test sizes
# del df
gc.collect()

val_size_per_class = 200
test_size_per_class = 200

# create test sets
X_test_fraud = df_fraud.sample(n=test_size_per_class, random_state=42)
X_test_non_fraud = df_non_fraud.sample(n=test_size_per_class, random_state=42)

X_test = pd.concat([X_test_fraud, X_test_non_fraud]).sample(frac=1, random_state=42).reset_index(drop=True)
y_test = X_test['is_fraud']
X_test = X_test.drop('is_fraud', axis=1)

df_fraud_remaining = df_fraud.drop(X_test_fraud.index)
df_non_fraud_remaining = df_non_fraud.drop(X_test_non_fraud.index)

# create validiation sets
X_val_fraud = df_fraud_remaining.sample(n=val_size_per_class, random_state=42)
X_val_non_fraud = df_non_fraud_remaining.sample(n=val_size_per_class, random_state=42)

X_val = pd.concat([X_val_fraud, X_val_non_fraud]).sample(frac=1, random_state=42).reset_index(drop=True)
y_val = X_val['is_fraud']
X_val = X_val.drop('is_fraud', axis=1)

df_fraud_train = df_fraud_remaining.drop(X_val_fraud.index)
df_non_fraud_train = df_non_fraud_remaining.drop(X_val_non_fraud.index)

# create training sets
min_train_samples_per_class = min(len(df_fraud_train), len(df_non_fraud_train))

X_train_fraud = df_fraud_train.sample(n=min_train_samples_per_class, random_state=42)
X_train_non_fraud = df_non_fraud_train.sample(n=min_train_samples_per_class, random_state=42)

X_train = pd.concat([X_train_fraud, X_train_non_fraud]).sample(frac=1, random_state=42).reset_index(drop=True)
y_train = X_train['is_fraud']
X_train = X_train.drop('is_fraud', axis=1)


print("\n--- Final Dataset Shapes and Distributions ---")
print(f"X_train shape: {X_train.shape}, y_train distribution: {np.unique(y_train, return_counts=True)}")
print(f"X_val shape: {X_val.shape}, y_val distribution: {np.unique(y_val, return_counts=True)}")
print(f"X_test shape: {X_test.shape}, y_test distribution: {np.unique(y_test, return_counts=True)}")


--- Final Dataset Shapes and Distributions ---
X_train shape: (1582, 19), y_train distribution: (array([0, 1]), array([791, 791]))
X_val shape: (400, 19), y_val distribution: (array([0, 1]), array([200, 200]))
X_test shape: (400, 19), y_test distribution: (array([0, 1]), array([200, 200]))


In [25]:
train_target = 2000

smote_train = SMOTE(sampling_strategy={0: train_target, 1: train_target},
                    random_state=42)
X_train, y_train = smote_train.fit_resample(X_train, y_train)
print(f"\nAfter SMOTE with custom sampling_strategy (target train: {train_target}):")
print(f"X_train_oversampled shape: {X_train.shape}")
print(f"y_train_oversampled distribution: {Counter(y_train)}")


After SMOTE with custom sampling_strategy (target train: 2000):
X_train_oversampled shape: (4000, 19)
y_train_oversampled distribution: Counter({0: 2000, 1: 2000})


In [29]:
display(X_train)
display(y_train)
print(X_train.isna().sum())
print(y_train.isna().sum())

Unnamed: 0,amount,merchant_id,zip,num_cards_issued,credit_limit,year_pin_last_changed,use_chip_Chip Transaction,use_chip_Online Transaction,use_chip_Swipe Transaction,card_brand_Amex,card_brand_Discover,card_brand_Mastercard,card_brand_Visa,card_type_Credit,card_type_Debit,card_type_Debit (Prepaid),has_chip_NO,has_chip_YES,card_on_dark_web_No
0,10.130000,44862,11955.000000,1,17946.000000,2016,1,0,0,0,0,1,0,0,1,0,0,1,1
1,97.000000,22204,14559.000000,1,10233.000000,2017,0,0,1,0,0,1,0,0,1,0,0,1,1
2,11.300000,92991,48329.000000,1,15807.000000,2011,0,0,1,0,0,0,1,0,1,0,0,1,1
3,563.550000,57596,63367.000000,1,17883.000000,2017,0,0,1,0,0,1,0,0,1,0,1,0,1
4,439.000000,87685,44680.000000,2,1400.000000,2009,0,0,1,1,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,69.138168,48743,44680.000000,1,80.332263,2010,0,0,1,0,0,1,0,0,0,1,0,1,1
3996,-42.595803,74520,21597.294292,1,25698.642438,2014,0,0,1,0,0,0,1,0,0,0,0,0,1
3997,104.436516,34431,90350.099728,1,11744.967611,2013,0,0,0,0,0,0,0,0,0,0,0,0,1
3998,129.183136,80986,32861.532625,2,6917.505410,2015,0,0,1,0,0,0,0,0,0,0,0,1,1


0       0
1       0
2       0
3       1
4       1
       ..
3995    1
3996    1
3997    1
3998    1
3999    1
Name: is_fraud, Length: 4000, dtype: int64

amount                         0
merchant_id                    0
zip                            0
num_cards_issued               0
credit_limit                   0
year_pin_last_changed          0
use_chip_Chip Transaction      0
use_chip_Online Transaction    0
use_chip_Swipe Transaction     0
card_brand_Amex                0
card_brand_Discover            0
card_brand_Mastercard          0
card_brand_Visa                0
card_type_Credit               0
card_type_Debit                0
card_type_Debit (Prepaid)      0
has_chip_NO                    0
has_chip_YES                   0
card_on_dark_web_No            0
dtype: int64
0


In [None]:
categorical_features = X_train.select_dtypes(
    include=['object']).columns.tolist()
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(
    strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
numerical_features = X_train.select_dtypes(
    include=['int64', 'float64']).columns.tolist()
numerical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[(
    'num', numerical_transformer, numerical_features), ('cat', categorical_transformer, categorical_features)])

X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)
display(X_train_processed)
display(X_val_processed)
display(X_test_processed)

array([[-0.59150504, -0.15481555, -1.42794515, -0.71289463,  0.33060558,
         1.96968308],
       [ 0.18710224, -0.9932093 , -1.33112322, -0.71289463, -0.40073535,
         2.36547994],
       [-0.58101845,  1.62605888, -0.07548709, -0.71289463,  0.12778719,
        -0.00930123],
       ...,
       [ 0.253755  , -0.54078453,  1.48694143, -0.71289463, -0.25737171,
         0.7822925 ],
       [ 0.47555653,  1.18184858, -0.65059846,  1.3268611 , -0.71510809,
         1.57388622],
       [ 0.90766349,  1.11221061, -1.44980362, -0.71289463, -1.27864655,
        -0.80089495]], shape=(4000, 6))

array([[ 0.57232776,  0.42637655,  1.04309619, -0.71289463,  0.48307495,
        -1.59248867],
       [-0.35165779, -1.05555785, -0.59956738,  1.3268611 ,  1.73990422,
        -0.40509809],
       [-0.23531933,  1.3050656 , -0.21168475,  1.3268611 ,  0.1317696 ,
        -0.80089495],
       ...,
       [ 0.05167279, -0.21287186,  1.77216085, -0.71289463, -0.05303313,
        -3.17567612],
       [ 0.05884311, -0.69415827, -0.63968675, -0.71289463,  0.08919575,
        -0.80089495],
       [-0.62753591,  0.31403815, -1.5664481 , -0.71289463,  0.62568311,
         1.57388622]], shape=(400, 6))

array([[ 0.162275  ,  0.98925261,  1.58625085, -0.71289463, -0.20474395,
         0.7822925 ],
       [-0.10867352, -0.21287186, -0.22064562,  1.3268611 ,  0.01334035,
        -1.19669181],
       [ 0.80509432, -1.25451818, -1.09851263,  1.3268611 ,  2.409423  ,
        -3.17567612],
       ...,
       [-0.66428381,  0.06427393,  1.64518432,  1.3268611 , -0.47971979,
        -1.19669181],
       [ 0.05265871, -0.9932093 ,  0.36396697, -0.71289463, -0.17629817,
        -0.00930123],
       [-0.668586  ,  1.61688236, -0.21116421, -0.71289463, -1.36609022,
         0.7822925 ]], shape=(400, 6))

In [33]:
class MLP_SGD:
    def __init__(self, hidden_layer_sizes=(10,), learning_rate = 0.01, n_epochs=1000, batch_size=32):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.weights = []
        self.biases = []
        self.weights_history = []
        self.biases_history = []
        self.loss_history = []

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def _sigmoid_derivative(self, x):
        s = self._sigmoid(x)
        return s * (1 - s)

    def _relu(self, x):
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        return (x > 0).astype(float)

    def _initialize_parameters(self, n_features):
        layer_sizes = [n_features] + list(self.hidden_layer_sizes) + [1]
        self.weights = []
        self.biases = []

        for i in range(len(layer_sizes) - 1):
            fan_in = layer_sizes[i]
            fan_out = layer_sizes[i+1]
            limit = np.sqrt(6 / (fan_in + fan_out))
            self.weights.append(np.random.uniform(-limit, limit, (fan_in, fan_out)))
            self.biases.append(np.zeros((1, fan_out)))


    def _forward_pass(self, X):
        activations = [X]
        zs = []

        for i in range(len(self.weights) - 1):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            zs.append(z)
            a = self._relu(z)
            activations.append(a)

        z_output = np.dot(activations[-1], self.weights[-1]) + self.biases[-1]
        zs.append(z_output)
        y_pred = self._sigmoid(z_output)
        activations.append(y_pred)

        return activations, zs
    
    def _compute_loss(self, y_true, y_pred):
        y_pred = np.clip(y_pred, 1e-10, 1 - 1e-10)
        loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        return loss 
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        y = np.asarray(y).reshape(-1, 1)
        X = np.asarray(X)
        self._initialize_parameters(n_features)
        self.weights_history.append([w.copy() for w in self.weights])
        self.biases_history.append([b.copy() for b in self.biases])
        activations, _ = self._forward_pass(X)
        initial_loss = self._compute_loss(y, activations[-1])
        self.loss_history.append(initial_loss)

        for epoch in range(self.n_epochs):
            permutation = np.random.permutation(n_samples)
            X_shuffled = X[permutation]
            y_shuffled = y[permutation]

            for i in range(0, n_samples, self.batch_size):
                X_batch = X_shuffled[i : i + self.batch_size]
                y_batch = y_shuffled[i : i + self.batch_size]

                activations, zs = self._forward_pass(X_batch)
                y_pred = activations[-1]

                delta = y_pred - y_batch
                dW = np.dot(activations[-2].T, delta) / X_batch.shape[0]
                db = np.sum(delta, axis=0) / X_batch.shape[0]
                self.weights[-1] -= self.learning_rate * dW
                self.biases[-1] -= self.learning_rate * db

                for l in range(len(self.weights) -2, -1, -1):
                    delta = np.dot(delta, self.weights[l+1].T) * self._relu_derivative(zs[l])
                    dW = np.dot(activations[l].T, delta) / X_batch.shape[0]
                    db = np.sum(delta, axis=0) / X_batch.shape[0]

                    self.weights[l] -= self.learning_rate * dW
                    self.biases[l] -= self.learning_rate * db

            self.weights_history.append([w.copy() for w in self.weights])
            self.biases_history.append([b.copy() for b in self.biases])

            activations, _ = self._forward_pass(X)
            epoch_loss = self._compute_loss(y, activations[-1])
            self.loss_history.append(epoch_loss)

            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1} / {self.n_epochs}, Loss : {epoch_loss: .4f}")

        return self

    def predict_proba(self, X):
        activations, _ = self._forward_pass(X)
        return activations[-1]
    
    def predict(self, X, threshold = 0.5):
        probabilities = self.predict_proba(X)
        return (probabilities >= threshold).astype(int).flatten()

In [34]:
mlp_sgd = MLP_SGD(hidden_layer_sizes=(30, 30, ), learning_rate=0.001, n_epochs=1000, batch_size=32)

mlp_sgd.fit(X_train_processed, y_train)

y_pred_train = mlp_sgd.predict(X_train_processed)
y_pred_val = mlp_sgd.predict(X_val_processed)

conf_matrix_train = confusion_matrix(y_train, y_pred_train)
conf_matrix_val = confusion_matrix(y_val, y_pred_val)
acc_train = accuracy_score(y_train, y_pred_train)
acc_val = accuracy_score(y_val, y_pred_val)
precision_train = precision_score(y_train, y_pred_train, pos_label=1)
precision_val = precision_score(y_val, y_pred_val, pos_label=1)
recall_train = recall_score(y_train, y_pred_train, pos_label=1)
recall_val = recall_score(y_val, y_pred_val, pos_label=1)
f1_train = f1_score(y_train, y_pred_train, pos_label=1)
f1_val = f1_score(y_val, y_pred_val, pos_label=1)

print(f"Confusion-matrix train: {conf_matrix_train}")
print(f"Confusion-matrix val: {conf_matrix_val}")
print(f"Accuracy train: {acc_train}")
print(f"Accuracy val: {acc_val}")
print(f"Precison train: {precision_train}")
print(f"Precision val: {precision_val}")
print(f"Recall train: {recall_train}")
print(f"Recall val: {recall_val}")
print(f"f1 train: {f1_train}")
print(f"f1 val: {f1_val}")

Epoch 100 / 1000, Loss :  0.6236
Epoch 200 / 1000, Loss :  0.5997
Epoch 300 / 1000, Loss :  0.5905
Epoch 400 / 1000, Loss :  0.5835
Epoch 500 / 1000, Loss :  0.5777
Epoch 600 / 1000, Loss :  0.5728
Epoch 700 / 1000, Loss :  0.5682
Epoch 800 / 1000, Loss :  0.5639
Epoch 900 / 1000, Loss :  0.5599
Epoch 1000 / 1000, Loss :  0.5559
Confusion-matrix train: [[1521  479]
 [ 634 1366]]
Confusion-matrix val: [[152  48]
 [ 89 111]]
Accuracy train: 0.72175
Accuracy val: 0.6575
Precison train: 0.740379403794038
Precision val: 0.6981132075471698
Recall train: 0.683
Recall val: 0.555
f1 train: 0.7105331599479844
f1 val: 0.6183844011142061
