# Model Training and Model Explainability

In [1]:
import sys
import os
import random
import calendar
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew, ttest_ind
import warnings
warnings.filterwarnings('ignore')
sys.path.append('../scripts')
from data_loader import *
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
# Locate the two engineered feature sets; both are Parquet files stored
# under ../data/featured.
featured_dir = os.path.join('..', 'data/featured')
filename1 = 'final_fraud_df.parquet'
filename2 = 'final_credit_df.parquet'
path1 = os.path.join(featured_dir, filename1)
path2 = os.path.join(featured_dir, filename2)

# Read each file with load_data (presumably provided by the data_loader
# star-import above — confirm), fraud/IP data first, credit-card data second.
fraud_ip_data, credit_card_data = load_data(path1), load_data(path2)

## 1. Model Training - Fraud-IP Data

In [19]:
# Display the loaded fraud/IP feature table (the rendered output is truncated).
fraud_ip_data

Unnamed: 0,user_id,purchase_value,age,ip_address,class,hour_of_day,day_of_week,time_since_signup,purchase_value_log,purchase_value_percentile,num_transactions,avg_purchase_value,user_transaction_count,user_transaction_velocity,ip_country_hash,source_encoded,browser_encoded,device_reuse_encoded,ip_location_change_encoded,source_change_encoded,browser_change_encoded,sex_encoded
27183,1.315594,0.538898,-0.129016,-1.184279,1,0.496157,-1.529269,-1.099457,0.677870,0.538898,0.0,0.538898,0.0,1.591326,0.457621,0.493131,1.103392,1,0,0,0,1
15367,1.493888,-1.212334,0.349942,1.210989,1,0.932265,0.956305,-1.099457,-1.407891,-1.212334,0.0,-1.212334,0.0,1.591326,-2.808711,0.551709,-1.032874,1,0,0,0,0
7533,0.138418,-0.477947,2.385509,-1.074856,0,-0.230689,1.453419,1.524431,-0.246150,-0.477947,0.0,-0.477947,0.0,-0.681026,0.457621,0.493131,-0.332836,0,0,0,0,1
14307,-1.015562,-0.364964,0.230202,0.246696,1,-0.957535,-1.032154,-0.014580,-0.117733,-0.364964,0.0,-0.364964,0.0,-0.649729,-2.808711,-1.912873,-0.332836,0,0,0,0,0
20331,-1.309655,0.651880,1.188116,-0.570875,1,0.641527,-1.529269,-1.099457,0.757087,0.651880,0.0,0.651880,0.0,1.591326,0.563010,-1.912873,1.103392,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,-0.834504,-1.381809,-0.248755,-1.564516,0,0.350788,-1.529269,-0.973309,-1.823229,-1.381809,0.0,-1.381809,0.0,-0.244205,0.457621,0.551709,1.103392,0,0,0,0,1
17289,-1.704959,-0.195490,0.589420,-1.469240,1,-0.376058,-0.037925,0.288176,0.059668,-0.195490,0.0,-0.195490,0.0,-0.661371,0.457621,0.493131,-1.032874,1,0,0,0,0
5192,-0.371902,-0.873386,-0.607973,0.704427,0,-0.521428,1.453419,-0.140729,-0.781614,-0.873386,0.0,-0.873386,0.0,-0.642708,-0.648905,0.551709,1.103392,0,0,0,0,1
12172,1.491306,-1.381809,0.469681,-0.214269,0,1.513742,0.459190,1.347823,-1.823229,-1.381809,0.0,-1.381809,0.0,-0.679433,0.913143,-1.912873,1.103392,0,0,0,0,0


### Checking class balance

In [21]:
# Count how many rows carry each label of the 'class' target column.
fraud_target = fraud_ip_data['class']
fraud_classes = fraud_target.value_counts()
fraud_classes

class
1    13973
0    13973
Name: count, dtype: int64

### Data Split

In [3]:
from sklearn.model_selection import train_test_split

# Target is the 'class' column; the features are every other column.
y = fraud_ip_data['class']
X = fraud_ip_data.drop(columns='class')

# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test,
    test_size=0.5,
    random_state=42,
    stratify=y_val_test,
)

In [22]:
# Class counts in the training target (already a pandas Series from the split).
y_train.value_counts()

class
0    9781
1    9781
Name: count, dtype: int64

In [6]:
# Class counts in the test target (already a pandas Series from the split).
y_test.value_counts()

class
0    2096
1    2096
Name: count, dtype: int64

In [7]:
# Class counts in the validation target (already a pandas Series from the split).
y_val.value_counts()

class
0    2096
1    2096
Name: count, dtype: int64

### Save the split datasets

In [8]:
import os
import pandas as pd

# Directory that receives the train/validation/test CSV files.
model_input_dir = '../model_inputs'
# exist_ok=True creates the directory when it is missing and is a no-op when it
# is already there, replacing the separate check-then-create sequence.
os.makedirs(model_input_dir, exist_ok=True)

# Write every split straight from the objects returned by train_test_split.
# The feature frames already carry their own column labels, so they are saved
# as-is instead of being re-labelled from the notebook-global `X`, which a later
# cell rebinds to the credit-card features (re-running this cell after that
# point would otherwise write frames full of NaN).
for split_name, features, target in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    features.to_csv(os.path.join(model_input_dir, f'X_{split_name}.csv'), index=False)
    # to_frame() keeps the target as a one-column table with its header row.
    target.to_frame().to_csv(os.path.join(model_input_dir, f'y_{split_name}.csv'), index=False)

### Model 1 - Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random-forest baseline: trees limited to depth 2, all cores used.
# random_state is pinned (as it is for the data splits) so the fitted forest,
# and therefore the metrics reported for it, are reproducible across runs.
rf = RandomForestClassifier(max_depth=2, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

In [29]:
# classification_report is otherwise first imported in a later cell, so on a
# fresh kernel run top-to-bottom this cell would raise NameError without it.
from sklearn.metrics import classification_report

# Validation-set report for the random forest, with readable class names.
print(classification_report(y_val, rf.predict(X_val), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.76      0.91      0.83      2096
       Fraud       0.89      0.72      0.80      2096

    accuracy                           0.82      4192
   macro avg       0.83      0.82      0.81      4192
weighted avg       0.83      0.82      0.81      4192



### Model 2 - XGBoost with hyperparameter tuning

In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Candidate values for every tuned XGBoost hyperparameter. The grid is the full
# cross-product: 3 * 4 * 4 * 3 * 4 * 3 * 3 = 5184 combinations, each evaluated
# with 5-fold cross-validation, so this cell is expensive to run.
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[2, 3, 4, 5],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2, 0.3],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Base estimator with a fixed seed.
xgb_model = XGBClassifier(random_state=0)

# Exhaustive 5-fold search over the grid using all cores. No `scoring` argument
# is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'subsample': 1.0}


In [35]:
# Score the best estimator found by the grid search on the test split.
# NOTE(review): the other fraud models in this notebook are scored on X_val —
# confirm that using the test split here is intentional.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Classification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.93      0.83      2096
           1       0.91      0.70      0.79      2096

    accuracy                           0.82      4192
   macro avg       0.83      0.82      0.81      4192
weighted avg       0.83      0.82      0.81      4192



### Model 3 - Multi-Layer Perceptron (MLP)

In [36]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feed-forward network: two hidden layers (100 then 50 units), ReLU activation,
# Adam solver, at most 500 iterations, fixed seed.
mlp = MLPClassifier(
    random_state=42,
    max_iter=500,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(100, 50),
)

# Fit on the fraud/IP training split.
mlp.fit(X_train, y_train)

In [38]:
# Validation-set report for the MLP, with readable class names.
mlp_val_pred = mlp.predict(X_val)
print(classification_report(y_val, mlp_val_pred, target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.74      0.74      0.74      2096
       Fraud       0.74      0.74      0.74      2096

    accuracy                           0.74      4192
   macro avg       0.74      0.74      0.74      4192
weighted avg       0.74      0.74      0.74      4192



In [24]:
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import xgboost as xgb
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Three hand-written hyperparameter sets. Only `params` is trained below;
# `params2` and `params3` are defined but not used in this cell.

# Deep trees (depth 10) combined with a small learning rate (0.01).
params2 = dict(
    colsample_bytree=0.8,
    gamma=0.5,
    learning_rate=0.01,
    max_depth=10,
    min_child_weight=1,
    n_estimators=200,
    subsample=0.8,
)

# Same values as the best combination printed by the grid-search cell.
params3 = dict(
    colsample_bytree=0.8,
    gamma=0,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=3,
    n_estimators=150,
    subsample=1.0,
)

# Configuration actually trained: depth-6 trees, learning rate 0.1,
# 200 boosting rounds, 80% row and column subsampling.
params = dict(
    colsample_bytree=0.8,
    gamma=0.5,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    n_estimators=200,
    subsample=0.8,
)

# Build and train the classifier from `params`.
xmodel = XGBClassifier(**params)
xmodel.fit(X_train, y_train)

# Validation-set report with readable class names.
y_val_pred = xmodel.predict(X_val)
print(classification_report(y_val, y_val_pred, target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.76      0.92      0.83      2096
       Fraud       0.89      0.71      0.79      2096

    accuracy                           0.81      4192
   macro avg       0.83      0.81      0.81      4192
weighted avg       0.83      0.81      0.81      4192



In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Small XGBoost baseline: 50 trees of depth 3, learning rate 0.1, fixed seed.
# NOTE(review): binding the name `xgb` here replaces the `xgboost as xgb`
# module alias imported in an earlier cell — confirm nothing relies on it.
xgb = XGBClassifier(
    random_state=0,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=50,
)

# Train on the fraud/IP training split.
xgb.fit(X_train, y_train)

### Saving the best model on FraudIP Data

In [41]:
import joblib

# Persist the best estimator found by the grid search.
# exist_ok=True creates ../models when it is missing and does nothing when it
# already exists, replacing the separate existence check.
os.makedirs('../models', exist_ok=True)

model_path = os.path.join('../models', 'fraudIP_xgb_model.pkl')
joblib.dump(grid_search.best_estimator_, model_path)
print(f"Model saved to {model_path}")

Model saved to ../models\fraudIP_xgb_model.pkl


#  Model Explainability
### Using the best model

## 2. Model Training - Credit Card Data

In [7]:
# Display the loaded credit-card feature table (the rendered output is truncated).
credit_card_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
241,148897.0,2.060018,-0.171344,-1.486690,0.073000,0.405380,-0.296966,0.089310,-0.133385,0.332254,0.232706,0.065653,0.853144,0.276809,0.391430,-0.596583,0.385258,-0.915615,-0.052532,0.813967,-0.139899,-0.293461,-0.773428,0.186081,-1.033029,-0.194445,0.239585,-0.078187,-0.077387,17.98,0
839,132543.0,0.103641,0.933864,-0.521197,-0.714673,1.056614,-0.338123,0.895243,0.060319,-0.265760,-0.670279,0.724902,0.379861,-0.204578,-0.800934,-0.898745,0.621266,0.021961,0.398160,0.165481,0.053661,-0.303614,-0.789758,0.078140,0.240984,-0.414522,0.112468,0.212875,0.071812,14.98,0
414,79273.0,-0.588417,0.294412,1.154334,0.550848,0.619070,-1.112417,0.111636,0.042107,-0.254118,-0.403299,-0.847133,-1.148742,-1.539393,0.046438,1.439295,0.280419,0.221055,0.326598,0.757722,0.104254,-0.153458,-0.703388,0.141806,-0.062553,-0.721211,0.116289,0.103777,0.179208,1.98,0
765,166332.0,-3.681538,-0.638713,-0.479522,0.263719,1.536671,-0.026893,0.446716,0.401110,-0.200699,-0.729713,0.236865,0.375531,0.088070,-0.907722,-0.602456,1.307492,-0.187555,1.046145,-0.618044,-0.684217,-0.358368,-0.536902,-1.713020,-0.107236,0.807685,-0.655861,0.700997,-0.513219,206.00,0
604,79913.0,0.739956,-1.089863,1.016616,0.515513,-1.590026,-0.490107,-0.340169,-0.024464,1.162277,-0.559376,-0.506182,0.670395,-0.083764,-0.474870,-0.292808,-0.182776,0.272541,-0.692900,0.268892,0.381862,-0.072284,-0.478651,-0.083298,0.809246,0.006180,0.897313,-0.077445,0.058058,240.68,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,164376.0,0.120504,0.974526,-0.505851,-0.734961,1.135347,-0.262833,0.847474,0.083476,-0.280673,-0.671262,0.788595,0.465568,-0.085842,-0.821939,-0.865340,0.556862,0.068977,0.291642,0.099287,0.025786,-0.304979,-0.756434,0.050698,0.186424,-0.429785,0.118145,0.221560,0.070378,1.98,0
72,55311.0,-6.159607,1.468713,-6.850888,5.174706,-2.986704,-1.795054,-6.545072,2.621236,-3.605870,-8.122161,6.029033,-9.225855,-1.546759,-10.309334,0.308062,-7.787326,-12.822177,-4.367677,2.643984,-0.289830,1.061314,0.125737,0.589592,-0.568731,0.582825,-0.042583,0.951130,0.158996,0.83,1
908,32745.0,-2.179135,0.020218,-2.182733,2.572046,-3.663733,0.081568,0.268049,0.660437,-2.374027,-3.582810,1.975821,-3.500542,0.170681,-2.735940,1.670251,-4.046293,-5.079479,-2.586857,1.669261,1.957960,1.026421,0.299614,1.656800,0.328433,0.106457,0.691775,0.196779,0.241085,717.15,1
235,85252.0,-1.965275,-0.487194,0.817852,1.716078,1.229009,-0.599898,-0.487558,0.612338,-0.469782,-0.301798,-0.971595,0.742819,0.840619,0.256709,0.194611,-1.023729,0.655628,-0.568048,0.821456,0.618493,0.154758,0.096709,-0.131361,-0.226808,-0.005504,-0.219828,0.335099,-0.126759,25.00,0


### Checking class balance

In [10]:
# Count how many rows carry each label of the 'Class' target column.
credit_card_data.loc[:, 'Class'].value_counts()

Class
0    473
1    473
Name: count, dtype: int64

### Data Split

In [3]:
from sklearn.model_selection import train_test_split

# Target is the 'Class' column; the features are every other column.
# Note that this rebinds the notebook-level names X and y.
y = credit_card_data['Class']
X = credit_card_data.drop(columns='Class')

# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
X_train_b, X_val_test_b, y_train_b, y_val_test_b = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_val_b, X_test_b, y_val_b, y_test_b = train_test_split(
    X_val_test_b, y_val_test_b,
    test_size=0.5,
    random_state=42,
    stratify=y_val_test_b,
)

In [4]:
# Shapes of the credit-card splits, in the order:
# X train, X test, y train, y test, X val, y val.
tuple(
    split.shape
    for split in (X_train_b, X_test_b, y_train_b, y_test_b, X_val_b, y_val_b)
)

((662, 30), (142, 30), (662,), (142,), (142, 30), (142,))

### Checking class distribution

In [5]:
# Class counts in the credit-card training target (already a pandas Series).
y_train_b.value_counts()

Class
1    331
0    331
Name: count, dtype: int64

In [27]:
# Class counts in the credit-card test target (already a pandas Series).
y_test_b.value_counts()

Class
1    71
0    71
Name: count, dtype: int64

In [29]:
# Class counts in the credit-card validation target (already a pandas Series).
y_val_b.value_counts()

Class
0    71
1    71
Name: count, dtype: int64

### Model 1 - Logistic Regression 

In [30]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with scikit-learn's default settings, trained
# on the credit-card features as loaded (this cell applies no scaling step).
logistic_model = LogisticRegression()
logistic_model.fit(X_train_b, y_train_b)

In [32]:
from sklearn.metrics import classification_report

# Validation-set report for the plain logistic regression.
logistic_val_pred = logistic_model.predict(X_val_b)
print(classification_report(y_val_b, logistic_val_pred, target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.87      1.00      0.93        71
       Fraud       1.00      0.85      0.92        71

    accuracy                           0.92       142
   macro avg       0.93      0.92      0.92       142
weighted avg       0.93      0.92      0.92       142



### Model 2 - Logistic Regression with StandardScaler

In [15]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain feature standardisation and logistic regression into one estimator.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

# Train the pipeline on the credit-card training split.
pipe.fit(X_train_b, y_train_b)

In [16]:
from sklearn.metrics import classification_report

# Accuracy of the scaled logistic-regression pipeline on the test split.
accuracy = pipe.score(X_test_b, y_test_b)
print(f"Test Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 on the same split.
y_pred_b = pipe.predict(X_test_b)
report = classification_report(
    y_test_b,
    y_pred_b,
    target_names=['Class 0', 'Class 1'],
)
print(report)

Test Accuracy: 0.96
              precision    recall  f1-score   support

     Class 0       0.96      0.96      0.96        71
     Class 1       0.96      0.96      0.96        71

    accuracy                           0.96       142
   macro avg       0.96      0.96      0.96       142
weighted avg       0.96      0.96      0.96       142



### Model 3 - XGBoost

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# XGBoost for the credit-card data: 50 trees of depth 3, learning rate 0.1,
# fixed seed.
xgb = XGBClassifier(
    random_state=0,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=50,
)

# Train on the credit-card training split.
xgb.fit(X_train_b, y_train_b)

In [7]:
# Validation-set report for the credit-card XGBoost model.
y_val_pred = xgb.predict(X_val_b)
xgb_val_report = classification_report(y_val_b, y_val_pred, target_names=['Not Fraud', 'Fraud'])
print(xgb_val_report)

              precision    recall  f1-score   support

   Not Fraud       0.89      0.93      0.91        71
       Fraud       0.93      0.89      0.91        71

    accuracy                           0.91       142
   macro avg       0.91      0.91      0.91       142
weighted avg       0.91      0.91      0.91       142



### Choosing the best model on the Credit Card Data

#### Model 2 (Logistic Regression with StandardScaler) is the best-performing model overall, based on the metrics reported above. It has the highest accuracy, macro-average precision, macro-average recall, and macro-average F1-score. This model balances precision and recall well for both classes and slightly outperforms the others on most metrics. Note that Model 2 was scored on the test split, whereas Models 1 and 3 were scored on the validation split, so the comparison is indicative rather than exact.

### Saving the Best Model on Credit Card Data

In [18]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import os
from joblib import dump

# Persist the trained scaler + logistic-regression pipeline.
model_dir = '../models'
# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, replacing the separate existence check.
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, 'creditCard_lr_model.joblib')
dump(pipe, model_path)

['../models\\creditCard_lr_model.joblib']