# Setup Environment

In [90]:
import pandas as pd
import numpy as np
from numpy import mean, std
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import precision_recall_curve, auc, make_scorer, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import joblib

# Modelling

## Train Test Split

In [91]:
# Load intermediate dataset
data = pd.read_csv("Cleaned data/finalised_data.csv", index_col='Transaction ID')

In [92]:
data.head()

Unnamed: 0_level_0,Transaction Amount,Quantity,Customer Age,Is Fraudulent,Account Age Days,Transaction Hour,Age Group,Transaction Values Ordinal,Log Transaction Amount,Transaction Day_of_month,...,Proxy_True,Proxy_Unknown,Customer Life_stage_Teenage,Customer Life_stage_Young Adult,Customer Life_stage_Working Adult,Customer Life_stage_Middle-Aged,Customer Life_stage_Senior,Transaction Phase_of_month_Start,Transaction Phase_of_month_Middle,Transaction Phase_of_month_End
Transaction ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c12e07a0-8a06-4c0d-b5cc-04f3af688570,42.32,1,40,0,282,23,5,1,3.768614,24,...,False,False,False,False,True,False,False,False,False,True
7d187603-7961-4fce-9827-9698e2b6a201,301.34,3,35,0,223,0,4,3,5.711552,22,...,False,False,False,False,True,False,False,False,False,True
f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,340.32,5,29,0,360,8,3,3,5.83282,22,...,False,False,False,True,False,False,False,False,False,True
e9949bfa-194d-486b-84da-9565fca9e5ce,95.77,5,45,0,325,20,5,1,4.572337,16,...,False,False,False,False,False,True,False,False,True,False
7362837c-7538-434e-8731-0df713f5f26d,77.45,5,42,0,116,15,5,1,4.362461,16,...,False,False,False,False,True,False,False,False,True,False


In [93]:
data.columns

Index(['Transaction Amount', 'Quantity', 'Customer Age', 'Is Fraudulent',
       'Account Age Days', 'Transaction Hour', 'Age Group',
       'Transaction Values Ordinal', 'Log Transaction Amount',
       'Transaction Day_of_month', 'Transaction During_weekend',
       'addressesMatch', 'isPOBox', 'Payment Method_PayPal',
       'Payment Method_bank transfer', 'Payment Method_credit card',
       'Payment Method_debit card', 'Product Category_clothing',
       'Product Category_electronics', 'Product Category_health & beauty',
       'Product Category_home & garden', 'Product Category_toys & games',
       'Device Used_desktop', 'Device Used_mobile', 'Device Used_tablet',
       'ipAddressMatchesUS_False', 'ipAddressMatchesUS_True',
       'ipAddressMatchesUS_Unknown', 'Transaction Time_of_day_Night',
       'Transaction Time_of_day_Morning', 'Transaction Time_of_day_Afternoon',
       'Transaction Time_of_day_Evening', 'Transaction Day_of_week_0',
       'Transaction Day_of_week_1', 'T

To prevent multicollinearity, multiple columns that represent customer ages in bins have to be seperated from each other.

Customer Life stage, Age Group and Customer Age have to be seperated.

Transaction Amount, Log Transaction Amount and Transaction Values Ordinal have to be separated as well.

In [94]:
# Splitting the age group categories
life_stage_cols = ['Customer Life_stage_Teenage', 'Customer Life_stage_Young Adult', 
                   'Customer Life_stage_Working Adult', 'Customer Life_stage_Middle-Aged', 
                   'Customer Life_stage_Senior']

data.drop(columns=life_stage_cols, inplace=True)
# data.drop(columns='Customer Age', inplace=True)
data.drop(columns='Age Group', inplace=True)

# Handling transaction amounts duplicate columns
data.drop(columns='Transaction Amount', inplace=True)
data.drop(columns='Transaction Values Ordinal', inplace=True)
# data.drop(columns='Log Transaction Amount', inplace=True)

In [95]:
# Train test split
X = data.loc[:, data.columns != 'Is Fraudulent']
y = data["Is Fraudulent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=1, stratify=y)


# Model Creation and Evaluation


In [103]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def evaluate_model(model, param_grid, cv):
    # Define sampler
    undersampler = RandomUnderSampler(random_state=1)
    smote = SMOTE(random_state=1)

    # # Create pipeline with sampler
    # pipeline = Pipeline(steps=[('undersampler', undersampler), ('classifier', model)])
    # pipeline = Pipeline(steps=[('smote', smote), ('classifier', model)])
    pipeline = Pipeline(steps=[('undersampler', undersampler), ('smote', smote), ('classifier', model)])

    # define search
    grid_search = GridSearchCV(pipeline, param_grid, 
                          scoring='recall', cv=cv, n_jobs=-1,
                          verbose=1)
    # Fit the grid search to the train data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    # Print the best training pr_auc
    print(f"Best training Recall score: {best_score}")

    # Create best model on the whole train data
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    # Predict on the test data
    y_scores = best_model.predict_proba(X_test)[:, 1]
    y_pred = best_model.predict(X_test)

    # Create the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    # Calculate pr_auc
    # pr = pr_auc(y_test, y_pred)
    # print(f"PR AUC: {pr}")

    # Calculate auc
    # auc_score = roc_auc_score(y_test, y_scores)
    # print("AUC Score:", auc_score)

    recall = recall_score(y_test, y_pred)
    print("Test Recall Score:", recall)

    precision = precision_score(y_test, y_pred)
    print("Precision Score:", precision)

    f1 = f1_score(y_test, y_pred)
    print("F1 Score:", f1)

    # Save the best model
    joblib.dump(best_model, f"Models/{model.__class__.__name__}_model.pkl")

    return [best_model, best_params, y_scores] 

In [104]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Define the parameter grid for grid search
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    # 'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__class_weight': ['balanced']
}

param_grid_xgb = {
    'classifier__max_depth': [5, 10],
    'classifier__learning_rate': [0.1, 0.01, 0.001],
    'classifier__n_estimators': [100, 200],
    'classifier__reg_alpha': [0, 0.1, 0.5],
    'classifier__reg_lambda': [1, 1.5, 2],  
}

rf = evaluate_model(RandomForestClassifier(), 
                    param_grid_rf, cv)

xgb = evaluate_model(XGBClassifier(), 
                     param_grid_xgb, cv)


Fitting 30 folds for each of 54 candidates, totalling 1620 fits
Best training Recall score: 0.7737078336489236
Confusion Matrix:
 [[3262 1219]
 [  51  193]]
Test Recall Score: 0.7909836065573771
Precision Score: 0.136685552407932
F1 Score: 0.23309178743961353
Fitting 30 folds for each of 108 candidates, totalling 3240 fits
Best training Recall score: 0.7287046777473876
Confusion Matrix:
 [[3794  687]
 [  87  157]]
Test Recall Score: 0.6434426229508197
Precision Score: 0.18601895734597157
F1 Score: 0.28860294117647056


In [101]:
# Create stack classifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stack_model = StackingClassifier(estimators=[('rf', rf[0]), ('xgb', xgb[0])],
                                 final_estimator=LogisticRegression())

stack_model.fit(X_train, y_train)

y_scores = stack_model.predict_proba(X_test)[:, 1]
y_pred = stack_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
auc_score = roc_auc_score(y_test, y_scores)
print("AUC Score:", auc_score)


Confusion Matrix:
 [[4481    0]
 [ 244    0]]
AUC Score: 0.827493862977014


In [None]:
# Create shap explainability