# Setup Environment

In [61]:
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from numpy import mean, std
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate, GridSearchCV
from xgboost import XGBClassifier

# Modelling

## Train Test Split

In [62]:
# Load the intermediate dataset from CSV.
# index_col=None: no CSV column is promoted to the index, so a default
# integer index is used.
data = pd.read_csv(
    "Cleaned data/finalised_data.csv",
    index_col=None,
)

In [63]:
# Preview the first rows of the loaded dataset
data.head()

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Is Fraudulent,Account Age Days,Transaction Hour,Age Group,Transaction Values Ordinal,Log Transaction Amount,Transaction Day_of_month,...,Proxy_True,Proxy_Unknown,Customer Life_stage_Teenage,Customer Life_stage_Young Adult,Customer Life_stage_Working Adult,Customer Life_stage_Middle-Aged,Customer Life_stage_Senior,Transaction Phase_of_month_Start,Transaction Phase_of_month_Middle,Transaction Phase_of_month_End
0,42.32,1,40,0,282,23,5,1,3.768614,24,...,False,False,False,False,True,False,False,False,False,True
1,301.34,3,35,0,223,0,4,3,5.711552,22,...,False,False,False,False,True,False,False,False,False,True
2,340.32,5,29,0,360,8,3,3,5.83282,22,...,False,False,False,True,False,False,False,False,False,True
3,95.77,5,45,0,325,20,5,1,4.572337,16,...,False,False,False,False,False,True,False,False,True,False
4,77.45,5,42,0,116,15,5,1,4.362461,16,...,False,False,False,False,True,False,False,False,True,False


In [64]:
# List every column name available in the loaded dataset
data.columns

Index(['Transaction Amount', 'Quantity', 'Customer Age', 'Is Fraudulent',
       'Account Age Days', 'Transaction Hour', 'Age Group',
       'Transaction Values Ordinal', 'Log Transaction Amount',
       'Transaction Day_of_month', 'Transaction During_weekend',
       'addressesMatch', 'isPOBox', 'Payment Method_PayPal',
       'Payment Method_bank transfer', 'Payment Method_credit card',
       'Payment Method_debit card', 'Product Category_clothing',
       'Product Category_electronics', 'Product Category_health & beauty',
       'Product Category_home & garden', 'Product Category_toys & games',
       'Device Used_desktop', 'Device Used_mobile', 'Device Used_tablet',
       'ipAddressMatchesUS_False', 'ipAddressMatchesUS_True',
       'ipAddressMatchesUS_Unknown', 'Transaction Time_of_day_Night',
       'Transaction Time_of_day_Morning', 'Transaction Time_of_day_Afternoon',
       'Transaction Time_of_day_Evening', 'Transaction Day_of_week_0',
       'Transaction Day_of_week_1', 'T

To prevent multicollinearity, the multiple columns that represent customer age (raw and binned) must be separated from each other, i.e. they should not be used together as features.

Customer Life stage, Age Group and Customer Age have to be separated.

Transaction Amount, Log Transaction Amount and Transaction Values Ordinal have to be separated as well.

In [65]:
# Keep exactly one representation of each underlying quantity so the model
# does not see several collinear encodings of the same information.
life_stage_cols = ['Customer Life_stage_Teenage', 'Customer Life_stage_Young Adult',
                   'Customer Life_stage_Working Adult', 'Customer Life_stage_Middle-Aged',
                   'Customer Life_stage_Senior']

# Age: 'Age Group' is retained; the life-stage dummies and raw 'Customer Age' go.
age_cols_to_drop = life_stage_cols + ['Customer Age']

# Amount: 'Log Transaction Amount' is retained; the raw amount and
# 'Transaction Values Ordinal' go.
amount_cols_to_drop = ['Transaction Amount', 'Transaction Values Ordinal']

# One non-mutating drop instead of several in-place drops.
# errors="ignore" makes the cell safe to re-run: on a second execution the
# columns are already gone and the original code raised KeyError.
data = data.drop(columns=age_cols_to_drop + amount_cols_to_drop, errors="ignore")

In [66]:
# Separate the target label from the predictor columns
y = data["Is Fraudulent"]
X = data.drop(columns="Is Fraudulent")

# Hold out 20% of the rows for testing, stratified on the target so both
# partitions keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Resample the training partition only with SMOTE; the test partition is
# left untouched
smote = SMOTE(random_state=1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Model Creation and Evaluation


In [67]:
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded to ``precision_recall_curve``.
    probas_pred : array-like
        Predicted scores, forwarded to ``precision_recall_curve``.

    Returns
    -------
    float
        Area under the curve as computed by ``auc``.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    # Recall is the x-axis and precision the y-axis of the PR curve
    return auc(recall, precision)

In [68]:
def _pr_auc_proba_scorer(estimator, X, y):
    """Callable scorer for GridSearchCV: PR AUC from predicted probabilities.

    ``make_scorer(pr_auc)`` would score the hard labels from ``predict``,
    which collapses the precision-recall curve to a single threshold.
    """
    # Assumes a binary 0/1 target, so column 1 holds the positive-class probability.
    return pr_auc(y, estimator.predict_proba(X)[:, 1])


def evaluate_model(model, param_grid, cv):
    """Tune ``model`` by grid search, evaluate it on the test set and save it.

    Reads the module-level ``X_train_res``, ``y_train_res``, ``X_test`` and
    ``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    param_grid : dict
        Hyper-parameter grid passed to ``GridSearchCV``.
    cv : cross-validation splitter
        Splitting strategy passed to ``GridSearchCV``.

    Returns
    -------
    list
        ``[best_model, best_params, y_pred, cm, pr]``: the refitted best
        estimator, its parameters, hard test-set predictions, the confusion
        matrix and the test-set PR AUC.
    """
    # NOTE(review): X_train_res has already been resampled with SMOTE before
    # cross-validation, so validation folds contain synthetic samples and the
    # CV score is likely optimistic. Placing the resampler inside an imblearn
    # Pipeline passed to GridSearchCV would avoid this.
    grid_search = GridSearchCV(model, param_grid,
                               scoring=_pr_auc_proba_scorer, cv=cv, n_jobs=-1,
                               verbose=1)
    # Fit the grid search to the train data
    grid_search.fit(X_train_res, y_train_res)

    # Get the best parameters and best cross-validated score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Print the best training pr_auc
    print(f"Best training PR AUC: {best_score}")

    # GridSearchCV (refit=True by default) has already refitted the best
    # estimator on the full training data, so no second fit is needed.
    best_model = grid_search.best_estimator_

    # Hard predictions feed the confusion matrix
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    # PR AUC needs continuous scores, not thresholded labels
    y_score = best_model.predict_proba(X_test)[:, 1]
    pr = pr_auc(y_test, y_score)
    print(f"PR AUC: {pr}")

    # Make sure the output directory exists so the tuned model is not lost
    # to a FileNotFoundError after a long search.
    os.makedirs("Models", exist_ok=True)
    joblib.dump(best_model, f"Models/{model.__class__.__name__}_model.pkl")

    return [best_model, best_params, y_pred, cm, pr]

In [None]:
# Create meta model: logistic regression


In [72]:
# Repeated stratified CV: 5 splits x 3 repeats = 15 fits per candidate
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Define the parameter grids for grid search
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8]
}

param_grid_xgb = {
    'max_depth': [5, 10],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2],
}

# Seed the estimators like the split, SMOTE and CV (random_state=1) so the
# search and the saved models are reproducible across runs.
rf = evaluate_model(RandomForestClassifier(random_state=1),
                    param_grid_rf, cv)

xgb = evaluate_model(XGBClassifier(random_state=1),
                     param_grid_xgb, cv)


Fitting 15 folds for each of 72 candidates, totalling 1080 fits


In [None]:
# Create stack classifier

In [None]:
# Create shap explainability