# Setup Environment

In [41]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from numpy import mean, std
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import precision_recall_curve, auc, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate, GridSearchCV
from xgboost import XGBClassifier



# Modelling

## Train Test Split

In [42]:
# Read the finalised, cleaned dataset produced by the earlier preparation step.
# index_col=None keeps pandas' default integer row index.
data = pd.read_csv(
    filepath_or_buffer="Cleaned data/finalised_data.csv",
    index_col=None,
)

In [43]:
# Preview the first rows to sanity-check that the dataset loaded as expected
data.head()

Unnamed: 0,Quantity,Is Fraudulent,Account Age Days,Age Group,Transaction Values Ordinal,addressesMatch,isPOBox,Payment Method_PayPal,Payment Method_bank transfer,Payment Method_credit card,...,Transaction_day_of_week_0,Transaction_day_of_week_1,Transaction_day_of_week_2,Transaction_day_of_week_3,Transaction_day_of_week_4,Transaction_day_of_week_5,Transaction_day_of_week_6,Proxy_False,Proxy_True,Proxy_Unknown
0,1,0,282,4,1,1,False,True,False,False,...,False,False,False,False,False,False,True,True,False,False
1,3,0,223,4,3,1,False,False,False,True,...,True,False,False,False,False,False,False,True,False,False
2,5,0,360,3,3,0,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
3,5,0,325,5,1,1,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False
4,5,0,116,4,1,1,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False


In [44]:
# List every column name (the target 'Is Fraudulent' plus the model features)
data.columns

Index(['Quantity', 'Is Fraudulent', 'Account Age Days', 'Age Group',
       'Transaction Values Ordinal', 'addressesMatch', 'isPOBox',
       'Payment Method_PayPal', 'Payment Method_bank transfer',
       'Payment Method_credit card', 'Payment Method_debit card',
       'Product Category_clothing', 'Product Category_electronics',
       'Product Category_health & beauty', 'Product Category_home & garden',
       'Product Category_toys & games', 'Device Used_desktop',
       'Device Used_mobile', 'Device Used_tablet', 'ipAddressMatchesUS_False',
       'ipAddressMatchesUS_True', 'ipAddressMatchesUS_Unknown',
       'Transaction_time_of_day_Night', 'Transaction_time_of_day_Morning',
       'Transaction_time_of_day_Afternoon', 'Transaction_time_of_day_Evening',
       'Transaction_day_of_week_0', 'Transaction_day_of_week_1',
       'Transaction_day_of_week_2', 'Transaction_day_of_week_3',
       'Transaction_day_of_week_4', 'Transaction_day_of_week_5',
       'Transaction_day_of_week_6',

In [45]:
# Separate the target from the features, then hold out a stratified 20% test set
y = data["Is Fraudulent"]
X = data.drop(columns="Is Fraudulent")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Resample the training split only with SMOTE; the test split is left untouched
smote = SMOTE(random_state=1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Model Creation and Evaluation


In [46]:
# Precision-recall AUC evaluation metric
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    probas_pred : array-like
        Scores for the positive class, passed straight through to
        ``precision_recall_curve``.

    Returns
    -------
    float
        Area under the curve with recall on the x-axis and precision on the
        y-axis, as computed by ``sklearn.metrics.auc``.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    # Recall is the x-coordinate, precision the y-coordinate
    area = auc(recall, precision)
    return area

In [47]:
def _pr_auc_scorer(estimator, X, y):
    """Scorer for GridSearchCV: PR AUC from positive-class probabilities.

    Uses the ``scorer(estimator, X, y)`` callable signature so the metric
    receives continuous scores; ``make_scorer(pr_auc)`` with default settings
    would feed it hard ``predict()`` labels instead.
    """
    return pr_auc(y, estimator.predict_proba(X)[:, 1])


def evaluate_model(model, param_grid, cv):
    """Tune ``model`` with a grid search and evaluate the winner on the test set.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. SMOTE is applied inside an imbalanced-learn ``Pipeline`` so
    that, during cross-validation, it is fit on each training fold only and
    the validation folds contain no synthetic samples.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``predict`` and ``predict_proba``.
    param_grid : dict or list of dict
        Hyper-parameter grid keyed by the classifier's own parameter names.
    cv : int or cross-validation splitter
        Passed to ``GridSearchCV`` unchanged.

    Returns
    -------
    list
        ``[best_model, best_params, y_pred, cm, pr]``: the fitted best
        classifier, its parameters (unprefixed), hard test-set predictions,
        the test confusion matrix and the test PR AUC.

    Side effects
    ------------
    Prints the scores and writes the fitted classifier to
    ``Models/<ClassName>_model.pkl``.
    """
    step_prefix = "model__"

    # Resample inside the pipeline so SMOTE never sees validation-fold rows
    pipeline = Pipeline([("smote", SMOTE(random_state=1)), ("model", model)])

    # Re-key the grid(s) so the parameters are routed to the classifier step
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    pipeline_grid = [
        {f"{step_prefix}{name}": values for name, values in grid.items()}
        for grid in grids
    ]

    # define search
    grid_search = GridSearchCV(pipeline, pipeline_grid,
                               scoring=_pr_auc_scorer, cv=cv, n_jobs=-1,
                               verbose=1)
    # Fit the grid search to the (un-resampled) train data
    grid_search.fit(X_train, y_train)

    # Get the best parameters (without the pipeline prefix) and best score
    best_params = {
        name[len(step_prefix):]: value
        for name, value in grid_search.best_params_.items()
    }
    best_score = grid_search.best_score_

    # Print the best mean cross-validated pr_auc
    print(f"Best training PR AUC: {best_score}")

    # GridSearchCV (refit=True by default) has already refit the best pipeline
    # on the whole training set, so no extra fit is needed here.
    best_model = grid_search.best_estimator_.named_steps["model"]

    # Predict on the test data: labels for the confusion matrix,
    # positive-class probabilities for the precision-recall curve
    y_pred = best_model.predict(X_test)
    y_score = best_model.predict_proba(X_test)[:, 1]

    # Create the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    # Calculate pr_auc from probabilities rather than hard labels
    pr = pr_auc(y_test, y_score)
    print(f"PR AUC: {pr}")

    # Save the best model under its class name; the estimator repr contains
    # parentheses (and, for some estimators, newlines), which is not a usable
    # file name.
    model_dir = Path("Models")
    model_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_model, model_dir / f"{type(model).__name__}_model.pkl")

    return [best_model, best_params, y_pred, cm, pr]

In [49]:
# Repeated stratified 10-fold CV with 3 repeats -> 30 fits per candidate
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Define the parameter grids for grid search
param_grid_rf = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}
param_grid_et = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
param_grid_xgb = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2],
}
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

# NOTE: the random-forest grid is 4 * 4 * 4 * 4 = 256 candidates, and with
# 30 CV fits each that is 7,680 model fits, so this cell is slow to run.
# The estimator is seeded so repeated runs give the same result.
rf = evaluate_model(RandomForestClassifier(random_state=1),
                    param_grid_rf, cv)

# The remaining models are currently disabled; uncomment to run them.
# et = evaluate_model(ExtraTreesClassifier(random_state=1),
#                     param_grid_et, cv)

# xgb = evaluate_model(XGBClassifier(random_state=1),
#                      param_grid_xgb, cv)

# ada = evaluate_model(AdaBoostClassifier(random_state=1),
#                      param_grid_ada, cv)

Fitting 30 folds for each of 256 candidates, totalling 7680 fits


KeyboardInterrupt: 

In [None]:
# Evaluation
# - Confusion matrix.
# - Recall must be high so that fraudulent transactions do not go undetected.
# - Potentially also report the area under the curve (AUC).