# Setup Environment

In [20]:
import pandas as pd
import numpy as np
from numpy import mean, std
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import precision_recall_curve, auc, make_scorer
import joblib



# Modelling

## Train Test Split

In [5]:
# Load intermediate dataset
data = pd.read_csv("Cleaned data/finalised_data.csv", index_col=None)

In [6]:
data.head()

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Quantity,Customer Age,Customer Location,IP Address,Shipping Address,Billing Address,...,Payment Method_credit card,Payment Method_debit card,Product Category_clothing,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_desktop,Device Used_mobile,Device Used_tablet
0,c12e07a0-8a06-4c0d-b5cc-04f3af688570,8ca9f102-02a4-4207-ab63-484e83a1bdf0,42.32,2024-03-24 23:42:43,1,40,East Jameshaven,110.87.246.85,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,...,False,False,False,True,False,False,False,True,False,False
1,7d187603-7961-4fce-9827-9698e2b6a201,4d158416-caae-4b09-bd5b-15235deb9129,301.34,2024-01-22 00:53:31,3,35,Kingstad,14.73.104.153,"5230 Stephanie Forge\nCollinsbury, PR 81853","5230 Stephanie Forge\nCollinsbury, PR 81853",...,True,False,False,True,False,False,False,False,False,True
2,f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,ccae47b8-75c7-4f5a-aa9e-957deced2137,340.32,2024-01-22 08:06:03,5,29,North Ryan,67.58.94.93,"195 Cole Oval\nPort Larry, IA 58422","4772 David Stravenue Apt. 447\nVelasquezside, ...",...,False,True,False,False,False,False,True,True,False,False
3,e9949bfa-194d-486b-84da-9565fca9e5ce,b04960c0-aeee-4907-b1cd-4819016adcef,95.77,2024-01-16 20:34:53,5,45,Kaylaville,202.122.126.216,"7609 Cynthia Square\nWest Brenda, NV 23016","7609 Cynthia Square\nWest Brenda, NV 23016",...,True,False,False,True,False,False,False,False,True,False
4,7362837c-7538-434e-8731-0df713f5f26d,de9d6351-b3a7-4bc7-9a55-8f013eb66928,77.45,2024-01-16 15:47:23,5,42,North Edwardborough,96.77.232.76,"2494 Robert Ramp Suite 313\nRobinsonport, AS 5...","2494 Robert Ramp Suite 313\nRobinsonport, AS 5...",...,True,False,True,False,False,False,False,True,False,False


In [None]:
# Feature Selection of Numeric Variables
numeric_columns = data.select_dtypes(include=['int64', 'float64'])
# Check Variance of Numeric Variables
variance = numeric_columns.var()
print(variance)

Transaction Amount    73113.509492
Quantity                  2.001479
Customer Age            100.062697
Is Fraudulent             0.047616
Account Age Days      11419.962167
Transaction Hour         48.664510
Age Group                 1.163202
ShipBill                  0.090124
dtype: float64


In [None]:
# Check Variance of Categorical Variables
categorical_columns = data.select_dtypes(include=['object'])
# Count of Categorical features and remove if lacking variance
for column in categorical_columns.columns:
    print(categorical_columns[column].value_counts())

Transaction ID
15d2e414-8735-46fc-9e02-80b472b2580f    1
a81a89d3-15db-498d-b7a2-715233d5c477    1
e7e0aafc-d2f2-418d-9b7f-cc05cb9b2a8b    1
89fbc566-0421-4859-be00-a4803d0ed21a    1
8406043a-d4e0-463e-8ca4-bc3492c87503    1
                                       ..
6d036b10-29b9-46db-8024-f0ddb8c357aa    1
76fbbad6-34ca-4f1f-a503-dd5013ececab    1
59a01e3a-1539-4d17-bd21-adbfcdbeedcd    1
96cdb681-8b01-4611-a1a5-28a4fac98af3    1
d1a811a2-6015-47fa-95e5-0a5282755932    1
Name: count, Length: 1472694, dtype: int64
Customer ID
d1b87f62-51b2-493b-ad6a-77e0fe13e785    1
befaa147-b877-47f3-85a9-1920711f36fe    1
4bdf4788-d8b9-4e8f-bb69-b5a068c59f71    1
f1e7c30f-d719-4dff-ba6a-4f08fe5d62da    1
3407fadb-fd04-4ebe-947f-9e8541bf68eb    1
                                       ..
b587a76d-5830-4b6f-833b-f7958f8d7647    1
7783a7ea-7a52-4c79-893b-2834b19647b9    1
1c94d0c8-34bc-4df2-b263-7b1f232601fd    1
100fa7e8-8b17-4775-aed0-df01a255ac06    1
637d5ed1-a4af-4234-a1d9-a3791b6f1bb6    1
Name: 

In [7]:
# Remove variables if needed
drop_columns = ['Shipping Address', 'Billing Address', 'Transaction ID', 
                'Customer ID', 'Transaction Date', 'IP Address', 'Customer Location']
data = data.drop(columns=drop_columns)

In [8]:
print(data.isnull().sum())

Transaction Amount                  0
Quantity                            0
Customer Age                        0
Is Fraudulent                       0
Account Age Days                    0
Transaction Hour                    0
Log Transaction Amount              0
Age Group                           0
addressesMatch                      0
isPOBox                             0
ShipBill                            0
Payment Method_PayPal               0
Payment Method_bank transfer        0
Payment Method_credit card          0
Payment Method_debit card           0
Product Category_clothing           0
Product Category_electronics        0
Product Category_health & beauty    0
Product Category_home & garden      0
Product Category_toys & games       0
Device Used_desktop                 0
Device Used_mobile                  0
Device Used_tablet                  0
dtype: int64


In [23]:
# Train test split
X = data.loc[:, data.columns != 'Is Fraudulent']
y = data["Is Fraudulent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)


# Model Creation and Evaluation


In [22]:
# Create AUC eval function
def pr_auc(y_true, probas_pred):
    # calculate precision-recall curve
    p, r, _ = precision_recall_curve(y_true, probas_pred)
    # calculate area under curve
    return auc(r, p)

## Extra Trees Classifier

In [28]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Extra Trees classifier
extra_trees = ExtraTreesClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(extra_trees, param_grid, scoring=make_scorer(pr_auc, needs_proba=True),
                           cv=cv, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best score
print("Best PR-AUC Score:", best_score)

# Save the best model
best_et_model = grid_search.best_estimator_
joblib.dump(best_et_model, "Models/best_et_model.pkl")



Fitting 30 folds for each of 81 candidates, totalling 2430 fits
Best PR-AUC Score: 0.3697155835263206


['Models/best_et_model.pkl']

## Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the random forest classifier
rf = RandomForestClassifier()

# Perform grid search with repeated stratified k-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           scoring=make_scorer(pr_auc), cv=cv, n_jobs=-1,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best pr_auc
print("Best pr_auc:", best_score)

# Create the random forest model with the best parameters
best_rf = RandomForestClassifier(**best_params)

# Fit the model to the data
best_rf.fit(X, y)

# Save model
joblib.dump(best_rf, 'Models/best_rf_model.pkl')

Fitting 30 folds for each of 81 candidates, totalling 2430 fits


Best pr_auc: 0.5217047751394168


['best_model.joblib']

## XGBoost Classifier

In [32]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_recall_curve, auc

# Define the parameter grid for grid search
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

# Create the XGBoost classifier
xgb_clf = xgb.XGBClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, 
                           scoring=make_scorer(pr_auc), cv=cv,
                           n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# Get the best results
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best results
print("Best PR AUC Score:", best_score)

# Save the best model
best_xgb_model = grid_search.best_estimator_
joblib.dump(best_xgb_model, 'Models/best_xgb_model.pkl')

Fitting 30 folds for each of 27 candidates, totalling 810 fits
Best PR AUC Score: 0.5321213877178069


['Models/best_xgb_model.pkl']

In [None]:
#Evaluation
# Confusion Matrix
# Need to ensure a high recall to not let fraudulent transactions go undetected.
#potentially area under curve