In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Fraud_Data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


In [3]:
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [4]:
df.isnull().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

A data exploration

In [5]:
fraud_label = df.loc[df['class'] == 1]
non_fraud_label =  df.loc[df['class'] == 0]

print(f"This dataset has a total number of {len(fraud_label)} fraud labels and a total number of {len(non_fraud_label)} non-fraud labels ")

This dataset has a total number of 14151 fraud labels and a total number of 136961 non-fraud labels 


Observation: In this scenario, there seems to be a huge number of class imbalance, because the fraud labels and non fraud labels difference is huge

In [7]:
# converting the signup_time and the purchase_time columns to datetime
import datetime

df['signup_time'] = pd.to_datetime(df['signup_time'])

df['purchase_time'] = pd.to_datetime(df['purchase_time'])

df['signup_hour'] = df['signup_time'].dt.hour

df['purchase_hour'] = df['purchase_time'].dt.hour

# An Important Observation When Building Trad ML For Fraud Prediction

In fraud prediction models, F1 score is generally more important than Accuracy for both model evaluation and comparison. 

This is because fraud detection often involves imbalanced datasets where the cost of false negatives (FN) can be significant. 

F1 SCore balances precision and recall, this makes it effective in a scenario where bpth false positives and false negatives are critical.

Relying on Accuracy can be misleading, especially if the model predicts the majority class well but fails to identify fraud effectively

In [13]:
# Building the Traditional Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [14]:
df.columns

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class',
       'signup_hour', 'purchase_hour'],
      dtype='object')

In [15]:
X = df.drop(['class', 'signup_time', 'purchase_time', 'user_id', 'device_id', 'ip_address'], axis=1)
y = df['class']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
cat_cols = ['source', 'browser', 'sex']

In [18]:
num_col = ['purchase_value', 'age']

In [19]:
ohe = OneHotEncoder()
scaler = RobustScaler()


preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_col),
        ('cat', ohe, cat_cols)
    ]
)

preprocessor

In [20]:
# the classifiers i intend to work with
# hyperparameter optimisation isn't done yet
models  = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost Classifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    
}

In [21]:
# train and evaluate each model
from sklearn.pipeline import Pipeline

for model_name, model in  models.items():
    pipeline = Pipeline(
        steps=[('preprocessor', preprocessor),
               ('classifier', model)]
    )

    # fit the model
    pipeline.fit(X_train, y_train)

    # make a prediction on the test set
    y_pred = pipeline.predict(X_test)

    #evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # print the results 
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("The Classification Report:")
    print(report)
    print("\n" + "-"*60 + "\n")

Model: RandomForest
Accuracy: 0.9206
The Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     27373
           1       0.60      0.48      0.53      2850

    accuracy                           0.92     30223
   macro avg       0.77      0.72      0.74     30223
weighted avg       0.91      0.92      0.92     30223


------------------------------------------------------------

Model: LogisticRegression
Accuracy: 0.9057
The Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223


------------------------------------------------------------

Model: XGBoost Classifier
Accuracy: 0.9096
The Classification Report:
              pr

In [22]:
# now i will apply SMOTE because I need to fix the issue of the class imbalance

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

for model_name, model in  models.items():
    pipeline = Pipeline(
        steps=[('preprocessor', preprocessor),
               ('smote', SMOTE(random_state=42)),
               ('classifier', model)]
    )

    # fit the model
    pipeline.fit(X_train, y_train)

    # make a prediction on the test set
    y_pred = pipeline.predict(X_test)

    #evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # print the results 
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("The Classification Report:")
    print(report)
    print("\n" + "-"*60 + "\n")

Model: RandomForest
Accuracy: 0.9051
The Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27393
           1       0.49      0.56      0.52      2830

    accuracy                           0.91     30223
   macro avg       0.72      0.75      0.74     30223
weighted avg       0.91      0.91      0.91     30223


------------------------------------------------------------

Model: LogisticRegression
Accuracy: 0.5344
The Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.54      0.68     27393
           1       0.10      0.50      0.17      2830

    accuracy                           0.53     30223
   macro avg       0.51      0.52      0.42     30223
weighted avg       0.84      0.53      0.63     30223


------------------------------------------------------------

Model: XGBoost Classifier
Accuracy: 0.7986
The Classification Report:
              pr

In [23]:
# hyperparameter optimization

models = {
    'RandomForest': {
        'classifier': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2,5,10],
            'classifier__min_samples_leaf': [1,2,4]
        }
    },

    'LogisticRegression' : {
        'classifier': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20),
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear']
        }
    },
    'XGBoost Classifier' : {
        'classifier' : XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'params' : {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 6, 10],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
            'classifier__subsample': [0.7, 0.8, 0.9, 1.0]
        }
    }
}


for model_name, model_dict in models.items():
    pipeline = Pipeline(steps= [
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model_dict['classifier'])
    ])

    random_cv = RandomizedSearchCV(
        pipeline,
        param_distributions= model_dict['params'],
        n_iter= 10,
        scoring= 'accuracy',
        cv=3,
        random_state=42,
        n_jobs= -1
    )

    # fitting the model with RandomizedSearchCV
    random_cv.fit(X_train, y_train)

    # predict on the test set
    y_pred = random_cv.best_estimator_.predict(X_test)

    #model evaluation
    accuracy= accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"The Models Best Parameters: {random_cv.best_params_}")
    print(f"Accuracy score: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print("\n" + "-"*60 + "\n")

Model: RandomForest
The Models Best Parameters: {'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': None}
Accuracy score: 0.9075
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27393
           1       0.51      0.56      0.53      2830

    accuracy                           0.91     30223
   macro avg       0.73      0.75      0.74     30223
weighted avg       0.91      0.91      0.91     30223


------------------------------------------------------------

Model: LogisticRegression
The Models Best Parameters: {'classifier__solver': 'liblinear', 'classifier__penalty': 'l1', 'classifier__C': 0.0006951927961775605}
Accuracy score: 0.7460
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.80      0.85     27393
           1       0.10      0.22      0.14      2830

    accurac

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Model: XGBoost Classifier
The Models Best Parameters: {'classifier__subsample': 0.7, 'classifier__n_estimators': 50, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.3}
Accuracy score: 0.8628
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.89      0.92     27393
           1       0.35      0.56      0.43      2830

    accuracy                           0.86     30223
   macro avg       0.65      0.73      0.68     30223
weighted avg       0.90      0.86      0.88     30223


------------------------------------------------------------



# Model Comparisons After The Hyperparameter Tuning

Random Forest Model:

- F1 score for class 0 (non-fraud): 0.95
- F1 score for class 1 (fraud): 0.53
- Weighted average F1 score: 0.91


Logistic Regression Model:

- F1 score for class 0 (non-fraud): 0.85
- F1 score for class 1 (fraud): 0.14
- Weighted average F1 score: 0.78


XGBoost Classifier:

- F1 score for class 0 (non-fraud): 0.92
- F1 score for class 1 (fraud): 0.43
- Weighted average F1 score: 0.88


Overall performance:

Random Forest still performs the best, followed closely by XGBoost, and then Logistic Regression.

The weighted average F1 scores are: 

- Random Forest (0.91) 
- XGBoost (0.88) 
- Logistic Regression (0.78).


Non-fraud detection (class 0):

Random Forest (0.95) slightly outperforms XGBoost (0.92) and Logistic Regression (0.85).

All models perform well in identifying non-fraudulent cases.


Fraud detection (class 1):

Random Forest (0.53) performs best, followed by XGBoost (0.43), with Logistic Regression (0.14) significantly behind.

This is crucial for fraud detection, as identifying the minority class (fraudulent cases) is typically the primary goal.


Model ranking based on F1 scores:

- Random Forest
- XGBoost
- Logistic Regression

For this fraud prediction task, both ensemble methods (Random Forest and XGBoost) did display superior performance compared to the simpler Logistic Regression model, highlighting the effectiveness of more complex, tree-based algorithms for this particular research topic