This notebook provides a comprehensive examination of the dataset utilised in the fraud detection model. It details the application of advanced data cleaning, feature extraction, and feature engineering techniques, with the aim of optimising the dataset and identifying the most relevant metrics to enhance predictive accuracy.

In [None]:
import logging
import warnings
from datetime import datetime
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import shap
from bson.json_util import dumps
from imblearn.over_sampling import SMOTE
from plotly.subplots import make_subplots
from plotnine import (ggplot, aes, geom_line, geom_bar, labs,theme_minimal, theme, element_rect)
from plotnine.data import economics
from pymongo import MongoClient
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

MongoDB is used as the database because Cassandra is not available. The data is stored as JSON documents, which are extracted below to build the data pipeline.

In [None]:
# Connect to the local MongoDB server and select the collection that holds
# the transaction documents.
MONGO_URI = "mongodb://localhost:27017/"

client = MongoClient(MONGO_URI)
db = client["local"]
collections = db["inueron.ai data"]

In [None]:
# Serialise every MongoDB document to an Extended-JSON string, then let pandas
# parse the combined JSON array into a DataFrame.
records = [dumps(document) for document in collections.find()]

# Passing a literal JSON string to read_json is deprecated since pandas 2.1,
# so the payload is wrapped in a file-like buffer.
df = pd.read_json(StringIO(f"[{','.join(records)}]"))

In [None]:
def _parse_extended_json_date(raw):
    """Turn a ``{'$date': '<ISO-8601 text>'}`` mapping into a ``datetime``.

    A trailing ``'Z'`` is rewritten to ``'+00:00'`` before the text is handed
    to ``datetime.fromisoformat``.
    """
    iso_text = raw['$date'].replace('Z', '+00:00')
    return datetime.fromisoformat(iso_text)


df['TX_DATETIME'] = df['TX_DATETIME'].apply(_parse_extended_json_date)

In [None]:
# Binary target: 0 for rows labelled 'Legitimate', 1 for every other value.
df['TX_FRAUD'] = df['TX_FRAUD'].ne('Legitimate').astype('int64')

In [None]:
# Show the column index only when its labels are stored with object dtype.
columns_are_object = df.columns.dtype == "object"
if columns_are_object:
    print(df.columns)

Index(['_id', 'TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD_SCENARIO',
       'TX_FRAUD', 'TX_YEAR'],
      dtype='object')


In [None]:
# Calendar month taken from the transaction timestamp.
df['TX_MONTH'] = df['TX_DATETIME'].dt.month

In [None]:
# Calendar year taken from the transaction timestamp (overwrites the loaded TX_YEAR column).
df['TX_YEAR'] = df['TX_DATETIME'].dt.year

In [None]:
# Discard the MongoDB document id. `errors='ignore'` keeps the cell re-runnable
# once the column is already gone; `axis` is unnecessary when `columns=` is given.
df = df.drop(columns=['_id'], errors='ignore')

In [None]:
def is_night(timestamp):
    """Return 1 when ``timestamp.hour`` is 6 or lower, otherwise 0."""
    return 1 if timestamp.hour <= 6 else 0
def is_weekend(timestamp):
    """Return 1 when ``timestamp.weekday()`` is 5 or 6 (Saturday/Sunday), otherwise 0."""
    return 1 if timestamp.weekday() >= 5 else 0

In [None]:
# Derive the night / weekend indicator columns from the transaction timestamp.
tx_timestamps = df['TX_DATETIME']
df['TX_NIGHT'] = tx_timestamps.map(is_night)
df['TX_WEEKEND'] = tx_timestamps.map(is_weekend)

In [None]:
# Cut-offs for the two threshold flags below.
HIGH_AMOUNT_THRESHOLD = 150
# NOTE(review): this value looks like a precomputed statistic of TX_TIME_SECONDS
# (possibly its mean) — confirm where it comes from.
HIGH_TIME_SECONDS_THRESHOLD = 7903233.708571933

# 1 when the amount is at least the cut-off, else 0.
df['TX_IS_AMOUNT_HIGH'] = df['TX_AMOUNT'].ge(HIGH_AMOUNT_THRESHOLD).astype('int64')
# 1 when TX_TIME_SECONDS is strictly above the cut-off, else 0.
df['TX_TIME_TAKEN_HIGH'] = df['TX_TIME_SECONDS'].gt(HIGH_TIME_SECONDS_THRESHOLD).astype('int64')

In [None]:
# One-hot encode the fraud scenario label and append the indicator columns to the frame.
fraud_scenarios = pd.get_dummies(
    df['TX_FRAUD_SCENARIO'],
    prefix="FRAUD_SCENARIO",
    dtype=int,
)
df = pd.concat([df, fraud_scenarios], axis="columns")

In [None]:
# Preview the frame after the feature-engineering cells above.
df

         TRANSACTION_ID  ... FRAUD_SCENARIO_Random Fraud
0                316917  ...                           0
1                316918  ...                           0
2                316919  ...                           0
3                316920  ...                           0
4                316921  ...                           0
...                 ...  ...                         ...
1754150          374287  ...                           0
1754151          374288  ...                           0
1754152          374289  ...                           0
1754153          374290  ...                           0
1754154          374291  ...                           0

[1754155 rows x 19 columns]


Exploratory data analysis (EDA) is performed below to visualise the main patterns in the data.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 2x2 dashboard; subplot titles are listed row by row.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Transaction Amount Distribution",
        "Hourly Transaction Patterns",
        "Frauds Reported by Scenario",
        "Monthly Transaction Trends"
    )
)

# (1, 1) Spread of transaction amounts.
fig.add_trace(
    go.Box(y=df['TX_AMOUNT'], name="Transaction Amount"),
    row=1, col=1
)

# (1, 2) Transactions per hour of day. The hour is derived from TX_DATETIME here
# because the TX_HOUR column is only added in a later cell; grouping by a
# not-yet-existing column would raise KeyError on a fresh top-to-bottom run.
tx_hour = df['TX_DATETIME'].dt.hour.rename('TX_HOUR')
hourly_data = df.groupby(tx_hour).size().reset_index(name='Count')
fig.add_trace(
    go.Bar(x=hourly_data['TX_HOUR'], y=hourly_data['Count'], name="Hourly Transactions"),
    row=1, col=2
)

# (2, 1) Number of rows flagged as fraud (TX_FRAUD == 1) within each scenario.
fraud_scenario = df['TX_FRAUD'].eq(1).groupby(df['TX_FRAUD_SCENARIO']).sum()
fig.add_trace(
    go.Bar(x=fraud_scenario.index, y=fraud_scenario.values, name="Frauds by Scenario"),
    row=2, col=1
)

# (2, 2) Transaction volume per calendar month.
monthly_data = df.groupby('TX_MONTH').size().reset_index(name='Count')
fig.add_trace(
    go.Scatter(x=monthly_data['TX_MONTH'], y=monthly_data['Count'], mode='lines+markers', name="Monthly Trends"),
    row=2, col=2
)

fig.update_layout(
    title_text="Comprehensive EDA Dashboard",
    height=800,
    showlegend=True
)

fig.show()

In [None]:
# Replace TX_AMOUNT values whose z-score exceeds +/- `threshold`:
# high outliers become the 75th percentile, low outliers the 25th percentile.
# All statistics are taken from the column before any value is replaced.
mean = df['TX_AMOUNT'].mean()
std = df['TX_AMOUNT'].std()
threshold = 3
outliers = []  # never populated; kept so the name stays defined for any later use
twentyfithpercentile = np.percentile(df['TX_AMOUNT'], 25)
seventyfifthpercentile = np.percentile(df['TX_AMOUNT'], 75)

# Boolean masks replace the former row-by-row Python loop; the masks are built
# from the original values, so the result is the same as the loop's.
z_scores = (df['TX_AMOUNT'] - mean) / std
high_outliers = z_scores > threshold
low_outliers = z_scores < -threshold

df.loc[high_outliers, 'TX_AMOUNT'] = seventyfifthpercentile
df.loc[low_outliers, 'TX_AMOUNT'] = twentyfithpercentile

# NOTE(review): the statistics use every row (train and test alike) and the
# cell is not idempotent — re-running it recomputes them on already-capped data.
outlier_count = int(high_outliers.sum() + low_outliers.sum())

print(outlier_count)


12039


In [None]:
# Hour-of-day component of the transaction timestamp.
tx_datetimes = df['TX_DATETIME']
df['TX_HOUR'] = tx_datetimes.dt.hour

In [None]:
# 1 when the transaction hour falls in the morning (8-10) or evening (16-18) window, else 0.
RUSH_HOURS = [8, 9, 10, 16, 17, 18]
df['TX_RUSH_HOUR'] = df['TX_HOUR'].isin(RUSH_HOURS).astype('int64')

In [None]:
# Engineer the remaining amount/hour flags BEFORE assembling the feature matrix.
# Columns added to `df` after `X` and the train/test split are built never reach
# the model, which is what happened to these three features previously.
df['TX_AMOUNT_ROUNDED'] = (df['TX_AMOUNT'] % 100).eq(0).astype('int64')
df['TX_LUNCH_TIME'] = df['TX_HOUR'].isin([11, 12, 13, 14]).astype('int64')
df['TX_LATE_NIGHT'] = df['TX_HOUR'].isin([23, 0, 1, 2, 3]).astype('int64')

# Feature matrix: everything except the target, the raw timestamp, and the
# fraud-scenario label with its one-hot columns.
# NOTE(review): identifier-like columns (TRANSACTION_ID, CUSTOMER_ID, TERMINAL_ID)
# remain in X — confirm they are meant to be model inputs.
X = df.drop(columns=[
    'TX_FRAUD',
    'TX_DATETIME',
    'TX_FRAUD_SCENARIO',
    'FRAUD_SCENARIO_Large Amount',
    'FRAUD_SCENARIO_Leaked data',
    'FRAUD_SCENARIO_Legitimate',
    'FRAUD_SCENARIO_Random Fraud',
])
y = df['TX_FRAUD']

# 70/30 split, stratified on the target so both parts keep the same fraud rate.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Oversample the minority class in the training split only, then standardise.
# NOTE(review): SMOTE runs on the raw, unscaled columns and the scaler is then
# fitted on the resampled rows — confirm this order is intended.
smote = SMOTE(random_state=42)
X_train_smo, y_train_smo = smote.fit_resample(x_train, y_train)

# Fit the scaler on the resampled training data and reuse it for the test split.
scaler = StandardScaler()
scaler.fit(X_train_smo)
X_train_scaler = scaler.transform(X_train_smo)
X_test_scaled = scaler.transform(x_test)

Various machine learning models, including Decision Tree, CatBoost, Gradient Boosting, and Random Forest, were evaluated. However, XGBoost, being an ensemble method that leverages weak learners, demonstrated superior performance in achieving an optimal balance between precision and recall. As a result, XGBoost was selected for the final model. Given the highly imbalanced nature of the dataset, XGBoost performed consistently well. Hyperparameter tuning was conducted using RandomizedSearchCV, as the computational resources for GridSearchCV were unavailable.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
import warnings

# Silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Discrete search space sampled by RandomizedSearchCV.
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'scale_pos_weight': [1, 2, 3, 5],
}

# Metrics recorded for every candidate; the model is selected on 'f1'.
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# `use_label_encoder` is deprecated in recent XGBoost releases and is no longer passed.
base_model = XGBClassifier(
    eval_metric='logloss',
    objective='binary:logistic',
    random_state=42
)

# The `scoring` dict above was previously defined but never used. It is now
# passed to the search, with `refit='f1'` keeping F1 as the selection metric
# while precision and recall are also stored in `cv_results_`.
# NOTE(review): the search cross-validates on the SMOTE-resampled training
# matrix, so validation folds contain synthetic rows and the CV score is likely
# optimistic compared with the untouched test split.
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_distributions,
    n_iter=50,
    scoring=scoring,
    refit='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

random_search.fit(X_train_scaler, y_train_smo)

best_model = random_search.best_estimator_

# Evaluate the refitted best estimator on the held-out, scaled test split.
y_pred = best_model.predict(X_test_scaled)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nBest Parameters:")
for param, value in random_search.best_params_.items():
    print(f"{param}: {value}")

print(f"\nBest Cross-Validation Score: {random_search.best_score_:.4f}")

# With multi-metric scoring the result columns are suffixed by metric name.
cv_results = pd.DataFrame(random_search.cv_results_)
cv_results = cv_results.sort_values('rank_test_f1')

print("\nTop 5 Parameter Combinations:")
top_params = cv_results[
    ['params', 'mean_test_f1', 'std_test_f1', 'mean_test_precision', 'mean_test_recall']
].head()
print(top_params.to_string())

Fitting 5 folds for each of 50 candidates, totalling 250 fits

Classification Report:
              precision    recall  f1-score   support

           0     0.9931    0.7467    0.8525    521843
           1     0.0128    0.3892    0.0248      4404

    accuracy                         0.7437    526247
   macro avg     0.5030    0.5679    0.4386    526247
weighted avg     0.9849    0.7437    0.8455    526247


Best Parameters:
subsample: 0.8
scale_pos_weight: 3
n_estimators: 500
min_child_weight: 5
max_depth: 10
learning_rate: 0.2
gamma: 0.2
colsample_bytree: 0.8

Best Cross-Validation Score: 0.8834

Top 5 Parameter Combinations:
                                                                                                                                                                 params  mean_test_score  std_test_score
7   {'subsample': 0.8, 'scale_pos_weight': 3, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree'

The model's performance is visualised below to give a clearer understanding of its behaviour. SHAP feature importance provides additional detail on how much each feature contributes to the model's predictions.

In [None]:
# ROC needs continuous scores: hard 0/1 predictions collapse the curve to a
# single operating point, so the positive-class probability is used instead.
y_score = best_model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print(f"ROC AUC: {roc_auc:.4f}")

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Explain a 1,000-row sample of the *scaled* test matrix — the representation
# the model was fitted on. Column names are restored so they line up with
# `x_test.columns` below. Calling the explainer returns an Explanation object
# whose `.values` attribute holds the per-row, per-feature SHAP values.
shap_sample = pd.DataFrame(X_test_scaled[:1000], columns=x_test.columns)
explainer = shap.Explainer(best_model)
shap_values = explainer(shap_sample)

# Global importance = mean absolute SHAP value per feature.
shap_importance = np.abs(shap_values.values).mean(axis=0)
feature_importance = pd.DataFrame({
    'feature': x_test.columns,
    'importance': shap_importance
}).sort_values('importance', ascending=False)

print("Feature Importance:\n", feature_importance)

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "AUC-ROC Curve",
        "Confusion Matrix",
        "SHAP Feature Importance",
        ""
    )
)

# (1, 1) ROC curve with the chance diagonal for reference.
fig.add_trace(
    go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {roc_auc:.3f})', line=dict(color='blue')),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(dash='dash', color='red')),
    row=1, col=1
)

# (1, 2) Confusion matrix heatmap with labelled axes.
fig.add_trace(
    go.Heatmap(
        z=cm,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Viridis',
        showscale=True,
        text=cm,
        hovertemplate="%{text}",
        colorbar=dict(title="Count"),
    ),
    row=1, col=2
)

# (2, 1) SHAP importance — placed in the panel that carries its title.
fig.add_trace(
    go.Bar(x=feature_importance['feature'], y=feature_importance['importance'], name='Feature Importance', marker=dict(color='green')),
    row=2, col=1
)

fig.update_layout(
    title_text="Model Evaluation and Feature Importance",
    showlegend=False
)

fig.show()