1: Import Necessary Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM, SimpleRNN
from tensorflow.keras.utils import to_categorical
import shap
import lime
import lime.lime_tabular
from lime.lime_tabular import LimeTabularExplainer

2: Data Preparation with SMOTE

In [3]:
# Load the dataset
data = pd.read_csv('../../src/data/e-commerce_processed_data.csv')

# Feature and Target Separation
X = data.drop(columns=['class'])
y = data['class']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the class distribution before SMOTE
print("Class distribution before SMOTE:")
print(pd.Series(y_train).value_counts())

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Display the class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

Class distribution before SMOTE:
class
0    108786
1     11250
Name: count, dtype: int64
Class distribution after SMOTE:
class
0    108786
1    108786
Name: count, dtype: int64


3: Model Training

Traditional Machine Learning Models

In [4]:
# Initialize and train the models
lr_model = LogisticRegression(max_iter=1000, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
mlp_model = MLPClassifier(random_state=42)

# Train the models
lr_model.fit(X_train_smote, y_train_smote)
dt_model.fit(X_train_smote, y_train_smote)
rf_model.fit(X_train_smote, y_train_smote)
gb_model.fit(X_train_smote, y_train_smote)
mlp_model.fit(X_train_smote, y_train_smote)

Neural Network Models

In [6]:
# Build the CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train_smote.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train_smote.values.reshape(-1, X_train_smote.shape[1], 1), to_categorical(y_train_smote), epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test)))

# Build the RNN model
rnn_model = Sequential([
    SimpleRNN(64, input_shape=(X_train_smote.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
rnn_model.fit(X_train_smote.values.reshape(-1, X_train_smote.shape[1], 1), to_categorical(y_train_smote), epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test)))

# Build the LSTM model
lstm_model = Sequential([
    LSTM(64, input_shape=(X_train_smote.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_smote.values.reshape(-1, X_train_smote.shape[1], 1), to_categorical(y_train_smote), epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test)))

Epoch 1/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 4ms/step - accuracy: 0.7428 - loss: 0.5078 - val_accuracy: 0.9057 - val_loss: 0.4465
Epoch 2/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4ms/step - accuracy: 0.7747 - loss: 0.4430 - val_accuracy: 0.9430 - val_loss: 0.3521
Epoch 3/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.7794 - loss: 0.4358 - val_accuracy: 0.9415 - val_loss: 0.3768
Epoch 4/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4ms/step - accuracy: 0.7865 - loss: 0.4269 - val_accuracy: 0.9434 - val_loss: 0.3670
Epoch 5/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.7886 - loss: 0.4222 - val_accuracy: 0.9494 - val_loss: 0.3498
Epoch 6/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.7921 - loss: 0.4163 - val_accuracy: 0.9448 - val_loss: 0.3679
Epoch 7/10

  super().__init__(**kwargs)


Epoch 1/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - accuracy: 0.7550 - loss: 0.4799 - val_accuracy: 0.9562 - val_loss: 0.3678
Epoch 2/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step - accuracy: 0.7655 - loss: 0.4594 - val_accuracy: 0.9562 - val_loss: 0.3935
Epoch 3/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.7680 - loss: 0.4548 - val_accuracy: 0.9562 - val_loss: 0.4204
Epoch 4/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.7688 - loss: 0.4534 - val_accuracy: 0.9558 - val_loss: 0.3763
Epoch 5/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4ms/step - accuracy: 0.7689 - loss: 0.4522 - val_accuracy: 0.9558 - val_loss: 0.4086
Epoch 6/10
[1m6800/6800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4ms/step - accuracy: 0.7676 - loss: 0.4531 - val_accuracy: 0.9559 - val_loss: 0.3880
Epoch 7/10

<keras.src.callbacks.history.History at 0x24700c28e60>

4: Model Evaluation

In [8]:
# Predict and evaluate the traditional machine learning models
models = {
    "Logistic Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Multi-Layer Perceptron": mlp_model
}

evaluation_results = []

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    evaluation_results.append({
        "Model": model_name,
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"]
    })

# Evaluate the neural network models
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test))
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test))
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), to_categorical(y_test))

# Collect evaluation results for neural networks
nn_evaluation_results = [
    {"Model": "CNN", "Accuracy": cnn_accuracy},
    {"Model": "RNN", "Accuracy": rnn_accuracy},
    {"Model": "LSTM", "Accuracy": lstm_accuracy}
]

# Create DataFrames for evaluation results
evaluation_df = pd.DataFrame(evaluation_results)
nn_evaluation_df = pd.DataFrame(nn_evaluation_results)

# Combine evaluation results from all models
all_evaluation_df = pd.concat([evaluation_df, nn_evaluation_df], ignore_index=True)

# Display the combined evaluation results in tabular form
print("\nEvaluation Results for All Models:")
print(all_evaluation_df.to_string(index=False))

[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9536 - loss: 0.3209
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9553 - loss: 0.3913
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9476 - loss: 0.3036

Evaluation Results for All Models:
                 Model  Precision   Recall  F1-Score  Accuracy
   Logistic Regression   0.946027 0.949082  0.943539       NaN
         Decision Tree   0.906498 0.894598  0.899808       NaN
         Random Forest   0.953252 0.953547  0.947555       NaN
     Gradient Boosting   0.958133 0.956180  0.950004       NaN
Multi-Layer Perceptron   0.940578 0.945083  0.939949       NaN
                   CNN        NaN      NaN       NaN  0.953914
                   RNN        NaN      NaN       NaN  0.955747
                  LSTM        NaN      NaN       NaN  0.947582


5. Model Explainability

5.1 Using SHAP

In [None]:
# Function to generate SHAP plots for a given model
def generate_shap_plots(model, model_name, X_train, X_test):
    explainer = shap.Explainer(model, X_train)
    shap_values = explainer(X_test)
    
    # SHAP Summary Plot
    plt.title(f'SHAP Summary Plot - {model_name}')
    shap.summary_plot(shap_values, X_test)
    
    # SHAP Force Plot for a single prediction
    plt.title(f'SHAP Force Plot (Single Prediction) - {model_name}')
    shap.initjs()
    shap.force_plot(explainer.expected_value[1], shap_values[0].values, X_test.iloc[0,:])
    
    # SHAP Dependence Plot for all features
    for feature in X_test.columns:
        plt.title(f'SHAP Dependence Plot - {model_name} - Feature: {feature}')
        shap.dependence_plot(feature, shap_values, X_test)

# Generate SHAP plots for all models
models = {
    "Logistic Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Multi-Layer Perceptron": mlp_model,
    "CNN": cnn_model,
    "RNN": rnn_model,
    "LSTM": lstm_model
}

for model_name, model in models.items():
    generate_shap_plots(model, model_name, X_train_smote, X_test)

3: Using LIME

In [None]:
# Function to generate LIME explanations for a given model
def generate_lime_explanations(model, model_name, X_train, X_test):
    lime_explainer = LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['Non-Fraud', 'Fraud'], mode='classification')
    
    # Explain a prediction
    exp = lime_explainer.explain_instance(X_test.iloc[0].values, model.predict_proba, num_features=len(X_test.columns))
    
    # LIME Feature Importance Plot
    plt.title(f'LIME Feature Importance Plot - {model_name}')
    exp.show_in_notebook(show_table=True, show_all=False)

# Generate LIME explanations for all models
for model_name, model in models.items():
    generate_lime_explanations(model, model_name, X_train_smote, X_test)

5: Feature Importance

In [None]:
# Calculate feature importances from the Random Forest model
feature_importances = rf_model.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.title('Feature Importances from Random Forest')
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# Display the top features
print("Top 10 Important Features:")
print(feature_importance_df.head(10))