In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
import networkx as nx


#sys.path.append('../prepro/')

# Import the variables from prepro_trans_smote
#from prepro_trans_smote import X_res, y_res




from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

In [2]:
data = pd.read_pickle("../data/transactions.pkl")

X = data.drop(['fraud'],axis=1)
y = data['fraud']

## RF with graph

In [3]:
# Split data into train, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(data.drop(['fraud'], axis=1), data['fraud'], test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42) 

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_valid.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 356785 samples
Validation set size: 118929 samples
Testing set size: 118929 samples


In [4]:
# Function to add graph features
def add_graph_features(df):
    G = nx.from_pandas_edgelist(df, 'customer', 'merchant', create_using=nx.Graph())
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)

    df['degree_centrality'] = df['customer'].map(degree_centrality).fillna(0)
    df['closeness_centrality'] = df['customer'].map(closeness_centrality).fillna(0)
    df['betweenness_centrality'] = df['customer'].map(betweenness_centrality).fillna(0)
    return df

# Apply graph features
X_train = add_graph_features(X_train)
X_valid = add_graph_features(X_valid)
X_test = add_graph_features(X_test)

In [6]:
# Define the model and parameter grid for GridSearch
model = RandomForestClassifier(random_state=42, verbose=1, class_weight="balanced")
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [8, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='recall', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions using the best model
y_pred_valid = best_model.predict(X_valid)
y_pred_test = best_model.predict(X_test)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   38.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 1

[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  40.1s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  27.4s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  28.0s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  56.5s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  53.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3min
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  28.4s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time= 1.0min
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time= 1.5min
[CV] END max_depth=8, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  31.

In [7]:
# Evaluation on validation and test data
print("Validation Set Classification Report: \n", classification_report(y_valid, y_pred_valid))
print("Validation Set Confusion Matrix: \n", confusion_matrix(y_valid, y_pred_valid))
print("Test Set Classification Report: \n", classification_report(y_test, y_pred_test))
print("Test Set Confusion Matrix: \n", confusion_matrix(y_test, y_pred_test))

Validation Set Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    117536
           1       0.45      0.92      0.61      1393

    accuracy                           0.99    118929
   macro avg       0.72      0.95      0.80    118929
weighted avg       0.99      0.99      0.99    118929

Validation Set Confusion Matrix: 
 [[115965   1571]
 [   107   1286]]
Test Set Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    117512
           1       0.47      0.93      0.63      1417

    accuracy                           0.99    118929
   macro avg       0.74      0.96      0.81    118929
weighted avg       0.99      0.99      0.99    118929

Test Set Confusion Matrix: 
 [[116045   1467]
 [    97   1320]]


In [8]:
# Function to calculate specificity
def calculate_specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return specificity

# Predictions and scoring
y_pred_valid = best_model.predict(X_valid)
y_pred_test = best_model.predict(X_test)

# Calculate specificity
specificity_valid = calculate_specificity(y_valid, y_pred_valid)
specificity_test = calculate_specificity(y_test, y_pred_test)

# Print metrics
print(f"Specificity for Validation Set: {specificity_valid:.4f}")
print(f"Specificity for Test Set: {specificity_test:.4f}")


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Specificity for Validation Set: 0.9866
Specificity for Test Set: 0.9875


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
