In [1]:
import timeit
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.metrics import recall_score, precision_score

#from genetic_selection import GeneticSelectionCV
from sklearn_genetic import GAFeatureSelectionCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from tabulate import tabulate

from utilities import evaluation_metrics, scores_table, plot_barchart, plot_roc_auc, grid_search_cv
#pip install sklearn-genetic
#pip install sklearn-genetic-opt



In [2]:
# Training split: the selected V* columns plus Amount and the Class target.
train_df = pd.read_csv(filepath_or_buffer='train_selected.csv')
train_df.head(n=5)

Unnamed: 0,V4,V7,V10,V12,V14,V16,V17,V21,V24,V28,Amount,Class
0,1.681259,0.937822,-0.116385,-0.523544,0.987795,0.084641,-0.136063,-0.112173,0.631496,0.103991,465.0,0
1,0.29782,-2.571801,1.269865,1.598242,0.587305,0.124582,-0.352808,-0.38584,0.635952,0.315491,6.01,0
2,0.08384,0.816489,0.024556,0.111034,0.875725,-0.634152,-0.230492,0.153164,0.098628,-0.07709,16.37,0
3,1.496151,0.29842,-0.209503,0.911417,-0.044924,-0.736753,0.225039,-0.010828,0.115813,0.041414,145.67,0
4,2.810175,2.216271,0.779595,-0.332759,0.045206,0.471108,-0.643196,0.036636,0.401559,-0.302437,239.0,0


In [3]:
# Validation split with the same column layout as the training file.
val_df = pd.read_csv(filepath_or_buffer='val_selected.csv')
val_df.head(n=5)

Unnamed: 0,V4,V7,V10,V12,V14,V16,V17,V21,V24,V28,Amount,Class
0,0.327766,0.252869,-0.615363,0.10784,-0.978809,0.288609,0.342724,-0.372971,-0.104115,-0.02137,9.99,0
1,0.597403,1.179119,-0.540011,0.625409,-0.758539,0.031087,0.305243,0.007156,-0.512283,0.239459,6.9,0
2,0.857215,0.002159,-0.551364,0.56388,0.762662,-1.511864,1.282341,0.310285,-0.624707,-0.030211,1.5,0
3,-0.586648,3.147745,-0.744389,-0.110292,-0.42719,1.249822,-1.502131,0.133062,0.206276,-0.139528,756.04,0
4,-0.162242,0.527353,0.143305,1.081719,0.108511,0.058658,-0.439721,-0.135906,0.014172,0.145412,8.91,0


In [4]:
# Held-out test split with the same column layout as the training file.
test_df = pd.read_csv(filepath_or_buffer='test_selected.csv')
test_df.head(n=5)

Unnamed: 0,V4,V7,V10,V12,V14,V16,V17,V21,V24,V28,Amount,Class
0,-0.277591,0.991747,-1.744176,1.145865,-0.096612,-0.294159,-0.082378,0.856441,0.536058,0.236036,1217.0,0
1,-2.770975,-0.49124,-0.673573,1.705658,0.23769,-2.936531,0.484655,-0.416346,0.009622,0.014826,26.31,0
2,-1.028361,3.308312,-2.012017,0.29991,-1.45117,0.995288,-0.008342,-0.084593,-0.611013,0.253554,680.91,0
3,0.070326,0.8867,-0.006214,0.697163,0.655563,-0.686316,-0.293587,0.163748,0.128632,-0.075323,12.92,0
4,-0.192904,0.467901,7.407039,-1.476121,-1.337504,-0.712907,-0.650421,-0.477313,-1.535126,0.758996,28.9,0


In [5]:
# (rows, columns) of the training split.
train_df.shape

(339912, 12)

In [6]:
# Per-column summary statistics, transposed so each column becomes a row.
# NOTE(review): `df` is not assigned in any cell visible in this notebook (only
# train_df, val_df and test_df are loaded), so this raises NameError on a fresh
# kernel. The recorded output lists Time and V1..V9, which the *_selected
# frames do not contain — presumably `df` was the raw dataset; confirm.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,284807.0,94813.86,47488.145955,0.0,54201.5,84692.0,139320.5,172792.0
V1,284807.0,1.168375e-15,1.958696,-56.40751,-0.920373,0.018109,1.315642,2.45493
V2,284807.0,3.416908e-16,1.651309,-72.715728,-0.59855,0.065486,0.803724,22.057729
V3,284807.0,-1.379537e-15,1.516255,-48.325589,-0.890365,0.179846,1.027196,9.382558
V4,284807.0,2.074095e-15,1.415869,-5.683171,-0.84864,-0.019847,0.743341,16.875344
V5,284807.0,9.604066e-16,1.380247,-113.743307,-0.691597,-0.054336,0.611926,34.801666
V6,284807.0,1.487313e-15,1.332271,-26.160506,-0.768296,-0.274187,0.398565,73.301626
V7,284807.0,-5.556467e-16,1.237094,-43.557242,-0.554076,0.040103,0.570436,120.589494
V8,284807.0,1.213481e-16,1.194353,-73.216718,-0.20863,0.022358,0.327346,20.007208
V9,284807.0,-2.406331e-15,1.098632,-13.434066,-0.643098,-0.051429,0.597139,15.594995


In [7]:
# Column dtypes and non-null counts.
# NOTE(review): `df` is not defined in the visible cells — confirm where it is loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [8]:
# Number of missing values per column.
# NOTE(review): `df` is not defined in the visible cells — confirm where it is loaded.
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): `df` is not defined in the visible cells — confirm where it is loaded.
df.duplicated().sum()

1081

In [10]:
# Keep only the first occurrence of each fully duplicated row, then re-count
# duplicates as a sanity check (the result should be 0).
df = df.loc[~df.duplicated()]
df.duplicated().sum()

0

In [11]:
# Report the frame's dimensions after de-duplication. shape[0] is the number
# of rows and shape[1] the number of columns, so the message labels them as
# such instead of "attributes" and "features".
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns after duplicates have been removed")

There are 283726 attributes and 31 features after duplicates have been removed


## BASE MODELS - WITHOUT FEATURE SELECTION OR OVERSAMPLING

In [30]:
# Baseline estimators and their display names, declared side by side so the
# two lists cannot drift out of order. A fixed seed is passed wherever the
# estimator accepts one; SVC enables probability estimates.
model_names, models = (
    list(column)
    for column in zip(
        ('Logistic Regression', LogisticRegression(random_state=42)),
        ('SVM', SVC(random_state=42, probability=True)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42)),
    )
)

## TRAINING PIPELINE WITH FEATURE SCALING

### STANDARDIZATION

In [31]:
# One scaler -> classifier pipeline per baseline estimator, in `models` order.
pipelines = [
    Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', estimator),
    ])
    for estimator in models
]

In [32]:
# Display the assembled pipelines to check each one pairs a StandardScaler with a classifier.
pipelines

[Pipeline(steps=[('scaler', StandardScaler()),
                 ('classifier', LogisticRegression(random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('classifier', SVC(probability=True, random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()), ('classifier', GaussianNB())]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('classifier', RandomForestClassifier(random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('classifier',
                  XGBClassifier(base_score=None, booster=None, callbacks=None,
                                colsample_bylevel=None, colsample_bynode=None,
                                colsample_bytree=None,
                                early_stopping_rounds=None,
                                enable_categorical=False, eval_metric=None,
                                feature_types=None, gamma=None, gpu_id=None,
                                grow_policy=None, im

# TRAINING ON FEATURE-FILTERED DATA

In [None]:
# Per-model score accumulators for the feature-selected (GA) run; each name
# gets its own independent empty list.
(
    all_accuracy_ga,
    all_recall_ga,
    all_precision_ga,
    all_roc_auc_ga,
    all_fpr_ga,
    all_tpr_ga,
) = ([] for _ in range(6))

In [None]:
# Train and evaluate each model using the pipelines

# Fit every baseline pipeline on the selected training features and score it
# on the validation split.
# NOTE(review): df_train_selected, y_train_copy, df_val_selected, y_val_copy and
# labels are not created in any cell visible in this notebook — confirm the
# cell that defines them still exists before relying on Restart & Run All.

# Start from empty lists so re-running this cell cannot append a second set of
# scores and leave the metric lists misaligned with model_names.
all_accuracy_ga, all_recall_ga, all_precision_ga = [], [], []
all_roc_auc_ga, all_fpr_ga, all_tpr_ga = [], [], []

for model_name, pipeline_ga in zip(model_names, pipelines):

  # Time a single fit. Passing a callable instead of a source string removes
  # the dependency on the loop variable being a notebook global.
  training_time = timeit.timeit(
      lambda: pipeline_ga.fit(df_train_selected, y_train_copy),
      number=1
      )

  # Print the training time for the current classifier
  print(f"Time taken to train {model_name}: {training_time:.2f} seconds")

  # Evaluate the fitted pipeline on the validation split
  accuracy, recall, precision, roc_auc, fpr, tpr = evaluation_metrics(
      pipeline_ga, df_val_selected, y_val_copy, model_name, labels
      )

  all_accuracy_ga.append(accuracy)
  all_recall_ga.append(recall)
  all_precision_ga.append(precision)
  all_roc_auc_ga.append(roc_auc)
  all_fpr_ga.append(fpr)
  all_tpr_ga.append(tpr)
  print('===================================================================================')

In [None]:
# Summarise accuracy, recall and ROC AUC per model under the given title.
# The precision list collected during evaluation is not passed to the table.
scores_table(model_names, all_accuracy_ga, all_recall_ga, all_roc_auc_ga, 'Model Performance - GA')

In [None]:
# Bar chart of validation accuracy per model.
plot_barchart(model_names, all_accuracy_ga, 'Accuracy', 'Accuracy Comparison - GA')

In [None]:
# Bar chart of validation recall per model.
plot_barchart(model_names, all_recall_ga, 'Recall', 'Recall Comparison - GA')

In [None]:
# ROC curves built from the per-model FPR/TPR arrays and AUC scores collected during evaluation.
plot_roc_auc(model_names, all_fpr_ga, all_tpr_ga, all_roc_auc_ga, 'ROC AUC CURVES - GA')

## SOLVING IMBALANCE PROBLEM - OVERSAMPLING



In [None]:
# Resample the training split with SMOTE (fixed seed) and keep the resampled
# features and labels under their own names; the inputs are left untouched.
sm = SMOTE(random_state=0)
_resampled = sm.fit_resample(df_train_selected, y_train_copy)
df_selected_oversampled, y_selected_oversampled = _resampled

In [None]:
# Class counts after resampling.
Counter(y_selected_oversampled)

In [None]:
# (rows, columns) of the resampled training features.
df_selected_oversampled.shape

In [None]:
# Kept disabled, so df_val_selected_oversampled / y_val_selected_oversampled
# (and the test equivalents) are never defined by this notebook.
# df_val_selected_oversampled, y_val_selected_oversampled = sm.fit_resample(df_val_selected, y_val_copy)
# df_test_selected_oversampled, y_test_selected_oversampled= sm.fit_resample(df_test_selected, y_test_copy)

In [None]:
# Disabled together with the resampling of the validation split.
# df_val_selected_oversampled.shape

In [None]:
# SMOTE -> scaler -> classifier pipelines, one per baseline estimator.
#
# The imbalanced-learn pipeline (ImbPipeline) is required here: SMOTE exposes
# fit_resample but no transform, so scikit-learn's own Pipeline rejects it as
# an intermediate step. ImbPipeline runs the sampler only while fitting, so
# validation data passed to predict/predict_proba is never resampled.
#
# Each classifier is cloned so these pipelines do not share (and refit) the
# estimator instances already used by `pipelines`.
new_pipelines = []

for model in models:
  pipe = ImbPipeline([
      ('sampler', SMOTE(random_state=0)),
      ('scaler', StandardScaler()),
      ('classifier', clone(model))
      ])
  new_pipelines.append(pipe)

In [None]:
# Display the sampler -> scaler -> classifier pipelines built in the previous cell.
new_pipelines

## TRAINING AND EVALUATION ON VALIDATION SET WITH FILTERED DATA

In [None]:
# Per-model score accumulators for the feature-selected + SMOTE run; each name
# gets its own independent empty list.
(
    all_accuracy_ga_sm,
    all_recall_ga_sm,
    all_precision_ga_sm,
    all_roc_auc_ga_sm,
    all_fpr_ga_sm,
    all_tpr_ga_sm,
) = ([] for _ in range(6))

In [None]:
# Train and evaluate each model using the pipelines

# Fit every SMOTE pipeline on the selected training features (the sampler
# step resamples during fit) and score it on the validation split.
# NOTE(review): df_train_selected, y_train_copy, df_val_selected, y_val_copy and
# labels are not created in any cell visible in this notebook — confirm the
# cell that defines them still exists before relying on Restart & Run All.

# Start from empty lists so re-running this cell cannot append a second set of
# scores and leave the metric lists misaligned with model_names.
all_accuracy_ga_sm, all_recall_ga_sm, all_precision_ga_sm = [], [], []
all_roc_auc_ga_sm, all_fpr_ga_sm, all_tpr_ga_sm = [], [], []

for model_name, pipeline_ga_sm in zip(model_names, new_pipelines):

  # Time a single fit. Passing a callable instead of a source string removes
  # the dependency on the loop variable being a notebook global.
  training_time = timeit.timeit(
      lambda: pipeline_ga_sm.fit(df_train_selected, y_train_copy),
      number=1
      )

  # Print the training time for the current classifier
  print(f"Time taken to train {model_name}: {training_time:.2f} seconds")

  # Evaluate the fitted pipeline on the validation split
  accuracy, recall, precision, roc_auc, fpr, tpr = evaluation_metrics(
      pipeline_ga_sm, df_val_selected, y_val_copy, model_name, labels
      )

  all_accuracy_ga_sm.append(accuracy)
  all_recall_ga_sm.append(recall)
  all_precision_ga_sm.append(precision)
  all_roc_auc_ga_sm.append(roc_auc)
  all_fpr_ga_sm.append(fpr)
  all_tpr_ga_sm.append(tpr)
  print('===================================================================================')

In [None]:
# Summarise accuracy, recall and ROC AUC per model for the SMOTE run.
# The precision list collected during evaluation is not passed to the table.
scores_table(model_names, all_accuracy_ga_sm, all_recall_ga_sm, all_roc_auc_ga_sm, 'Model Performance - GA + SM')

In [None]:
# Bar chart of validation accuracy per model.
plot_barchart(model_names, all_accuracy_ga_sm, 'Accuracy', 'Accuracy Comparison - GA + SM')

In [None]:
# Bar chart of validation recall per model.
plot_barchart(model_names, all_recall_ga_sm, 'Recall', 'Recall Comparison - GA + SM')

In [None]:
# ROC curves built from the per-model FPR/TPR arrays and AUC scores collected during evaluation.
plot_roc_auc(model_names, all_fpr_ga_sm, all_tpr_ga_sm, all_roc_auc_ga_sm, 'ROC AUC CURVES - GA + SM')

## DEFINE PARAMETERS

In [56]:
# Logistic Regression
param_grid_lr = {
    'C': np.logspace(-4, 2, 7),
}

# Support Vector Machines
param_grid_svm = {
    'C': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    }

# Naive Bayes
param_grid_gnb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7],
}

# Random Forest
param_grid_rf = {
    'n_estimators': [10, 100, 1000],
    'max_depth': [None, 10, 100],
}

# XGBoost
param_grid_xgb = {
    'eta': [0.01, 0.1, 0.3],
    'max_depth': [2, 4, 6],
    'gamma': [0, 0.1, 1],
}

In [57]:
# Grids listed in the same order as `models` (LR, SVM, GNB, RF, XGBoost).
param_grids = [param_grid_lr, param_grid_svm, param_grid_gnb, param_grid_rf, param_grid_xgb]

## FUNCTION FOR HYPERPARAMETER OPTIMIZATION

In [58]:
# Disabled reference copy; the grid_search_cv actually called in this notebook
# is imported from utilities in the first cell.
# NOTE(review): presumably this mirrors utilities.grid_search_cv — confirm.
# As written here the `cv` argument is ignored: GridSearchCV is always
# constructed with cv=3.
# def grid_search_cv(X_train, y_train, models, param_grids, cv=3):
#   best_models = []
#   best_scores = []

#   # Initialize StandardScaler
#   scaler = StandardScaler()
#   X_train_scaled = scaler.fit_transform(X_train)

#   for model, param_grid in zip(models, param_grids):
#     grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)

#     grid_search.fit(X_train_scaled, y_train)

#     best_models.append(grid_search.best_estimator_)
#     best_scores.append(grid_search.best_score_)

#   return best_models, best_scores

In [None]:
# Run the imported grid_search_cv helper over `models` and `param_grids`
# (presumably paired by position — verify against utilities.grid_search_cv).
# NOTE(review): the search is given data that was SMOTE-resampled up front, so
# synthetic samples may land in both the fitting and held-out folds of a CV
# split — best_scores are presumably optimistic; confirm.
best_models, best_scores = grid_search_cv(df_selected_oversampled, y_selected_oversampled, models, param_grids)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [None]:
# First value returned by grid_search_cv.
best_models

In [None]:
# Second value returned by grid_search_cv.
best_scores

# TRAINING AND EVALUATION WITH OPTIMIZED HYPERPARAMETERS

In [None]:
# Use the estimators selected by the grid search rather than freshly
# constructed defaults; otherwise this section re-evaluates untuned models
# under the "optimized hyperparameter" label.
# clone() copies each estimator's hyperparameters without its fitted state, so
# the pipelines below refit from scratch and best_models stays untouched.
# NOTE(review): assumes grid_search_cv returns one best estimator per entry of
# `models`, in the same order as model_names — confirm against utilities.
grid_models = [clone(best_estimator) for best_estimator in best_models]

In [None]:
# One scaler -> classifier pipeline per entry of grid_models, in list order.
grid_pipelines = [
    Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', estimator),
    ])
    for estimator in grid_models
]

In [None]:
# Lists to store evaluation metrics across models
grid_accuracy = []
grid_recall = []
grid_precision = []
grid_roc_auc = []
grid_fpr = []
grid_tpr = []

In [None]:
# Train and evaluate each model using the pipelines

# Fit every grid pipeline on the SMOTE-resampled training data and score it
# on the validation split.
#
# Evaluation uses df_val_selected / y_val_copy, the same validation data as
# the earlier runs: the *_val_selected_oversampled names previously used here
# are never defined in this notebook (their cell is commented out), and
# resampled validation data would not be comparable with the other runs.
# NOTE(review): df_val_selected, y_val_copy and labels are not created in any
# cell visible in this notebook — confirm the defining cell still exists.

# Start from empty lists so re-running this cell cannot append a second set of
# scores and leave the metric lists misaligned with model_names.
grid_accuracy, grid_recall, grid_precision = [], [], []
grid_roc_auc, grid_fpr, grid_tpr = [], [], []

for model_name, grid_pipeline in zip(model_names, grid_pipelines):

  # Time a single fit. Passing a callable instead of a source string removes
  # the dependency on the loop variable being a notebook global.
  training_time = timeit.timeit(
      lambda: grid_pipeline.fit(df_selected_oversampled, y_selected_oversampled),
      number=1
      )

  # Print the training time for the current classifier
  print(f"Time taken to train {model_name}: {training_time:.2f} seconds")

  # Evaluate the fitted pipeline on the validation split
  accuracy, recall, precision, roc_auc, fpr, tpr = evaluation_metrics(
      grid_pipeline, df_val_selected, y_val_copy, model_name, labels
      )

  grid_accuracy.append(accuracy)
  grid_recall.append(recall)
  grid_precision.append(precision)
  grid_roc_auc.append(roc_auc)
  grid_fpr.append(fpr)
  grid_tpr.append(tpr)
  print('===================================================================================')

In [None]:
# Summarise accuracy, recall and ROC AUC per model for the tuned run.
# The precision list collected during evaluation is not passed to the table.
scores_table(model_names, grid_accuracy, grid_recall, grid_roc_auc, 'Model Performance - GA + SM + HO')

In [None]:
# Bar chart of validation accuracy per model.
plot_barchart(model_names, grid_accuracy, 'Accuracy', 'Accuracy Comparison - GA + SM + HO')

In [None]:
# Bar chart of validation recall per model.
plot_barchart(model_names, grid_recall, 'Recall', 'Recall Comparison - GA + SM + HO')

In [None]:
# ROC curves built from the per-model FPR/TPR arrays and AUC scores collected during evaluation.
plot_roc_auc(model_names, grid_fpr, grid_tpr, grid_roc_auc, 'ROC AUC CURVES - GA + SM + HO')

# ENSEMBLE LEARNING AND HYPERPARAMETER OPTIMIZATION

In [None]:
# Base learners for the voting ensemble.
# There must be no trailing comma after each constructor call: `x = Foo(),`
# binds a 1-tuple, and VotingClassifier needs estimator objects in its
# (name, estimator) pairs.
lr = LogisticRegression(random_state=42)
svc = SVC(random_state=42, probability=True)  # probability=True is needed for soft voting
gnb = GaussianNB()
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)

# (name, estimator) pairs in the form VotingClassifier expects.
estimators = [
    ('lr', lr),
    ('svc', svc),
    ('gnb', gnb),
    ('rf', rf),
    ('xgb', xgb)
]

In [None]:
# Voting ensemble over `estimators`: soft voting, fitted in parallel on all
# available cores (n_jobs=-1), with progress messages enabled.
_voting_options = {'voting': 'soft', 'n_jobs': -1, 'verbose': True}
ensemble_clf = VotingClassifier(estimators, **_voting_options)

In [None]:
# Standardise inside a pipeline so the scale-sensitive members (logistic
# regression, SVC) get the same preprocessing as every other model evaluated
# in this notebook. Fitting the pipeline fits ensemble_clf in place.
ensemble_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', ensemble_clf)
    ])
ensemble_pipeline.fit(df_selected_oversampled, y_selected_oversampled)

# Evaluate on df_val_selected / y_val_copy and pass `labels`, matching the
# other evaluation_metrics calls. The *_val_selected_oversampled names used
# here before are never defined in this notebook, and the call was missing
# its final `labels` argument.
# NOTE(review): df_val_selected, y_val_copy and labels are not created in any
# cell visible in this notebook — confirm the defining cell still exists.
accuracy, recall, precision, roc_auc_Score, fpr, tpr = evaluation_metrics(
    ensemble_pipeline, df_val_selected, y_val_copy, 'Ensemble Learning', labels
    )