In [1]:
#!pip install xgboost lightgbm imblearn mlflow

In [2]:
#!pip install alibi-detect

In [3]:
#!pip install shap

In [4]:
# pip install --upgrade huggingface_hub datasets alibi-detect

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import shap
import mlflow
import mlflow.sklearn
from alibi_detect.cd import KSDrift

In [6]:
df = pd.read_csv("creditcard.csv")

In [7]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [8]:
df['ammount_log'] = np.log(df['Amount'] +1)

In [9]:
df['ammount_log']

0         5.014760
1         1.305626
2         5.939276
3         4.824306
4         4.262539
            ...   
284802    0.570980
284803    3.249987
284804    4.232366
284805    2.397895
284806    5.384495
Name: ammount_log, Length: 284807, dtype: float64

In [10]:
df['Time_hour'] = df['Time'] //3600
df['Time_day'] = df['Time']//(3600*24)

In [11]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,ammount_log,Time_hour,Time_day
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,5.014760,0.0,0.0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0,1.305626,0.0,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,5.939276,0.0,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0,4.824306,0.0,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0,4.262539,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0,0.570980,47.0,1.0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0,3.249987,47.0,1.0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0,4.232366,47.0,1.0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0,2.397895,47.0,1.0


In [12]:
for i in range(1,29):
    for j in range(i+1,29):
        df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']

  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * df[f'V{j}']
  df[f'V{i}V{j}'] = df[f'V{i}'] * 

In [13]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24V25,V24V26,V24V27,V24V28,V25V26,V25V27,V25V28,V26V27,V26V28,V27V28
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.008603,-0.012657,0.008939,-0.001409,-0.024309,0.017168,-0.002706,-0.025258,0.003981,-0.002812
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.056812,-0.042785,0.003053,-0.005004,0.021046,-0.001502,0.002461,-0.001131,0.001854,-0.000132
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.225837,0.095877,0.038154,0.041186,0.045574,0.018136,0.019577,0.007699,0.008311,0.003307
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.761039,0.260894,-0.073735,-0.072248,-0.143671,0.040605,0.039786,-0.013920,-0.013639,0.003855
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.029102,0.070957,0.030997,0.030394,-0.103477,-0.045203,-0.044324,0.110214,0.108070,0.047209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,-0.731835,-0.127355,-0.480647,-0.419566,0.359251,1.355845,1.183542,0.235945,0.205961,0.777315
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.616467,0.401668,-0.069583,0.054396,0.239771,-0.041537,0.032471,-0.027064,0.021157,-0.003665
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.170113,-0.055929,0.002852,-0.017002,-0.023218,0.001184,-0.007058,-0.000389,0.002321,-0.000118
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,-0.070123,0.067352,0.013407,0.012879,-0.311141,-0.061936,-0.059496,0.059489,0.057145,0.011375


In [14]:
X = df.drop(['Class', "Time"], axis =1)

In [15]:
y = df['Class']

In [16]:
X

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V24V25,V24V26,V24V27,V24V28,V25V26,V25V27,V25V28,V26V27,V26V28,V27V28
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.008603,-0.012657,0.008939,-0.001409,-0.024309,0.017168,-0.002706,-0.025258,0.003981,-0.002812
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.056812,-0.042785,0.003053,-0.005004,0.021046,-0.001502,0.002461,-0.001131,0.001854,-0.000132
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.225837,0.095877,0.038154,0.041186,0.045574,0.018136,0.019577,0.007699,0.008311,0.003307
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.761039,0.260894,-0.073735,-0.072248,-0.143671,0.040605,0.039786,-0.013920,-0.013639,0.003855
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.029102,0.070957,0.030997,0.030394,-0.103477,-0.045203,-0.044324,0.110214,0.108070,0.047209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,-0.731835,-0.127355,-0.480647,-0.419566,0.359251,1.355845,1.183542,0.235945,0.205961,0.777315
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.616467,0.401668,-0.069583,0.054396,0.239771,-0.041537,0.032471,-0.027064,0.021157,-0.003665
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.170113,-0.055929,0.002852,-0.017002,-0.023218,0.001184,-0.007058,-0.000389,0.002321,-0.000118
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,-0.070123,0.067352,0.013407,0.012879,-0.311141,-0.061936,-0.059496,0.059489,0.057145,0.011375


In [17]:
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42 )

In [19]:
sc = StandardScaler()

In [20]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [21]:
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('over', over), ('under', under)]
pipeline = ImbPipeline(steps=steps)
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

In [22]:
len(X_resampled)

68235

In [23]:
selector = SelectFromModel(estimator=LGBMClassifier(random_state=42), max_features=50)
X_selected = selector.fit_transform(X_resampled, y_resampled)
selected_features = X.columns[selector.get_support()].tolist()

[LightGBM] [Info] Number of positive: 22745, number of negative: 45490
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128722 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 104550
[LightGBM] [Info] Number of data points in the train set: 68235, number of used features: 410
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147




In [24]:
len(selected_features)

50

In [25]:
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_selected)

In [26]:
pca.n_components_

7

In [27]:
X_pca

array([[-1.54344524e+01,  1.11834552e+00,  8.30598394e-01, ...,
         5.77910235e-01,  7.92059874e-01,  6.67687320e-01],
       [-1.55145724e+01,  6.54998502e-01,  1.58765417e+00, ...,
         5.17606889e-01, -2.01338416e-01,  6.23506726e-01],
       [-1.55810604e+01,  7.14348859e-01,  1.43884070e+00, ...,
         2.45288198e-01, -1.42800245e-01,  1.44880385e-01],
       ...,
       [ 9.94965135e+01,  1.30325939e+02, -2.25916854e+01, ...,
        -7.09617071e+00,  5.98576545e+00, -1.51401562e+00],
       [ 1.55218690e+02, -2.62453105e+01, -2.69765322e+00, ...,
        -3.86565074e+00, -3.46263836e+00,  1.16008779e+01],
       [ 5.86365192e+00, -1.04210492e+01, -2.32430972e+01, ...,
        -2.22444093e+00, -9.77409973e+00, -1.55464664e+01]])

In [28]:
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)
X_with_clusters = np.column_stack((X_pca, cluster_labels))

In [29]:
X_with_clusters

array([[-1.54344524e+01,  1.11834552e+00,  8.30598394e-01, ...,
         7.92059874e-01,  6.67687320e-01,  0.00000000e+00],
       [-1.55145724e+01,  6.54998502e-01,  1.58765417e+00, ...,
        -2.01338416e-01,  6.23506726e-01,  0.00000000e+00],
       [-1.55810604e+01,  7.14348859e-01,  1.43884070e+00, ...,
        -1.42800245e-01,  1.44880385e-01,  0.00000000e+00],
       ...,
       [ 9.94965135e+01,  1.30325939e+02, -2.25916854e+01, ...,
         5.98576545e+00, -1.51401562e+00,  3.00000000e+00],
       [ 1.55218690e+02, -2.62453105e+01, -2.69765322e+00, ...,
        -3.46263836e+00,  1.16008779e+01,  4.00000000e+00],
       [ 5.86365192e+00, -1.04210492e+01, -2.32430972e+01, ...,
        -9.77409973e+00, -1.55464664e+01,  0.00000000e+00]])

In [30]:
models = {"Random Fortest": RandomForestClassifier(random_state=42),
         "Gradiant Boosting":GradientBoostingClassifier(random_state=42),
         "XGBoost":XGBClassifier(random_state=42),
         "LightBGM": LGBMClassifier(random_state=42),
         "Logistic": LogisticRegression(random_state=42)}

In [31]:
best_model = None
best_acc = 0

In [36]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_with_clusters, y_resampled)
    y_pred = model.predict(np.column_stack((pca.transform(selector.transform(X_test)), kmeans.predict(pca.transform(selector.transform(X_test))))))
    auc = roc_auc_score(y_test, y_pred)
    print(f"AUC for {name}: {auc}")
    if auc > best_acc:
        best_acc = auc
        best_model = model


Training Random Fortest...




AUC for Random Fortest: 0.9318533371423978

Training Gradiant Boosting...




AUC for Gradiant Boosting: 0.9479265960745581

Training XGBoost...




AUC for XGBoost: 0.9296551097929323

Training LightBGM...
[LightGBM] [Info] Number of positive: 22745, number of negative: 45490
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1790
[LightGBM] [Info] Number of data points in the train set: 68235, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147




AUC for LightBGM: 0.9342823335017744

Training Logistic...
AUC for Logistic: 0.9282482442892744


