In [None]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd
import numpy as np
import matplotlib
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
%matplotlib inline


import gc
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score,confusion_matrix,plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn import svm
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier


In [None]:
# Colab-only helper; used in the next cell to mount Google Drive.
from google.colab import drive

In [None]:
# Mount Google Drive under /content/drive so the dataset CSV can be read from it.
drive.mount('/content/drive')

## Load the downloaded dataset

In [None]:
# Location of the credit-card transactions CSV on the mounted Drive.
DATA_PATH = '/content/drive/MyDrive/creditcard (1).csv'

# Load the file once. The previous second read_csv call pointed at a different
# path and discarded its result, so it only cost time/memory (or raised
# FileNotFoundError when that path was absent).
Dataset = pd.read_csv(DATA_PATH)

## Check dataset information

In [None]:
# Preview the first rows as a quick sanity check of the load.
Dataset.head()

In [None]:
# Report the dataset dimensions.
n_rows, n_cols = Dataset.shape[0], Dataset.shape[1]
print("Credit Card Fraud Detection data -  rows:", n_rows, " columns:", n_cols)

In [None]:
# Summary statistics for the numeric columns.
Dataset.describe()

In [None]:
# Column dtypes and non-null counts.
Dataset.info()

In [None]:
# Number of missing values in each column.
Dataset.isnull().sum()

In [None]:
# Mean of every feature, computed separately for each Class value.
Dataset.groupby('Class').mean()

In [None]:
# Class balance: number of rows for each Class value.
Dataset['Class'].value_counts()

In [None]:
# Pie chart of the class balance.
# NOTE(review): the slice names assume value_counts() lists the legit class
# first (i.e. it is the more frequent one) - confirm against the counts above.
class_counts = Dataset.Class.value_counts()
slice_names = ['Legit', 'Fraud']
fig = px.pie(
    Dataset.Class,
    values=class_counts,
    names=slice_names,
    title='Legit vs Fraud Transactions in Dataset',
)
fig.show()

In [None]:
# Side-by-side distribution of transaction Time for fraud (Class 1) and legit (Class 0).
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement and draws the same histogram-plus-density view.
fig, axs = plt.subplots(ncols=2, figsize=(16,4))

# (Class value, colour, panel title) for each subplot, left to right.
panels = [
    (1, 'red', "Distribution of Fraud Transactions"),
    (0, 'green', "Distribution of Legit Transactions"),
]
for panel_ax, (class_value, colour, panel_title) in zip(axs, panels):
    class_times = Dataset.loc[Dataset['Class'] == class_value, 'Time']
    sns.histplot(x=class_times, bins=100, color=colour, kde=True, stat='density', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Kernel-density comparison of transaction Time for the two classes
# (curves only: histogram and rug are switched off).
Legit = Dataset.loc[Dataset['Class'] == 0, 'Time']
Fraud = Dataset.loc[Dataset['Class'] == 1, 'Time']

group_labels = ['Real Transactions', 'Fraudulent Transactions']
hist_data = [Legit, Fraud]

fig = ff.create_distplot(hist_data, group_labels, show_hist=False, show_rug=False)
fig.update_layout(
    title='Time of Credit Card Transactions Density Plot',
    xaxis=dict(title='Time [s]'),
)
iplot(fig, filename='dist_only')

Fraudulent transactions seem to be distributed more evenly over time than legit transactions: they occur at a similar rate throughout, including during the periods when legit transaction volume is low (night-time in the European time zone).

In [None]:
# Split the Amount column by class; show summary statistics for the legit part here
# (the fraud part is summarised in the next cell).
is_legit = Dataset['Class'] == 0
is_fraud = Dataset['Class'] == 1
Legit_desc = Dataset.loc[is_legit, 'Amount']
Fraud_desc = Dataset.loc[is_fraud, 'Amount']
Legit_desc.describe()

In [None]:
# Summary statistics of Amount for the fraudulent transactions.
Fraud_desc.describe()

Legit transactions have a larger mean and Q1, a smaller Q3 and Q4, and larger outliers; fraudulent transactions have a smaller Q1 and mean, a larger Q4, and smaller outliers.

In [None]:
# Overall distribution of transaction Amount (left) and Time (right).
# sns.distplot is deprecated; histplot(kde=True, stat='density') replaces it.
# bins=50 is set explicitly because distplot capped its automatic bin count at 50,
# whereas histplot's automatic rule can pick far more bins on long-tailed data.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

amount_col = Dataset['Amount'].values
time_col = Dataset['Time'].values

sns.histplot(x=amount_col, bins=50, kde=True, stat='density', color='r', ax=ax[0])
ax[0].set_title('Transaction Amount Distribution')
# ndarray.min()/max() instead of the Python builtins, which loop element by element.
ax[0].set_xlim([amount_col.min(), amount_col.max()])

sns.histplot(x=time_col, bins=50, kde=True, stat='density', color='b', ax=ax[1])
ax[1].set_title('Transaction Time Distribution')
ax[1].set_xlim([time_col.min(), time_col.max()])

plt.show()

In [None]:
# Scatter of Amount (x) against the row index (y) for fraudulent transactions only.
# NOTE(review): the frame is queried twice; reusing one queried frame for both
# arguments may change how plotly names the y trace - verify before refactoring.
fig = px.scatter(Dataset.query("Class==1"), x = 'Amount', y =Dataset.query("Class==1").index,
                title = 'Distribution of Fraudulent transactions Amounts')
fig.update_layout(xaxis_title='Amount',
                    yaxis_title='Transactions')
fig.show()

In [None]:
# Scatter of Amount (x) against the row index (y) for legit transactions only.
# NOTE(review): the frame is queried twice; reusing one queried frame for both
# arguments may change how plotly names the y trace - verify before refactoring.
fig = px.scatter(Dataset.query("Class==0"), x = 'Amount', y =Dataset.query("Class==0").index,
                title = 'Distribution of Legit transactions Amounts')
fig.update_layout(xaxis_title='Amount',
                    yaxis_title='Transactions')
fig.show()

In [None]:
# Correlation of every column with the Class label, strongest positive first.
Correlation = Dataset.corr()
class_corr_sorted = Correlation['Class'].sort_values(ascending=False)
print(class_corr_sorted, '\n')

In [None]:
# Heatmap of the full pairwise correlation matrix.
corr = Dataset.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(24,20))
# annot_kws is passed through as in the original; no annotations are drawn
# because annot is left at its default.
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=heatmap_ax)
heatmap_ax.set_title("Features Correlation", fontsize=14)

There is no notable correlation among the features V1–V28. Some of these features do correlate with Time (inverse correlation with V3) and with Amount (direct correlation with V7 and V20, inverse correlation with V1 and V5).

Time and Amount should be scaled like the other columns.

In [None]:
# Standardise Amount and Time into new scaled_* columns, then drop the raw columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split made later in the notebook, so test-set statistics influence the scaling.
# NOTE(review): this cell is not re-runnable - a second run fails because the raw
# columns have already been dropped.
Scaler = StandardScaler()
for raw_name, scaled_name in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    # fit_transform refits the scaler on each column independently
    column_2d = Dataset[raw_name].values.reshape(-1, 1)
    Dataset[scaled_name] = Scaler.fit_transform(column_2d)

Dataset.drop(columns=['Time', 'Amount'], inplace=True)

In [None]:
# Tunable settings for the modelling section.
# NOTE(review): the meanings below are inferred from the names only, and no
# consumer of these constants is visible in this review - confirm they are still needed.
RFC_METRIC = 'gini'  # presumably the random-forest split criterion
NUM_ESTIMATORS = 100  # presumably the number of trees
NO_JOBS = 4  # presumably the number of parallel jobs



NUMBER_KFOLDS = 5  # presumably the number of cross-validation folds

MAX_ROUNDS = 1000  # presumably the maximum number of boosting rounds
EARLY_STOP = 50  # presumably the early-stopping patience, in rounds
OPT_ROUNDS = 1000  # presumably the tuned number of boosting rounds
VERBOSE_EVAL = 50  # presumably the logging interval, in rounds



In [None]:
# Target vector is the Class column; the feature matrix is every other column.
y = Dataset['Class']
X = Dataset.loc[:, Dataset.columns != 'Class']

In [None]:
# Class balance of the target before any resampling.
y.value_counts()

OverSampling Using SMOTE

In [None]:
# Stratified 80/20 split, then oversample the minority class in the TRAINING part only;
# the test part keeps its original class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

# SMOTE draws random synthetic samples; seed it (same seed as the split) so the
# resampled training set - and every score derived from it - is reproducible.
X_train, y_train = SMOTE(random_state=2).fit_resample(X_train, y_train)
y_train.value_counts()

## Model Building

Baseline Model

In [None]:
# Accumulators for the baseline models, filled in model order by the cells below:
# cross-validated ROC-AUC, hold-out ROC-AUC and hold-out F1.
results, ROC_results, f1_results = [], [], []

## Random Forest Classifier

In [None]:
# Baseline random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the forest
# is stochastic and the scores reported below would otherwise change on every run.
RFclassifier = RandomForestClassifier(random_state=2)
RFclassifier.fit(X_train,y_train)
RFpreds = RFclassifier.predict(X_test)

In [None]:
# Hold-out metrics for the random forest.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
RF_F1_Score = f1_score(y_test, RFpreds)
RF_ROC = roc_auc_score(y_test, RFclassifier.predict_proba(X_test)[:, 1])
print('F1=', RF_F1_Score)
print('ROC =', RF_ROC)
ROC_results.append(RF_ROC)
f1_results.append(RF_F1_Score)


In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so synthetic
# points can sit in a different fold from the samples they were generated from;
# the training-data score is probably optimistic.
rf_train_fold_scores = cross_val_score(RFclassifier, X_train, y_train, cv=10, scoring='roc_auc')
rf_test_fold_scores = cross_val_score(RFclassifier, X_test, y_test, cv=10, scoring='roc_auc')
rf_cvs_score = rf_train_fold_scores.mean()
rf_cvs_score_test = rf_test_fold_scores.mean()
print(rf_cvs_score)
print(rf_cvs_score_test)
results.append(rf_cvs_score)

In [None]:
# ROC curve on the hold-out split. It was previously drawn on the SMOTE-resampled
# training data the model was fitted on, which says little about generalisation
# and was inconsistent with the AdaBoost cell, which uses the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the runtime's version.
plot_roc_curve(RFclassifier, X_test, y_test)
plt.title('RF_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the random forest on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
RFconfusion = confusion_matrix(y_test, RFpreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = RFconfusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(RFconfusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Feature names in the exact order the models were trained on.
# The previous hard-coded list began with 'Time' and ended with 'Amount', but those
# columns were dropped after scaling and replaced by 'scaled_amount'/'scaled_time'
# appended at the end, so every importance was attributed to the wrong feature.
pred_Variables = list(X.columns)

In [None]:

# Random-forest feature importances, largest first.
# Names come from X.columns (the columns the model was trained on) rather than a
# hand-written list that still contained the dropped 'Time'/'Amount' columns and
# therefore mislabelled every bar.
tmp = pd.DataFrame({'Feature': list(X.columns), 'Feature importance': RFclassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

## AdaBoost Classifier

In [None]:
# Baseline AdaBoost with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so repeated runs
# give the same fitted model and scores.
AdaClassifier = AdaBoostClassifier(random_state=2)
AdaClassifier.fit(X_train,y_train)
AdaPreds = AdaClassifier.predict(X_test)

In [None]:
# Hold-out metrics for AdaBoost.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
Ada_F1_Score = f1_score(y_test, AdaPreds)
Ada_ROC = roc_auc_score(y_test, AdaClassifier.predict_proba(X_test)[:, 1])
print('F1=', Ada_F1_Score)
print('ROC =', Ada_ROC)
ROC_results.append(Ada_ROC)
f1_results.append(Ada_F1_Score)


In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
ada_train_fold_scores = cross_val_score(AdaClassifier, X_train, y_train, cv=10, scoring='roc_auc')
ada_test_fold_scores = cross_val_score(AdaClassifier, X_test, y_test, cv=10, scoring='roc_auc')
Ada_cvs_score = ada_train_fold_scores.mean()
Ada_cvs_score_test = ada_test_fold_scores.mean()
print(Ada_cvs_score)
print(Ada_cvs_score_test)
results.append(Ada_cvs_score)

In [None]:
# ROC curve of the AdaBoost model on the hold-out split.
ada_roc_display = plot_roc_curve(AdaClassifier, X_test, y_test)
ada_roc_display.ax_.set_title('Ada_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for AdaBoost on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Ada_confusion = confusion_matrix(y_test, AdaPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Ada_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Ada_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# AdaBoost feature importances, largest first.
# Feature names are taken from X.columns (the columns the model was trained on).
# The previous hard-coded list still contained the dropped 'Time'/'Amount' columns
# in the wrong positions, so every bar was mislabelled.
pred_Variables = list(X.columns)
tmp = pd.DataFrame({'Feature': pred_Variables, 'Feature importance': AdaClassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

## CatBoost Classifier

In [None]:
# Baseline CatBoost with default hyper-parameters: fit on the resampled training
# data, then predict hard labels for the hold-out split.
CBClassifier = CatBoostClassifier().fit(X_train, y_train)
CBPreds = CBClassifier.predict(X_test)

In [None]:
# Hold-out metrics for CatBoost.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
CB_F1_Score = f1_score(y_test, CBPreds)
CB_ROC = roc_auc_score(y_test, CBClassifier.predict_proba(X_test)[:, 1])
print('F1=', CB_F1_Score)
print('ROC =', CB_ROC)
ROC_results.append(CB_ROC)
f1_results.append(CB_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC for CatBoost.
# The training-data CV used to be executed twice with identical arguments,
# doubling the cell's run time for no benefit. The second run is replaced by the
# hold-out CV that the other model sections compute and print.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
CB_cvs_score = cross_val_score(CBClassifier, X_train, y_train, cv=10, scoring='roc_auc').mean()
CB_cvs_score_test = cross_val_score(CBClassifier, X_test, y_test, cv=10, scoring='roc_auc').mean()
print(CB_cvs_score)
print(CB_cvs_score_test)
results.append(CB_cvs_score)

In [None]:
# ROC curve on the hold-out split. It was previously drawn on the SMOTE-resampled
# training data the model was fitted on, which says little about generalisation
# and was inconsistent with the AdaBoost cell, which uses the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the runtime's version.
plot_roc_curve(CBClassifier, X_test, y_test)
plt.title('CB_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for CatBoost on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
CB_confusion = confusion_matrix(y_test, CBPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = CB_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(CB_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# CatBoost feature importances, largest first.
# Feature names are taken from X.columns (the columns the model was trained on).
# The previous hard-coded list still contained the dropped 'Time'/'Amount' columns
# in the wrong positions, so every bar was mislabelled.
pred_Variables = list(X.columns)
tmp = pd.DataFrame({'Feature': pred_Variables, 'Feature importance': CBClassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

## XGB Classifier

In [None]:
# Baseline XGBoost with default hyper-parameters: fit on the resampled training
# data, then predict hard labels for the hold-out split.
xgb_classifier = XGBClassifier().fit(X_train, y_train)
XgbPreds = xgb_classifier.predict(X_test)

In [None]:
# Hold-out metrics for XGBoost.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
XGB_F1_Score = f1_score(y_test, XgbPreds)
XGB_ROC = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:, 1])
print('F1=', XGB_F1_Score)
print('ROC =', XGB_ROC)
ROC_results.append(XGB_ROC)
f1_results.append(XGB_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
xgb_train_fold_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=10, scoring='roc_auc')
xgb_test_fold_scores = cross_val_score(xgb_classifier, X_test, y_test, cv=10, scoring='roc_auc')
Xgb_cvs_score = xgb_train_fold_scores.mean()
Xgb_cvs_score_test = xgb_test_fold_scores.mean()
print(Xgb_cvs_score)
print(Xgb_cvs_score_test)
results.append(Xgb_cvs_score)

In [None]:
# ROC curve on the hold-out split. It was previously drawn on the SMOTE-resampled
# training data the model was fitted on, which says little about generalisation
# and was inconsistent with the AdaBoost cell, which uses the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the runtime's version.
plot_roc_curve(xgb_classifier, X_test, y_test)
plt.title('Xgb_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for XGBoost on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Xgb_confusion = confusion_matrix(y_test, XgbPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Xgb_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Xgb_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# XGBoost feature importances, largest first.
# Feature names are taken from X.columns (the columns the model was trained on).
# The previous hard-coded list still contained the dropped 'Time'/'Amount' columns
# in the wrong positions, so every bar was mislabelled.
pred_Variables = list(X.columns)
tmp = pd.DataFrame({'Feature': pred_Variables, 'Feature importance': xgb_classifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

## LGBM Classifier

In [None]:
# Baseline LightGBM with default hyper-parameters: fit on the resampled training
# data, then predict hard labels for the hold-out split.
lgb_classifier = LGBMClassifier().fit(X_train, y_train)
LgbPreds = lgb_classifier.predict(X_test)

In [None]:
# Hold-out metrics for LightGBM.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
Lgb_F1_Score = f1_score(y_test, LgbPreds)
Lgb_ROC = roc_auc_score(y_test, lgb_classifier.predict_proba(X_test)[:, 1])
print('F1=', Lgb_F1_Score)
print('ROC =', Lgb_ROC)
ROC_results.append(Lgb_ROC)
f1_results.append(Lgb_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
lgb_train_fold_scores = cross_val_score(lgb_classifier, X_train, y_train, cv=10, scoring='roc_auc')
lgb_test_fold_scores = cross_val_score(lgb_classifier, X_test, y_test, cv=10, scoring='roc_auc')
Lgb_cvs_score = lgb_train_fold_scores.mean()
Lgb_cvs_score_test = lgb_test_fold_scores.mean()
print(Lgb_cvs_score)
print(Lgb_cvs_score_test)
results.append(Lgb_cvs_score)

In [None]:
# ROC curve on the hold-out split. It was previously drawn on the SMOTE-resampled
# training data the model was fitted on, which says little about generalisation
# and was inconsistent with the AdaBoost cell, which uses the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the runtime's version.
plot_roc_curve(lgb_classifier, X_test, y_test)
plt.title('Lgb_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for LightGBM on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Lgb_confusion = confusion_matrix(y_test, LgbPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Lgb_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Lgb_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# LightGBM feature importances, largest first.
# Names come from X.columns (the columns the model was trained on) rather than the
# hand-written pred_Variables list, which still contained the dropped
# 'Time'/'Amount' columns and therefore mislabelled every bar.
tmp = pd.DataFrame({'Feature': list(X.columns), 'Feature importance': lgb_classifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# Summary table of the baseline models: one row per algorithm, in the order the
# score lists were filled.
# NOTE(review): the lists are appended to by earlier cells, so re-running any of
# those cells makes the list lengths disagree with `algorithms`.
algorithms = ["Random Forest", "AdaBoost", "CatBoost", "Xgb","LGBM"]
metric_columns = ['Cross_Validation', 'ROC-AUC', "F1_Score"]
Dict = dict(zip(metric_columns, (results, ROC_results, f1_results)))
alg = pd.DataFrame(Dict, index = algorithms)
print(alg.loc[:, metric_columns])

Hyperparameter Tuning

In [None]:
# Accumulators for the tuned models, filled in model order by the cells below:
# cross-validated ROC-AUC, hold-out ROC-AUC and hold-out F1.
results_2, ROC_results_2, f1_results_2 = [], [], []

In [None]:
# Random forest with hand-picked hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the forest
# is stochastic and the scores reported below would otherwise change on every run.
RFclassifier = RandomForestClassifier(max_depth=5, max_features = 7, n_estimators = 100, random_state=2)
RFclassifier.fit(X_train,y_train)
RFpreds = RFclassifier.predict(X_test)

In [None]:
# Hold-out metrics for the tuned random forest.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
RF_F1_Score = f1_score(y_test, RFpreds)
RF_ROC = roc_auc_score(y_test, RFclassifier.predict_proba(X_test)[:, 1])
print('F1=', RF_F1_Score)
print('ROC =', RF_ROC)
ROC_results_2.append(RF_ROC)
f1_results_2.append(RF_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC of the tuned random forest on the training data.
# cross_val_score is already imported in the notebook's import cell, so the
# redundant mid-notebook import was removed.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# score is probably optimistic.
rf_cvs_score = cross_val_score(RFclassifier, X_train, y_train, cv=10, scoring='roc_auc').mean()
print(rf_cvs_score)
results_2.append(rf_cvs_score)

In [None]:
# ROC curve on the hold-out split. It was previously drawn on the SMOTE-resampled
# training data the model was fitted on, which says little about generalisation
# and was inconsistent with the other tuned-model cells, which use the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it) - confirm the runtime's version.
plot_roc_curve(RFclassifier, X_test, y_test)
plt.title('RF_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the tuned random forest on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
RFconfusion = confusion_matrix(y_test, RFpreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = RFconfusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(RFconfusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Tuned random-forest feature importances, largest first.
# Feature names are taken from X.columns (the columns the model was trained on).
# The previous hard-coded list still contained the dropped 'Time'/'Amount' columns
# in the wrong positions, so every bar was mislabelled.
pred_Variables = list(X.columns)
tmp = pd.DataFrame({'Feature': pred_Variables, 'Feature importance': RFclassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# AdaBoost with hand-picked hyper-parameters.
# random_state is fixed (same seed as the train/test split) so repeated runs
# give the same fitted model and scores.
AdaClassifier = AdaBoostClassifier(learning_rate=0.008, n_estimators=200, random_state=2)
AdaClassifier.fit(X_train,y_train)
AdaPreds = AdaClassifier.predict(X_test)

In [None]:
# Hold-out metrics for the tuned AdaBoost model.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
Ada_F1_Score = f1_score(y_test, AdaPreds)
Ada_ROC = roc_auc_score(y_test, AdaClassifier.predict_proba(X_test)[:, 1])
print('F1=', Ada_F1_Score)
print('ROC =', Ada_ROC)
ROC_results_2.append(Ada_ROC)
f1_results_2.append(Ada_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
Ada_cvs_score = cross_val_score(AdaClassifier, X_train, y_train, cv=10, scoring='roc_auc').mean()
Ada_cvs_score_test = cross_val_score(AdaClassifier, X_test, y_test, cv=10, scoring='roc_auc').mean()
print(Ada_cvs_score)
# Print the hold-out CV score, as every other model section does. The previous
# line dumped the whole X_train/y_train instead and never showed the score.
print(Ada_cvs_score_test)
results_2.append(Ada_cvs_score)

In [None]:
# ROC curve of the tuned AdaBoost model on the hold-out split.
ada_roc_display = plot_roc_curve(AdaClassifier, X_test, y_test)
ada_roc_display.ax_.set_title('Ada_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the tuned AdaBoost model on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Ada_confusion = confusion_matrix(y_test, AdaPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Ada_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Ada_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Tuned AdaBoost feature importances, largest first.
# Names come from X.columns (the columns the model was trained on) rather than the
# hand-written pred_Variables list, which still contained the dropped
# 'Time'/'Amount' columns and therefore mislabelled every bar.
tmp = pd.DataFrame({'Feature': list(X.columns), 'Feature importance': AdaClassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# CatBoost with hand-picked hyper-parameters: fit on the resampled training data,
# then predict hard labels for the hold-out split.
cb_params = dict(iterations=500, learning_rate=0.02)
CBClassifier = CatBoostClassifier(**cb_params).fit(X_train, y_train)
CBPreds = CBClassifier.predict(X_test)

In [None]:
# Hold-out metrics for the tuned CatBoost model.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
CB_F1_Score = f1_score(y_test, CBPreds)
CB_ROC = roc_auc_score(y_test, CBClassifier.predict_proba(X_test)[:, 1])
print('F1=', CB_F1_Score)
print('ROC =', CB_ROC)
ROC_results_2.append(CB_ROC)
f1_results_2.append(CB_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
cb_train_fold_scores = cross_val_score(CBClassifier, X_train, y_train, cv=10, scoring='roc_auc')
cb_test_fold_scores = cross_val_score(CBClassifier, X_test, y_test, cv=10, scoring='roc_auc')
CB_cvs_score = cb_train_fold_scores.mean()
CB_cvs_score_test = cb_test_fold_scores.mean()
print(CB_cvs_score)
print(CB_cvs_score_test)
results_2.append(CB_cvs_score)

In [None]:
# ROC curve of the tuned CatBoost model on the hold-out split.
cb_roc_display = plot_roc_curve(CBClassifier, X_test, y_test)
cb_roc_display.ax_.set_title('CB_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the tuned CatBoost model on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
CB_confusion = confusion_matrix(y_test, CBPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = CB_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(CB_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Tuned CatBoost feature importances, largest first.
# Names come from X.columns (the columns the model was trained on) rather than the
# hand-written pred_Variables list, which still contained the dropped
# 'Time'/'Amount' columns and therefore mislabelled every bar.
tmp = pd.DataFrame({'Feature': list(X.columns), 'Feature importance': CBClassifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# XGBoost with a deeper tree limit: fit on the resampled training data, then
# predict hard labels for the hold-out split.
xgb_classifier = XGBClassifier(max_depth=10).fit(X_train, y_train)
XgbPreds = xgb_classifier.predict(X_test)

In [None]:
# Hold-out metrics for the tuned XGBoost model.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
XGB_F1_Score = f1_score(y_test, XgbPreds)
XGB_ROC = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:, 1])
print('F1=', XGB_F1_Score)
print('ROC =', XGB_ROC)
ROC_results_2.append(XGB_ROC)
f1_results_2.append(XGB_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
xgb_train_fold_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=10, scoring='roc_auc')
xgb_test_fold_scores = cross_val_score(xgb_classifier, X_test, y_test, cv=10, scoring='roc_auc')
Xgb_cvs_score = xgb_train_fold_scores.mean()
Xgb_cvs_score_test = xgb_test_fold_scores.mean()
print(Xgb_cvs_score)
print(Xgb_cvs_score_test)
results_2.append(Xgb_cvs_score)

In [None]:
# ROC curve of the tuned XGBoost model on the hold-out split.
xgb_roc_display = plot_roc_curve(xgb_classifier, X_test, y_test)
xgb_roc_display.ax_.set_title('Xgb_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the tuned XGBoost model on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Xgb_confusion = confusion_matrix(y_test, XgbPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Xgb_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Xgb_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Tuned XGBoost feature importances, largest first.
# Feature names are taken from X.columns (the columns the model was trained on).
# The previous hard-coded list still contained the dropped 'Time'/'Amount' columns
# in the wrong positions, so every bar was mislabelled.
pred_Variables = list(X.columns)
tmp = pd.DataFrame({'Feature': pred_Variables, 'Feature importance': xgb_classifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# LightGBM with hand-picked hyper-parameters: fit on the resampled training data,
# then predict hard labels for the hold-out split.
lgb_params = dict(learning_rate=0.05, max_depth=4)
lgb_classifier = LGBMClassifier(**lgb_params).fit(X_train, y_train)
LgbPreds = lgb_classifier.predict(X_test)

In [None]:
# Hold-out metrics for the tuned LightGBM model.
# scikit-learn metrics take (y_true, y_pred/y_score). ROC-AUC is a ranking metric,
# so it is computed from the predicted fraud probability, not from hard labels
# (the previous call also had the two arguments swapped).
Lgb_F1_Score = f1_score(y_test, LgbPreds)
Lgb_ROC = roc_auc_score(y_test, lgb_classifier.predict_proba(X_test)[:, 1])
print('F1=', Lgb_F1_Score)
print('ROC =', Lgb_ROC)
ROC_results_2.append(Lgb_ROC)
f1_results_2.append(Lgb_F1_Score)

In [None]:
# 10-fold cross-validated ROC-AUC, once on the training data and once on the hold-out data.
# NOTE(review): X_train/y_train were SMOTE-resampled before this CV, so the
# training-data score is probably optimistic.
lgb_train_fold_scores = cross_val_score(lgb_classifier, X_train, y_train, cv=10, scoring='roc_auc')
lgb_test_fold_scores = cross_val_score(lgb_classifier, X_test, y_test, cv=10, scoring='roc_auc')
Lgb_cvs_score = lgb_train_fold_scores.mean()
Lgb_cvs_score_test = lgb_test_fold_scores.mean()
print(Lgb_cvs_score)
print(Lgb_cvs_score_test)
results_2.append(Lgb_cvs_score)

In [None]:
# ROC curve of the tuned LightGBM model on the (SMOTE-resampled) TRAINING data.
# The title now says so: the next cell plots the hold-out curve, and both
# figures previously carried the identical title 'Lgb_ROC_AUC'.
plot_roc_curve(lgb_classifier, X_train, y_train)
plt.title('Lgb_ROC_AUC (train)')
plt.show()

In [None]:
# ROC curve of the tuned LightGBM model on the hold-out split.
lgb_roc_display = plot_roc_curve(lgb_classifier, X_test, y_test)
lgb_roc_display.ax_.set_title('Lgb_ROC_AUC')
plt.show()

In [None]:
# Annotated confusion matrix for the tuned LightGBM model on the hold-out split.
# confusion_matrix takes (y_true, y_pred); in that order the flattened matrix reads
# [TN, FP, FN, TP], the order the labels below assume. With the arguments swapped,
# the 'False Pos' and 'False Neg' labels were attached to each other's cells.
Lgb_confusion = confusion_matrix(y_test, LgbPreds)
names = ['True Neg','False Pos','False Neg','True Pos']
counts = Lgb_confusion.flatten()
percentages = [f'{share:.2%}' for share in counts / counts.sum()]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_ax = sns.heatmap(Lgb_confusion,annot = labels,cmap = 'Blues',fmt ='')
cm_ax.set(xlabel='Predicted class', ylabel='Actual class');

In [None]:
# Tuned LightGBM feature importances, largest first.
# Names come from X.columns (the columns the model was trained on) rather than the
# hand-written pred_Variables list, which still contained the dropped
# 'Time'/'Amount' columns and therefore mislabelled every bar.
tmp = pd.DataFrame({'Feature': list(X.columns), 'Feature importance': lgb_classifier.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()

In [None]:
# Summary table of the tuned models: one row per algorithm, in the order the
# score lists were filled.
# NOTE(review): the lists are appended to by earlier cells, so re-running any of
# those cells makes the list lengths disagree with `algorithms`.
algorithms = ["Random Forest", "AdaBoost", "CatBoost", "Xgb","LGBM"]
metric_columns = ['Cross_Validation', 'ROC-AUC', "F1_Score"]
Dict = dict(zip(metric_columns, (results_2, ROC_results_2, f1_results_2)))
alg = pd.DataFrame(Dict, index = algorithms)
print(alg.loc[:, metric_columns])