In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings   
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression      
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../data/creditcard.csv')

In [3]:
data.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [4]:
x = data.drop(['id','Class'],axis=1)
y = data.Class

In [5]:
x.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,0.091202,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.233984,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,0.361652,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.378223,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,0.247237,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97


In [6]:
print('Shape of x',x.shape)
print('Shape of y',y.shape)

Shape of x (568630, 29)
Shape of y (568630,)


In [7]:
sc = StandardScaler()

In [8]:
x_scaled = sc.fit_transform(x) 
x_scaled_df = pd.DataFrame(x_scaled,columns=x.columns)
x_scaled_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,0.091202,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,0.858447
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.233984,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,-0.796369
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,0.361652,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,-1.377011
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.378223,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,-0.962119
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,0.247237,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,0.323285


In [9]:
x_train,x_test,y_train,y_test = train_test_split(x_scaled_df,y,test_size=0.25,random_state=15,stratify= y)

In [10]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(426472, 29)
(142158, 29)
(426472,)
(142158,)


In [11]:
lr=LogisticRegression()
lr.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [12]:
def model_eval(actual, predicted):
  acc_score = accuracy_score(actual, predicted)
  conf_matrix = confusion_matrix(actual, predicted)
  clas_rep = classification_report(actual, predicted)
  print('Model Accuracy is: ', round(acc_score, 2))
  print(conf_matrix)
  print(clas_rep)

In [13]:
preds_lr_train = lr.predict(x_train)
preds_lr_test = lr.predict(x_test)

In [14]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_lr_train)

-------Training Accuracy---------
Model Accuracy is:  0.96
[[208639   4597]
 [ 10333 202903]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97    213236
           1       0.98      0.95      0.96    213236

    accuracy                           0.96    426472
   macro avg       0.97      0.96      0.96    426472
weighted avg       0.97      0.96      0.96    426472



In [15]:
print('-------Test Accuracy---------')
model_eval(y_test, preds_lr_test)

-------Test Accuracy---------
Model Accuracy is:  0.96
[[69544  1535]
 [ 3516 67563]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96     71079
           1       0.98      0.95      0.96     71079

    accuracy                           0.96    142158
   macro avg       0.96      0.96      0.96    142158
weighted avg       0.96      0.96      0.96    142158



In [16]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
preds_dtree_train = dtree.predict(x_train)
preds_dtree_test = dtree.predict(x_test)

In [18]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_dtree_train)

-------Training Accuracy---------
Model Accuracy is:  1.0
[[213236      0]
 [     0 213236]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    213236
           1       1.00      1.00      1.00    213236

    accuracy                           1.00    426472
   macro avg       1.00      1.00      1.00    426472
weighted avg       1.00      1.00      1.00    426472



In [19]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_dtree_test)

-------Test Accuracy---------
Model Accuracy is:  1.0
[[70864   215]
 [   94 70985]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71079
           1       1.00      1.00      1.00     71079

    accuracy                           1.00    142158
   macro avg       1.00      1.00      1.00    142158
weighted avg       1.00      1.00      1.00    142158



In [20]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
preds_rf_train = rf.predict(x_train)
preds_rf_test = rf.predict(x_test)

In [22]:
print('-------Training Accuracy---------')
model_eval(y_train, preds_rf_train)

-------Training Accuracy---------
Model Accuracy is:  1.0
[[213236      0]
 [     0 213236]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    213236
           1       1.00      1.00      1.00    213236

    accuracy                           1.00    426472
   macro avg       1.00      1.00      1.00    426472
weighted avg       1.00      1.00      1.00    426472



In [23]:
print('-------Test Accuracy---------')
model_eval(y_test, preds_rf_test)

-------Test Accuracy---------
Model Accuracy is:  1.0
[[71061    18]
 [    0 71079]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71079
           1       1.00      1.00      1.00     71079

    accuracy                           1.00    142158
   macro avg       1.00      1.00      1.00    142158
weighted avg       1.00      1.00      1.00    142158



In [24]:
xgclf = xgb.XGBRFClassifier()
xgclf.fit(x_train,y_train)

0,1,2
,learning_rate,1.0
,subsample,0.8
,colsample_bynode,0.8
,reg_lambda,1e-05
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bytree,


In [25]:
preds_xgb_train = xgclf.predict(x_train)
preds_xgb_test = xgclf.predict(x_test)

In [26]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_xgb_train)

-------Training Accuracy---------
Model Accuracy is:  0.97
[[209872   3364]
 [  9681 203555]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    213236
           1       0.98      0.95      0.97    213236

    accuracy                           0.97    426472
   macro avg       0.97      0.97      0.97    426472
weighted avg       0.97      0.97      0.97    426472



In [27]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_xgb_test)

-------Test Accuracy---------
Model Accuracy is:  0.97
[[69952  1127]
 [ 3258 67821]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     71079
           1       0.98      0.95      0.97     71079

    accuracy                           0.97    142158
   macro avg       0.97      0.97      0.97    142158
weighted avg       0.97      0.97      0.97    142158



In [28]:
param_dist_xgb = {
    'n_estimators': [50,100,150,200,300,400],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6]
}

In [29]:
xgb_clf = RandomizedSearchCV(xgclf,param_dist_xgb,verbose = 2)

In [30]:
xgb_clf.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...learning_rate=0.3, max_depth=4, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.3, max_depth=4, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.3, max_depth=4, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.3, max_depth=4, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.3, max_depth=4, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=200; total time=   1.8s
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=200; total time=   1.9s
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=200; total time=   1.9s
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=200; total time=   1.9s
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=200; total time=   1.9s
[CV] END ....learning_rate=0.3, max_depth=3, n_estimators=50; total time=   0.5s
[CV] END ....learning_rate=0.3, max_depth=3, n_e

0,1,2
,estimator,"XGBRFClassifi...ate=None, ...)"
,param_distributions,"{'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 4, ...], 'n_estimators': [50, 100, ...]}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,learning_rate,0.2
,subsample,0.8
,colsample_bynode,0.8
,reg_lambda,1e-05
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bytree,


In [31]:
print('Best Parameters for XG Boost :',xgb_clf.best_params_)

Best Parameters for XG Boost : {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.2}


In [32]:
preds_xgb_clf_train = xgb_clf.predict(x_train)
preds_xgb_clf_test = xgb_clf.predict(x_test)

In [33]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_xgb_clf_train)

-------Training Accuracy---------
Model Accuracy is:  0.97
[[209940   3296]
 [  9818 203418]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    213236
           1       0.98      0.95      0.97    213236

    accuracy                           0.97    426472
   macro avg       0.97      0.97      0.97    426472
weighted avg       0.97      0.97      0.97    426472



In [34]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_xgb_clf_test)

-------Test Accuracy---------
Model Accuracy is:  0.97
[[69983  1096]
 [ 3314 67765]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     71079
           1       0.98      0.95      0.97     71079

    accuracy                           0.97    142158
   macro avg       0.97      0.97      0.97    142158
weighted avg       0.97      0.97      0.97    142158

