<h2> Importing Libraries </h2>

<h4> Note: Unzip the dataset file before use </h4>

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

<h2> Data Collection and Preprocessing </h2>

In [2]:
dataset = pd.read_csv('creditcard.csv')

In [3]:
dataset.shape

(284807, 31)

In [4]:
dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
dataset['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [6]:
X = dataset.drop(columns='Class', axis = 1)
y = dataset['Class']

<h4> Standardistaion </h4>

In [4]:
scalar = StandardScaler()
scalar.fit(X)
X = scalar.transform(X)

<h4> Train Test Split </h4>

In [16]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

<h4> Hyperparameter Tuning </h4>

In [17]:
model = XGBClassifier(scale_pos_weight = (len(ytrain)-np.sum(ytrain))/np.sum(ytrain))

In [36]:
params = {
    "learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5],
    "max_depth" : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    "min_child_weight" : [1, 3, 5, 7, 9, 11, 13],
    "gamma" : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
}

randomSearch = RandomizedSearchCV(model, param_distributions=params, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=10, verbose=3)

randomSearch.fit(Xtrain, ytrain)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.954 total time=  27.1s
[CV 8/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.987 total time=  27.4s
[CV 6/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.973 total time=  27.4s
[CV 7/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.973 total time=  27.4s
[CV 5/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.992 total time=  27.5s
[CV 4/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.975 total time=  27.5s
[CV 9/10] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=13;, score=0.992 total time=  27.6s
[CV 3/10] END

In [37]:
randomSearch.best_params_

{'min_child_weight': 7,
 'max_depth': 10,
 'learning_rate': 0.25,
 'gamma': 0.3,
 'colsample_bytree': 0.8}

<h2> Model Training </h2>

In [38]:
model = XGBClassifier(colsample_bytree=0.8, gamma=0.3, learning_rate=0.25, max_depth=10, min_child_weight=7, n_estimators=100, scale_pos_weight = (len(y)-np.sum(y))/np.sum(y))

In [39]:
model.fit(Xtrain, ytrain)

In [40]:
p = model.predict(Xtrain)
pre=metrics.precision_score(ytrain, p)
re = metrics.recall_score(ytrain, p)
roc = metrics.roc_auc_score(ytrain, p)
f1 = metrics.f1_score(ytrain, p)
print(pre, " ", re, " ", roc, " ", f1)

0.9899497487437185   1.0   0.9999912068973097   0.9949494949494949


In [41]:
ptest = model.predict(Xtest)
pret=metrics.precision_score(ytest, ptest)
ret = metrics.recall_score(ytest, ptest)
roct = metrics.roc_auc_score(ytest, ptest)
f1t = metrics.f1_score(ytest, ptest)
print(pret, " ", ret, " ", roct, " ", f1t)

0.8556701030927835   0.8469387755102041   0.923346287023532   0.8512820512820514


<h2> Making Predictions <h2>

In [68]:
yi = dataset[dataset['Class'] == 1]
yi = yi.drop(columns=['Class'], axis=1)

In [69]:
print(yi)

            Time        V1        V2        V3        V4        V5        V6  \
541        406.0 -2.312227  1.951992 -1.609851  3.997906 -0.522188 -1.426545   
623        472.0 -3.043541 -3.157307  1.088463  2.288644  1.359805 -1.064823   
4920      4462.0 -2.303350  1.759247 -0.359745  2.330243 -0.821628 -0.075788   
6108      6986.0 -4.397974  1.358367 -2.592844  2.679787 -1.128131 -1.706536   
6329      7519.0  1.234235  3.019740 -4.304597  4.732795  3.624201 -1.357746   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [70]:
yi = scalar.transform(yi)

In [71]:
nyi = np.asarray(yi)

In [72]:
print(nyi)

[[-1.98803351 -1.180495    1.18209005 ...  0.6469882  -0.43406056
  -0.35322939]
 [-1.98664368 -1.55386353 -1.91200646 ... -0.62624682  0.10834929
   1.7617582 ]
 [-1.90262257 -1.17596291  1.06536753 ...  0.09802496 -0.46360746
   0.60603143]
 ...
 [ 1.56959742 -0.34520105  0.68210633 ...  0.95410585  0.58882664
  -0.04181846]
 [ 1.58254804 -1.58975021  0.35478844 ...  2.19228408 -0.76859627
   0.62630172]
 [ 1.59059217  1.01699283  0.09597005 ...  0.00740175 -0.04637872
  -0.18319079]]


In [83]:
ny = nyi[101].reshape(1, -1)
nx = X[6556].reshape(1, -1)

In [84]:
model.predict(ny)

array([1])