In [18]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
import pickle

In [3]:
df=pd.read_csv('creditcard.csv')

In [4]:
# to see all the columns in dataset

pd.options.display.max_columns=None

In [5]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# data Preprocessing


In [6]:
df['Class'].value_counts()

# this is an imbalance data we have to apply undersampling or oversampling to balance it

Class
0    284315
1       492
Name: count, dtype: int64

In [7]:
# we are going to standardize the amount column so that it can scale with other columns
ss =  StandardScaler()
df['Amount']=ss.fit_transform(df[['Amount']])

In [8]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [9]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,2.239053e-15,1.673327e-15,-1.247012e-15,8.190001e-16,1.207294e-15,4.887456e-15,1.437716e-15,-3.772171e-16,9.564149e-16,1.039917e-15,6.406204e-16,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,2.9139520000000004e-17,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,1.020713,0.9992014,0.9952742,0.9585956,0.915316,0.8762529,0.8493371,0.8381762,0.8140405,0.770925,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,1.000002,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,-4.797473,-18.68371,-5.791881,-19.21433,-4.498945,-14.12985,-25.1628,-9.498746,-7.213527,-54.49772,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,-0.3532294,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,-0.5354257,-0.7624942,-0.4055715,-0.6485393,-0.425574,-0.5828843,-0.4680368,-0.4837483,-0.4988498,-0.4562989,-0.2117214,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,-0.3308401,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,-0.09291738,-0.03275735,0.1400326,-0.01356806,0.05060132,0.04807155,0.06641332,-0.06567575,-0.003636312,0.003734823,-0.06248109,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,-0.2652715,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,0.4539234,0.7395934,0.618238,0.662505,0.4931498,0.6488208,0.5232963,0.399675,0.5008067,0.4589494,0.1330408,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,-0.04471707,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,12.01891,7.848392,7.126883,10.52677,8.877742,17.31511,9.253526,5.041069,5.591971,39.4209,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,102.3622,1.0


In [10]:
df.shape

(284807, 31)

In [11]:
x=df.drop(['Class','Time'],axis=1)
y=df['Class']

# Over Sampling

In [12]:
ros = RandomOverSampler(random_state=2, sampling_strategy='auto')

In [13]:
x_ros,y_ros=ros.fit_resample(x,y)
y_ros.value_counts()

Class
0    284315
1    284315
Name: count, dtype: int64

In [66]:
x_train,x_test,y_train,y_test = train_test_split(x_ros,y_ros,test_size=0.3)

In [16]:
models = {
    'decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression()
}

for model_name, model in models.items():
    print(f'\n================ {model_name} ======================\n')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f'Accuracy Score: {accuracy_score(y_pred,y_test)}')
    print(f'Precision: {precision_score(y_pred,y_test)}')
    print(f'f1 Score: {f1_score(y_pred,y_test)}')
    print(f'recall Score: {recall_score(y_pred,y_test)}')



Accuracy Score: 0.9997303460363799
Precision: 1.0
f1 Score: 0.9997296884365414
recall Score: 0.9994595229702737


Accuracy Score: 0.9493578132235959
Precision: 0.9218470798457632
f1 Score: 0.9477914559046601
recall Score: 0.9752384742621911


In [17]:
# picking the decision tree model

dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

In [19]:
pickle.dump(dtc,open('dtc_model.pkl','wb'))

In [20]:
pickle.dump(ss,open('StandardScaler.pkl','wb'))

# Under Sampling

In [21]:
rus = RandomUnderSampler(random_state=2, sampling_strategy='majority')

In [25]:
x_rus,y_rus=rus.fit_resample(x,y)
y_rus.value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x_rus,y_rus,test_size=0.3)

In [27]:
models = {
    'decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression()
}

for model_name, model in models.items():
    print(f'\n================ {model_name} ======================\n')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f'Accuracy Score: {accuracy_score(y_pred,y_test)}')
    print(f'Precision: {precision_score(y_pred,y_test)}')
    print(f'f1 Score: {f1_score(y_pred,y_test)}')
    print(f'recall Score: {recall_score(y_pred,y_test)}')



Accuracy Score: 0.8986486486486487
Precision: 0.922077922077922
f1 Score: 0.9044585987261146
recall Score: 0.8875


Accuracy Score: 0.9527027027027027
Precision: 0.9285714285714286
f1 Score: 0.9533333333333334
recall Score: 0.9794520547945206


In [28]:
# Over sampling work instead of under sampling so we will go with over sampling

In [64]:
dtc.predict(x_ros.iloc[20080].values.reshape(1,-1))



array([0], dtype=int64)

In [41]:
np.array([-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62]).reshape(1,-1)

array([[-1.35980713e+00, -7.27811733e-02,  2.53634674e+00,
         1.37815522e+00, -3.38320770e-01,  4.62387778e-01,
         2.39598554e-01,  9.86979013e-02,  3.63786970e-01,
         9.07941720e-02, -5.51599533e-01, -6.17800856e-01,
        -9.91389847e-01, -3.11169354e-01,  1.46817697e+00,
        -4.70400525e-01,  2.07971242e-01,  2.57905802e-02,
         4.03992960e-01,  2.51412098e-01, -1.83067779e-02,
         2.77837576e-01, -1.10473910e-01,  6.69280749e-02,
         1.28539358e-01, -1.89114844e-01,  1.33558377e-01,
        -2.10530535e-02,  1.49620000e+02]])

In [45]:
df.drop(['Time','Class'],axis=1).iloc[0].values

array([-1.35980713, -0.07278117,  2.53634674,  1.37815522, -0.33832077,
        0.46238778,  0.23959855,  0.0986979 ,  0.36378697,  0.09079417,
       -0.55159953, -0.61780086, -0.99138985, -0.31116935,  1.46817697,
       -0.47040053,  0.20797124,  0.02579058,  0.40399296,  0.2514121 ,
       -0.01830678,  0.27783758, -0.11047391,  0.06692807,  0.12853936,
       -0.18911484,  0.13355838, -0.02105305,  0.24496426])

In [65]:
y_ros.value_counts()

Class
0    284315
1    284315
Name: count, dtype: int64

Class
0    284315
1       492
Name: count, dtype: int64