In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
data.info(())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# missing values in each column
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# distribution of legit and fraudulent transaction
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [6]:
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [7]:
#seperating data
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

In [8]:
print(legit.shape,fraud.shape)

(284315, 31) (492, 31)


In [9]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [10]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [11]:
# compare values for both transaction
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Under-sampling

build a sample dataset containing similar distribution of normal and fraudalent transaction
Number of fraudalant transaction = 492


In [12]:
legit_sample = legit.sample(n=492)

In [13]:
#concanatinmg two data set
df = pd.concat([legit_sample,fraud],axis = 0)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
89132,62440.0,-1.065682,1.225237,1.434769,-0.581327,0.10198,-0.721008,0.514357,0.350528,-0.770405,...,-0.19245,-0.812468,-0.200971,-0.129838,0.313254,0.042425,-0.136234,-0.018203,1.79,0
104940,69297.0,1.276014,-0.672705,-0.425494,-0.777398,-0.582088,-0.880396,-0.103505,-0.203036,-1.241653,...,0.256003,0.408211,-0.274815,0.028707,0.743485,-0.087089,-0.054869,-7e-06,92.68,0
157072,109463.0,-0.317202,1.07087,0.138569,-0.173519,0.308527,-1.173072,1.031676,-0.312604,1.09467,...,0.274317,1.392739,0.055519,0.5346,-0.825099,-0.305736,0.614108,0.358774,38.23,0
273437,165587.0,2.062781,-0.168869,-0.729689,0.039198,0.046223,0.040729,-0.414819,-0.045695,1.138833,...,-0.156679,-0.273709,0.279176,0.012023,-0.198037,-0.918825,0.063181,-0.024279,1.0,0
121932,76373.0,-2.612299,-0.34826,1.346331,-0.40767,0.954306,-0.865407,0.938943,-0.330892,0.034879,...,-0.636763,-0.662938,0.793265,-0.066059,0.526914,0.023036,0.010116,-0.326678,44.99,0


In [14]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,...,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0
mean,86333.461382,-2.380215,1.767374,-3.540488,2.270745,-1.522284,-0.672158,-2.824288,0.294064,-1.332808,...,0.355237,0.029935,-0.017196,-0.065711,0.031092,0.016812,0.094755,0.027488,102.569289,0.5
std,47469.251095,5.533564,3.733014,6.214942,3.204364,4.228128,1.780816,5.830505,4.890862,2.289547,...,2.780967,1.166982,1.188998,0.577631,0.66581,0.474776,1.005039,0.427942,222.371829,0.500254
min,406.0,-30.55238,-11.34971,-31.103685,-3.838025,-22.105532,-6.406267,-43.557242,-41.044261,-13.434066,...,-22.797604,-8.887017,-19.254328,-2.541474,-4.781606,-1.255235,-7.263482,-2.260319,0.0,0.0
25%,45869.0,-2.939486,-0.231729,-5.138122,-0.136409,-1.774345,-1.599839,-3.107826,-0.191029,-2.324444,...,-0.157752,-0.507584,-0.255934,-0.403344,-0.320602,-0.296336,-0.059497,-0.05968,1.0,0.0
50%,78524.5,-0.736592,0.968696,-1.419319,1.38065,-0.409926,-0.639088,-0.716324,0.163747,-0.773543,...,0.157976,0.053311,-0.033742,-0.004016,0.052677,-0.029397,0.042349,0.029711,15.725,0.5
75%,129945.5,1.080633,2.814266,0.327225,4.250632,0.494301,0.109062,0.228613,0.867753,0.08809,...,0.648533,0.567171,0.185901,0.36705,0.401687,0.303328,0.454884,0.188068,99.99,1.0
max,172356.0,2.334606,22.057729,2.827994,12.114672,11.095089,6.474115,5.802537,20.007208,4.609397,...,27.202839,8.361985,6.389925,1.133652,2.208209,2.745261,3.052358,1.784166,2125.87,1.0


In [15]:
df['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [16]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,91920.115854,0.011518,-0.089029,-0.047695,-0.00054,0.106657,0.053421,-0.079845,0.017493,-0.084493,...,0.009494,-0.003115,0.04582,0.005915,-0.026292,0.020735,-0.018024,0.018935,-0.020692,82.927256
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [17]:
X = df.drop(columns = 'Class',axis = 1)
Y = df['Class']

In [18]:
print(X,Y)

            Time        V1        V2        V3        V4        V5        V6  \
89132    62440.0 -1.065682  1.225237  1.434769 -0.581327  0.101980 -0.721008   
104940   69297.0  1.276014 -0.672705 -0.425494 -0.777398 -0.582088 -0.880396   
157072  109463.0 -0.317202  1.070870  0.138569 -0.173519  0.308527 -1.173072   
273437  165587.0  2.062781 -0.168869 -0.729689  0.039198  0.046223  0.040729   
121932   76373.0 -2.612299 -0.348260  1.346331 -0.407670  0.954306 -0.865407   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [19]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [20]:
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

(787, 30) (787,) (197, 30) (197,)


In [21]:
# Model training

In [22]:
# logistic Regression

In [54]:
model = LogisticRegression()

In [55]:
model.fit(X_train,Y_train)

LogisticRegression()

In [56]:
# evaluation

In [57]:
X_train_prediction=model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)
print(training_data_accuracy)

0.9123252858958069


In [58]:
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(test_data_accuracy)
print(precision_score(Y_test,model.predict(X_test)))

0.8883248730964467
0.9222222222222223


In [28]:
a = X_test.iloc[0]
a=np.array(a)
a.reshape(1,30)

array([[ 1.53761000e+05,  1.14625927e+00,  1.40345824e+00,
        -4.15914819e+00,  2.66010706e+00, -3.23216827e-01,
        -1.83607089e+00, -1.62374006e+00,  2.59562498e-01,
        -1.13204415e+00, -3.35647371e+00,  3.64647784e+00,
        -3.00268416e+00, -6.47500580e-01, -5.94500331e+00,
         1.74650037e-01, -1.47563951e+00, -3.08227353e+00,
         2.24739755e-01, -3.00930779e-01,  2.84830656e-01,
         5.64449516e-01,  4.45743912e-01, -1.41136197e-01,
        -2.65517116e-01,  3.62259920e-01, -4.16062098e-01,
         5.07369853e-01,  2.43744114e-01,  5.13700000e+01]])

In [29]:
model.predict([a])[0]



1

In [30]:
Y_test

247995    1
27324     0
207233    0
185560    0
15451     1
         ..
108258    1
12108     1
99506     1
74496     1
19342     0
Name: Class, Length: 197, dtype: int64

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
model = DecisionTreeClassifier(random_state=0, max_depth=4)

In [33]:
model.fit(X_train,Y_train)

DecisionTreeClassifier(max_depth=4, random_state=0)

In [35]:
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(model.predict(X_test),Y_test)
print(test_data_accuracy)

0.9187817258883249


In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

In [51]:
model = RandomForestClassifier()

In [52]:
model.fit(X_train,Y_train)

RandomForestClassifier()

In [53]:
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(model.predict(X_test),Y_test)
print(test_data_accuracy)
print(precision_score(Y_test,model.predict(X_test)))

0.9238578680203046
0.9662921348314607


In [41]:
from sklearn.svm import SVC
model = SVC(kernel='sigmoid', gamma=1)
model.fit(X_train,Y_train)
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(model.predict(X_test),Y_test)
print(test_data_accuracy)

0.49746192893401014


In [42]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train,Y_train)
X_test_prediction=model.predict(X_test)
test_data_accuracy = accuracy_score(model.predict(X_test),Y_test)
print(test_data_accuracy)

0.6903553299492385
