Importing the Dependencies

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [29]:
# Load the dataset into a pandas DataFrame.
# The path is relative, so 'creditcard.csv' has to be in the notebook's working directory.
data = pd.read_csv('creditcard.csv')

In [30]:
# Peek at the first two rows to confirm the columns loaded as expected.
data.head(n=2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0


In [31]:
# Peek at the last two rows as well.
data.tail(n=2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [32]:
# Number of rows per label in the full dataset.
data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

This dataset is highly imbalanced: only 492 of the 284,807 transactions (about 0.17%) are fraudulent.

Class 0 represents a normal (valid) transaction.

Class 1 represents a fraudulent transaction.

In [33]:
# Partition the rows by label: Class == 0 -> valid, Class == 1 -> fraud.
valid = data.loc[data['Class'] == 0]
fraud = data.loc[data['Class'] == 1]

In [34]:
# (rows, columns) of each partition, one per line.
print(valid.shape, fraud.shape, sep='\n')

(284315, 31)
(492, 31)


In [35]:
# Summary statistics of the Amount field for valid transactions.
valid['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [36]:
# Summary statistics of the Amount field for fraudulent transactions.
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [37]:
# Per-class mean of every column, to see which features separate the two labels.
data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# We'll Perform Under-Sampling

Build a sample dataset that contains an equal number of valid and fraudulent transactions.

Number of Fraudulent Transactions --> 492

In [38]:
# Under-sample the majority class: draw as many valid transactions as there are
# fraud rows (492 in this dataset) rather than hard-coding the count, and fix the
# seed so that Restart & Run All reproduces the same sample and downstream scores.
valid_sample = valid.sample(n=len(fraud), random_state=2)

Concatenating two DataFrames

In [39]:
# Stack the sampled valid rows on top of the fraud rows (row-wise concatenation).
new_dataset = pd.concat([valid_sample, fraud], axis='index')

In [40]:
# First five rows of the combined frame (these come from the valid sample).
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
27195,34434.0,0.935231,-0.640775,1.798167,1.727387,-1.380587,1.120568,-1.24395,0.669725,1.653708,...,0.001659,0.363272,-0.050644,0.219172,0.356819,-0.239785,0.103825,0.028184,37.5,0
18517,29562.0,1.422899,-0.467411,0.216285,-0.685022,-0.87378,-0.761465,-0.451497,-0.208163,-0.809064,...,-0.483639,-0.912195,0.04906,-0.097546,0.210814,0.979842,-0.05582,0.00543,10.0,0
131142,79508.0,1.188679,-0.485522,0.990296,-0.780954,-1.364299,-0.646853,-0.757089,0.172751,1.739461,...,0.03818,0.260555,-0.007762,0.524924,0.40009,-0.712413,0.08182,0.024462,1.0,0
219734,141856.0,-0.716489,-0.815458,-0.862605,-0.477557,1.718341,1.347224,0.781721,0.520588,0.330298,...,0.024048,-0.175514,0.745718,-2.422943,-0.760945,0.168527,0.245236,0.108554,209.37,0
282857,171198.0,-0.174957,1.069002,-0.98054,-0.609138,0.699183,-0.478241,0.580276,0.339158,0.058202,...,0.344579,1.024001,-0.103133,0.031675,-0.690969,-0.203081,0.36779,0.257249,9.99,0


In [41]:
# Last five rows of the combined frame (these come from the fraud rows).
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [42]:
# Confirm that both classes now have the same number of rows.
new_dataset.Class.value_counts()

0    492
1    492
Name: Class, dtype: int64

In [43]:
# Per-class column means on the under-sampled data, for comparison with the full dataset.
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95331.654472,0.106493,-0.076948,-0.054851,-0.050676,-0.092465,-0.018287,0.023211,0.020487,-0.05039,...,-0.001198,-0.011302,-0.036038,0.033487,-0.051859,0.013473,0.006132,-0.000388,0.011965,100.059492
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting the data into Features & Targets

In [44]:
# Features are every column except the label; the label column is the target.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.Class

In [45]:
# Let the notebook render the feature frame as the cell's output instead of
# print(), which falls back to a line-wrapped plain-text dump of the columns.
X

            Time        V1        V2        V3        V4        V5        V6  \
27195    34434.0  0.935231 -0.640775  1.798167  1.727387 -1.380587  1.120568   
18517    29562.0  1.422899 -0.467411  0.216285 -0.685022 -0.873780 -0.761465   
131142   79508.0  1.188679 -0.485522  0.990296 -0.780954 -1.364299 -0.646853   
219734  141856.0 -0.716489 -0.815458 -0.862605 -0.477557  1.718341  1.347224   
282857  171198.0 -0.174957  1.069002 -0.980540 -0.609138  0.699183 -0.478241   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [46]:
# Show the target labels as the cell's output value rather than via print().
Y

27195     0
18517     0
131142    0
219734    0
282857    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into Training data & Testing Data

In [47]:
# Hold out 20% of the balanced sample for testing. Stratifying on Y preserves the
# class proportions in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [48]:
# Shapes of the full feature matrix, the training split and the test split, in that order.
print(*(part.shape for part in (X, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


Model Training Using Logistic Regression

In [49]:
# Logistic regression classifier with scikit-learn's default settings.
# NOTE(review): Time and Amount are fed in unscaled alongside V1..V28 — confirm the
# solver converges without warnings; if it does not, scale the features or raise max_iter.
model = LogisticRegression()

In [50]:
# Fit the classifier on the training split only; the test split stays unseen.
model.fit(X=X_train, y=Y_train)

Model Evaluation Based on Accuracy Score

In [51]:
# Accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go
# first. Accuracy is symmetric, so the value is unchanged, but the correct order
# matters as soon as this call is swapped for an asymmetric metric such as
# precision or recall.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [52]:
# Report the fraction of training rows the fitted model labels correctly.
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9174078780177891


In [53]:
# Accuracy on the held-out test data.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go
# first. Accuracy is symmetric, so the value is unchanged, but the correct order
# matters as soon as this call is swapped for an asymmetric metric such as
# precision or recall.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [54]:
# Report the fraction of held-out test rows the fitted model labels correctly.
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9137055837563451
