In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/credit_data.csv')

In [4]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [10]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
210008,137803.0,-1.571064,-2.758091,0.314888,1.852422,3.903963,-1.31429,-1.527671,0.126093,0.815512,...,-0.083974,-0.623219,1.102707,-0.026264,-2.109336,-1.309397,0.196817,0.236764,18.0,0.0
210009,137803.0,-0.018165,0.079155,-0.173224,-1.891858,0.26255,0.092924,-0.120425,0.140978,-0.737219,...,0.396946,1.091351,-0.098603,-0.021136,-1.008299,-0.282105,0.157513,0.26344,39.99,0.0
210010,137803.0,-0.960193,0.854343,1.24788,0.820755,-0.146576,0.370667,-0.173084,0.698069,-0.341302,...,-0.025489,-0.270514,-0.099729,-0.547979,-0.384533,-0.747679,-0.041341,0.041936,25.0,0.0
210011,137803.0,1.832529,0.279144,-0.038382,3.643664,-0.004576,0.467475,-0.360268,0.248727,-0.672839,...,-0.287207,-0.952697,0.433825,-0.470534,-0.613443,-0.474556,-0.014152,-0.040156,18.96,0.0
210012,137803.0,1.55574,-1.014458,0.271766,1.548964,-1.352834,-0.006791,-0.888262,0.187306,1.559778,...,,,,,,,,,,


In [11]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210013 entries, 0 to 210012
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    210013 non-null  float64
 1   V1      210013 non-null  float64
 2   V2      210013 non-null  float64
 3   V3      210013 non-null  float64
 4   V4      210013 non-null  float64
 5   V5      210013 non-null  float64
 6   V6      210013 non-null  float64
 7   V7      210013 non-null  float64
 8   V8      210013 non-null  float64
 9   V9      210013 non-null  float64
 10  V10     210013 non-null  float64
 11  V11     210013 non-null  float64
 12  V12     210013 non-null  float64
 13  V13     210013 non-null  float64
 14  V14     210012 non-null  float64
 15  V15     210012 non-null  float64
 16  V16     210012 non-null  float64
 17  V17     210012 non-null  float64
 18  V18     210012 non-null  float64
 19  V19     210012 non-null  float64
 20  V20     210012 non-null  float64
 21  V21     21

In [12]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [17]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    209618
1.0       394
Name: Class, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [18]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [19]:
print(legit.shape)
print(fraud.shape)

(209618, 31)
(394, 31)


In [20]:
# statistical measures of the data
legit.Amount.describe()

count    209618.000000
mean         89.889280
std         248.067423
min           0.000000
25%           6.000000
50%          23.020000
75%          79.540000
max       19656.530000
Name: Amount, dtype: float64

In [21]:
fraud.Amount.describe()

count     394.000000
mean      123.172970
std       256.841767
min         0.000000
25%         1.000000
50%        13.385000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [22]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,73757.442119,-0.075414,-0.010262,0.280457,0.054282,-0.08378,0.0435,-0.02788,0.010674,0.012168,...,0.015175,-0.013111,-0.037335,-0.013681,0.003258,0.051649,0.00455,0.000703,0.001457,89.88928
1.0,62970.449239,-5.564086,4.145138,-7.653811,4.735781,-4.009483,-1.39332,-6.501846,0.66457,-2.742972,...,0.394589,0.778108,-0.01242,-0.048604,-0.067137,0.065962,0.038256,0.182273,0.06039,123.17297


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [23]:
legit_sample = legit.sample(n=492)

In [24]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [25]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
10371,16650.0,1.19986,0.153438,0.477112,1.409864,0.043604,0.45079,-0.304877,0.10086,1.937367,...,-0.458667,-0.923957,-0.00897,-0.881507,0.46691,-0.482395,0.01937,0.008167,11.28,0.0
166457,118093.0,1.970463,-1.131436,-1.509942,-0.992207,-0.504159,-0.64848,-0.383113,-0.191269,-0.741379,...,0.560216,1.299475,-0.081009,0.824036,0.123843,0.008477,-0.067171,-0.048288,125.75,0.0
97610,66296.0,-1.622251,2.257879,0.016618,1.589634,-1.721856,1.773535,-4.245506,-10.519861,-1.268709,...,-5.143928,1.316616,0.22275,0.327744,0.278374,-0.354858,0.149153,0.322394,150.0,0.0
60483,49345.0,-0.740512,1.15336,0.727456,1.032566,0.195592,0.804376,0.12352,0.781655,-0.529984,...,-0.001094,0.144173,-0.177627,-0.824484,-0.014009,-0.183897,0.290016,0.122174,20.61,0.0
147666,88817.0,1.836252,-0.566344,-0.002947,1.201871,-0.568528,0.608401,-0.92565,0.175063,1.362657,...,0.242373,0.795158,0.001789,-1.056979,-0.182497,-0.529107,0.090166,-0.013274,77.4,0.0


In [26]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
203700,134928.0,1.204934,3.23807,-6.010324,5.720847,1.5484,-2.321064,-0.78188,0.076619,-2.976249,...,0.098341,-0.845866,-0.031228,0.421146,0.388361,0.056035,0.491828,0.340847,0.0,1.0
204064,135095.0,0.232512,0.938944,-4.64778,3.079844,-1.902655,-1.041408,-1.020407,0.547069,-1.10599,...,0.911373,1.042929,0.999394,0.90126,-0.452093,0.192959,0.180859,-0.029315,345.0,1.0
204079,135102.0,1.862102,-0.124052,-1.989752,0.382609,0.473032,-0.674517,0.298621,-0.282416,0.802053,...,-0.204158,-0.511441,0.077874,0.388335,0.007896,-0.12098,-0.019579,0.006155,108.51,1.0
204503,135314.0,-3.15899,1.765452,-3.390168,0.98741,-1.50993,-1.280952,-2.719557,0.718325,-1.660183,...,1.120533,1.605085,-0.618637,-0.251283,-0.240528,-0.004327,-0.235477,0.018129,84.28,1.0
208651,137211.0,0.630579,1.183631,-5.066283,2.179903,-0.703376,-0.103614,-3.49035,1.094734,-0.717418,...,0.621622,0.043807,0.102711,-0.601505,0.127371,-0.163009,0.853792,0.356503,39.45,1.0


In [27]:
new_dataset['Class'].value_counts()

0.0    492
1.0    394
Name: Class, dtype: int64

In [28]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,73460.908537,-0.038335,0.138038,0.241427,0.134658,-0.132987,-0.009567,-0.121306,-0.128866,-0.004367,...,0.022482,-0.110383,-0.028898,0.002602,0.009315,0.012292,0.023752,0.001884,0.038101,68.434553
1.0,62970.449239,-5.564086,4.145138,-7.653811,4.735781,-4.009483,-1.39332,-6.501846,0.66457,-2.742972,...,0.394589,0.778108,-0.01242,-0.048604,-0.067137,0.065962,0.038256,0.182273,0.06039,123.17297


In [29]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [30]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
10371    16650.0  1.199860  0.153438  0.477112  1.409864  0.043604  0.450790   
166457  118093.0  1.970463 -1.131436 -1.509942 -0.992207 -0.504159 -0.648480   
97610    66296.0 -1.622251  2.257879  0.016618  1.589634 -1.721856  1.773535   
60483    49345.0 -0.740512  1.153360  0.727456  1.032566  0.195592  0.804376   
147666   88817.0  1.836252 -0.566344 -0.002947  1.201871 -0.568528  0.608401   
...          ...       ...       ...       ...       ...       ...       ...   
203700  134928.0  1.204934  3.238070 -6.010324  5.720847  1.548400 -2.321064   
204064  135095.0  0.232512  0.938944 -4.647780  3.079844 -1.902655 -1.041408   
204079  135102.0  1.862102 -0.124052 -1.989752  0.382609  0.473032 -0.674517   
204503  135314.0 -3.158990  1.765452 -3.390168  0.987410 -1.509930 -1.280952   
208651  137211.0  0.630579  1.183631 -5.066283  2.179903 -0.703376 -0.103614   

              V7         V8        V9  

In [31]:
print(Y)

10371     0.0
166457    0.0
97610     0.0
60483     0.0
147666    0.0
         ... 
203700    1.0
204064    1.0
204079    1.0
204503    1.0
208651    1.0
Name: Class, Length: 886, dtype: float64


In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [33]:
print(X.shape, X_train.shape, X_test.shape)

(886, 30) (708, 30) (178, 30)


Model Training

Logistic Regression

In [34]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Model Evaluation

Accuracy Score

In [40]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [41]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9562146892655368


In [42]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [43]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9213483146067416
