Importing the Dependencies

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [8]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [9]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
79313,57966,-0.527571,-3.483241,-0.206444,-0.583181,-2.067419,-0.129826,0.454118,-0.149176,1.863281,...,0.691203,0.22447,-0.875993,0.204226,0.079542,0.029189,-0.10922,0.178251,912.79,0.0
79314,57966,-0.541422,1.341092,0.576882,0.733835,0.259657,-0.233639,0.712069,0.190731,-1.067355,...,0.189997,0.493842,-0.141807,-0.406546,-0.043605,-0.28751,0.045194,0.100293,49.33,0.0
79315,57966,-0.830218,0.971781,0.974916,-1.348723,-0.230572,-0.778253,0.395464,0.109896,0.234581,...,-0.115767,-0.33595,-0.068152,-0.109514,-0.204347,0.7035,-0.38353,-0.22378,14.6,0.0
79316,57966,1.104838,-1.141651,0.942113,-0.661271,-1.591229,-0.056758,-1.19257,0.275125,-0.4479,...,0.508825,1.146845,-0.117753,0.241582,0.212189,-0.054096,0.023744,0.026206,99.0,0.0
79317,57967,-3.560685,3.485801,-0.070146,2.191571,-0.429913,1.075498,-0.935968,-2.147517,-1.165398,...,,,,,,,,,,


In [10]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79318 entries, 0 to 79317
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    79318 non-null  int64  
 1   V1      79318 non-null  float64
 2   V2      79318 non-null  float64
 3   V3      79318 non-null  float64
 4   V4      79318 non-null  float64
 5   V5      79318 non-null  float64
 6   V6      79318 non-null  float64
 7   V7      79318 non-null  float64
 8   V8      79318 non-null  float64
 9   V9      79318 non-null  float64
 10  V10     79318 non-null  float64
 11  V11     79317 non-null  float64
 12  V12     79317 non-null  float64
 13  V13     79317 non-null  float64
 14  V14     79317 non-null  float64
 15  V15     79317 non-null  float64
 16  V16     79317 non-null  float64
 17  V17     79317 non-null  float64
 18  V18     79317 non-null  float64
 19  V19     79317 non-null  float64
 20  V20     79317 non-null  float64
 21  V21     79317 non-null  float64
 22

In [11]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [12]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,79126
1.0,191


This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [13]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [14]:
print(legit.shape)
print(fraud.shape)

(79126, 31)
(191, 31)


In [15]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,79126.0
mean,97.773259
std,269.765878
min,0.0
25%,7.7
50%,26.925
75%,89.0
max,19656.53


In [16]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,191.0
mean,95.142251
std,214.122813
min,0.0
25%,1.0
50%,7.52
75%,99.99
max,1809.68


In [17]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,37164.532164,-0.241928,-0.047016,0.701122,0.152361,-0.264869,0.100851,-0.097874,0.046041,0.000222,...,0.041575,-0.030941,-0.10567,-0.03743,0.007877,0.134383,0.026147,0.000715,0.002542,97.773259
1.0,32448.565445,-6.660361,4.685496,-8.847226,5.220761,-4.885698,-2.012799,-7.016277,3.141649,-3.135469,...,0.378115,0.779382,-0.158471,-0.220629,-0.091717,0.23757,0.096534,0.58607,0.053246,95.142251


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [18]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [19]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [20]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
31729,36492,-3.76106,-2.337649,0.966992,-0.759637,-2.128829,-0.312842,-0.55467,1.11995,-0.568886,...,0.226903,-0.07969,-0.311079,0.570813,0.39413,-0.362292,0.317182,-0.361611,329.1,0.0
27638,34639,1.131626,-0.088298,1.172179,1.152068,-0.764832,0.305715,-0.664862,0.254449,0.646994,...,-0.031167,0.083674,-0.05888,0.002381,0.429609,-0.406019,0.068524,0.024929,9.99,0.0
18928,29873,1.472433,-0.933125,0.721166,-1.418353,-1.629346,-0.793324,-1.051318,-0.088932,-2.134612,...,-0.393444,-0.798171,0.232671,0.47127,0.071631,-0.509554,0.035277,0.018688,2.95,0.0
44122,41804,-0.700458,0.422444,3.482113,3.151253,-0.316988,1.780377,-0.669963,0.413248,0.457892,...,0.004406,0.76076,-0.455606,-0.353992,0.356581,0.573155,0.253389,-0.059369,12.5,0.0
34101,37513,-0.555987,0.384655,0.526214,0.220699,1.102909,1.147611,1.012099,0.354005,-0.353629,...,0.344071,1.090052,0.322032,-0.950776,-0.918436,-0.494331,0.260999,0.224535,102.46,0.0


In [21]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
76929,56806,0.016828,2.400826,-4.22036,3.462217,-0.624142,-1.294303,-2.986028,0.751883,-1.606672,...,0.285832,-0.771508,-0.2652,-0.873077,0.939776,-0.219085,0.874494,0.470434,1.0,1.0
77099,56887,-0.075483,1.812355,-2.566981,4.127549,-1.628532,-0.805895,-3.390135,1.019353,-2.451251,...,0.794372,0.270471,-0.143624,0.013566,0.634203,0.213693,0.773625,0.387434,5.0,1.0
77348,57007,-1.271244,2.462675,-2.851395,2.32448,-1.372245,-0.948196,-3.065234,1.166927,-2.268771,...,0.652941,0.081931,-0.221348,-0.523582,0.224228,0.756335,0.6328,0.250187,0.01,1.0
77387,57027,-2.335655,2.22538,-3.37945,2.178538,-3.568264,0.316814,-1.734948,1.449139,-1.980033,...,0.78554,0.297412,0.308536,-0.598416,-0.12185,-0.491018,0.701606,0.206966,444.17,1.0
77682,57163,-10.363049,4.543672,-9.795898,5.508003,-6.037156,-0.133493,-11.724346,-3.198346,-4.767842,...,-2.457145,1.687257,0.977178,-0.543369,-0.289125,-0.107586,0.330642,0.163577,1.0,1.0


In [22]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,191


In [23]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,37226.006098,-0.412665,-0.106572,0.638343,0.192178,-0.160299,0.049095,-0.174438,0.050316,0.11979,...,-0.028704,-0.071871,-0.092759,-0.03431,-0.02247,0.151294,-0.006947,-0.04099,0.064714,87.444309
1.0,32448.565445,-6.660361,4.685496,-8.847226,5.220761,-4.885698,-2.012799,-7.016277,3.141649,-3.135469,...,0.378115,0.779382,-0.158471,-0.220629,-0.091717,0.23757,0.096534,0.58607,0.053246,95.142251


Splitting the data into Features & Targets

In [24]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [25]:
print(X)

        Time         V1        V2        V3        V4        V5        V6  \
31729  36492  -3.761060 -2.337649  0.966992 -0.759637 -2.128829 -0.312842   
27638  34639   1.131626 -0.088298  1.172179  1.152068 -0.764832  0.305715   
18928  29873   1.472433 -0.933125  0.721166 -1.418353 -1.629346 -0.793324   
44122  41804  -0.700458  0.422444  3.482113  3.151253 -0.316988  1.780377   
34101  37513  -0.555987  0.384655  0.526214  0.220699  1.102909  1.147611   
...      ...        ...       ...       ...       ...       ...       ...   
76929  56806   0.016828  2.400826 -4.220360  3.462217 -0.624142 -1.294303   
77099  56887  -0.075483  1.812355 -2.566981  4.127549 -1.628532 -0.805895   
77348  57007  -1.271244  2.462675 -2.851395  2.324480 -1.372245 -0.948196   
77387  57027  -2.335655  2.225380 -3.379450  2.178538 -3.568264  0.316814   
77682  57163 -10.363049  4.543672 -9.795898  5.508003 -6.037156 -0.133493   

              V7        V8        V9  ...       V20       V21       V22  \


In [26]:
print(Y)

31729    0.0
27638    0.0
18928    0.0
44122    0.0
34101    0.0
        ... 
76929    1.0
77099    1.0
77348    1.0
77387    1.0
77682    1.0
Name: Class, Length: 683, dtype: float64


Split the data into Training data & Testing Data

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [28]:
print(X.shape, X_train.shape, X_test.shape)

(683, 30) (546, 30) (137, 30)


Model Training

Logistic Regression

In [29]:
model = LogisticRegression()

In [30]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [31]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [32]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9725274725274725


In [33]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [34]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.927007299270073
