First, import the required modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
log = LogisticRegression()

Load the required dataset (i.e. creditcard)

In [3]:
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Now, check if there are null values in the dataset

In [4]:
data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

From the above description we can say that the dataset has no missing values.

In [6]:
data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

As shown above, there is a lot of difference in the distribution of data.

Let's separate the data of target class (i.e. Class column) for analysis.

It's given that '0' is for legit transaction and '1' is for fraud transaction in the description of dataset.

In [7]:
#separating the data for analysis
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

In [8]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [9]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

From the above description, it's clear that the data is distributed unbalanced.

In [10]:
# Compare the values for both transactions
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Applying UnderSampling

Here we are using to equate the sample of fraud and legit transactions as legit _sample >> fraud_sample.

Creating a new legit sample with sample size=492 as the fraud sample size is 492.

In [11]:
#Take the legit transaction sample size=492 as the fraud transaction sample size=492
sample_legit = legit.sample(n=492)

In [12]:
#Concat the sample_legit and fraund as one dataset
new_df = pd.concat([sample_legit,fraud], axis=0)

In [13]:
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
253286,156177.0,-0.356156,-0.224752,1.356568,-2.454605,-0.441624,-0.259961,0.089227,0.056194,-0.942575,...,-0.494318,-1.016657,0.148835,-0.355651,-0.747707,0.457955,0.083475,0.140329,43.95,0
206289,136152.0,-15.329347,1.517809,-10.115838,0.970134,-3.900967,-0.288366,2.156707,1.233429,6.325787,...,-2.261459,-0.176708,-2.427285,0.424,0.933384,-0.787926,-0.26281,2.292313,144.0,0
161179,113954.0,1.943036,0.058374,-3.164921,0.39219,2.931009,3.2358,-0.129606,0.755048,0.09164,...,-0.199926,-0.600387,0.191218,0.553598,0.128396,-0.756034,0.029588,-0.023273,33.75,0
221549,142606.0,1.824009,-0.927934,-0.376237,0.161429,-0.664074,0.609094,-1.039845,0.368005,1.3361,...,0.083643,0.075435,0.202927,0.077609,-0.576775,0.353705,-0.036954,-0.034308,89.8,0
242231,151403.0,-0.060003,0.590023,1.411522,-0.241472,-0.126207,-0.340733,0.310149,-0.030642,0.336482,...,0.344489,1.228925,-0.300912,0.053249,0.055476,-0.067491,0.098941,0.038168,7.48,0


In [14]:
new_df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


Let's check the distribution of new_data sample 

In [15]:
new_df['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

Here the distribution is exactly equal.

In [16]:
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,97435.347561,-0.134528,-0.111105,-0.112494,-0.140854,0.033644,0.034024,0.074956,-0.024225,0.037428,...,-0.000704,0.039259,-0.003847,0.0168,0.020088,0.020127,-0.010741,0.00274,-0.010304,108.158272
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Here also there  is a difference between the means of fraud and legit transactions but not much.

In [17]:
x = new_df.drop(columns='Class')
y = new_df['Class']

In [18]:
x.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
253286,156177.0,-0.356156,-0.224752,1.356568,-2.454605,-0.441624,-0.259961,0.089227,0.056194,-0.942575,...,-0.328916,-0.494318,-1.016657,0.148835,-0.355651,-0.747707,0.457955,0.083475,0.140329,43.95
206289,136152.0,-15.329347,1.517809,-10.115838,0.970134,-3.900967,-0.288366,2.156707,1.233429,6.325787,...,1.000485,-2.261459,-0.176708,-2.427285,0.424,0.933384,-0.787926,-0.26281,2.292313,144.0
161179,113954.0,1.943036,0.058374,-3.164921,0.39219,2.931009,3.2358,-0.129606,0.755048,0.09164,...,-0.113384,-0.199926,-0.600387,0.191218,0.553598,0.128396,-0.756034,0.029588,-0.023273,33.75
221549,142606.0,1.824009,-0.927934,-0.376237,0.161429,-0.664074,0.609094,-1.039845,0.368005,1.3361,...,-0.044341,0.083643,0.075435,0.202927,0.077609,-0.576775,0.353705,-0.036954,-0.034308,89.8
242231,151403.0,-0.060003,0.590023,1.411522,-0.241472,-0.126207,-0.340733,0.310149,-0.030642,0.336482,...,-0.026078,0.344489,1.228925,-0.300912,0.053249,0.055476,-0.067491,0.098941,0.038168,7.48


In [19]:
print(y.head(), y.tail())

253286    0
206289    0
161179    0
221549    0
242231    0
Name: Class, dtype: int64 279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, dtype: int64


splitting the Training and testing the Dataset using train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=3)

In [21]:
print(x.shape, x_train.shape, x_test.shape)

(984, 30) (787, 30) (197, 30)


Machine training using Logistic Regression as the problem is binary classification problem.(legit or fraud).

In [22]:
import warnings
warnings.filterwarnings('ignore')
log.fit(x_train, y_train)

Evaluate the model using accuracy_score

In [23]:
x_train_prediction = log.predict(x_train)
train_data_accuracy = accuracy_score(x_train_prediction, y_train)
print('Accuracy of training data: ',train_data_accuracy)

Accuracy of training data:  0.9466327827191868


In [24]:
x_test_prediction = log.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print('Accuracy of testing data: ',test_data_accuracy)

Accuracy of testing data:  0.9187817258883249


From the above accuracy score, it's clear that our model has predicted 94% of fraud transactions correctly

Finally, we created a model which has predicted the fraud tranctions correctly.