Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#loading the dataset to a Pandas dataframe
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [3]:
#first five rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
105062,69348,1.25569,-1.39354,0.692738,-1.28943,-1.682706,-0.27684,-1.136732,-0.076399,-1.738348,...,0.039513,0.368717,-0.072065,0.106986,0.257991,-0.087901,0.054217,0.043817,116.0,0.0
105063,69349,-1.250199,0.123183,1.051709,-0.236427,1.611834,0.284241,0.28819,0.30399,-0.430526,...,0.034681,-0.039897,-0.04946,-1.399614,0.217418,-0.496523,-0.042176,0.161224,3.99,0.0
105064,69349,-1.925153,-0.140133,2.456659,0.148112,-0.284164,-0.15052,-1.129736,1.003991,-0.218409,...,0.447094,0.895815,-0.233737,0.26249,0.050375,0.51514,0.207773,-0.090777,4.69,0.0
105065,69349,1.277856,-0.027651,0.249167,0.697814,-0.232188,-0.118324,-0.145313,-0.048568,0.677505,...,-0.14195,-0.164073,-0.194695,-0.381716,0.69633,0.489582,-0.017759,0.002991,10.0,0.0
105066,69350,-1.176438,0.811442,0.539281,-1.505794,0.401363,0.202677,0.284898,0.547988,-0.438512,...,0.077511,0.008094,-0.020608,,,,,,,


In [6]:
#dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105067 entries, 0 to 105066
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    105067 non-null  int64  
 1   V1      105067 non-null  float64
 2   V2      105067 non-null  float64
 3   V3      105067 non-null  float64
 4   V4      105067 non-null  float64
 5   V5      105067 non-null  float64
 6   V6      105067 non-null  float64
 7   V7      105067 non-null  float64
 8   V8      105067 non-null  float64
 9   V9      105067 non-null  float64
 10  V10     105067 non-null  float64
 11  V11     105067 non-null  float64
 12  V12     105067 non-null  float64
 13  V13     105067 non-null  float64
 14  V14     105067 non-null  float64
 15  V15     105067 non-null  float64
 16  V16     105067 non-null  float64
 17  V17     105067 non-null  float64
 18  V18     105067 non-null  float64
 19  V19     105067 non-null  float64
 20  V20     105067 non-null  float64
 21  V21     10

In [7]:
#checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [8]:
#distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    104834
1.0       232
Name: Class, dtype: int64

This dataset is highly unbalanced

0--->Norma transaction
1--->Fraudulent transaction

In [9]:
#Seperating the data for analysis
legit = credit_card_data[credit_card_data.Class==0]
fraud = credit_card_data[credit_card_data.Class==1]


In [10]:
print(legit.shape)
print(fraud.shape)

(104834, 31)
(232, 31)


In [11]:
#Statistical measures of the data
legit.Amount.describe()

count    104834.000000
mean         96.584297
std         262.333583
min           0.000000
25%           7.190000
50%          25.540000
75%          87.440000
max       19656.530000
Name: Amount, dtype: float64

In [12]:
fraud.Amount.describe()

count     232.000000
mean      115.520474
std       253.721393
min         0.000000
25%         1.000000
50%         7.595000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [13]:
#compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,43692.851508,-0.245296,-0.039251,0.702486,0.146715,-0.274535,0.102607,-0.102664,0.053767,-0.044415,...,0.04386,-0.032987,-0.10494,-0.037011,0.010118,0.13341,0.025912,0.000488,0.001663,96.584297
1.0,37898.655172,-6.138432,4.280104,-7.850475,4.80003,-4.349687,-1.566011,-6.505922,1.640373,-2.78888,...,0.237631,1.386881,-0.351377,-0.11421,-0.118153,0.208771,0.094034,0.536235,0.061592,115.520474


Under-Sampling

Build a sample dataset containing similar distribution of normal transaction and fraudulent transaction

Number of fraudulent transaction = 232

In [14]:
legit_sample = legit.sample(n=232)

Concatenating two dataframes

In [15]:
new_dataset = pd.concat([legit_sample, fraud],axis=0)

In [16]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
95232,65218,-0.99592,1.090484,1.117586,-1.292324,-0.258208,-0.927743,0.649965,-0.082932,0.390745,...,-0.006511,-0.04074,0.030545,0.42054,-0.329448,0.743132,-0.104956,0.148008,13.75,0.0
104851,69255,0.963862,-0.301404,0.477106,0.50957,-0.609917,-0.327312,-0.103048,0.138685,0.01765,...,-0.216616,-0.955966,0.175786,0.170025,-0.11635,0.096555,-0.052473,0.022328,103.39,0.0
33373,37207,1.317092,0.695169,-1.029208,0.820506,1.066967,0.091746,0.234543,-0.039843,-0.37123,...,-0.119113,-0.252637,-0.360711,-1.507392,0.922762,-0.240744,0.035168,0.031767,1.0,0.0
52868,45691,1.274717,-0.3875,0.653531,-0.26326,-0.515406,0.608076,-0.815263,0.263835,0.892932,...,-0.152474,-0.255513,-0.110327,-0.874637,0.263544,1.052765,-0.037516,-0.007284,4.0,0.0
58250,48303,1.140932,-1.342715,0.11147,-1.625362,-0.597174,1.190342,-1.003652,0.431272,-2.327005,...,-0.427502,-1.007389,0.336605,-1.113284,-0.293589,-0.538163,0.068914,0.013481,95.0,0.0


In [17]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
102443,68207,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,...,27.202839,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1.0,1.0
102444,68207,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,...,27.202839,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1.0,1.0
102445,68207,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,...,27.202839,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1.0,1.0
102446,68207,-13.192671,12.785971,-9.90665,3.320337,-4.801176,5.760059,-18.750889,-37.353443,-0.39154,...,27.202839,-8.887017,5.303607,-0.639435,0.263203,-0.108877,1.269566,0.939407,1.0,1.0
102782,68357,1.232604,-0.548931,1.087873,0.894082,-1.433055,-0.356797,-0.717492,0.003167,-0.100397,...,-0.448671,-0.517568,0.012833,0.699217,0.527258,-0.322607,0.080805,0.035427,19.59,1.0


In [18]:
new_dataset['Class'].value_counts()

0.0    232
1.0    232
Name: Class, dtype: int64

In [19]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,39741.810345,-0.307779,0.006088,0.812132,0.244447,-0.333097,0.009068,-0.106309,0.084954,0.156843,...,0.097998,-0.060417,-0.058651,-0.040223,-0.02066,0.151331,0.037973,0.015656,-0.028446,93.463707
1.0,37898.655172,-6.138432,4.280104,-7.850475,4.80003,-4.349687,-1.566011,-6.505922,1.640373,-2.78888,...,0.237631,1.386881,-0.351377,-0.11421,-0.118153,0.208771,0.094034,0.536235,0.061592,115.520474


Splittig the data into features and targets

In [20]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [21]:
print(X)

         Time         V1         V2        V3        V4        V5        V6  \
95232   65218  -0.995920   1.090484  1.117586 -1.292324 -0.258208 -0.927743   
104851  69255   0.963862  -0.301404  0.477106  0.509570 -0.609917 -0.327312   
33373   37207   1.317092   0.695169 -1.029208  0.820506  1.066967  0.091746   
52868   45691   1.274717  -0.387500  0.653531 -0.263260 -0.515406  0.608076   
58250   48303   1.140932  -1.342715  0.111470 -1.625362 -0.597174  1.190342   
...       ...        ...        ...       ...       ...       ...       ...   
102443  68207 -13.192671  12.785971 -9.906650  3.320337 -4.801176  5.760059   
102444  68207 -13.192671  12.785971 -9.906650  3.320337 -4.801176  5.760059   
102445  68207 -13.192671  12.785971 -9.906650  3.320337 -4.801176  5.760059   
102446  68207 -13.192671  12.785971 -9.906650  3.320337 -4.801176  5.760059   
102782  68357   1.232604  -0.548931  1.087873  0.894082 -1.433055 -0.356797   

               V7         V8        V9  ...       V

In [22]:
print(Y)

95232     0.0
104851    0.0
33373     0.0
52868     0.0
58250     0.0
         ... 
102443    1.0
102444    1.0
102445    1.0
102446    1.0
102782    1.0
Name: Class, Length: 464, dtype: float64


Split the data into training data and testing data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state = 2)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(464, 30) (371, 30) (93, 30)


Model Training

Logistic Regression

In [25]:
model = LogisticRegression()

In [26]:
#training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Model Evaluation

Accuracy Score

In [29]:
#acuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [30]:
print('Accuracy on Training data : ',training_data_accuracy)

Accuracy on Training data :  0.9460916442048517


In [31]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [32]:
print('Accuracy score on Test Data:', test_data_accuracy)

Accuracy score on Test Data: 0.9247311827956989
