In [197]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [198]:
#loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('creditcard.csv')

In [199]:
#first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [200]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
99949,67808,-1.348395,-5.053235,-1.843068,0.393821,-1.792322,0.285798,1.595,-0.495461,-1.247178,...,0.421061,-1.448189,-1.395326,-0.387639,0.009909,0.97293,-0.341508,0.230113,1442.0,0
99950,67808,-0.315079,1.274081,0.651319,4.310447,0.94277,0.277408,0.968199,-0.310208,-2.076947,...,0.110451,0.650782,-0.035583,0.030488,-1.164376,0.149297,0.131767,0.111846,67.77,0
99951,67808,0.647934,-0.885499,-0.807046,0.265026,-0.210056,-0.977249,0.995989,-0.531029,-0.339985,...,0.155967,-0.299302,-0.486101,0.0197,0.522852,1.058319,-0.175774,0.052481,375.0,0
99952,67808,0.276525,-0.988879,-0.60528,1.535243,-0.220934,0.088207,0.53276,0.041954,-0.252952,...,0.183679,-0.499183,-0.435057,-0.418904,0.258726,-0.462133,-0.03891,0.115576,457.95,0
99953,67808,0.95582,-0.194154,0.27172,1.490526,-0.330564,-0.1689,0.157537,0.000831,0.400325,...,-0.045628,-0.177956,-0.14776,0.065619,0.595922,-0.321031,0.014353,0.032947,116.11,0


In [201]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99954 entries, 0 to 99953
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    99954 non-null  int64  
 1   V1      99954 non-null  float64
 2   V2      99954 non-null  float64
 3   V3      99954 non-null  float64
 4   V4      99954 non-null  float64
 5   V5      99954 non-null  float64
 6   V6      99954 non-null  float64
 7   V7      99954 non-null  float64
 8   V8      99954 non-null  float64
 9   V9      99954 non-null  float64
 10  V10     99954 non-null  float64
 11  V11     99954 non-null  float64
 12  V12     99954 non-null  float64
 13  V13     99954 non-null  float64
 14  V14     99954 non-null  float64
 15  V15     99954 non-null  float64
 16  V16     99954 non-null  float64
 17  V17     99954 non-null  float64
 18  V18     99954 non-null  float64
 19  V19     99954 non-null  float64
 20  V20     99954 non-null  float64
 21  V21     99954 non-null  float64
 22

In [202]:
#checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [203]:
#distribution of legit transactions and fraudulent transactions
credit_card_data['Class'].value_counts()

0    99730
1      224
Name: Class, dtype: int64

In [204]:
#seperating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [205]:
print(legit.shape)
print(fraud.shape)

(99730, 31)
(224, 31)


In [206]:
#statistical measures of the data
legit.Amount.describe()

count    99730.000000
mean        97.735036
std        264.070158
min          0.000000
25%          7.530000
50%         26.200000
75%         88.950000
max      19656.530000
Name: Amount, dtype: float64

In [207]:
fraud.Amount.describe()

count     224.000000
mean      118.103393
std       257.090668
min         0.000000
25%         1.000000
50%         8.565000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [208]:
#compare the values of both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,43366.600993,-0.24494,-0.048238,0.69971,0.148675,-0.275068,0.103437,-0.10273,0.054018,-0.040716,...,0.043889,-0.033542,-0.104388,-0.037282,0.009013,0.133668,0.02583,-0.000663,0.001502,97.735036
1,36817.107143,-6.002024,4.098758,-7.871751,4.878291,-4.369771,-1.774953,-6.235919,2.700881,-2.874648,...,0.34879,0.712737,-0.120749,-0.258527,-0.106577,0.20683,0.103959,0.520878,0.038175,118.103393


In [209]:
legit_sample = legit.sample(n=224)

In [210]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [211]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
51431,45673,-2.980326,0.647877,1.17368,2.605468,-0.324152,0.538965,-1.201967,0.219244,-0.588384,...,0.647399,0.212095,0.7251,0.024128,-0.364549,-0.017104,0.096971,-0.090163,9.29,0
38341,39947,1.091012,-0.179823,0.71829,0.979339,-0.114017,1.231551,-0.674873,0.385185,0.439275,...,0.155573,0.572252,-0.255425,-1.102321,0.565356,-0.164061,0.075723,0.018359,43.01,0
54242,47051,0.973451,-0.290197,-0.234774,0.851324,0.043632,0.08376,0.177636,0.047729,-0.022076,...,-0.123767,-0.795378,-0.225653,-0.888738,0.486317,-0.520883,-0.023585,0.029141,159.64,0
57994,48823,-3.946977,-0.914742,-0.126195,0.600597,0.542146,-1.322659,-0.385989,0.681339,-0.07282,...,-0.765197,-0.417344,1.879092,-0.060562,0.342173,0.121957,-0.25203,-0.883003,16.99,0
86815,62048,-0.534402,1.040452,2.794049,2.476232,-0.596967,0.66211,-0.080138,0.246809,-0.873613,...,0.426032,1.281417,-0.033007,0.432257,-0.711422,0.167431,0.097991,0.17085,34.13,0


In [212]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
94930,65728,1.227614,-0.668974,-0.271785,-0.58944,-0.604795,-0.350285,-0.486365,-0.010809,-0.794944,...,-0.026055,-0.295255,-0.180459,-0.436539,0.494649,-0.283738,-0.001128,0.035075,98.01,1
95378,65936,-3.593476,0.781442,-1.822448,0.605761,-1.194656,-0.517195,-1.722523,0.12889,0.014963,...,0.351792,0.391249,-0.252875,-0.498042,0.010172,0.909929,-1.478767,0.722673,101.5,1
95583,66037,0.286302,1.399345,-1.682503,3.864377,-1.185373,-0.341732,-2.53938,0.768378,-1.547882,...,0.352456,-0.243678,-0.194079,-0.172201,0.742237,0.12779,0.569731,0.291206,7.53,1
98095,67150,-1.824295,0.403327,-1.994122,2.756558,-3.139064,0.408185,-1.209045,1.095634,-1.447225,...,0.83876,0.341727,0.947506,-0.145493,0.049326,0.831065,0.332421,0.252713,489.71,1
99212,67571,-0.758469,-0.04541,-0.168438,-1.313275,-1.901763,0.739433,3.071892,-0.483422,0.618203,...,0.042619,0.397224,0.072229,-0.242276,0.560916,-0.540955,0.150606,-0.11714,549.06,1


In [213]:
new_dataset['Class'].value_counts()

0    224
1    224
Name: Class, dtype: int64

In [214]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,43742.290179,0.003339,-0.165031,0.671775,0.21973,-0.537773,0.050814,-0.18977,0.118545,-0.119756,...,0.047273,0.010826,-0.173285,-0.037962,0.026777,0.124728,0.001231,-0.013926,0.000816,119.152411
1,36817.107143,-6.002024,4.098758,-7.871751,4.878291,-4.369771,-1.774953,-6.235919,2.700881,-2.874648,...,0.34879,0.712737,-0.120749,-0.258527,-0.106577,0.20683,0.103959,0.520878,0.038175,118.103393


In [215]:
X = new_dataset.drop(columns='Class',axis=1)
Y = new_dataset['Class']

In [216]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
51431  45673 -2.980326  0.647877  1.173680  2.605468 -0.324152  0.538965   
38341  39947  1.091012 -0.179823  0.718290  0.979339 -0.114017  1.231551   
54242  47051  0.973451 -0.290197 -0.234774  0.851324  0.043632  0.083760   
57994  48823 -3.946977 -0.914742 -0.126195  0.600597  0.542146 -1.322659   
86815  62048 -0.534402  1.040452  2.794049  2.476232 -0.596967  0.662110   
...      ...       ...       ...       ...       ...       ...       ...   
94930  65728  1.227614 -0.668974 -0.271785 -0.589440 -0.604795 -0.350285   
95378  65936 -3.593476  0.781442 -1.822448  0.605761 -1.194656 -0.517195   
95583  66037  0.286302  1.399345 -1.682503  3.864377 -1.185373 -0.341732   
98095  67150 -1.824295  0.403327 -1.994122  2.756558 -3.139064  0.408185   
99212  67571 -0.758469 -0.045410 -0.168438 -1.313275 -1.901763  0.739433   

             V7        V8        V9  ...       V20       V21       V22  \
51431 -1.2019

In [217]:
print(Y)

51431    0
38341    0
54242    0
57994    0
86815    0
        ..
94930    1
95378    1
95583    1
98095    1
99212    1
Name: Class, Length: 448, dtype: int64


In [218]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [219]:
print(X.shape,X_train.shape,X_test.shape)

(448, 30) (358, 30) (90, 30)


In [220]:
model=LogisticRegression()

In [221]:
#training the logistic regression model with training data
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [222]:
#accuracy on training data 
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [223]:
print('Accuracy on Training data : ',training_data_accuracy)

Accuracy on Training data :  0.9692737430167597


In [224]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [225]:
print('Accuracy score on Test Data : ',test_data_accuracy)

Accuracy score on Test Data :  0.9


In [226]:
import pickle
m=open('model.pkl','wb')
pickle.dump(model,m)
m.close()