In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [58]:
credit_card_data = pd.read_csv('creditcard.csv')

In [59]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23858 entries, 0 to 23857
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    23858 non-null  int64  
 1   V1      23858 non-null  float64
 2   V2      23858 non-null  float64
 3   V3      23858 non-null  float64
 4   V4      23858 non-null  float64
 5   V5      23858 non-null  float64
 6   V6      23858 non-null  float64
 7   V7      23858 non-null  float64
 8   V8      23858 non-null  float64
 9   V9      23858 non-null  float64
 10  V10     23858 non-null  float64
 11  V11     23858 non-null  float64
 12  V12     23858 non-null  float64
 13  V13     23858 non-null  float64
 14  V14     23858 non-null  float64
 15  V15     23858 non-null  float64
 16  V16     23858 non-null  float64
 17  V17     23858 non-null  float64
 18  V18     23858 non-null  float64
 19  V19     23858 non-null  float64
 20  V20     23858 non-null  float64
 21  V21     23858 non-null  float64
 22

In [60]:
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [61]:
credit_card_data['Class'].value_counts()

Class
0.0    23769
1.0       88
Name: count, dtype: int64

In [62]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [63]:
print(legit.shape)
print(fraud.shape)

(23769, 31)
(88, 31)


In [64]:
legit.Amount.describe()

count    23769.000000
mean        73.880199
std        212.541174
min          0.000000
25%          6.000000
50%         18.110000
75%         65.850000
max       7879.420000
Name: Amount, dtype: float64

In [65]:
fraud.Amount.describe()

count      88.000000
mean      100.010000
std       265.845031
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [66]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,18213.77782,-0.208135,0.175983,0.774952,0.226463,-0.166866,0.092304,-0.101868,0.006906,0.512306,...,0.038311,-0.043337,-0.136382,-0.037279,0.014613,0.127111,0.02694,0.010111,0.004578,73.880199
1.0,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


In [67]:
legit_sample = legit.sample(n=88)
legit_sample.shape

(88, 31)

In [68]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)
new_dataset.shape

(176, 31)

In [69]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
10869,18612,-0.658079,1.178308,1.834773,-0.03469,0.261055,-0.116451,0.720496,-0.127723,0.849355,...,-0.362759,-0.45704,0.02005,0.157679,-0.294889,0.00719,0.205978,-0.063897,14.99,0.0
21992,31976,-0.382265,0.916794,1.404939,0.422245,0.30039,-0.461869,0.507835,0.034272,-0.603885,...,-0.14329,-0.426878,0.048064,0.030835,-0.621642,0.152824,0.123653,0.15068,1.98,0.0
9994,14997,1.057155,0.495355,1.212903,2.548761,-0.105953,0.562292,-0.414434,0.135318,0.412228,...,0.089113,0.515654,-0.05305,-0.014731,0.340114,0.066419,0.003253,0.014148,17.44,0.0
14087,25064,0.079127,1.160375,1.242255,1.15403,0.194849,-0.744758,0.647717,-0.284964,0.653472,...,-0.015373,0.302385,-0.125899,0.355553,-0.16148,-0.371128,0.062175,0.03824,1.5,0.0
200,132,-1.571359,1.687508,0.73467,1.29335,-0.217532,-0.002677,0.147364,0.515362,-0.372442,...,0.048549,0.377256,-0.030436,0.117608,-0.06052,-0.29655,-0.48157,-0.167897,10.0,0.0


In [70]:
new_dataset['Class'].value_counts()

Class
0.0    88
1.0    88
Name: count, dtype: int64

In [71]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,19954.477273,-0.765528,0.169858,0.409689,0.168883,-0.155355,0.162685,-0.112875,0.353983,0.390604,...,0.002894,-0.09056,-0.17366,-0.116929,0.07342,0.034289,0.041501,-0.101879,0.035479,91.394318
1.0,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


In [72]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [73]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
10869  18612 -0.658079  1.178308  1.834773 -0.034690  0.261055 -0.116451   
21992  31976 -0.382265  0.916794  1.404939  0.422245  0.300390 -0.461869   
9994   14997  1.057155  0.495355  1.212903  2.548761 -0.105953  0.562292   
14087  25064  0.079127  1.160375  1.242255  1.154030  0.194849 -0.744758   
200      132 -1.571359  1.687508  0.734670  1.293350 -0.217532 -0.002677   
...      ...       ...       ...       ...       ...       ...       ...   
18773  29753  0.269614  3.549755 -5.810353  5.809370  1.538808 -2.269219   
18809  29785  0.923764  0.344048 -2.880004  1.721680 -3.019565 -0.639736   
20198  30852 -2.830984  0.885657  1.199930  2.861292  0.321669  0.289966   
23308  32686  0.287953  1.728735 -1.652173  3.813544 -1.090927 -0.984745   
23422  32745 -2.179135  0.020218 -2.182733  2.572046 -3.663733  0.081568   

             V7        V8        V9  ...       V20       V21       V22  \
10869  0.7204

In [74]:
print(Y)

10869    0.0
21992    0.0
9994     0.0
14087    0.0
200      0.0
        ... 
18773    1.0
18809    1.0
20198    1.0
23308    1.0
23422    1.0
Name: Class, Length: 176, dtype: float64


In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [76]:
print(X.shape, X_train.shape, X_test.shape)

(176, 30) (140, 30) (36, 30)


In [77]:
model = LogisticRegression()

In [78]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [80]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9571428571428572


In [81]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [82]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9444444444444444
