In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the credit-card transactions CSV into a pandas DataFrame.
# NOTE(review): absolute path — presumably the Colab upload location; confirm,
# or point it at a configurable data directory before running elsewhere.
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [None]:
# Column dtypes and non-null counts: a quick check for incomplete rows.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128821 entries, 0 to 128820
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    128821 non-null  int64  
 1   V1      128821 non-null  float64
 2   V2      128821 non-null  float64
 3   V3      128821 non-null  float64
 4   V4      128821 non-null  float64
 5   V5      128821 non-null  float64
 6   V6      128821 non-null  float64
 7   V7      128821 non-null  float64
 8   V8      128821 non-null  float64
 9   V9      128821 non-null  float64
 10  V10     128821 non-null  float64
 11  V11     128821 non-null  float64
 12  V12     128821 non-null  float64
 13  V13     128821 non-null  float64
 14  V14     128821 non-null  float64
 15  V15     128821 non-null  float64
 16  V16     128820 non-null  float64
 17  V17     128820 non-null  float64
 18  V18     128820 non-null  float64
 19  V19     128820 non-null  float64
 20  V20     128820 non-null  float64
 21  V21     12

In [None]:
# Number of missing values in each column (isna is the pandas alias of isnull).
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [None]:
# Class distribution: how many rows carry each label value.
credit_card_data['Class'].value_counts()

0.0    128559
1.0       261
Name: Class, dtype: int64

In [None]:
# Separate the transactions by label. A row whose Class is missing compares
# False against both 0 and 1, so it ends up in neither subset.
legit = credit_card_data.loc[credit_card_data['Class'].eq(0)]
fraud = credit_card_data.loc[credit_card_data['Class'].eq(1)]

In [None]:
# (rows, columns) of each subset, one per line.
print(legit.shape, fraud.shape, sep='\n')

(128559, 31)
(261, 31)


In [None]:
# Summary statistics of the transaction amount for the Class == 0 subset.
legit['Amount'].describe()

count    128559.000000
mean         92.909269
std         251.540738
min           0.000000
25%           6.500000
50%          24.950000
75%          83.190000
max       19656.530000
Name: Amount, dtype: float64

In [None]:
# Summary statistics of the transaction amount for the Class == 1 subset.
fraud['Amount'].describe()

count     261.000000
mean      116.679693
std       246.300626
min         0.000000
25%         1.000000
50%        11.380000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [None]:
# Per-class mean of every column, to compare the two labels side by side.
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,49351.046259,-0.238744,-0.007156,0.691484,0.13934,-0.276329,0.085922,-0.104433,0.059177,-0.08367,...,0.042065,-0.039697,-0.115701,-0.034179,0.012188,0.130042,0.025932,-0.000691,0.002019,92.909269
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


In [None]:
# Undersample the majority class to the size of the fraud subset so the two
# classes are balanced. The size is derived from `fraud` instead of being
# hard-coded, and the draw is seeded so Restart & Run All reproduces it.
legit_sample = legit.sample(n=len(fraud), random_state=42)

In [None]:
# Stack the sampled legit rows on top of the fraud rows (row-wise is the
# default concatenation axis), keeping the original index labels.
new_dataset = pd.concat([legit_sample, fraud])

In [None]:
# Confirm the combined frame holds the same number of rows for each label.
new_dataset['Class'].value_counts()

0.0    261
1.0    261
Name: Class, dtype: int64

In [None]:
# Per-class column means on the balanced frame, for comparison with the
# same table computed on the full dataset.
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,47312.191571,-0.018547,-0.113878,0.674954,0.203367,-0.142208,0.188021,-0.043666,0.105041,-0.089827,...,0.083224,-0.016099,-0.141551,-0.010878,0.077004,0.175898,0.016876,0.021603,0.019003,99.546284
1.0,41975.996169,-5.665868,3.97347,-7.225782,4.526519,-4.014566,-1.501321,-5.982089,1.526847,-2.616088,...,0.238243,1.27186,-0.320923,-0.117196,-0.105943,0.199014,0.054738,0.492548,0.081347,116.679693


In [None]:
# Target is the label column; features are every remaining column.
Y = new_dataset['Class']
X = new_dataset.drop(columns=['Class'])

In [None]:
# Plain-text dump of the feature frame; pandas elides the middle rows and
# columns with '...' when the frame is too large to show in full.
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
68670   53090  1.191449 -0.042985  0.839853  0.945714 -0.272046  0.808621   
125157  77553  1.219013 -0.887890  0.697641 -0.708644 -1.352628 -0.279978   
21418   31651  1.121702  0.373426  0.281198  1.264172 -0.091162 -0.772645   
14328   25410 -1.902767 -0.677378  3.056575 -1.447365 -0.577530  0.688135   
7266     9659  1.173540  0.372273  0.440919  0.729904 -0.045515 -0.227947   
...       ...       ...       ...       ...       ...       ...       ...   
124087  77171  1.118560  1.291858 -1.298805  2.135772  0.772204 -1.147291   
124115  77182 -1.410852  2.268271 -2.297554  1.871331  0.248957 -1.208799   
124176  77202 -0.356326  1.435305 -0.813564  1.993117  2.055878 -0.543579   
125342  77627 -7.139060  2.773082 -6.757845  4.446456 -5.464428 -1.713401   
128479  78725 -4.312479  1.886476 -2.338634 -0.475243 -1.185444 -2.112079   

              V7        V8        V9  ...       V20       V21       V22  \


In [None]:
# Plain-text dump of the target series (index label -> class value).
print(Y)

68670     0.0
125157    0.0
21418     0.0
14328     0.0
7266      0.0
         ... 
124087    1.0
124115    1.0
124176    1.0
125342    1.0
128479    1.0
Name: Class, Length: 753, dtype: float64


In [None]:
# Hold out 20% of the balanced data for testing.
# stratify=Y keeps the class ratio the same in both partitions, and the fixed
# random_state makes the split (and every score derived from it) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

In [None]:
# Shapes of the full feature frame and of its train / test partitions.
print(*(frame.shape for frame in (X, X_train, X_test)))

(522, 30) (417, 30) (105, 30)


In [None]:
# Logistic regression on standardised features.
# The raw columns live on very different scales (Time and Amount versus the
# V* components), which kept the default solver from converging within its
# iteration limit. Scaling inside a pipeline fixes that and guarantees the
# scaler is fitted on training data only; max_iter is raised as extra headroom.
# The pipeline exposes the same fit / predict interface as the bare estimator.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Fit the classifier on the training partition only.
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Score the model on the data it was fitted on (an optimistic estimate).
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training rows that were classified correctly.
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9376498800959233


In [None]:
# Score the model on the held-out partition it has not seen during fitting.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out rows that were classified correctly.
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9714285714285714
