In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
cddata = pd.read_csv('creditcard.csv')

In [3]:
cddata.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
cddata.info

<bound method DataFrame.info of             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  

In [6]:
cddata['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [7]:
#SEPERATING THE DATA
normal = cddata[cddata.Class==0]
fraud = cddata[cddata.Class==1]


In [9]:
normal.shape

(284315, 31)

In [10]:
fraud.shape

(492, 31)

In [11]:
normal_sample = normal.sample(n=492)

In [12]:
#CONCATENATING THE TWO DATASET
new_dataset = pd.concat([normal_sample,fraud],axis=0)

In [13]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
154907,103568.0,-0.377927,1.056859,0.219114,-0.468625,0.482012,-1.015616,0.98355,-0.222748,1.620545,...,-0.369735,-0.565833,0.100866,-0.199926,-0.819197,0.132815,0.47318,0.317889,15.52,0
68232,52891.0,-0.446802,0.431156,2.356112,1.567798,0.055479,1.956103,-0.489871,0.732155,0.05118,...,0.218038,1.026319,0.124798,-0.602211,-0.923676,-0.143042,0.321466,0.173026,9.99,0
5017,4610.0,-0.927913,1.629688,1.983769,1.111621,0.032919,-0.763236,0.739622,-0.521593,0.764524,...,-0.455862,-0.647964,-0.016118,0.657201,-0.017083,0.289868,-0.007996,-0.029817,1.29,0
219003,141555.0,1.715824,-0.430658,-0.263414,1.527287,-0.656434,0.163019,-0.874996,0.198343,1.313725,...,0.242107,0.649195,0.115648,0.616687,-0.315266,-0.632198,0.087854,0.035623,108.45,0
123262,76865.0,-0.398572,0.938544,0.297187,-0.026324,-0.105491,-0.624443,0.50813,0.386539,-0.827379,...,0.127869,0.109273,0.013694,0.001434,-0.204477,0.265807,-0.145451,-0.047377,42.81,0


In [15]:
x = new_dataset.drop(columns='Class',axis=1)
y = new_dataset['Class']

In [16]:
#SPLITTING THE DATASET

x_train,y_train,x_test,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=42)

In [17]:
x_train,y_train,x_test,y_test

(            Time         V1         V2         V3        V4         V5  \
 13014    22857.0  -1.060737  -0.533204   2.413634 -1.693315   0.715752   
 167374  118642.0  -0.477870   1.317511  -1.686981 -0.794215   1.838832   
 56703    47545.0   1.176716   0.557091  -0.490800  0.756424   0.249192   
 150684   93888.0 -10.040631   6.139183 -12.972972  7.740555  -8.684705   
 6882      8808.0  -4.617217   1.695694  -3.114372  4.328199  -1.873257   
 ...          ...        ...        ...        ...       ...        ...   
 39183    39729.0  -0.964567  -1.643541  -0.187727  1.158253  -2.458336   
 240291  150534.0  -2.065219   1.552305   1.309643 -0.463272  -0.569057   
 143335   85285.0  -6.713407   3.921104  -9.746678  5.148263  -5.151563   
 1570      1228.0  -0.198219   0.469060   0.084932 -0.980948   2.476429   
 17317    28625.0 -27.848181  15.598193 -28.923756  6.418442 -20.346228   
 
               V6         V7         V8        V9  ...       V20       V21  \
 13014   0.905675  -

In [29]:
#MODEL TRAINING
from sklearn.model_selection import train_test_split

# X = features DataFrame, y = label Series or DataFrame with one column
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Check the shapes now:
print(x_train.shape)  # should match y_train.shape[0]
print(y_train.shape)

model = LogisticRegression()
model.fit(x_train, y_train)  # This should have run successfully



(738, 30)
(738,)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [30]:
y_predict = model.predict(x_test)

In [31]:
accuracy = accuracy_score(y_predict,y_test)

In [32]:
accuracy

0.9186991869918699