In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv("creditcard.csv")
X_master = df.drop("Class",axis = 1)
X1 = np.array(X_master)
Y_mas = df["Class"]
df.head()


(1, 284807, 30)

In [30]:
df["Class"].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [31]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

## Data set is highly unbalanced and therefore a good amount of processing is needed

In [32]:
fraud= df[df.Class == 1]
normal = df[df.Class == 0]
print(normal.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [33]:
df.groupby("Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


## Clearly there is a huge difference in mean values of the normal and the fraudelent data sets

In [34]:
normal_new = normal.sample(n = 492,random_state = 3)## takes random 492 cases from the normal sample case which we will use to balance
normal_new.shape

(492, 31)

# Undersampling 
## We take 492 cases from the normal case and now we will make a new balanced dataset with equal number of fraud and normal transactions to train the model on it 


In [35]:
df_new = pd.concat([normal_new,fraud],axis = 0)
df_new.head()
df_new.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [36]:
df_new.groupby("Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95498.711382,0.018051,0.043796,0.006189,-0.095401,-0.010223,0.021163,-0.002422,-0.05986,-0.016681,...,0.032159,7e-05,-0.000288,-0.057349,-0.033842,-0.00777,0.00526,0.015932,-0.002425,95.142337
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [37]:
## varitation is still the same we can now start training on this new dataset 
X = df_new.drop("Class",axis = 1)
Y = df_new["Class"]

In [38]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,stratify = Y,test_size = 0.2,random_state = 2)

In [39]:
model = LogisticRegression()
model.fit(X_train,Y_train)
pred1 = model.predict(X_train)

## Accuracy on the training example 

In [40]:
print(accuracy_score(pred1,Y_train))

0.9390088945362135


## Accuracy on the test examples

In [41]:
pred2 = model.predict(X_test)
print(accuracy_score(pred2,Y_test))

0.9238578680203046


## Running the model on the entire data set 

In [42]:
pred3 = model.predict(X_master)
pred3.shape

(284807,)

In [44]:
print(accuracy_score(pred3,Y_mas))

0.9608296144406563
