In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [15]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [20]:
# Handling the Imbalance Dataset

# Segregating the data into 2 based on type of class
Real_data = data[data.Class == 0]
Fraud_data = data[data.Class == 1]
print(Real_data.shape)
print(Fraud_data.shape)

(284315, 31)
(492, 31)


In [24]:
# There are only 492 data of fraud class available which probably 1 % of the whole dataset
# training with this dataset will not produce a high accuarcy.

# To handle that taking out the same amount of data from real class and concatinating it with fraud class will create
# a new dataset that has a pretty good distribution of both the classes 
new_Real_data = Real_data.sample(492)
new_Real_data.shape

(492, 31)

In [34]:
# Now lets concatinate the above data with fraud data sample
new_data = pd.concat([new_Real_data,Fraud_data])
print(data.shape)      ## Original dataset shape
print(new_Real_data.shape) ## new real dataset shape (which was segreggated from non-fraud class of data)
print(Fraud_data.shape)  ## Fraud dataset shape
print(new_data.shape)  #new dataset shape (created afer concatinating)

(284807, 31)
(492, 31)
(492, 31)
(984, 31)


In [35]:
new_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
190603,128899.0,-0.701062,1.46049,-1.086797,1.071629,1.244184,-0.488955,0.927029,0.404114,-1.078087,...,0.148652,0.426613,-0.177328,0.57622,0.024571,-0.426464,0.317182,0.175746,31.94,0
180580,124604.0,-1.589127,0.672125,0.996288,4.50727,0.262738,1.347637,-0.011286,0.227011,-1.146492,...,0.061761,0.738202,-0.997079,-0.894147,-0.435737,0.535663,0.1961,-0.495679,137.14,0
1271,978.0,1.295252,0.232756,-0.173885,0.308449,0.206534,-0.092895,-0.078438,0.049317,-0.063933,...,-0.319027,-0.977567,-0.020884,-0.893034,0.323284,0.151231,-0.032013,0.007618,1.78,0
180987,124776.0,-1.926359,-1.045605,0.583308,-1.450857,2.318088,1.333151,0.530273,0.667923,0.039086,...,0.274507,0.430373,-0.104047,-0.901483,1.126428,0.867434,-0.135768,0.022554,148.16,0
174683,122022.0,1.605564,-0.422538,-2.349825,0.975706,1.295817,0.679935,0.677488,0.050085,-0.32959,...,0.313334,0.672855,-0.169364,-1.634733,0.233852,-0.389496,-0.022953,-0.061101,180.0,0


In [39]:
new_data['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [46]:
# Above Distribution is pretty good 
# lets seperate the features and classes

x = new_data.drop(['Class'],axis = 1)
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
190603,128899.0,-0.701062,1.460490,-1.086797,1.071629,1.244184,-0.488955,0.927029,0.404114,-1.078087,...,0.109485,0.148652,0.426613,-0.177328,0.576220,0.024571,-0.426464,0.317182,0.175746,31.94
180580,124604.0,-1.589127,0.672125,0.996288,4.507270,0.262738,1.347637,-0.011286,0.227011,-1.146492,...,-0.129027,0.061761,0.738202,-0.997079,-0.894147,-0.435737,0.535663,0.196100,-0.495679,137.14
1271,978.0,1.295252,0.232756,-0.173885,0.308449,0.206534,-0.092895,-0.078438,0.049317,-0.063933,...,-0.067043,-0.319027,-0.977567,-0.020884,-0.893034,0.323284,0.151231,-0.032013,0.007618,1.78
180987,124776.0,-1.926359,-1.045605,0.583308,-1.450857,2.318088,1.333151,0.530273,0.667923,0.039086,...,0.242283,0.274507,0.430373,-0.104047,-0.901483,1.126428,0.867434,-0.135768,0.022554,148.16
174683,122022.0,1.605564,-0.422538,-2.349825,0.975706,1.295817,0.679935,0.677488,0.050085,-0.329590,...,-0.021243,0.313334,0.672855,-0.169364,-1.634733,0.233852,-0.389496,-0.022953,-0.061101,180.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [55]:
y = new_data['Class']
y

190603    0
180580    0
1271      0
180987    0
174683    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

In [58]:
# spliting into test and train dataset for traning
# making use of sklearn's train and test split function

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)
print(x.shape)
print(y.shape)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(984, 30)
(984,)
(787, 30)
(787,)
(197, 30)
(197,)


In [61]:
# Now so our data has only 2 class so it is a binary classification problem
# A logistic Regression model can be best to use here

model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [93]:
# defining a accuracy calculator function
def cal_accuracy(prediction,y):
    y = y.to_numpy()
    l = []
    for i in range(prediction.size):
        l.append(prediction[i] == y[i])
    accuracy = np.mean(l)
    return accuracy

In [123]:
# Calculating accuracy on train dataset
prediction = model.predict(X_train)
accuracy = cal_accuracy(prediction,Y_train)
print(f'Accuracy on train dataset is:- {accuracy * 100} %')

Accuracy on train dataset is:- 92.37611181702668 %


In [124]:
# Calculating accuracy on test dataset
prediction = model.predict(X_test)
accuracy = cal_accuracy(prediction,Y_test)
print(f'Accuracy on test dataset is:- {accuracy * 100} %')

Accuracy on test dataset is:- 90.35532994923858 %


In [131]:
# Accuracy on test data is pretty good 
# cheking for value from original dataset
check_data = data[:50000]
check_x = check_data.drop(['Class'],axis = 1)
check_y = check_data['Class']
result = model.predict(check_x)
accuracy = cal_accuracy(result,check_y) 
print(f'Accuracy on check dataset is:- {accuracy * 100} %')

Accuracy on check dataset is:- 87.35199999999999 %
