In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [4]:

credit_card_data = pd.read_csv('/content/creditcard.csv')

credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:

credit_card_data.info()
credit_card_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251530 entries, 0 to 251529
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    251530 non-null  float64
 1   V1      251530 non-null  float64
 2   V2      251530 non-null  float64
 3   V3      251530 non-null  float64
 4   V4      251530 non-null  float64
 5   V5      251530 non-null  float64
 6   V6      251530 non-null  float64
 7   V7      251530 non-null  float64
 8   V8      251530 non-null  float64
 9   V9      251530 non-null  float64
 10  V10     251530 non-null  float64
 11  V11     251529 non-null  float64
 12  V12     251529 non-null  float64
 13  V13     251529 non-null  float64
 14  V14     251529 non-null  float64
 15  V15     251529 non-null  float64
 16  V16     251529 non-null  float64
 17  V17     251529 non-null  float64
 18  V18     251529 non-null  float64
 19  V19     251529 non-null  float64
 20  V20     251529 non-null  float64
 21  V21     25

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [7]:

credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,251069
1.0,460


In [8]:

legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [9]:
print(legit.shape)
print(fraud.shape)

(251069, 31)
(460, 31)


In [10]:

legit.Amount.describe()

Unnamed: 0,Amount
count,251069.0
mean,90.648755
std,251.564984
min,0.0
25%,6.0
50%,23.05
75%,79.8
max,19656.53


In [11]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,460.0
mean,124.626543
std,262.514861
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [12]:

credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,85768.559101,-0.024864,-0.027628,0.109996,0.014835,-0.030354,0.020056,-0.004724,0.000841,0.005772,...,0.003536,-0.003086,-0.010405,-0.00572,0.001006,0.019989,0.001102,-0.00098,0.000569,90.648755
1.0,75114.213043,-5.025759,3.778657,-7.234344,4.621235,-3.406637,-1.380303,-5.844333,0.620114,-2.62747,...,0.369732,0.753907,0.010465,-0.053682,-0.098913,0.044822,0.046625,0.15977,0.074308,124.626543


In [26]:
legit_sample = legit.sample(n=len(fraud))

In [27]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [28]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
225017,144064.0,1.933139,-0.584518,-0.221092,0.06298,-0.681431,-0.027251,-0.787517,0.060868,1.485873,...,0.044678,0.233297,0.266046,0.599885,-0.408391,-0.123982,0.022637,-0.018859,43.25,0.0
132357,79957.0,1.202664,0.1731,0.610347,0.557375,-0.56164,-0.795665,-0.066637,-0.046508,-0.161764,...,-0.197385,-0.630491,0.1546,0.518258,0.145127,0.065946,-0.035495,0.009964,1.79,0.0
226824,144812.0,-0.572591,0.690133,1.607736,0.759355,0.027632,0.421647,0.298751,0.350403,-0.153964,...,-0.09744,-0.249292,0.010762,-0.490082,-0.309712,-0.7239,0.419281,0.210132,48.0,0.0
68903,53184.0,-1.444902,1.527407,-0.547589,-1.681947,0.156726,0.448573,-0.518405,1.384639,-0.837659,...,0.116859,-0.172685,-0.113432,-1.631809,-0.065512,0.84559,-0.692857,-0.192255,0.77,0.0
175548,122385.0,1.945428,-0.56058,-0.628315,-0.095889,-0.239605,0.400559,-0.695757,0.134898,1.121242,...,0.219134,0.805546,0.056716,0.266856,-0.084573,0.098148,0.008369,-0.043289,35.95,0.0


In [30]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,460
1.0,460


In [31]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,84533.75,-0.068181,-0.031748,0.093887,-0.045396,-0.036687,0.045308,-0.012342,0.043877,0.037011,...,0.00668,0.037365,0.00216,-0.013648,0.027876,0.037319,0.000924,-0.018161,-0.017132,86.735761
1.0,75114.213043,-5.025759,3.778657,-7.234344,4.621235,-3.406637,-1.380303,-5.844333,0.620114,-2.62747,...,0.369732,0.753907,0.010465,-0.053682,-0.098913,0.044822,0.046625,0.15977,0.074308,124.626543


In [32]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [33]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
225017  144064.0  1.933139 -0.584518 -0.221092  0.062980 -0.681431 -0.027251   
132357   79957.0  1.202664  0.173100  0.610347  0.557375 -0.561640 -0.795665   
226824  144812.0 -0.572591  0.690133  1.607736  0.759355  0.027632  0.421647   
68903    53184.0 -1.444902  1.527407 -0.547589 -1.681947  0.156726  0.448573   
175548  122385.0  1.945428 -0.560580 -0.628315 -0.095889 -0.239605  0.400559   
...          ...       ...       ...       ...       ...       ...       ...   
249607  154493.0 -7.381547 -7.449015 -4.696287  3.728439  6.198304 -6.406267   
249828  154599.0  0.667714  3.041502 -5.845112  5.967587  0.213863 -1.462923   
249963  154657.0 -0.679521  4.672553 -6.814798  7.143500  0.928654 -1.873013   
250761  155054.0 -0.512349  4.827060 -7.973939  7.334059  0.367704 -2.055129   
251477  155359.0 -1.067713  5.262312 -8.438567  7.316487  0.008254 -2.125936   

              V7        V8        V9  .

In [34]:
print(Y)

225017    0.0
132357    0.0
226824    0.0
68903     0.0
175548    0.0
         ... 
249607    1.0
249828    1.0
249963    1.0
250761    1.0
251477    1.0
Name: Class, Length: 920, dtype: float64


In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(920, 30) (736, 30) (184, 30)


In [36]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [37]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)


In [38]:

X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.970108695652174


In [39]:

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9402173913043478


In [44]:
import pickle
with open("fraud_model.pkl", "wb") as f:
    pickle.dump(model, f)