In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [2]:
# Load the transaction table from the CSV file in the working directory.
# NOTE(review): the final row shown by tail() is only partially populated,
# which suggests the file may be truncated — confirm the source file is complete.
credit_card_data = pd.read_csv(filepath_or_buffer="creditcard.csv")


In [3]:
# Preview the first rows to sanity-check the column names and value ranges.
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
# Inspect the last rows as well; problems such as incomplete trailing records
# only show up at the end of the file.
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279186,168697.0,-1.44854,-0.106524,1.590888,0.787416,0.843798,0.719463,0.358887,0.409272,-0.557388,...,-0.082357,-0.62261,0.248731,0.137849,0.462997,-0.684516,0.271892,0.166265,157.0,0.0
279187,168698.0,-1.698935,1.318773,-0.532258,0.115716,-2.031122,-0.712686,-1.078152,1.537223,0.065154,...,0.524217,1.101406,0.115851,0.473729,-0.908005,0.506611,-0.69547,-0.224998,74.9,0.0
279188,168698.0,-1.698935,1.318773,-0.532258,0.115716,-2.031122,-0.712686,-1.078152,1.537223,0.065154,...,0.524217,1.101406,0.115851,0.473729,-0.908005,0.506611,-0.69547,-0.224998,74.9,0.0
279189,168698.0,-1.752086,1.235061,-0.49509,0.136284,-1.842112,-0.84121,-1.254498,1.57291,0.07559,...,0.549229,1.086182,0.10071,0.471034,-0.914791,0.501103,-0.744175,-0.204416,32.48,0.0
279190,168698.0,-1.752086,1.235061,-0.49509,0.136284,-1.842112,,,,,...,,,,,,,,,,


In [5]:
# Summarise column dtypes and non-null counts for the whole frame.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279191 entries, 0 to 279190
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    279191 non-null  float64
 1   V1      279191 non-null  float64
 2   V2      279191 non-null  float64
 3   V3      279191 non-null  float64
 4   V4      279191 non-null  float64
 5   V5      279191 non-null  float64
 6   V6      279190 non-null  float64
 7   V7      279190 non-null  float64
 8   V8      279190 non-null  float64
 9   V9      279190 non-null  float64
 10  V10     279190 non-null  float64
 11  V11     279190 non-null  float64
 12  V12     279190 non-null  float64
 13  V13     279190 non-null  float64
 14  V14     279190 non-null  float64
 15  V15     279190 non-null  float64
 16  V16     279190 non-null  float64
 17  V17     279190 non-null  float64
 18  V18     279190 non-null  float64
 19  V19     279190 non-null  float64
 20  V20     279190 non-null  float64
 21  V21     27

In [6]:
# Count missing values per column to see which features are affected.
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [7]:
# Class distribution of the raw data (0 = legitimate, 1 = fraud).
credit_card_data.Class.value_counts()

0.0    278703
1.0       487
Name: Class, dtype: int64

In [8]:
# Separate the transactions by label. A row whose Class is missing matches
# neither comparison, so it ends up in neither subset.
legit = credit_card_data.loc[credit_card_data["Class"] == 0]
fraud = credit_card_data.loc[credit_card_data["Class"] == 1]

In [9]:
# Show (rows, columns) for each subset, one per line.
print(legit.shape, fraud.shape, sep="\n")

(278703, 31)
(487, 31)


In [10]:
# Summary statistics of the transaction amount for legitimate transactions.
legit["Amount"].describe()

count    278703.000000
mean         88.676558
std         250.460854
min           0.000000
25%           5.760000
50%          22.190000
75%          77.850000
max       25691.160000
Name: Amount, dtype: float64

In [11]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud["Amount"].describe()

count     487.000000
mean      121.913326
std       257.561705
min         0.000000
25%         1.000000
50%         9.130000
75%       105.350000
max      2125.870000
Name: Amount, dtype: float64

In [12]:
# Compare the per-class mean of every feature to see which ones differ
# between legitimate and fraudulent transactions.
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,93312.14889,0.007147,-0.013057,0.028317,-0.005329,-0.000463,0.00511,0.00736,-0.001612,0.004745,...,-0.000309,-0.001446,-0.001745,-0.000874,-0.00016,0.00274,-4.5e-05,-0.000292,-8.8e-05,88.676558
1.0,79834.240246,-4.816122,3.652183,-7.065,4.576639,-3.179607,-1.398985,-5.612526,0.570034,-2.597563,...,0.372003,0.716151,0.014237,-0.041043,-0.104202,0.041249,0.047424,0.168312,0.07591,121.913326


In [13]:
# Under-sampling: draw as many legitimate transactions as there are fraud
# transactions so the two classes are balanced. The sample size is derived
# from `fraud` instead of being hard-coded, and the seed is fixed (same value
# as the train/test split) so the notebook gives the same result on every
# Restart & Run All.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [14]:
# Stack the sampled legitimate rows and the fraud rows into one balanced frame
# (row-wise concatenation; original index labels are kept).
new_dataset = pd.concat(objs=[legit_sample, fraud], axis="index")

In [15]:
# Preview the first rows of the balanced dataset.
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
54321,46390.0,-2.446626,-1.392801,2.48609,-1.437006,-0.130777,0.247363,-0.122512,-0.217604,0.17186,...,-0.193329,0.546879,0.428545,0.025033,0.487434,-0.373896,-0.686355,0.157639,30.0,0.0
218141,141184.0,-0.255178,0.121335,-0.92681,-0.630456,2.038403,-1.615679,0.982834,-0.332275,-0.732437,...,0.432816,1.153237,-0.055591,-0.324124,-0.737607,0.098717,0.187306,0.253813,1.32,0.0
236254,148735.0,-1.653472,-0.523001,1.197025,-0.491487,0.514555,-0.179001,-0.693939,0.811481,-0.055643,...,0.428192,0.713897,-0.162769,0.762852,-0.166301,0.509718,-0.024955,-0.11187,28.75,0.0
172969,121309.0,-1.452337,-0.466853,-0.555007,-3.355574,2.117199,3.431942,-0.538528,1.577375,0.787465,...,0.319665,0.572263,0.028159,0.761226,-0.097191,0.546707,0.198838,0.055341,90.0,0.0
82686,59506.0,-0.47181,0.785264,1.833134,0.126919,0.015743,-0.598585,0.811016,-0.182698,0.202878,...,-0.246801,-0.312463,-0.184753,0.397333,-0.067566,0.213487,0.15019,-0.05075,4.3,0.0


In [16]:
# Verify that both classes now have the same number of rows.
new_dataset.Class.value_counts()

0.0    487
1.0    487
Name: Class, dtype: int64

In [17]:
# Per-class feature means on the balanced data; comparing these with the
# full-data means checks that the legitimate sample is still representative.
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,92263.850103,0.067381,-0.003737,-0.027522,-0.082441,0.054027,0.013951,0.103317,0.050098,0.032542,...,0.026185,-0.022195,0.012631,0.028038,0.017557,0.049854,-0.013015,-0.005725,-0.015707,89.72616
1.0,79834.240246,-4.816122,3.652183,-7.065,4.576639,-3.179607,-1.398985,-5.612526,0.570034,-2.597563,...,0.372003,0.716151,0.014237,-0.041043,-0.104202,0.041249,0.047424,0.168312,0.07591,121.913326


In [18]:
# Split the data into features (X) and target (Y).
# `columns=` already names the axis, so no separate `axis` argument is needed.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.Class

In [19]:
# Plain-text dump of the feature frame.
# NOTE(review): ending the cell with `X.head()` would give a compact rich
# display instead — consider switching.
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
54321    46390.0 -2.446626 -1.392801  2.486090 -1.437006 -0.130777  0.247363   
218141  141184.0 -0.255178  0.121335 -0.926810 -0.630456  2.038403 -1.615679   
236254  148735.0 -1.653472 -0.523001  1.197025 -0.491487  0.514555 -0.179001   
172969  121309.0 -1.452337 -0.466853 -0.555007 -3.355574  2.117199  3.431942   
82686    59506.0 -0.471810  0.785264  1.833134  0.126919  0.015743 -0.598585   
...          ...       ...       ...       ...       ...       ...       ...   
274382  165981.0 -5.766879 -8.402154  0.056543  6.950983  9.880564 -5.773192   
274475  166028.0 -0.956390  2.361594 -3.171195  1.970759  0.474761 -1.902598   
275992  166831.0 -2.027135 -1.131890 -1.135194  1.086963 -0.010547  0.423797   
276071  166883.0  2.091900 -0.757459 -1.192258 -0.755458 -0.620324 -0.322077   
276864  167338.0 -1.374424  2.793185 -4.346572  2.400731 -1.688433  0.111136   

              V7        V8        V9  .

In [20]:
# Plain-text dump of the target labels, aligned with X by index.
print(Y)

54321     0.0
218141    0.0
236254    0.0
172969    0.0
82686     0.0
         ... 
274382    1.0
274475    1.0
275992    1.0
276071    1.0
276864    1.0
Name: Class, Length: 974, dtype: float64


In [21]:
# Hold out 20% of the balanced data for testing. Stratifying on Y keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [22]:
# Shapes of the full, training and test feature frames, in that order.
print(*(part.shape for part in (X, X_train, X_test)))

(974, 30) (779, 30) (195, 30)


In [23]:
# Logistic regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled (Time and Amount span much larger
# ranges than the V1..V28 columns), so the default solver may hit its iteration
# limit before converging — consider scaling the inputs or raising max_iter;
# confirm against the output of the fit cell.
model = LogisticRegression()

In [24]:
# Train the classifier on the training split only.
model.fit(X=X_train, y=Y_train)

In [29]:
# Model evaluation: predict labels for the rows the model was trained on, to
# get a training-set baseline for comparison with the test score.
X_train_prediction = model.predict(X=X_train)


In [30]:
# Fraction of training rows classified correctly. scikit-learn metrics take
# (y_true, y_pred) in that order; accuracy gives the same value either way,
# but keeping the documented order avoids silent errors if this is later
# swapped for an asymmetric metric such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [31]:
# Report the accuracy measured on the training split.
print('Accuracy on Training data: ', training_data_accuracy)

Accuracy on Training data:  0.9255455712451861


In [32]:
# Accuracy on the held-out test data: predict labels for rows the model has
# not seen, then score them. Arguments follow scikit-learn's documented
# (y_true, y_pred) order so the call stays correct if the metric is changed.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [33]:
# Report the accuracy measured on the held-out test split.
print('Accuracy score on test data: ',test_data_accuracy)

Accuracy score on test data:  0.9076923076923077
