In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#load the data set to pandas dataframe

In [3]:
df = pd.read_csv('creditcard.csv')

In [12]:
# data set information 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [13]:
#checking the missing values 

In [21]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [22]:
# distribution of non fraud transaction   0 = normal , 1 = fraud
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [23]:
# DATA SET IS UNBALANCE
# let's separate data for analysis


In [24]:
non_fraud = df[df.Class == 0]
fraud = df[df.Class == 1]

In [25]:
len(non_fraud)

284315

In [26]:
len(fraud)

492

In [32]:
non_fraud.shape

(284315, 31)

In [33]:
fraud.shape

(492, 31)

In [38]:
non_fraud.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [40]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [41]:
# in case of fraud mean trasaction amount is quite bigger than non_fraud

In [43]:
# lets compare both

df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [44]:
# lets build sample dataset which has same amount of normal transaction and 
# fraud transaction 

non_fraud_sample = non_fraud.sample(492)

In [45]:
# concat both dataframe
new_dataset = pd.concat([non_fraud_sample, fraud], axis = 0)   # axis 0 --> rows wise , 1 --> column wise

In [46]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
218904,141520.0,-0.866714,-0.808262,1.08592,-1.662097,-0.779169,0.253385,1.332639,-0.20032,-1.357462,...,0.337519,0.482387,0.156717,-0.435366,0.720889,0.017258,-0.253582,-0.169804,300.22,0
58705,48511.0,-1.82984,-1.893724,2.592044,-1.29404,0.109202,-1.004276,-1.328721,0.459543,-0.678508,...,0.552082,0.879124,0.050255,0.321103,0.472743,-0.142991,-0.026809,0.087486,75.0,0
203323,134765.0,-2.13513,1.437036,1.877689,4.273205,0.707419,1.832328,-2.953359,-1.517056,-1.688494,...,-1.23756,0.357523,-1.044799,0.195647,-0.322422,0.349369,0.094844,0.171594,0.76,0
238570,149725.0,2.086812,-0.2908,-1.651916,0.201814,0.333657,-0.331143,0.01126,-0.06507,0.920641,...,-0.311356,-0.878148,0.252104,-0.016249,-0.168839,0.294286,-0.091533,-0.066034,15.7,0
55024,46734.0,-1.097534,1.219093,0.673471,1.221561,0.317957,-0.100124,0.189294,0.340655,-0.332354,...,-0.04736,0.012401,-0.359302,-0.401744,0.087237,-0.228786,0.061524,0.086777,1.0,0


In [52]:
len(new_dataset)

984

In [53]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [57]:
new_dataset['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [65]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,91359.128049,-0.125715,0.001795,0.053977,-0.005672,-0.011539,-0.130682,0.048191,0.012593,0.098482,...,0.034325,-0.053664,0.030961,0.024331,-0.001597,-0.013244,0.049566,-0.024517,0.072828,90.953923
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [59]:
#spliting the data into features and target

In [60]:
X = new_dataset.drop(columns = 'Class', axis = 1)
Y = new_dataset['Class']

In [62]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
218904,141520.0,-0.866714,-0.808262,1.085920,-1.662097,-0.779169,0.253385,1.332639,-0.200320,-1.357462,...,0.806258,0.337519,0.482387,0.156717,-0.435366,0.720889,0.017258,-0.253582,-0.169804,300.22
58705,48511.0,-1.829840,-1.893724,2.592044,-1.294040,0.109202,-1.004276,-1.328721,0.459543,-0.678508,...,0.524810,0.552082,0.879124,0.050255,0.321103,0.472743,-0.142991,-0.026809,0.087486,75.00
203323,134765.0,-2.135130,1.437036,1.877689,4.273205,0.707419,1.832328,-2.953359,-1.517056,-1.688494,...,0.581006,-1.237560,0.357523,-1.044799,0.195647,-0.322422,0.349369,0.094844,0.171594,0.76
238570,149725.0,2.086812,-0.290800,-1.651916,0.201814,0.333657,-0.331143,0.011260,-0.065070,0.920641,...,-0.285916,-0.311356,-0.878148,0.252104,-0.016249,-0.168839,0.294286,-0.091533,-0.066034,15.70
55024,46734.0,-1.097534,1.219093,0.673471,1.221561,0.317957,-0.100124,0.189294,0.340655,-0.332354,...,0.053111,-0.047360,0.012401,-0.359302,-0.401744,0.087237,-0.228786,0.061524,0.086777,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [63]:
Y

218904    0
58705     0
203323    0
238570    0
55024     0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

In [66]:
# spliting data into Training and Testing

In [68]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 2)
# 80 % of data will be stored in X train, and corresponding labels will be in Y_train
# 20 % of data will be stored in X_test and corresponding labels will be in Y_test

In [69]:
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
100623,67571.0,-0.758469,-0.045410,-0.168438,-1.313275,-1.901763,0.739433,3.071892,-0.483422,0.618203,...,-0.032500,0.042619,0.397224,0.072229,-0.242276,0.560916,-0.540955,0.150606,-0.117140,549.06
257180,158044.0,1.961744,0.007209,-1.844403,0.250383,0.677941,-0.263530,0.169871,-0.098010,0.134735,...,-0.005064,-0.292246,-0.825942,0.249277,0.149055,-0.253030,0.171094,-0.066994,-0.035139,44.99
110739,71954.0,0.573099,-1.148593,0.239156,0.316650,-0.603098,0.584433,-0.047419,0.235221,0.334171,...,0.429927,-0.031714,-0.549299,-0.107709,-0.208749,-0.065096,0.852709,-0.091342,0.038542,291.40
156273,107715.0,-0.554230,0.733527,2.443014,-0.194203,0.067060,-0.081611,0.435840,-0.114464,1.308000,...,0.061990,-0.251149,-0.338597,-0.279574,-0.038238,0.212018,-0.752068,-0.089230,-0.155203,11.27
269062,163512.0,2.081067,-0.435541,-1.152896,0.240106,0.126376,0.350657,-0.500547,0.064577,1.572574,...,-0.211411,0.110420,0.606221,-0.060225,-0.284388,0.273656,-0.078689,0.019106,-0.051401,6.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130094,79281.0,-0.709814,1.265523,-0.192543,2.038304,1.687452,-1.144972,0.529495,0.095438,-1.374823,...,0.330207,-0.031726,-0.208690,-0.270956,-0.114737,0.483300,-0.003890,0.050792,0.079352,2.99
251814,155511.0,2.152721,-0.306779,-1.384323,-0.426428,-0.669358,-2.094397,-0.053912,-0.533191,-0.710025,...,-0.000340,0.025780,0.169811,0.275576,0.858061,-0.102637,-0.338293,-0.018276,-0.025697,10.00
223618,143456.0,-2.006582,3.676577,-5.463811,7.232058,-1.627859,-0.996755,-4.299833,2.268867,-3.651067,...,0.474414,0.713907,-0.063868,0.167947,-0.449864,0.023702,0.536905,0.485864,-0.042393,1.00
43061,41353.0,-15.020981,8.075240,-16.298091,5.664820,-11.918153,-4.246957,-14.716668,9.435084,-6.795398,...,-0.995787,2.525115,-0.832074,-0.186117,0.429781,0.697103,0.056031,-1.310888,-0.707403,34.12


In [70]:
Y_train

100623    1
257180    0
110739    0
156273    0
269062    0
         ..
130094    0
251814    0
223618    1
43061     1
234446    0
Name: Class, Length: 787, dtype: int64

In [71]:
X.shape,X_train.shape,X_test.shape

((984, 30), (787, 30), (197, 30))

In [72]:
# Training model

In [73]:
# Generally used the logistic Regression for binary classification problems
from sklearn.linear_model import LogisticRegression

In [74]:
model = LogisticRegression()

In [76]:
# train data using Logistic Regression
model.fit(X_train,Y_train)
# fit data to model

In [77]:
# Evaluation

In [82]:
# Accuracy --->
X_Train_prediction = model.predict(X_train)
accuracy = accuracy_score(X_Train_prediction,Y_train)

In [79]:
accuracy

0.9415501905972046

In [80]:
# also for test data

In [87]:
X_Test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(X_Test_prediction,Y_test)

In [88]:
test_accuracy

0.9187817258883249