In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
credit_card_df = pd.read_csv('creditcard.csv')
credit_card_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [11]:
# dataset informations
credit_card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134763 entries, 0 to 134762
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    134763 non-null  int64  
 1   V1      134763 non-null  float64
 2   V2      134763 non-null  float64
 3   V3      134763 non-null  float64
 4   V4      134763 non-null  float64
 5   V5      134763 non-null  float64
 6   V6      134763 non-null  float64
 7   V7      134763 non-null  float64
 8   V8      134763 non-null  float64
 9   V9      134763 non-null  float64
 10  V10     134763 non-null  float64
 11  V11     134763 non-null  float64
 12  V12     134763 non-null  float64
 13  V13     134763 non-null  float64
 14  V14     134763 non-null  float64
 15  V15     134763 non-null  float64
 16  V16     134763 non-null  float64
 17  V17     134763 non-null  float64
 18  V18     134763 non-null  float64
 19  V19     134763 non-null  float64
 20  V20     134763 non-null  float64
 21  V21     13

In [12]:
credit_card_df.shape

(134763, 31)

In [13]:
# distribution of legit transactions & fraudulent transactions
credit_card_df['Class'].value_counts()

0.0    134500
1.0       262
Name: Class, dtype: int64

In [14]:
# checking the number of missing values in each column
credit_card_df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [15]:
credit_card_df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,134763.0,134763.0,134763.0,134763.0,134763.0,134763.0,134763.0,134763.0,134763.0,134763.0,...,134763.0,134762.0,134762.0,134762.0,134762.0,134762.0,134762.0,134762.0,134762.0,134762.0
mean,50680.319131,-0.245326,0.006623,0.675897,0.141477,-0.285222,0.078025,-0.11741,0.063781,-0.087465,...,-0.038128,-0.116508,-0.034394,0.012418,0.130786,0.023135,0.000535,0.00227,91.913697,0.001944
std,20378.24231,1.820046,1.618062,1.271608,1.326068,1.312503,1.283723,1.169059,1.208212,1.092231,...,0.718025,0.635475,0.59869,0.594691,0.437984,0.492344,0.390216,0.309722,249.294206,0.04405
min,0.0,-56.40751,-72.715728,-33.680984,-5.519697,-42.147898,-26.160506,-31.764946,-73.216718,-9.283925,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.53433,-22.565679,-11.710896,0.0,0.0
25%,37341.0,-1.017743,-0.570875,0.172363,-0.716447,-0.907849,-0.66121,-0.604879,-0.131899,-0.714125,...,-0.225554,-0.54549,-0.172469,-0.32353,-0.135358,-0.326302,-0.060363,-0.004153,6.05,0.0
50%,52508.0,-0.260097,0.097217,0.751889,0.167984,-0.317436,-0.175587,-0.066231,0.07951,-0.151273,...,-0.058524,-0.092714,-0.045513,0.068909,0.168013,-0.066728,0.011655,0.023571,24.115,0.0
75%,67716.0,1.15829,0.766549,1.36593,0.99765,0.236686,0.464507,0.409484,0.371297,0.492772,...,0.11453,0.303329,0.082667,0.408572,0.419698,0.288313,0.086458,0.076969,81.9275,0.0
max,80950.0,1.960497,18.902453,9.382558,16.715537,34.801666,22.529298,36.677268,20.007208,15.594995,...,27.202839,10.50309,19.002942,4.016342,5.541598,3.517346,12.152401,33.847808,19656.53,1.0


In [18]:
legit = credit_card_df[credit_card_df.Class==0]
fraud = credit_card_df[credit_card_df['Class'] == 1]

In [19]:
legit.value_counts()

Time   V1         V2         V3        V4         V5         V6         V7         V8         V9         V10         V11        V12        V13        V14        V15        V16        V17        V18        V19        V20        V21        V22        V23        V24        V25        V26        V27        V28        Amount  Class
43153  -2.086016   2.203265  1.654339   2.941050  -1.683045   0.529728  -1.352162   1.793449  -0.723686   0.600365   -0.982212  -0.551636  -1.337000   0.834403   1.251862   0.033455   1.067978   0.160510   0.213087   0.079002   0.216444   0.567241  -0.035345   0.370201   0.157378   0.440341   0.210230   0.090558  0.76    0.0      9
64947  -6.370459   3.306401  1.724991  -1.589581   0.174936   0.232403   2.967884  -3.421005   7.937413   11.515880   3.023644  -0.781401  -1.140026  -5.800852   0.406259  -1.723657  -2.375979  -0.529492  -0.100033   2.943972  -1.614332   0.387513  -0.364322   0.480224   0.154904  -0.535302  -4.787907  -3.696616  12.33   0.0      5
687

In [20]:
legit

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134757,80949,-1.083428,0.107919,0.963281,-2.489488,-0.638445,-1.454265,0.531511,0.006472,1.539550,...,-0.096107,-0.197681,-0.210752,0.382254,0.260441,-0.954895,-0.279019,-0.196329,57.01,0.0
134758,80949,0.120139,2.063931,-1.959708,1.420907,1.014997,-1.218084,0.818542,-0.015111,-0.162497,...,-0.170880,0.119557,0.119090,-0.123530,-0.507578,-0.447859,0.029418,-0.383972,0.89,0.0
134759,80949,1.111266,-0.124477,1.374073,0.881283,-0.962547,0.009063,-0.614334,0.125082,0.730352,...,-0.027973,0.196891,0.085341,0.480376,0.176152,0.321774,0.040823,0.030144,11.50,0.0
134760,80950,-1.192845,1.154940,1.200060,-1.334743,-0.442220,-0.687518,0.130546,0.567359,0.107404,...,-0.052138,-0.161462,0.023800,0.083906,-0.365689,0.758129,0.250869,0.167202,0.92,0.0


In [24]:
legit.shape

(134500, 31)

In [22]:
fraud.head().value_counts()

Time  V1         V2         V3         V4        V5         V6         V7         V8         V9         V10        V11        V12         V13        V14        V15        V16        V17         V18        V19        V20        V21        V22        V23        V24        V25        V26        V27        V28        Amount  Class
406   -2.312227   1.951992  -1.609851  3.997906  -0.522188  -1.426545  -2.537387   1.391657  -2.770089  -2.772272   3.202033  -2.899907   -0.595222  -4.289254   0.389724  -1.140747  -2.830056   -0.016822   0.416956   0.126911   0.517232  -0.035049  -0.465211   0.320198   0.044519   0.177840   0.261145  -0.143276  0.00    1.0      1
472   -3.043541  -3.157307   1.088463  2.288644   1.359805  -1.064823   0.325574  -0.067794  -0.270953  -0.838587  -0.414575  -0.503141    0.676502  -1.692029   2.000635   0.666780   0.599717    1.725321   0.283345   2.102339   0.661696   0.435477   1.375966  -0.293803   0.279798  -0.145362  -0.252773   0.035764  529.00  1.0      1
446

In [23]:
fraud.shape

(262, 31)

In [27]:
legit_sample = legit.sample(n=492)
credit_card_df = pd.concat([legit_sample , fraud],axis=0)

In [28]:
credit_card_df['Class'].value_counts

<bound method IndexOpsMixin.value_counts of 17389     0.0
86561     0.0
114877    0.0
54948     0.0
109585    0.0
         ... 
124115    1.0
124176    1.0
125342    1.0
128479    1.0
131272    1.0
Name: Class, Length: 754, dtype: float64>

In [29]:
credit_card_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,49112.416667,-0.251749,0.037725,0.618169,0.10867,-0.159343,0.188732,-0.128017,-0.029624,-0.063263,...,0.065451,-0.061693,-0.179495,0.00708,0.026092,0.146176,0.023095,-0.037959,0.016764,89.415671
1.0,42119.370229,-5.644679,3.962258,-7.190628,4.521182,-4.002069,-1.48968,-5.96575,1.512608,-2.610845,...,0.240131,1.263063,-0.315132,-0.117179,-0.104567,0.200913,0.05637,0.491164,0.081891,116.235115


In [30]:
X =credit_card_df.drop('Class' , axis=1)
Y = credit_card_df['Class']

In [31]:
X.shape

(754, 30)

In [32]:
Y.shape

(754,)

In [33]:
# statistical measures of the data
legit.Amount.describe()

count    134500.000000
mean         91.866320
std         249.299295
min           0.000000
25%           6.080000
50%          24.150000
75%          81.780000
max       19656.530000
Name: Amount, dtype: float64

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

## Model Training

In [37]:
model=LogisticRegression()
model.fit(X_train, Y_train)
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9519071310116086


In [38]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9337748344370861
