In [1]:
# important Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv("creditcard.csv")

In [4]:
print(data.head())

   Time        V1        V2        V3        V4        V5        V6        V7   
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599  \
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25   
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539  \
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
# missing value in the dataset
print(data.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [7]:
# Distribution of normal and fraud transactions
print(data['Class'].value_counts())

Class
0    284315
1       492
Name: count, dtype: int64


In [10]:
# seprate the data
normal = data[data.Class==0]
fraud = data[data.Class==1]

In [11]:
print(normal.shape)


(284315, 31)


In [12]:
print(fraud.shape)

(492, 31)


In [13]:
# Building new Normal dataset containing similar distribution of normal and fraud transaction
normal_sample= normal.sample(n=492)

In [14]:
#Condatenating the two dataset

In [15]:
new_dataset = pd.concat([normal_sample,fraud],axis=0)

In [19]:
print(new_dataset.head())

            Time        V1        V2        V3        V4        V5        V6   
164448  116721.0  1.986605 -0.284691 -0.298101  0.336718 -0.659719 -0.799912  \
169021  119489.0  2.320447 -1.301157 -0.816641 -1.583716 -1.190622 -0.518687   
40613    40317.0 -0.423882  1.067629  1.294850 -0.109265  0.308827 -0.175110   
49656    44152.0 -0.396845  0.881146  1.812273 -0.124597 -0.008582 -0.639788   
4593      3945.0 -1.476588  0.208667 -0.556085 -0.107826 -1.722552  0.879064   

              V7        V8        V9  ...       V21       V22       V23   
164448 -0.361154 -0.171338  0.882545  ... -0.245731 -0.578268  0.424240  \
169021 -1.193415 -0.180967 -1.188926  ... -0.167729  0.066457  0.287609   
40613   0.562237  0.188244 -0.722695  ... -0.180142 -0.427905 -0.004255   
49656   0.847876 -0.152882 -0.519462  ... -0.207629 -0.423261 -0.017994   
4593    3.428826 -0.521704  0.937382  ... -0.212722  0.148450  0.620162   

             V24       V25       V26       V27       V28  Amount  Cl

In [20]:
print(new_dataset.shape)

(984, 31)


In [21]:
#spliting the new_Dataset into X and Y
x = new_dataset.drop(columns='Class',axis=1)
y = new_dataset['Class']

In [22]:
print(x)

            Time        V1        V2        V3        V4        V5        V6   
164448  116721.0  1.986605 -0.284691 -0.298101  0.336718 -0.659719 -0.799912  \
169021  119489.0  2.320447 -1.301157 -0.816641 -1.583716 -1.190622 -0.518687   
40613    40317.0 -0.423882  1.067629  1.294850 -0.109265  0.308827 -0.175110   
49656    44152.0 -0.396845  0.881146  1.812273 -0.124597 -0.008582 -0.639788   
4593      3945.0 -1.476588  0.208667 -0.556085 -0.107826 -1.722552  0.879064   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [23]:
print(y)

164448    0
169021    0
40613     0
49656     0
4593      0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


In [24]:
#spliting the new_Dataset into Training Data and Testing Data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=42)

In [25]:
print(x_train,x_test,y_train,y_test)

            Time         V1         V2         V3        V4         V5   
145289   86807.0   0.000846   0.808698   0.284955 -0.580010   0.455361  \
187341  127488.0  -1.121240   1.630165  -1.140808 -0.783866   1.048392   
56703    47545.0   1.176716   0.557091  -0.490800  0.756424   0.249192   
150684   93888.0 -10.040631   6.139183 -12.972972  7.740555  -8.684705   
6882      8808.0  -4.617217   1.695694  -3.114372  4.328199  -1.873257   
...          ...        ...        ...        ...       ...        ...   
39183    39729.0  -0.964567  -1.643541  -0.187727  1.158253  -2.458336   
20878    31328.0  -1.958463  -2.126117   1.106243 -3.617768   0.641729   
143335   85285.0  -6.713407   3.921104  -9.746678  5.148263  -5.151563   
60305    49263.0  -0.954048   0.627640   1.664676 -0.195736   0.532346   
17317    28625.0 -27.848181  15.598193 -28.923756  6.418442 -20.346228   

              V6         V7         V8        V9  ...       V20       V21   
145289 -1.049945   1.002527  -0.18

In [26]:
# Model Training-Loistic Regression
model = LogisticRegression()
model.fit(x_train,y_train)


In [27]:
#Model Evaluation-Accuracy Score
#Accuracy on training data
y_predict = model.predict(x_test)

In [30]:
accuracy = accuracy_score(y_test,y_predict)
print(accuracy)

0.9593908629441624
