In [2]:
# Fraud Detection with Machine Learning - Logistic Regression

### Problem Statement
In this session, you will work on creditcard_sampledata.csv, a dataset containing credit card transaction data. The challenge here is to train a Logistic Regression model for fraud detection and get the best possible performance. Please explain everything you do and try to write clean, structured code by using functions, classes, etc. Draw some conclusions from the results obtained.

OPTIONAL:
When you think you got the best possible performance with LR, you can try another method to improve the final results.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
# Load the sampled credit-card transactions (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="session_6_creditcard_sampledata.csv")
# A bare `df` renders a truncated preview of the frame;
# df.head() / df.tail() would show only the first / last 5 rows.
df

Unnamed: 0.1,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,258647,1.725265,-1.337256,-1.012687,-0.361656,-1.431611,-1.098681,-0.842274,-0.026594,-0.032409,...,0.414524,0.793434,0.028887,0.419421,-0.367529,-0.155634,-0.015768,0.010790,189.00,0
1,69263,0.683254,-1.681875,0.533349,-0.326064,-1.455603,0.101832,-0.520590,0.114036,-0.601760,...,0.116898,-0.304605,-0.125547,0.244848,0.069163,-0.460712,-0.017068,0.063542,315.17,0
2,96552,1.067973,-0.656667,1.029738,0.253899,-1.172715,0.073232,-0.745771,0.249803,1.383057,...,-0.189315,-0.426743,0.079539,0.129692,0.002778,0.970498,-0.035056,0.017313,59.98,0
3,281898,0.119513,0.729275,-1.678879,-1.551408,3.128914,3.210632,0.356276,0.920374,-0.160589,...,-0.335825,-0.906171,0.108350,0.593062,-0.424303,0.164201,0.245881,0.071029,0.89,0
4,86917,1.271253,0.275694,0.159568,1.003096,-0.128535,-0.608730,0.088777,-0.145336,0.156047,...,0.031958,0.123503,-0.174528,-0.147535,0.735909,-0.262270,0.015577,0.015955,6.53,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5045,223366,1.118331,2.074439,-3.837518,5.448060,0.071816,-1.020509,-1.808574,0.521744,-2.032638,...,0.289861,-0.172718,-0.021910,-0.376560,0.192817,0.114107,0.500996,0.259533,1.00,1
5046,239499,-2.150855,2.187917,-3.430516,0.119476,-0.173210,0.290700,-2.808988,-2.679351,-0.556685,...,-0.073205,0.561496,-0.075034,-0.437619,0.353841,-0.521339,0.144465,0.026588,50.00,1
5047,125342,-7.139060,2.773082,-6.757845,4.446456,-5.464428,-1.713401,-6.485365,3.409395,-3.053493,...,1.303250,-0.016118,-0.876670,0.382230,-1.054624,-0.614606,-0.766848,0.409424,106.90,1
5048,220725,-1.169203,1.863414,-2.515135,5.463681,-0.297971,1.364918,0.759219,-0.118861,-2.293921,...,-0.393090,-0.708692,0.471309,-0.078616,-0.544655,0.014777,-0.240930,-0.781055,324.59,1


In [8]:
# Dataset information: prints the index range, each column's non-null count
# and dtype, which makes missing data or unexpected types easy to spot.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5050 entries, 0 to 5049
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  5050 non-null   int64  
 1   V1          5050 non-null   float64
 2   V2          5050 non-null   float64
 3   V3          5050 non-null   float64
 4   V4          5050 non-null   float64
 5   V5          5050 non-null   float64
 6   V6          5050 non-null   float64
 7   V7          5050 non-null   float64
 8   V8          5050 non-null   float64
 9   V9          5050 non-null   float64
 10  V10         5050 non-null   float64
 11  V11         5050 non-null   float64
 12  V12         5050 non-null   float64
 13  V13         5050 non-null   float64
 14  V14         5050 non-null   float64
 15  V15         5050 non-null   float64
 16  V16         5050 non-null   float64
 17  V17         5050 non-null   float64
 18  V18         5050 non-null   float64
 19  V19         5050 non-null  

In [9]:
# Count missing values per column (a column of zeros means nothing to impute).
df.isna().sum()

Unnamed: 0    0
V1            0
V2            0
V3            0
V4            0
V5            0
V6            0
V7            0
V8            0
V9            0
V10           0
V11           0
V12           0
V13           0
V14           0
V15           0
V16           0
V17           0
V18           0
V19           0
V20           0
V21           0
V22           0
V23           0
V24           0
V25           0
V26           0
V27           0
V28           0
Amount        0
Class         0
dtype: int64

In [10]:
# Class balance: how many normal vs. fraudulent transactions are there?
# Label encoding: 0 = normal transaction, 1 = fraud.
df.Class.value_counts()

# The recorded counts (5000 vs. 50) show a heavily imbalanced dataset.

0    5000
1      50
Name: Class, dtype: int64

In [11]:
# Split the frame by label so each class can be inspected on its own.
normal = df.loc[df['Class'] == 0]
fraud = df.loc[df['Class'] == 1]

# One (rows, columns) tuple per line: normal first, then fraud.
print(normal.shape, fraud.shape, sep="\n")

(5000, 31)
(50, 31)


In [12]:
# Summary statistics of the transaction amount for normal transactions.
normal['Amount'].describe()

count    5000.000000
mean       85.843714
std       227.144663
min         0.000000
25%         4.997500
50%        20.325000
75%        74.960000
max      4584.880000
Name: Amount, dtype: float64

In [13]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

count      50.000000
mean      113.469000
std       234.409091
min         0.000000
25%         1.000000
50%        10.605000
75%       105.172500
max      1402.160000
Name: Amount, dtype: float64

In [14]:
# Per-class mean of every column (Class = 0 vs. Class = 1), to see which
# features differ most between normal and fraudulent transactions.
df.groupby(by='Class').mean()

Unnamed: 0_level_0,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,143084.8702,0.03503,0.011553,0.037444,-0.04576,-0.013825,-0.030885,0.014315,-0.022432,-0.002227,...,-0.002896,-0.010583,-0.010206,-0.003305,-0.000918,-0.002613,-0.004651,-0.009584,0.002414,85.843714
1,121384.7,-4.985211,3.321539,-7.293909,4.827952,-3.326587,-1.591882,-5.776541,1.395058,-2.537728,...,0.19458,0.703182,0.069065,-0.088374,-0.029425,-0.073336,-0.023377,0.380072,0.009304,113.469


In [15]:
# Under-sampling: draw as many normal transactions as there are fraud cases so
# the two classes are balanced (1:1) in the modelling dataset.
# random_state pins the draw so Restart & Run All reproduces the same sample;
# without it every run trains and evaluates on different rows. The seed value
# matches the one used for the train/test split.
normal_sample = normal.sample(n=len(fraud), random_state=2)

In [16]:
# Concatenate the sampled normal rows with all fraud rows into one balanced frame.
# axis=0 stacks the rows on top of each other (axis=1 would join columns instead).
new_df = pd.concat(objs=[normal_sample, fraud], axis=0)
new_df.head()

Unnamed: 0.1,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
3309,17073,1.036913,-0.450419,1.075731,0.873571,-1.226021,-0.387364,-0.567537,0.065728,0.929869,...,0.136697,0.334399,-0.120466,0.418172,0.294943,0.461636,-0.007248,0.037607,89.0,0
2516,9550,-1.960783,-1.02137,1.60187,-1.124639,1.91768,-1.689738,-0.680277,-0.039438,1.688404,...,-0.334513,-0.850762,-0.202247,-0.482227,-0.510374,0.43479,0.010254,0.120473,11.99,0
2579,264602,-2.012438,1.214886,-0.071997,3.391986,1.807812,-1.056313,1.041858,0.048216,-2.076481,...,-0.280944,-0.629357,0.747285,0.237802,-0.470854,-0.516683,-0.613293,0.315179,5.66,0
4122,149467,1.653011,-0.655863,-0.395625,1.456823,-0.522091,0.137831,-0.59356,0.095616,2.16128,...,0.177652,0.48628,-0.017018,-0.575114,-0.207406,-0.600661,-0.008163,-0.028511,153.0,0
2044,218728,-4.158473,-3.277814,0.188311,-1.671749,0.855371,-0.501198,-0.651103,0.404326,-0.554559,...,-0.040014,1.419716,1.584398,0.194192,0.72862,-0.047326,0.38983,0.117464,122.0,0


In [17]:
# Confirm the under-sampled frame holds the same number of rows per class.
new_df.Class.value_counts()

0    50
1    50
Name: Class, dtype: int64

In [18]:
# Per-class means of the balanced sample, to compare with the full-data means.
# The fraud rows are untouched, so their means are identical; the class-0 means
# remain broadly similar, suggesting the nature of the data has not changed.
# NOTE(review): with only 50 sampled normal rows the class-0 means are noisy
# (in the recorded run Amount averages 47.29 vs. 85.84 on the full data).
new_df.groupby(by='Class').mean()

Unnamed: 0_level_0,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,137481.62,-0.118536,0.122819,0.289072,0.020708,0.268841,-0.193072,0.10127,-0.135655,0.008814,...,-0.047013,-0.11423,0.102181,-0.006508,-0.013382,0.047639,0.049218,-0.074725,-0.019379,47.2866
1,121384.7,-4.985211,3.321539,-7.293909,4.827952,-3.326587,-1.591882,-5.776541,1.395058,-2.537728,...,0.19458,0.703182,0.069065,-0.088374,-0.029425,-0.073336,-0.023377,0.380072,0.009304,113.469


In [19]:
# Separate the predictors (X) from the label (Y).
# NOTE(review): X still contains 'Unnamed: 0', which looks like a row identifier
# carried over from the CSV — confirm whether it should be used as a feature.
X = new_df.drop(columns=['Class'])
Y = new_df.loc[:, 'Class']
print(X)
print(Y)

      Unnamed: 0        V1        V2         V3        V4        V5        V6  \
3309       17073  1.036913 -0.450419   1.075731  0.873571 -1.226021 -0.387364   
2516        9550 -1.960783 -1.021370   1.601870 -1.124639  1.917680 -1.689738   
2579      264602 -2.012438  1.214886  -0.071997  3.391986  1.807812 -1.056313   
4122      149467  1.653011 -0.655863  -0.395625  1.456823 -0.522091  0.137831   
2044      218728 -4.158473 -3.277814   0.188311 -1.671749  0.855371 -0.501198   
...          ...       ...       ...        ...       ...       ...       ...   
5045      223366  1.118331  2.074439  -3.837518  5.448060  0.071816 -1.020509   
5046      239499 -2.150855  2.187917  -3.430516  0.119476 -0.173210  0.290700   
5047      125342 -7.139060  2.773082  -6.757845  4.446456 -5.464428 -1.713401   
5048      220725 -1.169203  1.863414  -2.515135  5.463681 -0.297971  1.364918   
5049       63421 -8.461845  6.866198 -11.838269  4.194211 -6.923097 -3.221147   

            V7        V8   

In [21]:
# Hold out 20% of the balanced data for testing.
# stratify=Y keeps the class ratio the same in both partitions;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
print(X.shape, X_train.shape, X_test.shape)

(100, 30) (80, 30) (20, 30)


In [22]:
# Logistic Regression classifier with scikit-learn's default hyper-parameters.
# NOTE(review): the features are not scaled before fitting — confirm the solver
# converges, or consider standardising the inputs.
model = LogisticRegression()

In [23]:
# Train the model on the training partition only; the test partition stays
# unseen so it can give an honest estimate of generalisation.
model.fit(X_train, Y_train)

LogisticRegression()

In [24]:
# Accuracy on the training data.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric, so the printed value is unchanged, but the correct
# order matters as soon as an asymmetric metric (precision, recall) is swapped in.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print(training_data_accuracy)

0.9625


In [25]:
# Accuracy on the held-out test data.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric, so the printed value is unchanged, but the correct
# order matters as soon as an asymmetric metric (precision, recall) is swapped in.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print(test_data_accuracy)

0.9
