In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from imblearn.over_sampling import SMOTE 
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [2]:
path_of_input_file = r'D:\kaggle_trials\creditcardfraud\creditcard.csv'
df                 = pd.read_csv(path_of_input_file)
df.tail(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [3]:
num_labels = df['Class'].unique()
print('The number of labels are ',len(num_labels))
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Class']==num_labels[i]]))
print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')

The number of labels are  2
The number of  0  labels are :-  284315
The number of  1  labels are :-  492
We dont have a balanced dataset and hence we need to perform imbalanced dataset handling


In [4]:
scalar                 = MinMaxScaler()
scaled_col             = scalar.fit_transform(df[['Amount']]) 
X_already_preprocessed = df[df.columns[:-2]].values
X                      = np.concatenate([X_already_preprocessed,scaled_col],axis=1)
Y                      = df[df.columns[-1]].values

In [5]:
sm           = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X, Y)
print('Positive examples before Oversampling is ', sum(Y == 1))
print('Negative examples before Oversampling is ', sum(Y == 0))
print('\n')
print('Positive examples after Oversampling is ', sum(Y_res == 1))
print('Negative examples after Oversampling is ', sum(Y_res == 0))
print('\n')

Positive examples before Oversampling is  492
Negative examples before Oversampling is  284315


Positive examples after Oversampling is  284315
Negative examples after Oversampling is  284315




<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Tuning and setting Grid for parameters:</b> We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use
</div>

In [7]:
logistic_reg_grid  = {'penalty' : hp.choice('penalty',['l1','l2']),
                        'C': hp.uniform('C',0.0,1.0),
                        
                     }

In [8]:
def hyperopt_train_test(params):
    reg = LogisticRegression(**params)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, logistic_reg_grid, algo=tpe.suggest, max_evals=5, trials=trials)
best_parameters = space_eval(logistic_reg_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████████| 5/5 [00:41<00:00,  8.33s/it, best loss: -0.9798441918858796]
The best parameter tuned on training set is given by :-  {'C': 0.6860615150086679, 'penalty': 'l1'}


<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [9]:
model = LogisticRegression(**best_parameters)
model.fit(X_train, y_train)

LogisticRegression(C=0.6860615150086679, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.9189150552683918
