In [11]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from imblearn.over_sampling import SMOTE 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA,TruncatedSVD
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import string
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data for analysis :</b> We will load the data for analysis and the column names are mostly the preprocessed names as an outcome of applying Principal Component Analysis, so we may not have to do dimensionality reduction from our end.
</div>

In [12]:
path_of_input_file = r'D:\kaggle_trials\creditcardfraud\creditcard.csv'
df                 = pd.read_csv(path_of_input_file)
df.tail(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


<div class="alert alert-block alert-info">
<b>Checking data imbalance:</b> We will usually see less frauds as compared to usual transactions. So it is apparent that the sample size of fraud data will be much less as compared to the usual transactions.
</div>

In [13]:
num_labels = df['Class'].unique()
print('The number of labels are ',len(num_labels))
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Class']==num_labels[i]]))
print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')

The number of labels are  2
The number of  0  labels are :-  284315
The number of  1  labels are :-  492
We dont have a balanced dataset and hence we need to perform imbalanced dataset handling


<div class="alert alert-block alert-info">
<b>Feature Extraction :</b> We will extract the X matrix and Y label from the data given. We will also perform the MinMax Scaling on one column provided - Amount
</div>

In [14]:
scalar                 = MinMaxScaler()
scaled_col             = scalar.fit_transform(df[['Amount']]) 
X_already_preprocessed = df[df.columns[:-2]].values
X                      = np.concatenate([X_already_preprocessed,scaled_col],axis=1)
Y                      = df[df.columns[-1]].values

<div class="alert alert-block alert-info">
<b>Balanced data creation:</b> We will create balanced data having equal number of fraud and non-fraud transactions. I used SMOTE here.
</div>

In [15]:
sm           = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X, Y)

In [16]:
print('Positive examples before Oversampling is ', sum(Y == 1))
print('Negative examples before Oversampling is ', sum(Y == 0))
print('\n')
print('Positive examples after Oversampling is ', sum(Y_res == 1))
print('Negative examples after Oversampling is ', sum(Y_res == 0))
print('\n')

Positive examples before Oversampling is  492
Negative examples before Oversampling is  284315


Positive examples after Oversampling is  284315
Negative examples after Oversampling is  284315




<div class="alert alert-block alert-info">
<b>Train Test Split:</b> We perform Train test split on the data 
</div>

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Hyper-parameter Grid:</b> We set up the hyper-parameter gird for analysis. Since only one feature was enough to be tuned to get fair enough results, I excluded all other hyper-parameters to be tuned and took then at their default value.
</div>

In [18]:
extra_trees_grid = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
}

In [20]:
def hyperopt_train_test(params):
    clf = ExtraTreesClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, extra_trees_grid, algo=tpe.suggest, max_evals=3, trials=trials)
best_parameters = space_eval(extra_trees_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████████| 3/3 [01:16<00:00, 23.42s/it, best loss: -0.9998871336720009]
The best parameter tuned on training set is given by :-  {'n_estimators': 32}


<div class="alert alert-block alert-info">
<b>Fitting the tuned model:</b> Using the hyper parameters tuned, we fit them on our train data and get necessary results for analysis 
</div>

In [21]:
model = ExtraTreesClassifier(**best_parameters)
model.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [22]:
y_pred = model.predict(X_test)

In [23]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93757
           1       1.00      1.00      1.00     93891

    accuracy                           1.00    187648
   macro avg       1.00      1.00      1.00    187648
weighted avg       1.00      1.00      1.00    187648

