In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import (SMOTE, RandomOverSampler)
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

from sklearn import metrics
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import io
# Load the Banksim CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("Banksim Dataset.csv")

In [3]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [4]:
# Replace each categorical column with integer codes. The single LabelEncoder is
# re-fitted for every column, so afterwards it only holds the 'category' classes.
encoder = LabelEncoder()
for column in ('customer', 'age', 'gender', 'merchant', 'category'):
    df[column] = encoder.fit_transform(df[column])

# Remove the two zip-code columns, then keep only rows whose amount is positive.
df = df.drop(columns=['zipcodeOri', 'zipMerchant']).query('amount>0')
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,210,4,2,30,12,4.55,0
1,0,2753,2,2,30,12,39.68,0
2,0,2285,4,1,18,12,26.89,0
3,0,1650,3,2,30,12,17.25,0
4,0,3585,5,2,30,12,35.72,0


In [5]:
# Separate the rows by label to show how unbalanced the two classes are.
fraud_flag = df['fraud']
fraud = df.loc[fraud_flag == 1]
non_fraud = df.loc[fraud_flag == 0]
print(fraud.shape, non_fraud.shape)

(7200, 8) (587391, 8)


In [6]:
# Target is the 'fraud' column; x holds every remaining column.
y = df['fraud']
x = df.drop(columns='fraud')
print(x.head())
print(y.head())

   step  customer  age  gender  merchant  category  amount
0     0       210    4       2        30        12    4.55
1     0      2753    2       2        30        12   39.68
2     0      2285    4       1        18        12   26.89
3     0      1650    3       2        30        12   17.25
4     0      3585    5       2        30        12   35.72
0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64


In [7]:
# Model inputs: a fixed, ordered subset of df's columns.
# NOTE(review): 'step' is not in this list, so it is not used as a feature - confirm that is intended.
feature_columns = ['amount', 'category', 'merchant', 'gender', 'age', 'customer']
x_new = df.reindex(columns=feature_columns)

# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified on y - consider stratify=y given the class imbalance.
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, test_size=0.2, random_state=0
)

In [8]:
# Results table: one row per "<classifier> (<sampling strategy>)" combination,
# with every metric column starting out as NaN until a report fills it in.
classifier_labels = [
    'logistic regression',
    'support vector machine',
    'decision tree',
    'random forest',
    'naive bayes',
    'knn',
]
sampling_labels = ['simple', 'under sampling', 'over sampling', 'smote']
metric_columns = [
    'accuracy',
    'precision score',
    'recall score',
    'f1 score',
    'difference in accuracy (in %)',
    'difference in recall (in %)',
    'difference in precision (in %)',
    'difference in f1 score (in %)',
]

# Classifier-major order: all four sampling variants of one classifier, then the next.
row_names = [
    f'{classifier} ({sampling})'
    for classifier in classifier_labels
    for sampling in sampling_labels
]
models = pd.DataFrame({'name': row_names}).assign(
    **{column: np.nan for column in metric_columns}
)
models.head()

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
0,logistic regression (simple),,,,,,,,
1,logistic regression (under sampling),,,,,,,,
2,logistic regression (over sampling),,,,,,,,
3,logistic regression (smote),,,,,,,,
4,support vector machine (simple),,,,,,,,


In [9]:
def _print_split_scores(split_label, y_true, y_pred):
  """Print and return accuracy, precision, recall and F1 for one data split.

  Args:
    split_label: Word inserted into the 'for ... data' heading ('test' or 'train').
    y_true: True labels of the split.
    y_pred: Labels predicted for the same rows.

  Returns:
    dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
  """
  scores = {
    'accuracy': accuracy_score(y_true, y_pred),
    'precision': precision_score(y_true, y_pred),
    'recall': recall_score(y_true, y_pred),
    'f1': f1_score(y_true, y_pred),
  }
  print('for', split_label, 'data')
  print('accuracy =', scores['accuracy'])
  print('precision score = ', scores['precision'])
  print('recall score =', scores['recall'])
  print('F1 score =', scores['f1'])
  print('\n')
  return scores


def get_report(model,x_train,x_test,y_train,y_test,name):
  """Fit `model`, print test/train metrics and record them in the global `models` table.

  The model is fitted in place on (x_train, y_train). The function prints the
  classification report and confusion matrix for the test split, the four
  scores for the test and train splits, and the train-minus-test gap of each
  score (raw and multiplied by 100) as an over/under-fitting indicator.

  Args:
    model: Estimator exposing fit(x, y) and predict(x).
    x_train, y_train: Data the model is fitted on (may be a resampled set).
    x_test, y_test: Held-out data used for the reported test scores.
    name: Value of the 'name' column in `models` identifying the row(s) to fill.

  Returns:
    None. Results are printed and written into the module-level `models` frame.

  Warns:
    UserWarning: if no row of `models` has this `name`; the scores are still
      printed but nothing is stored.
  """
  import warnings

  # Resolve the target row(s) once, and before the (possibly slow) fit, so a
  # mistyped name is reported instead of silently dropping the results.
  row_mask = models['name'] == name
  if not row_mask.any():
    warnings.warn(
      f"get_report: no row named {name!r} in `models`; "
      "scores will be printed but not recorded",
      stacklevel=2,
    )

  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)

  print(name, '\n')
  print(classification_report(y_test, y_pred))
  print('confusion matrix')
  print(metrics.confusion_matrix(y_test, y_pred), '\n')

  test_scores = _print_split_scores('test', y_test, y_pred)
  train_scores = _print_split_scores('train', y_train, model.predict(x_train))

  print('to understand whether our model is overfitting or underfitting')
  gap_percent = {}
  # Same reporting order as before: f1, recall, precision, accuracy.
  for metric in ('f1', 'recall', 'precision', 'accuracy'):
    gap = train_scores[metric] - test_scores[metric]
    gap_percent[metric] = gap * 100
    print('difference in', metric, 'scores')
    print(train_scores[metric], ' - ', test_scores[metric], ' = ', gap)
    print('in percentage = ', gap_percent[metric])

  # Stored scores are the test-split values; the "difference" columns are train minus test, times 100.
  summary_row = {
    'accuracy': test_scores['accuracy'],
    'precision score': test_scores['precision'],
    'recall score': test_scores['recall'],
    'f1 score': test_scores['f1'],
    'difference in f1 score (in %)': gap_percent['f1'],
    'difference in accuracy (in %)': gap_percent['accuracy'],
    'difference in recall (in %)': gap_percent['recall'],
    'difference in precision (in %)': gap_percent['precision'],
  }
  for column, value in summary_row.items():
    models.loc[row_mask, column] = value

In [10]:
# Under-sample the training split only (the test split is left untouched).
# random_state pins which rows are kept, so the scores of models trained on
# this set are reproducible across kernel restarts.
rus=RandomUnderSampler(random_state=0)
x2_train,y2_train=rus.fit_resample(x_train,y_train)
np.bincount(y2_train)

array([5735, 5735], dtype=int64)

In [11]:
# Over-sample the training split only (the test split is left untouched).
# random_state pins which rows are duplicated, so the scores of models trained
# on this set are reproducible across kernel restarts.
ros = RandomOverSampler(random_state=0)
x3_train,y3_train=ros.fit_resample(x_train,y_train)
np.bincount(y3_train)

array([469937, 469937], dtype=int64)

In [12]:
# SMOTE-resample the training split only (the test split is left untouched).
# random_state pins the synthetic samples that are generated, so the scores of
# models trained on this set are reproducible across kernel restarts.
smt=SMOTE(random_state=0)
x4_train,y4_train=smt.fit_resample(x_train,y_train)
np.bincount(y4_train)

array([469937, 469937], dtype=int64)

In [21]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search for the random forest on the original (not resampled) training split.
# max_features='sqrt' is the setting the old 'auto' alias selected for classifiers;
# 'auto' itself is deprecated and rejected by newer scikit-learn releases.
rfc = RandomForestClassifier(n_estimators=100,max_features='sqrt',criterion='entropy',bootstrap=True,n_jobs=4)

# NOTE(review): random_state is a seed rather than a model setting, so searching over it
# mostly picks a lucky seed - consider fixing it and tuning only real hyper-parameters.
Grid_l= {'max_depth': [10,20,40] ,'random_state':[8,10,15] }

# Rank candidates by F1. Without `scoring` GridSearchCV uses the estimator's own score
# (accuracy), which is dominated by the non-fraud majority (587391 vs 7200 rows).
grid_search = GridSearchCV(rfc, param_grid=Grid_l, scoring='f1')

grid_search.fit(x_train, y_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

print('result', result)
print('best score', score)
print('best params', params)

result {'mean_fit_time': array([8.58561492, 7.81003914, 8.44563227, 8.84452848, 8.3046752 ,
       8.90112453, 8.71858478, 8.41666532, 9.00145316]), 'std_fit_time': array([0.48569705, 0.22625723, 0.31797688, 0.23597342, 0.11317108,
       0.1980089 , 0.14383158, 0.27435875, 0.23924819]), 'mean_score_time': array([0.22277932, 0.21406393, 0.21855807, 0.22547913, 0.22365379,
       0.23080931, 0.23062449, 0.22583027, 0.23554325]), 'std_score_time': array([0.00768527, 0.00554502, 0.00368918, 0.00772929, 0.01029113,
       0.00744644, 0.00507027, 0.00718447, 0.00994417]), 'param_max_depth': masked_array(data=[10, 10, 10, 20, 20, 20, 40, 40, 40],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_random_state': masked_array(data=[8, 10, 15, 8, 10, 15, 8, 10, 15],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',

In [23]:
# Random forest used for all four sampling variants below; get_report re-fits it on each call.
# max_features='sqrt' is the setting the old 'auto' alias selected for classifiers;
# 'auto' itself is deprecated and rejected by newer scikit-learn releases.
rfc1 = RandomForestClassifier(n_estimators=100,max_features='sqrt',bootstrap=True,criterion='entropy',random_state=10,max_depth=20,n_jobs=4)

name='random forest (simple)'
get_report(rfc1,x_train,x_test,y_train,y_test,name)

random forest (simple) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117454
           1       0.89      0.75      0.82      1465

    accuracy                           1.00    118919
   macro avg       0.94      0.88      0.91    118919
weighted avg       1.00      1.00      1.00    118919

confusion matrix
[[117318    136]
 [   360   1105]] 

for test data
accuracy = 0.9958290937528906
presicion score =  0.8904109589041096
recall score = 0.7542662116040956
F1 score = 0.8167036215816704


for train data
accuracy = 0.9991380615213845
presicion score =  0.9998122770790313
recall score = 0.9286835222319093
F1 score = 0.962936177906346


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.962936177906346  -  0.8167036215816704  =  0.14623255632467558
in percentage =  14.623255632467558
difference in recall scores
0.9286835222319093  -  0.7542662116040956  =  0.17441731062781374
in percentage = 

In [24]:
# Re-fit the forest on the under-sampled training set; evaluate on the untouched test split.
name = 'random forest (under sampling)'
get_report(model=rfc1, x_train=x2_train, x_test=x_test, y_train=y2_train, y_test=y_test, name=name)

random forest (under sampling) 

              precision    recall  f1-score   support

           0       1.00      0.97      0.98    117454
           1       0.26      0.98      0.41      1465

    accuracy                           0.97    118919
   macro avg       0.63      0.97      0.70    118919
weighted avg       0.99      0.97      0.98    118919

confusion matrix
[[113388   4066]
 [    24   1441]] 

for test data
accuracy = 0.9656068416316989
presicion score =  0.261666969311785
recall score = 0.9836177474402731
F1 score = 0.41336775674125076


for train data
accuracy = 0.9999128160418483
presicion score =  1.0
recall score = 0.9998256320836966
F1 score = 0.9999128084401431


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9999128084401431  -  0.41336775674125076  =  0.5865450516988924
in percentage =  58.654505169889234
difference in recall scores
0.9998256320836966  -  0.9836177474402731  =  0.016207884643423487
in percentage =  1.6

In [25]:
# Re-fit the forest on the over-sampled training set; evaluate on the untouched test split.
name = 'random forest (over sampling)'
get_report(model=rfc1, x_train=x3_train, x_test=x_test, y_train=y3_train, y_test=y_test, name=name)

random forest (over sampling) 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    117454
           1       0.64      0.88      0.74      1465

    accuracy                           0.99    118919
   macro avg       0.82      0.94      0.87    118919
weighted avg       0.99      0.99      0.99    118919

confusion matrix
[[116737    717]
 [   177   1288]] 

for test data
accuracy = 0.9924822778529924
presicion score =  0.6423940149625935
recall score = 0.8791808873720136
F1 score = 0.7423631123919308


for train data
accuracy = 0.9979901561273107
presicion score =  0.9959964054545531
recall score = 1.0
F1 score = 0.9979941874972791


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9979941874972791  -  0.7423631123919308  =  0.2556310751053483
in percentage =  25.563107510534834
difference in recall scores
1.0  -  0.8791808873720136  =  0.12081911262798639
in percentage =  12.08191126279864
dif

In [26]:
# Re-fit the forest on the SMOTE-resampled training set; evaluate on the untouched test split.
name = 'random forest (smote)'
get_report(model=rfc1, x_train=x4_train, x_test=x_test, y_train=y4_train, y_test=y_test, name=name)

random forest (smote) 

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    117454
           1       0.54      0.90      0.68      1465

    accuracy                           0.99    118919
   macro avg       0.77      0.94      0.84    118919
weighted avg       0.99      0.99      0.99    118919

confusion matrix
[[116352   1102]
 [   152   1313]] 

for test data
accuracy = 0.9894550071897679
presicion score =  0.5436853002070393
recall score = 0.8962457337883959
F1 score = 0.6768041237113401


for train data
accuracy = 0.9970655641075293
presicion score =  0.9948961540009789
recall score = 0.9992573472614414
F1 score = 0.9970719816887383


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9970719816887383  -  0.6768041237113401  =  0.3202678579773982
in percentage =  32.02678579773982
difference in recall scores
0.9992573472614414  -  0.8962457337883959  =  0.10301161347304555
in percentage =  

In [27]:
# Keep only the rows without any NaN, i.e. the rows whose metrics have all been filled in.
models_used = models.dropna(axis='index', how='any')
models_used

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
12,random forest (simple),0.995829,0.890411,0.754266,0.816704,0.330897,17.441731,10.940132,14.623256
13,random forest (under sampling),0.965607,0.261667,0.983618,0.413368,3.430597,1.620788,73.833303,58.654505
14,random forest (over sampling),0.992482,0.642394,0.879181,0.742363,0.550788,12.081911,35.360239,25.563108
15,random forest (smote),0.989455,0.543685,0.896246,0.676804,0.761056,10.301161,45.121085,32.026786
