In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import (SMOTE, RandomOverSampler)
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

from sklearn import metrics
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import io
# Load the BankSim transactions; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="Banksim Dataset.csv")

In [2]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [3]:
# Replace each listed column with integer codes. The single encoder is refit
# per column, so after this cell it only holds the mapping of the last column
# ('category').
encoder = LabelEncoder()
for column in ('customer', 'age', 'gender', 'merchant', 'category'):
    df[column] = encoder.fit_transform(df[column])

# Drop the two zip-code columns, then keep only rows whose amount is
# strictly positive.
df = df.drop(columns=['zipcodeOri', 'zipMerchant']).query('amount>0')
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,210,4,2,30,12,4.55,0
1,0,2753,2,2,30,12,39.68,0
2,0,2285,4,1,18,12,26.89,0
3,0,1650,3,2,30,12,17.25,0
4,0,3585,5,2,30,12,35.72,0


In [4]:
# Split the rows by label and print both shapes to show the class balance.
fraud = df.loc[df['fraud'].eq(1)]
non_fraud = df.loc[df['fraud'].eq(0)]
print(fraud.shape, non_fraud.shape)

(7200, 8) (587391, 8)


In [5]:
# Target is the 'fraud' column; x is every other column of df.
y = df['fraud']
x = df.drop(columns=['fraud'])
print(x.head(), y.head(), sep='\n')

   step  customer  age  gender  merchant  category  amount
0     0       210    4       2        30        12    4.55
1     0      2753    2       2        30        12   39.68
2     0      2285    4       1        18        12   26.89
3     0      1650    3       2        30        12   17.25
4     0      3585    5       2        30        12   35.72
0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64


In [6]:
# Feature matrix: the six listed columns of df, in this order ('step' is
# deliberately not among them).
x_new = df.loc[:, ['amount', 'category', 'merchant', 'gender', 'age', 'customer']]

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on y; with a class this rare,
# consider stratify=y -- confirm before changing, recorded outputs depend on it.
x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Results table: one row per "<classifier> (<sampling strategy>)" label, with
# every metric column initialised to NaN until get_report fills it in.
models = pd.DataFrame({
    'name': [
        '{} ({})'.format(classifier, sampling)
        for classifier in (
            'logistic regression',
            'support vector machine',
            'decision tree',
            'random forest',
            'naive bayes',
            'knn',
        )
        for sampling in ('simple', 'under sampling', 'over sampling', 'smote')
    ]
})

# Column order matters for how the table is displayed, so keep it explicit.
for metric_column in (
    'accuracy',
    'precision score',
    'recall score',
    'f1 score',
    'difference in accuracy (in %)',
    'difference in recall (in %)',
    'difference in precision (in %)',
    'difference in f1 score (in %)',
):
    models[metric_column] = np.nan

models.head()

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
0,logistic regression (simple),,,,,,,,
1,logistic regression (under sampling),,,,,,,,
2,logistic regression (over sampling),,,,,,,,
3,logistic regression (smote),,,,,,,,
4,support vector machine (simple),,,,,,,,


In [8]:
# (metric key, printed heading, column of `models` that stores the gap),
# in the order the gaps are printed.
_GAP_REPORT = (
    ('f1 score', 'difference in f1 scores', 'difference in f1 score (in %)'),
    ('recall score', 'difference in recall scores', 'difference in recall (in %)'),
    ('precision score', 'difference in precision scores', 'difference in precision (in %)'),
    ('accuracy', 'difference in accuracy scores', 'difference in accuracy (in %)'),
)


def _evaluate_split(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    The dict keys are the metric column names of the ``models`` table. The
    scikit-learn scorers are called with their default arguments.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision score': precision_score(y_true, y_pred),
        'recall score': recall_score(y_true, y_pred),
        'f1 score': f1_score(y_true, y_pred),
    }


def _print_split(title, scores):
    """Print the four metrics of one split under the given title."""
    print(title)
    print('accuracy =', scores['accuracy'])
    print('precision score = ', scores['precision score'])
    print('recall score =', scores['recall score'])
    print('F1 score =', scores['f1 score'])
    print('\n')


def get_report(model,x_train,x_test,y_train,y_test,name):
    """Fit ``model``, print a train/test evaluation and record it in ``models``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``x_train``/``y_train``; any previous fit is replaced.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels used for the reported scores.
    name : str
        Label printed as the report title and used to locate the row of the
        module-level ``models`` table that receives the results.

    Returns
    -------
    None
        The function communicates through printed output and by writing the
        test-set metrics and the train-minus-test gaps (multiplied by 100)
        into the row(s) of ``models`` whose ``name`` equals ``name``.

    Notes
    -----
    If no row of ``models`` matches ``name`` a warning is issued, because the
    scores would otherwise be dropped without any indication.
    """
    import warnings

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print(name, '\n')
    print(classification_report(y_test, y_pred))
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred), '\n')

    test_scores = _evaluate_split(y_test, y_pred)
    _print_split('for test data', test_scores)

    # Scores on the data the model was fitted on, used only for the gap below.
    train_scores = _evaluate_split(y_train, model.predict(x_train))
    _print_split('for train data', train_scores)

    print('to understand whether our model is overfitting or underfitting')
    gaps_pct = {}
    for metric, heading, column in _GAP_REPORT:
        gap = train_scores[metric] - test_scores[metric]
        print(heading)
        print(train_scores[metric], ' - ', test_scores[metric], ' = ', gap)
        print('in percentage = ', gap * 100)
        gaps_pct[column] = gap * 100

    # Build the row selector once instead of once per column.
    row = models['name'] == name
    if not row.any():
        warnings.warn(
            'get_report: no row named %r in `models`; results were printed '
            'but not recorded' % (name,)
        )
        return

    for metric, value in test_scores.items():
        models.loc[row, metric] = value
    for column, value in gaps_pct.items():
        models.loc[row, column] = value

In [9]:
# Random under-sampling of the training split only (the test split is left
# untouched). The sampler is seeded so the same rows are kept on every run;
# without a seed the resampled set, and every score derived from it, changes
# between executions.
rus=RandomUnderSampler(random_state=0)
x2_train,y2_train=rus.fit_resample(x_train,y_train)
np.bincount(y2_train)

array([5735, 5735], dtype=int64)

In [10]:
# Random over-sampling of the training split only. Seeded so the duplicated
# rows are the same on every run (an unseeded sampler makes the downstream
# scores non-reproducible).
ros = RandomOverSampler(random_state=0)
x3_train,y3_train=ros.fit_resample(x_train,y_train)
np.bincount(y3_train)

array([469937, 469937], dtype=int64)

In [11]:
# SMOTE over-sampling of the training split only. Seeded so the synthetic
# samples are the same on every run (an unseeded sampler makes the downstream
# scores non-reproducible).
smt=SMOTE(random_state=0)
x4_train,y4_train=smt.fit_resample(x_train,y_train)
np.bincount(y4_train)

array([469937, 469937], dtype=int64)

In [43]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the random forest on the original training split.
#
# Changes relative to the first attempt (which was interrupted before finishing):
# * max_features='sqrt' -- the 'auto' alias meant 'sqrt' for classifiers and is
#   rejected by current scikit-learn releases.
# * random_state is no longer part of the grid: it is a seed, not a model
#   hyper-parameter, and searching over it multiplied the number of fits by
#   five while only selecting a lucky seed.
# * scoring='f1': the default (accuracy) is dominated by the majority class
#   here, so it barely distinguishes the candidates.
rfc =  RandomForestClassifier(n_estimators=100,max_features='sqrt',bootstrap=True,random_state=9,n_jobs=4)

Grid_l= { 'max_depth': [10,20,30,40,60] }

grid_search = GridSearchCV(rfc, param_grid=Grid_l, scoring='f1')

grid_search.fit(x_train, y_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

# Show one compact line per candidate instead of dumping the whole
# cv_results_ dict (per-fold arrays, timings, ...).
print('result')
print(pd.DataFrame(result)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].to_string(index=False))
print('best score', score)
print('best params', params)

KeyboardInterrupt: 

In [78]:
# Random forest used for all four sampling variants below.
# max_features='sqrt' replaces the 'auto' alias: for classifiers the two select
# the same number of features, but 'auto' is rejected by current scikit-learn
# releases.
rfc1 = RandomForestClassifier(n_estimators=100,max_features='sqrt',bootstrap=True,criterion='entropy',random_state=10,max_depth=40,n_jobs=4)

# Baseline: fitted on the original (imbalanced) training split.
name='random forest (simple)'
get_report(rfc1,x_train,x_test,y_train,y_test,name)

random forest (simple) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117454
           1       0.87      0.76      0.81      1465

    accuracy                           1.00    118919
   macro avg       0.93      0.88      0.90    118919
weighted avg       1.00      1.00      1.00    118919

confusion matrix
[[117292    162]
 [   354   1111]] 

for test data
accuracy = 0.9956609120493781
presicion score =  0.8727415553809897
recall score = 0.7583617747440273
F1 score = 0.8115412710007305


for train data
accuracy = 0.9999978977110278
presicion score =  1.0
recall score = 0.9998256320836966
F1 score = 0.9999128084401431


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9999128084401431  -  0.8115412710007305  =  0.1883715374394126
in percentage =  18.837153743941258
difference in recall scores
0.9998256320836966  -  0.7583617747440273  =  0.24146385733966924
in percentage =  24.1463857339

In [80]:
# Refit the same forest on the randomly under-sampled training set; scoring
# still uses the original test split.
name = 'random forest (under sampling)'
get_report(
    model=rfc1,
    x_train=x2_train,
    x_test=x_test,
    y_train=y2_train,
    y_test=y_test,
    name=name,
)

random forest (under sampling) 

              precision    recall  f1-score   support

           0       1.00      0.97      0.98    117454
           1       0.26      0.99      0.42      1465

    accuracy                           0.97    118919
   macro avg       0.63      0.98      0.70    118919
weighted avg       0.99      0.97      0.98    118919

confusion matrix
[[113440   4014]
 [    21   1444]] 

for test data
accuracy = 0.9660693413163582
presicion score =  0.26456577500916084
recall score = 0.985665529010239
F1 score = 0.41716019066878524


for train data
accuracy = 1.0
presicion score =  1.0
recall score = 1.0
F1 score = 1.0


to understand whether our model is overfitting or underfitting
difference in f1 scores
1.0  -  0.41716019066878524  =  0.5828398093312148
in percentage =  58.283980933121484
difference in recall scores
1.0  -  0.985665529010239  =  0.014334470989761039
in percentage =  1.433447098976104
difference in precision scores
1.0  -  0.26456577500916084  

In [81]:
# Refit the same forest on the randomly over-sampled training set; scoring
# still uses the original test split.
name = 'random forest (over sampling)'
get_report(
    model=rfc1,
    x_train=x3_train,
    x_test=x_test,
    y_train=y3_train,
    y_test=y_test,
    name=name,
)

random forest (over sampling) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117454
           1       0.81      0.79      0.80      1465

    accuracy                           1.00    118919
   macro avg       0.90      0.89      0.90    118919
weighted avg       1.00      1.00      1.00    118919

confusion matrix
[[117186    268]
 [   305   1160]] 

for test data
accuracy = 0.9951815941943676
presicion score =  0.8123249299719888
recall score = 0.7918088737201365
F1 score = 0.8019357068786728


for train data
accuracy = 1.0
presicion score =  1.0
recall score = 1.0
F1 score = 1.0


to understand whether our model is overfitting or underfitting
difference in f1 scores
1.0  -  0.8019357068786728  =  0.19806429312132723
in percentage =  19.80642931213272
difference in recall scores
1.0  -  0.7918088737201365  =  0.20819112627986347
in percentage =  20.819112627986346
difference in precision scores
1.0  -  0.8123249299719888  =  

In [82]:
# Refit the same forest on the SMOTE-resampled training set; scoring still
# uses the original test split.
name = 'random forest (smote)'
get_report(
    model=rfc1,
    x_train=x4_train,
    x_test=x_test,
    y_train=y4_train,
    y_test=y_test,
    name=name,
)

random forest (smote) 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    117454
           1       0.62      0.86      0.72      1465

    accuracy                           0.99    118919
   macro avg       0.81      0.93      0.86    118919
weighted avg       0.99      0.99      0.99    118919

confusion matrix
[[116684    770]
 [   209   1256]] 

for test data
accuracy = 0.9917675056130644
presicion score =  0.6199407699901284
recall score = 0.8573378839590444
F1 score = 0.719564594672014


for train data
accuracy = 1.0
presicion score =  1.0
recall score = 1.0
F1 score = 1.0


to understand whether our model is overfitting or underfitting
difference in f1 scores
1.0  -  0.719564594672014  =  0.28043540532798605
in percentage =  28.043540532798605
difference in recall scores
1.0  -  0.8573378839590444  =  0.14266211604095558
in percentage =  14.266211604095558
difference in precision scores
1.0  -  0.6199407699901284  =  0.3800592

In [83]:
# Keep only the rows of the results table with no NaN left, i.e. the
# configurations for which get_report has recorded every metric.
models_used = models.dropna(axis=0, how='any')
models_used

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
12,random forest (simple),0.995661,0.872742,0.758362,0.811541,0.433699,24.146386,12.725844,18.837154
13,random forest (under sampling),0.966069,0.264566,0.985666,0.41716,3.393066,1.433447,73.543422,58.283981
14,random forest (over sampling),0.995182,0.812325,0.791809,0.801936,0.481841,20.819113,18.767507,19.806429
15,random forest (smote),0.991768,0.619941,0.857338,0.719565,0.823249,14.266212,38.005923,28.043541
