In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import (SMOTE, RandomOverSampler)
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

from sklearn import metrics
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import io
df = pd.read_csv(r"Banksim Dataset.csv")

In [2]:
df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [3]:
encoder=LabelEncoder()
df['customer']=encoder.fit_transform(df['customer'])
df['age']=encoder.fit_transform(df['age'])
df['gender']=encoder.fit_transform(df['gender'])
df['merchant']=encoder.fit_transform(df['merchant'])
df['category']=encoder.fit_transform(df['category'])
df.drop(['zipcodeOri','zipMerchant'],axis=1,inplace=True)
df=df.query('amount>0') #give filtered dataframe according to condition.
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,210,4,2,30,12,4.55,0
1,0,2753,2,2,30,12,39.68,0
2,0,2285,4,1,18,12,26.89,0
3,0,1650,3,2,30,12,17.25,0
4,0,3585,5,2,30,12,35.72,0


In [4]:
fraud = df[df['fraud']==1]
non_fraud = df[df['fraud']==0]
print(fraud.shape , non_fraud.shape)

(7200, 8) (587391, 8)


In [5]:
x=df.drop('fraud',axis=1)
y=df['fraud']
print(x.head())
print(y.head())

   step  customer  age  gender  merchant  category  amount
0     0       210    4       2        30        12    4.55
1     0      2753    2       2        30        12   39.68
2     0      2285    4       1        18        12   26.89
3     0      1650    3       2        30        12   17.25
4     0      3585    5       2        30        12   35.72
0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64


In [6]:
x_new=pd.DataFrame(data=df,columns=['amount','category','merchant','gender','age','customer'])
x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2,random_state=0)

In [7]:
models={'name':['logistic regression (simple)','logistic regression (under sampling)','logistic regression (over sampling)','logistic regression (smote)','support vector machine (simple)','support vector machine (under sampling)','support vector machine (over sampling)','support vector machine (smote)','decision tree (simple)','decision tree (under sampling)','decision tree (over sampling)','decision tree (smote)','random forest (simple)','random forest (under sampling)','random forest (over sampling)','random forest (smote)','naive bayes (simple)','naive bayes (under sampling)','naive bayes (over sampling)','naive bayes (smote)','knn (simple)','knn (under sampling)','knn (over sampling)','knn (smote)']}
models = pd.DataFrame(models)
models['accuracy']=np.nan
models['precision score']=np.nan
models['recall score']=np.nan
models['f1 score']=np.nan
models['difference in accuracy (in %)']=np.nan
models['difference in recall (in %)']=np.nan
models['difference in precision (in %)']=np.nan
models['difference in f1 score (in %)']=np.nan
models.head()

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
0,logistic regression (simple),,,,,,,,
1,logistic regression (under sampling),,,,,,,,
2,logistic regression (over sampling),,,,,,,,
3,logistic regression (smote),,,,,,,,
4,support vector machine (simple),,,,,,,,


In [8]:
def get_report(model,x_train,x_test,y_train,y_test,name):
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(name,'\n')
    classification_report_m=classification_report(y_test,y_pred)
    print(classification_report_m)
    confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
    print('confusion matrix')
    print(confusion_matrix, '\n')

    print('for test data')
    accuracy=accuracy_score(y_test,y_pred)
    print('accuracy =',accuracy)
    precision_score_m=precision_score(y_test,y_pred)
    print('presicion score = ',precision_score_m)
    recall_score_m=recall_score(y_test,y_pred)
    print('recall score =',recall_score_m)
    f1_score_m=f1_score(y_test,y_pred)
    print('F1 score =',f1_score_m)
    print('\n')

    print('for train data')
    y_pred_train=model.predict(x_train)
    accuracy_t=accuracy_score(y_train,y_pred_train)
    print('accuracy =',accuracy_t)
    precision_score_m_t=precision_score(y_train,y_pred_train)
    print('presicion score = ',precision_score_m_t)
    recall_score_m_t=recall_score(y_train,y_pred_train)
    print('recall score =',recall_score_m_t)
    f1_score_m_t=f1_score(y_train,y_pred_train)
    print('F1 score =',f1_score_m_t)
    print('\n')

    print('to understand whether our model is overfitting or underfitting')
    print('difference in f1 scores')
    print(f1_score_m_t,' - ',f1_score_m,' = ',f1_score_m_t-f1_score_m)
    print('in percentage = ',(f1_score_m_t-f1_score_m)*100)
    print('difference in recall scores')
    print(recall_score_m_t,' - ',recall_score_m,' = ',recall_score_m_t-recall_score_m)
    print('in percentage = ',(recall_score_m_t-recall_score_m)*100)
    print('difference in precision scores')
    print(precision_score_m_t,' - ',precision_score_m,' = ',precision_score_m_t-precision_score_m)
    print('in percentage = ',(precision_score_m_t-precision_score_m)*100)
    print('difference in accuracy scores')
    print(accuracy_t,' - ',accuracy,' = ',accuracy_t-accuracy)
    print('in percentage = ',(accuracy_t-accuracy)*100)

    models.loc[models['name'] == name, 'accuracy'] = accuracy
    models.loc[models['name'] == name, 'precision score'] = precision_score_m
    models.loc[models['name'] == name, 'recall score'] = recall_score_m
    models.loc[models['name'] == name, 'f1 score'] = f1_score_m
    models.loc[models['name'] == name, 'difference in f1 score (in %)'] = (f1_score_m_t-f1_score_m)*100
    models.loc[models['name'] == name, 'difference in accuracy (in %)'] = (accuracy_t-accuracy)*100
    models.loc[models['name'] == name, 'difference in recall (in %)'] = (recall_score_m_t-recall_score_m)*100
    models.loc[models['name'] == name, 'difference in precision (in %)'] = (precision_score_m_t-precision_score_m)*100

In [9]:
rus=RandomUnderSampler()
x2_train,y2_train=rus.fit_resample(x_train,y_train)
np.bincount(y2_train)

array([5735, 5735], dtype=int64)

In [10]:
ros = RandomOverSampler()
x3_train,y3_train=ros.fit_resample(x_train,y_train)
np.bincount(y3_train)

array([469937, 469937], dtype=int64)

In [11]:
smt=SMOTE()
x4_train,y4_train=smt.fit_resample(x_train,y_train)
np.bincount(y4_train)

array([469937, 469937], dtype=int64)

In [12]:
rfc =  RandomForestClassifier(n_estimators=100,bootstrap=True,n_jobs=4)

# SIMPLE

In [13]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

Grid_l= {'max_depth': [10,15,20,30,40],'max_features': ['auto','log2','sqrt'],'criterion':['gini','entropy'],'random_state':[8,10,15,20]} 

grid_search = GridSearchCV(rfc, param_grid=Grid_l, scoring='recall')

grid_search.fit(x_train, y_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

print('result', result)
print('best score', score)
print('best params', params)

result {'mean_fit_time': array([14.23521829, 13.75558758, 14.21446681, 14.48538542, 12.54910164,
       13.77043781, 14.20806355, 14.45376158, 14.63226743, 14.17529759,
       14.306423  , 14.57636957, 14.99150476, 14.7563385 , 15.47786779,
       15.30100131, 15.06449733, 14.6571125 , 15.38010826, 15.20387559,
       15.06401105, 12.45667467,  9.85767107,  9.98262343,  9.823843  ,
        9.74945006,  9.96726055, 10.02172771,  9.87679973,  9.88594279,
        9.95428171,  9.95849514,  9.81500235,  9.8215991 ,  9.96242499,
        9.93787313,  9.91089568,  9.58388491, 10.07163625, 10.02013206,
        9.72307372,  9.68545437, 10.17534676, 10.13025374,  9.83158627,
        9.66777706, 10.11795993, 10.19269271,  9.96380396,  9.47151175,
       10.01855063, 10.04225688, 10.06727476,  9.77300973, 10.06190977,
       10.05765529, 10.19556336,  9.68577042,  9.9978044 , 10.0915411 ,
        8.60122833,  7.88179383,  8.52897387,  8.34057627,  8.50574241,
        8.05231657,  8.6657124 ,  8.308

In [14]:
#Best Simple
rfc1 = RandomForestClassifier(n_estimators=100,max_features='auto',criterion='entropy',bootstrap=True,random_state=20,max_depth=30,n_jobs=4)

name='random forest (simple)'
get_report(rfc1,x_train,x_test,y_train,y_test,name)

random forest (simple) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117454
           1       0.87      0.76      0.81      1465

    accuracy                           1.00    118919
   macro avg       0.94      0.88      0.91    118919
weighted avg       1.00      1.00      1.00    118919

confusion matrix
[[117292    162]
 [   351   1114]] 

for test data
accuracy = 0.995686139304905
presicion score =  0.8730407523510971
recall score = 0.7604095563139932
F1 score = 0.8128420284567677


for train data
accuracy = 0.9999957954220555
presicion score =  1.0
recall score = 0.9996512641673932
F1 score = 0.9998256016742239


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9998256016742239  -  0.8128420284567677  =  0.18698357321745618
in percentage =  18.69835732174562
difference in recall scores
0.9996512641673932  -  0.7604095563139932  =  0.23924170785340004
in percentage =  23.92417078534

# UNDERSAMPLING

In [16]:
grid_search.fit(x2_train, y2_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

print('result', result)
print('best score', score)
print('best params', params)

result {'mean_fit_time': array([0.57811041, 0.1989852 , 0.18849792, 0.19013515, 0.19089022,
       0.19595966, 0.20692401, 0.20101385, 0.20355539, 0.20007377,
       0.20643263, 0.20524764, 0.21179724, 0.20355916, 0.20961499,
       0.20520487, 0.20745921, 0.20742712, 0.2159173 , 0.20779281,
       0.24685922, 0.23354712, 0.23826437, 0.22598901, 0.22989273,
       0.20124416, 0.2128396 , 0.20966125, 0.20982866, 0.20643716,
       0.21616635, 0.21373053, 0.20954127, 0.2141542 , 0.20636139,
       0.23417954, 0.22786269, 0.22238965, 0.21984882, 0.22978988,
       0.22682571, 0.21650763, 0.21059136, 0.2138979 , 0.21422734,
       0.2165834 , 0.21051521, 0.21554804, 0.21702805, 0.21594234,
       0.2195498 , 0.21693039, 0.21641707, 0.21120186, 0.21870246,
       0.20559096, 0.21649094, 0.21067719, 0.21414056, 0.21435094,
       0.23001208, 0.22567625, 0.2343204 , 0.23751049, 0.24820547,
       0.24737716, 0.24051275, 0.24029965, 0.24847918, 0.24054747,
       0.23940501, 0.24584031, 0.2842

In [17]:
#Best under-sample
rfc2 = RandomForestClassifier(n_estimators=100,max_features='auto',criterion='entropy',bootstrap=True,random_state=20,max_depth=10,n_jobs=4)
name='random forest (under sampling)'
get_report(rfc2,x2_train,x_test,y2_train,y_test,name)

random forest (under sampling) 

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    117454
           1       0.23      0.99      0.38      1465

    accuracy                           0.96    118919
   macro avg       0.62      0.97      0.68    118919
weighted avg       0.99      0.96      0.97    118919

confusion matrix
[[112681   4773]
 [    18   1447]] 

for test data
accuracy = 0.9597120729235866
presicion score =  0.23263665594855307
recall score = 0.9877133105802047
F1 score = 0.37657774886141837


for train data
accuracy = 0.9857890148212729
presicion score =  0.9739707383463763
recall score = 0.998256320836966
F1 score = 0.9859640058555067


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9859640058555067  -  0.37657774886141837  =  0.6093862569940883
in percentage =  60.93862569940883
difference in recall scores
0.998256320836966  -  0.9877133105802047  =  0.010543010256761298
in per

# OVER-SAMPLING

In [19]:
grid_search.fit(x3_train, y3_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

print('result', result)
print('best score', score)
print('best params', params)

result {'mean_fit_time': array([20.50305057, 19.79890976, 20.45437846, 20.03670292, 20.31825728,
       19.8976552 , 20.49704003, 20.07727661, 20.4401648 , 19.88713779,
       20.58961072, 20.12742472, 23.2279583 , 22.41934414, 23.07919226,
       22.67382812, 22.21075892, 21.01647339, 21.65433784, 21.40244093,
       21.96124959, 21.20155482, 21.64448285, 21.51237135, 22.38538146,
       21.67427611, 22.290376  , 22.11637726, 22.38324757, 21.62100391,
       22.66950307, 21.99306512, 22.48860011, 21.58941493, 22.38221745,
       22.08208675, 22.43574934, 21.93582249, 22.72919245, 22.24960899,
       22.47487712, 22.2144978 , 22.66192164, 22.24246349, 22.59778986,
       22.0793117 , 22.63071203, 22.31805005, 23.25303397, 22.6788857 ,
       22.63425746, 22.39747348, 22.59988551, 21.90379162, 22.38872685,
       22.18284783, 22.52380033, 21.94059229, 22.62921958, 22.21378102,
       19.06057005, 18.29713721, 19.11097198, 18.57382689, 18.97216902,
       18.3046628 , 18.64753771, 18.530

In [20]:
#Best over-sample
rfc3 = RandomForestClassifier(n_estimators=100,max_features='auto',criterion='gini',bootstrap=True,random_state=8,max_depth=15,n_jobs=4)
name='random forest (over sampling)'
get_report(rfc3,x3_train,x_test,y3_train,y_test,name)

random forest (over sampling) 

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    117454
           1       0.45      0.94      0.61      1465

    accuracy                           0.99    118919
   macro avg       0.73      0.96      0.80    118919
weighted avg       0.99      0.99      0.99    118919

confusion matrix
[[115799   1655]
 [    91   1374]] 

for test data
accuracy = 0.9853177372833609
presicion score =  0.4536150544734236
recall score = 0.9378839590443686
F1 score = 0.6114819759679573


for train data
accuracy = 0.99341720273143
presicion score =  0.9870054859658408
recall score = 1.0
F1 score = 0.9934602525629954


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.9934602525629954  -  0.6114819759679573  =  0.38197827659503814
in percentage =  38.197827659503815
difference in recall scores
1.0  -  0.9378839590443686  =  0.06211604095563139
in percentage =  6.211604095563139
diff

# SMOTE

In [22]:
grid_search.fit(x4_train, y4_train)

result = grid_search.cv_results_
estimator = grid_search.best_estimator_
score = grid_search.best_score_
params = grid_search.best_params_

print('result', result)
print('best score', score)
print('best params', params)

result {'mean_fit_time': array([27.6805707 , 26.76448355, 27.53696985, 27.03651085, 27.26339717,
       26.84505186, 27.11182175, 27.00448174, 27.21709013, 26.89730206,
       27.25474   , 27.01325207, 33.91004224, 33.71157417, 34.00066466,
       33.8257092 , 33.95954995, 33.79082003, 34.09840584, 34.05462351,
       34.01108403, 34.06827593, 34.39219704, 33.65041447, 35.76794863,
       35.63446012, 35.83796   , 35.6644084 , 35.6498301 , 35.56858587,
       35.88317237, 35.74653358, 35.63454661, 35.39705768, 35.77557373,
       35.68969097, 36.04789362, 35.95621381, 36.12073755, 36.12527037,
       35.98996644, 35.79263563, 36.24595222, 35.97550607, 36.38802218,
       35.96437712, 36.28502593, 35.92450099, 36.20626359, 35.97414751,
       36.37867098, 36.27575722, 36.12913599, 36.04766712, 36.4473608 ,
       36.13034348, 36.24588003, 36.89027586, 36.02123203, 36.0883215 ,
       28.58923597, 28.55544834, 28.54501686, 28.34144745, 28.77653785,
       28.4276053 , 28.68301735, 28.464

In [23]:
#Best Smote
rfc4 = RandomForestClassifier(n_estimators=100,max_features='auto',criterion = 'entropy',bootstrap=True,random_state=20,max_depth=30,n_jobs=4)
name='random forest (smote)'
get_report(rfc4,x4_train,x_test,y4_train,y_test,name)

random forest (smote) 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    117454
           1       0.61      0.86      0.72      1465

    accuracy                           0.99    118919
   macro avg       0.81      0.93      0.86    118919
weighted avg       0.99      0.99      0.99    118919

confusion matrix
[[116655    799]
 [   202   1263]] 

for test data
accuracy = 0.9915825057392006
presicion score =  0.6125121241513094
recall score = 0.8621160409556314
F1 score = 0.7161893960873262


for train data
accuracy = 0.9998957307043285
presicion score =  0.9997915048879339
recall score = 1.0
F1 score = 0.999895741575281


to understand whether our model is overfitting or underfitting
difference in f1 scores
0.999895741575281  -  0.7161893960873262  =  0.2837063454879548
in percentage =  28.370634548795483
difference in recall scores
1.0  -  0.8621160409556314  =  0.13788395904436856
in percentage =  13.788395904436857
difference i

In [24]:
models_used = models.dropna()
models_used

Unnamed: 0,name,accuracy,precision score,recall score,f1 score,difference in accuracy (in %),difference in recall (in %),difference in precision (in %),difference in f1 score (in %)
12,random forest (simple),0.995686,0.873041,0.76041,0.812842,0.430966,23.924171,12.695925,18.698357
13,random forest (under sampling),0.959712,0.232637,0.987713,0.376578,2.607694,1.054301,74.133408,60.938626
14,random forest (over sampling),0.985318,0.453615,0.937884,0.611482,0.809947,6.211604,53.339043,38.197828
15,random forest (smote),0.991583,0.612512,0.862116,0.716189,0.831322,13.788396,38.727938,28.370635
