In [4]:
import pandas as pd
from numpy import mean
from numpy import std
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [5]:
# get the dataset
train_df = pd.read_csv('drive/MyDrive/Colab Notebooks/fraudTrain.csv')
reduced_train_df = train_df.sample(n=300000, random_state=42)
# train_df.head()

#get dataset for training
test_df = pd.read_csv('drive/MyDrive/Colab Notebooks/fraudTest.csv')
reduced_test_df = test_df.sample(n=250000, random_state=42)
test_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [7]:
#training mode
train_col_list = reduced_train_df.columns.tolist()
print(train_col_list)
train_col_to_drop = ['trans_num', 'unix_time', 'Unnamed: 0']
train_col_to_drop = [col for col in train_col_to_drop if col in reduced_train_df.columns]  # Filter out columns that do not exist
reduced_train_df = reduced_train_df.drop(columns=train_col_to_drop, axis=1)
#reduced_train_df.head()

#testing mode
test_col_list = reduced_test_df.columns.tolist()
print(test_col_list)
test_col_to_drop = ['trans_num', 'unix_time', 'Unnamed: 0']
test_col_to_drop = [col for col in test_col_to_drop if col in reduced_test_df.columns]  # Filter out columns that do not exist
reduced_test_df = reduced_test_df.drop(columns=test_col_to_drop, axis=1)
#test_df.head()

['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merch_lat', 'merch_long', 'is_fraud']
['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merch_lat', 'merch_long', 'is_fraud']


In [9]:
#train section
x_train = reduced_train_df.drop('is_fraud', axis=1)
y_train = reduced_train_df['is_fraud']
# reduced_train_df.info()
# print(reduced_train_df.is_fraud.value_counts())
# print(x_train.shape)

#test section
x_test = reduced_test_df.drop('is_fraud', axis=1)
y_test = reduced_test_df['is_fraud']
# reduced_test_df.info()
# print(x_test.shape)

In [10]:
# Handling Preprocessing
label_encoder = LabelEncoder()
#x_train.isnull().sum()

# train section
x_train_encoded = x_train.copy()
for col in x_train.columns:
    if x_train[col].dtype == 'object':
        x_train_encoded[col] = label_encoder.fit_transform(x_train[col])

# test section
x_test_encoded = x_test.copy()
for col in x_test.columns:
    if x_test[col].dtype == 'object':
        x_test_encoded[col] = label_encoder.fit_transform(x_test[col])

print(x_train_encoded.head())
print(x_test_encoded.head())

#Print the numbers of rows and columns after encoding
print(x_train_encoded.shape)
print(x_test_encoded.shape)

         trans_date_trans_time            cc_num  merchant  category     amt  \
1045211                 240584      577588686219       629         9  194.51   
547406                  125875    30376238035123       180         5   52.32   
110142                   25214  4658490815480264       429        12    6.53   
1285953                 296309  3514897282719543       187         6    7.33   
271705                   62564  6011381817520024        92         2   64.29   

         first  last  gender  street  city  state    zip      lat      long  \
1045211    151   413       1     247   760     38  15686  40.6153  -79.4545   
547406      87    99       0     705   742     37  97476  42.8250 -124.4409   
110142     320   360       0     487   396     38  15449  39.9636  -79.7853   
1285953    310   125       1     830   260     34  14425  42.9580  -77.3083   
271705     203     5       0     848   415     50  82221  41.6423 -104.1974   

         city_pop  job  dob  merch_lat  merc

In [11]:
# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(x_train_encoded, y_train)

# Print the class distribution after SMOTE
print(y_train_smote.value_counts())

# Apply ADASYN to balance the training data
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(x_train_encoded, y_train)

# Print the class distribution after ADASYN
print(y_train_adasyn.value_counts())

# Print the class distribution
print(y_train.value_counts())



is_fraud
0    298256
1    298256
Name: count, dtype: int64
is_fraud
1    298470
0    298256
Name: count, dtype: int64
is_fraud
0    298256
1      1744
Name: count, dtype: int64


In [18]:
#Random Forest

#==============================================
#train with Unbalanced data
#==============================================
# rf = RandomForestClassifier(n_estimators=100)
# rf.fit(x_train_encoded, y_train)

# # make predictions
# y_unblncd_predict = rf.predict(x_test_encoded)

# # training set performance
# rf_unblncd_f1_score = f1_score(y_test, y_unblncd_predict, average='weighted')
# rf_unblncd_matt_corr = matthews_corrcoef(y_test, y_unblncd_predict)
# rf_unblncd_precision = precision_score(y_test, y_unblncd_predict, average='weighted',)
# rf_unblncd_recall = recall_score(y_test, y_unblncd_predict, average='weighted',)

# # result
# print('Unbalanced Performance Result')
# print('Acc F1:', rf_unblncd_f1_score)
# print('Acc Matt_Corr:', rf_unblncd_matt_corr)
# print('Acc Precision:', rf_unblncd_precision)
# print('Acc Recall:', rf_unblncd_recall)

#==============================================
# train with SMOTE Balnced data
#==============================================
rf_smote = RandomForestClassifier(n_estimators=300)
rf_smote.fit(X_train_smote, y_train_smote)

# make predictions
y_smote_predict = rf_smote.predict(x_test_encoded)

# testing set performance
rf_smote_f1_score = f1_score(y_test, y_smote_predict, average='weighted')
rf_smote_matt_corr = matthews_corrcoef(y_test, y_smote_predict)
rf_smote_precision = precision_score(y_test, y_smote_predict, average='weighted',)
rf_smote_recall = recall_score(y_test, y_smote_predict, average='weighted',)

print('===================================')
# result
print('SMOTE Performance Result')
print('Acc F1:', rf_smote_f1_score)
print('Acc Matt_Corr:', rf_smote_matt_corr)
print('Acc Precision:', rf_smote_precision)
print('Acc Recall:', rf_smote_recall)

#==============================================
# train with ADASYN Balanced data
#==============================================
rf_adasyn = RandomForestClassifier(n_estimators=450)
rf_adasyn.fit(X_train_adasyn, y_train_adasyn)

# make predictions
y_adasyn_predict = rf_adasyn.predict(x_test_encoded)

# testing set performance
rf_adasyn_f1_score = f1_score(y_test, y_adasyn_predict, average='weighted')
rf_adasyn_matt_corr = matthews_corrcoef(y_test, y_adasyn_predict)
rf_adasyn_precision = precision_score(y_test, y_adasyn_predict, average='weighted',)
rf_adasyn_recall = recall_score(y_test, y_adasyn_predict, average='weighted',)

print('===================================')
# result
print('ADASYN Performance Result')
print('Acc F1:', rf_adasyn_f1_score)
print('Acc Matt_Corr:', rf_adasyn_matt_corr)
print('Acc Precision:', rf_adasyn_precision)
print('Acc Recall:', rf_adasyn_recall)

SMOTE Performance Result
Acc F1: 0.9907905646885283
Acc Matt_Corr: 0.2923818174996081
Acc Precision: 0.9949868186084769
Acc Recall: 0.987428
ADASYN Performance Result
Acc F1: 0.990574324780904
Acc Matt_Corr: 0.2893027118010219
Acc Precision: 0.9949820792119805
Acc Recall: 0.987036


In [12]:
#================================
#train with Unbalanced data
#================================

#Neural Network Classifier
# nn_classifier = MLPClassifier(alpha=1, max_iter=50)
# nn_classifier.fit(x_train_encoded, y_train)

# # make predictions
# y_unblncd_predict = nn_classifier.predict(x_test_encoded)

# # training set performance
# nn_unblncd_f1_score = f1_score(y_test, y_unblncd_predict)
# nn_unblncd_matt_corr = matthews_corrcoef(y_test, y_unblncd_predict)
# nn_unblncd_precision = precision_score(y_test, y_unblncd_predict)
# nn_unblncd_recall = recall_score(y_test, y_unblncd_predict)

# print('Unbalanced Performance Result')
# print('Acc F1:', nn_unblncd_f1_score)
# print('Acc Matt_Corr:', nn_unblncd_matt_corr)
# print('Acc Precision:', nn_unblncd_precision)
# print('Acc Recall:', nn_unblncd_recall)

#==============================================
#train with SMOTE
#==============================================

nn_smote_classifier = MLPClassifier(alpha=1, max_iter=150)
nn_smote_classifier.fit(X_train_smote, y_train_smote)

# make predictions
y_smote_predict = nn_smote_classifier.predict(x_test_encoded)

# training set performance
nn_smote_f1_score = f1_score(y_test, y_smote_predict, average='weighted')
nn_smote_matt_corr = matthews_corrcoef(y_test, y_smote_predict)
nn_smote_precision = precision_score(y_test, y_smote_predict, average='weighted', zero_division=1)
nn_smote_recall = recall_score(y_test, y_smote_predict, average='weighted', zero_division=1)

print('===================================')
print('SMOTE Performance Result')
print('Acc F1:', nn_smote_f1_score)
print('Acc Matt_Corr:', nn_smote_matt_corr)
print('Acc Precision:', nn_smote_precision)
print('Acc Recall:', nn_smote_recall)

#==============================================
#train with ADASYN
#==============================================

nn_adasyn_classifier = MLPClassifier(alpha=1, max_iter=150)
nn_adasyn_classifier.fit(X_train_adasyn, y_train_adasyn)

# make predictions
y_adasyn_predict = nn_adasyn_classifier.predict(x_test_encoded)

# training set performance
nn_adasyn_f1_score = f1_score(y_test, y_adasyn_predict, average='weighted')
nn_adasyn_matt_corr = matthews_corrcoef(y_test, y_adasyn_predict)
nn_adasyn_precision = precision_score(y_test, y_adasyn_predict, average='weighted' , zero_division=1)
nn_adasyn_recall = recall_score(y_test, y_adasyn_predict, average='weighted', zero_division=1)

print('===================================')
print('ADASYN Performance Result')
print('Acc F1:', nn_adasyn_f1_score)
print('Acc Matt_Corr:', nn_adasyn_matt_corr)
print('Acc Precision:', nn_adasyn_precision)
print('Acc Recall:', nn_adasyn_recall)



SMOTE Performance Result
Acc F1: 2.968461737692507e-05
Acc Matt_Corr: 0.0
Acc Precision: 0.9961548996
Acc Recall: 0.00386
ADASYN Performance Result
Acc F1: 0.9940963592350397
Acc Matt_Corr: 0.01613122727241249
Acc Precision: 0.9925353298552494
Acc Recall: 0.995832


In [13]:
#Decision Tree Classifier

#==============================================
#train with Unbalanced data
#==============================================

# Finding the optimal max_depth using cross-validation on the training data
depths = range(1, 10)
# cv_scores_unblncd = [cross_val_score(DecisionTreeClassifier(max_depth=d), x_train_encoded, y_train, cv=5, scoring='f1_weighted').mean() for d in depths]
# optimal_depth = depths[np.argmax(cv_scores_unblncd)]

# print(f"Optimal max_depth: {optimal_depth}")

# # Train the decision tree classifier with the optimal max_depth
# dt_classifier = DecisionTreeClassifier(max_depth=optimal_depth)
# dt_classifier.fit(x_train_encoded, y_train)


# # dt_classifier = DecisionTreeClassifier(max_depth=10)
# # dt_classifier.fit(x_train_encoded, y_train)

# # make predictions
# y_unblncd_predict = dt_classifier.predict(x_test_encoded)

# # training set performance
# dt_unblncd_f1_score = f1_score(y_test, y_unblncd_predict, average='weighted')
# dt_unblncd_matt_corr = matthews_corrcoef(y_test, y_unblncd_predict)
# dt_unblncd_precision = precision_score(y_test, y_unblncd_predict)
# dt_unblncd_recall = recall_score(y_test, y_unblncd_predict)

# #rf_score = nn_classifier(y_test_encoded, y_test)

# # result
# print('Unbalanced Performance Result')
# print('Acc F1:', dt_unblncd_f1_score)
# print('Acc Matt_Corr:', dt_unblncd_matt_corr)
# print('Acc Precision:', dt_unblncd_precision)
# print('Acc Recall:', dt_unblncd_recall)

#==============================================
#train with SMOTE
#==============================================

cv_scores_smote = [cross_val_score(DecisionTreeClassifier(max_depth=d), X_train_smote, y_train_smote, cv=5, scoring='f1_weighted').mean() for d in depths]
smote_optimal_depth = depths[np.argmax(cv_scores_smote)]
print(f"Optimal max_depth: {smote_optimal_depth}")

dt_smote_classifier = DecisionTreeClassifier(max_depth=smote_optimal_depth)
dt_smote_classifier.fit(X_train_smote, y_train_smote)

# make predictions
y_smote_predict = dt_smote_classifier.predict(x_test_encoded)

# testing set performance
dt_smote_f1_score = f1_score(y_test, y_smote_predict, average='weighted')
dt_smote_matt_corr = matthews_corrcoef(y_test, y_smote_predict)
dt_smote_precision = precision_score(y_test, y_smote_predict)
dt_smote_recall = recall_score(y_test, y_smote_predict)

print('===================================')
# result
print('SMOTE Performance Result')
print('Acc F1:', dt_smote_f1_score)
print('Acc Matt_Corr:', dt_smote_matt_corr)
print('Acc Precision:', dt_smote_precision)
print('Acc Recall:', dt_smote_recall)

#==============================================
#train with ADASYN
#==============================================

cv_scores_adasyn = [cross_val_score(DecisionTreeClassifier(max_depth=d), X_train_adasyn, y_train_adasyn, cv=5, scoring='f1_weighted').mean() for d in depths]
adasyn_optimal_depth = depths[np.argmax(cv_scores_adasyn)]
print(f"Optimal max_depth: {adasyn_optimal_depth}")

dt_adasyn_classifier = DecisionTreeClassifier(max_depth=adasyn_optimal_depth)
dt_adasyn_classifier.fit(X_train_adasyn, y_train_adasyn)

# make predictions
y_adasyn_predict = dt_adasyn_classifier.predict(x_test_encoded)

# testing set performance
dt_adasyn_f1_score = f1_score(y_test, y_adasyn_predict, average='weighted')
dt_adasyn_matt_corr = matthews_corrcoef(y_test, y_adasyn_predict)
dt_adasyn_precision = precision_score(y_test, y_adasyn_predict)
dt_adasyn_recall = recall_score(y_test, y_adasyn_predict)


print('================================')
# result
print('ADASYN Performance Result')
print('Acc F1:', dt_adasyn_f1_score)
print('Acc Matt_Corr:', dt_adasyn_matt_corr)
print('Acc Precision:', dt_adasyn_precision)
print('Acc Recall:', dt_adasyn_recall)



Optimal max_depth: 9
SMOTE Performance Result
Acc F1: 0.9762836890958634
Acc Matt_Corr: 0.250795711235728
Acc Precision: 0.07742659435197306
Acc Recall: 0.8580310880829015
Optimal max_depth: 9
ADASYN Performance Result
Acc F1: 0.9733793932243582
Acc Matt_Corr: 0.23555616882769
Acc Precision: 0.06873963515754561
Acc Recall: 0.8590673575129534


In [14]:

#LightGBM Classifier

# Define the parameter grid

#==============================================
#train with Unbalanced data
#==============================================
# lgb_classifier = lgb.LGBMClassifier()
# lgb_classifier.fit(x_train_encoded, y_train)

# # make predictions
# y_unblncd_predict = lgb_classifier.predict(x_test_encoded)

# # training set performance
# lgb_unblncd_f1_score = f1_score(y_test, y_unblncd_predict, average='weighted')
# lgb_unblncd_matt_corr = matthews_corrcoef(y_test, y_unblncd_predict)
# lgb_unblncd_precision = precision_score(y_test, y_unblncd_predict, average='weighted')
# lgb_unblncd_recall = recall_score(y_test, y_unblncd_predict, average='weighted')

# print('Unbalanced Performance Result')
# print('Acc F1:', lgb_unblncd_f1_score)
# print('Acc Matt_Corr:', lgb_unblncd_matt_corr)
# print('Acc Precision:', lgb_unblncd_precision)
# print('Acc Recall:', lgb_unblncd_recall)

#==============================================
# train with ADASYN
#==============================================

lgb_adasyn_classifier = lgb.LGBMClassifier()
lgb_adasyn_classifier.fit(X_train_adasyn, y_train_adasyn)

# make predictions
y_adasyn_predict = lgb_adasyn_classifier.predict(x_test_encoded)

# testing set performance
lgb_adasyn_f1_score = f1_score(y_test, y_adasyn_predict, average='weighted')
lgb_adasyn_matt_corr = matthews_corrcoef(y_test, y_adasyn_predict)
lgb_adasyn_precision = precision_score(y_test, y_adasyn_predict, average='weighted')
lgb_adasyn_recall = recall_score(y_test, y_adasyn_predict, average='weighted')

# result

print('===================================')
# result
print('ADASYN Performance Result')
print('Acc F1:', lgb_adasyn_f1_score)
print('Acc Matt_Corr:', lgb_adasyn_matt_corr)
print('Acc Precision:', lgb_adasyn_precision)
print('Acc Recall:', lgb_adasyn_recall)

#==============================================
# train with SMOTE
#==============================================

lgb_classifier_smote = lgb.LGBMClassifier()

# Predict and evaluate
lgb_classifier_smote.fit(X_train_smote, y_train_smote)

y_smote_predict = lgb_classifier_smote.predict(x_test_encoded)

lgb_smote_f1_score = f1_score(y_test, y_smote_predict, average='weighted')
lgb_smote_matt_corr = matthews_corrcoef(y_test, y_smote_predict)
lgb_smote_precision = precision_score(y_test, y_smote_predict, average='weighted')
lgb_smote_recall = recall_score(y_test, y_smote_predict, average='weighted')

print('===================================')
print('SMOTE Performance Result')
print('Acc F1:', lgb_smote_f1_score)
print('Acc Matt_Corr:', lgb_smote_matt_corr)
print('Acc Precision:', lgb_smote_precision)
print('Acc Recall:', lgb_smote_recall)



[LightGBM] [Info] Number of positive: 298470, number of negative: 298256
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4122
[LightGBM] [Info] Number of data points in the train set: 596726, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500179 -> initscore=0.000717
[LightGBM] [Info] Start training from score 0.000717
ADASYN Performance Result
Acc F1: 0.9785203054996374
Acc Matt_Corr: 0.257306815674534
Acc Precision: 0.995792071324434
Acc Recall: 0.964216
[LightGBM] [Info] Number of positive: 298256, number of negative: 298256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.120763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4117
[LightGBM] [Info] Number of data

In [15]:
#==============================================
#train with Unbalanced data
#==============================================

#XGBOOST Classifier
# xgb_classifier = XGBClassifier()
# xgb_classifier.fit(x_train_encoded, y_train)

# # make predictions
# y_unblncd_predict = xgb_classifier.predict(x_test_encoded)

# # training set performance
# xgb_unblncd_f1_score = f1_score(y_test, y_unblncd_predict, average='weighted')
# xgb_unblncd_matt_corr = matthews_corrcoef(y_test, y_unblncd_predict)
# xgb_unblncd_precision = precision_score(y_test, y_unblncd_predict, average='weighted')
# xgb_unblncd_recall = recall_score(y_test, y_unblncd_predict, average='weighted')

# # result
# print('Unbalanced Performance Result')
# print('Acc F1:', xgb_unblncd_f1_score)
# print('Acc Matt_Corr:', xgb_unblncd_matt_corr)
# print('Acc Precision:', xgb_unblncd_precision)
# print('Acc Recall:', xgb_unblncd_recall)

#==============================================
# train with SMOTE
#==============================================
xgb_classifier_smote = XGBClassifier()
xgb_classifier_smote.fit(X_train_smote, y_train_smote)
# make predictions
y_smote_predict = xgb_classifier_smote.predict(x_test_encoded)

# training set performance
xgb_smote_f1_score = f1_score(y_test, y_smote_predict, average='weighted')
xgb_smote_matt_corr = matthews_corrcoef(y_test, y_smote_predict)
xgb_smote_precision = precision_score(y_test, y_smote_predict, average='weighted')
xgb_smote_recall = recall_score(y_test, y_smote_predict, average='weighted')


print('===================================')
# result
print('SMOTE Performance Result')
print('Acc F1:', xgb_smote_f1_score)
print('Acc Matt_Corr:', xgb_smote_matt_corr)
print('Acc Precision:', xgb_smote_precision)
print('Acc Recall:', xgb_smote_recall)
#==============================================
# train with ADASYN
#==============================================

xgb_adasyn_classifier = XGBClassifier()
xgb_adasyn_classifier.fit(X_train_adasyn, y_train_adasyn)

# make predictions
y_adasyn_predict = xgb_adasyn_classifier.predict(x_test_encoded)

# testing set performance
xgb_adasyn_f1_score = f1_score(y_test, y_adasyn_predict, average='weighted')
xgb_adasyn_matt_corr = matthews_corrcoef(y_test, y_adasyn_predict)
xgb_adasyn_precision = precision_score(y_test, y_adasyn_predict, average='weighted')
xgb_adasyn_recall = recall_score(y_test, y_adasyn_predict, average='weighted')


print('===================================')
print('ADASYN Performance Result')
print('Acc F1:', xgb_adasyn_f1_score)
print('Acc Matt_Corr:', xgb_adasyn_matt_corr)
print('Acc Precision:', xgb_adasyn_precision)
print('Acc Recall:', xgb_adasyn_recall)



SMOTE Performance Result
Acc F1: 0.9725803973143518
Acc Matt_Corr: 0.20207142005790452
Acc Precision: 0.9953505133574156
Acc Recall: 0.953116
ADASYN Performance Result
Acc F1: 0.9694643078123741
Acc Matt_Corr: 0.19073934135051643
Acc Precision: 0.9953331284281479
Acc Recall: 0.947264


In [19]:
# esemble parformance
estimator_list = [
    ('dt', dt_smote_classifier),
    ('rf', rf_smote),
    ('nlp', nn_adasyn_classifier),
    ('lgb', lgb_classifier_smote),
    ('xgb', xgb_classifier_smote)
]

# Build stack model
stack = StackingClassifier( estimators =estimator_list, final_estimator=LogisticRegression())

# Train stacked model
stack.fit(X_train_smote, y_train_smote)

# make predictions
y_predict = stack.predict(x_test_encoded)

# testing set performance
stack_f1_score = f1_score(y_test, y_predict, average='weighted')
stack_matt_corr = matthews_corrcoef(y_test, y_predict)
stack_precision = precision_score(y_test, y_predict, average='weighted')
stack_recall = recall_score(y_test, y_predict, average='weighted')

# result
print('===================================')
print('Testing Performance Result')
print('Acc F1:', stack_f1_score)
print('Acc Matt_Corr:', stack_matt_corr)
print('Acc Precision:', stack_precision)
print('Acc Recall:', stack_recall)



[LightGBM] [Info] Number of positive: 298256, number of negative: 298256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.303783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4117
[LightGBM] [Info] Number of data points in the train set: 596512, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 238605, number of negative: 238604
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.174807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4118
[LightGBM] [Info] Number of data points in the train set: 477209, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initscore=0.000004
[LightGBM] [Info] Start training from score 0.000004
[LightGBM] [Info] Number of positive: 238604, number of negativ

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
