In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.utils import publish_model_scores
from src.datasampling import dataSampling
from src.bayesianopt import bayesianOpt
from sklearn.tree import DecisionTreeClassifier

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("creditcard.csv")
print(df.head(n=10))

# Plot the histogrm
df.hist(figsize = (20, 20))
plt.show()

# Comment : features are scaled so need not to do any standard transformation for preprocessing

# Print the correlation of data
correlation_matrix = df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

#Comment : There is no significant correlation amoung the data so no need for reducing highly correlated data

FileNotFoundError: [Errno 2] No such file or directory: 'creditcard.csv'

# 1 Evaludate Desicion Tree Classifier without sampling techniques

# 1.1 Get the data

In [8]:
# Make a data source object using dataSampling class
ds = dataSampling(df,"Class",0.30,42,True,True)

In [9]:
# Make in-smaple and out sample dataset without sampling
X_train,X_test,y_train,y_test = ds.get_data_wthout_sample()
ds.print_class_percentage(X_train,X_test,y_train,y_test)

----Train class count------
Class
0    199020
1       344
Name: count, dtype: int64
Class
0    0.998275
1    0.001725
Name: count, dtype: float64
----Test class count------
Class
0    85295
1      148
Name: count, dtype: int64
Class
0    0.998268
1    0.001732
Name: count, dtype: float64


# 1.2 Fit the Descion Tree Classifier and optimize Hyperparameters using Bayesian Optimization

In [10]:
# Specified the parameters
param_grid = { 'criterion':['gini','entropy'],
              'max_depth': np.arange(1,15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_split": range(2,21),
              "class_weight": [{0:1, 1:1}, {0:1, 1:5}, {0:1, 1:50},'balanced'],
              "cv":[2],"scoring":["f1_macro"],"n_jobs":[-1]}

# Config dict for Bayesian Optimizer
conf_dict={"num_iteration":100}

# Fit and Optimize the Baysian model
dt_opt = bayesianOpt(DecisionTreeClassifier(random_state = 0))
best_model = dt_opt.optimize_fit(X_train,y_train,param_grid,conf_dict)

Best score: 0.9017272087216697: 100%|██████████| 100/100 [05:23<00:00,  3.24s/it]


# 1.3 See the performance on unseen data

In [11]:
dt_opt.optimize_results["best_params"]

{'class_weight': {0: 1, 1: 1},
 'criterion': 'gini',
 'cv': 2,
 'max_depth': 8,
 'max_features': 'sqrt',
 'min_samples_split': 12,
 'n_jobs': -1,
 'scoring': 'f1_macro'}

In [12]:
print("------Training model Results------")
publish_model_scores(X_train,y_train,best_model)
print("------Testing model Result----------")
publish_model_scores(X_test,y_test,best_model)

------Training model Results------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       0.94      0.79      0.86       344

    accuracy                           1.00    199364
   macro avg       0.97      0.89      0.93    199364
weighted avg       1.00      1.00      1.00    199364

ROC_AUC Score 0.9493892310694496
------Testing model Result----------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.86      0.73      0.79       148

    accuracy                           1.00     85443
   macro avg       0.93      0.86      0.90     85443
weighted avg       1.00      1.00      1.00     85443

ROC_AUC Score 0.9222415686100545


# 2 Evaluate using under sampling method

# 2.1 Get the data

In [13]:
X_nm_train,X_test,y_nm_train,y_test= ds.get_data_near_miss()
ds.print_class_percentage(X_nm_train,X_test,y_nm_train,y_test)

----Train class count------
Class
0    3440
1     344
Name: count, dtype: int64
Class
0    0.909091
1    0.090909
Name: count, dtype: float64
----Test class count------
Class
0    85295
1      148
Name: count, dtype: int64
Class
0    0.998268
1    0.001732
Name: count, dtype: float64


# 2.2 Fit the Descion Tree Classifier and optimize Hyperparameters using Bayesian Optimization

In [21]:
# Specified the parameters
param_grid = { 'criterion':['gini','entropy'],
              'max_depth': np.arange(1,15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_split": range(2,21),
              "class_weight": [{0:1, 1:1}, {0:1, 1:5}, {0:1, 1:50},'balanced'],
              "cv":[2],"scoring":["f1_macro"],"n_jobs":[-1]}

# Config dict for Bayesian Optimizer
conf_dict={"num_iteration":100}

# Fit and Optimize the Baysian model
dt_nm_opt = bayesianOpt(DecisionTreeClassifier(random_state = 0))
best_nm_model = dt_nm_opt.optimize_fit(X_nm_train,y_nm_train,param_grid,conf_dict)

Best score: 0.9557682229535889: 100%|██████████| 100/100 [00:14<00:00,  6.93it/s]


# 2.3 See the performance on unseen data

In [22]:
print("------Training model Results------")
publish_model_scores(X_nm_train,y_nm_train,best_nm_model)
print("------Testing model Result----------")
publish_model_scores(X_test,y_test,best_nm_model)

------Training model Results------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3440
           1       1.00      0.96      0.98       344

    accuracy                           1.00      3784
   macro avg       1.00      0.98      0.99      3784
weighted avg       1.00      1.00      1.00      3784

ROC_AUC Score 0.9965141630611141
------Testing model Result----------
              precision    recall  f1-score   support

           0       1.00      0.83      0.91     85295
           1       0.01      0.86      0.02       148

    accuracy                           0.83     85443
   macro avg       0.50      0.85      0.46     85443
weighted avg       1.00      0.83      0.91     85443

ROC_AUC Score 0.8637362698298274


# 3. Evaluate using under sampling method
# 3.1 Get the data sets using near miss sampling technique

In [23]:
X_sm_train,X_test,y_sm_train,y_test= ds.get_data_smote()
ds.print_class_percentage(X_sm_train,X_test,y_sm_train,y_test)

----Train class count------
Class
0    199020
1    199020
Name: count, dtype: int64
Class
0    0.5
1    0.5
Name: count, dtype: float64
----Test class count------
Class
0    85295
1      148
Name: count, dtype: int64
Class
0    0.998268
1    0.001732
Name: count, dtype: float64


# 3.2 Fit the data and tune the hyperparametres using Bayesian Optimization

In [24]:
# Specified the parameters
param_grid = { 'criterion':['gini','entropy'],
              'max_depth': np.arange(1,15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_split": range(2,21),
              "class_weight": [{0:1, 1:1}, {0:1, 1:5}, {0:1, 1:50},'balanced'],
              "cv":[2],"scoring":["f1_macro"],"n_jobs":[-1]}

# Config dict for Bayesian Optimizer
conf_dict={"num_iteration":100}

# Fit and Optimize the Baysian model
dt_sm_opt = bayesianOpt(DecisionTreeClassifier(random_state = 0))
best_sm_model = dt_sm_opt.optimize_fit(X_sm_train,y_sm_train,param_grid,conf_dict)

Best score: 0.9966309793545538: 100%|██████████| 100/100 [14:00<00:00,  8.40s/it]


# 3.3 Evaluate the Performance

In [25]:
print("------Training model Results------")
publish_model_scores(X_sm_train,y_sm_train,best_sm_model)
print("------Testing model Result----------")
publish_model_scores(X_test,y_test,best_sm_model)

------Training model Results------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       1.00      1.00      1.00    199020

    accuracy                           1.00    398040
   macro avg       1.00      1.00      1.00    398040
weighted avg       1.00      1.00      1.00    398040

ROC_AUC Score 0.9999650037520298
------Testing model Result----------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85295
           1       0.19      0.80      0.31       148

    accuracy                           0.99     85443
   macro avg       0.59      0.90      0.65     85443
weighted avg       1.00      0.99      1.00     85443

ROC_AUC Score 0.9007358800854903


# 4 Evaluate Performance using Random Sampling Methods
# 4.1 Get the dataset using ramdom sampling method

In [26]:
X_rn_train,X_test,y_rn_train,y_test= ds.get_data_random_over_sample()
ds.print_class_percentage(X_rn_train,X_test,y_rn_train,y_test)

----Train class count------
Class
0    199020
1    199020
Name: count, dtype: int64
Class
0    0.5
1    0.5
Name: count, dtype: float64
----Test class count------
Class
0    85295
1      148
Name: count, dtype: int64
Class
0    0.998268
1    0.001732
Name: count, dtype: float64


# 4.2 Fit the data and tune the hyperparametres using Bayesian Optimization

In [28]:
# Specified the parameters
param_grid = { 'criterion':['gini','entropy'],
              'max_depth': np.arange(1,15),
              "max_features": ['sqrt', 'log2'],
              "min_samples_split": range(2,21),
              "class_weight": [{0:1, 1:1}, {0:1, 1:5}, {0:1, 1:50},'balanced'],
              "cv":[2],"scoring":["f1_macro"],"n_jobs":[-1]}

# Config dict for Bayesian Optimizer
conf_dict={"num_iteration":100}

# Fit and Optimize the Baysian model
dt_rn_opt = bayesianOpt(DecisionTreeClassifier(random_state = 0))
best_rn_model = dt_rn_opt.optimize_fit(X_sm_train,y_sm_train,param_grid,conf_dict)

Best score: 0.9959526595296999: 100%|██████████| 100/100 [12:26<00:00,  7.46s/it]


# 4.3 Evaluate the Performance on unseen data

In [29]:
print("------Training model Results------")
publish_model_scores(X_rn_train,y_rn_train,best_rn_model)
print("------Testing model Result----------")
publish_model_scores(X_test,y_test,best_rn_model)

------Training model Results------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    199020
           1       1.00      0.99      0.99    199020

    accuracy                           0.99    398040
   macro avg       0.99      0.99      0.99    398040
weighted avg       0.99      0.99      0.99    398040

ROC_AUC Score 0.9987757750895173
------Testing model Result----------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85295
           1       0.20      0.79      0.31       148

    accuracy                           0.99     85443
   macro avg       0.60      0.89      0.66     85443
weighted avg       1.00      0.99      1.00     85443

ROC_AUC Score 0.8716882821622256
