In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from matplotlib import pyplot as plt

from sklearn import model_selection
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV

In [7]:
import random

random.seed(2022)

In [8]:
X_train = pd.read_csv('../datasets/word2vec/train_data_imputed_FINAL.csv')
X_test = pd.read_csv('../datasets/word2vec/test_data_imputed_FINAL.csv')
y_train = pd.read_csv('../datasets/word2vec/y_train_FINAL.csv').iloc[:,1:]
y_test = pd.read_csv('../datasets/word2vec/y_test_FINAL.csv').iloc[:,1:]

In [12]:
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train: (11516, 263)
y_train: (11516, 1)
X_test: (2880, 263)
y_test: (2880, 1)


In [10]:
def SMOTE_ENN(X_train, y_train, n_neighbours, k_neighbours, sampling_strategy):
    # SMOTE ENN oversampling
    smote_only = SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors = k_neighbours)
    ennObj = EditedNearestNeighbours(n_neighbors=n_neighbours)
    smote_enn = SMOTEENN(random_state=42, smote=smote_only , enn= ennObj)
    X_res_smoteENN, y_res_smoteENN = smote_enn.fit_resample(X_train, y_train)
    return X_res_smoteENN, y_res_smoteENN

X_smoteENN, y_smoteENN = SMOTE_ENN(X_train, y_train, n_neighbours = 4, k_neighbours = 6, sampling_strategy = 0.1)

In [11]:
print(X_smoteENN.shape)
print(y_smoteENN.shape)
print(X_test.shape)
print(y_test.shape)

(12097, 263)
(12097, 1)
(2880, 263)
(2880, 1)


In [None]:
def print_metrics(y_true, y_pred, y_pred_prob):
    print("Confusion Matrix")
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    
    print("F1 Score:", f1_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred)) 
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_pred_prob))

Default Decision Tree

In [None]:
base_dt = DecisionTreeClassifier()
base_dt.fit(X_smoteENN, y_smoteENN)
dt_pred_train = base_dt.predict(X_smoteENN)
dt_pred_prob_train = base_dt.predict_proba(X_smoteENN)[:,1]
print_metrics(y_smoteENN, dt_pred_train, dt_pred_prob_train)

Confusion Matrix
[[10968     0]
 [    0  1129]]
F1 Score: 1.0
Precision: 1.0
Recall: 1.0
Accuracy: 1.0
ROC AUC: 1.0


In [None]:
dt_pred_test = base_dt.predict(X_test)
dt_pred_prob_test = tuned_dt_estimator.predict_proba(X_test)[:, 1]
print_metrics(y_test, dt_pred_test, dt_pred_prob_test)

Confusion Matrix
[[2772   52]
 [  29   27]]
F1 Score: 0.39999999999999997
Precision: 0.34177215189873417
Recall: 0.48214285714285715
Accuracy: 0.971875
ROC AUC: 0.75884004451639


# Tuning Round 1
Best test F1 score: 0.234

In [None]:
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = np.arange(100, 400, 20).tolist()
# Minimum number of samples required to split a node
min_samples_split = np.arange(3, 200, 10).tolist()
# Minimum number of samples required at each leaf node
min_samples_leaf = np.arange(3, 100, 5).tolist()
# Complexity parameter used for Minimal Cost-Complexity Pruning
ccp_alpha = np.arange(0, 0.3, 0.01).tolist()

random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'ccp_alpha': ccp_alpha,
               'random_state': [2022]}

In [None]:
dt_estimator = DecisionTreeClassifier()
dt_random = RandomizedSearchCV(estimator = dt_estimator, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=2022, n_jobs = -1, scoring='f1', refit=True)
dt_random.fit(X_smoteENN, y_smoteENN)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0.0, 0.01, 0.02, 0.03,
                                                      0.04, 0.05, 0.06, 0.07,
                                                      0.08, 0.09, 0.1, 0.11,
                                                      0.12, 0.13, 0.14, 0.15,
                                                      0.16, 0.17, 0.18, 0.19,
                                                      0.2, 0.21, 0.22, 0.23,
                                                      0.24, 0.25, 0.26, 0.27,
                                                      0.28, 0.29],
                                        'max_depth': [100, 120, 140, 160, 180,
                                                      200, 220, 240, 260, 280,
                                                      300, 320, 340, 360, 380],
                                        'max_fea

In [None]:
dt_random.best_params_

{'random_state': 2022,
 'min_samples_split': 133,
 'min_samples_leaf': 23,
 'max_features': 'sqrt',
 'max_depth': 300,
 'ccp_alpha': 0.0}

In [None]:
tuned_dt_estimator = dt_random.best_estimator_

In [None]:
dt_pred_train = tuned_dt_estimator.predict(X_smoteENN)
dt_pred_prob_train = tuned_dt_estimator.predict_proba(X_smoteENN)[:,1]
print_metrics(y_smoteENN, dt_pred_train, dt_pred_prob_train)

Confusion Matrix
[[10795   173]
 [  479   650]]
F1 Score: 0.6659836065573771
Precision: 0.7897934386391251
Recall: 0.5757307351638619
Accuracy: 0.9461023394229975
ROC AUC: 0.9710019210406117


In [None]:
dt_pred_test = tuned_dt_estimator.predict(X_test)
dt_pred_prob_test = tuned_dt_estimator.predict_proba(X_test)[:, 1]
print_metrics(y_test, dt_pred_test, dt_pred_prob_test)

Confusion Matrix
[[2767   57]
 [  41   15]]
F1 Score: 0.23437500000000003
Precision: 0.20833333333333334
Recall: 0.26785714285714285
Accuracy: 0.9659722222222222
ROC AUC: 0.7838046337515175


# Tuning Round 2

Best test F1 score: 0.189

In [None]:
# Maximum number of levels in tree
max_depth = np.arange(100, 600, 20).tolist()
# Minimum number of samples required to split a node
min_samples_split = np.arange(3, 200, 10).tolist()
# Minimum number of samples required at each leaf node
min_samples_leaf = np.arange(3, 100, 5).tolist()
# Complexity parameter used for Minimal Cost-Complexity Pruning
ccp_alpha = np.arange(0.01, 0.3, 0.01).tolist()

random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'ccp_alpha': ccp_alpha,
               'random_state': [2022]}

In [None]:
dt_estimator = DecisionTreeClassifier()
dt_random = RandomizedSearchCV(estimator = dt_estimator, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=2022, n_jobs = -1, scoring='f1', refit=True)
dt_random.fit(X_smoteENN, y_smoteENN)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0.01, 0.02, 0.03, 0.04,
                                                      0.05,
                                                      0.060000000000000005,
                                                      0.06999999999999999, 0.08,
                                                      0.09, 0.09999999999999999,
                                                      0.11, 0.12, 0.13, 0.14,
                                                      0.15000000000000002, 0.16,
                                                      0.17, 0.18000000000000002,
                                                      0.19, 0.2,
                                                      0.21000000000000002, 0.22,
                                                      0.23, 0.24000000000000002,
                                                     

In [None]:
dt_random.best_params_

{'random_state': 2022,
 'min_samples_split': 133,
 'min_samples_leaf': 23,
 'max_depth': 520,
 'ccp_alpha': 0.01}

In [None]:
tuned_dt_estimator = dt_random.best_estimator_

In [None]:
dt_pred_train = tuned_dt_estimator.predict(X_smoteENN)
dt_pred_prob_train = tuned_dt_estimator.predict_proba(X_smoteENN)[:,1]
print_metrics(y_smoteENN, dt_pred_train, dt_pred_prob_train)

Confusion Matrix
[[10840   128]
 [  787   342]]
F1 Score: 0.4277673545966229
Precision: 0.7276595744680852
Recall: 0.3029229406554473
Accuracy: 0.9243614119203108
ROC AUC: 0.648560447043303


In [None]:
dt_pred_test = tuned_dt_estimator.predict(X_test)
dt_pred_prob_test = tuned_dt_estimator.predict_proba(X_test)[:, 1]
print_metrics(y_test, dt_pred_test, dt_pred_prob_test)

Confusion Matrix
[[2794   30]
 [  47    9]]
F1 Score: 0.18947368421052632
Precision: 0.23076923076923078
Recall: 0.16071428571428573
Accuracy: 0.9732638888888889
ROC AUC: 0.5566509004451639


# Tuning Round 3

Best test F1 score: 0.261

In [None]:
# Maximum number of levels in tree
max_depth = np.arange(100, 600, 20).tolist()
# Minimum number of samples required to split a node
min_samples_split = np.arange(3, 200, 10).tolist()
# Complexity parameter used for Minimal Cost-Complexity Pruning
ccp_alpha = np.arange(0.001, 0.3, 0.01).tolist()

random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'ccp_alpha': ccp_alpha,
               'random_state': [2022]}

In [None]:
dt_estimator = DecisionTreeClassifier()
dt_random = RandomizedSearchCV(estimator = dt_estimator, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=2022, n_jobs = -1, scoring='f1', refit=True)
dt_random.fit(X_smoteENN, y_smoteENN)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0.001, 0.011,
                                                      0.020999999999999998,
                                                      0.030999999999999996,
                                                      0.040999999999999995,
                                                      0.05099999999999999,
                                                      0.06099999999999999,
                                                      0.071,
                                                      0.08099999999999999,
                                                      0.09099999999999998,
                                                      0.10099999999999998,
                                                      0.11099999999999999,
                                                      0.12099999999999998,
              

In [None]:
dt_random.best_params_

{'random_state': 2022,
 'min_samples_split': 43,
 'max_depth': 120,
 'ccp_alpha': 0.001}

In [None]:
tuned_dt_estimator = dt_random.best_estimator_

In [None]:
dt_pred_train = tuned_dt_estimator.predict(X_smoteENN)
dt_pred_prob_train = tuned_dt_estimator.predict_proba(X_smoteENN)[:,1]
print_metrics(y_smoteENN, dt_pred_train, dt_pred_prob_train)

Confusion Matrix
[[10925    43]
 [  204   925]]
F1 Score: 0.8822126847877921
Precision: 0.9555785123966942
Recall: 0.8193091231178034
Accuracy: 0.9795817144746631
ROC AUC: 0.9608128469712035


In [None]:
dt_pred_test = tuned_dt_estimator.predict(X_test)
dt_pred_prob_test = tuned_dt_estimator.predict_proba(X_test)[:, 1]
print_metrics(y_test, dt_pred_test, dt_pred_prob_test)

Confusion Matrix
[[2780   44]
 [  41   15]]
F1 Score: 0.2608695652173913
Precision: 0.2542372881355932
Recall: 0.26785714285714285
Accuracy: 0.9704861111111112
ROC AUC: 0.7359147359368677


# Tuning Round 3

Best test F1 score: 0.441

In [None]:
# Maximum number of levels in tree
max_depth = np.arange(100, 600, 20).tolist()
# Minimum number of samples required to split a node
min_samples_split = np.arange(3, 200, 10).tolist()
# Complexity parameter used for Minimal Cost-Complexity Pruning

random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'random_state': [2022]}

In [None]:
dt_estimator = DecisionTreeClassifier()
dt_random = RandomizedSearchCV(estimator = dt_estimator, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=2022, n_jobs = -1, scoring='f1', refit=True)
dt_random.fit(X_smoteENN, y_smoteENN)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [100, 120, 140, 160, 180,
                                                      200, 220, 240, 260, 280,
                                                      300, 320, 340, 360, 380,
                                                      400, 420, 440, 460, 480,
                                                      500, 520, 540, 560, 580],
                                        'min_samples_split': [3, 13, 23, 33, 43,
                                                              53, 63, 73, 83,
                                                              93, 103, 113, 123,
                                                              133, 143, 153,
                                                              163, 173, 183,
                                                              193],
                                      

In [None]:
dt_random.best_params_

{'random_state': 2022, 'min_samples_split': 3, 'max_depth': 220}

In [None]:
tuned_dt_estimator = dt_random.best_estimator_

In [None]:
dt_pred_train = tuned_dt_estimator.predict(X_smoteENN)
dt_pred_prob_train = tuned_dt_estimator.predict_proba(X_smoteENN)[:,1]
print_metrics(y_smoteENN, dt_pred_train, dt_pred_prob_train)

Confusion Matrix
[[10968     0]
 [    3  1126]]
F1 Score: 0.998669623059867
Precision: 1.0
Recall: 0.9973427812223207
Accuracy: 0.999752004629247
ROC AUC: 0.9999996365948062


In [None]:
dt_pred_test = tuned_dt_estimator.predict(X_test)
dt_pred_prob_test = tuned_dt_estimator.predict_proba(X_test)[:, 1]
print_metrics(y_test, dt_pred_test, dt_pred_prob_test)

Confusion Matrix
[[2774   50]
 [  26   30]]
F1 Score: 0.44117647058823534
Precision: 0.375
Recall: 0.5357142857142857
Accuracy: 0.9736111111111111
ROC AUC: 0.75884004451639
