In [56]:
import random
import itertools
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize, OneHotEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score

np.random.seed(0)
random.seed(0)

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

def one_hot_encode_categorical_features(all_features: pd.DataFrame) -> pd.DataFrame:
    is_categorical = lambda f: all_features[f].apply(lambda x: not isinstance(x, float) or x.is_integer()).all()
    categorical_cols = [f for f in all_features if is_categorical(f)]
    other_cols = [f for f in all_features if not is_categorical(f)]
    
    encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_matrix = all_features[categorical_cols].to_numpy().astype(int).astype(str)
    categorical_encoded_matrix = encoder.fit_transform(categorical_matrix).toarray()
    
    headers = []
    for base_name, categories in zip(categorical_cols, encoder.categories_):
        for c in categories:
            headers.append(base_name + "_" + c)
        # print(f"{base_name} has {len(categories)} categories")
    
    return pd.concat([all_features[other_cols], pd.DataFrame(data=categorical_encoded_matrix, columns=headers)], axis=1)

In [57]:
initial_dataset = pd.read_csv("biodegradable_a.csv").sample(frac=1).reset_index(drop=True)
total_len, _ = initial_dataset.shape

# NOTE - NO INDEPENDENT VALIDATION SET !!!


class_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 == 0).all()]
num_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 != 0).any()]

print(len(class_cols) + len(num_cols))
print(len(initial_dataset.drop("Biodegradable", axis=1).columns))






total_categorical_dataset = initial_dataset[class_cols]
total_categorical_dataset = total_categorical_dataset.fillna(-1)
total_categorical_dataset = total_categorical_dataset.astype(int).astype(object).astype(str)






total_numerical_dataset = initial_dataset[num_cols]
total_numerical_dataset = total_numerical_dataset.fillna(total_numerical_dataset.mean())

total_biodegradable = initial_dataset["Biodegradable"]



# Scale numerical data
# https://scikit-learn.org/stable/modules/preproce
#print(total_numerical_dataset)

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

# Commenting the following two lines will run the models without scaling. It allows the usage of certain Naive Bayes models, but ruins SVMs.
# As the NaiveBayes that don't work with the negative numbers that come from the scaling of data, we keep it for the ones that show better results
total_numerical_dataset = pd.DataFrame(scaler.fit_transform(total_numerical_dataset),
             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)


#total_numerical_dataset = pd.DataFrame(normalize(total_numerical_dataset, norm='l2', axis=1, copy=True, return_norm=False),
#             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)

#print(total_numerical_dataset)
#





total_dataset = pd.concat([total_categorical_dataset, total_numerical_dataset,total_biodegradable], axis=1)
print(total_dataset)

total_len, _ = total_dataset.shape
train_dataset_len = round(total_len * 0.75)

dataset_train = total_dataset[0:train_dataset_len]
dataset_test = total_dataset[train_dataset_len:total_len]

print(total_dataset.shape)


41
41
     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...       SpMax_A  \
0      0   0      0   2  2   0    0      0     0   0  ...  7.635193e-01   
1      1   0      0   0  3   0    0      0     0   0  ... -4.481867e-02   
2      0   0      0   2  2   0    0      0     0   0  ...  2.805893e-14   
3      0   0      0   2  1   0    0      0     0   0  ...  6.022004e-01   
4      0   0      0   0  0   0    0      0     0   0  ... -1.439928e+00   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...           ...   
4559   0   1      0   0  1   1    0      0     0   0  ...  2.805893e-14   
4560   0   0      0   0  3   0    0      0     0   0  ... -6.781979e-01   
4561   0   0      0   0  2   0    0      0     0   0  ...  1.261149e+00   
4562   0   0      0   0  1   0    0      0     0   0  ... -8.396889e-01   
4563   0   0      0   2  1   0    0      0     0   0  ...  2.624708e-01   

      Psi_i_1d       SdO     TI2_L      nCrt   SpMax_B       Psi_i_A  \
0    -0.036040  0.083

In [58]:
print(total_dataset.columns)

Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN',
       'nArCOOR', 'SpMax_L', 'J_Dz(e)', 'F01', 'C', 'nCp', 'SdssC', 'HyWi_B',
       'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi', 'SpPosA_B', 'nCIR', 'SpMax_A',
       'Psi_i_1d', 'SdO', 'TI2_L', 'nCrt', 'SpMax_B', 'Psi_i_A', 'SM6_B', 'nX',
       'Biodegradable'],
      dtype='object')


## Using Model with replaced values when NaN, and discarding the dropped NaN values dataset

In [59]:
X_train_total = dataset_train.drop(["Biodegradable"], axis=1)
y_train_total = dataset_train.Biodegradable
print(X_train_total)
print(y_train_total)

X_test_total = dataset_test.drop(["Biodegradable"], axis=1)
y_test_total = dataset_test.Biodegradable

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR  \
0      0   0      0   2  2   0    0      0     0   0  ...  0.117354   
1      1   0      0   0  3   0    0      0     0   0  ... -0.302031   
2      0   0      0   2  2   0    0      0     0   0  ...  0.536738   
3      0   0      0   2  1   0    0      0     0   0  ...  0.117354   
4      0   0      0   0  0   0    0      0     0   0  ... -0.302031   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...   
3418   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3419   0   0      0   2  2   0    0      0     0   0  ...  0.117354   
3420   0   0      0   0  1   0    0      0     0   0  ... -0.302031   
3421   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3422   0   0      0   0  2   0    0      0     0   0  ... -0.302031   

           SpMax_A  Psi_i_1d       SdO     TI2_L      nCrt   SpMax_B  \
0     7.635193e-01 -0.036040  0.083128 -0.584680 -0.100618  0.238243   
1  

## Testing Random Forests for Feature Selection

In [60]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(X_train_total, y_train_total)

In [61]:

print(sel.get_support())
selected_feat= X_train_total.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[ True  True  True  True False  True False False False False False False
 False False  True False False False  True False False False False False
 False False  True False False False  True False False False False False
 False  True False  True  True]
12
Index(['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'SpMax_L', 'SM6_L',
       'SpPosA_B', 'SpMax_B', 'SM6_B', 'nX'],
      dtype='object')


In [62]:
X_train_rf = X_train_total[X_train_total.columns[(sel.get_support())]]
X_test_rf = X_test_total[X_test_total.columns[(sel.get_support())]]

print(X_train_rf)

     nHM F04 NssssC nCb F03 F02_CN   SpMax_L     SM6_L  SpPosA_B   SpMax_B  \
0      0   0      0   2   0      0  0.502761  0.262102  0.467673  0.238243   
1      1   0      0   0   0      4  1.064719  0.332217 -0.831613  0.000000   
2      0   0      0   2   0      0  0.262057  0.323590  1.037564  0.106006   
3      0   0      0   2   0      0  0.445841  0.220531  0.812120  0.000000   
4      0   0      0   0   0      0 -0.971360 -1.241510  0.585408 -0.639572   
...   ..  ..    ...  ..  ..    ...       ...       ...       ...       ...   
3418   0   0      0   0   0      1 -0.917662 -1.298313 -1.879649 -0.565504   
3419   0   0      0   2   0      0  0.479167  0.324590  0.349936  0.000000   
3420   0   0      0   0   0      0 -0.515278 -0.679934 -0.416422 -1.104080   
3421   0   0      0   0   0      0  0.285715  0.070415 -1.550311  0.000000   
3422   0   0      0   0   0      0  0.102410 -0.399898 -0.285076  0.000000   

         SM6_B            nX  
0     0.139808 -1.602862e-01  
1

## One Hot Encoding Categorical data

In [63]:
_ = pd.concat([X_train_rf,X_test_rf])
_ = one_hot_encode_categorical_features(_)
X_train_rf_ohe = _[0:train_dataset_len]
X_test_rf_ohe = _[train_dataset_len:total_len]

# Random Forest Classifier

In [64]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train_rf_ohe, y_train_total)

preds = rf_model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9641
The Precision is:  0.9845
The Recall is:  0.9735
The F1 score is:  0.9790
The Matthews correlation coefficient is:  0.8558

This is the Confusion Matrix
     0    1
0  145   15
1   26  955


#### Simple Decision Tree test

In [65]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

rf_model = DecisionTreeClassifier(max_depth=5)
rf_model.fit(X_train_rf_ohe, y_train_total)

preds = rf_model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9474
The Precision is:  0.9794
The Recall is:  0.9596
The F1 score is:  0.9694
The Matthews correlation coefficient is:  0.7852

This is the Confusion Matrix
     0    1
0  131   20
1   40  950


## Optimizing Random Forest Model for Classification

In [66]:
best_model = None
best_model_type = ""
best_model_hyperparams = None
best_fitness = -99999
tested_models = []
def rf_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #n_estimators = [100]
    #n_estimators = [110,120,130,150,200]
    #n_estimators = [110,115,120,125,130]
    n_estimators = [125,126,127,128,129,130]
    #criterion = ["gini"]
    criterion = ["gini","entropy","log_loss"]
    max_depth = [None]
    min_samples_split = [2]
    min_samples_leaf = [1]
    min_weight_fraction_leaf = [0]
    #max_features = ["sqrt"]
    max_features = ["sqrt","log2",None]
    max_leaf_nodes = [None]
    min_impurity_decrease = [0]
    #bootstrap = [True]
    bootstrap = [True,False]
    #oob_score = [False]
    oob_score = [False,True]
    n_jobs = [None]
    random_state = [None]
    verbose = [0]
    #warm_start = [False]
    warm_start = [False,True]
    class_weight = [None]
    ccp_alpha = [0]
    max_samples = [None]

    rf_hyperparams = itertools.product(
                                        n_estimators,
                                        criterion,
                                        max_depth,
                                        min_samples_split,
                                        min_samples_leaf,
                                        min_weight_fraction_leaf,
                                        max_features,
                                        max_leaf_nodes,
                                        min_impurity_decrease,
                                        bootstrap,
                                        oob_score,
                                        n_jobs,
                                        random_state,
                                        verbose,
                                        warm_start,
                                        class_weight,
                                        ccp_alpha,
                                        max_samples
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = hyper_param

        if oob_score and not bootstrap:
            continue

        model = RandomForestClassifier(
                        n_estimators=n_estimators,
                        criterion=criterion,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                        max_features=max_features,
                        max_leaf_nodes=max_leaf_nodes,
                        min_impurity_decrease=min_impurity_decrease,
                        bootstrap=bootstrap,
                        oob_score=oob_score,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        verbose=verbose,
                        warm_start=warm_start,
                        class_weight=class_weight,
                        ccp_alpha=ccp_alpha,
                        max_samples=max_samples
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = best_hyper_params

    best_hyper_params_dict = {
        "n_estimators": n_estimators,
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "bootstrap": bootstrap,
        "oob_score": oob_score,
        "n_jobs": n_jobs,
        "random_state": random_state,
        "verbose": verbose,
        "warm_start": warm_start,
        "class_weight": class_weight,
        "ccp_alpha": ccp_alpha,
        "max_samples": max_samples
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = rf_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("RandomForestClassifier")
if fitness > best_fitness:
    best_model = model
    best_model_type = "RandomForestClassifier"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

# Support Vector Machines

In [None]:
svc_model = SVC()
svc_model.fit(X_train_rf_ohe, y_train_total)

preds = svc_model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9553
The Precision is:  0.9907
The Recall is:  0.9581
The F1 score is:  0.9742
The Matthews correlation coefficient is:  0.8157

This is the Confusion Matrix
     0    1
0  129    9
1   42  961


## Optimizing Support Vector Machines for Optimization

In [None]:
def svc_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #C = [1]
    C = [0.5,1,4,6,8,12]
    #kernel = ["rbf"]
    #kernel = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
    kernel = ['linear', 'poly', 'rbf']
    degree = [3]
    gamma = ["scale"]
    coef0 = [0]
    #shrinking = [True]
    shrinking = [True,False]
    #probability = [False]
    probability = [False, True]
    tol = [0.001]
    cache_size = [200]
    class_weight = [None]
    verbose = [False]
    max_iter = [-1]
    #decision_function_shape = ["ovr"]
    decision_function_shape = ["ovr", 'ovo']
    break_ties = [False]
    random_state = [None]

    rf_hyperparams = itertools.product(
                                        C,\
                                        kernel,\
                                        degree,\
                                        gamma,\
                                        coef0,\
                                        shrinking,\
                                        probability,\
                                        tol,\
                                        cache_size,\
                                        class_weight,\
                                        verbose,\
                                        max_iter,\
                                        decision_function_shape,\
                                        break_ties,\
                                        random_state
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        C,\
            kernel,\
            degree,\
            gamma,\
            coef0,\
            shrinking,\
            probability,\
            tol,\
            cache_size,\
            class_weight,\
            verbose,\
            max_iter,\
            decision_function_shape,\
            break_ties,\
            random_state\
                = hyper_param

        model = SVC(
                        C=C,\
                        kernel=kernel,\
                        degree=degree,\
                        gamma=gamma,\
                        coef0=coef0,\
                        shrinking=shrinking,\
                        probability=probability,\
                        tol=tol,\
                        cache_size=cache_size,\
                        class_weight=class_weight,\
                        verbose=verbose,\
                        max_iter=max_iter,\
                        decision_function_shape=decision_function_shape,\
                        break_ties=break_ties,\
                        random_state=random_state
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    C,\
        kernel,\
        degree,\
        gamma,\
        coef0,\
        shrinking,\
        probability,\
        tol,\
        cache_size,\
        class_weight,\
        verbose,\
        max_iter,\
        decision_function_shape,\
        break_ties,\
        random_state\
                = best_hyper_params

    best_hyper_params_dict = {
        "C": C,
        "kernel": kernel,
        "degree": degree,
        "gamma": gamma,
        "coef0": coef0,
        "shrinking": shrinking,
        "probability": probability,
        "tol": probability,
        "cache_size": cache_size,
        "class_weight": class_weight,
        "verbose": verbose,
        "max_iter": max_iter,
        "decision_function_shape": decision_function_shape,
        "break_ties": break_ties,
        "random_state": random_state
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = svc_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("SVC")
if fitness > best_fitness:
    best_model = model
    best_model_type = "SVC"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, True, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, False, False, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, False, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, False, True, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8903659194114709---(0.5, 'linear', 3, 'scale', 0, False, True, 0.001, 200, None, False, -1, 'ovo', False, None)
0.9062429057641039---(0.5, 'poly', 3, 'scale', 0, True, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.9062429057641039---(0.5, 'poly', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1,

In [None]:
def lsvc_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    penalty = ["l2"]
    #loss=["squared_hinge"]
    loss=['hinge','squared_hinge']
    #dual=[True]
    dual=[True,False]
    tol=[0.0000001,0.0001,0.01,0.1,0.5,1]
    #C = [1]
    C = [0.5,1,4,6,8,12]
    #multi_class=["ovr"]
    multi_class=['ovr','crammer_singer']
    fit_intercept=[True]
    intercept_scaling=[1]
    class_weight=[None]
    verbose=[0]
    random_state=[None]
    max_iter=[1000,1500,2000,3000]

    rf_hyperparams = itertools.product(
                                        penalty,
                                        loss,
                                        dual,
                                        tol,
                                        C,
                                        multi_class,
                                        fit_intercept,
                                        intercept_scaling,
                                        class_weight,
                                        verbose,
                                        random_state,
                                        max_iter
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        penalty,\
        loss,\
        dual,\
        tol,\
        C,\
        multi_class,\
        fit_intercept,\
        intercept_scaling,\
        class_weight,\
        verbose,\
        random_state,\
        max_iter\
                = hyper_param

        if penalty=='l2' and loss=='hinge':
            continue

        model = LinearSVC(
                        penalty=penalty,\
                        loss=loss,\
                        dual=dual,\
                        tol=tol,\
                        C=C,\
                        multi_class=multi_class,\
                        fit_intercept=fit_intercept,\
                        intercept_scaling=intercept_scaling,\
                        class_weight=class_weight,\
                        verbose=verbose,\
                        random_state=random_state,\
                        max_iter=max_iter
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    penalty,\
    loss,\
    dual,\
    tol,\
    C,\
    multi_class,\
    fit_intercept,\
    intercept_scaling,\
    class_weight,\
    verbose,\
    random_state,\
    max_iter\
         = best_hyper_params

    best_hyper_params_dict = {
        "penalty": penalty,
        "loss": loss,
        "dual": dual,
        "tol": tol,
        "C": C,
        "multi_class": multi_class,
        "fit_intercept": fit_intercept,
        "intercept_scaling": intercept_scaling,
        "class_weight": class_weight,
        "verbose": verbose,
        "random_state": random_state,
        "max_iter": max_iter
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = lsvc_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("LinearSVC")
if fitness > best_fitness:
    best_model = model
    best_model_type = "LinearSVC"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness



0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 0.5, 'ovr', True, 1, None, 0, None, 1500)
0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 0.5, 'ovr', True, 1, None, 0, None, 2000)




0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 0.5, 'ovr', True, 1, None, 0, None, 3000)




0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 1, 'ovr', True, 1, None, 0, None, 1000)
0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 1, 'ovr', True, 1, None, 0, None, 1500)




0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 1, 'ovr', True, 1, None, 0, None, 2000)
0.8956796108658015---('l2', 'squared_hinge', True, 1e-07, 1, 'ovr', True, 1, None, 0, None, 3000)




0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 0.5, 'ovr', True, 1, None, 0, None, 1000)
0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 0.5, 'ovr', True, 1, None, 0, None, 1500)




0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 0.5, 'ovr', True, 1, None, 0, None, 2000)
0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 0.5, 'ovr', True, 1, None, 0, None, 3000)




0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 1, 'ovr', True, 1, None, 0, None, 1000)
0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 1, 'ovr', True, 1, None, 0, None, 1500)




0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 1, 'ovr', True, 1, None, 0, None, 2000)
0.8956796108658015---('l2', 'squared_hinge', True, 0.0001, 1, 'ovr', True, 1, None, 0, None, 3000)




0.8957271396569354---('l2', 'squared_hinge', True, 0.01, 12, 'ovr', True, 1, None, 0, None, 1000)




Max Fitness = 0.8867279862222794 using the mean of ['F1', 'MatthewsCorrelation', 'Accuracy']
Best Hyperparams = {'penalty': 'l2', 'loss': 'squared_hinge', 'dual': True, 'tol': 0.1, 'C': 6, 'multi_class': 'ovr', 'fit_intercept': True, 'intercept_scaling': 1, 'class_weight': None, 'verbose': 0, 'random_state': None, 'max_iter': 1000}
The Accuracy is:  0.9483
The Precision is:  0.9887
The Recall is:  0.9523
The F1 score is:  0.9702
The Matthews correlation coefficient is:  0.7849

This is the Confusion Matrix
     0    1
0  123   11
1   48  959




# Naive Bayes

In [None]:
scaled = True # To use the models that don't work with scaled (negative) values.

nb_model = GaussianNB() # Likelihood of the features is assumed to be Gaussian
nb_model.fit(X_train_rf, y_train_total)

preds = nb_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

print("Gaussian Naive Bayes")
printClassResults(preds,truths)



print()
print()


if not scaled:
    nb_model = MultinomialNB() # For multinomially distributed data
    nb_model.fit(X_train_rf, y_train_total)

    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()

    print("Multinomial Naive Bayes")
    printClassResults(preds,truths)



    print()
    print()



    # For imbalanced datasets - ours seems to be really imbalenced towards the category "RB", but doesn't work with the data scaling. Tested it without scaling, didn't surpass the Gaussian nor the Bernoulli, so we didn't further optimize this
    nb_model = ComplementNB()
    nb_model.fit(X_train_rf, y_train_total)

    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()

    print("Complement Naive Bayes")
    printClassResults(preds,truths)



    print()
    print()



nb_model = BernoulliNB() # Distributed according to multivariate Bernoulli distributions
nb_model.fit(X_train_rf, y_train_total)

preds = nb_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

print("Bernoulli Naive Bayes")
printClassResults(preds,truths)



if not scaled:
    print()
    print()



    nb_model = CategoricalNB() # For categorically distributed data
    nb_model.fit(X_train_rf, y_train_total)
    
    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()
    
    print("Categorical Naive Bayes")
    printClassResults(preds,truths)



# Out-of-core with the partial_fit function?

Gaussian Naive Bayes
The Accuracy is:  0.9273
The Precision is:  0.9515
The Recall is:  0.9625
The F1 score is:  0.9570
The Matthews correlation coefficient is:  0.7224

This is the Confusion Matrix
     0    1
0  135   47
1   36  923


Bernoulli Naive Bayes
The Accuracy is:  0.8861
The Precision is:  0.9227
The Recall is:  0.9421
The F1 score is:  0.9323
The Matthews correlation coefficient is:  0.5747

This is the Confusion Matrix
     0    1
0  116   75
1   55  895


## Gaussian Naive Bayes

In [None]:
def gnb_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    priors = [None]
    #var_smoothing = [1e-9]
    var_smoothing = [0,1e-20,1e-10,1e-9,1e-08,1e-07,1e-06,1e-02]

    rf_hyperparams = itertools.product(
                                        priors,
                                        var_smoothing
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        priors,\
        var_smoothing\
                = hyper_param

        model = GaussianNB(
                        priors=priors,
                        var_smoothing=var_smoothing
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    priors,\
    var_smoothing,\
                = best_hyper_params

    best_hyper_params_dict = {
        "priors": priors,
        "var_smoothing": var_smoothing
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = gnb_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("GaussianNB")
if fitness > best_fitness:
    best_model = model
    best_model_type = "GaussianNB"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

0.8688715384788606---(None, 1e-20)
0.8688715384788606---(None, 1e-10)
0.8688715384788606---(None, 1e-09)
0.8688715384788606---(None, 1e-08)
0.8688715384788606---(None, 1e-07)
0.8688715384788606---(None, 1e-06)
Max Fitness = 0.8584548700270367 using the mean of ['F1', 'MatthewsCorrelation', 'Accuracy']
Best Hyperparams = {'priors': None, 'var_smoothing': 0}
The Accuracy is:  0.9273
The Precision is:  0.9515
The Recall is:  0.9625
The F1 score is:  0.9570
The Matthews correlation coefficient is:  0.7224

This is the Confusion Matrix
     0    1
0  135   47
1   36  923


## Bernoulli Naive Bayes

In [None]:
def bnb_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #alpha = [1.0]
    #alpha = [0,0.1,0.5,1.0,1.5,2]
    alpha = [1e-10,1e-9,1e-8,1e-7,1e-6,1e-5,1e-4,1e-3,1e-2,0.1]
    #binarize = [0.0]
    #binarize = [None,0.0,0.1,0.5,1,2]
    binarize = [1,1.5,1.75,2,2.25,2.5,2.75,3,4]
    fit_prior = [True]
    #fit_prior = [True,False]
    class_prior = [None]

    rf_hyperparams = itertools.product(
                                        alpha,
                                        binarize,
                                        fit_prior,
                                        class_prior
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        alpha,\
            binarize,\
            fit_prior,\
            class_prior\
                = hyper_param

        model = BernoulliNB(
                        alpha=alpha,
                        binarize=binarize,
                        fit_prior=fit_prior,
                        class_prior=class_prior
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    alpha,\
        binarize,\
        fit_prior,\
        class_prior\
                = best_hyper_params

    best_hyper_params_dict = {
        "alpha": alpha,
        "binarize": binarize,
        "fit_prior": fit_prior,
        "class_prior": class_prior
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = bnb_optimization(X_train_rf, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("BernoulliNB")
if fitness > best_fitness:
    best_model = model
    best_model_type = "BernoulliNB"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

0.8386694437070922---(1e-09, 1.75, True, None)
0.8386694437070922---(1e-08, 1.75, True, None)
0.8386694437070922---(1e-07, 1.75, True, None)
0.8386694437070922---(1e-06, 1.75, True, None)
0.8386694437070922---(1e-05, 1.75, True, None)
0.8386694437070922---(0.0001, 1.75, True, None)
0.8386694437070922---(0.001, 1.75, True, None)
0.8386694437070922---(0.01, 1.75, True, None)
0.8386694437070922---(0.1, 1.75, True, None)
Max Fitness = 0.7547517624469914 using the mean of ['F1', 'MatthewsCorrelation', 'Accuracy']
Best Hyperparams = {'alpha': 1e-10, 'binarize': 1.75, 'fit_prior': True, 'class_prior': None}
The Accuracy is:  0.9150
The Precision is:  0.9608
The Recall is:  0.9405
The F1 score is:  0.9505
The Matthews correlation coefficient is:  0.6505

This is the Confusion Matrix
     0    1
0  112   38
1   59  932


# Ensemble Boosting
## Gradient Boosting

In [None]:
xgb_model = GradientBoostingClassifier()
xgb_model.fit(X_train_rf, y_train_total)

preds = xgb_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9527
The Precision is:  0.9856
The Recall is:  0.9598
The F1 score is:  0.9725
The Matthews correlation coefficient is:  0.8055

This is the Confusion Matrix
     0    1
0  131   14
1   40  956


In [None]:
def gb_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #loss=["log_loss"]
    loss=['log_loss','exponential']
    #learning_rate=[0.1]
    learning_rate=[0.1,0.2,0.5,1]
    #n_estimators=[100]
    n_estimators=[50,100,150,200,1000]
    subsample=[1]
    #criterion=["friedman_mse"]
    #criterion=['friedman_mse','squared_error','mse']
    criterion=['friedman_mse','squared_error']
    min_samples_split=[2]
    min_samples_leaf=[1]
    min_weight_fraction_leaf=[0]
    #max_depth=[3]
    max_depth=[2,3,10,30]
    min_impurity_decrease=[0]
    init=[None]
    random_state=[None]
    max_features=[None]
    verbose=[0]
    max_leaf_nodes=[None]
    warm_start=[False]
    validation_fraction=[0.1]
    n_iter_no_change=[None]
    tol=[0.0001]
    ccp_alpha=[0]

    rf_hyperparams = itertools.product(
                                        loss,
                                        learning_rate,
                                        n_estimators,
                                        subsample,
                                        criterion,
                                        min_samples_split,
                                        min_samples_leaf,
                                        min_weight_fraction_leaf,
                                        max_depth,
                                        min_impurity_decrease,
                                        init,
                                        random_state,
                                        max_features,
                                        verbose,
                                        max_leaf_nodes,
                                        warm_start,
                                        validation_fraction,
                                        n_iter_no_change,
                                        tol,
                                        ccp_alpha
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        loss,\
        learning_rate,\
        n_estimators,\
        subsample,\
        criterion,\
        min_samples_split,\
        min_samples_leaf,\
        min_weight_fraction_leaf,\
        max_depth,\
        min_impurity_decrease,\
        init,\
        random_state,\
        max_features,\
        verbose,\
        max_leaf_nodes,\
        warm_start,\
        validation_fraction,\
        n_iter_no_change,\
        tol,\
        ccp_alpha\
             = hyper_param

        model = GradientBoostingClassifier(
                        loss=loss,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        subsample=subsample,
                        criterion=criterion,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                        max_depth=max_depth,
                        min_impurity_decrease=min_impurity_decrease,
                        init=init,
                        random_state=random_state,
                        max_features=max_features,
                        verbose=verbose,
                        max_leaf_nodes=max_leaf_nodes,
                        warm_start=warm_start,
                        validation_fraction=validation_fraction,
                        n_iter_no_change=n_iter_no_change,
                        tol=tol,
                        ccp_alpha=ccp_alpha
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    loss,\
    learning_rate,\
    n_estimators,\
    subsample,\
    criterion,\
    min_samples_split,\
    min_samples_leaf,\
    min_weight_fraction_leaf,\
    max_depth,\
    min_impurity_decrease,\
    init,\
    random_state,\
    max_features,\
    verbose,\
    max_leaf_nodes,\
    warm_start,\
    validation_fraction,\
    n_iter_no_change,\
    tol,\
    ccp_alpha\
            = best_hyper_params

    best_hyper_params_dict = {
        "loss": loss,
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "subsample": subsample,
        "criterion": criterion,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_depth": max_depth,
        "min_impurity_decrease": min_impurity_decrease,
        "init": init,
        "random_state": random_state,
        "max_features": max_features,
        "verbose": verbose,
        "max_leaf_nodes": max_leaf_nodes,
        "warm_start": warm_start,
        "validation_fraction": validation_fraction,
        "n_iter_no_change": n_iter_no_change,
        "tol": tol,
        "ccp_alpha": ccp_alpha
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = gb_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("GradientBoostingClassifier")
if fitness > best_fitness:
    best_model = model
    best_model_type = "GradientBoostingClassifier"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

0.9098574602081476---('log_loss', 0.1, 50, 1, 'squared_error', 2, 1, 0, 3, 0, None, None, None, 0, None, False, 0.1, None, 0.0001, 0)
0.9240599613423942---('log_loss', 0.1, 200, 1, 'squared_error', 2, 1, 0, 3, 0, None, None, None, 0, None, False, 0.1, None, 0.0001, 0)
0.9329478257365369---('exponential', 0.5, 1000, 1, 'friedman_mse', 2, 1, 0, 10, 0, None, None, None, 0, None, False, 0.1, None, 0.0001, 0)
Max Fitness = 0.8788582651154259 using the mean of ['F1', 'MatthewsCorrelation', 'Accuracy']
Best Hyperparams = {'loss': 'exponential', 'learning_rate': 0.2, 'n_estimators': 1000, 'subsample': 1, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_depth': 10, 'min_impurity_decrease': 0, 'init': None, 'random_state': None, 'max_features': None, 'verbose': 0, 'max_leaf_nodes': None, 'warm_start': False, 'validation_fraction': 0.1, 'n_iter_no_change': None, 'tol': 0.0001, 'ccp_alpha': 0}
The Accuracy is:  0.9641
The Precision is:

NameError: name 'tested_models' is not defined

## AdaBoost

In [None]:
adab_model = AdaBoostClassifier()
adab_model.fit(X_train_rf, y_train_total)

preds = adab_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9430
The Precision is:  0.9794
The Recall is:  0.9548
The F1 score is:  0.9669
The Matthews correlation coefficient is:  0.7653

This is the Confusion Matrix
     0    1
0  126   20
1   45  950


## HistGradientBoost (Histogram-Based Gradient Boosting)

In [None]:
_y_train = initial_dataset["Biodegradable"][0:train_dataset_len]
_y_test = initial_dataset["Biodegradable"][train_dataset_len:total_len]

_X_train = initial_dataset.drop(["Biodegradable"], axis=1)[0:train_dataset_len]
_X_test = initial_dataset.drop(["Biodegradable"], axis=1)[train_dataset_len:total_len]

histb_model = HistGradientBoostingClassifier()
histb_model.fit(_X_train, _y_train)

preds = histb_model.predict(_X_test)
truths = _y_test.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9693
The Precision is:  0.9866
The Recall is:  0.9775
The F1 score is:  0.9820
The Matthews correlation coefficient is:  0.8774

This is the Confusion Matrix
     0    1
0  149   13
1   22  957


# K-Nearest Neighbours

In [None]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_rf, y_train_total)

preds = knn_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9579
The Precision is:  0.9897
The Recall is:  0.9619
The F1 score is:  0.9756
The Matthews correlation coefficient is:  0.8274

This is the Confusion Matrix
     0    1
0  133   10
1   38  960


In [None]:
def knn_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #n_neighbors=[5]
    n_neighbors=[3,4,5,6,7,8,10]
    #weights=["uniform"]
    weights=['uniform','distance']
    #algorithm=["auto"]
    algorithm=['auto','ball_tree','kd_tree','brute']
    #leaf_size=[30]
    leaf_size=[20,25,30,35,40]
    #p=[2]
    p=[1,2]
    metric=["minkowski"]
    #metric=["minkowski","precomputed"]
    metric_params=[None]
    n_jobs=[None]

    rf_hyperparams = itertools.product(
                                        n_neighbors,
                                        weights,
                                        algorithm,
                                        leaf_size,
                                        p,
                                        metric,
                                        metric_params,
                                        n_jobs
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        n_neighbors,\
        weights,\
        algorithm,\
        leaf_size,\
        p,\
        metric,\
        metric_params,\
        n_jobs\
                = hyper_param

        model = KNeighborsClassifier(
                        n_neighbors=n_neighbors,
                        weights=weights,
                        algorithm=algorithm,
                        leaf_size=leaf_size,
                        p=p,
                        metric=metric,
                        metric_params=metric_params,
                        n_jobs=n_jobs
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    n_neighbors,\
    weights,\
    algorithm,\
    leaf_size,\
    p,\
    metric,\
    metric_params,\
    n_jobs\
         = best_hyper_params

    best_hyper_params_dict = {
        "n_neighbors": n_neighbors,
        "weights": weights,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
        "p": p,
        "metric": metric,
        "metric_params": metric_params,
        "n_jobs": n_jobs
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = knn_optimization(X_train_rf_ohe, y_train_total, X_test_rf_ohe, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf_ohe)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

tested_models.append("KNeighborsClassifier")
if fitness > best_fitness:
    best_model = model
    best_model_type = "KNeighborsClassifier"
    best_model_hyperparams = hyper_params_dict
    best_fitness = fitness

0.923770497287316---(3, 'uniform', 'auto', 25, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'auto', 30, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'auto', 35, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'auto', 40, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'ball_tree', 20, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'ball_tree', 25, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'ball_tree', 30, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'ball_tree', 35, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'ball_tree', 40, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'kd_tree', 20, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'kd_tree', 25, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'kd_tree', 30, 1, 'minkowski', None, None)
0.923770497287316---(3, 'uniform', 'kd_tree', 35, 1, 'minkowski', 

# Logistic Regression

In [None]:
lr_model = LogisticRegressionCV()
lr_model.fit(X_train_rf, y_train_total)

preds = lr_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9422
The Precision is:  0.9876
The Recall is:  0.9466
The F1 score is:  0.9667
The Matthews correlation coefficient is:  0.7573

This is the Confusion Matrix
     0    1
0  117   12
1   54  958


# Multi-Layer Perceptron

In [None]:
mlp_model = MLPClassifier()
mlp_model.fit(X_train_rf, y_train_total)

preds = mlp_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9553
The Precision is:  0.9804
The Recall is:  0.9674
The F1 score is:  0.9739
The Matthews correlation coefficient is:  0.8198

This is the Confusion Matrix
     0    1
0  139   19
1   32  951




# Results
## Features
### Using Random Forest Classifier for Feature Selection

In [None]:
print(len(X_train_rf.columns))
print(X_train_rf.columns.to_list())

12
['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'SpMax_L', 'SM6_L', 'SpPosA_B', 'SpMax_B', 'SM6_B', 'nX']


## Best Possible Model Found from tests

In [None]:
best_model.fit(X_train_rf, y_train_total)

preds = best_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

print(f"Tested Models - {str(tested_models)}")
print(f"{best_model_type} Model")
print(f"With hyperparams {best_model_hyperparams}")
printClassResults(preds,truths)

RandomForestClassifier Model
With hyperparams {'n_estimators': 129, 'criterion': 'log_loss', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'bootstrap': True, 'oob_score': True, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0, 'max_samples': None}
The Accuracy is:  0.9676
The Precision is:  0.9876
The Recall is:  0.9746
The F1 score is:  0.9811
The Matthews correlation coefficient is:  0.8696

This is the Confusion Matrix
     0    1
0  146   12
1   25  958


## Models
### Random Forest Classifier
Best Hyperparams = {'n_estimators': 129, 'criterion': 'log_loss', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'bootstrap': True, 'oob_score': True, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0, 'max_samples': None}
<br>
<br>
The **Accuracy** is:  **0.9728**
<br>
The **Precision** is:  **0.9897**
<br>
The **Recall** is:  **0.9786**
<br>
The **F1 score** is:  **0.9841**
<br>
The **Matthews correlation coefficient** is:  **0.8911**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 150 | 10  |
| 1 | 21  | 960 |

### SVM for Classification (SVC)
Best Hyperparams = {'C': 4, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale', 'coef0': 0, 'shrinking': True, 'probability': False, 'tol': False, 'cache_size': 200, 'class_weight': None, 'verbose': False, 'max_iter': -1, 'decision_function_shape': 'ovr', 'break_ties': False, 'random_state': None}
<br>
<br>
The **Accuracy** is:  **0.9606**
<br>
The **Precision** is:  **0.9928**
<br>
The **Recall** is:  **0.9620**
<br>
The **F1 score** is:  **0.9772**
<br>
The **Matthews correlation coefficient** is:  **0.8383**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 133 | 7   |
| 1 | 38  | 963 |

### Naive Bayes
#### Gaussian Naive Bayes
Best Hyperparams = {'priors': None, 'var_smoothing': 0}
<br>
<br>
The **Accuracy** is:  **0.9273**
<br>
The **Precision** is:  **0.9515**
<br>
The **Recall** is:  **0.9625**
<br>
The **F1 score** is:  **0.9570**
<br>
The **Matthews correlation coefficient** is:  **0.7224**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 135 | 47  |
| 1 | 36  | 923 |


#### Bernoulli Naive Bayes
Best Hyperparams = {'alpha': 1e-10, 'binarize': 1.75, 'fit_prior': True, 'class_prior': None}
<br>
<br>
The **Accuracy** is:  **0.9150**
<br>
The **Precision** is:  **0.9608**
<br>
The **Recall** is:  **0.9405**
<br>
The **F1 score** is:  **0.9505**
<br>
The **Matthews correlation coefficient** is:  **0.6505**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 112 | 38  |
| 1 | 59  | 932 |

### Gradient-Boost

### Ada-Boost

### K-Nearest Neighbours

### Logistic Regression

### Multi-Layer Perceptron

