In [34]:
import random
import itertools
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score

np.random.seed(0)
random.seed(0)

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

In [35]:
initial_dataset = pd.read_csv("biodegradable_a.csv").sample(frac=1).reset_index(drop=True)
total_len, _ = initial_dataset.shape

# NOTE - NO INDEPENDENT VALIDATION SET !!!

# Total with means
#categorical = ['int16', 'int32', 'int64']
#biodegradable = ['object']
#numerical = ['float16', 'float32', 'float64']

class_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 == 0).all()]
num_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 != 0).any()]

print(len(class_cols) + len(num_cols))
print(len(initial_dataset.drop("Biodegradable", axis=1).columns))

total_categorical_dataset = initial_dataset[class_cols]
#total_categorical_dataset = total_categorical_dataset.fillna(total_categorical_dataset.mode())
total_categorical_dataset = total_categorical_dataset.fillna(-1)
total_categorical_dataset = total_categorical_dataset.astype(int).astype(object).astype(str)
#print(total_categorical_dataset)

total_numerical_dataset = initial_dataset[num_cols]
total_numerical_dataset = total_numerical_dataset.fillna(total_numerical_dataset.mean())

total_biodegradable = initial_dataset["Biodegradable"]
#total_biodegradable = initial_dataset.select_dtypes(include=biodegradable)
#total_biodegradable = total_biodegradable.fillna("")

# Scale numerical data
# https://scikit-learn.org/stable/modules/preproce
#print(total_numerical_dataset)

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

# Commenting the following two lines will run the models without scaling. It allows the usage of certain Naive Bayes models, but ruins SVMs.
# As the NaiveBayes that don't work with the negative numbers that come from the scaling of data, we keep it for the ones that show better results
total_numerical_dataset = pd.DataFrame(scaler.fit_transform(total_numerical_dataset),
             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)


#total_numerical_dataset = pd.DataFrame(normalize(total_numerical_dataset, norm='l2', axis=1, copy=True, return_norm=False),
#             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)

#print(total_numerical_dataset)
#
total_dataset = pd.concat([total_categorical_dataset, total_numerical_dataset,total_biodegradable], axis=1)
#total_dataset.dropna(0)
print(total_dataset)

total_len, _ = total_dataset.shape
train_dataset_len = round(total_len * 0.75)

dataset_train = total_dataset[0:train_dataset_len]
dataset_test = total_dataset[train_dataset_len:total_len]

print(total_dataset.shape)

# Removal of None/NaN vals
dropna_dataset = initial_dataset.dropna()

dropna_len, _ = dropna_dataset.shape
model_dropna_len = round(total_len * 0.75)

dropna_train = dropna_dataset[0:model_dropna_len]
dropna_test = dropna_dataset[model_dropna_len:dropna_len]

print(dropna_dataset.shape)

41
41
     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...       SpMax_A  \
0      0   0      0   2  2   0    0      0     0   0  ...  7.635193e-01   
1      1   0      0   0  3   0    0      0     0   0  ... -4.481867e-02   
2      0   0      0   2  2   0    0      0     0   0  ...  2.805893e-14   
3      0   0      0   2  1   0    0      0     0   0  ...  6.022004e-01   
4      0   0      0   0  0   0    0      0     0   0  ... -1.439928e+00   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...           ...   
4559   0   1      0   0  1   1    0      0     0   0  ...  2.805893e-14   
4560   0   0      0   0  3   0    0      0     0   0  ... -6.781979e-01   
4561   0   0      0   0  2   0    0      0     0   0  ...  1.261149e+00   
4562   0   0      0   0  1   0    0      0     0   0  ... -8.396889e-01   
4563   0   0      0   2  1   0    0      0     0   0  ...  2.624708e-01   

      Psi_i_1d       SdO     TI2_L      nCrt   SpMax_B       Psi_i_A  \
0    -0.036040  0.083

In [36]:
print(total_dataset.columns)
print(dropna_test.columns)

Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN',
       'nArCOOR', 'SpMax_L', 'J_Dz(e)', 'F01', 'C', 'nCp', 'SdssC', 'HyWi_B',
       'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi', 'SpPosA_B', 'nCIR', 'SpMax_A',
       'Psi_i_1d', 'SdO', 'TI2_L', 'nCrt', 'SpMax_B', 'Psi_i_A', 'SM6_B', 'nX',
       'Biodegradable'],
      dtype='object')
Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01', 'F04', 'NssssC', 'nCb', 'C', 'nCp',
       'nO', 'F03', 'SdssC', 'HyWi_B', 'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi',
       'nN_N', 'nArNO2', 'nCRX3', 'SpPosA_B', 'nCIR', 'B01', 'B03', 'N_073',
       'SpMax_A', 'Psi_i_1d', 'B04', 'SdO', 'TI2_L', 'nCrt', 'C_026', 'F02_CN',
       'nHDon', 'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX',
       'Biodegradable'],
      dtype='object')


## Using Model with replaced values when NaN, and discarding the dropped NaN values dataset

In [37]:
X_train_total = dataset_train.drop(["Biodegradable"], axis=1)
y_train_total = dataset_train.Biodegradable
print(X_train_total)
print(y_train_total)

X_test_total = dataset_test.drop(["Biodegradable"], axis=1)
y_test_total = dataset_test.Biodegradable

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR  \
0      0   0      0   2  2   0    0      0     0   0  ...  0.117354   
1      1   0      0   0  3   0    0      0     0   0  ... -0.302031   
2      0   0      0   2  2   0    0      0     0   0  ...  0.536738   
3      0   0      0   2  1   0    0      0     0   0  ...  0.117354   
4      0   0      0   0  0   0    0      0     0   0  ... -0.302031   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...   
3418   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3419   0   0      0   2  2   0    0      0     0   0  ...  0.117354   
3420   0   0      0   0  1   0    0      0     0   0  ... -0.302031   
3421   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3422   0   0      0   0  2   0    0      0     0   0  ... -0.302031   

           SpMax_A  Psi_i_1d       SdO     TI2_L      nCrt   SpMax_B  \
0     7.635193e-01 -0.036040  0.083128 -0.584680 -0.100618  0.238243   
1  

## Testing Random Forests for Feature Selection

In [38]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(X_train_total, y_train_total)

In [39]:

print(sel.get_support())
selected_feat= X_train_total.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[ True  True  True  True False  True False False False False False False
 False False  True False False False  True False False False False False
 False False  True False False False  True False False False False False
 False  True False  True  True]
12
Index(['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'SpMax_L', 'SM6_L',
       'SpPosA_B', 'SpMax_B', 'SM6_B', 'nX'],
      dtype='object')


In [40]:
X_train_rf = X_train_total[X_train_total.columns[(sel.get_support())]]
X_test_rf = X_test_total[X_test_total.columns[(sel.get_support())]]

print(X_train_rf)

     nHM F04 NssssC nCb F03 F02_CN   SpMax_L     SM6_L  SpPosA_B   SpMax_B  \
0      0   0      0   2   0      0  0.502761  0.262102  0.467673  0.238243   
1      1   0      0   0   0      4  1.064719  0.332217 -0.831613  0.000000   
2      0   0      0   2   0      0  0.262057  0.323590  1.037564  0.106006   
3      0   0      0   2   0      0  0.445841  0.220531  0.812120  0.000000   
4      0   0      0   0   0      0 -0.971360 -1.241510  0.585408 -0.639572   
...   ..  ..    ...  ..  ..    ...       ...       ...       ...       ...   
3418   0   0      0   0   0      1 -0.917662 -1.298313 -1.879649 -0.565504   
3419   0   0      0   2   0      0  0.479167  0.324590  0.349936  0.000000   
3420   0   0      0   0   0      0 -0.515278 -0.679934 -0.416422 -1.104080   
3421   0   0      0   0   0      0  0.285715  0.070415 -1.550311  0.000000   
3422   0   0      0   0   0      0  0.102410 -0.399898 -0.285076  0.000000   

         SM6_B            nX  
0     0.139808 -1.602862e-01  
1

# Random Forest

## Testing a Random Forest Model for Classification

In [41]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train_rf, y_train_total)

preds = rf_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9658
The Precision is:  0.9856
The Recall is:  0.9745
The F1 score is:  0.9800
The Matthews correlation coefficient is:  0.8629

This is the Confusion Matrix
     0    1
0  146   14
1   25  956


## Optimizing Random Forest Model for Classification

In [10]:
def rf_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #n_estimators = [100]
    #n_estimators = [110,120,130,150,200]
    #n_estimators = [110,115,120,125,130]
    n_estimators = [125,126,127,128,129,130]
    #criterion = ["gini"]
    criterion = ["gini","entropy","log_loss"]
    max_depth = [None]
    min_samples_split = [2]
    min_samples_leaf = [1]
    min_weight_fraction_leaf = [0]
    #max_features = ["sqrt"]
    max_features = ["sqrt","log2",None]
    max_leaf_nodes = [None]
    min_impurity_decrease = [0]
    #bootstrap = [True]
    bootstrap = [True,False]
    #oob_score = [False]
    oob_score = [False,True]
    n_jobs = [None]
    random_state = [None]
    verbose = [0]
    #warm_start = [False]
    warm_start = [False,True]
    class_weight = [None]
    ccp_alpha = [0]
    max_samples = [None]

    rf_hyperparams = itertools.product(
                                        n_estimators,
                                        criterion,
                                        max_depth,
                                        min_samples_split,
                                        min_samples_leaf,
                                        min_weight_fraction_leaf,
                                        max_features,
                                        max_leaf_nodes,
                                        min_impurity_decrease,
                                        bootstrap,
                                        oob_score,
                                        n_jobs,
                                        random_state,
                                        verbose,
                                        warm_start,
                                        class_weight,
                                        ccp_alpha,
                                        max_samples
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = hyper_param

        if oob_score and not bootstrap:
            continue

        model = RandomForestClassifier(
                        n_estimators=n_estimators,
                        criterion=criterion,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                        max_features=max_features,
                        max_leaf_nodes=max_leaf_nodes,
                        min_impurity_decrease=min_impurity_decrease,
                        bootstrap=bootstrap,
                        oob_score=oob_score,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        verbose=verbose,
                        warm_start=warm_start,
                        class_weight=class_weight,
                        ccp_alpha=ccp_alpha,
                        max_samples=max_samples
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = best_hyper_params

    best_hyper_params_dict = {
        "n_estimators": n_estimators,
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "bootstrap": bootstrap,
        "oob_score": oob_score,
        "n_jobs": n_jobs,
        "random_state": random_state,
        "verbose": verbose,
        "warm_start": warm_start,
        "class_weight": class_weight,
        "ccp_alpha": ccp_alpha,
        "max_samples": max_samples
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = rf_optimization(X_train_rf, y_train_total, X_test_rf, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

0.9460803502501417---(127, 'log_loss', None, 2, 1, 0, 'sqrt', None, 0, True, False, None, None, 0, True, None, 0, None)
0.9460803502501417---(129, 'entropy', None, 2, 1, 0, 'sqrt', None, 0, True, False, None, None, 0, False, None, 0, None)
Max Fitness = 0.8808272270638579 using the mean of ['F1', 'MatthewsCorrelation', 'Accuracy']
Best Hyperparams = {'n_estimators': 129, 'criterion': 'log_loss', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'bootstrap': True, 'oob_score': True, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0, 'max_samples': None}
The Accuracy is:  0.9728
The Precision is:  0.9897
The Recall is:  0.9786
The F1 score is:  0.9841
The Matthews correlation coefficient is:  0.8911

This is the Confusion Matrix
     0    1
0  150   10
1   21  960


# Support Vector Machines

In [43]:
svc_model = SVC()
svc_model.fit(X_train_rf, y_train_total)

preds = svc_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9544
The Precision is:  0.9907
The Recall is:  0.9572
The F1 score is:  0.9737
The Matthews correlation coefficient is:  0.8118

This is the Confusion Matrix
     0    1
0  128    9
1   43  961


## Optimizing Support Vector Machines for Optimization

In [12]:
def svm_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    #C = [1]
    C = [1,4,6,8,12]
    #kernel = ["rbf"]
    #kernel = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
    kernel = ['linear', 'poly', 'rbf']
    degree = [3]
    gamma = ["scale"]
    coef0 = [0]
    #shrinking = [True]
    shrinking = [True,False]
    #probability = [False]
    probability = [False, True]
    tol = [0.001]
    cache_size = [200]
    class_weight = [None]
    verbose = [False]
    max_iter = [-1]
    #decision_function_shape = ["ovr"]
    decision_function_shape = ["ovr", 'ovo']
    break_ties = [False]
    random_state = [None]

    rf_hyperparams = itertools.product(
                                        C,\
                                        kernel,\
                                        degree,\
                                        gamma,\
                                        coef0,\
                                        shrinking,\
                                        probability,\
                                        tol,\
                                        cache_size,\
                                        class_weight,\
                                        verbose,\
                                        max_iter,\
                                        decision_function_shape,\
                                        break_ties,\
                                        random_state
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        C,\
            kernel,\
            degree,\
            gamma,\
            coef0,\
            shrinking,\
            probability,\
            tol,\
            cache_size,\
            class_weight,\
            verbose,\
            max_iter,\
            decision_function_shape,\
            break_ties,\
            random_state\
                = hyper_param

        model = SVC(
                        C=C,\
                        kernel=kernel,\
                        degree=degree,\
                        gamma=gamma,\
                        coef0=coef0,\
                        shrinking=shrinking,\
                        probability=probability,\
                        tol=tol,\
                        cache_size=cache_size,\
                        class_weight=class_weight,\
                        verbose=verbose,\
                        max_iter=max_iter,\
                        decision_function_shape=decision_function_shape,\
                        break_ties=break_ties,\
                        random_state=random_state
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    C,\
        kernel,\
        degree,\
        gamma,\
        coef0,\
        shrinking,\
        probability,\
        tol,\
        cache_size,\
        class_weight,\
        verbose,\
        max_iter,\
        decision_function_shape,\
        break_ties,\
        random_state\
                = best_hyper_params

    best_hyper_params_dict = {
        "C": C,
        "kernel": kernel,
        "degree": degree,
        "gamma": gamma,
        "coef0": coef0,
        "shrinking": shrinking,
        "probability": probability,
        "tol": probability,
        "cache_size": cache_size,
        "class_weight": class_weight,
        "verbose": verbose,
        "max_iter": max_iter,
        "decision_function_shape": decision_function_shape,
        "break_ties": break_ties,
        "random_state": random_state
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = svm_optimization(X_train_rf, y_train_total, X_test_rf, y_test_total, ["F1", "MatthewsCorrelation","Accuracy"])

preds = model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

0.8812861026843137---(1, 'linear', 3, 'scale', 0, True, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, False, False, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, False, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, False, True, 0.001, 200, None, False, -1, 'ovr', False, None)
0.8812861026843137---(1, 'linear', 3, 'scale', 0, False, True, 0.001, 200, None, False, -1, 'ovo', False, None)
0.913297452837417---(1, 'rbf', 3, 'scale', 0, True, False, 0.001, 200, None, False, -1, 'ovo', False, None)
0.913297452837417---(1, 'rbf', 3, 'scale', 0, True, True, 0.001, 200, None, False, -1, 'ovr', False, None)
0

# Naive Bayes

In [44]:
scaled = True # To use the models that don't work with scaled (negative) values.

nb_model = GaussianNB() # Likelihood of the features is assumed to be Gaussian
nb_model.fit(X_train_rf, y_train_total)

preds = nb_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

print("Gaussian Naive Bayes")
printClassResults(preds,truths)



print()
print()


if not scaled:
    nb_model = MultinomialNB() # For multinomially distributed data
    nb_model.fit(X_train_rf, y_train_total)

    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()

    print("Multinomial Naive Bayes")
    printClassResults(preds,truths)



    print()
    print()



    # For imbalanced datasets - ours seems to be really imbalenced towards the category "RB", but doesn't work with the data scaling. Tested it without scaling, didn't surpass the Gaussian nor the Bernoulli, so we didn't further optimize this
    nb_model = ComplementNB()
    nb_model.fit(X_train_rf, y_train_total)

    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()

    print("Complement Naive Bayes")
    printClassResults(preds,truths)



    print()
    print()



nb_model = BernoulliNB() # Distributed according to multivariate Bernoulli distributions
nb_model.fit(X_train_rf, y_train_total)

preds = nb_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

print("Bernoulli Naive Bayes")
printClassResults(preds,truths)



if not scaled:
    print()
    print()



    nb_model = CategoricalNB() # For categorically distributed data
    nb_model.fit(X_train_rf, y_train_total)
    
    preds = nb_model.predict(X_test_rf)
    truths = y_test_total.to_numpy()
    
    print("Categorical Naive Bayes")
    printClassResults(preds,truths)



# Out-of-core with the partial_fit function?

Gaussian Naive Bayes
The Accuracy is:  0.9273
The Precision is:  0.9515
The Recall is:  0.9625
The F1 score is:  0.9570
The Matthews correlation coefficient is:  0.7224

This is the Confusion Matrix
     0    1
0  135   47
1   36  923


Bernoulli Naive Bayes
The Accuracy is:  0.8861
The Precision is:  0.9227
The Recall is:  0.9421
The F1 score is:  0.9323
The Matthews correlation coefficient is:  0.5747

This is the Confusion Matrix
     0    1
0  116   75
1   55  895


# X-Boost

# Ada-Boost

# K-Nearest Neighbours

# Logistic Regression

# Multi-Layer Perceptron

# Results
## Features
### Using Random Forest Classifier for Feature Selection

In [13]:
print(len(X_train_rf.columns))
print(X_train_rf.columns.to_list())

12
['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'SpMax_L', 'SM6_L', 'SpPosA_B', 'SpMax_B', 'SM6_B', 'nX']


## Models
### Random Forest Classifier
Best Hyperparams = {'n_estimators': 129, 'criterion': 'log_loss', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'bootstrap': True, 'oob_score': True, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0, 'max_samples': None}
<br>
<br>
The **Accuracy** is:  **0.9728**
<br>
The **Precision** is:  **0.9897**
<br>
The **Recall** is:  **0.9786**
<br>
The **F1 score** is:  **0.9841**
<br>
The **Matthews correlation coefficient** is:  **0.8911**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 150 | 10  |
| 1 | 21  | 960 |

### SVM for Classification (SVC)
Best Hyperparams = {'C': 4, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale', 'coef0': 0, 'shrinking': True, 'probability': False, 'tol': False, 'cache_size': 200, 'class_weight': None, 'verbose': False, 'max_iter': -1, 'decision_function_shape': 'ovr', 'break_ties': False, 'random_state': None}
<br>
<br>
The **Accuracy** is:  **0.9606**
<br>
The **Precision** is:  **0.9928**
<br>
The **Recall** is:  **0.9620**
<br>
The **F1 score** is:  **0.9772**
<br>
The **Matthews correlation coefficient** is:  **0.8383**
<br>
<br>
This is the **Confusion Matrix**

|   | 0   | 1   |
|---|-----|-----|
| 0 | 133 | 7   |
| 1 | 38  | 963 |

### Naive Bayes

### X-Boost

### Ada-Boost

### K-Nearest Neighbours

### Logistic Regression

### Multi-Layer Perceptron

