In [20]:
import random
import itertools
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score

np.random.seed(0)
random.seed(0)

def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

In [2]:
initial_dataset = pd.read_csv("biodegradable_a.csv").sample(frac=1).reset_index(drop=True)
total_len, _ = initial_dataset.shape

# NOTE - NO INDEPENDENT VALIDATION SET !!!

# Total with means
#categorical = ['int16', 'int32', 'int64']
#biodegradable = ['object']
#numerical = ['float16', 'float32', 'float64']

class_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 == 0).all()]
num_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 != 0).any()]

print(len(class_cols) + len(num_cols))
print(len(initial_dataset.drop("Biodegradable", axis=1).columns))

total_categorical_dataset = initial_dataset[class_cols]
#total_categorical_dataset = total_categorical_dataset.fillna(total_categorical_dataset.mode())
total_categorical_dataset = total_categorical_dataset.fillna(-1)
total_categorical_dataset = total_categorical_dataset.astype(int).astype(object).astype(str)
#print(total_categorical_dataset)

total_numerical_dataset = initial_dataset[num_cols]
total_numerical_dataset = total_numerical_dataset.fillna(total_numerical_dataset.mean())

total_biodegradable = initial_dataset["Biodegradable"]
#total_biodegradable = initial_dataset.select_dtypes(include=biodegradable)
#total_biodegradable = total_biodegradable.fillna("")

# Scale numerical data
# https://scikit-learn.org/stable/modules/preproce
#print(total_numerical_dataset)

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

total_numerical_dataset = pd.DataFrame(scaler.fit_transform(total_numerical_dataset),
             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)


#total_numerical_dataset = pd.DataFrame(normalize(total_numerical_dataset, norm='l2', axis=1, copy=True, return_norm=False),
#             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)

#print(total_numerical_dataset)
#
total_dataset = pd.concat([total_categorical_dataset, total_numerical_dataset,total_biodegradable], axis=1)
#total_dataset.dropna(0)
print(total_dataset)

total_len, _ = total_dataset.shape
train_dataset_len = round(total_len * 0.75)

dataset_train = total_dataset[0:train_dataset_len]
dataset_test = total_dataset[train_dataset_len:total_len]

print(total_dataset.shape)

# Removal of None/NaN vals
dropna_dataset = initial_dataset.dropna()

dropna_len, _ = dropna_dataset.shape
model_dropna_len = round(total_len * 0.75)

dropna_train = dropna_dataset[0:model_dropna_len]
dropna_test = dropna_dataset[model_dropna_len:dropna_len]

print(dropna_dataset.shape)

41
41
     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...       SpMax_A  \
0      0   0      0   0  4   0    0      0     0   0  ...  2.805893e-14   
1      0   0      0   0  4   0    0      0     0   0  ...  6.371135e-03   
2      0   0      0   2  2   0    0      0     0   0  ...  3.818507e-01   
3      0   0      0   2  2   0    0      0     0   0  ...  5.995285e-01   
4      0   0      0   0  2   0    0      0     0   0  ... -1.693350e+00   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...           ...   
4559   0   0      0   0  2   0    0      0     0   0  ...  2.805893e-14   
4560   0   0      0   0  4   0    0      0     0   0  ...  5.062141e-01   
4561   0   0      4   0  4   0    0      0     0   0  ...  1.756083e+00   
4562   0   0      0   0  1   0    0      0     0   0  ... -9.546767e-02   
4563   0   0      0   1  0   0    0      0     0   0  ... -5.648479e-01   

      Psi_i_1d       SdO     TI2_L      nCrt       SpMax_B       Psi_i_A  \
0    -0.394796  1

In [3]:
print(total_dataset.columns)
print(dropna_test.columns)

Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN',
       'nArCOOR', 'SpMax_L', 'J_Dz(e)', 'F01', 'C', 'nCp', 'SdssC', 'HyWi_B',
       'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi', 'SpPosA_B', 'nCIR', 'SpMax_A',
       'Psi_i_1d', 'SdO', 'TI2_L', 'nCrt', 'SpMax_B', 'Psi_i_A', 'SM6_B', 'nX',
       'Biodegradable'],
      dtype='object')
Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01', 'F04', 'NssssC', 'nCb', 'C', 'nCp',
       'nO', 'F03', 'SdssC', 'HyWi_B', 'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi',
       'nN_N', 'nArNO2', 'nCRX3', 'SpPosA_B', 'nCIR', 'B01', 'B03', 'N_073',
       'SpMax_A', 'Psi_i_1d', 'B04', 'SdO', 'TI2_L', 'nCrt', 'C_026', 'F02_CN',
       'nHDon', 'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX',
       'Biodegradable'],
      dtype='object')


## Using Model with replaced values when NaN, and discarding the dropped NaN values dataset

In [4]:
X_train_total = dataset_train.drop(["Biodegradable"], axis=1)
y_train_total = dataset_train.Biodegradable
print(X_train_total)
print(y_train_total)

X_test_total = dataset_test.drop(["Biodegradable"], axis=1)
y_test_total = dataset_test.Biodegradable

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR  \
0      0   0      0   0  4   0    0      0     0   0  ... -0.302031   
1      0   0      0   0  4   0    0      0     0   0  ... -0.302031   
2      0   0      0   2  2   0    0      0     0   0  ...  0.117354   
3      0   0      0   2  2   0    0      0     0   0  ...  0.117354   
4      0   0      0   0  2   0    0      0     0   0  ... -0.302031   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...   
3418   0   0      0   1  1   0    0      0     0   0  ...  0.117354   
3419   0   0      0   0  0   0    0      0     0   0  ... -0.302031   
3420   0   0      0   0  2   1    0      0     0   0  ... -0.302031   
3421   1   3      0   1  2   3    0      0     0   0  ...  0.117354   
3422   0   0      0   1  1   0    0      0     0   0  ...  0.117354   

           SpMax_A  Psi_i_1d       SdO     TI2_L      nCrt       SpMax_B  \
0     2.805893e-14 -0.394796  1.045124 -0.154799 -0.100618  4.501572e-1

## Testing Random Forests for Feature Selection

In [5]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(X_train_total, y_train_total)

In [6]:

print(sel.get_support())
selected_feat= X_train_total.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[ True  True  True  True False  True False False False False False False
 False False  True False  True False  True False False False False False
 False False False False False False  True False  True False False False
 False False False  True  True]
12
Index(['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'nN', 'SpMax_L',
       'SpPosA_B', 'SpMax_A', 'SM6_B', 'nX'],
      dtype='object')


In [7]:
X_train_rf = X_train_total[X_train_total.columns[(sel.get_support())]]
X_test_rf = X_test_total[X_test_total.columns[(sel.get_support())]]

print(X_train_rf)

     nHM F04 NssssC nCb F03 F02_CN nN   SpMax_L  SpPosA_B       SpMax_A  \
0      0   0      0   0   0      0  0 -0.297450 -0.989182  2.805893e-14   
1      0   0      0   0   0      0  0  0.245742 -1.628388  6.371135e-03   
2      0   0      0   2   0      0  0  0.371020  0.224842  3.818507e-01   
3      0   0      0   2   0      0  0  0.441189  0.584927  5.995285e-01   
4      0   0      0   0   0      0  0 -0.945053 -1.685966 -1.693350e+00   
...   ..  ..    ...  ..  ..    ... ..       ...       ...           ...   
3418   0   0      0   1   0      0  0  0.162363  0.947616  3.374858e-01   
3419   0   0      0   0   0      0  0 -1.244933  0.086245  2.805893e-14   
3420   0   0      0   0   1      1  1 -0.706745 -0.173784 -1.155604e+00   
3421   1   3      0   1   3      2  1  1.702623  0.598555  1.082130e+00   
3422   0   0      0   1   0      0  0  0.262735  1.378860  4.087575e-01   

         SM6_B            nX  
0    -0.080245 -1.602862e-01  
1    -0.127823  2.612766e-17  
2    -

# Random Forest

## Testing a Random Forest Model for Classification

In [8]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train_rf, y_train_total)

preds = rf_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9649
The Precision is:  0.9893
The Recall is:  0.9685
The F1 score is:  0.9788
The Matthews correlation coefficient is:  0.8800

This is the Confusion Matrix
     0    1
0  179   10
1   30  922


## Optimizing Random Forest Model for Classification

In [22]:
def rf_optimization(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, target_classes: list = ["F1"]) -> Tuple[RandomForestClassifier, np.array]:
    
    assert "F1" in target_classes or "Accuracy" in target_classes or "Precision" in target_classes or "Recall" in target_classes or "MatthewsCorrelation" in target_classes

    n_estimators = [100,120,130,150,200]
    #criterion = ["gini"]
    criterion = ["gini","entropy","log_loss"]
    max_depth = [None]
    min_samples_split = [2]
    min_samples_leaf = [1]
    min_weight_fraction_leaf = [0]
    #max_features = ["sqrt"]
    max_features = ["sqrt","log2", None]
    max_leaf_nodes = [None]
    min_impurity_decrease = [0]
    bootstrap = [True]
    oob_score = [False]
    n_jobs = [None]
    random_state = [None]
    verbose = [0]
    warm_start = [False]
    class_weight = [None]
    ccp_alpha = [0]
    max_samples = [None]

    rf_hyperparams = itertools.product(
                                        n_estimators,
                                        criterion,
                                        max_depth,
                                        min_samples_split,
                                        min_samples_leaf,
                                        min_weight_fraction_leaf,
                                        max_features,
                                        max_leaf_nodes,
                                        min_impurity_decrease,
                                        bootstrap,
                                        oob_score,
                                        n_jobs,
                                        random_state,
                                        verbose,
                                        warm_start,
                                        class_weight,
                                        ccp_alpha,
                                        max_samples
                                    )

    best_fitness = -99999
    best_model = None
    best_hyper_params = None

    for hyper_param in rf_hyperparams:

        n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = hyper_param

        model = RandomForestClassifier(
                        n_estimators=n_estimators,
                        criterion=criterion,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        min_weight_fraction_leaf=min_weight_fraction_leaf,
                        max_features=max_features,
                        max_leaf_nodes=max_leaf_nodes,
                        min_impurity_decrease=min_impurity_decrease,
                        bootstrap=bootstrap,
                        oob_score=oob_score,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        verbose=verbose,
                        warm_start=warm_start,
                        class_weight=class_weight,
                        ccp_alpha=ccp_alpha,
                        max_samples=max_samples
                    )

        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        truths = y_test.to_numpy()

        fitness = 0

        for target_class in target_classes:
            if target_class == "F1":
                fitness += f1_score(truths, preds, pos_label='RB')
            elif target_class == "Accuracy":
                fitness += accuracy_score(truths, preds)
            elif target_class == "Precision":
                fitness += precision_score(truths, preds, pos_label='RB')
            elif target_class == "Recall":
                fitness += recall_score(truths, preds, pos_label='RB')
            elif target_class == "MatthewsCorrelation":
                fitness += matthews_corrcoef(truths, preds)
            else:
                raise Exception(f"Unkown Metric {target_class}")
        
        fitness = fitness / len(target_classes)

        if fitness > best_fitness:
            best_model = model
            best_fitness = fitness
            best_hyper_params = hyper_param
        elif fitness == best_fitness:
            print(str(fitness) + "---" + str(hyper_param))

    
    print(f"Max Fitness = {str(fitness)} using the mean of {str(target_classes)}")

    n_estimators,\
            criterion,\
            max_depth,\
            min_samples_split,\
            min_samples_leaf,\
            min_weight_fraction_leaf,\
            max_features,\
            max_leaf_nodes,\
            min_impurity_decrease,\
            bootstrap,\
            oob_score,\
            n_jobs,\
            random_state,\
            verbose,\
            warm_start,\
            class_weight,\
            ccp_alpha,\
            max_samples\
                = best_hyper_params

    best_hyper_params_dict = {
        "n_estimators": n_estimators,
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "bootstrap": bootstrap,
        "oob_score": oob_score,
        "n_jobs": n_jobs,
        "random_state": random_state,
        "verbose": verbose,
        "warm_start": warm_start,
        "class_weight": class_weight,
        "ccp_alpha": ccp_alpha,
        "max_samples": max_samples
    }

    print(f"Best Hyperparams = {str(best_hyper_params_dict)}")
    return best_model, best_fitness, best_hyper_params_dict


model, fitness, hyper_params_dict = rf_optimization(X_train_rf, y_train_total, X_test_rf, y_test_total, ["F1", "MatthewsCorrelation"])

preds = model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

0.9293193711555747---(130, 'entropy', None, 2, 1, 0, 'sqrt', None, 0, True, False, None, None, 0, False, None, 0, None)
0.9293193711555747---(150, 'gini', None, 2, 1, 0, 'sqrt', None, 0, True, False, None, None, 0, False, None, 0, None)
0.9293193711555747---(200, 'gini', None, 2, 1, 0, 'sqrt', None, 0, True, False, None, None, 0, False, None, 0, None)
Max Fitness = 0.9165753789651077 using the mean of ['F1', 'MatthewsCorrelation']
Best Hyperparams = {'n_estimators': 120, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0, 'max_features': 'log2', 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'ccp_alpha': 0, 'max_samples': None}
The Accuracy is:  0.9649
The Precision is:  0.9903
The Recall is:  0.9675
The F1 score is:  0.9788
The Matthews correlation coefficient is:  0.8798

This is

# Support Vector Machines