In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score
def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

In [18]:
initial_dataset = pd.read_csv("biodegradable_a.csv").sample(frac=1).reset_index(drop=True)
total_len, _ = initial_dataset.shape

# NOTE - NO INDEPENDENT VALIDATION SET !!!

# Total with means
#categorical = ['int16', 'int32', 'int64']
#biodegradable = ['object']
#numerical = ['float16', 'float32', 'float64']

class_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 == 0).all()]
num_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 != 0).any()]

print(len(class_cols) + len(num_cols))
print(len(initial_dataset.drop("Biodegradable", axis=1).columns))

total_categorical_dataset = initial_dataset[class_cols]
#total_categorical_dataset = total_categorical_dataset.fillna(total_categorical_dataset.mode())
total_categorical_dataset = total_categorical_dataset.fillna(-1)
total_categorical_dataset = total_categorical_dataset.astype(int).astype(object).astype(str)
#print(total_categorical_dataset)

total_numerical_dataset = initial_dataset[num_cols]
total_numerical_dataset = total_numerical_dataset.fillna(total_numerical_dataset.mean())

total_biodegradable = initial_dataset["Biodegradable"]
#total_biodegradable = initial_dataset.select_dtypes(include=biodegradable)
#total_biodegradable = total_biodegradable.fillna("")

# Scale numerical data
# https://scikit-learn.org/stable/modules/preproce
#print(total_numerical_dataset)

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

total_numerical_dataset = pd.DataFrame(scaler.fit_transform(total_numerical_dataset),
             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)


#total_numerical_dataset = pd.DataFrame(normalize(total_numerical_dataset, norm='l2', axis=1, copy=True, return_norm=False),
#             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)

#print(total_numerical_dataset)
#
total_dataset = pd.concat([total_categorical_dataset, total_numerical_dataset,total_biodegradable], axis=1)
#total_dataset.dropna(0)
print(total_dataset)

total_len, _ = total_dataset.shape
train_dataset_len = round(total_len * 0.75)

dataset_train = total_dataset[0:train_dataset_len]
dataset_test = total_dataset[train_dataset_len:total_len]

print(total_dataset.shape)

# Removal of None/NaN vals
dropna_dataset = initial_dataset.dropna()

dropna_len, _ = dropna_dataset.shape
model_dropna_len = round(total_len * 0.75)

dropna_train = dropna_dataset[0:model_dropna_len]
dropna_test = dropna_dataset[model_dropna_len:dropna_len]

print(dropna_dataset.shape)

41
41
     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...       SpMax_A  \
0      0   0      0   0  4   0    0      0     0   0  ...  8.912254e-01   
1      0   0      0   0  1   0    0      0     0   0  ...  1.169122e-14   
2      0   0      0   0  3   0    0      0     0   0  ...  1.169122e-14   
3      0   3      0   4  2   3    0      0     0   0  ...  2.278254e+00   
4      0   0      0   3  3   0    0      0     0   0  ...  6.275332e-01   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...           ...   
4559   0   0      0   0  2   0    0      0     0   0  ...  1.169122e-14   
4560   0   0      0   0  2   0    0      0     0   0  ...  1.169122e-14   
4561   0   0      0   2  4   0    0      0     0   0  ...  9.180815e-01   
4562   0   0      0   2  4   0    0      0     0   0  ...  1.217134e+00   
4563   0   0      0   0  1   0    0      0     0   0  ...  4.831751e-01   

      Psi_i_1d           SdO     TI2_L      nCrt       SpMax_B   Psi_i_A  \
0     0.046197  2

In [19]:
print(total_dataset.columns)
print(dropna_test.columns)

Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN',
       'nArCOOR', 'SpMax_L', 'J_Dz(e)', 'F01', 'C', 'nCp', 'SdssC', 'HyWi_B',
       'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi', 'SpPosA_B', 'nCIR', 'SpMax_A',
       'Psi_i_1d', 'SdO', 'TI2_L', 'nCrt', 'SpMax_B', 'Psi_i_A', 'SM6_B', 'nX',
       'Biodegradable'],
      dtype='object')
Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01', 'F04', 'NssssC', 'nCb', 'C', 'nCp',
       'nO', 'F03', 'SdssC', 'HyWi_B', 'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi',
       'nN_N', 'nArNO2', 'nCRX3', 'SpPosA_B', 'nCIR', 'B01', 'B03', 'N_073',
       'SpMax_A', 'Psi_i_1d', 'B04', 'SdO', 'TI2_L', 'nCrt', 'C_026', 'F02_CN',
       'nHDon', 'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX',
       'Biodegradable'],
      dtype='object')


## Using Model with replaced values when NaN, and discarding the dropped NaN values dataset

In [20]:
X_train_total = dataset_train.drop(["Biodegradable"], axis=1)
y_train_total = dataset_train.Biodegradable
print(X_train_total)
print(y_train_total)

X_test_total = dataset_test.drop(["Biodegradable"], axis=1)
y_test_total = dataset_test.Biodegradable

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR  \
0      0   0      0   0  4   0    0      0     0   0  ...  0.117354   
1      0   0      0   0  1   0    0      0     0   0  ... -0.302031   
2      0   0      0   0  3   0    0      0     0   0  ... -0.302031   
3      0   3      0   4  2   3    0      0     0   0  ...  2.214277   
4      0   0      0   3  3   0    0      0     0   0  ...  0.117354   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...   
3418   5   0      0   0  3   1    0      0     0   0  ...  0.117354   
3419   1   0      0   0  0   0    0      0     0   0  ...  0.117354   
3420   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3421   3   3      0   4  4   5    0      0     0   0  ...  0.536738   
3422   0   0      0   0  2   0    0      0     0   0  ... -0.302031   

           SpMax_A  Psi_i_1d       SdO     TI2_L      nCrt       SpMax_B  \
0     8.912254e-01  0.046197  2.933423  0.542122 -0.100618 -9.983386e-0

## Testing Random Forests for Feature Selection

In [21]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(X_train_total, y_train_total)

In [22]:

print(sel.get_support())
selected_feat= X_train_total.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[ True  True  True  True False  True False False False False False False
 False False  True False False False  True False False False False False
 False False  True False False False  True False False False False False
 False  True False  True  True]
12
Index(['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'SpMax_L', 'SM6_L',
       'SpPosA_B', 'SpMax_B', 'SM6_B', 'nX'],
      dtype='object')


In [23]:
X_train_rf = X_train_total[X_train_total.columns[(sel.get_support())]]
X_test_rf = X_test_total[X_test_total.columns[(sel.get_support())]]

print(X_train_rf)

     nHM F04 NssssC nCb F03 F02_CN   SpMax_L     SM6_L  SpPosA_B  \
0      0   0      0   0   0      0  0.710478  1.498196 -0.241791   
1      0   0      0   0   0      0 -1.539955 -1.782914 -0.110611   
2      0   0      0   0   0      0 -0.575178 -0.104105 -0.800415   
3      0   3      0   4   3      2  1.798886  1.727435  1.323393   
4      0   0      0   3   0      0  0.466709  0.357760  0.251109   
...   ..  ..    ...  ..  ..    ...       ...       ...       ...   
3418   5   0      0   0   1      2  1.556979  1.360117  1.433179   
3419   1   0      0   0   0      0 -0.290619 -0.497948  0.361870   
3420   0   0      0   0   0      0 -1.054234 -0.879713 -0.917676   
3421   3   3      0   4   5      8  1.748396  1.664892  0.909982   
3422   0   0      0   0   0      0 -0.227143 -0.413814  0.652836   

           SpMax_B     SM6_B            nX  
0    -9.983386e-04  0.929544 -1.602862e-01  
1    -8.767777e-01 -1.268642 -1.602862e-01  
2    -4.909760e-01 -0.210655 -1.602862e-01  
3  

## Testing a Random Forest Model for Classification

In [24]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train_rf, y_train_total)

preds = rf_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9535
The Precision is:  0.9844
The Recall is:  0.9615
The F1 score is:  0.9728
The Matthews correlation coefficient is:  0.8165

This is the Confusion Matrix
     0    1
0  140   15
1   38  948
