In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, accuracy_score
def printClassResults(truth, preds):
    print("The Accuracy is: %7.4f" % accuracy_score(truth, preds))
    print("The Precision is: %7.4f" % precision_score(truth, preds, pos_label='RB'))
    print("The Recall is: %7.4f" % recall_score(truth, preds, pos_label='RB'))
    print("The F1 score is: %7.4f" % f1_score(truth, preds, pos_label='RB'))
    print("The Matthews correlation coefficient is: %7.4f" % matthews_corrcoef(truth, preds))
    print()
    print("This is the Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(truth, preds)))

In [2]:
initial_dataset = pd.read_csv("biodegradable_a.csv").sample(frac=1).reset_index(drop=True)
total_len, _ = initial_dataset.shape

# NOTE - NO INDEPENDENT VALIDATION SET !!!

# Total with means
#categorical = ['int16', 'int32', 'int64']
#biodegradable = ['object']
#numerical = ['float16', 'float32', 'float64']

class_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 == 0).all()]
num_cols = [col for col in initial_dataset.drop("Biodegradable", axis=1) if initial_dataset[col].apply(lambda x: x % 1 != 0).any()]

print(len(class_cols) + len(num_cols))
print(len(initial_dataset.drop("Biodegradable", axis=1).columns))

total_categorical_dataset = initial_dataset[class_cols]
#total_categorical_dataset = total_categorical_dataset.fillna(total_categorical_dataset.mode())
total_categorical_dataset = total_categorical_dataset.fillna(-1)
total_categorical_dataset = total_categorical_dataset.astype(int).astype(object).astype(str)
#print(total_categorical_dataset)

total_numerical_dataset = initial_dataset[num_cols]
total_numerical_dataset = total_numerical_dataset.fillna(total_numerical_dataset.mean())

total_biodegradable = initial_dataset["Biodegradable"]
#total_biodegradable = initial_dataset.select_dtypes(include=biodegradable)
#total_biodegradable = total_biodegradable.fillna("")

# Scale numerical data
# https://scikit-learn.org/stable/modules/preproce
#print(total_numerical_dataset)

scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

total_numerical_dataset = pd.DataFrame(scaler.fit_transform(total_numerical_dataset),
             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)


#total_numerical_dataset = pd.DataFrame(normalize(total_numerical_dataset, norm='l2', axis=1, copy=True, return_norm=False),
#             columns=total_numerical_dataset.columns, index=total_numerical_dataset.index)

#print(total_numerical_dataset)
#
total_dataset = pd.concat([total_categorical_dataset, total_numerical_dataset,total_biodegradable], axis=1)
#total_dataset.dropna(0)
print(total_dataset)

total_len, _ = total_dataset.shape
train_dataset_len = round(total_len * 0.75)

dataset_train = total_dataset[0:train_dataset_len]
dataset_test = total_dataset[train_dataset_len:total_len]

print(total_dataset.shape)

# Removal of None/NaN vals
dropna_dataset = initial_dataset.dropna()

dropna_len, _ = dropna_dataset.shape
model_dropna_len = round(total_len * 0.75)

dropna_train = dropna_dataset[0:model_dropna_len]
dropna_test = dropna_dataset[model_dropna_len:dropna_len]

print(dropna_dataset.shape)

41
41
     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...       SpMax_A  \
0      0   0      0   0  2   0    0      0     0   0  ... -7.745865e-03   
1      0   0      0   2  4   0    0      0     0   0  ...  1.048795e+00   
2      0   0      1   0  0   0    0      0     0   0  ... -6.396091e-01   
3      0   0      0   0  4   0    0      0     0   0  ... -3.667911e-01   
4      0   1      0   1  0   2    0      0     0   0  ...  6.029915e-01   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...           ...   
4559   0   0      0   0  2   0    0      0     0   0  ...  3.867406e-01   
4560   0   4      0   4  5   4    0      2     0   0  ...  1.870595e-14   
4561   0   0      0   1  0   0    0      0     0   0  ... -1.256225e-01   
4562   0   2      0   2  2   3    0      1     0   0  ...  8.767849e-01   
4563   0   0      0   2  4   0    0      0     0   0  ...  1.208496e+00   

      Psi_i_1d           SdO     TI2_L      nCrt       SpMax_B       Psi_i_A  \
0    -0.44720

  total_dataset.dropna(0)


In [3]:
print(total_dataset.columns)
print(dropna_test.columns)

Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN',
       'nArCOOR', 'SpMax_L', 'J_Dz(e)', 'F01', 'C', 'nCp', 'SdssC', 'HyWi_B',
       'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi', 'SpPosA_B', 'nCIR', 'SpMax_A',
       'Psi_i_1d', 'SdO', 'TI2_L', 'nCrt', 'SpMax_B', 'Psi_i_A', 'SM6_B', 'nX',
       'Biodegradable'],
      dtype='object')
Index(['SpMax_L', 'J_Dz(e)', 'nHM', 'F01', 'F04', 'NssssC', 'nCb', 'C', 'nCp',
       'nO', 'F03', 'SdssC', 'HyWi_B', 'LOC', 'SM6_L', 'F03_CO', 'Me', 'Mi',
       'nN_N', 'nArNO2', 'nCRX3', 'SpPosA_B', 'nCIR', 'B01', 'B03', 'N_073',
       'SpMax_A', 'Psi_i_1d', 'B04', 'SdO', 'TI2_L', 'nCrt', 'C_026', 'F02_CN',
       'nHDon', 'SpMax_B', 'Psi_i_A', 'nN', 'SM6_B', 'nArCOOR', 'nX',
       'Biodegradable'],
      dtype='object')


## Using Model with replaced values when NaN, and discarding the dropped NaN values dataset

In [4]:
X_train_total = dataset_train.drop(["Biodegradable"], axis=1)
y_train_total = dataset_train.Biodegradable
print(X_train_total)
print(y_train_total)

X_test_total = dataset_test.drop(["Biodegradable"], axis=1)
y_test_total = dataset_test.Biodegradable

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR  \
0      0   0      0   0  2   0    0      0     0   0  ...  0.000000   
1      0   0      0   2  4   0    0      0     0   0  ...  0.117354   
2      0   0      1   0  0   0    0      0     0   0  ... -0.302031   
3      0   0      0   0  4   0    0      0     0   0  ...  0.000000   
4      0   1      0   1  0   2    0      0     0   0  ...  0.117354   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...   
3418   0   0      0   0  4   0    0      0     0   0  ... -0.302031   
3419   0   0      0   0  2   0    0      0     0   0  ... -0.302031   
3420   0   0      0   1  1   0    0      0     0   0  ...  0.117354   
3421   0   0      0   0  1   0    0      0     0   0  ...  0.000000   
3422   0   0      0   0  0   0    0      0     0   0  ...  0.000000   

           SpMax_A  Psi_i_1d           SdO     TI2_L      nCrt       SpMax_B  \
0    -7.745865e-03 -0.447209  7.976115e-02 -0.493791 -0.100618 -3.5

## Testing Random Forests for Feature Selection

In [5]:
# https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), max_features=12) # when no max_features are specified, seems to vary between 12 and 15
sel.fit(X_train_total, y_train_total)

In [6]:

print(sel.get_support())
selected_feat= X_train_total.columns[(sel.get_support())]
print(len(selected_feat))
print(selected_feat)

[ True  True  True  True False  True False False False False False False
 False False  True False  True False  True False False False False False
 False False  True False False False  True False False False False False
 False False False  True  True]
12
Index(['nHM', 'F04', 'NssssC', 'nCb', 'F03', 'F02_CN', 'nN', 'SpMax_L',
       'SM6_L', 'SpPosA_B', 'SM6_B', 'nX'],
      dtype='object')


In [7]:
X_train_rf = X_train_total[X_train_total.columns[(sel.get_support())]]
X_test_rf = X_test_total[X_test_total.columns[(sel.get_support())]]

print(X_train_rf)

     nHM F04 NssssC nCb F03 F02_CN nN   SpMax_L     SM6_L  SpPosA_B     SM6_B  \
0      0   0      0   0   0      0  0  0.647035  0.059398 -0.576764 -0.532966   
1      0   0      0   2   0      0  0  0.856252  0.786146  0.065220  0.535403   
2      0   0      1   0   0      3  1  0.928629  0.059824 -0.310812 -1.230161   
3      0   0      0   0   0      0  0 -0.318002 -0.014615 -0.892347 -0.083656   
4      0   1      0   1   2      4  1  0.549954  0.229274  0.623469 -0.078910   
...   ..  ..    ...  ..  ..    ... ..       ...       ...       ...       ...   
3418   0   0      0   0   0      0  0 -0.216163  0.867787 -0.423026  0.602067   
3419   0   0      0   0   0      0  0 -0.748765 -1.086930 -0.932843 -0.882881   
3420   0   0      0   1   0      0  0  0.121582  0.035307  0.520911 -0.136099   
3421   0   0      0   0   0      0  0 -0.173122 -0.291635 -0.237572 -0.615503   
3422   0   0      0   0   0      0  0 -1.358747 -0.850887  0.006335 -1.096266   

                nX  
0    -

## Testing a Random Forest Model for Classification

In [16]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train_rf, y_train_total)

preds = rf_model.predict(X_test_rf)
truths = y_test_total.to_numpy()

printClassResults(preds,truths)

The Accuracy is:  0.9544
The Precision is:  0.9822
The Recall is:  0.9641
The F1 score is:  0.9731
The Matthews correlation coefficient is:  0.8261

This is the Confusion Matrix
     0    1
0  149   17
1   35  940
