In [1]:
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib

# Data preparation

In [4]:
dataset_root_path = "../1_convertToMLdata/"
trainVal_dataset_path = os.path.join(dataset_root_path, 'trainVal_dataset.csv')
test_dataset_path = os.path.join(dataset_root_path, 'test_dataset.csv')

# file path for trainVal SMILES
dataset_SMILES_path = "../0_splitData/"
trainVal_SMILES_path = os.path.join(dataset_SMILES_path, 'output_trainset_uniqueSMILES.xlsx')

In [None]:
trainVal_dataset_df = pd.read_csv(trainVal_dataset_path)
test_dataset_df = pd.read_csv(test_dataset_path)

# Read trainVal SMILES
trainVal_SMILES_df = pd.read_excel(trainVal_SMILES_path)

In [None]:
# Get train/val indices stratifiedShuffleSplit by SMILES functional group

splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.15, random_state=14)
custom_indices = []

for train_idx, val_idx in splitter.split(trainVal_SMILES_df, trainVal_SMILES_df['Class_by_SMARTS_combineRare']):
    train_SMILES_set = trainVal_SMILES_df.iloc[train_idx]
    val_SMILES_set = trainVal_SMILES_df.iloc[val_idx]

    train_indices = trainVal_dataset_df[
        trainVal_dataset_df['SMILES'].isin(train_SMILES_set['SMILES'])
    ].index.tolist()
    val_indices = trainVal_dataset_df[
        trainVal_dataset_df['SMILES'].isin(val_SMILES_set['SMILES'])
    ].index.tolist()

    custom_indices.append((train_indices, val_indices))

print("Splits number: ", len(custom_indices))

Splits number:  3


In [46]:
print("Train size for one split: ", len(train_indices))
print("Val size for one split:", len(val_indices))
print("Val SMILES number: ", val_SMILES_set.shape[0])

Train size for one split:  5990
Val size for one split: 1591
Val SMILES number:  42


In [47]:
# Split x and y

x_trainVal_df = trainVal_dataset_df.iloc[:, 0:1860]
y_trainVal_df = trainVal_dataset_df.iloc[:, 1860]
x_test_df = test_dataset_df.iloc[:, 0:1860]
y_test_df = test_dataset_df.iloc[:, 1860]

# Convert df to numpy array
x_trainVal = x_trainVal_df.to_numpy()
y_trainVal = y_trainVal_df.to_numpy()
x_test = x_test_df.to_numpy()
y_test = y_test_df.to_numpy()

print("x_trainVal shape: ", x_trainVal.shape)
print("y_trainVal shape: ", y_trainVal.shape)
print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

x_trainVal shape:  (7581, 1860)
y_trainVal shape:  (7581,)
x_test shape:  (494, 1860)
y_test shape:  (494,)


In [48]:
x_trainVal_df.head(3)

Unnamed: 0,CircularFP_0,CircularFP_1,CircularFP_2,CircularFP_3,CircularFP_5,CircularFP_6,CircularFP_7,CircularFP_8,CircularFP_9,CircularFP_10,...,Melting_point_K,Boiling_point_K,Density_g/cm3,First_IE_kJ/mol,Second_IE_kJ/mol,Third_IE_kJ/mol,Matallic_radius_nm,Pauling_EN,Ionic_radius_nm,Oxidation_state
0,1,1,0,0,0,0,0,0,0,0,...,1193.15,3737.15,6.162,538.1,1067.0,1850.3,0.162,1.1,0.1172,3
1,1,1,0,0,0,0,0,0,0,0,...,1068.15,3633.15,6.77,534.4,1046.9,1948.8,0.1818,1.12,0.115,3
2,1,1,0,0,0,0,0,0,0,0,...,1208.15,3563.15,6.77,528.1,1017.9,2086.4,0.1824,1.13,0.113,3


# Build and train the model

In [None]:
# Initialize the classifier
clfRFmodel = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 15, 20, 25], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(
    clfRFmodel, 
    param_grid,
    scoring='accuracy',
    cv=custom_indices,
    refit=True,
    return_train_score=True,
    n_jobs=-1
)

grid_search.fit(x_trainVal, y_trainVal)

  _data = np.array(data, dtype=dtype, copy=copy,


In [50]:
# Save and print the training results

cv_results_df = pd.DataFrame(grid_search.cv_results_)
cv_results_sort_df = cv_results_df.sort_values(by='mean_test_score', ascending=False)

# Print top 5 models
top_5_models_df = cv_results_sort_df.head(5)
print("Top 10 models:")
for index, row in top_5_models_df.iterrows():
    print(f"Rank {index+1}:")
    print(f"Parameters: {row['params']}")
    print(f"Mean Train Score: {row['mean_train_score']}")
    print(f"Mean Test Score: {row['mean_test_score']}")
    print("-" * 50)

# Save top 100 models
cv_results_sort_df.head(100).to_excel('cv_results_sort.xlsx', index=False)

Top 10 models:
Rank 76:
Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Train Score: 0.9522128485106472
Mean Test Score: 0.6423290115554116
--------------------------------------------------
Rank 110:
Parameters: {'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Mean Train Score: 0.960588736001268
Mean Test Score: 0.638574666193343
--------------------------------------------------
Rank 75:
Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Train Score: 0.9513762482189545
Mean Test Score: 0.6382390469469003
--------------------------------------------------
Rank 112:
Parameters: {'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mean Train Score: 0.9621531179178983
Mean Test Score: 0.6358089441887289
--------------------------------------------------
Rank 86:
Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 

In [None]:
print("Best parameters found: ", grid_search.best_params_)

# Get and save best estimator
best_rf = grid_search.best_estimator_
model_filename = 'rf_model.sav'
joblib.dump(best_rf, model_filename)
print(f'Model saved as {model_filename}')

Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


Model saved as rf_model.sav


# Evaluate the model

In [52]:
# Load the saved model

rf_model_name = "rf_model.sav"
loaded_model = joblib.load(rf_model_name)

In [53]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Test set evaluation

In [54]:
y_test_pred = loaded_model.predict(x_test)

# Print eval
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[[113  51   3]
 [ 45 180  14]
 [  4  58  26]]
              precision    recall  f1-score   support

           0       0.70      0.68      0.69       167
           1       0.62      0.75      0.68       239
           2       0.60      0.30      0.40        88

    accuracy                           0.65       494
   macro avg       0.64      0.58      0.59       494
weighted avg       0.64      0.65      0.63       494



In [55]:
# Save eval report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred, output_dict=True)

conf_matrix_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1', 'Actual 2'], columns=['Predicted 0', 'Predicted 1', 'Predicted 2'])
class_report_df = pd.DataFrame(class_report).transpose()

with pd.ExcelWriter('metrics_output.xlsx') as writer:
    conf_matrix_df.to_excel(writer, sheet_name='Confusion Matrix')
    class_report_df.to_excel(writer, sheet_name='Classification Report')