In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mglearn
from sklearn.tree import plot_tree

In [None]:
# --- Data Loading and Preprocessing ---
# User configuration: point `base_path` at the folder containing the workbook.
base_path = 'your_data_directory_here/' # Set your preferred target data directory
file_name = 'Radical_Analysis.xlsx'

# All figures and tables produced later are written under <base_path>/RF;
# the folder is created up front if it does not exist yet.
rf_path = os.path.join(base_path, 'RF')
os.makedirs(rf_path, exist_ok=True)

# Load the workbook with pandas' default read_excel settings.
file_path = os.path.join(base_path, file_name)
df = pd.read_excel(file_path)

# Predictor columns and the regression target, selected by column label.
X = df.loc[:, ['NH2', 'NH', 'N-(C)3', 'OH', 'CHO', 'COOH', 'Size']]
y = df.loc[:, 'r']

# Shuffled hold-out split: 33% of the rows go to the test set, seeded for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

In [None]:
# --- Hyperparameter Tuning via Grid Search ---
# Exhaustive search over forest size and tree-shape settings, scored by
# negative MSE under 5-fold shuffled cross-validation on the training split.
rf = RandomForestRegressor(random_state=42)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# 19 x 18 x 9 x 4 x 1 = 12,312 candidate settings, each fitted once per fold,
# so this cell is by far the slowest in the notebook.
param_grid = dict(
    n_estimators=range(10, 101, 5),
    max_depth=range(3, 21),
    min_samples_split=range(2, 11),
    min_samples_leaf=range(3, 11, 2),
    bootstrap=[True],
)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning grid-search configuration. The search was scored with
# 'neg_mean_squared_error', so the printed score is a negative MSE: values
# closer to zero are better.
best_params = grid_search.best_params_
print('Best Parameters: {}'.format(best_params))
print('Best Scores: {}'.format(grid_search.best_score_))

In [None]:
# --- Parameter Sensitivity Analysis (n_estimators vs max_depth) ---
# Sweep the two hyperparameters that are later chosen by hand, recording for
# every (n_estimators, max_depth) pair:
#   CVS_matrix  - mean cross-validated R2 on the training split
#   rmse_matrix - RMSE on the held-out test split
#   R2_matrix   - R2 on the held-out test split
# NOTE(review): the test-split matrices feed the manual model selection made
# further down, so the test metrics are not a fully independent estimate.
n_estimators_range = range(10, 101, 5)
max_depth_range = range(3, 21)

# Rows follow n_estimators, columns follow max_depth.
grid_shape = (len(n_estimators_range), len(max_depth_range))
CVS_matrix = np.zeros(grid_shape)
rmse_matrix = np.zeros(grid_shape)
R2_matrix = np.zeros(grid_shape)

# Hyperparameters that are not swept stay at their grid-search optimum.
fixed_params = {
    name: best_params[name]
    for name in ('min_samples_split', 'min_samples_leaf', 'bootstrap')
}

for row, n_estimators in enumerate(n_estimators_range):
    for col, max_depth in enumerate(max_depth_range):
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            **fixed_params,
        )

        # Cross-validated R2 on the training split, using the same folds as the grid search.
        fold_scores = cross_val_score(rf, X_train, y_train, cv=kfold, n_jobs=-1, scoring='r2')
        CVS_matrix[row, col] = fold_scores.mean()

        # Refit on the whole training split and score on the held-out test split.
        rf.fit(X_train, y_train)
        y_test_pred = rf.predict(X_test)

        rmse_matrix[row, col] = np.sqrt(mean_squared_error(y_test, y_test_pred))
        R2_matrix[row, col] = r2_score(y_test, y_test_pred)

In [None]:
# --- Performance Visualization (Heatmaps) ---
# Tick labels reused by the heatmap cells: columns are max_depth values,
# rows are n_estimators values.
xticks = max_depth_range
yticks = n_estimators_range

# Mean cross-validated R2 on the training split for every grid point.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    CVS_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('CVS Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

save_path = os.path.join(rf_path, 'CVS_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# Test-split RMSE for every (n_estimators, max_depth) grid point.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    rmse_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('RMSE Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

save_path = os.path.join(rf_path, 'RMSE_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# Test-split R2 for every (n_estimators, max_depth) grid point.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    R2_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('R2_Test Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

save_path = os.path.join(rf_path, 'R2_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# --- Model Interpretation: Feature Importance ---
# n_estimators and max_depth are selected manually from the three heatmaps
# computed earlier (cross-validated R2, test RMSE, test R2), picking a grid
# point that neither overfits nor underfits. Edit the two values below after
# inspecting the heatmaps.
selected_n_estimators = 15
selected_max_depth = 4

# The remaining hyperparameters are read from the grid-search result, exactly
# as in the sensitivity sweep, so the final model matches the configuration
# the heatmaps were computed with.
best_rf = RandomForestRegressor(n_estimators=selected_n_estimators,
                                max_depth=selected_max_depth,
                                min_samples_split=best_params['min_samples_split'],
                                min_samples_leaf=best_params['min_samples_leaf'],
                                bootstrap=best_params['bootstrap'],
                                random_state=42)
best_rf.fit(X_train, y_train)

# Importances of the fitted forest, in the same order as the columns of X.
feature_importance = best_rf.feature_importances_
features = X.columns

print('feature importance: {}'.format(feature_importance))

# Horizontal bar chart with one hand-picked colour per feature.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.barplot(
    x=feature_importance,
    y=features,
    palette=['#fcbba1', '#fff5f0', '#cb181d', '#fee0d2', '#fc9272', '#ef3b2c', '#fb6a4a'],
    ax=ax,
)
ax.set_title('Feature Importance')
save_path = os.path.join(rf_path, 'Feature_Importance.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# --- Model Visualization: Tree Structure ---
# Array copy of the importances; the Excel export step reads this name.
feature_importance_matrix = np.array(feature_importance)

# Draw the first tree of the fitted forest on a very wide canvas so that the
# node labels have room.
fig, ax = plt.subplots(figsize=(60, 15), dpi=600)
first_tree = best_rf.estimators_[0]
plot_tree(first_tree, filled=True, feature_names=features.tolist(), ax=ax)
ax.set_title('Visualization of One Decision Tree from Random Forest', fontsize=50)

save_path = os.path.join(rf_path, 'Visualization of One Decision Tree from Random Forest.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# --- Export Results to Excel ---
# Write the three sensitivity matrices and the feature importances to one
# workbook, one sheet each. The matrices are labelled with n_estimators on
# the rows and max_depth on the columns.
grid_axes = dict(index=n_estimators_range, columns=max_depth_range)
cvs_matrix_df = pd.DataFrame(CVS_matrix, **grid_axes)
rmse_matrix_df = pd.DataFrame(rmse_matrix, **grid_axes)
r2_matrix_df = pd.DataFrame(R2_matrix, **grid_axes)

# Build the importance table here instead of relying on a variable assigned
# in the tree-plot cell, so the export still runs when that cell is skipped.
feature_importance_matrix = np.array(feature_importance)
feature_importance_matrix_df = pd.DataFrame(feature_importance_matrix, index=features)

excel_path = os.path.join(rf_path, 'RF_Output.xlsx')

with pd.ExcelWriter(excel_path) as writer:
    cvs_matrix_df.to_excel(writer, index=True, sheet_name='CVS_matrix')
    rmse_matrix_df.to_excel(writer, index=True, sheet_name='rmse_matrix')
    r2_matrix_df.to_excel(writer, index=True, sheet_name='R2_matrix')
    # header=False suppresses the meaningless "0" column label; the row index
    # carries the feature names.
    feature_importance_matrix_df.to_excel(
        writer,
        index=True,
        sheet_name='feature_importance_matrix',
        header=False,
    )