In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance, plot_tree

In [None]:
# --- Data Loading and Preprocessing ---
# User configuration: the folder that holds the input workbook, and the workbook's file name.
base_path = 'your_data_directory_here/'  # Set your preferred target data directory
file_name = 'Radical_Analysis.xlsx'

# Every figure and table written later goes to <base_path>/XGBoost.
# The folder is created up front; exist_ok=True makes this a no-op when it is already there.
xgbr_path = os.path.join(base_path, 'XGBoost')
os.makedirs(xgbr_path, exist_ok=True)

# Read the workbook into a DataFrame.
file_path = os.path.join(base_path, file_name)
df = pd.read_excel(file_path)

# Predictor columns and the regression target column.
feature_columns = ['NH2', 'NH', 'N-(C)3', 'OH', 'CHO', 'COOH', 'Size']
target_column = 'r'
X = df[feature_columns]
y = df[target_column]

# Hold out 30% of the rows as a test set; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# --- Hyperparameter Tuning via Grid Search ---
# Base estimator; GridSearchCV overrides its hyperparameters for every candidate in the grid.
xgbr = xgb.XGBRegressor(random_state=42)

# A single shuffled, seeded 5-fold splitter. It is used for the grid search here and for the
# sensitivity analysis later, so both stages score candidates on the same fold definition.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid: 19 * 8 * 5 * 3 * 3 * 5 = 34,200 candidates, each fitted once per fold.
# Expect a long run time; shrink the ranges for a quick trial run.
param_grid = {
    'n_estimators': range(10, 101, 5),
    'max_depth': range(3, 11),
    'objective': ['reg:squarederror'],
    'gamma': [0],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
}

grid_search = GridSearchCV(
    estimator=xgbr,
    param_grid=param_grid,
    cv=kfold,  # reuse the splitter above instead of an integer fold count
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
# y_train is a 1-D pandas Series and can be passed as-is; Series.ravel() is deprecated in
# recent pandas releases and only added a FutureWarning here.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')
# The score is the mean *negative* MSE across folds, so values closer to zero are better.
print(f'Best Score: {grid_search.best_score_}')

In [None]:
# --- Parameter Sensitivity Analysis (n_estimators vs max_depth) ---
# Sweep the two tree-size parameters over a grid while every other hyperparameter
# stays at the value selected by the grid search.
n_estimators_range = range(10, 101, 5)
max_depth_range = range(2, 21)

# One cell per (n_estimators, max_depth) pair: rows follow n_estimators, columns follow max_depth.
grid_shape = (len(n_estimators_range), len(max_depth_range))
CVS_matrix = np.zeros(grid_shape)   # mean cross-validated R2 on the training split
rmse_matrix = np.zeros(grid_shape)  # RMSE on the held-out test split
R2_matrix = np.zeros(grid_shape)    # R2 on the held-out test split

# Hyperparameters carried over unchanged from the grid-search result.
fixed_params = {
    name: best_params[name]
    for name in (
        'objective',
        'gamma',
        'learning_rate',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
    )
}

for row, n_estimators in enumerate(n_estimators_range):
    for col, max_depth in enumerate(max_depth_range):
        xgbr = xgb.XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            **fixed_params,
        )

        # Cross-validation uses the training split only (cross_val_score fits clones of xgbr).
        fold_scores = cross_val_score(xgbr, X_train, y_train, cv=kfold, n_jobs=-1, scoring='r2')
        cvs = fold_scores.mean()
        CVS_matrix[row, col] = cvs

        # Refit on the whole training split and score once on the held-out test split.
        xgbr.fit(X_train, y_train)
        y_test_pred = xgbr.predict(X_test)

        test_mse = mean_squared_error(y_test, y_test_pred)
        rmse_matrix[row, col] = np.sqrt(test_mse)
        R2_matrix[row, col] = r2_score(y_test, y_test_pred)

In [None]:
# --- Performance Visualization (Heatmaps) ---
# Tick labels shared by the heatmaps: columns are max_depth values, rows are n_estimators values.
xticks = max_depth_range
yticks = n_estimators_range

# Heatmap of the mean cross-validated R2 for every (n_estimators, max_depth) pair.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    CVS_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('CVS Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

# Save before showing so the written file contains the rendered figure.
save_path = os.path.join(xgbr_path, 'CVS_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# Heatmap of the test-set RMSE for every (n_estimators, max_depth) pair.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    rmse_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('RMSE Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

# Save before showing so the written file contains the rendered figure.
save_path = os.path.join(xgbr_path, 'RMSE_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# Heatmap of the test-set R2 for every (n_estimators, max_depth) pair.
fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
sns.heatmap(
    R2_matrix,
    annot=True,
    xticklabels=xticks,
    yticklabels=yticks,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('R2 Score Heatmap (n_estimators vs max_depth)')
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')

# Save before showing so the written file contains the rendered figure.
save_path = os.path.join(xgbr_path, 'R2_heatmap.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# --- Model Interpretation: Feature Importance ---
# n_estimators and max_depth are chosen by hand from the three heatmaps computed earlier:
# pick a combination that neither overfits nor underfits. All remaining hyperparameters
# are taken from the grid-search result.
# NOTE(review): two of those heatmaps (RMSE and R2) are computed on the test split, so
# choosing from them lets test data influence model selection -- consider relying on the
# cross-validation heatmap for this choice.
best_xgbr = xgb.XGBRegressor(n_estimators=80,
                             max_depth=2,
                             objective=best_params['objective'],
                             gamma=best_params['gamma'],
                             learning_rate=best_params['learning_rate'],
                             min_child_weight=best_params['min_child_weight'],
                             subsample=best_params['subsample'],
                             colsample_bytree=best_params['colsample_bytree'],
                             random_state=42)
best_xgbr.fit(X_train, y_train)

# Importance scores come back in the same order as the training columns.
feature_importance = best_xgbr.feature_importances_
features = X.columns

print('feature importance: {}'.format(best_xgbr.feature_importances_))

fig, ax = plt.subplots(figsize=(10, 6), dpi=600)
# seaborn 0.13 deprecated passing `palette` without `hue` (announced for removal in 0.14).
# Mapping hue to the same variable as the categorical axis keeps one colour per bar;
# dodge=False keeps each bar centred on its own row when hue is set.
sns.barplot(
    x=feature_importance,
    y=features,
    hue=features,
    dodge=False,
    palette=['#fcbba1', '#fff5f0', '#cb181d', '#fee0d2', '#fc9272', '#ef3b2c', '#fb6a4a'],
    ax=ax,
)
# The hue legend would only repeat the y-axis labels; drop it if this seaborn version drew one.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Feature Importance')
save_path = os.path.join(xgbr_path, 'Feature_Importance.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# --- Model Visualization: Tree Structure ---
# Create the axes explicitly and hand them to plot_tree. When no `ax` is given, plot_tree
# opens a figure of its own, which left the 20x10 / 600-dpi figure empty and saved the
# tree at the default figure size instead.
fig, ax = plt.subplots(figsize=(20, 10), dpi=600)
plot_tree(best_xgbr, num_trees=0, filled=True, ax=ax)  # draw the first boosted tree
ax.set_title('Visualization of One Decision Tree from XGBoost', fontsize=20)
save_path = os.path.join(xgbr_path, 'Visualization of One Decision Tree from XGBoost.png')
fig.savefig(save_path, bbox_inches='tight')
plt.show()

In [None]:
# Inspect the fitted booster: its attributes first, then the trees as text and as JSON.
booster = best_xgbr.get_booster()
print(booster.attributes())
print(booster.get_dump())

# Text dump: one string per boosted tree, printed individually for readability.
trees = booster.get_dump(dump_format="text")
for tree_index, tree_text in enumerate(trees):
    print(f"Tree {tree_index} structure:\n{tree_text}\n")

# JSON dump: show the first tree only.
trees_json = booster.get_dump(dump_format="json")
first_tree_json = trees_json[0]
print(first_tree_json)

In [None]:
# --- Export Results to Excel ---
# Wrap each result in a DataFrame so the sheets carry labels: for the three score matrices,
# rows are labelled with n_estimators values and columns with max_depth values.
feature_importance_matrix = np.array(feature_importance)
cvs_matrix_df = pd.DataFrame(CVS_matrix, index=n_estimators_range, columns=max_depth_range)
rmse_matrix_df = pd.DataFrame(rmse_matrix, index=n_estimators_range, columns=max_depth_range)
r2_matrix_df = pd.DataFrame(R2_matrix, index=n_estimators_range, columns=max_depth_range)
feature_importance_matrix_df = pd.DataFrame(feature_importance_matrix, index=features)

excel_path = os.path.join(xgbr_path, 'XGBoost_Output.xlsx')

# Sheet name -> matrix, written in this order.
matrix_sheets = (
    ('CVS_matrix', cvs_matrix_df),
    ('rmse_matrix', rmse_matrix_df),
    ('R2_matrix', r2_matrix_df),
)

# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(excel_path) as writer:
    for sheet_name, matrix_df in matrix_sheets:
        matrix_df.to_excel(writer, index=True, sheet_name=sheet_name)
    # header=False is the documented way to omit the column-header row; the previous
    # header=None only worked because None is falsy.
    feature_importance_matrix_df.to_excel(
        writer,
        index=True,
        sheet_name='feature_importance_matrix',
        header=False,
    )