In [None]:
# Bayesian Optimization on the XGBoost model with reduced features n=50

import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The two tables are joined purely by row position below. If their lengths
# differ, pd.concat(axis=1) pads the shorter one with NaNs instead of failing,
# so check the assumption explicitly before merging.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows but "
        f"unique_m.csv has {len(unique_m)}; cannot merge by row position."
    )

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Importance Threshold for Selection
importance_threshold = 0.005  # Keep features with importance > 0.5%

# Train a baseline XGBoost model on the training split only; it is used here
# solely to rank features, so the test split plays no part in the selection.
xgb_model = xgb.XGBRegressor(n_estimators=200, max_depth=6, tree_method="hist", random_state=42)
xgb_model.fit(X_train, y_train)

# Feature Importance (XGBoost), sorted from most to least important
xgb_importance = pd.Series(xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
selected_features = xgb_importance[xgb_importance > importance_threshold].index.tolist()

# An empty selection would only surface later as an obscure fit error on a
# zero-column frame, so stop here with an actionable message.
if not selected_features:
    raise ValueError(
        f"No feature has importance above {importance_threshold}; lower importance_threshold."
    )

# Restrict both splits to the selected features (same columns, same order)
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Define Bayesian Optimization for XGBoost

def objective(trial, validation_size=0.2):
    """Optuna objective: validation RMSE of an XGBoost regressor for one trial.

    The trial is scored on a validation split carved out of the training data
    rather than on the test split. Scoring trials on ``X_test_selected`` would
    let the test set drive hyperparameter selection, so the test metrics
    reported afterwards would be optimistically biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    validation_size : float, default 0.2
        Fraction of the training rows held out for scoring the trial.

    Returns
    -------
    float
        RMSE on the held-out validation rows (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        # Fixed (non-tuned) settings; these do not appear in study.best_params.
        "tree_method": "hist",
        "random_state": 42
    }

    # Fixed random_state so every trial is compared on the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_selected, y_train, test_size=validation_size, random_state=42
    )

    model = xgb.XGBRegressor(**params)
    model.fit(X_fit, y_fit)
    val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    return val_rmse

# Run Bayesian Optimization
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# from run to run (TPE is Optuna's default sampler; only the seed is added).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Train Optimized XGBoost Model
# study.best_params contains only the values suggested through `trial`. The
# fixed settings used inside the objective (tree_method, random_state) have to
# be supplied again, otherwise the refit model is configured differently from
# the best trial and, with subsample < 1, is not reproducible.
best_params = study.best_params
optimized_xgb = xgb.XGBRegressor(**best_params, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train_selected, y_train)

# Evaluate Optimized XGBoost Performance
# Predict once per split and reuse the predictions for both metrics.
train_pred_xgb_opt = optimized_xgb.predict(X_train_selected)
test_pred_xgb_opt = optimized_xgb.predict(X_test_selected)

train_rmse_xgb_opt = np.sqrt(mean_squared_error(y_train, train_pred_xgb_opt))
test_rmse_xgb_opt = np.sqrt(mean_squared_error(y_test, test_pred_xgb_opt))
train_r2_xgb_opt = r2_score(y_train, train_pred_xgb_opt)
test_r2_xgb_opt = r2_score(y_test, test_pred_xgb_opt)

print(f"Optimized XGBoost - Train RMSE: {train_rmse_xgb_opt:.4f}, Train R²: {train_r2_xgb_opt:.4f}")
print(f"Optimized XGBoost - Test RMSE: {test_rmse_xgb_opt:.4f}, Test R²: {test_r2_xgb_opt:.4f}")

# Report Best Parameters (printed only; nothing is written to disk here)
print("Best Parameters Found:")
print(best_params)

# Save Feature Importance Ranking
xgb_importance.to_csv("feature_importance_ranking_xgb.csv")

# Print ranked features for comparison
print("Feature Importance Ranking (XGBoost):")
print(xgb_importance)

# Plot Feature Importance (XGBoost)
fig, ax = plt.subplots(figsize=(10, 6))
xgb_importance[:20].plot(kind='barh', ax=ax)  # Show top 20 features
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature Name")
ax.set_title("Top Feature Importances in XGBoost Model")
ax.invert_yaxis()  # Most important feature at the top
plt.show()


Results:

[I 2025-02-25 17:37:57,153] Trial 49 finished with value: 9.00103285810336 and parameters: {'n_estimators': 438, 'max_depth': 10, 'learning_rate': 0.052021796177536335, 'subsample': 0.9131247386146722, 'colsample_bytree': 0.6548309731038137, 'gamma': 4.509418662732957}. Best is trial 42 with value: 8.91350265825123.
Optimized XGBoost - Train RMSE: 5.3971, Train R²: 0.9753
Optimized XGBoost - Test RMSE: 8.9377, Test R²: 0.9306
Best Parameters Found:
{'n_estimators': 407, 'max_depth': 10, 'learning_rate': 0.02962746174406205, 'subsample': 0.8786056663685927, 'colsample_bytree': 0.6260167856358314, 'gamma': 4.321388407974591}
Feature Importance Ranking (XGBoost):
Cu                           0.631333
Ba                           0.051469
gmean_Valence                0.046137
Ca                           0.029114
range_ThermalConductivity    0.020274
                               ...   
Hf                           0.000000
Os                           0.000000
Po                           0.000000
At                           0.000000
Rn                           0.000000


Improvement! Would n = 100 or 150 yield better results? What about using all features?