In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_predict, train_test_split

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The merge below is purely positional: pd.concat(axis=1) aligns on the default
# integer index. If the two files had different row counts, concat would not
# fail -- it would pad the shorter frame with NaN rows -- so check up front.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows, "
        f"unique_m.csv has {len(unique_m)} rows; cannot merge by position."
    )

# Remove 'critical_temp' from unique_m to avoid a duplicated column after the merge
# (errors='ignore' makes this a no-op if the column is absent)
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets side by side (index-based; rows are assumed to describe the
# same samples in the same order in both files -- TODO confirm)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
# 'material' is dropped from the features along with the target
X = merged_data.drop(columns=[target, "material"])
y = merged_data[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# LightGBM regressor with fixed hyperparameters (presumably taken from an
# earlier tuning run -- not shown here), fitted on the training split.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only takes
# effect when `subsample_freq` (bagging_freq) is > 0; it is not set here, so
# row subsampling is likely inactive -- confirm whether that is intended.
optimized_lgb = lgb.LGBMRegressor(
    n_estimators=496,
    max_depth=15,
    num_leaves=148,
    learning_rate=0.057878589503943714,
    subsample=0.6619352139576826,
    colsample_bytree=0.7512301369524537,
    verbose=-1,
    force_col_wise=True,
).fit(X_train, y_train)

# XGBoost regressor (histogram tree method, seeded), fitted on the same split
optimized_xgb = xgb.XGBRegressor(
    n_estimators=407,
    max_depth=10,
    learning_rate=0.02962746174406205,
    subsample=0.8786056663685927,
    colsample_bytree=0.6260167856358314,
    gamma=4.321388407974591,
    tree_method="hist",
    random_state=42,
).fit(X_train, y_train)

# Per-model predictions on the held-out test rows
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Define Bayesian Optimization for Blending Weights
#
# The blending weight is a fitted parameter, so it must not be tuned against
# the test set: doing so leaks test labels into model selection and makes the
# reported test metrics optimistically biased. Instead, tune it on out-of-fold
# (OOF) predictions over the training data, where every row is predicted by a
# model that never saw that row during fitting.
#
# cross_val_predict fits clones of the estimators, so the already-fitted
# optimized_lgb / optimized_xgb are left untouched. The same seeded KFold is
# used for both models so their OOF predictions come from identical folds.
_blend_cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_oof_lgb = cross_val_predict(optimized_lgb, X_train, y_train, cv=_blend_cv)
y_oof_xgb = cross_val_predict(optimized_xgb, X_train, y_train, cv=_blend_cv)


def objective(trial):
    """Return the out-of-fold RMSE of the blend for the trial's LightGBM weight.

    The trial suggests ``weight_lgb`` in [0, 1]; the XGBoost weight is its
    complement so the two always sum to 1. The score is computed against
    ``y_train`` using the OOF predictions, never against the test set.
    """
    weight_lgb = trial.suggest_float("weight_lgb", 0.0, 1.0)
    weight_xgb = 1.0 - weight_lgb  # Ensure sum is 1
    y_pred_ensemble = (weight_lgb * y_oof_lgb) + (weight_xgb * y_oof_xgb)
    return np.sqrt(mean_squared_error(y_train, y_pred_ensemble))

# Run Bayesian Optimization for 50 trials
# The sampler is seeded so the selected weight (and therefore the printed
# metrics) are the same on every run, in line with the fixed seeds used for
# the data split and the XGBoost model.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best Weights from Optimization (XGBoost weight is the complement, so they sum to 1)
best_weight_lgb = study.best_params["weight_lgb"]
best_weight_xgb = 1.0 - best_weight_lgb

# Apply Best Weights to the per-model test-set predictions
y_pred_ensemble_optimized = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Evaluate Optimized Blended Model on the held-out test set
ensemble_rmse_opt = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_optimized))
ensemble_r2_opt = r2_score(y_test, y_pred_ensemble_optimized)

print(f"Optimized Weighted Blended Model - Test RMSE: {ensemble_rmse_opt:.4f}, Test R²: {ensemble_r2_opt:.4f}")
print(f"Optimal Blending Weights: LightGBM={best_weight_lgb:.4f}, XGBoost={best_weight_xgb:.4f}")


Results:

[I 2025-02-25 18:16:34,082] Trial 49 finished with value: 8.567962292471476 and parameters: {'weight_lgb': 0.8532787775439186}. Best is trial 10 with value: 8.502439368842953.
Optimized Weighted Blended Model - Test RMSE: 8.5024, Test R²: 0.9372
Optimal Blending Weights: LightGBM=0.3454, XGBoost=0.6546