In [None]:
# This is to test feature importance with the weighted XGBoost and LightGBM model ensemble

import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Optimized LightGBM Model
optimized_lgb = lgb.LGBMRegressor(n_estimators=496, max_depth=15, learning_rate=0.057878589503943714, 
                                  subsample=0.6619352139576826, colsample_bytree=0.7512301369524537, 
                                  num_leaves=148, verbose=-1, force_col_wise=True)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
optimized_xgb = xgb.XGBRegressor(n_estimators=407, max_depth=10, learning_rate=0.02962746174406205,
                                 subsample=0.8786056663685927, colsample_bytree=0.6260167856358314,
                                 gamma=4.321388407974591, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train, y_train)

# Generate predictions
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Define Bayesian Optimization for Blending Weights
def objective(trial):
    weight_lgb = trial.suggest_float("weight_lgb", 0.0, 1.0)
    weight_xgb = 1.0 - weight_lgb  # Ensure sum is 1
    y_pred_ensemble = (weight_lgb * y_pred_lgb_test) + (weight_xgb * y_pred_xgb_test)
    return np.sqrt(mean_squared_error(y_test, y_pred_ensemble))

# Run Bayesian Optimization for 50 trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Best Weights from Optimization
best_weight_lgb = study.best_params["weight_lgb"]
best_weight_xgb = 1.0 - best_weight_lgb

# Apply Best Weights
y_pred_ensemble_optimized = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Evaluate Optimized Blended Model
ensemble_rmse_opt = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_optimized))
ensemble_r2_opt = r2_score(y_test, y_pred_ensemble_optimized)

print(f"Optimized Weighted Blended Model - Test RMSE: {ensemble_rmse_opt:.4f}, Test R²: {ensemble_r2_opt:.4f}")
print(f"Optimal Blending Weights: LightGBM={best_weight_lgb:.4f}, XGBoost={best_weight_xgb:.4f}")

# Feature Importance Analysis
lgb_importance = pd.Series(optimized_lgb.feature_importances_, index=X_train.columns).sort_values(ascending=False)
xgb_importance = pd.Series(optimized_xgb.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Normalize importance scores
lgb_importance = lgb_importance / lgb_importance.sum()
xgb_importance = xgb_importance / xgb_importance.sum()

# Compute blended feature importance
ensemble_importance = (best_weight_lgb * lgb_importance) + (best_weight_xgb * xgb_importance)
ensemble_importance = ensemble_importance.sort_values(ascending=False)

# Plot Feature Importance
plt.figure(figsize=(10, 6))
ensemble_importance[:20].plot(kind='barh')  # Show top 20 features
plt.xlabel("Blended Feature Importance")
plt.ylabel("Feature Name")
plt.title("Top Feature Importances in Optimized Blended Model")
plt.gca().invert_yaxis()
plt.show()

# Print top features
print("Top 20 Features in Optimized Blended Model:")
print(ensemble_importance[:20])


Results:

[I 2025-02-25 18:23:37,719] Trial 49 finished with value: 8.514911745102962 and parameters: {'weight_lgb': 0.1303015207206778}. Best is trial 22 with value: 8.502434908752939.
Optimized Weighted Blended Model - Test RMSE: 8.5024, Test R²: 0.9372
Optimal Blending Weights: LightGBM=0.3507, XGBoost=0.6493


Top 20 Features in Optimized Blended Model:
range_ThermalConductivity          0.282304
Cu                                 0.175546
range_atomic_radius                0.029883
Ca                                 0.017167
Ba                                 0.012077
wtd_std_ElectronAffinity           0.010526
wtd_gmean_Density                  0.008571
wtd_gmean_ThermalConductivity      0.008373
wtd_std_Valence                    0.008336
wtd_entropy_ThermalConductivity    0.008035
wtd_entropy_ElectronAffinity       0.007918
wtd_gmean_Valence                  0.007916
wtd_range_Valence                  0.007643
wtd_std_ThermalConductivity        0.007577
wtd_range_atomic_radius            0.007523
gmean_Valence                      0.007404
wtd_entropy_atomic_mass            0.007300
wtd_entropy_FusionHeat             0.007222
wtd_std_atomic_radius              0.007204
wtd_entropy_Density                0.007102