In [1]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Feature Engineering: Physics-Based Ratios
X["mass_density_ratio"] = X["wtd_mean_atomic_mass"] / (X["wtd_mean_Density"] + 1e-9)
X["affinity_valence_ratio"] = X["wtd_mean_ElectronAffinity"] / (X["wtd_mean_Valence"] + 1e-9)

# Train-validation-test split (80/10/10)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train Optimized LightGBM Model
optimized_lgb = lgb.LGBMRegressor(n_estimators=496, max_depth=15, learning_rate=0.057878589503943714, 
                                  subsample=0.6619352139576826, colsample_bytree=0.7512301369524537, 
                                  num_leaves=148, force_col_wise=True, verbose=-1, random_state=42)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
optimized_xgb = xgb.XGBRegressor(n_estimators=407, max_depth=10, learning_rate=0.02962746174406205,
                                 subsample=0.8786056663685927, colsample_bytree=0.6260167856358314,
                                 gamma=4.321388407974591, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train, y_train)

# Define blending weights
best_weight_lgb = 0.3454  # Previously found optimal weight
best_weight_xgb = 1.0 - best_weight_lgb

# Feature Importance Analysis Before Removing Features
lgb_importance = pd.Series(optimized_lgb.feature_importances_, index=X.columns).sort_values(ascending=False)
xgb_importance = pd.Series(optimized_xgb.feature_importances_, index=X.columns).sort_values(ascending=False)

# Show top 20 features for each model
print("Top 20 Features for LightGBM:")
print(lgb_importance[:20])
print("\nTop 20 Features for XGBoost:")
print(xgb_importance[:20])

# Normalize importance scores
lgb_importance = lgb_importance / lgb_importance.sum()
xgb_importance = xgb_importance / xgb_importance.sum()

# Compute blended feature importance
ensemble_importance = (best_weight_lgb * lgb_importance) + (best_weight_xgb * xgb_importance)
ensemble_importance = ensemble_importance.sort_values(ascending=False)

# Remove features with very low importance
low_importance_threshold = 0.005
low_importance_features = ensemble_importance[ensemble_importance < low_importance_threshold].index
X_train = X_train.drop(columns=low_importance_features)
X_valid = X_valid.drop(columns=low_importance_features)
X_test = X_test.drop(columns=low_importance_features)

# Compute correlation matrix and remove highly correlated features
corr_matrix = X_train.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
correlation_threshold = 0.95
high_correlation_features = [column for column in upper_tri.columns if any(upper_tri[column] > correlation_threshold)]
X_train = X_train.drop(columns=high_correlation_features)
X_valid = X_valid.drop(columns=high_correlation_features)
X_test = X_test.drop(columns=high_correlation_features)

# Retrain models after feature selection
optimized_lgb.fit(X_train, y_train)
optimized_xgb.fit(X_train, y_train)

# Generate predictions
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Apply Blending Weights
y_pred_ensemble_test = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Evaluate Model
ensemble_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_test))
ensemble_r2_test = r2_score(y_test, y_pred_ensemble_test)

print(f"Optimized Weighted Blended Model (After Feature Selection) - Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}")


Top 20 Features for LightGBM:
wtd_entropy_ElectronAffinity       1046
wtd_entropy_atomic_mass            1042
wtd_range_atomic_radius            1040
wtd_entropy_Valence                1021
mass_density_ratio                  961
wtd_entropy_ThermalConductivity     958
wtd_std_ThermalConductivity         945
wtd_range_Valence                   937
wtd_range_fie                       936
wtd_std_atomic_radius               924
wtd_range_atomic_mass               888
wtd_entropy_FusionHeat              862
wtd_entropy_Density                 856
wtd_entropy_fie                     855
wtd_mean_fie                        836
wtd_mean_Density                    803
wtd_gmean_fie                       791
affinity_valence_ratio              761
wtd_entropy_atomic_radius           754
wtd_std_fie                         741
dtype: int32

Top 20 Features for XGBoost:
range_ThermalConductivity        0.330287
Cu                               0.301463
range_atomic_radius              0.072670
C

In [2]:
# Show top 60 features for each model
print("Top 60 Features for LightGBM:")
print(lgb_importance[:60])
print("\nTop 60 Features for XGBoost:")
print(xgb_importance[:60])

Top 60 Features for LightGBM:
wtd_entropy_ElectronAffinity       0.021181
wtd_entropy_atomic_mass            0.021100
wtd_range_atomic_radius            0.021059
wtd_entropy_Valence                0.020675
mass_density_ratio                 0.019460
wtd_entropy_ThermalConductivity    0.019399
wtd_std_ThermalConductivity        0.019136
wtd_range_Valence                  0.018974
wtd_range_fie                      0.018954
wtd_std_atomic_radius              0.018711
wtd_range_atomic_mass              0.017982
wtd_entropy_FusionHeat             0.017455
wtd_entropy_Density                0.017334
wtd_entropy_fie                    0.017313
wtd_mean_fie                       0.016929
wtd_mean_Density                   0.016260
wtd_gmean_fie                      0.016017
affinity_valence_ratio             0.015410
wtd_entropy_atomic_radius          0.015268
wtd_std_fie                        0.015005
wtd_mean_atomic_radius             0.014985
wtd_mean_ThermalConductivity       0.014904
wt

In [None]:
# The two new features show up, but seem much more important to the LightGBM model than to XGBoost.

In [3]:
# Show top 60 features for ensemble importance
print("Top 60 Features for ensemble:")
print(ensemble_importance[:60])

Top 60 Features for ensemble:
range_ThermalConductivity          0.216947
Cu                                 0.200296
range_atomic_radius                0.048521
Ca                                 0.020699
Ba                                 0.015465
O                                  0.011583
wtd_gmean_ThermalConductivity      0.010381
wtd_mean_ThermalConductivity       0.009875
wtd_std_ThermalConductivity        0.009873
gmean_Valence                      0.009440
wtd_entropy_Valence                0.008104
wtd_mean_Valence                   0.008015
wtd_std_ElectronAffinity           0.007935
wtd_gmean_Density                  0.007892
wtd_entropy_ElectronAffinity       0.007820
wtd_range_Valence                  0.007704
mass_density_ratio                 0.007633
wtd_entropy_atomic_mass            0.007628
wtd_range_atomic_radius            0.007614
wtd_entropy_ThermalConductivity    0.007413
wtd_gmean_Valence                  0.007279
wtd_std_Valence                    0.007229
wt