In [1]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Feature Engineering: Physics-Based Ratios & Thermal Conductivity Transformations
X["mass_density_ratio"] = X["wtd_mean_atomic_mass"] / (X["wtd_mean_Density"] + 1e-9)
X["affinity_valence_ratio"] = X["wtd_mean_ElectronAffinity"] / (X["wtd_mean_Valence"] + 1e-9)
X["log_thermal_conductivity"] = np.log1p(X["range_ThermalConductivity"])  # Log transformation
X["thermal_conductivity_squared"] = X["range_ThermalConductivity"] ** 2  # Squared transformation

# Train-validation-test split (80/10/10)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train Optimized LightGBM Model
optimized_lgb = lgb.LGBMRegressor(n_estimators=496, max_depth=15, learning_rate=0.057878589503943714, 
                                  subsample=0.6619352139576826, colsample_bytree=0.7512301369524537, 
                                  num_leaves=148, force_col_wise=True, verbose=-1, random_state=42)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
optimized_xgb = xgb.XGBRegressor(n_estimators=407, max_depth=10, learning_rate=0.02962746174406205,
                                 subsample=0.8786056663685927, colsample_bytree=0.6260167856358314,
                                 gamma=4.321388407974591, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train, y_train)

# Define blending weights
best_weight_lgb = 0.3454  # Previously found optimal weight
best_weight_xgb = 1.0 - best_weight_lgb

# Feature Importance Analysis Before Removing Features
lgb_importance = pd.Series(optimized_lgb.feature_importances_, index=X.columns).sort_values(ascending=False)
xgb_importance = pd.Series(optimized_xgb.feature_importances_, index=X.columns).sort_values(ascending=False)

# Show top 20 features for each model
print("Top 30 Features for LightGBM:")
print(lgb_importance[:30])
print("\nTop 30 Features for XGBoost:")
print(xgb_importance[:30])

# Normalize importance scores
lgb_importance = lgb_importance / lgb_importance.sum()
xgb_importance = xgb_importance / xgb_importance.sum()

# Compute blended feature importance
ensemble_importance = (best_weight_lgb * lgb_importance) + (best_weight_xgb * xgb_importance)
ensemble_importance = ensemble_importance.sort_values(ascending=False)

# Remove features with very low importance
low_importance_threshold = 0.005
low_importance_features = ensemble_importance[ensemble_importance < low_importance_threshold].index
X_train = X_train.drop(columns=low_importance_features)
X_valid = X_valid.drop(columns=low_importance_features)
X_test = X_test.drop(columns=low_importance_features)

# Compute correlation matrix and remove highly correlated features
corr_matrix = X_train.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
correlation_threshold = 0.95
high_correlation_features = [column for column in upper_tri.columns if any(upper_tri[column] > correlation_threshold)]
X_train = X_train.drop(columns=high_correlation_features)
X_valid = X_valid.drop(columns=high_correlation_features)
X_test = X_test.drop(columns=high_correlation_features)

# Retrain models after feature selection
optimized_lgb.fit(X_train, y_train)
optimized_xgb.fit(X_train, y_train)

# Generate predictions
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Apply Blending Weights
y_pred_ensemble_test = (best_weight_lgb * y_pred_lgb_test) + (best_weight_xgb * y_pred_xgb_test)

# Evaluate Model
ensemble_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_test))
ensemble_r2_test = r2_score(y_test, y_pred_ensemble_test)

print(f"Optimized Weighted Blended Model (After Feature Selection) - Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}")


Top 30 Features for LightGBM:
wtd_entropy_atomic_mass            1043
wtd_range_atomic_radius            1002
wtd_entropy_ElectronAffinity        985
wtd_entropy_Valence                 960
wtd_entropy_ThermalConductivity     952
wtd_range_fie                       949
wtd_range_Valence                   942
mass_density_ratio                  942
wtd_std_atomic_radius               917
wtd_entropy_fie                     905
wtd_entropy_FusionHeat              898
wtd_entropy_Density                 889
wtd_std_ThermalConductivity         885
wtd_mean_fie                        854
wtd_mean_Density                    850
wtd_range_atomic_mass               831
wtd_range_ElectronAffinity          778
wtd_mean_atomic_radius              771
wtd_entropy_atomic_radius           764
wtd_gmean_fie                       763
wtd_std_atomic_mass                 754
wtd_gmean_ThermalConductivity       747
wtd_mean_ThermalConductivity        727
wtd_std_ElectronAffinity            725
wtd_mean_a