In [None]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The merge below is purely positional (index-based). If the two files do not
# have the same number of rows, pd.concat(axis=1) would pad the shorter frame
# with NaNs instead of failing, so check the assumption explicitly.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows but "
        f"unique_m.csv has {len(unique_m)} rows; cannot merge by position."
    )

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets side by side; rows are matched by their default integer index
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Feature Engineering: Physics-Based Ratios & Thermal Conductivity Transformations
# The 1e-9 added to each denominator guards against division by zero.
X = X.assign(
    mass_density_ratio=X["wtd_mean_atomic_mass"] / (X["wtd_mean_Density"] + 1e-9),
    affinity_valence_ratio=X["wtd_mean_ElectronAffinity"] / (X["wtd_mean_Valence"] + 1e-9),
    log_thermal_conductivity=np.log1p(X["range_ThermalConductivity"]),  # log(1 + x)
)

# Train-validation-test split (80/10/10): hold out 20% first, then cut that
# hold-out in half to obtain equally sized validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Train Optimized LightGBM Model
# LightGBM only performs row subsampling (bagging) when subsample_freq > 0; with
# the default of 0 the `subsample` fraction is silently ignored. subsample_freq=1
# re-draws the sample at every boosting iteration so the configured fraction
# actually takes effect.
# NOTE(review): this changes the fitted model, so previously recorded metrics and
# the blending weight below should be re-validated.
optimized_lgb = lgb.LGBMRegressor(n_estimators=496, max_depth=15, learning_rate=0.057878589503943714,
                                  subsample=0.6619352139576826, subsample_freq=1,
                                  colsample_bytree=0.7512301369524537,
                                  num_leaves=148, force_col_wise=True, verbose=-1, random_state=42)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
optimized_xgb = xgb.XGBRegressor(n_estimators=407, max_depth=10, learning_rate=0.02962746174406205,
                                 subsample=0.8786056663685927, colsample_bytree=0.6260167856358314,
                                 gamma=4.321388407974591, tree_method="hist", random_state=42)
optimized_xgb.fit(X_train, y_train)

# Define blending weights (the two weights sum to 1)
best_weight_lgb = 0.3454  # Previously found optimal weight
best_weight_xgb = 1.0 - best_weight_lgb

# Feature Importance Analysis Before Removing Features
# Label each model's raw importances with the feature names, most important first.
# NOTE(review): the two libraries may default to different importance measures
# (LightGBM: split counts, XGBoost: gain) — confirm that blending them is intended.
lgb_importance = pd.Series(optimized_lgb.feature_importances_, index=X.columns)
lgb_importance = lgb_importance.sort_values(ascending=False)
xgb_importance = pd.Series(optimized_xgb.feature_importances_, index=X.columns)
xgb_importance = xgb_importance.sort_values(ascending=False)

# To inspect the rankings: print(lgb_importance[:30]); print(xgb_importance[:30])

# Normalize importance scores so each model's importances sum to 1
lgb_importance = lgb_importance.div(lgb_importance.sum())
xgb_importance = xgb_importance.div(xgb_importance.sum())

# Compute blended feature importance using the ensemble weights
# (the addition aligns the two series on feature name), highest first
ensemble_importance = (
    lgb_importance.mul(best_weight_lgb)
    .add(xgb_importance.mul(best_weight_xgb))
    .sort_values(ascending=False)
)

# Remove features with very low importance (blended score below the threshold)
low_importance_threshold = 0.0005
low_importance_features = ensemble_importance[ensemble_importance.lt(low_importance_threshold)].index
X_train, X_valid, X_test = (
    part.drop(columns=low_importance_features) for part in (X_train, X_valid, X_test)
)

# Compute correlation matrix and remove highly correlated features.
# Only the strict upper triangle is kept, so each pair is examined once and a
# column is flagged when it correlates strongly with any column to its left.
corr_matrix = X_train.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
correlation_threshold = 0.95
high_correlation_features = upper_tri.columns[
    (upper_tri > correlation_threshold).any(axis=0).to_numpy()
].tolist()
X_train, X_valid, X_test = (
    part.drop(columns=high_correlation_features) for part in (X_train, X_valid, X_test)
)

# Retrain models after feature selection (same estimators, reduced column set)
optimized_lgb.fit(X_train, y_train)
optimized_xgb.fit(X_train, y_train)

# Generate predictions on the held-out test set
y_pred_lgb_test = optimized_lgb.predict(X_test)
y_pred_xgb_test = optimized_xgb.predict(X_test)

# Apply Blending Weights: weighted sum of the two models' predictions
y_pred_ensemble_test = (
    (best_weight_lgb * y_pred_lgb_test)
    + (best_weight_xgb * y_pred_xgb_test)
)

# Evaluate Model: RMSE (square root of the MSE) and R² on the test set
ensemble_rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_ensemble_test))
ensemble_r2_test = r2_score(y_test, y_pred_ensemble_test)

print(
    "Optimized Weighted Blended Model (After Feature Selection) - "
    f"Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}"
)


Results:

Optimized Weighted Blended Model (After Feature Selection) - Test RMSE: 8.3005, Test R²: 0.9392

Best run yet! Adding the three physics features, dropping features with blended importance below 0.0005 as well as highly correlated features, and then using the optimized ensemble has a positive effect.


Perhaps I should go back and redo the Bayesian optimization with these settings on each model, and then on the ensemble.

In [None]:
# Keep a handle on the columns that survived feature selection and display them
features_save = X_train.columns
print(features_save)
print('Numer of features: ', features_save.size)

Features:

Index(['mean_atomic_mass', 'wtd_mean_atomic_mass', 'gmean_atomic_mass',
       'entropy_atomic_mass', 'wtd_entropy_atomic_mass', 'range_atomic_mass',
       'wtd_range_atomic_mass', 'wtd_std_atomic_mass', 'mean_fie',
       'wtd_mean_fie', 'wtd_entropy_fie', 'range_fie', 'wtd_range_fie',
       'wtd_std_fie', 'mean_atomic_radius', 'wtd_mean_atomic_radius',
       'gmean_atomic_radius', 'range_atomic_radius', 'wtd_range_atomic_radius',
       'mean_Density', 'wtd_mean_Density', 'gmean_Density', 'entropy_Density',
       'wtd_entropy_Density', 'range_Density', 'wtd_range_Density',
       'wtd_std_Density', 'mean_ElectronAffinity', 'wtd_mean_ElectronAffinity',
       'gmean_ElectronAffinity', 'wtd_gmean_ElectronAffinity',
       'entropy_ElectronAffinity', 'wtd_entropy_ElectronAffinity',
       'range_ElectronAffinity', 'wtd_range_ElectronAffinity',
       'wtd_std_ElectronAffinity', 'mean_FusionHeat', 'wtd_mean_FusionHeat',
       'gmean_FusionHeat', 'entropy_FusionHeat', 'wtd_entropy_FusionHeat',
       'range_FusionHeat', 'wtd_range_FusionHeat', 'wtd_std_FusionHeat',
       'mean_ThermalConductivity', 'wtd_mean_ThermalConductivity',
       'gmean_ThermalConductivity', 'wtd_gmean_ThermalConductivity',
       'entropy_ThermalConductivity', 'wtd_entropy_ThermalConductivity',
       'range_ThermalConductivity', 'wtd_range_ThermalConductivity',
       'mean_Valence', 'wtd_mean_Valence', 'range_Valence',
       'wtd_range_Valence', 'wtd_std_Valence', 'H', 'B', 'C', 'O', 'F', 'Na',
       'Mg', 'Al', 'Cl', 'K', 'Ca', 'V', 'Cr', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
       'As', 'Se', 'Sr', 'Y', 'Nb', 'Sn', 'I', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
       'Sm', 'Eu', 'Gd', 'Tb', 'Yb', 'Hg', 'Tl', 'Pb', 'Bi',
       'mass_density_ratio', 'affinity_valence_ratio',
       'log_thermal_conductivity'],
      dtype='object')

      
Numer of features:  99