In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The two tables are joined purely by row position below. With different row
# counts, pd.concat(axis=1) would silently pad the shorter table with NaN rows
# instead of failing, so check the alignment assumption up front.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows but "
        f"unique_m.csv has {len(unique_m)}; the index-based merge needs "
        "both files to have the same number of rows."
    )

# Remove 'critical_temp' from unique_m to avoid a duplicated target column
# after the merge (errors='ignore' tolerates the column being absent).
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# Merge datasets side by side; rows are matched on the index, not on a key.
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Feature Engineering: Physics-Based Ratios & Thermal Conductivity Transformations
# The 1e-9 added to each denominator keeps the ratios finite when it is zero.
X["mass_density_ratio"] = X["wtd_mean_atomic_mass"] / (X["wtd_mean_Density"] + 1e-9)
X["affinity_valence_ratio"] = X["wtd_mean_ElectronAffinity"] / (X["wtd_mean_Valence"] + 1e-9)
X["log_thermal_conductivity"] = np.log1p(X["range_ThermalConductivity"])  # log(1 + x) transformation


# The 99 features kept from a previous feature-selection pass (per the
# original note: correlation of 0.0005 or better). Order is preserved because
# it determines the column order of the model input.
selected_features = [
    # *_atomic_mass
    'mean_atomic_mass', 'wtd_mean_atomic_mass', 'gmean_atomic_mass',
    'entropy_atomic_mass', 'wtd_entropy_atomic_mass',
    'range_atomic_mass', 'wtd_range_atomic_mass', 'wtd_std_atomic_mass',
    # *_fie
    'mean_fie', 'wtd_mean_fie', 'wtd_entropy_fie',
    'range_fie', 'wtd_range_fie', 'wtd_std_fie',
    # *_atomic_radius
    'mean_atomic_radius', 'wtd_mean_atomic_radius', 'gmean_atomic_radius',
    'range_atomic_radius', 'wtd_range_atomic_radius',
    # *_Density
    'mean_Density', 'wtd_mean_Density', 'gmean_Density',
    'entropy_Density', 'wtd_entropy_Density',
    'range_Density', 'wtd_range_Density', 'wtd_std_Density',
    # *_ElectronAffinity
    'mean_ElectronAffinity', 'wtd_mean_ElectronAffinity',
    'gmean_ElectronAffinity', 'wtd_gmean_ElectronAffinity',
    'entropy_ElectronAffinity', 'wtd_entropy_ElectronAffinity',
    'range_ElectronAffinity', 'wtd_range_ElectronAffinity',
    'wtd_std_ElectronAffinity',
    # *_FusionHeat
    'mean_FusionHeat', 'wtd_mean_FusionHeat', 'gmean_FusionHeat',
    'entropy_FusionHeat', 'wtd_entropy_FusionHeat',
    'range_FusionHeat', 'wtd_range_FusionHeat', 'wtd_std_FusionHeat',
    # *_ThermalConductivity
    'mean_ThermalConductivity', 'wtd_mean_ThermalConductivity',
    'gmean_ThermalConductivity', 'wtd_gmean_ThermalConductivity',
    'entropy_ThermalConductivity', 'wtd_entropy_ThermalConductivity',
    'range_ThermalConductivity', 'wtd_range_ThermalConductivity',
    # *_Valence
    'mean_Valence', 'wtd_mean_Valence',
    'range_Valence', 'wtd_range_Valence', 'wtd_std_Valence',
    # Single-element columns (presumably supplied by unique_m.csv -- confirm)
    'H', 'B', 'C', 'O', 'F', 'Na', 'Mg', 'Al', 'Cl', 'K',
    'Ca', 'V', 'Cr', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'As', 'Se',
    'Sr', 'Y', 'Nb', 'Sn', 'I', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
    'Sm', 'Eu', 'Gd', 'Tb', 'Yb', 'Hg', 'Tl', 'Pb', 'Bi',
    # Engineered columns added to X before this list is applied
    'mass_density_ratio',
    'affinity_valence_ratio',
    'log_thermal_conductivity',
]

# Restrict X to the chosen columns before any splitting, so every split
# shares the same feature set and column order.
# NOTE(review): the feature list comes from a selection step not shown here;
# if that step looked at the full dataset, validation/test scores will be
# optimistic -- confirm.
X = X[selected_features]

# 80/10/10 train/validation/test split, done in two stages:
# stage 1 holds out 20% of the rows ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... stage 2 halves that held-out 20% into validation and test parts.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


# Optuna objective for the LightGBM hyper-parameter search
def objective_lgb(trial):
    """Score one sampled LightGBM configuration.

    Draws a hyper-parameter set from ``trial``, fits an ``LGBMRegressor`` on
    the module-level ``X_train``/``y_train`` and evaluates it on
    ``X_valid``/``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Root-mean-squared error of the fitted model on the validation split.
    """
    # NOTE(review): per the LightGBM docs, row subsampling only takes effect
    # when `subsample_freq` is non-zero; none is set here, so the `subsample`
    # dimension may be inert -- confirm.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 600),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        num_leaves=trial.suggest_int('num_leaves', 20, 150),
        random_state=42,  # fixed seed so trials differ only by their parameters
    )

    regressor = lgb.LGBMRegressor(verbose=-1, **hyperparams)
    regressor.fit(X_train, y_train)

    valid_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(valid_mse)

# Create the results directory *before* the long search. Otherwise a missing
# ./output folder would only surface as FileNotFoundError after all 300 trials
# have run, discarding the whole optimization.
os.makedirs("./output", exist_ok=True)

# Search for the LightGBM configuration with the lowest validation RMSE.
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=300)

# Best LightGBM parameters (the values suggested in the winning trial)
best_params_lgb = study_lgb.best_params

# Print and save LightGBM results immediately; mode "w" starts a fresh
# results file for this run.
print("Best LightGBM Parameters:", best_params_lgb)
with open("./output/bayesian_optimization_results.txt", "w") as f:
    f.write("Best LightGBM Parameters:\n")
    f.write(str(best_params_lgb) + "\n\n")

# Optuna objective for the XGBoost hyper-parameter search
def objective_xgb(trial):
    """Score one sampled XGBoost configuration.

    Draws a hyper-parameter set from ``trial``, fits an ``XGBRegressor``
    (histogram tree method) on the module-level ``X_train``/``y_train`` and
    evaluates it on ``X_valid``/``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Root-mean-squared error of the fitted model on the validation split.
    """
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 600),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0.0, 5.0),
        random_state=42,  # fixed seed so trials differ only by their parameters
    )

    regressor = xgb.XGBRegressor(tree_method="hist", **hyperparams)
    regressor.fit(X_train, y_train)

    valid_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(valid_mse)

# Search for the XGBoost configuration with the lowest validation RMSE.
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=300)

# Best XGBoost parameters (the values suggested in the winning trial)
best_params_xgb = study_xgb.best_params

# Report the result right away and append it to the shared results file.
print("Best XGBoost Parameters:", best_params_xgb)
with open("./output/bayesian_optimization_results.txt", "a") as f:
    f.writelines([
        "Best XGBoost Parameters:\n",
        str(best_params_xgb) + "\n\n",
    ])

# Optimize the blending weight for LightGBM and XGBoost

# Validation predictions of the two tuned models, filled on first use.
# Re-running this cell resets the cache, so newly tuned parameters are
# picked up again.
_blend_valid_preds = {}


def _tuned_validation_predictions():
    """Fit both tuned models once and return their validation predictions.

    The fits do not depend on the blending weight, so repeating them in every
    trial only multiplies the runtime. They are done on the first call and the
    predictions are reused afterwards.

    Returns:
        Tuple ``(preds_lgb, preds_xgb)`` of predictions on ``X_valid``.
    """
    if not _blend_valid_preds:
        # study.best_params contains only the suggested values, so the fixed
        # seed used during tuning has to be re-applied here; otherwise the
        # refitted models are not the ones that were scored by the searches.
        model_lgb = lgb.LGBMRegressor(
            **{**best_params_lgb, 'random_state': 42}, verbose=-1
        )
        model_xgb = xgb.XGBRegressor(
            **{**best_params_xgb, 'random_state': 42}, tree_method="hist"
        )

        model_lgb.fit(X_train, y_train)
        model_xgb.fit(X_train, y_train)

        _blend_valid_preds['lgb'] = model_lgb.predict(X_valid)
        _blend_valid_preds['xgb'] = model_xgb.predict(X_valid)

    return _blend_valid_preds['lgb'], _blend_valid_preds['xgb']


def objective_blend(trial):
    """Score one LightGBM/XGBoost blending weight.

    Args:
        trial: Optuna trial used to sample ``weight_lgb`` in [0.1, 0.9];
            XGBoost receives the complementary weight.

    Returns:
        Root-mean-squared error of the weighted prediction average on the
        validation split.
    """
    weight_lgb = trial.suggest_float('weight_lgb', 0.1, 0.9)
    weight_xgb = 1.0 - weight_lgb

    preds_lgb, preds_xgb = _tuned_validation_predictions()

    blended_preds = (weight_lgb * preds_lgb) + (weight_xgb * preds_xgb)
    return np.sqrt(mean_squared_error(y_valid, blended_preds))

# Search for the blending weight with the lowest validation RMSE.
study_blend = optuna.create_study(direction='minimize')
study_blend.optimize(objective_blend, n_trials=200)

# The two weights sum to one, so only the LightGBM weight is searched.
best_weight_lgb = study_blend.best_params['weight_lgb']
best_weight_xgb = 1.0 - best_weight_lgb

results_path = "./output/bayesian_optimization_results.txt"

# Report the blending result right away and append it to the results file.
print(f"Best Blending Weights: LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}")
with open(results_path, "a") as f:
    f.writelines([
        "Best Blending Weights:\n",
        f"LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}\n\n",
    ])

# Rewrite the file from scratch (mode "w") as one consolidated summary of all
# three searches, replacing the incremental entries written so far.
with open(results_path, "w") as f:
    f.writelines([
        "Best LightGBM Parameters:\n",
        str(best_params_lgb) + "\n\n",
        "Best XGBoost Parameters:\n",
        str(best_params_xgb) + "\n\n",
        "Best Blending Weights:\n",
        f"LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}\n",
    ])

# Final console summary
print("Best LightGBM Parameters:", best_params_lgb)
print("Best XGBoost Parameters:", best_params_xgb)
print(f"Best Blending Weights: LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}")


Results:

[I 2025-02-28 16:51:45,088] Trial 299 finished with value: 8.84463719104261 and parameters: {'n_estimators': 233, 'max_depth': 12, 'learning_rate': 0.09060115024684388, 'subsample': 0.9604645860495126, 'colsample_bytree': 0.5299704519405377, 'num_leaves': 63}. Best is trial 277 with value: 8.512404871788814.


[I 2025-02-28 17:59:19,579] Trial 299 finished with value: 8.675454586299422 and parameters: {'n_estimators': 579, 'max_depth': 8, 'learning_rate': 0.06341887231538701, 'subsample': 0.9902333267253244, 'colsample_bytree': 0.5097482026830921, 'gamma': 3.191952664911134}. Best is trial 85 with value: 8.565670848868228.

[I 2025-02-28 19:02:49,912] Trial 199 finished with value: 8.541009038981985 and parameters: {'weight_lgb': 0.5540972983041471}. Best is trial 129 with value: 8.540319505407359.

Best Blending Weights: LightGBM: 0.5945, XGBoost: 0.4055
Best LightGBM Parameters: {'n_estimators': 531, 'max_depth': 13, 'learning_rate': 0.08858714978625931, 'subsample': 0.9745911652256972, 'colsample_bytree': 0.5495191207658753, 'num_leaves': 52}
Best XGBoost Parameters: {'n_estimators': 567, 'max_depth': 8, 'learning_rate': 0.07801682573293517, 'subsample': 0.9918091768682683, 'colsample_bytree': 0.5113150149536635, 'gamma': 1.895779893865473}
Best Blending Weights: LightGBM: 0.5945, XGBoost: 0.4055
