In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Read the two input tables: the main feature table (superconductivity dataset)
# and its companion table unique_m.
main_data = pd.read_csv("./data/train.csv")
# unique_m may carry its own 'critical_temp'; strip it here so the column-wise
# concat below does not produce a duplicated target column.
unique_m = pd.read_csv("./data/unique_m.csv").drop(
    columns=["critical_temp"], errors="ignore"
)

# Side-by-side concat aligned on the row index; this assumes both files list
# their rows in the same order.
merged_data = pd.concat([main_data, unique_m], axis=1)

# Separate the regression target from the predictors. The 'material' column is
# removed from the predictors together with the target.
target = "critical_temp"
y = merged_data[target]
X = merged_data.drop(columns=[target, "material"])

# Engineered features: two ratio columns and a log-compressed thermal
# conductivity range. The 1e-9 added to each denominator guards against
# division by zero.
X = X.assign(
    mass_density_ratio=X["wtd_mean_atomic_mass"].div(
        X["wtd_mean_Density"].add(1e-9)
    ),
    affinity_valence_ratio=X["wtd_mean_ElectronAffinity"].div(
        X["wtd_mean_Valence"].add(1e-9)
    ),
    # log1p(x) = log(1 + x), so a zero range maps to zero
    log_thermal_conductivity=np.log1p(X["range_ThermalConductivity"]),
)


# Columns retained by an earlier feature-selection step (cutoff of 0.0005 or
# better). The order of this list becomes the column order of X.
selected_features = [
    "wtd_entropy_ElectronAffinity",
    "wtd_entropy_atomic_mass",
    "wtd_range_atomic_radius",
    "wtd_range_fie",
    "wtd_std_atomic_radius",
    "wtd_range_Valence",
    "wtd_entropy_FusionHeat",
    "wtd_entropy_Valence",
    "wtd_entropy_Density",
    "wtd_entropy_fie",
    "wtd_entropy_ThermalConductivity",
    "mass_density_ratio",
    "wtd_range_atomic_mass",
    "wtd_std_ThermalConductivity",
    "wtd_entropy_atomic_radius",
    "wtd_mean_Density",
    "wtd_mean_fie",
    "wtd_gmean_fie",
    "wtd_mean_ThermalConductivity",
    "affinity_valence_ratio",
    "wtd_std_ElectronAffinity",
    "wtd_mean_atomic_radius",
    "wtd_std_Valence",
    "wtd_gmean_FusionHeat",
    "wtd_std_atomic_mass",
    "wtd_std_fie",
    "wtd_gmean_Density",
    "wtd_mean_atomic_mass",
    "wtd_range_ElectronAffinity",
    "wtd_gmean_ThermalConductivity",
    "log_thermal_conductivity",
    "range_ThermalConductivity",
    "Cu",
    "range_atomic_radius",
    "Ca",
    "Ba",
    "As",
    "gmean_Valence",
    "Zn",
    "Pr",
    "Fe",
    "mean_Valence",
    "std_atomic_mass",
    "gmean_Density",
    "Mg",
    "wtd_mean_Valence",
    "gmean_ElectronAffinity",
    "wtd_gmean_Valence",
    "range_fie",
    "Hg",
    "mean_ThermalConductivity",
    "entropy_Density",
    "Cl",
    "O",
    "std_Density",
]

# Restrict the predictors to the selected columns before any splitting happens.
X = X.loc[:, selected_features]


# 80/10/10 split in two stages: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test sets. Both stages use the
# same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, shuffle=True, random_state=42
)


# Define the objective function for Bayesian Optimization
def objective_lgb(trial: "optuna.trial.Trial") -> float:
    """Train one LightGBM candidate and return its validation RMSE.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    module-level ``X_train``/``y_train`` and scored on ``X_valid``/``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only performs row subsampling when subsample_freq
        # (bagging_freq) is positive; with the default of 0 the sampled
        # 'subsample' value would be silently ignored. Sampling it through the
        # trial also records it in study.best_params for later refits.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'random_state': 42
    }
    model = lgb.LGBMRegressor(**params, verbose=-1)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return np.sqrt(mean_squared_error(y_valid, preds))

# Make sure the results directory exists before the long search starts, so the
# write below cannot fail with FileNotFoundError after all trials have run.
os.makedirs("./output", exist_ok=True)

# Run the LightGBM search. TPE is Optuna's default sampler; seeding it makes the
# sequence of suggested hyperparameters reproducible, in line with the fixed
# random_state used by the models.
study_lgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=150)

# Best LightGBM parameters
best_params_lgb = study_lgb.best_params

# Print and save LightGBM results immediately ("w" starts a fresh results file)
print("Best LightGBM Parameters:", best_params_lgb)
with open("./output/bayesian_optimization_results.txt", "w") as f:
    f.write("Best LightGBM Parameters:\n")
    f.write(str(best_params_lgb) + "\n\n")

# Optuna objective for the XGBoost regressor
def objective_xgb(trial):
    """Train one XGBoost candidate and return its validation RMSE.

    The hyperparameters are drawn from ``trial``. The regressor is fitted on
    the module-level ``X_train``/``y_train`` and evaluated on
    ``X_valid``/``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 600),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0.0, 5.0),
        random_state=42,
    )
    regressor = xgb.XGBRegressor(tree_method="hist", **search_space)
    regressor.fit(X_train, y_train)
    validation_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    return np.sqrt(validation_mse)

# Run the XGBoost search. TPE is Optuna's default sampler; seeding it makes the
# sequence of suggested hyperparameters reproducible, in line with the fixed
# random_state used by the models.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=150)

# Best XGBoost parameters
best_params_xgb = study_xgb.best_params

# Print and save XGBoost results immediately ("a" appends to the results file)
print("Best XGBoost Parameters:", best_params_xgb)
with open("./output/bayesian_optimization_results.txt", "a") as f:
    f.write("Best XGBoost Parameters:\n")
    f.write(str(best_params_xgb) + "\n\n")

# Optimize the blending weight for LightGBM and XGBoost
#
# The two tuned models do not depend on the blending weight, so they are fitted
# and scored on the validation split exactly once here instead of inside every
# trial. random_state=42 is re-applied because it was a fixed (not sampled)
# value during tuning and is therefore absent from study.best_params; without
# it the refitted models would not match the ones that were tuned.
_blend_model_lgb = lgb.LGBMRegressor(
    **{**best_params_lgb, 'random_state': 42}, verbose=-1
)
_blend_model_xgb = xgb.XGBRegressor(
    **{**best_params_xgb, 'random_state': 42}, tree_method="hist"
)
_blend_model_lgb.fit(X_train, y_train)
_blend_model_xgb.fit(X_train, y_train)
_blend_preds_lgb = _blend_model_lgb.predict(X_valid)
_blend_preds_xgb = _blend_model_xgb.predict(X_valid)


def objective_blend(trial):
    """Return the validation RMSE of a weighted LightGBM/XGBoost blend.

    Only the blending weight is sampled; the per-model validation predictions
    are the precomputed module-level arrays.

    Args:
        trial: Optuna trial used to sample ``weight_lgb`` in [0.1, 0.9]. The
            XGBoost weight is its complement, ``1 - weight_lgb``.

    Returns:
        Root-mean-squared error of the blended predictions on the validation
        split (lower is better).
    """
    weight_lgb = trial.suggest_float('weight_lgb', 0.1, 0.9)
    weight_xgb = 1.0 - weight_lgb

    blended_preds = (weight_lgb * _blend_preds_lgb) + (weight_xgb * _blend_preds_xgb)
    return np.sqrt(mean_squared_error(y_valid, blended_preds))

# Run the blending-weight search. TPE is Optuna's default sampler; seeding it
# makes the sequence of suggested weights reproducible.
study_blend = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_blend.optimize(objective_blend, n_trials=50)

# Best blending weights (the XGBoost weight is the complement of the LightGBM
# weight). Computed once and reused for printing and saving.
best_weight_lgb = study_blend.best_params['weight_lgb']
best_weight_xgb = 1.0 - best_weight_lgb

# Print and save Blending results immediately ("a" appends to the results file)
print(f"Best Blending Weights: LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}")
with open("./output/bayesian_optimization_results.txt", "a") as f:
    f.write("Best Blending Weights:\n")
    f.write(f"LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}\n\n")

# Final summary: rewrite the results file from scratch ("w" truncates whatever
# was written to it earlier) with all three results in one place.
summary_lines = [
    "Best LightGBM Parameters:\n",
    str(best_params_lgb) + "\n\n",
    "Best XGBoost Parameters:\n",
    str(best_params_xgb) + "\n\n",
    "Best Blending Weights:\n",
    f"LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}\n",
]
with open("./output/bayesian_optimization_results.txt", "w") as report:
    report.writelines(summary_lines)

# Echo the same results to stdout
print("Best LightGBM Parameters:", best_params_lgb)
print("Best XGBoost Parameters:", best_params_xgb)
print(f"Best Blending Weights: LightGBM: {best_weight_lgb:.4f}, XGBoost: {best_weight_xgb:.4f}")


Results:

[I 2025-02-28 00:31:34,153] Trial 149 finished with value: 8.707895416186213 and parameters: {'n_estimators': 542, 'max_depth': 8, 'learning_rate': 0.029919215957583872, 'subsample': 0.9734247492846361, 'colsample_bytree': 0.5015685193545648, 'gamma': 0.5053168584372654}. Best is trial 142 with value: 8.67628548053857.
[I 2025-02-28 00:31:34,156] A new study created in memory with name: no-name-79369e40-55e0-4905-9693-22b17e1b368f


[I 2025-02-28 00:47:56,262] Trial 49 finished with value: 8.67239680831487 and parameters: {'weight_lgb': 0.6891019320589149}. Best is trial 32 with value: 8.66722659322327.


Best Blending Weights: LightGBM: 0.5496, XGBoost: 0.4504
Best LightGBM Parameters: {'n_estimators': 571, 'max_depth': 15, 'learning_rate': 0.04733890495767294, 'subsample': 0.5477546247338764, 'colsample_bytree': 0.5000311167875034, 'num_leaves': 82}
Best XGBoost Parameters: {'n_estimators': 579, 'max_depth': 9, 'learning_rate': 0.03147254688837867, 'subsample': 0.9703470640547884, 'colsample_bytree': 0.5104011133022952, 'gamma': 0.23681375835515578}
Best Blending Weights: LightGBM: 0.5496, XGBoost: 0.4504


