In [None]:
# This notebook tries Bayesian optimization (Optuna, 50 trials) on LightGBM using a reduced feature set for sepped.

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# The two tables are joined purely by row position below, so they must have
# the same number of rows. Without this check pd.concat(axis=1) would silently
# pad the shorter table with NaN rows instead of failing.
if len(main_data) != len(unique_m):
    raise ValueError(
        f"Row count mismatch: train.csv has {len(main_data)} rows, "
        f"unique_m.csv has {len(unique_m)} rows; cannot merge by position."
    )

# Remove 'critical_temp' from unique_m to avoid a duplicated target column
# (errors='ignore' makes this a no-op when the column is absent)
unique_m = unique_m.drop(columns=["critical_temp"], errors="ignore")

# Merge datasets side by side (index-based: row i of one file is paired with
# row i of the other)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Train-test split: 80% train / 20% test, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Importance Threshold for Selection
# NOTE(review): this threshold is not referenced anywhere else in the visible
# cell, so no feature reduction is actually applied here - confirm whether a
# selection step is missing or lives in another cell.
importance_threshold = 0.005  # Keep features with importance > 0.5%

# Define Bayesian Optimization for LightGBM
def objective_lgb(trial):
    """Optuna objective: validation RMSE of a LightGBM regressor for one trial.

    Hyperparameters are sampled from the trial, a model is fitted on part of
    the training data, and the RMSE on a validation slice carved out of the
    training data is returned.

    The hold-out test set (X_test / y_test) is deliberately NOT used here:
    selecting hyperparameters on the test set would leak it into model
    selection and make the test metrics reported afterwards optimistic.

    Args:
        trial: the optuna trial used to sample hyperparameters.

    Returns:
        RMSE on the validation slice (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        # log=True samples the learning rate uniformly in log space
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        # NOTE(review): per the LightGBM parameter docs, row subsampling is only
        # performed when subsample_freq (bagging_freq) > 0; with the default of
        # 0 this value likely has no effect - confirm, and if bagging is wanted
        # set subsample_freq both here and on the final model.
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "random_state": 42
    }

    # Fixed random_state -> every trial is scored on the same validation rows,
    # so trial scores are comparable with each other.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    model = lgb.LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    return val_rmse

# Run Bayesian Optimization for LightGBM
# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the selected best_params) is repeatable across kernel restarts.
study_lgb = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=50)

# Train Optimized LightGBM Model
# best_params only holds the values suggested through the trial; the fixed
# random_state used during the search has to be passed again explicitly so the
# final model is built with the same configuration that was tuned.
best_params_lgb = study_lgb.best_params
optimized_lgb = lgb.LGBMRegressor(**best_params_lgb, random_state=42)
optimized_lgb.fit(X_train, y_train)

# Evaluate Optimized LightGBM Performance
# Predict once per split and reuse the predictions for both metrics.
train_pred_lgb_opt = optimized_lgb.predict(X_train)
test_pred_lgb_opt = optimized_lgb.predict(X_test)

train_rmse_lgb_opt = np.sqrt(mean_squared_error(y_train, train_pred_lgb_opt))
test_rmse_lgb_opt = np.sqrt(mean_squared_error(y_test, test_pred_lgb_opt))
train_r2_lgb_opt = r2_score(y_train, train_pred_lgb_opt)
test_r2_lgb_opt = r2_score(y_test, test_pred_lgb_opt)

print(f"Optimized LightGBM - Train RMSE: {train_rmse_lgb_opt:.4f}, Train R²: {train_r2_lgb_opt:.4f}")
print(f"Optimized LightGBM - Test RMSE: {test_rmse_lgb_opt:.4f}, Test R²: {test_r2_lgb_opt:.4f}")

# Report Best Parameters (printed only; nothing is written to disk)
print("Best LightGBM Parameters:")
print(best_params_lgb)


Results:

Optimized LightGBM - Train RMSE: 5.0729, Train R²: 0.9782
Optimized LightGBM - Test RMSE: 8.6114, Test R²: 0.9356
Best LightGBM Parameters:
{'n_estimators': 496, 'max_depth': 15, 'learning_rate': 0.057878589503943714, 'subsample': 0.6619352139576826, 'colsample_bytree': 0.7512301369524537, 'num_leaves': 148}

