In [None]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import shuffle

# Load datasets
main_data = pd.read_csv("./data/train.csv")  # Superconductivity dataset
unique_m = pd.read_csv("./data/unique_m.csv")

# Remove 'critical_temp' from unique_m to avoid duplication
unique_m = unique_m.drop(columns=["critical_temp"], errors='ignore')

# The merge below is purely positional (row i of one file is glued to row i of
# the other). pd.concat(axis=1) does not fail on a length mismatch -- it pads
# the shorter frame with NaN -- so verify the row counts up front.
if len(main_data) != len(unique_m):
    raise ValueError(
        "Cannot merge by row position: train.csv has "
        f"{len(main_data)} rows but unique_m.csv has {len(unique_m)} rows"
    )

# Merge datasets assuming rows align (index-based merge)
merged_data = pd.concat([main_data, unique_m], axis=1)

# Define target and features
target = "critical_temp"  # Target variable
X = merged_data.drop(columns=[target, "material"])  # Drop 'material' column
y = merged_data[target]

# Shuffle data before splitting.
# train_test_split also shuffles by default; the explicit shuffle is kept so
# the resulting partitions stay exactly as they were.
X, y = shuffle(X, y, random_state=42)

# Train-validation-test split: 70% train, then the remaining 30% is halved
# into 15% validation and 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train Optimized LightGBM Model
# NOTE(review): LightGBM documents that `subsample` (bagging_fraction) only
# takes effect when `subsample_freq` > 0; it is left unset here, so row
# subsampling is presumably inactive -- confirm whether that was intended.
lgb_params = {
    "n_estimators": 496,
    "max_depth": 15,
    "learning_rate": 0.057878589503943714,
    "subsample": 0.6619352139576826,
    "colsample_bytree": 0.7512301369524537,
    "num_leaves": 148,
    "force_col_wise": True,
}
optimized_lgb = lgb.LGBMRegressor(**lgb_params)
optimized_lgb.fit(X_train, y_train)

# Train Optimized XGBoost Model
xgb_params = {
    "n_estimators": 407,
    "max_depth": 10,
    "learning_rate": 0.02962746174406205,
    "subsample": 0.8786056663685927,
    "colsample_bytree": 0.6260167856358314,
    "gamma": 4.321388407974591,
    "tree_method": "hist",
    "random_state": 42,
}
optimized_xgb = xgb.XGBRegressor(**xgb_params)
optimized_xgb.fit(X_train, y_train)

# Generate predictions for validation and test sets, one model at a time
y_pred_lgb_valid, y_pred_lgb_test = (
    optimized_lgb.predict(features) for features in (X_valid, X_test)
)
y_pred_xgb_valid, y_pred_xgb_test = (
    optimized_xgb.predict(features) for features in (X_valid, X_test)
)

# Use Previously Optimized Weights for Blending
# The XGBoost weight is the complement, so the two weights always sum to 1.
best_weight_lgb = 0.3454  # Previously found weight
best_weight_xgb = 1.0 - best_weight_lgb


def _blend(lgb_pred, xgb_pred):
    """Return the weighted combination of LightGBM and XGBoost predictions."""
    return (best_weight_lgb * lgb_pred) + (best_weight_xgb * xgb_pred)


def _rmse_and_r2(y_true, y_pred):
    """Return (RMSE, R²) of `y_pred` against `y_true`."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, r2_score(y_true, y_pred)


# Apply Best Weights to Validation and Test Sets
y_pred_ensemble_valid = _blend(y_pred_lgb_valid, y_pred_xgb_valid)
y_pred_ensemble_test = _blend(y_pred_lgb_test, y_pred_xgb_test)

# Evaluate Blended Model on Validation and Test Sets
ensemble_rmse_valid, ensemble_r2_valid = _rmse_and_r2(y_valid, y_pred_ensemble_valid)
ensemble_rmse_test, ensemble_r2_test = _rmse_and_r2(y_test, y_pred_ensemble_test)

# Report the metrics for each split, followed by the weights that produced them
print(f"Optimized Weighted Blended Model - Validation RMSE: {ensemble_rmse_valid:.4f}, Validation R²: {ensemble_r2_valid:.4f}")
print(f"Optimized Weighted Blended Model - Test RMSE: {ensemble_rmse_test:.4f}, Test R²: {ensemble_r2_test:.4f}")
print(f"Optimal Blending Weights Used: LightGBM={best_weight_lgb:.4f}, XGBoost={best_weight_xgb:.4f}")

# Learning Curve Analysis
def plot_learning_curve(model, X, y, title, cv=5, train_sizes=None, save_path=None):
    """Plot cross-validated training and validation RMSE against training size.

    Args:
        model: Estimator handed to sklearn's ``learning_curve``.
        X: Feature matrix used for the learning-curve cross-validation.
        y: Target values matching ``X``.
        title: Title drawn on the figure.
        cv: Cross-validation setting forwarded to ``learning_curve``
            (defaults to 5, the previously hard-coded value).
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
            Defaults to ten evenly spaced fractions from 0.1 to 1.0.
        save_path: Optional file path. When given, the figure is written there
            (parent directories are created as needed) before it is shown.
            When ``None`` the figure is only displayed.
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 10)

    train_sizes, train_scores, valid_scores = learning_curve(
        model,
        X,
        y,
        cv=cv,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        train_sizes=train_sizes,
    )

    # The scorer returns negated RMSE, so flip the sign after averaging the
    # per-split scores for each training size.
    train_mean = -np.mean(train_scores, axis=1)
    valid_mean = -np.mean(valid_scores, axis=1)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, train_mean, label="Training RMSE")
    plt.plot(train_sizes, valid_mean, label="Validation RMSE")
    plt.xlabel("Training Size")
    plt.ylabel("RMSE")
    plt.title(title)
    plt.legend()

    if save_path is not None:
        from pathlib import Path

        # Save before show(): some backends discard the figure once shown.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path)

    plt.show()

# Plot learning curves for both models (XGBoost first, then LightGBM),
# each evaluated on the training partition only.
for fitted_model, curve_title in (
    (optimized_xgb, "Learning Curve - XGBoost"),
    (optimized_lgb, "Learning Curve - LightGBM"),
):
    plot_learning_curve(fitted_model, X_train, y_train, curve_title)


Results:
Optimized Weighted Blended Model - Validation RMSE: 9.3620, Validation R²: 0.9225
Optimized Weighted Blended Model - Test RMSE: 9.0471, Test R²: 0.9311
Optimal Blending Weights Used: LightGBM=0.3454, XGBoost=0.6546

The RMSE is worse than before, likely owing to less data being available for training under the 70/15/15 train/validation/test split.

Overfitting doesn't seem to be a problem, but more data would be good (not available). Propose increasing the split to 80/10/10 (more training data) and trying some more feature engineering.

Learning curves are saved as .png files in the /output folder; will upload to Slack.