<a href="https://colab.research.google.com/github/BalajiMittapalli/Zelestra-Challenge/blob/main/AWS_Ascends.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders optuna xgboost lightgbm catboost scikit-learn pandas numpy



In [None]:
# Cell 1: Installs and Imports
!pip install category_encoders optuna xgboost lightgbm catboost scikit-learn pandas numpy -q

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import mean_squared_error
import warnings

# Silence noisy warning categories so the HPO logs stay readable:
# Optuna's ExperimentalWarning plus the generic UserWarning and FutureWarning classes.
# NOTE(review): ignoring UserWarning/FutureWarning globally may also hide genuine
# library warnings - consider narrowing these filters.
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
warnings.filterwarnings("ignore", category=UserWarning) # General user warnings often from libraries
warnings.filterwarnings("ignore", category=FutureWarning)

# Shared seed passed to the Optuna samplers, the KFold splitters and the model constructors below
RANDOM_STATE = 42

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Cell 2: Load Data
# Both CSVs are read from the Colab working directory (/content).
df_train, df_test = map(pd.read_csv, ("/content/train.csv", "/content/test.csv"))

# Quick sanity check: (rows, columns) of each split.
print("Train data shape:", df_train.shape)
print("Test data shape:", df_test.shape)

Train data shape: (20000, 17)
Test data shape: (12000, 16)


In [None]:
# Cell 3: Separate Target & IDs, Initial Feature Lists
# Feature frames: drop the identifier (and, for train, the target) column-wise.
X = df_train.drop(columns=["id", "efficiency"])
X_test = df_test.drop(columns=["id"])

# Target vector and test identifiers, kept as arrays for modelling / submission.
y = df_train["efficiency"].values
test_ids = df_test["id"].values

# Raw numeric columns as delivered in the CSVs (before any feature engineering).
original_numerical_feats = [
    "temperature",
    "irradiance",
    "humidity",
    "soiling_ratio",
    "voltage",
    "current",
    "module_temperature",
    "cloud_coverage",
    "wind_speed",
    "pressure",
    "panel_age",
    "maintenance_count",
]
# Categorical columns, grouped by the encoder they are routed to in the preprocessor.
low_card_cat = ["installation_type"]
high_card_cat = ["string_id", "error_code"]

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("y shape:", y.shape)

X shape: (20000, 15)
X_test shape: (12000, 15)
y shape: (20000,)


In [None]:
# Cell 4: Coerce Numerical Features to Numeric Types
# Runs before feature engineering because the engineered columns are arithmetic
# combinations of these raw columns.
print("Coercing numerical features to numeric types...")
for df in (X, X_test):
    for num_col in original_numerical_feats:
        # errors='coerce' turns any value that cannot be parsed as a number into NaN.
        df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

# Total NaN count over the raw numeric columns (pre-existing gaps plus coerced values).
print(f"NaNs in X after coercion: {X[original_numerical_feats].isna().sum().sum()}")
print(f"NaNs in X_test after coercion: {X_test[original_numerical_feats].isna().sum().sum()}")
print("Coercion complete.")

Coercing numerical features to numeric types...
NaNs in X after coercion: 9375
NaNs in X_test after coercion: 5538
Coercion complete.


In [None]:
# Cell 5: Feature Engineering
def _add_engineered_features(frame):
    """Append the engineered numeric columns to ``frame`` in place.

    Every new column is an arithmetic combination of the raw numeric columns,
    so a NaN in any input propagates to the derived value.
    """
    # Electrical power proxy and module-vs-ambient temperature gap
    frame['power_approx'] = frame['voltage'] * frame['current']
    frame['temp_delta'] = frame['module_temperature'] - frame['temperature']

    # Panel-age interactions
    frame['soiling_impact_aged'] = frame['soiling_ratio'] * (frame['panel_age'] + 1)
    # The small epsilon keeps the ratio finite when panel_age is 0
    frame['maintenance_per_year'] = frame['maintenance_count'] / (frame['panel_age'] + 1e-6)

    # Environmental interactions
    frame['irradiance_cloud_effect'] = frame['irradiance'] * (1 - frame['cloud_coverage']/100)
    # The small epsilon keeps the ratio finite when wind_speed is 0
    frame['temp_rise_potential'] = frame['irradiance'] / (frame['wind_speed'] + 1e-6)

    # Simple quadratic terms for key variables
    for base in ('temperature', 'irradiance', 'module_temperature'):
        frame[base + '_sq'] = frame[base]**2

    # Remaining pairwise interactions
    frame['age_x_maintenance'] = frame['panel_age'] * frame['maintenance_count']
    frame['humidity_temp_interaction'] = frame['humidity'] * frame['temperature']


print("Starting Feature Engineering...")
for df in (X, X_test):
    _add_engineered_features(df)
print("Feature Engineering Complete.")

# Total NaN count per frame after the new columns were added
print(f"NaNs in X after new FE: {X.isnull().sum().sum()}")
print(f"NaNs in X_test after new FE: {X_test.isnull().sum().sum()}")

Starting Feature Engineering...
Feature Engineering Complete.
NaNs in X after FE (power_approx): 1925
NaNs in X_test after FE (temp_delta): 1134


In [None]:
# Cell 6: Updated Feature Lists (Post-Engineering)

# Raw numeric columns (same names as the list defined in Cell 3)
original_numerical_feats_from_cell3 = [
    "temperature",
    "irradiance",
    "humidity",
    "soiling_ratio",
    "voltage",
    "current",
    "module_temperature",
    "cloud_coverage",
    "wind_speed",
    "pressure",
    "panel_age",
    "maintenance_count",
]

# Columns created by the feature-engineering cell
engineered_numerical_feats = [
    'power_approx',
    'temp_delta',
    'soiling_impact_aged',
    'maintenance_per_year',
    'irradiance_cloud_effect',
    'temp_rise_potential',
    'temperature_sq',
    'irradiance_sq',
    'module_temperature_sq',
    'age_x_maintenance',
    'humidity_temp_interaction',
]

# Full numeric feature list handed to the preprocessor: raw columns first, then engineered ones
numerical_feats = [*original_numerical_feats_from_cell3, *engineered_numerical_feats]

# Sanity check: warn about any listed feature that is absent from either frame
for df_check in (X, X_test):
    for absent in (name for name in numerical_feats if name not in df_check.columns):
        print(f"WARNING: Feature {absent} not found in DataFrame columns after FE!")


print("Final numerical features:", numerical_feats)

Final numerical features: ['temperature', 'irradiance', 'humidity', 'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure', 'panel_age', 'maintenance_count', 'power_approx', 'temp_delta']


In [None]:
# Cell 7: Preprocessing Pipelines Definition

# Numeric columns: median imputation followed by standardisation
num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='median')),
    ("scale", StandardScaler()),
])

# Low-cardinality categoricals: mode imputation, then dense one-hot encoding
# (categories unseen at fit time are ignored at transform time)
cat_low_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# High-cardinality categoricals: missing values become an explicit category
# before smoothed target encoding
cat_high_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='constant', fill_value='_MISSING_CAT_')),
    ("target_enc", TargetEncoder(min_samples_leaf=50, smoothing=20, handle_unknown='value', handle_missing='value')),
])

# Route each column group to its pipeline; columns not listed are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numerical_feats),
        ("cat_low", cat_low_pipe, low_card_cat),
        ("cat_high", cat_high_pipe, high_card_cat),
    ],
    remainder='drop',
    n_jobs=-1,
)

print("Preprocessor defined.")

Preprocessor defined.


In [None]:
# Cell 8: LightGBM Optuna Objective and Optimization
def objective_lgb(trial, X_data, y_data):
    """Optuna objective: mean 5-fold CV RMSE of the preprocessing + LightGBM pipeline.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_data: Feature table passed to ``cross_val_score``.
        y_data: Target values aligned with ``X_data``.

    Returns:
        Mean RMSE across the folds (lower is better).
    """
    # Sampled hyperparameters; the suggestion order is kept fixed so a seeded
    # sampler reproduces the same trials.
    search_space = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    # Fixed settings: L1 (MAE) training objective, 1500 trees, silent training.
    regressor = LGBMRegressor(
        objective='regression_l1',
        metric='rmse',
        random_state=RANDOM_STATE,
        n_estimators=1500,
        verbosity=-1,
        **search_space,
    )
    pipeline = Pipeline(steps=[('prep', preprocessor), ('lgbm', regressor)])

    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    # The scorer returns negated RMSE per fold; two workers limit CPU use during HPO.
    neg_rmse_per_fold = cross_val_score(
        pipeline,
        X_data,
        y_data,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        n_jobs=2,
    )
    return -neg_rmse_per_fold.mean()

print("Optimizing LightGBM...")
# Seeded TPE sampler so the sequence of trials is reproducible.
study_lgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
# Trials run one at a time; the parallelism lives inside cross_val_score.
study_lgb.optimize(
    lambda t: objective_lgb(t, X, y),
    n_trials=75,
    n_jobs=1,
)
best_lgb_params = study_lgb.best_params
print("Best LightGBM CV RMSE:", study_lgb.best_value)
print("Best LightGBM parameters:", best_lgb_params)

[I 2025-06-07 10:41:22,272] A new study created in memory with name: no-name-cc5f72ec-5d57-4fea-a116-4e837c29c31f


Optimizing LightGBM...


[I 2025-06-07 10:41:49,935] Trial 0 finished with value: 0.1036918465391868 and parameters: {'learning_rate': 0.015355286838886862, 'num_leaves': 287, 'max_depth': 10, 'min_child_samples': 62, 'feature_fraction': 0.5780093202212182, 'bagging_fraction': 0.5779972601681014, 'bagging_freq': 1, 'lambda_l1': 0.6245760287469893, 'lambda_l2': 0.002570603566117598}. Best is trial 0 with value: 0.1036918465391868.
[I 2025-06-07 10:42:06,486] Trial 1 finished with value: 0.10383732687205402 and parameters: {'learning_rate': 0.04170553216181044, 'num_leaves': 25, 'max_depth': 12, 'min_child_samples': 84, 'feature_fraction': 0.6061695553391381, 'bagging_fraction': 0.5909124836035503, 'bagging_freq': 2, 'lambda_l1': 5.472429642032198e-06, 'lambda_l2': 0.00052821153945323}. Best is trial 0 with value: 0.1036918465391868.
[I 2025-06-07 10:42:38,001] Trial 2 finished with value: 0.10361918407348605 and parameters: {'learning_rate': 0.018236581424556055, 'num_leaves': 101, 'max_depth': 9, 'min_child_sa

Best LightGBM CV RMSE: 0.10357193330632464
Best LightGBM parameters: {'learning_rate': 0.010768024916395167, 'num_leaves': 142, 'max_depth': 5, 'min_child_samples': 30, 'feature_fraction': 0.6810422728033312, 'bagging_fraction': 0.6272672891085348, 'bagging_freq': 2, 'lambda_l1': 2.7952651140818567e-05, 'lambda_l2': 0.009972883683414422}


In [None]:
# Cell 8.1: Save Best LightGBM Parameters to JSON
import json

# Persist the tuned LightGBM parameters so they can be reloaded later without
# repeating the Optuna search. Requires the optimization cell to have run.
if not ('best_lgb_params' in locals() and best_lgb_params):
    print("Error: 'best_lgb_params' not found or is empty. Please run the LightGBM optimization cell first.")
else:
    lgb_params_filename = "best_lgb_params.json"
    with open(lgb_params_filename, 'w') as f:
        # indent=4 keeps the file human-readable
        json.dump(best_lgb_params, f, indent=4)
    print(f"Best LightGBM parameters saved to {lgb_params_filename}")

Best LightGBM parameters saved to best_lgb_params.json


In [None]:
# Cell 9: CatBoost Optuna Objective and Optimization (Minimally Changed for Speed - Recommended Iterations)
def objective_cat(trial, X_data, y_data):
    """Optuna objective: mean 5-fold CV RMSE of the preprocessing + CatBoost pipeline.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_data: Feature table passed to ``cross_val_score``.
        y_data: Target values aligned with ``X_data``.

    Returns:
        Mean RMSE across the folds (lower is better).
    """
    cat_params = {
        'loss_function': 'RMSE',
        # Deliberately small iteration budget to keep each trial fast.
        'iterations': 300,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'depth': trial.suggest_int('depth', 4, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'random_seed': RANDOM_STATE,
        'thread_count': 2,
    }

    # Only the MVS and Bernoulli bootstraps get a sampled 'subsample' here; each
    # uses its own trial-parameter name, and Bayesian gets none.
    subsample_trial_names = {
        'MVS': 'subsample_mvs',
        'Bernoulli': 'subsample_bernoulli',
    }
    trial_name = subsample_trial_names.get(cat_params['bootstrap_type'])
    if trial_name is not None:
        cat_params['subsample'] = trial.suggest_float(trial_name, 0.1, 1.0)

    pipeline = Pipeline(steps=[
        ('prep', preprocessor),
        ('cat', CatBoostRegressor(**cat_params, verbose=0)),
    ])
    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    # The scorer returns negated RMSE per fold.
    neg_rmse_per_fold = cross_val_score(
        pipeline,
        X_data,
        y_data,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        n_jobs=2,
    )
    return -neg_rmse_per_fold.mean()

print("Optimizing CatBoost (Recommended 'minimum best' iterations for HPO speed)...")
# Seeded TPE sampler so the sequence of trials is reproducible.
study_cat = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
# Trials run one at a time; the parallelism lives inside cross_val_score.
study_cat.optimize(
    lambda t: objective_cat(t, X, y),
    n_trials=50,
    n_jobs=1,
)
best_cat_params = study_cat.best_params
print("Best CatBoost CV RMSE (from faster HPO):", study_cat.best_value)
print("Best CatBoost parameters (from faster HPO):", best_cat_params)

# REMEMBER: For the final model in Cell 11, INCREASE 'iterations' significantly
# (e.g., back to 1500, 2000, or even more based on best_cat_params).
# Example for Cell 11:
# final_cat_params = best_cat_params.copy()
# final_cat_params['iterations'] = 2000 # Use a larger number for the final model
# # Ensure the learning rate found by Optuna is appropriate or adjust if needed for more iterations.
# # Sometimes, if HPO used a higher LR due to fewer iterations, you might want to slightly
# # decrease the LR for the final model with many more iterations.
# # However, often the Optuna-found LR works well enough.
# ... rest of final_cat_params update ...

[I 2025-06-07 12:13:28,151] A new study created in memory with name: no-name-7db1c9b3-f550-4611-8260-468f0e58748c


Optimizing CatBoost (Recommended 'minimum best' iterations for HPO speed)...


[I 2025-06-07 12:15:23,971] Trial 0 finished with value: 0.10418672957920339 and parameters: {'learning_rate': 0.030710573677773714, 'depth': 12, 'l2_leaf_reg': 1.5702970884055387, 'border_count': 166, 'bootstrap_type': 'Bayesian', 'grow_policy': 'SymmetricTree'}. Best is trial 0 with value: 0.10418672957920339.
[I 2025-06-07 12:16:24,904] Trial 1 finished with value: 0.10475437627144443 and parameters: {'learning_rate': 0.010636066512540286, 'depth': 12, 'l2_leaf_reg': 3.142880890840109, 'border_count': 79, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'subsample_mvs': 0.6506676052501416}. Best is trial 0 with value: 0.10418672957920339.
[I 2025-06-07 12:16:42,626] Trial 2 finished with value: 0.10337334150998294 and parameters: {'learning_rate': 0.01518747922672247, 'depth': 6, 'l2_leaf_reg': 0.1256277350380703, 'border_count': 134, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Lossguide'}. Best is trial 2 with value: 0.10337334150998294.
[I 2025-06-07 12:16:47,862] Trial 

Best CatBoost CV RMSE (from faster HPO): 0.10289498361813935
Best CatBoost parameters (from faster HPO): {'learning_rate': 0.033012966807138715, 'depth': 4, 'l2_leaf_reg': 8.361927793592796, 'border_count': 208, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise'}


In [None]:
import json

# Persist the tuned CatBoost parameters (the best_cat_params dictionary only).
filename_cat_params = "best_catboost_hpo_params.json"
try:
    with open(filename_cat_params, 'w') as out_file:
        json.dump(best_cat_params, out_file, indent=4)
except Exception as err:
    # Best-effort save: report the problem instead of stopping the notebook.
    print(f"Error saving CatBoost parameters to JSON: {err}")
else:
    print(f"Successfully saved Best CatBoost parameters to {filename_cat_params}")

Successfully saved Best CatBoost parameters to best_catboost_hpo_params.json


In [None]:
# Cell 10: XGBoost Optuna Objective and Optimization (Minimally Changed for Speed - Faster Trials)
def objective_xgb(trial, X_data, y_data):
    """Optuna objective: mean 5-fold CV RMSE of the preprocessing + XGBoost pipeline.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_data: Feature table passed to ``cross_val_score``.
        y_data: Target values aligned with ``X_data``.

    Returns:
        Mean RMSE across the folds (lower is better).
    """
    # Sampled hyperparameters; the suggestion order is kept fixed so a seeded
    # sampler reproduces the same trials.
    search_space = {
        'eta': trial.suggest_float('eta', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }
    # Fixed settings: squared-error objective and a small tree budget (300) so
    # each trial stays fast.
    regressor = XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=2,
        tree_method='hist',
        **search_space,
    )
    pipeline = Pipeline(steps=[('prep', preprocessor), ('xgb', regressor)])

    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    # The scorer returns negated RMSE per fold.
    neg_rmse_per_fold = cross_val_score(
        pipeline,
        X_data,
        y_data,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        n_jobs=2,
    )
    return -neg_rmse_per_fold.mean()

print("Optimizing XGBoost (Minimally changed: fewer estimators per HPO trial)...")
# Seeded TPE sampler so the sequence of trials is reproducible.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
# Trials run one at a time; the parallelism lives inside cross_val_score.
study_xgb.optimize(
    lambda t: objective_xgb(t, X, y),
    n_trials=40,
    n_jobs=1,
)
best_xgb_params = study_xgb.best_params
print("Best XGBoost CV RMSE (from faster HPO - expect this RMSE to be higher):", study_xgb.best_value)
print("Best XGBoost parameters (from faster HPO):", best_xgb_params)

# Persist the tuned XGBoost parameters for later reuse.
import json
filename_xgb_params = "best_xgboost_hpo_params_fast_trials.json"
try:
    with open(filename_xgb_params, 'w') as out_file:
        json.dump(best_xgb_params, out_file, indent=4)
except Exception as err:
    # Best-effort save: report the problem instead of stopping the notebook.
    print(f"Error saving XGBoost parameters to JSON: {err}")
else:
    print(f"Successfully saved Best XGBoost parameters (fast trials) to {filename_xgb_params}")

# In Cell 11 for xgb_final:
# final_xgb_params = best_xgb_params.copy()
# final_xgb_params['n_estimators'] = 1500 # Or 2000, your original target for final model
# # The 'eta' found by Optuna will be used.
# final_xgb_params.update({ ... other essential final params ...})
# xgb_final = Pipeline(...)

[I 2025-06-07 12:48:49,077] A new study created in memory with name: no-name-e4f4506d-036a-4c81-be5d-c89c8393c3fe


Optimizing XGBoost (Minimally changed: fewer estimators per HPO trial)...


[I 2025-06-07 12:49:22,773] Trial 0 finished with value: 0.104960582461162 and parameters: {'eta': 0.030710573677773714, 'max_depth': 12, 'subsample': 0.8659969709057025, 'colsample_bytree': 0.7993292420985183, 'min_child_weight': 4, 'gamma': 1.7699302940633311e-07, 'lambda': 3.3323645788192616e-08, 'alpha': 0.6245760287469893}. Best is trial 0 with value: 0.104960582461162.
[I 2025-06-07 12:49:33,864] Trial 1 finished with value: 0.10698863244395276 and parameters: {'eta': 0.06054365855469246, 'max_depth': 10, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'min_child_weight': 17, 'gamma': 4.997040685255803e-07, 'lambda': 4.329370014459266e-07, 'alpha': 4.4734294104626844e-07}. Best is trial 0 with value: 0.104960582461162.
[I 2025-06-07 12:49:41,625] Trial 2 finished with value: 0.10398293261859329 and parameters: {'eta': 0.024878734419814436, 'max_depth': 8, 'subsample': 0.7159725093210578, 'colsample_bytree': 0.645614570099021, 'min_child_weight': 13, 'gamm

Best XGBoost CV RMSE (from faster HPO - expect this RMSE to be higher): 0.10301383586503032
Best XGBoost parameters (from faster HPO): {'eta': 0.030881651323450904, 'max_depth': 4, 'subsample': 0.8788239775227437, 'colsample_bytree': 0.7807231174177514, 'min_child_weight': 10, 'gamma': 0.020042881084584397, 'lambda': 0.02573487256683641, 'alpha': 0.7541782189867495}
Successfully saved Best XGBoost parameters (fast trials) to best_xgboost_hpo_params_fast_trials.json


In [None]:
# Cell 11: Final Model Definitions (Base Learners)
import json # Ensure json is imported for loading

print("Building final model pipelines by loading HPO parameters from JSON...")


def load_hpo_params(filename, in_memory=None):
    """Return the tuned hyperparameters stored in the JSON file ``filename``.

    The file is looked up first at ``filename`` as given (the HPO cells save
    with relative names, i.e. into the working directory) and then under
    ``/content``.

    Args:
        filename: Name of the JSON file written by one of the HPO cells.
        in_memory: Optional dict of parameters already held by the kernel; used
            as a fallback so valid in-session results are not thrown away when
            no readable file exists.

    Returns:
        The loaded dict, a copy of ``in_memory`` when no readable file is found
        and ``in_memory`` is non-empty, or an empty dict otherwise (callers
        treat an empty dict as "skip this learner").
    """
    for path in (filename, "/content/" + filename):
        try:
            with open(path, 'r') as f:
                params = json.load(f)
        except FileNotFoundError:
            continue
        except json.JSONDecodeError:
            print(f"ERROR: Could not decode JSON from {path}!")
            continue
        print(f"Successfully loaded parameters from {path}")
        return params
    if in_memory:
        print(f"WARNING: {filename} could not be loaded; using the parameters already in memory.")
        return dict(in_memory)
    print(f"ERROR: {filename} not found! Using empty dict.")
    return {}


def normalize_catboost_params(params):
    """Return a copy of ``params`` that is valid as CatBoost constructor arguments.

    The CatBoost objective samples the subsample rate under the trial names
    'subsample_mvs' / 'subsample_bernoulli', and those names are what ends up
    in Optuna's best_params. CatBoost only knows 'subsample', so the
    trial-specific name is mapped back here.
    """
    normalized = dict(params)
    for trial_key in ('subsample_mvs', 'subsample_bernoulli'):
        if trial_key in normalized:
            normalized['subsample'] = normalized.pop(trial_key)
    return normalized


# globals().get(...) yields None when the corresponding HPO cell was not run in this session
best_lgb_params = load_hpo_params("best_lgb_params.json", globals().get("best_lgb_params"))
best_cat_params = load_hpo_params("best_catboost_hpo_params.json", globals().get("best_cat_params"))
best_xgb_params = load_hpo_params("best_xgboost_hpo_params_fast_trials.json", globals().get("best_xgb_params"))


# --- Define LightGBM Final Pipeline ---
if best_lgb_params: # Proceed only if params were loaded
    final_lgb_params = best_lgb_params.copy()
    # Objective/metric are not part of the searched parameters, so default them
    # to the values used during HPO unless the loaded file provides them.
    final_lgb_params.setdefault('objective', 'regression_l1')
    final_lgb_params.setdefault('metric', 'rmse')
    final_lgb_params.update({
        'n_estimators': 2500, # Increased for final model
        'random_state': RANDOM_STATE,
        'verbosity': -1,
    })
    lgb_final = Pipeline([
        ('prep', preprocessor),
        ('lgbm', LGBMRegressor(**final_lgb_params))
    ])
    print("LightGBM final pipeline defined.")
else:
    print("Skipping LightGBM final pipeline definition due to missing parameters.")
    lgb_final = None

# --- Define CatBoost Final Pipeline ---
if best_cat_params: # Proceed only if params were loaded
    final_cat_params = normalize_catboost_params(best_cat_params)
    final_cat_params.setdefault('loss_function', 'RMSE')
    final_cat_params.update({
        # HPO used a small iteration budget for speed; the final model trains longer.
        'iterations': 2500,
        'random_seed': RANDOM_STATE,
        'verbose': 0, # Suppress output during stacking
        'thread_count': -1 # Use all available threads for final model training
    })
    cat_final = Pipeline([
        ('prep', preprocessor),
        ('cat', CatBoostRegressor(**final_cat_params))
    ])
    print("CatBoost final pipeline defined.")
else:
    print("Skipping CatBoost final pipeline definition due to missing parameters.")
    cat_final = None

# --- Define XGBoost Final Pipeline ---
if best_xgb_params: # Proceed only if params were loaded
    final_xgb_params = best_xgb_params.copy()
    final_xgb_params.setdefault('objective', 'reg:squarederror')
    final_xgb_params.setdefault('eval_metric', 'rmse')
    final_xgb_params.update({
        # HPO used 300 estimators for speed; the final model trains longer with
        # the 'eta' found by Optuna.
        'n_estimators': 2500,
        'random_state': RANDOM_STATE,
        'n_jobs': -1 # Use all available threads for final training
    })
    xgb_final = Pipeline([
        ('prep', preprocessor),
        ('xgb', XGBRegressor(**final_xgb_params, tree_method='hist')) # Keep tree_method='hist'
    ])
    print("XGBoost final pipeline defined.")
else:
    print("Skipping XGBoost final pipeline definition due to missing parameters.")
    xgb_final = None

print("Base learner pipelines defined (or skipped if params were missing).")

# NOTE: a learner left as None above (its parameters could not be loaded) must be
# excluded before the stacking ensemble is built.

Building final model pipelines by loading HPO parameters from JSON...
Successfully loaded best_lgb_params from /content/best_lgb_params.json
Successfully loaded best_cat_params from /content/best_catboost_hpo_params.json
Successfully loaded best_xgb_params from /content/best_xgboost_hpo_params_fast_trials.json
LightGBM final pipeline defined.
CatBoost final pipeline defined.
XGBoost final pipeline defined.
Base learner pipelines defined (or skipped if params were missing).


In [None]:
# Cell 12: Stacking Regressor Definition

# Meta-learner for stacking: a small LightGBM model fitted on the base learners'
# out-of-fold predictions.
meta_learner = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.03, # Slightly lower LR for meta-learner
    num_leaves=20,     # Simpler meta-learner
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=-1,
    colsample_bytree=0.7, # Add some regularization
    subsample=0.7,
    # LightGBM applies row subsampling only when the bagging frequency is non-zero;
    # without this setting, subsample=0.7 has no effect.
    subsample_freq=1
)

# Base learners whose tuned parameters could not be loaded are None; they are
# excluded here because None is not a usable estimator.
candidate_learners = [
    ('lgbm', lgb_final),
    ('cat', cat_final),
    ('xgb', xgb_final)
]
base_learners = [(name, est) for name, est in candidate_learners if est is not None]
skipped_learners = [name for name, est in candidate_learners if est is None]
if skipped_learners:
    print(f"WARNING: excluding base learners without a pipeline: {skipped_learners}")
if not base_learners:
    raise RuntimeError(
        "No base learner pipelines are available for stacking; "
        "run the HPO cells or provide the saved parameter JSON files."
    )

stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE + 1), # Use a different seed for stacking CV
    n_jobs=-1, # Parallelize stacking model fitting
    passthrough=False # Meta-learner sees only the base learners' predictions, not the raw features
)
print("Stacking regressor defined.")

Stacking regressor defined.


In [None]:
# Cell 13: Final Model Training
# Fit the stacking ensemble on the full training features/target. Whatever
# stack.fit returns is kept as total_model and used for prediction later.
print("Training final stacked model... This may take a while.")
total_model = stack.fit(X, y)
print("Final model training complete.")

Training final stacked model... This may take a while.
Final model training complete.


In [None]:
# Cell 14: Prediction and Clipping
print("Making predictions on the test set...")
preds = total_model.predict(X_test)

# Bound the predictions by the target range observed in training.
y_min_train, y_max_train = y.min(), y.max()
print(f"Clipping predictions to the training range: [{y_min_train:.4f}, {y_max_train:.4f}]")
preds_clipped = np.clip(preds, y_min_train, y_max_train)

# Report whether the clip altered anything beyond floating-point tolerance.
if np.allclose(preds, preds_clipped):
    print("Clipping did not significantly change prediction values.")
else:
    print(f"Clipping changed {np.sum(preds != preds_clipped)} prediction values.")
    print(f"Min pred before clip: {preds.min():.4f}, Min pred after clip: {preds_clipped.min():.4f}")
    print(f"Max pred before clip: {preds.max():.4f}, Max pred after clip: {preds_clipped.max():.4f}")
print("Predictions made and clipped.")

Making predictions on the test set...
Clipping predictions to the training range: [0.0000, 0.9871]
Clipping did not significantly change prediction values.
Predictions made and clipped.


In [None]:
# Cell 15: Submission File Creation
# Two-column submission table: test ids paired with the clipped predictions.
submission_filename = 'submission_stacked_enhanced.csv'
submission = pd.DataFrame({'id': test_ids, 'efficiency': preds_clipped})

# index=False keeps the row index out of the CSV.
submission.to_csv(submission_filename, index=False)
print(f"Saved submission file: {submission_filename} (Shape: {submission.shape})")

Saved submission file: submission_stacked_enhanced.csv (Shape: (12000, 2))


In [None]:
# Cell 16: Optuna Summaries and Finish Message
print("\n--- Optuna Study Summaries ---")
# The study objects only exist when the HPO cells ran in this kernel session.
# When the tuned parameters were loaded from JSON instead, say so rather than
# failing with a NameError after the submission has already been written.
for _label, _study_name in (("LGBM", "study_lgb"), ("CatBoost", "study_cat"), ("XGBoost", "study_xgb")):
    _study = globals().get(_study_name)
    if _study is None:
        print(f"\n{_label}: no Optuna study available in this session (HPO cell was not run).")
        continue
    print(f"\n{_label} Best CV RMSE: {_study.best_value:.5f}")
    print(f"{_label} Best Params:", _study.best_params)

print("\nCode execution finished.")

# Optional: Visualize Optuna studies if desired
# import optuna.visualization as vis
# vis.plot_optimization_history(study_lgb).show()
# vis.plot_param_importances(study_lgb).show()
# (repeat for study_cat and study_xgb)