In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
import joblib
import warnings
warnings.filterwarnings("ignore")

def load_and_preprocess(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSVs and build standardized feature matrices.

    The 'id' column is dropped from both frames and 'Calories' is used as
    the target. Categorical columns are one-hot encoded and every feature
    is standardized with a scaler fitted on the training rows only.

    Args:
        train_path: CSV with 'id', 'Calories' and the feature columns.
        test_path: CSV with 'id' and the feature columns.

    Returns:
        A tuple ``(X_scaled, y, X_test_scaled, test_df, scaler)`` where
        ``X_scaled`` / ``X_test_scaled`` are the scaler outputs for the
        train / test features, ``y`` is the 'Calories' column, ``test_df``
        is the raw test frame (still containing 'id') and ``scaler`` is the
        fitted ``StandardScaler``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X = train_df.drop(['id', 'Calories'], axis=1)
    y = train_df['Calories']
    X_test = test_df.drop(['id'], axis=1)

    # The training frame alone decides which level of each categorical is
    # the dropped baseline.
    X = pd.get_dummies(X, drop_first=True)
    # Encode *every* level of the test frame and let reindex pick the
    # training columns. Using drop_first here as well would drop the test
    # frame's own first level, which differs from the training baseline
    # whenever the two frames do not contain the same set of levels, and the
    # reindex below would then silently zero-fill the wrong column.
    X_test = pd.get_dummies(X_test)
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Fit on train only; the test matrix reuses the training statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    return X_scaled, y, X_test_scaled, test_df, scaler

# Shared 5-fold splitter (shuffled, fixed seed) reused for hyper-parameter
# tuning, for the stacking regressor's internal folds and for scoring the
# stacks, so every candidate is evaluated on the same partitions.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Optuna
def tune_model(model_name, return_dict):
    """Tune one base regressor with Optuna and store its best parameters.

    Runs a 150-trial study that minimizes the mean cross-validated RMSE
    (using the module-level ``cv`` splitter) on the preprocessed training
    data, then writes ``study.best_params`` into ``return_dict``.

    Args:
        model_name: Which regressor to tune: 'xgb', 'rf', 'gbr' or 'lgbm'.
        return_dict: Mutable mapping updated in place; the best parameters
            are stored under ``model_name``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    supported = ('xgb', 'rf', 'gbr', 'lgbm')
    # Fail fast: without this check an unknown name would only surface as an
    # UnboundLocalError inside the first trial, after the data was loaded.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    X_scaled, y, _, _, _ = load_and_preprocess()

    def objective(trial):
        """Return the mean CV RMSE for the hyper-parameters drawn by *trial*."""
        if model_name == 'xgb':
            model = XGBRegressor(
                n_estimators=trial.suggest_int('n_estimators', 300, 800),
                max_depth=trial.suggest_int('max_depth', 4, 12),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
                subsample=trial.suggest_float('subsample', 0.6, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
                n_jobs=-1, random_state=42
            )
        elif model_name == 'rf':
            model = RandomForestRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 800),
                max_depth=trial.suggest_int('max_depth', 6, 15),
                n_jobs=-1, random_state=42
            )
        elif model_name == 'gbr':
            model = GradientBoostingRegressor(
                n_estimators=trial.suggest_int('n_estimators', 200, 800),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
                max_depth=trial.suggest_int('max_depth', 3, 10),
                random_state=42
            )
        else:  # 'lgbm' -- membership in `supported` was checked above
            model = LGBMRegressor(
                n_estimators=trial.suggest_int('n_estimators', 300, 800),
                max_depth=trial.suggest_int('max_depth', 4, 12),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
                subsample=trial.suggest_float('subsample', 0.6, 1.0),
                colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
                n_jobs=-1, random_state=42
            )
        # The scorer yields negated RMSE, so flip the sign to get a value
        # the study can minimize.
        score = cross_val_score(model, X_scaled, y, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1).mean()
        return -score

    study = optuna.create_study(direction='minimize')
    # NOTE(review): n_jobs=-1 is requested at three nested levels (Optuna
    # trials, CV folds, and the estimator itself), which likely
    # oversubscribes the CPU -- consider parallelizing at a single level.
    study.optimize(objective, n_trials=150, n_jobs=-1)
    return_dict[model_name] = study.best_params

if __name__ == '__main__':
    # --- Stage 1: hyper-parameter search for the four tunable base models.
    return_dict = {}
    for model_name in ('xgb', 'rf', 'gbr', 'lgbm'):
        print(f"🔍 Tuning {model_name.upper()}...")
        tune_model(model_name, return_dict)
        print(f"✅ Done: {model_name.upper()}")

    X_scaled, y, X_test_scaled, test_df, scaler = load_and_preprocess()

    # --- Stage 2: base learners. The first four use the tuned parameters,
    # the remaining three use fixed settings.
    xgb = XGBRegressor(**return_dict['xgb'], n_jobs=-1, random_state=42)
    rf = RandomForestRegressor(**return_dict['rf'], n_jobs=-1, random_state=42)
    gbr = GradientBoostingRegressor(**return_dict['gbr'], random_state=42)
    lgbm = LGBMRegressor(**return_dict['lgbm'], n_jobs=-1, random_state=42)
    cat = CatBoostRegressor(iterations=700, learning_rate=0.05, depth=8, verbose=0, random_state=42)
    extra = ExtraTreesRegressor(n_estimators=500, max_depth=15, n_jobs=-1, random_state=42)
    hist = HistGradientBoostingRegressor(max_iter=300, random_state=42)

    # Every candidate stack shares the same base learners; only the
    # final estimator varies.
    base_estimators = [
        ('xgb', xgb),
        ('rf', rf),
        ('gbr', gbr),
        ('lgbm', lgbm),
        ('cat', cat),
        ('extra', extra),
        ('hist', hist),
    ]

    meta_models = {
        'ridge': RidgeCV(),
        'lasso': LassoCV(cv=5),
        'elastic': ElasticNetCV(cv=5),
    }

    # --- Stage 3: score one stack per meta-learner and keep the one with
    # the lowest cross-validated RMSE.
    best_score = float('inf')
    best_stack = None
    best_meta = None

    for name, meta_model in meta_models.items():
        stack = StackingRegressor(
            estimators=base_estimators,
            final_estimator=meta_model,
            passthrough=True,
            cv=cv,
            n_jobs=-1,
        )
        fold_scores = cross_val_score(
            stack,
            X_scaled,
            y,
            cv=cv,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
        )
        # Scorer returns negated RMSE; negate the mean to report plain RMSE.
        score = -fold_scores.mean()
        print(f"{name} RMSE: {score:.5f}")
        if score < best_score:
            best_score, best_stack, best_meta = score, stack, name

    # --- Stage 4: refit the winner on all training rows (cross_val_score
    # leaves the passed estimator unfitted), predict, and persist artifacts.
    best_stack.fit(X_scaled, y)
    preds = best_stack.predict(X_test_scaled)

    submission = pd.DataFrame({'id': test_df['id'], 'Calories': preds})
    submission.to_csv('top_model_submission.csv', index=False)

    model_path = f'top_stacked_model_{best_meta}.pkl'
    joblib.dump(best_stack, model_path)
    joblib.dump(scaler, 'scaler.pkl')
    print(f"✅ Modelo salvo como '{model_path}'")

🔍 Tuning XGB...


[I 2025-05-09 10:25:26,853] A new study created in memory with name: no-name-aac9ed7e-7bd1-4147-b9f7-c1695c801e7c
[I 2025-05-09 10:26:35,273] Trial 1 finished with value: 3.6422740843345887 and parameters: {'n_estimators': 412, 'max_depth': 7, 'learning_rate': 0.1589457479644133, 'subsample': 0.9377052372918582, 'colsample_bytree': 0.8197650415939534}. Best is trial 1 with value: 3.6422740843345887.
[I 2025-05-09 10:27:59,922] Trial 0 finished with value: 3.7390488944330285 and parameters: {'n_estimators': 530, 'max_depth': 8, 'learning_rate': 0.249639512931574, 'subsample': 0.8253039582824186, 'colsample_bytree': 0.7677358385965476}. Best is trial 1 with value: 3.6422740843345887.
[I 2025-05-09 10:29:39,512] Trial 4 finished with value: 3.7474542149093644 and parameters: {'n_estimators': 565, 'max_depth': 5, 'learning_rate': 0.2934357025886057, 'subsample': 0.6857887243187551, 'colsample_bytree': 0.9966345910074553}. Best is trial 1 with value: 3.6422740843345887.
[I 2025-05-09 10:31:

✅ Done: XGB
🔍 Tuning RF...


[I 2025-05-09 13:24:36,473] A new study created in memory with name: no-name-ef00f427-e1f3-4bde-aea8-c918a278d8db
[I 2025-05-09 13:58:29,560] Trial 2 finished with value: 5.683652620510125 and parameters: {'n_estimators': 703, 'max_depth': 9}. Best is trial 2 with value: 5.683652620510125.
[I 2025-05-09 14:04:05,913] Trial 0 finished with value: 4.471700223899184 and parameters: {'n_estimators': 773, 'max_depth': 11}. Best is trial 0 with value: 4.471700223899184.
[I 2025-05-09 14:29:57,711] Trial 4 finished with value: 9.354648128616953 and parameters: {'n_estimators': 457, 'max_depth': 6}. Best is trial 0 with value: 4.471700223899184.
[I 2025-05-09 14:30:10,991] Trial 3 finished with value: 3.7529775600761375 and parameters: {'n_estimators': 241, 'max_depth': 15}. Best is trial 3 with value: 3.7529775600761375.
[I 2025-05-09 14:34:34,734] Trial 5 finished with value: 7.868744315109443 and parameters: {'n_estimators': 375, 'max_depth': 7}. Best is trial 3 with value: 3.75297756007613

✅ Done: RF
🔍 Tuning GBR...


[I 2025-05-10 20:48:45,905] A new study created in memory with name: no-name-85121f01-bea0-4af1-8b9d-747becd47d71
[I 2025-05-10 22:00:58,829] Trial 6 finished with value: 3.702489217885327 and parameters: {'n_estimators': 223, 'learning_rate': 0.19982433834192753, 'max_depth': 9}. Best is trial 6 with value: 3.702489217885327.
[I 2025-05-10 22:17:14,364] Trial 1 finished with value: 3.733795016455274 and parameters: {'n_estimators': 241, 'learning_rate': 0.18516704275407808, 'max_depth': 5}. Best is trial 6 with value: 3.702489217885327.
[I 2025-05-10 22:23:32,642] Trial 9 finished with value: 3.7584445131524347 and parameters: {'n_estimators': 248, 'learning_rate': 0.2930186270978709, 'max_depth': 8}. Best is trial 6 with value: 3.702489217885327.
[I 2025-05-10 22:40:20,517] Trial 3 finished with value: 4.313308263122359 and parameters: {'n_estimators': 432, 'learning_rate': 0.043564756166119954, 'max_depth': 3}. Best is trial 6 with value: 3.702489217885327.
[I 2025-05-10 22:45:08,59

✅ Done: GBR
🔍 Tuning LGBM...


[I 2025-05-12 10:34:43,021] A new study created in memory with name: no-name-a7264da2-3a01-4b92-b1f7-9b3768f211da
[I 2025-05-12 10:36:15,565] Trial 0 finished with value: 3.6696002170801245 and parameters: {'n_estimators': 414, 'max_depth': 11, 'learning_rate': 0.23677808005666445, 'subsample': 0.9556256241155466, 'colsample_bytree': 0.6503840395434197}. Best is trial 0 with value: 3.6696002170801245.
[I 2025-05-12 10:37:48,057] Trial 7 finished with value: 3.6525890383292596 and parameters: {'n_estimators': 426, 'max_depth': 6, 'learning_rate': 0.2704523287767883, 'subsample': 0.9439461982672278, 'colsample_bytree': 0.6200124674515453}. Best is trial 7 with value: 3.6525890383292596.
[I 2025-05-12 10:37:56,564] Trial 2 finished with value: 3.6575967711651503 and parameters: {'n_estimators': 306, 'max_depth': 10, 'learning_rate': 0.08871666886349851, 'subsample': 0.7178553824388465, 'colsample_bytree': 0.8410524421922696}. Best is trial 7 with value: 3.6525890383292596.
[I 2025-05-12 1

✅ Done: LGBM
ridge RMSE: 3.52679
lasso RMSE: 3.55991
elastic RMSE: 3.54835
✅ Modelo salvo como 'top_stacked_model_ridge.pkl'
