In [2]:
# 06_Optimizacion_Optuna.ipynb

# ===============================
# 🔹 1. Importaciones
# ===============================
import pandas as pd
import numpy as np
import optuna
import joblib
import os

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# ===============================
# 🔹 2. Cargar datos sin preprocesar
# ===============================
df = pd.read_csv("data/interim/feature_engineered_data.csv")
target_col = "Fat_Percentage"
X = df.drop(columns=[target_col])
y = df[target_col]

# Detectar columnas
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ===============================
# 🔹 3. Función objetivo de Optuna
# ===============================
def objective(trial):
    # Espacio de búsqueda
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    # Preprocesador
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    # Pipeline
    model = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        ))
    ])

    # Validación cruzada
    scores = cross_val_score(model, X, y, cv=3, scoring='r2')
    return scores.mean()

# ===============================
# 🔹 4. Ejecutar la optimización
# ===============================
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# ===============================
# 🔹 5. Resultados
# ===============================
print("✅ Mejor puntuación R2:", study.best_value)
print("📦 Mejores hiperparámetros:", study.best_params)

# ===============================
# 🔹 6. Entrenar modelo final y guardar
# ===============================
# Entrenamiento con mejores params
best_params = study.best_params

# Preprocesador
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# Pipeline final
final_model = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', RandomForestRegressor(
        **best_params,
        random_state=42
    ))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
final_model.fit(X_train, y_train)

# Guardar pipeline optimizado
os.makedirs("models", exist_ok=True)
joblib.dump(final_model, "models/rf_pipeline_optuna.pkl")
print("💾 Pipeline optimizado guardado como 'rf_pipeline_optuna.pkl'")

[I 2025-05-17 19:00:49,509] A new study created in memory with name: no-name-3e420967-8414-46e7-8a98-4bb0c2bd6d5d
[I 2025-05-17 19:00:52,424] Trial 0 finished with value: 0.7942290702283138 and parameters: {'n_estimators': 206, 'max_depth': 27, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7942290702283138.
[I 2025-05-17 19:00:57,439] Trial 1 finished with value: 0.7950697186230674 and parameters: {'n_estimators': 246, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.7950697186230674.
[I 2025-05-17 19:01:02,629] Trial 2 finished with value: 0.7957921643353236 and parameters: {'n_estimators': 324, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.7957921643353236.
[I 2025-05-17 19:01:09,879] Trial 3 finished with value: 0.7941028753823199 and parameters: {'n_estimators': 366, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 2 with value

✅ Mejor puntuación R2: 0.801659773609512
📦 Mejores hiperparámetros: {'n_estimators': 174, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 2}
💾 Pipeline optimizado guardado como 'rf_pipeline_optuna.pkl'
