In [1]:
# 07_Optimizacion_Optuna.ipynb

# 🔹 1. Importaciones
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from datetime import datetime

# 🔹 2. Cargar datos intermedios
ROOT = Path.cwd().parent
INTERIM_PATH = ROOT / "data" / "interim" / "feature_engineered_data.csv"
df = pd.read_csv(INTERIM_PATH)

df["Gender"] = df["Gender"].astype("category")
df["Workout_Type"] = df["Workout_Type"].astype("category")

target_col = "Fat_Percentage"
X = df.drop(columns=[target_col])
y = df[target_col]

# 🔹 3. Columnas
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["category"]).columns.tolist()

print("📊 Numéricas:", num_features)
print("📊 Categóricas:", cat_features)

# 🔹 4. Objetivo para Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    preprocessor = ColumnTransformer([
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ])

    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("regressor", RandomForestRegressor(**params, random_state=42))
    ])

    scores = cross_val_score(pipeline, X, y, cv=3, scoring="r2")
    return scores.mean()

# 🔹 5. Optimización
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("✅ Mejor R2:", study.best_value)
print("🏆 Mejores hiperparámetros:", study.best_params)

# 🔹 6. Entrenar modelo final
best_params = study.best_params

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

pipeline_final = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", RandomForestRegressor(**best_params, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline_final.fit(X_train, y_train)

# 🔹 7. Guardar modelo optimizado
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Guardado principal
model_path = MODEL_DIR / "rf_pipeline_optuna.pkl"
joblib.dump(pipeline_final, model_path)
print(f"💾 Pipeline optimizado guardado en: {model_path}")

# Guardado versionado con fecha
today = datetime.today().strftime("%Y-%m-%d")
versioned_path = MODEL_DIR / f"rf_pipeline_optuna_{today}.pkl"
joblib.dump(pipeline_final, versioned_path)
print(f"🗂 Versión guardada como: {versioned_path}")

[I 2025-06-08 12:19:10,381] A new study created in memory with name: no-name-77c8518a-320a-4067-a932-b5475a981739


📊 Numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI', 'Log_Age']
📊 Categóricas: ['Gender', 'Workout_Type']


[I 2025-06-08 12:19:11,559] Trial 0 finished with value: 0.798388116206961 and parameters: {'n_estimators': 95, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.798388116206961.
[I 2025-06-08 12:19:12,796] Trial 1 finished with value: 0.7946812509735778 and parameters: {'n_estimators': 110, 'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.798388116206961.
[I 2025-06-08 12:19:14,064] Trial 2 finished with value: 0.7988756365599281 and parameters: {'n_estimators': 101, 'max_depth': 15, 'min_samples_split': 7, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.7988756365599281.
[I 2025-06-08 12:19:17,794] Trial 3 finished with value: 0.796882951029855 and parameters: {'n_estimators': 303, 'max_depth': 16, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 2 with value: 0.7988756365599281.
[I 2025-06-08 12:19:22,245] Trial 4 finished with value: 0.7948116520321046 and parameters: {'n_e

✅ Mejor R2: 0.8072541137625507
🏆 Mejores hiperparámetros: {'n_estimators': 203, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 1}
💾 Pipeline optimizado guardado en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline_optuna.pkl
🗂 Versión guardada como: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline_optuna_2025-06-08.pkl
