In [3]:
# Entrenamiento y guardado del pipeline completo

# ===============================
# 🔹 1. Importaciones
# ===============================
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# ===============================
# 🔹 2. Load the feature-engineered data (not yet scaled)
# ===============================
# The project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parent
INTERIM_PATH = ROOT.joinpath("data", "interim", "feature_engineered_data.csv")

# Read the unscaled CSV; scaling is applied later by the pipeline itself.
df = pd.read_csv(INTERIM_PATH)
target_col = "Fat_Percentage"

# Fail early with an explicit message when the target column is absent.
if target_col not in df:
    raise ValueError(f"No se encontró la columna objetivo '{target_col}'.")

# Separate the target (y) from the predictors (X).
y = df[target_col]
X = df.drop(columns=[target_col])

# ===============================
# 🔹 3. Detect numeric and categorical columns
# ===============================
# Numeric columns are picked by np.number; categorical ones by the object and
# category dtypes.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numéricas:", num_features)
print("Categóricas:", cat_features)

# Columns that fall in neither list (e.g. bool or datetime dtypes) are handed
# to no transformer, and ColumnTransformer discards such columns by default
# (remainder="drop"). Report them so a feature is never lost silently.
assigned_features = set(num_features) | set(cat_features)
unassigned_features = [col for col in X.columns if col not in assigned_features]
if unassigned_features:
    print("⚠️ Columnas sin asignar (se descartarán):", unassigned_features)

# ===============================
# 🔹 4. Build the preprocessor
# ===============================
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present when it was fitted.
numeric_step = ("num", StandardScaler(), num_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ===============================
# 🔹 5. Build the full pipeline
# ===============================
# Preprocessing runs first, then a random forest regressor; the fixed
# random_state keeps training reproducible.
regressor = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[("preprocessing", preprocessor), ("model", regressor)])

# ===============================
# 🔹 6. Split the data and train
# ===============================
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit preprocessing and model together, using the training portion only.
pipeline.fit(X_train, y_train)

# ===============================
# 🔹 7. Save the trained pipeline
# ===============================
# Create the models directory (and any missing parents) when it is absent.
MODEL_DIR = ROOT.joinpath("models")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Serialise the fitted pipeline (preprocessing + model) to a single file.
pipeline_path = MODEL_DIR.joinpath("rf_pipeline.pkl")
joblib.dump(value=pipeline, filename=pipeline_path)
print(f"✅ Pipeline guardado en: {pipeline_path}")

# ===============================
# 🔹 8. Save a date-stamped copy (optional but recommended)
# ===============================
from datetime import datetime

# Local date in YYYY-MM-DD form, embedded in the versioned file name.
today = datetime.today().date().isoformat()
versioned_model_path = MODEL_DIR.joinpath(f"rf_pipeline_v1_{today}.pkl")

joblib.dump(value=pipeline, filename=versioned_model_path)
print(f"📦 Pipeline versionado en: {versioned_model_path}")

Numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI', 'Log_Age']
Categóricas: ['Gender', 'Workout_Type']
✅ Pipeline guardado en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline.pkl
📦 Pipeline versionado en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline_v1_2025-06-01.pkl
