In [1]:
# ===============================
# 🔹 04. Training
# ===============================
import pandas as pd
import numpy as np
import os
import joblib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# ===============================
# 🔹 2. Load the preprocessed data
# ===============================
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
data_path = ROOT_DIR.joinpath("data", "interim", "feature_engineered_data.csv")

df = pd.read_csv(data_path)

# Fail early if the regression target is absent from the loaded table.
target_col = "Fat_Percentage"
if target_col not in df:
    raise ValueError(f"No se encontr√≥ la columna objetivo '{target_col}'.")

# Target vector and feature matrix (every column except the target).
y = df[target_col]
X = df.drop(columns=[target_col])

# ===============================
# 🔹 3. Detect numeric and categorical columns
# ===============================
# Columns are routed by dtype: NumPy numeric dtypes vs. object/category dtypes.
numeric_dtypes = [np.number]
categorical_dtypes = ["object", "category"]

num_features = X.select_dtypes(include=numeric_dtypes).columns.tolist()
cat_features = X.select_dtypes(include=categorical_dtypes).columns.tolist()

print("Num√©ricas:", num_features)
print("Categ√≥ricas:", cat_features)

# ===============================
# 🔹 4. Build the preprocessor
# ===============================
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with handle_unknown="ignore" set on the encoder.
numeric_step = ("num", StandardScaler(), num_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ===============================
# 🔹 5. Build the full pipeline
# ===============================
# Preprocessing followed by a random-forest regressor with a fixed seed.
regressor = RandomForestRegressor(random_state=42)

pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", regressor),
    ]
)

# ===============================
# 🔹 6. Split the data and train
# ===============================
# 80/20 train/test split with a fixed seed; only the training part is used to fit.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

pipeline.fit(X_train, y_train)

# ===============================
# 🔹 7. Save the trained pipeline
# ===============================
MODELS_DIR = ROOT_DIR / "models"
# parents=True creates any missing intermediate directories instead of raising
# FileNotFoundError; exist_ok=True makes re-running the cell harmless.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Serialize the fitted pipeline (preprocessing + model) as a single artifact.
pipeline_path = MODELS_DIR / "rf_pipeline.pkl"
joblib.dump(pipeline, pipeline_path)

print(f"‚úÖ Pipeline guardado en: {pipeline_path}")

# ===============================
# 🔹 7. Save the trained pipeline (main file + date-stamped copy)
# ===============================
from pathlib import Path
import joblib
from datetime import datetime

# Models directory: "models" folder under the parent of the working directory.
MODEL_DIR = Path.cwd().parent.joinpath("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Main pipeline artifact.
latest_path = MODEL_DIR / "rf_pipeline.pkl"
joblib.dump(pipeline, latest_path)
print(f"‚úÖ Pipeline completo guardado en: {latest_path}")

# (Optional) Date-stamped copy kept for historical versions; the stamp is the
# local calendar date in YYYY-MM-DD form.
fecha = datetime.today().strftime("%Y-%m-%d")
dated_path = MODEL_DIR / f"rf_pipeline_{fecha}.pkl"
joblib.dump(pipeline, dated_path)
print(f"üóÇÔ∏è Versi√≥n con fecha guardada en: {dated_path}")

Num√©ricas: ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI', 'Log_Age']
Categ√≥ricas: ['Gender', 'Workout_Type']
‚úÖ Pipeline guardado en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline.pkl
‚úÖ Pipeline completo guardado en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline.pkl
üóÇÔ∏è Versi√≥n con fecha guardada en: c:\Users\alozano\OneDrive\Documentos\Workspace\Grasa_corporal\models\rf_pipeline_2025-06-08.pkl
