In [1]:
# Entrenamiento y guardado del pipeline completo

# ===============================
# 🔹 1. Importaciones
# ===============================
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# ===============================
# 🔹 2. Load the feature-engineered data
# (use the CSV with the original, unscaled features: the pipeline below
# applies StandardScaler itself, so already-standardized data must not be used)
# ===============================
df = pd.read_csv("data/interim/feature_engineered_data.csv")  # <- do NOT use the already-scaled file

target_col = "Fat_Percentage"
if target_col not in df.columns:
    raise ValueError(f"No se encontró la columna objetivo '{target_col}'.")

# Features are every column except the target.
X = df.drop(columns=[target_col])
y = df[target_col]

# ===============================
# 🔹 3. Detect numeric and categorical columns
# ===============================
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numéricas:", num_features)
print("Categóricas:", cat_features)

# ColumnTransformer drops every column that is not assigned to a transformer
# (remainder="drop" is its default). Columns whose dtype matches neither
# selector above (e.g. bool or datetime) would vanish silently, so report them.
selected_features = set(num_features) | set(cat_features)
ignored_features = [col for col in X.columns if col not in selected_features]
if ignored_features:
    print("Columnas ignoradas por el preprocesador:", ignored_features)

# ===============================
# 🔹 4. Build the preprocessor
# ===============================
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_features),
    # handle_unknown="ignore": categories unseen during fit are encoded as
    # all-zeros at predict time instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
])

# ===============================
# 🔹 5. Build the full pipeline
# ===============================
# Preprocessing and model live in one object, so the saved artifact accepts
# raw (unscaled, un-encoded) feature frames at inference time.
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(random_state=42)),
])

# ===============================
# 🔹 6. Train/test split, training and hold-out evaluation
# ===============================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Use the held-out 20% that was previously split off but never looked at.
# Pipeline.score delegates to the regressor's score, i.e. R^2.
r2_test = pipeline.score(X_test, y_test)
mae_test = float(np.mean(np.abs(y_test - pipeline.predict(X_test))))
print(f"R2 (test): {r2_test:.4f} | MAE (test): {mae_test:.4f}")

# ===============================
# 🔹 7. Save the trained pipeline
# ===============================
# NOTE: joblib files are pickle-based; only load this artifact from a trusted source.
model_path = "models/rf_pipeline.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)
print(f"✅ Pipeline guardado en '{model_path}'")

Numéricas: ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI', 'Log_Age']
Categóricas: ['Gender', 'Workout_Type']
✅ Pipeline guardado en 'models/rf_pipeline.pkl'
