In [None]:
# Entrenamiento y guardado del pipeline completo

# ===============================
# ðŸ”¹ 1. Importaciones
# ===============================
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# ===============================
# ðŸ”¹ 2. Cargar datos preprocesados (sin escalar)
# ===============================
ROOT = Path.cwd()
INTERIM_PATH = Path("data/interim/feature_engineered_data.csv")
df = pd.read_csv(INTERIM_PATH)# Usar CSV sin escalar

target_col = "Fat_Percentage"

if target_col not in df.columns:
    raise ValueError(f"No se encontrÃ³ la columna objetivo '{target_col}'.")

X = df.drop(columns=[target_col])
y = df[target_col]

# ðŸ”¹ Renombrar columnas para que coincidan con el backend
X = X.rename(columns={
    "Age": "age",
    "Weight (kg)": "weight_kg",
    "Height (m)": "height_m",
    "Max_BPM": "max_bpm",
    "Avg_BPM": "avg_bpm",
    "Resting_BPM": "resting_bpm",
    "Session_Duration (hours)": "session_duration_hours",
    "Gender": "gender",
    "Workout_Type": "workout_type"
})

# ===============================
# ðŸ”¹ 3. Detectar columnas numÃ©ricas y categÃ³ricas
# ===============================
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("NumÃ©ricas:", num_features)
print("CategÃ³ricas:", cat_features)

# ===============================
# ðŸ”¹ 4. Construir el preprocesador
# ===============================
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

# ===============================
# ðŸ”¹ 5. Construir pipeline completo
# ===============================
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

# ===============================
# ðŸ”¹ 6. Split de datos y entrenamiento
# ===============================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# ===============================
from datetime import datetime
from pathlib import Path
import joblib
import os

# Crear carpeta absoluta al directorio raÃ­z del proyecto
ROOT = Path.cwd().resolve()
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Guardar sin versiÃ³n
pipeline_path = MODEL_DIR / "rf_pipeline.pkl"
joblib.dump(pipeline, pipeline_path)
print(f"âœ… Pipeline guardado en: {pipeline_path}")

# Guardar con versiÃ³n
today = datetime.today().strftime("%Y-%m-%d")
versioned_path = MODEL_DIR / f"rf_pipeline_v1_{today}.pkl"
joblib.dump(pipeline, versioned_path)
print(f"ðŸ“¦ Pipeline versionado en: {versioned_path}")

FileNotFoundError: [Errno 2] No such file or directory: 'data\\interim\\feature_engineered_data.csv'

In [1]:
print(df.shape)
print(df.head())
print(df.dtypes)


NameError: name 'df' is not defined