In [4]:
# 03_Preprocesamiento.ipynb

# ===============================
# 1. Configuration and libraries
# ===============================
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import os

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# ===============================
# 2. Load the intermediate data
# ===============================
# Locate the project root: the nearest directory, starting at the current
# working directory and moving upwards, that contains a pyproject.toml.
# When no directory on the way up has one, the search settles on the
# filesystem root (the last entry of the candidate list).
_start_dir = Path.cwd()
_search_dirs = [_start_dir, *_start_dir.parents]
ROOT_DIR = next(
    (folder for folder in _search_dirs if (folder / "pyproject.toml").exists()),
    _search_dirs[-1],
)

# Feature-engineered dataset expected under <root>/data/interim.
INPUT_PATH = ROOT_DIR.joinpath("data", "interim", "feature_engineered_data.csv")

# Fail early with an explicit message instead of letting read_csv raise.
if not INPUT_PATH.exists():
    raise FileNotFoundError(f"‚ùå No se encuentra el archivo: {INPUT_PATH}")

df = pd.read_csv(INPUT_PATH)

# ===============================
# 3. Detect column types
# ===============================
target_col = "Fat_Percentage"

# Separate the feature matrix from the target series.
X = df.drop(target_col, axis="columns")
y = df[target_col]

# Group the feature columns by dtype. Columns that are neither numeric nor
# object/category (for example bool or datetime) end up in neither list.
_numeric_part = X.select_dtypes(include="number")
_categorical_part = X.select_dtypes(include=["object", "category"])
num_features = _numeric_part.columns.to_list()
cat_features = _categorical_part.columns.to_list()

# ===============================
# 4. Build the preprocessor
# ===============================
# Numeric columns go through StandardScaler; categorical columns go through
# a OneHotEncoder configured for dense output and to ignore categories it
# did not see during fit.
_numeric_step = ("num", StandardScaler(), num_features)
_categorical_step = (
    "cat",
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    cat_features,
)
preprocessor = ColumnTransformer(transformers=[_numeric_step, _categorical_step])

# ===============================
# 5. Apply the preprocessing
# ===============================
# Fit the transformer on the feature matrix and transform it in one pass.
X_processed = preprocessor.fit_transform(X)

# Recover the final column names: the numeric columns keep their names and
# are followed by the one-hot columns ("num" is declared before "cat").
# ColumnTransformer leaves a transformer unfitted when its column list is
# empty, so the encoder is only queried when categorical columns exist;
# otherwise get_feature_names_out would raise NotFittedError.
if cat_features:
    ohe_columns = preprocessor.named_transformers_["cat"].get_feature_names_out(cat_features)
else:
    ohe_columns = np.array([], dtype=object)
all_columns = np.concatenate([
    np.asarray(num_features, dtype=object),
    np.asarray(ohe_columns, dtype=object),
])

# Final DataFrame; reuse the index of X so the rows stay aligned with y.
X_processed_df = pd.DataFrame(X_processed, columns=all_columns, index=X.index)

# ===============================
# 6. Save the processed data
# ===============================
PROCESSED_PATH = ROOT_DIR.joinpath("data", "processed", "preprocessed_data.csv")

# Make sure the destination folder exists before writing.
PROCESSED_PATH.parent.mkdir(exist_ok=True, parents=True)

# Only the transformed feature columns are written, without the row index.
# NOTE(review): the target column is not part of this file — confirm that
# downstream steps obtain it from elsewhere.
X_processed_df.to_csv(PROCESSED_PATH, index=False)
print(f"‚úÖ Datos procesados guardados en: {PROCESSED_PATH}")

# ===============================
# 7. Save the preprocessor
# ===============================
PREPROCESSOR_PATH = ROOT_DIR / "models" / "preprocessor.joblib"

# Create the destination folder first, as is done for the processed data:
# joblib.dump does not create missing directories and would otherwise fail
# with FileNotFoundError on a fresh checkout.
PREPROCESSOR_PATH.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted transformer so the same preprocessing can be reloaded.
joblib.dump(preprocessor, PREPROCESSOR_PATH)
print(f"‚úÖ Preprocesador guardado en: {PREPROCESSOR_PATH}")




‚úÖ Datos procesados guardados en: c:\TEMP\ALOZANO\GRASACORPORAL\data\processed\preprocessed_data.csv
‚úÖ Preprocesador guardado en: c:\TEMP\ALOZANO\GRASACORPORAL\models\preprocessor.joblib
