In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split   # 👈 ESTA ES LA QUE FALTABA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [18]:
# Load the training and test datasets

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Report the dimensions of each table
for etiqueta, tabla in (("Train shape:", train_df), ("Test shape :", test_df)):
    print(etiqueta, tabla.shape)

# Report the column index of each table (the training columns are printed
# twice, once under each heading)
for etiqueta, tabla in (
    ("\nColumnas disponibles:", train_df),
    ("Columnas en train:", train_df),
    ("Columnas en test :", test_df),
):
    print(etiqueta, tabla.columns)


Train shape: (750000, 9)
Test shape : (250000, 8)

Columnas disponibles: Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')
Columnas en train: Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')
Columnas en test : Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp'],
      dtype='object')


In [19]:
# Basic exploration of the training set

print("\n--- INFO TRAIN ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_df.info()
print("\nValores faltantes:\n", train_df.isna().sum())
print("\nEstadísticas:\n", train_df.describe())


--- INFO TRAIN ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB
None

Valores faltantes:
 id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

Estadísticas:
                   id            Age         Height         Weight  \
count  750000.000000  750000.000000  750000.000000  750000.000000   
mean   3

In [20]:
# Preprocessing
def preparar_datos(df):
    """Return a copy of ``df`` with the ``Sex`` column encoded as 0/1.

    Encoding: ``female``/``f`` -> 0 and ``male``/``m`` -> 1, matched
    case-insensitively and ignoring surrounding whitespace. Values that are
    already encoded (0/1, as integers, floats or their string forms) are kept,
    so applying the function to its own output gives the same result instead
    of turning the whole column into NaN. Any other value becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified. If it has no ``Sex`` column the copy
        is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Sex`` (when present) replaced by its encoding.
    """
    # Text labels plus the string forms that already-encoded values take
    # after ``astype(str)``; the latter make the function idempotent.
    sex_codes = {
        "female": 0, "male": 1, "f": 0, "m": 1,
        "0": 0, "1": 1, "0.0": 0, "1.0": 1,
    }
    data = df.copy()
    if "Sex" in data.columns:
        data["Sex"] = (
            data["Sex"].astype(str).str.strip().str.lower().map(sex_codes)
        )
    return data

# Run both tables through the same preprocessing routine
train_df, test_df = preparar_datos(train_df), preparar_datos(test_df)

# Predictor columns and regression target, both taken from the training table
X_cols = [
    "Sex",
    "Age",
    "Height",
    "Weight",
    "Duration",
    "Heart_Rate",
    "Body_Temp",
]

X, y = train_df[X_cols], train_df["Calories"]

In [21]:
# Hold out part of the training data for internal evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Median imputation followed by standardisation; the pipeline is fitted on the
# training split only and then applied as-is to the validation split
num_preprocess = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
X_train_proc = num_preprocess.fit_transform(X_train)
X_val_proc = num_preprocess.transform(X_val)

In [22]:
# Step 4: train the candidate models and score them on the validation split

modelos = {
    "Regresión Lineal": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=100,   # reduced from 300 to shorten training time
        random_state=42,
        n_jobs=-1           # use every available core
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=100,   # kept at 100 (the library default)
        random_state=42
    )
}

# Per-model results: fitted estimator, validation metrics and predictions
resultados = {}

for nombre, modelo in modelos.items():
    print(f"Entrenando {nombre}...")
    modelo.fit(X_train_proc, y_train)
    pred = modelo.predict(X_val_proc)

    mae = mean_absolute_error(y_val, pred)
    # RMSE is taken as the square root of the MSE rather than through
    # `squared=False`: that argument was deprecated in scikit-learn 1.4 and
    # removed in 1.6, whereas this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    r2 = r2_score(y_val, pred)

    resultados[nombre] = {
        "modelo": modelo,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "pred": pred
    }

# Collect the metrics into a DataFrame, one row per model
metricas_df = pd.DataFrame([
    {"Modelo": nombre, "MAE": res["MAE"], "RMSE": res["RMSE"], "R2": res["R2"]}
    for nombre, res in resultados.items()
])

print("\n--- Resultados en validación interna ---")
print(metricas_df)



Entrenando Regresión Lineal...




Entrenando Random Forest...




Entrenando Gradient Boosting...

--- Resultados en validación interna ---
              Modelo       MAE       RMSE        R2
0   Regresión Lineal  8.082859  11.058772  0.968445
1      Random Forest  2.309292   3.822297  0.996230
2  Gradient Boosting  3.173100   4.751223  0.994175




In [23]:
# Show the results
# The metrics table `metricas_df` is already assembled from `resultados` in the
# training cell, so it is reused here instead of being rebuilt by a copy of the
# same code.
print("\n--- Resultados en validación interna ---")
print(metricas_df)




--- Resultados en validación interna ---
              Modelo       MAE       RMSE        R2
0   Regresión Lineal  8.082859  11.058772  0.968445
1      Random Forest  2.309292   3.822297  0.996230
2  Gradient Boosting  3.173100   4.751223  0.994175
