# Entrenamos un modelo

## Train - test split

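En problemas de series temporales el orden de las observaciones es importante: no podemos hacer un split aleatorio, porque el modelo se entrenaría con datos posteriores a los que luego debe predecir. Por eso reservamos el tramo final de la serie como conjunto de test.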

In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the preprocessed artefacts from disk.
# `df_data` is the combined table (time/lag feature columns plus `target`) and
# is the frame the train/test split is built from. `df_features` and
# `df_target` are loaded as separate frames as well; they are not referenced
# in the cells visible here.
df_data = pd.read_parquet('../data/processed/features_target_2024_01.parquet')
df_features = pd.read_parquet('../data/processed/X.parquet')
df_target = pd.read_parquet('../data/processed/y.parquet')

# Preview the first rows of the combined table.
df_data.head()

Unnamed: 0,hour_of_day,day_of_week,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,0,1,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,...,193.0,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130
1,1,1,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,...,241.0,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27
2,2,1,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,...,225.0,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0
3,3,1,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,...,200.0,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1
4,4,1,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,...,172.0,209.0,104.0,220.0,149.0,130.0,27.0,0.0,1.0,3


In [12]:
# Chronological 80/20 split: with shuffle=False the first 80% of the rows
# become the training set and the final 20% the test set, so the model is
# never fitted on rows that come after the ones it is evaluated on.
# No `random_state` is passed: it only seeds the shuffling, so it has no
# effect when shuffle=False and would wrongly suggest the split is random.
X_train, X_test, y_train, y_test = train_test_split(
    df_data.drop(columns='target'),
    df_data['target'],
    test_size=0.2,
    shuffle=False,  # keep row order: this is a time-series problem
)

# Show the training target and training features to verify the split.
y_train, X_train

(0      130
 1       27
 2        0
 3        1
 4        3
       ... 
 571    256
 572    222
 573    191
 574    239
 575    156
 Name: target, Length: 576, dtype: int64,
      hour_of_day  day_of_week  rides_previous_24_hour  rides_previous_23_hour  \
 0              0            1                    59.0                     4.0   
 1              1            1                     4.0                     0.0   
 2              2            1                     0.0                     0.0   
 3              3            1                     0.0                     0.0   
 4              4            1                     0.0                     1.0   
 ..           ...          ...                     ...                     ...   
 571           19            3                   270.0                   185.0   
 572           20            3                   185.0                   183.0   
 573           21            3                   183.0                   192.0   
 574  

## Entrenamos un modelo simple: regresión lineal

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Baseline: a linear regression fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the fitted model as the cell's displayed value.
baseline_model = LinearRegression().fit(X_train, y_train)
baseline_model


In [20]:
# Baseline predictions for the held-out test split.
y_pred_baseline = baseline_model.predict(X=X_test)

In [23]:
# Evaluate the linear-regression baseline on the held-out test split.
mae = mean_absolute_error(y_test, y_pred_baseline)
# RMSE is computed as the square root of the MSE rather than by passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError. This form works on every version.
rmse = mean_squared_error(y_test, y_pred_baseline) ** 0.5
r2 = r2_score(y_test, y_pred_baseline)

print(f"Baseline (Regresión Lineal) - MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.2f}")

Baseline (Regresión Lineal) - MAE: 27.31 | RMSE: 35.47 | R2: 0.84


## Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed makes the fit reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred_rf = rf_model.predict(X_test)

# Evaluation on the test split. Every metric carries the `_rf` suffix so the
# baseline's `mae` / `rmse` / `r2` are not overwritten and both models stay
# comparable later in the notebook.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
# Square root of the MSE instead of `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - MAE: {mae_rf:.2f} | RMSE: {rmse_rf:.2f} | R2: {r2_rf:.2f}")


Random Forest - MAE: 20.85 | RMSE: 27.88 | R2: 0.90


In [25]:
# Display the random-forest predictions next to the true test targets.
(y_pred_rf, y_test)

(array([ 53.64,  12.8 ,   3.2 ,   1.17,   1.12,   0.89,   4.24,  35.92,
         94.07, 131.72, 176.75, 151.65, 190.37, 175.05, 233.14, 212.02,
        230.96, 221.16, 221.26, 229.83, 203.21, 173.49, 187.87, 151.47,
         40.08,  10.91,   0.99,   0.85,   1.06,   2.12,   5.08,  45.97,
         74.02, 112.75, 112.3 , 123.62, 131.2 , 134.01, 144.03, 155.  ,
        146.03, 115.16,  97.37,  90.12,  84.8 ,  89.85,  91.14,  88.19,
         47.43,  18.34,  10.19,   3.3 ,   3.04,   4.3 ,   7.28,  24.43,
         47.31,  92.  , 111.46, 122.23, 139.66, 177.5 , 212.99, 204.69,
        216.31, 218.13, 245.09, 255.72, 246.22, 237.41, 225.21, 206.14,
         79.03,  22.67,   8.59,   9.91,   3.43,   1.38,   4.15,  27.62,
         62.96, 166.81, 197.34, 149.59, 178.57, 184.47, 239.13, 225.03,
        214.25, 217.96, 225.71, 204.99, 183.78, 165.18, 204.08, 192.54,
        100.28,  15.9 ,   9.36,   2.5 ,   1.63,   1.38,   7.27,  66.14,
         94.58, 157.03, 167.23, 151.43, 176.09, 205.1 , 247.17, 