# Entrenamos un modelo

## Train - test split

En problemas de series temporales, el orden es importante y no podemos hacer un split aleatorio.

In [14]:
import sys
from pathlib import Path

# Make the `src` directory that sits in the parent of the current working
# directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path over and over.
src_dir = str(Path().resolve().parent / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [15]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

# Read the pre-built tabular dataset from the transformed-data directory.
dataset_path = TRANSFORMED_DATA_DIR / 'tabular_data_year_2024_4weeks_lags.parquet'
df = pd.read_parquet(dataset_path)

# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,2024-01-29 00:00:00,138,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,...,207.0,256.0,199.0,243.0,217.0,307.0,251.0,302.0,247.0,138
1,2024-01-29 01:00:00,138,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,...,256.0,199.0,243.0,217.0,307.0,251.0,302.0,247.0,138.0,15
2,2024-01-29 02:00:00,138,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,...,199.0,243.0,217.0,307.0,251.0,302.0,247.0,138.0,15.0,25
3,2024-01-29 03:00:00,138,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,...,243.0,217.0,307.0,251.0,302.0,247.0,138.0,15.0,25.0,2
4,2024-01-29 04:00:00,138,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,...,217.0,307.0,251.0,302.0,247.0,138.0,15.0,25.0,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8107,2024-12-31 19:00:00,138,330.0,328.0,310.0,366.0,381.0,71.0,16.0,0.0,...,142.0,160.0,117.0,87.0,173.0,113.0,221.0,160.0,137.0,64
8108,2024-12-31 20:00:00,138,328.0,310.0,366.0,381.0,71.0,16.0,0.0,0.0,...,160.0,117.0,87.0,173.0,113.0,221.0,160.0,137.0,64.0,53
8109,2024-12-31 21:00:00,138,310.0,366.0,381.0,71.0,16.0,0.0,0.0,3.0,...,117.0,87.0,173.0,113.0,221.0,160.0,137.0,64.0,53.0,100
8110,2024-12-31 22:00:00,138,366.0,381.0,71.0,16.0,0.0,0.0,3.0,6.0,...,87.0,173.0,113.0,221.0,160.0,137.0,64.0,53.0,100.0,63


In [18]:
from data_split import train_test_split
from datetime import datetime


# Date that separates the training period from the test period.
# NOTE(review): presumably rows before the cutoff go to train and the rest to
# test -- confirm against data_split.train_test_split.
CUTOFF_DATE = datetime(2024, 10, 1)

X_train, y_train, X_test, y_test = train_test_split(
    df, cutoff_date=CUTOFF_DATE, target_column_name='target'
)

# Report the shape of each split, one per line, with a single print call.
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

X_train.shape=(5904, 674)
y_train.shape=(5904,)
X_test.shape=(2208, 674)
y_test.shape=(2208,)


## Entrenamos Baseline

Predecimos la siguiente hora con el valor de la hora anterior

In [19]:
import numpy as np

# modelo baseline que predice dándole la hora anterior
class BaselineModelPreviousHour:
    """
    Naive baseline: the forecast is the demand observed in the previous hour.

    Nothing is learned from data; `fit` exists only so the class offers the
    usual fit/predict pair.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: this baseline has no parameters to estimate."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_1_hour` column of `X_test` as the forecast.

        Args:
            X_test: feature frame that contains a `rides_previous_1_hour` column.

        Returns:
            That column as a pandas Series, carrying the index of `X_test`.

        Raises:
            KeyError: if `X_test` has no `rides_previous_1_hour` column.
        """
        return X_test['rides_previous_1_hour']

In [20]:
from sklearn.metrics import mean_absolute_error

# The previous-hour baseline has a no-op fit, so predict directly on the test features.
model_bs1 = BaselineModelPreviousHour()
predictions = model_bs1.predict(X_test)

# Score the forecasts against the test targets with mean absolute error.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=45.8474


## Entrenamos Baseline II

Este baseline va a predecir con datos del día anterior

In [21]:
import pandas as pd
import numpy as np

class BaselineModelPreviousDay:
    """
    Naive baseline: the forecast is the demand observed 24 hours earlier,
    i.e. the same hour of the previous day.

    Nothing is learned from data; `fit` exists only so the class offers the
    usual fit/predict pair.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: this baseline has no parameters to estimate."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_24_hour` column of `X_test` as the forecast.

        Args:
            X_test: feature frame that contains a `rides_previous_24_hour` column.

        Returns:
            That column as a pandas Series, carrying the index of `X_test`.

        Raises:
            KeyError: if `X_test` has no `rides_previous_24_hour` column.
        """
        return X_test['rides_previous_24_hour']

In [22]:
from sklearn.metrics import mean_absolute_error

# The previous-day baseline has a no-op fit, so predict directly on the test features.
model_bs2 = BaselineModelPreviousDay()
predictions = model_bs2.predict(X_test)

# Score the forecasts against the test targets with mean absolute error.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=47.6667


## Baseline III

Predecimos con datos de la semana anterior

In [23]:
import pandas as pd
import numpy as np

class BaselineModelPreviousWeek:
    """
    Naive baseline: the forecast is the demand observed 168 hours earlier,
    i.e. the same hour one week before.

    Nothing is learned from data; `fit` exists only so the class offers the
    usual fit/predict pair.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: this baseline has no parameters to estimate."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_168_hour` column of `X_test` as the forecast.

        Args:
            X_test: feature frame that contains a `rides_previous_168_hour` column.

        Returns:
            That column as a pandas Series, carrying the index of `X_test`.

        Raises:
            KeyError: if `X_test` has no `rides_previous_168_hour` column.
        """
        # 168 = 7 days * 24 hours
        return X_test['rides_previous_168_hour']

In [25]:
from sklearn.metrics import mean_absolute_error

# The previous-week baseline has a no-op fit, so predict directly on the test features.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

# Score the forecasts against the test targets with mean absolute error.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=46.8596


## Baseline Average

In [26]:
class BaselineModelLast4Weeks:
    """
    Baseline that averages the demand seen at the same hour 7, 14, 21 and
    28 days before the hour being predicted.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """No-op: there is nothing to learn for this baseline."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Average the four weekly lag columns of `X_test`.

        The columns read are `rides_previous_168_hour`, `rides_previous_336_hour`,
        `rides_previous_504_hour` and `rides_previous_672_hour`; a missing one
        raises KeyError.
        """
        # One lag column per week back, each week being 7 * 24 hours.
        weekly_lag_columns = [
            f'rides_previous_{weeks_back*7*24}_hour' for weeks_back in (1, 2, 3, 4)
        ]

        # Accumulate left to right, starting from the most recent week.
        running_total = X_test[weekly_lag_columns[0]]
        for column in weekly_lag_columns[1:]:
            running_total = running_total + X_test[column]

        return 0.25 * running_total

In [27]:
# The four-week-average baseline has a no-op fit, so predict directly on the test features.
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)

# mean_absolute_error was imported by an earlier cell of this notebook.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')

test_mae=38.8298
