# Entrenamos un modelo

## Train - test split

En problemas de series temporales, el orden es importante y no podemos hacer un split aleatorio.

In [24]:
import sys
from pathlib import Path

# Add the "src" directory that sits next to the parent of the current working
# directory to the import search path, so modules located there can be imported.
sys.path.append(str(Path().resolve().parent / "src"))

In [25]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

# Load the tabular dataset stored as parquet under TRANSFORMED_DATA_DIR.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target
0,2024-01-08 00:00:00,138,59.0,4.0,0.0,0.0,0.0,1.0,1.0,8.0,...,222.0,195.0,177.0,229.0,195.0,291.0,235.0,237.0,156.0,80
1,2024-01-08 01:00:00,138,4.0,0.0,0.0,0.0,1.0,1.0,8.0,64.0,...,195.0,177.0,229.0,195.0,291.0,235.0,237.0,156.0,80.0,8
2,2024-01-08 02:00:00,138,0.0,0.0,0.0,1.0,1.0,8.0,64.0,139.0,...,177.0,229.0,195.0,291.0,235.0,237.0,156.0,80.0,8.0,2
3,2024-01-08 03:00:00,138,0.0,0.0,1.0,1.0,8.0,64.0,139.0,199.0,...,229.0,195.0,291.0,235.0,237.0,156.0,80.0,8.0,2.0,0
4,2024-01-08 04:00:00,138,0.0,1.0,1.0,8.0,64.0,139.0,199.0,138.0,...,195.0,291.0,235.0,237.0,156.0,80.0,8.0,2.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,2024-01-31 19:00:00,138,270.0,185.0,183.0,192.0,157.0,48.0,10.0,0.0,...,152.0,132.0,157.0,143.0,200.0,170.0,145.0,142.0,197.0,200
572,2024-01-31 20:00:00,138,185.0,183.0,192.0,157.0,48.0,10.0,0.0,2.0,...,132.0,157.0,143.0,200.0,170.0,145.0,142.0,197.0,200.0,205
573,2024-01-31 21:00:00,138,183.0,192.0,157.0,48.0,10.0,0.0,2.0,1.0,...,157.0,143.0,200.0,170.0,145.0,142.0,197.0,200.0,205.0,137
574,2024-01-31 22:00:00,138,192.0,157.0,48.0,10.0,0.0,2.0,1.0,1.0,...,143.0,200.0,170.0,145.0,142.0,197.0,200.0,205.0,137.0,222


In [26]:
from data_split import train_test_split
from datetime import datetime


# Split on a cutoff date rather than at random, because row order matters in
# time-series problems.
# NOTE(review): presumably rows before `cutoff_date` go to train and the rest
# to test -- confirm against data_split.train_test_split.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2024, 1, 25),
    target_column_name='target'
)

# The `=` f-string specifier prints the expression text followed by its value.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(408, 170)
y_train.shape=(408,)
X_test.shape=(168, 170)
y_test.shape=(168,)


## Entrenamos Baseline

Predecimos la siguiente hora con el valor de la hora anterior

In [27]:
import numpy as np

# Baseline model that predicts using the value of the previous hour.
class BaselineModelPreviousHour:
    """
    Prediction = actual demand observed in the last hour.

    The model has no learnable parameters: it simply echoes the
    `rides_previous_1_hour` feature as its forecast.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; kept so the class offers a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_1_hour` column of `X_test`.

        Args:
            X_test: Feature table that contains a `rides_previous_1_hour`
                column.

        Returns:
            The selected column as a Series, with the index of `X_test`.

        Raises:
            KeyError: If `X_test` has no `rides_previous_1_hour` column.
        """
        return X_test['rides_previous_1_hour']

In [28]:
# fit() is skipped here: for this baseline it is a no-op, so predict() can be
# called directly on the test features.
model_bs1 = BaselineModelPreviousHour()
predictions = model_bs1.predict(X_test)
# Bare expression: the notebook renders the predictions as the cell output.
predictions

0      157.0
1       48.0
2       10.0
3        0.0
4        2.0
       ...  
163    197.0
164    200.0
165    205.0
166    137.0
167    222.0
Name: rides_previous_1_hour, Length: 168, dtype: float64

In [29]:
# Model evaluation: mean absolute error between the test targets and the
# predictions produced by the previous cell.
from sklearn.metrics import mean_absolute_error

test_mae = mean_absolute_error(y_test, predictions)
# Print the metric name and its value with four decimal places.
print(f'{test_mae=:.4f}')

test_mae=34.7083


## Entrenamos Baseline II

Este Baseline va a predecir con datos del día anterior

In [33]:
import pandas as pd
import numpy as np

class BaselineModelPreviousDay:
    """
    Prediction = actual demand observed 24 hours earlier (previous day).

    The model has no learnable parameters: it simply echoes the
    `rides_previous_24_hour` feature as its forecast.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; kept so the class offers a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_24_hour` column of `X_test`.

        Args:
            X_test: Feature table that contains a `rides_previous_24_hour`
                column.

        Returns:
            The selected column as a Series, with the index of `X_test`.

        Raises:
            KeyError: If `X_test` has no `rides_previous_24_hour` column.
        """
        return X_test['rides_previous_24_hour']

In [34]:
# fit() is skipped here: for this baseline it is a no-op, so predict() can be
# called directly on the test features.
model_bs2 = BaselineModelPreviousDay()
# Overwrites the `predictions` variable used by the earlier evaluation cell.
predictions = model_bs2.predict(X_test)
# Bare expression: the notebook renders the predictions as the cell output.
predictions

0       12.0
1        6.0
2        0.0
3        0.0
4        0.0
       ...  
163    232.0
164    192.0
165    139.0
166    183.0
167    112.0
Name: rides_previous_24_hour, Length: 168, dtype: float64

In [35]:
# Model evaluation: mean absolute error between the test targets and the
# predictions produced by the previous cell.
from sklearn.metrics import mean_absolute_error

test_mae = mean_absolute_error(y_test, predictions)
# Print the metric name and its value with four decimal places.
print(f'{test_mae=:.4f}')

test_mae=39.0893


## Baseline III

Predecimos con datos de la semana anterior

In [None]:
import pandas as pd
import numpy as np

class BaselineModelPreviousWeek:
    """
    Prediction = actual demand observed in the last week.

    The model has no learnable parameters: it simply echoes the
    `rides_previous_168_hour` feature as its forecast.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; kept so the class offers a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_168_hour` column of `X_test`.

        Args:
            X_test: Feature table that contains a `rides_previous_168_hour`
                column.

        Returns:
            The selected column as a Series, with the index of `X_test`.

        Raises:
            KeyError: If `X_test` has no `rides_previous_168_hour` column.
        """
        # 168 hours = 7 days * 24 hours
        return X_test['rides_previous_168_hour']

In [37]:
# fit() is skipped here: for this baseline it is a no-op, so predict() can be
# called directly on the test features.
model = BaselineModelPreviousWeek()
# Overwrites the `predictions` variable used by the earlier evaluation cells.
predictions = model.predict(X_test)
# Bare expression: the notebook renders the predictions as the cell output.
predictions

0       49.0
1        2.0
2        1.0
3        0.0
4        0.0
       ...  
163    270.0
164    185.0
165    183.0
166    192.0
167    157.0
Name: rides_previous_168_hour, Length: 168, dtype: float64

In [38]:
# Model evaluation: mean absolute error between the test targets and the
# predictions produced by the previous cell.
from sklearn.metrics import mean_absolute_error

test_mae = mean_absolute_error(y_test, predictions)
# Print the metric name and its value with four decimal places.
print(f'{test_mae=:.4f}')

test_mae=24.6429


## Baseline Average

In [39]:
import pandas as pd
import numpy as np

class BaselineModelAverage:
    """
    Prediction = mean of the demand observed one week, one day and one hour
    earlier.

    The model has no learnable parameters: it averages the
    `rides_previous_168_hour`, `rides_previous_24_hour` and
    `rides_previous_1_hour` features with equal weight.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; kept so the class offers a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the row-wise mean of the three lag columns of `X_test`.

        Args:
            X_test: Feature table that contains the `rides_previous_168_hour`,
                `rides_previous_24_hour` and `rides_previous_1_hour` columns.

        Returns:
            An unnamed Series with the index of `X_test`. A missing value in
            any of the three columns propagates to the result for that row.

        Raises:
            KeyError: If any of the three columns is absent from `X_test`.
        """
        previous_week = X_test['rides_previous_168_hour']  # 7 days * 24 hours
        previous_day = X_test['rides_previous_24_hour']
        previous_hour = X_test['rides_previous_1_hour']
        # Explicit sum / 3 (rather than a NaN-skipping mean) so that missing
        # inputs are not silently ignored.
        return (previous_week + previous_day + previous_hour) / 3

In [40]:
# fit() is skipped here: for this baseline it is a no-op, so predict() can be
# called directly on the test features.
model = BaselineModelAverage()
# Overwrites the `predictions` variable used by the earlier evaluation cells.
predictions = model.predict(X_test)
# Bare expression: the notebook renders the predictions as the cell output.
predictions

0       72.666667
1       18.666667
2        3.666667
3        0.000000
4        0.666667
          ...    
163    233.000000
164    192.333333
165    175.666667
166    170.666667
167    163.666667
Length: 168, dtype: float64

In [41]:
# Model evaluation: mean absolute error between the test targets and the
# predictions produced by the previous cell.
from sklearn.metrics import mean_absolute_error

test_mae = mean_absolute_error(y_test, predictions)
# Print the metric name and its value with four decimal places.
print(f'{test_mae=:.4f}')

test_mae=24.1111
