In [3]:
import os
import sys
# Make the parent of the current working directory importable so that the
# `src` package used by later cells can be resolved.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
parent_dir = os.path.abspath(os.pardir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [12]:
import pandas as pd
import numpy as np
from src.paths import TRANSFORMED_DATA_DIR
from datetime import datetime
from src.data_split import train_test_split

In [5]:
# Load the tabular dataset stored as parquet in the transformed-data directory.
# NOTE(review): 'tarbular' looks like a typo for 'tabular', but the name must
# match the file on disk — confirm before renaming.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tarbular_data.parquet')
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hours,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-27,199,0.0
88290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-28,199,0.0
88291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-29,199,0.0
88292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-12-30,199,0.0


In [7]:
# Split the dataset around a fixed cutoff timestamp using the project's
# train_test_split helper; the label column is passed by name.
# NOTE(review): presumably rows before the cutoff form the training set —
# verify against src.data_split.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    target_column_name='target_rides_next_hour',
    cutoff_date=datetime(year=2022, month=6, day=1),
)

# Report the dimensions of each split as a quick sanity check.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(56068, 674)
y_test.shape=(56068,)


In [18]:
class BaselineModelPreviousHour:
    """
    Naive baseline: the prediction is the demand observed in the previous hour.

    The model has no learnable parameters; ``fit`` exists only so the class
    offers the usual ``fit``/``predict`` estimator interface.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousHour":
        """
        Do nothing (there is nothing to learn).

        Args:
            X_train: training features (unused).
            y_train: training targets (unused).

        Returns:
            The model itself, so ``model.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the previous-hour ride counts as the predictions.

        Args:
            X_test: feature frame containing a 'rides_previous_1_hour' column.

        Returns:
            The 'rides_previous_1_hour' column of ``X_test`` (a pandas Series
            that keeps the frame's index).

        Raises:
            KeyError: if ``X_test`` has no 'rides_previous_1_hour' column.
        """
        return X_test['rides_previous_1_hour']

In [19]:
# Produce predictions for the test split with the previous-hour baseline.
# fit() is not called here; the baseline's fit() performs no work.
model = BaselineModelPreviousHour()
baseline_predictions = model.predict(X_test)
# Bare expression: display the predictions as the cell output.
baseline_predictions

0         0.0
1         5.0
2        13.0
3        12.0
4        14.0
         ... 
56063     0.0
56064     0.0
56065     0.0
56066     0.0
56067     0.0
Name: rides_previous_1_hour, Length: 56068, dtype: float32

In [20]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so a fresh run surfaces all dependencies early.
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the true test targets and the previous-hour
# baseline predictions.
test_mae = mean_absolute_error(y_test, baseline_predictions)
# Print the metric rounded to four decimal places.
print(f'{test_mae=:.4f}')

test_mae=6.1252


<p>We can say that the baseline model makes an error of approximately 6 trips on average.</p>

That is what the Mean Absolute Error tells us.

## Let's implement another baseline model that leverages the weekly seasonality in our time-series data.
For example, if the hour we want to predict is a Friday at 5 pm, we can look back at the demand observed on the previous Friday at 5 pm and use that historical demand as our estimate.

In [24]:
class BaselineModelPreviousWeek:
    """
    Seasonal naive baseline: prediction = actual demand observed at t - 7 days.

    The model has no learnable parameters; ``fit`` exists only so the class
    offers the usual ``fit``/``predict`` estimator interface.
    """

    # Lag, in hours, of the feature used as the prediction (one full week).
    LAG_HOURS = 7 * 24

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousWeek":
        """
        Do nothing (there is nothing to learn).

        Args:
            X_train: training features (unused).
            y_train: training targets (unused).

        Returns:
            The model itself, so ``model.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the ride counts observed 7 days (168 hours) earlier.

        Args:
            X_test: feature frame containing a 'rides_previous_168_hour' column.

        Returns:
            The 'rides_previous_168_hour' column of ``X_test`` (a pandas
            Series that keeps the frame's index).

        Raises:
            KeyError: if ``X_test`` has no 'rides_previous_168_hour' column.
        """
        # The column name encodes the lag in hours: 7 * 24 = 168.
        return X_test[f'rides_previous_{self.LAG_HOURS}_hour']

In [None]:
# Evaluate the previous-week baseline on the same test split.
# Relies on mean_absolute_error having been imported in an earlier cell.
model2 = BaselineModelPreviousWeek()
baseline_predictions2 = model2.predict(X_test)
test_mae2 = mean_absolute_error(y_test,baseline_predictions2)
# Print the metric rounded to four decimal places.
print(f'Baseline model 2 MAE={np.round(test_mae2,4)}')

Baseline model 2 MAE=3.7233
