In [12]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# Read the pre-built tabular feature table from the project's transformed-data
# directory. Per the recorded preview, it holds `rides_previous_<k>_hour` lag
# columns plus `pickup_hour`, `pickup_location_id` and `target_rides_next_hour`.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
# Bare last expression: the notebook renders the (truncated) frame as output.
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,1.0,...,4.0,0.0,3.0,0.0,0.0,0.0,0.0,2022-10-29,1,0.0
1,0.0,2.0,0.0,0.0,2.0,2.0,3.0,3.0,4.0,0.0,...,1.0,6.0,2.0,5.0,0.0,0.0,0.0,2022-10-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,9.0,3.0,6.0,1.0,2.0,0.0,0.0,2022-10-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-29,2,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-10-30,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,51.0,34.0,23.0,15.0,16.0,5.0,14.0,21.0,18.0,46.0,...,100.0,96.0,85.0,90.0,72.0,88.0,79.0,2022-10-30,264,41.0
791,31.0,21.0,4.0,4.0,3.0,5.0,23.0,48.0,51.0,61.0,...,124.0,89.0,88.0,96.0,65.0,82.0,70.0,2022-10-31,264,22.0
792,7.0,6.0,3.0,4.0,3.0,5.0,7.0,6.0,5.0,10.0,...,29.0,13.0,9.0,10.0,5.0,10.0,7.0,2022-10-29,265,6.0
793,6.0,5.0,8.0,6.0,6.0,0.0,1.0,2.0,8.0,6.0,...,8.0,10.0,7.0,3.0,3.0,6.0,2.0,2022-10-30,265,10.0


In [11]:
from datetime import datetime
from src.data_split import train_test_split

# Split the feature table into train/test features and targets using the
# project helper.
# NOTE(review): the helper's source is not visible here; presumably rows are
# assigned by comparing `pickup_hour` with `cutoff_date` — confirm in
# src.data_split.
# NOTE(review): the recorded output reports 0 training rows and 795 test rows,
# and the frame preview only shows `pickup_hour` values from late October 2022,
# i.e. after this cutoff — confirm the cutoff date is the intended one.
# NOTE(review): this cell's execution count (11) is lower than the loading
# cell's (12), so the recorded shapes may stem from an out-of-order run —
# re-run the notebook top to bottom before trusting them.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

# The `=` specifier prints the expression text followed by its value.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(0, 674)
y_train.shape=(0,)
X_test.shape=(795, 674)
y_test.shape=(795,)


In [3]:
import numpy as np

class BaselineModelPreviousHour:
    """
    Prediction = actual demand observed in the last hour.

    The model has no learnable state: `fit` does nothing and `predict` simply
    hands back one lag column of the feature frame.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op; both arguments are ignored because nothing is learned."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_1_hour` column of `X_test` as the prediction.

        Args:
            X_test: feature frame that contains a `rides_previous_1_hour` column.

        Returns:
            That column as a pandas Series, indexed like `X_test`.

        Raises:
            KeyError: if `X_test` has no `rides_previous_1_hour` column.
        """
        return X_test['rides_previous_1_hour']

In [4]:
# Predict with the previous-hour baseline. `fit` is not called: the class's
# `fit` is a no-op, so the model is usable straight after construction.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
# Bare last expression so the notebook displays the prediction Series.
predictions

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
790    79.0
791    70.0
792     7.0
793     2.0
794     4.0
Name: rides_previous_1_hour, Length: 795, dtype: float32

In [5]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the current `predictions` against the test target.
test_mae = mean_absolute_error(y_test, predictions)
# `=:.4f` prints the variable name followed by its value with four decimals.
print(f'{test_mae=:.4f}')

test_mae=5.9572


In [6]:
import pandas as pd
import numpy as np

class BaselineModelPreviousWeek:
    """
    Prediction = actual demand observed at t - 7 days.

    The model has no learnable state: `fit` does nothing and `predict` simply
    hands back the lag column that sits 7 * 24 hours in the past.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op; both arguments are ignored because nothing is learned."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_168_hour` column of `X_test` as the prediction.

        Args:
            X_test: feature frame that contains a `rides_previous_168_hour`
                column (168 = 7 * 24 hours).

        Returns:
            That column as a pandas Series, indexed like `X_test`.

        Raises:
            KeyError: if `X_test` has no `rides_previous_168_hour` column.
        """
        # Spell the lag out as days * hours so the one-week offset is explicit.
        lag_hours = 7 * 24
        return X_test[f'rides_previous_{lag_hours}_hour']

In [7]:
# Predict with the previous-week baseline. This rebinds `model` and
# `predictions`, replacing the values left by the previous-hour baseline.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

In [8]:
# Mean absolute error of the current `predictions` against the test target;
# overwrites the `test_mae` value computed for the earlier baseline.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=4.6025


In [9]:
class BaselineModelLast4Weeks:
    """
    Prediction = 0.25 times the sum of the actual demand observed at
    t - 7 days, t - 14 days, t - 21 days and t - 28 days.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Nothing is learned; present only to offer a fit/predict interface."""
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Combine the four weekly lag columns of `X_test` into one prediction.

        The columns read are `rides_previous_<k>_hour` for k = 168, 336, 504
        and 672; they are added in that order and the sum is scaled by 0.25.
        A missing column surfaces as the KeyError raised by the frame lookup.
        """
        week_hours = 7 * 24
        lag_columns = [
            f'rides_previous_{weeks_back * week_hours}_hour'
            for weeks_back in range(1, 5)
        ]

        # Accumulate left to right so lookups and additions happen in the
        # same sequence as a chained `a + b + c + d` expression.
        running_total = X_test[lag_columns[0]]
        for column in lag_columns[1:]:
            running_total = running_total + X_test[column]

        return 0.25 * running_total

In [10]:
# Predict with the four-week baseline and score it. This rebinds `model`,
# `predictions` and `test_mae`, replacing the previous baseline's values.
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)

# Mean absolute error against the test target, printed with four decimals.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=4.0701
