In [None]:
import sys
from pathlib import Path

# Make the project's `src` directory importable from this notebook.
# `Path().resolve()` is the absolute current working directory, so the entry
# appended here is "<parent of cwd>/src".
# NOTE(review): assumes the kernel's working directory is a sibling of `src` — confirm.
sys.path.append(str(Path().resolve().parent / "src"))

In [None]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

# Load the tabular dataset stored as parquet under TRANSFORMED_DATA_DIR.
# NOTE(review): the file name suggests 4 months of data with 4 weeks of lag features — confirm.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data_4months_4weeks_lags.parquet')

# Bare expression: the frame is rendered as the cell output.
df

In [None]:
from data_split import train_test_split
from datetime import datetime


# Split `df` into train/test features and targets using the project's
# `data_split.train_test_split` helper, with `target` as the label column.
# NOTE(review): presumably rows before `cutoff_date` form the training set and
# the remaining rows the test set — confirm against `data_split`.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2024, 1, 25),
    target_column_name='target'
)

# Sanity check: print the shape of each split.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

In [None]:
# Feature-engineering function that adds the 4-week average of the weekly lags.
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``X`` with one extra column, ``average_rides_last_4_weeks``.

    The new column is the mean of the ride counts observed
    - 7 days ago   (``rides_previous_168_hour``)
    - 14 days ago  (``rides_previous_336_hour``)
    - 21 days ago  (``rides_previous_504_hour``)
    - 28 days ago  (``rides_previous_672_hour``)

    The input frame is never modified. This function is used as a
    ``FunctionTransformer`` step, so writing the column into ``X`` in place
    would silently alter the caller's train/test frames every time the
    pipeline is fitted or used for prediction.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing the four weekly lag columns listed above.

    Returns
    -------
    pd.DataFrame
        A new frame with all columns of ``X`` plus ``average_rides_last_4_weeks``.

    Raises
    ------
    KeyError
        If any of the four lag columns is missing from ``X``.
    """
    hours_per_week = 7 * 24
    lag_columns = [
        f'rides_previous_{weeks * hours_per_week}_hour' for weeks in range(1, 5)
    ]

    # Add the lags left to right with `+` so NaNs propagate exactly as in a
    # plain column-wise addition (DataFrame.sum would skip them by default).
    total = X[lag_columns[0]]
    for column in lag_columns[1:]:
        total = total + X[column]

    # `assign` builds a new DataFrame; the caller's object stays untouched.
    return X.assign(average_rides_last_4_weeks=0.25 * total)

In [None]:

from sklearn.preprocessing import FunctionTransformer

# Wrap the feature function in a FunctionTransformer so it can be used as a
# scikit-learn pipeline step; input validation is left disabled.
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func=average_rides_last_4_weeks,
    validate=False,
)

In [None]:
# Apply the transformer to the training features; the returned frame is
# rendered as the cell output.
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Add the temporal features `hour` and `day_of_week`.
# Implemented as a scikit-learn compatible transformer class.
class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):
    """
    Replace the `pickup_hour` datetime column with two numeric columns:
    `hour` and `day_of_week`.
    """

    def fit(self, X, y=None):
        """Nothing to learn from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return a new frame with `hour` and `day_of_week` derived from
        `pickup_hour` (via the `.dt` accessor) and with `pickup_hour` dropped.
        The input frame is left unchanged.
        """
        pickup = X['pickup_hour']

        # `assign` works on a copy of X, so the caller's frame is not modified.
        with_temporal = X.assign(
            hour=pickup.dt.hour,
            day_of_week=pickup.dt.dayofweek,
        )

        return with_temporal.drop(columns=['pickup_hour'])

In [None]:
# Instantiate the temporal feature transformer and apply it to the training
# features; the returned frame is rendered as the cell output.
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

In [None]:
# Entrenamiento
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

# Chain both feature-engineering steps and a default LightGBM regressor
# into a single scikit-learn pipeline.
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(),
)

# Train the model on the training split.
pipeline.fit(X=X_train, y=y_train)

# Predict on the held-out test split.
predictions = pipeline.predict(X=X_test)

# Evaluate the model: mean absolute error on the test split.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')