# Stack
1. LightGBM
3. XGBoost
4. meta model - Ridge

In [27]:
''' !pip install -q gdown
!pip install -q category_encoders
!pip install -q optuna '''
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gdown
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import Ridge
import shap
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
import category_encoders as ce
from category_encoders import TargetEncoder
import optuna

# Load data

In [28]:
file_id = "1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="uber.csv", quiet=False)
df = pd.read_csv("uber.csv")
df.head()

Downloading...
From: https://drive.google.com/uc?id=1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1
To: /content/uber.csv
100%|██████████| 23.5M/23.5M [00:00<00:00, 176MB/s]


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


# Set up
1. Log transform to habdle skew in the target.
2. Drop rows with nans in features without a meaningful imputation value.
3. Establish hold out set.

In [29]:
df.drop_duplicates(inplace=True)

df['log_fare_amount'] = np.log1p(df['fare_amount'])

df = df.dropna(subset=[
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_datetime',
    'log_fare_amount'
])

X = df.drop(columns=['fare_amount', 'log_fare_amount', 'key', 'Unnamed: 0'])
y = df['log_fare_amount']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Extraction
### Datetime features extracted (originals dropped)
1. hour
2. weekday
3. month
4. is weekend
5. is rush hour
6. missing datatime flag

If datetime extractor is unable to extract useable datatime, filled with place holder, to prevent crash.

### Coordinate features extracted (originals dropped)
1. haversine distance
2. manhatten distance
3. barring
4. pickup cluster
5. dropoff cluster
6. pickup grid
7. dropoff grid
8. pickup coordinate missing flag
9. dropoff coordinate missing flag

If coordinate extractor is unable to extract useable coordinates, filled with place holder, to prevent crash.

In [30]:
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_column='pickup_datetime', add_missing_flag=True):
        self.datetime_column = datetime_column
        self.add_missing_flag = add_missing_flag
        self._output_config = {"transform": "default"}
        self.placeholder = pd.Timestamp("1900-01-01 00:00:00")

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        X[self.datetime_column] = pd.to_datetime(X[self.datetime_column], errors='coerce')

        if self.add_missing_flag:
            X[f'{self.datetime_column}_missing'] = X[self.datetime_column].isna()

        X[self.datetime_column] = X[self.datetime_column].fillna(self.placeholder)

        X['hour'] = X[self.datetime_column].dt.hour
        X['weekday'] = X[self.datetime_column].dt.weekday
        X['month'] = X[self.datetime_column].dt.month
        X['is_weekend'] = X['weekday'] >= 5
        X['is_rush_hour'] = X['hour'].isin([7, 8, 9, 16, 17, 18, 19])

        X.drop(columns=[self.datetime_column], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CoordinateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=20, placeholder=-999.0):
        self.n_clusters = n_clusters
        self.placeholder = placeholder
        self._output_config = {"transform": "default"}
        self.kmeans_pickup = None
        self.kmeans_dropoff = None

    def fit(self, X, y=None):
        pickup_coords  = X[["pickup_latitude", "pickup_longitude"]].astype(float)
        dropoff_coords = X[["dropoff_latitude", "dropoff_longitude"]].astype(float)

        self.kmeans_pickup  = KMeans(n_clusters=self.n_clusters, random_state=42).fit(pickup_coords)
        self.kmeans_dropoff = KMeans(n_clusters=self.n_clusters, random_state=42).fit(dropoff_coords)
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        geo_cols = [
            "pickup_latitude", "pickup_longitude",
            "dropoff_latitude", "dropoff_longitude",
        ]
        for col in geo_cols:
            X[col] = pd.to_numeric(X[col], errors="coerce")

        X["pickup_missing"]  = X[["pickup_latitude", "pickup_longitude"]].isna().any(axis=1)
        X["dropoff_missing"] = X[["dropoff_latitude", "dropoff_longitude"]].isna().any(axis=1)

        for col in geo_cols:
            X[col] = X[col].fillna(self.placeholder)

        X["pickup_latitude"]   = X["pickup_latitude"].clip(40.5, 41.0)
        X["pickup_longitude"]  = X["pickup_longitude"].clip(-74.3, -73.6)
        X["dropoff_latitude"]  = X["dropoff_latitude"].clip(40.5, 41.0)
        X["dropoff_longitude"] = X["dropoff_longitude"].clip(-74.3, -73.6)

        def haversine(lat1, lon1, lat2, lon2):
            R = 6371.0
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
            return R * 2 * np.arcsin(np.sqrt(a))

        def manhattan(lat1, lon1, lat2, lon2):
            return haversine(lat1, lon1, lat2, lon1) + haversine(lat2, lon1, lat2, lon2)

        def bearing(lat1, lon1, lat2, lon2):
            dlon = np.radians(lon2 - lon1)
            lat1, lat2 = np.radians(lat1), np.radians(lat2)
            x = np.sin(dlon) * np.cos(lat2)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
            return (np.degrees(np.arctan2(x, y)) + 360) % 360

        X["distance_km"] = haversine(
            X["pickup_latitude"], X["pickup_longitude"],
            X["dropoff_latitude"], X["dropoff_longitude"]
        )

        X["manhattan_km"] = manhattan(
            X["pickup_latitude"], X["pickup_longitude"],
            X["dropoff_latitude"], X["dropoff_longitude"]
        )

        X["bearing"] = bearing(
            X["pickup_latitude"], X["pickup_longitude"],
            X["dropoff_latitude"], X["dropoff_longitude"]
        )

        p_lat_g = X["pickup_latitude"].round(3)
        p_lon_g = X["pickup_longitude"].round(3)
        d_lat_g = X["dropoff_latitude"].round(3)
        d_lon_g = X["dropoff_longitude"].round(3)

        X["grid_distance_km"] = manhattan(p_lat_g, p_lon_g, d_lat_g, d_lon_g)

        X["pickup_cluster"]  = self.kmeans_pickup.predict(X[["pickup_latitude", "pickup_longitude"]])
        X["dropoff_cluster"] = self.kmeans_dropoff.predict(X[["dropoff_latitude", "dropoff_longitude"]])

        X["pickup_grid"] = p_lat_g.astype(str) + "_" + p_lon_g.astype(str)
        X["dropoff_grid"] = d_lat_g.astype(str) + "_" + d_lon_g.astype(str)

        X.drop(columns=geo_cols, inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Feature engineering

In [31]:
class DurationEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, rush_speed_kmh: float = 20.0, normal_speed_kmh: float = 30.0):
        self.rush_speed_kmh = rush_speed_kmh
        self.normal_speed_kmh = normal_speed_kmh
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        self._fitted_ = True
        return self

    def transform(self, X):
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, "_fitted_")

        X = X.copy()
        distance = X["manhattan_km"] if "manhattan_km" in X else X["distance_km"]
        speed_kmh = (
            np.where(X["is_rush_hour"], self.rush_speed_kmh, self.normal_speed_kmh)
            if "is_rush_hour" in X
            else self.normal_speed_kmh
        )
        X["estimated_duration_min"] = (distance / speed_kmh) * 60
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self
class PickupClusterDensity(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        cluster_col: str = "pickup_cluster",
        output_name: str = "pickup_cluster_density",
    ):
        self.cluster_col = cluster_col
        self.output_name = output_name
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)
        if self.cluster_col not in X_df.columns:
            raise ValueError(f"Column {self.cluster_col!r} not found in input.")

        counts = X_df[self.cluster_col].value_counts(dropna=False).astype(float)
        total = float(len(X_df))
        self._density_ = (counts / total).to_dict()
        self._global_density_ = 1.0 / total
        return self

    def transform(self, X):
        check_is_fitted(self, ["_density_"])
        X_out = pd.DataFrame(X).copy()

        X_out[self.output_name] = (
            X_out[self.cluster_col]
            .map(self._density_)
            .fillna(self._global_density_)
        )

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class GridAvgFareOOF(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        grid_col: str = "pickup_grid",
        output_name: str = "grid_avg_fare",
        n_splits: int = 5,
        random_state: int = 42,
        handle_unseen: str = "global_mean",
    ):
        self.grid_col = grid_col
        self.output_name = output_name
        self.n_splits = n_splits
        self.random_state = random_state
        self.handle_unseen = handle_unseen
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y):
        X_df = pd.DataFrame(X).reset_index(drop=True)
        if self.grid_col not in X_df.columns:
            raise ValueError(f"Column {self.grid_col!r} not found in input.")

        y = pd.Series(y).reset_index(drop=True)
        kf = KFold(
            n_splits=self.n_splits,
            shuffle=True,
            random_state=self.random_state,
        )

        oof_vals = pd.Series(np.nan, index=X_df.index, dtype=float)

        for train_idx, val_idx in kf.split(X_df):
            means = (
                pd.DataFrame(
                    {self.grid_col: X_df.loc[train_idx, self.grid_col], "y": y[train_idx]}
                )
                .groupby(self.grid_col)["y"]
                .mean()
            )
            oof_vals.loc[val_idx] = (
                X_df.loc[val_idx, self.grid_col]
                .map(means)
                .fillna(y[train_idx].mean())
            )

        self._oof_values_ = oof_vals
        self._grid_means_ = (
            pd.DataFrame({self.grid_col: X_df[self.grid_col], "y": y})
            .groupby(self.grid_col)["y"]
            .mean()
            .to_dict()
        )
        self._global_mean_ = float(y.mean())
        self._return_oof_ = True
        return self

    def transform(self, X):
        check_is_fitted(self, ["_grid_means_"])
        X_out = pd.DataFrame(X).copy()

        if self._return_oof_ and len(X_out) == len(self._oof_values_):
            X_out[self.output_name] = self._oof_values_.values
            self._return_oof_ = False
            return X_out

        mapped = X_out[self.grid_col].map(self._grid_means_)
        if self.handle_unseen == "global_mean":
            mapped = mapped.fillna(self._global_mean_)
        X_out[self.output_name] = mapped.astype(float)
        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class PickupDropFreq(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        pickup_col: str = "pickup_cluster",
        dropoff_col: str = "dropoff_cluster",
        output_name: str = "pickup_drop_freq",
        handle_unseen: str = "global_mean",
    ):
        self.pickup_col = pickup_col
        self.dropoff_col = dropoff_col
        self.output_name = output_name
        self.handle_unseen = handle_unseen
        self._output_config = {"transform": "pandas"}
    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)

        missing_cols = [c for c in [self.pickup_col, self.dropoff_col] if c not in X_df]
        if missing_cols:
            raise ValueError(f"Input is missing columns: {missing_cols}")


        pair_counts = (
            X_df.groupby([self.pickup_col, self.dropoff_col], dropna=False)
                .size()
                .astype(float)
        )
        total = float(len(X_df))
        self._freq_ = (pair_counts / total).to_dict()
        self._global_mean_ = 1.0 / total
        return self

    def transform(self, X):
        check_is_fitted(self, ["_freq_"])
        X_out = pd.DataFrame(X).copy()

        pairs = list(zip(X_out[self.pickup_col], X_out[self.dropoff_col]))
        mapped = pd.Series(pairs).map(self._freq_)

        if self.handle_unseen == "global_mean":
            mapped = mapped.fillna(self._global_mean_)

        X_out[self.output_name] = mapped.astype(float)
        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class ColumnPairConcat(BaseEstimator, TransformerMixin):

    def __init__(self, col_a, col_b, output_col, as_string: bool = True):
        self.col_a = col_a
        self.col_b = col_b
        self.output_col = output_col
        self.as_string = as_string
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = pd.DataFrame(X).copy()
        if self.col_a not in X_out or self.col_b not in X_out:
            missing = [c for c in [self.col_a, self.col_b] if c not in X_out]
            raise ValueError(f"Missing columns: {missing}")

        combo = X_out[self.col_a].astype(str) + "_" + X_out[self.col_b].astype(str)
        if self.as_string:
            X_out[self.output_col] = combo
        else:

            X_out[self.output_col] = combo.apply(lambda s: hash(s) & 0x7FFFFFFF)

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class SameClusterFlag(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        pickup_col: str = "pickup_cluster",
        dropoff_col: str = "dropoff_cluster",
        output_col: str = "same_cluster",
    ):
        self.pickup_col = pickup_col
        self.dropoff_col = dropoff_col
        self.output_col = output_col
        self._output_config = {"transform": "pandas"}
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = pd.DataFrame(X).copy()

        missing = [c for c in [self.pickup_col, self.dropoff_col] if c not in X_out]
        if missing:
            raise ValueError(f"Input is missing columns: {missing}")

        X_out[self.output_col] = (
            X_out[self.pickup_col] == X_out[self.dropoff_col]
        )

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class DistanceComparisons(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        distance_col: str = "distance_km",
        manhattan_col: str = "manhattan_km",
        ratio_name: str = "distance_over_manhattan",
        diff_name: str = "manhattan_minus_distance",
        eps: float = 1e-6,
    ):
        self.distance_col = distance_col
        self.manhattan_col = manhattan_col
        self.ratio_name = ratio_name
        self.diff_name = diff_name
        self.eps = eps
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = pd.DataFrame(X).copy()

        missing = [
            c
            for c in [self.distance_col, self.manhattan_col]
            if c not in X_out.columns
        ]
        if missing:
            raise ValueError(f"Input is missing columns: {missing}")

        d  = X_out[self.distance_col].astype(float)
        dm = X_out[self.manhattan_col].astype(float)

        X_out[self.ratio_name] = d / (dm + self.eps)

        X_out[self.diff_name] = dm - d

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class TimeOfDayFlags(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        hour_col: str = "hour",
        work_hours: tuple = (9, 17),
        morning_range: tuple = (5, 11),
        afternoon_range: tuple = (12, 16),
        night_range: tuple = (21, 4),
        output_prefix: str = "",
    ):
        self.hour_col = hour_col
        self.work_hours = work_hours
        self.morning_range = morning_range
        self.afternoon_range = afternoon_range
        self.night_range = night_range
        self.output_prefix = output_prefix
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        return self

    def _in_range(self, h, rng):

        start, end = rng
        if start <= end:
            return (h >= start) & (h <= end)
        else:
            return (h >= start) | (h <= end)

    def transform(self, X):
        X_out = pd.DataFrame(X).copy()

        if self.hour_col not in X_out.columns:
            raise ValueError(f"Column '{self.hour_col}' not found in input.")

        h = X_out[self.hour_col].astype(int)

        X_out[f"{self.output_prefix}is_work_hour"] = self._in_range(h, self.work_hours)
        X_out[f"{self.output_prefix}is_morning"]   = self._in_range(h, self.morning_range)
        X_out[f"{self.output_prefix}is_afternoon"] = self._in_range(h, self.afternoon_range)
        X_out[f"{self.output_prefix}is_night"]     = self._in_range(h, self.night_range)

        return X_out

    def set_output(self, *, transform=None):

        self._output_config["transform"] = transform
        return self

class GridDistanceInteractions(BaseEstimator, TransformerMixin):

    def __init__(
        self,
        dist_col: str = "distance_km",
        grid_dist_col: str = "grid_distance_km",
        bearing_col: str = "bearing",
        prefix: str = "",
    ):
        self.dist_col = dist_col
        self.grid_dist_col = grid_dist_col
        self.bearing_col = bearing_col
        self.prefix = prefix
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        self._fitted_ = True
        return self

    def transform(self, X):
        check_is_fitted(self, "_fitted_")
        X_out = pd.DataFrame(X).copy()

        required = [self.dist_col, self.grid_dist_col, self.bearing_col]
        missing = [c for c in required if c not in X_out.columns]
        if missing:
            raise ValueError(f"Input is missing columns: {missing}")

        d  = X_out[self.dist_col].astype(float)
        gd = X_out[self.grid_dist_col].astype(float)
        b  = X_out[self.bearing_col].astype(float)

        X_out[f"{self.prefix}dist_times_grid_dist"]   = d * gd
        X_out[f"{self.prefix}bearing_times_dist"]     = b * d
        X_out[f"{self.prefix}bearing_times_grid_dist"] = b * gd

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Feature engineering pipeline

In [32]:
fe_pipeline = Pipeline([
    ("duration_est",          DurationEstimator()),
    ("pickup_cluster_dens",   PickupClusterDensity()),
    ("pickup_drop_freq",      PickupDropFreq()),
    ("grid_avg_fare",         GridAvgFareOOF()), #double check leakage
    ("same_cluster_flag",     SameClusterFlag()),
    ("tod_flags",             TimeOfDayFlags()),
    ("dist_comparisons",      DistanceComparisons()),
    ("cluster_interaction",   ColumnPairConcat("pickup_cluster",
                                               "dropoff_cluster",
                                               "cluster_interaction",
                                               as_string=True)),
    ("grid_interaction",      ColumnPairConcat("pickup_grid",
                                               "dropoff_grid",
                                               "grid_interaction",
                                               as_string=True)),
    ("grid_interactions", GridDistanceInteractions(prefix="int_")),
])

# Custom classes
1. To float
2. Clip to valid range
3. Cyclic transforer
4. Target encode wrapper

In [33]:
class ToFloat64(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('float64')
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class ClipToValidRange(BaseEstimator, TransformerMixin):
    def __init__(self, valid_min=1, valid_max=6):
        self.valid_min = valid_min
        self.valid_max = valid_max
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            mask = X[col].between(self.valid_min, self.valid_max)
            X[col] = X[col].where(mask, np.nan)
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CyclicFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

        self.hour_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 24),
            np.cos(2 * np.pi * X / 24)
        ]), validate=False)

        self.bearing_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(np.radians(X)),
            np.cos(np.radians(X))
        ]), validate=False)

        self.weekday_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 7),
            np.cos(2 * np.pi * X / 7)
        ]), validate=False)

        self.month_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 12),
            np.cos(2 * np.pi * X / 12)
        ]), validate=False)

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        result = pd.DataFrame(index=X.index)

        if 'hour' in X.columns:
            hour_transformed = self.hour_transformer.transform(X[['hour']])
            result[['hour_sin', 'hour_cos']] = pd.DataFrame(hour_transformed, index=X.index)

        if 'bearing' in X.columns:
            bearing_transformed = self.bearing_transformer.transform(X[['bearing']])
            result[['bearing_sin', 'bearing_cos']] = pd.DataFrame(bearing_transformed, index=X.index)

        if 'weekday' in X.columns:
            weekday_transformed = self.weekday_transformer.transform(X[['weekday']])
            result[['weekday_sin', 'weekday_cos']] = pd.DataFrame(weekday_transformed, index=X.index)

        if 'month' in X.columns:
            month_transformed = self.month_transformer.transform(X[['month']])
            result[['month_sin', 'month_cos']] = pd.DataFrame(month_transformed, index=X.index)

        for col in ['hour', 'bearing', 'weekday', 'month']:
            if col in X.columns:
                X.drop(columns=[col], inplace=True)

        X = pd.concat([X, result], axis=1)

        return X

    def set_output(self, *, transform=None):
        return self

class InfToNanConverter(BaseEstimator, TransformerMixin):

    def __init__(self, columns=None):
        self.columns = columns
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        if isinstance(self.columns, list) and len(self.columns) and isinstance(X, pd.DataFrame):
            self._col_idx_ = [X.columns.get_loc(c) if isinstance(c, str) else c
                              for c in self.columns]
        return self

    def transform(self, X):
        check_is_fitted(self, [])
        X_out = pd.DataFrame(X).copy()

        if self.columns is None:
            cols = X_out.columns
        else:
            cols = self.columns if not hasattr(self, "_col_idx_") else self._col_idx_

        X_out[cols] = X_out[cols].replace([np.inf, -np.inf], np.nan)

        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CategoryCaster(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "pandas"}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = pd.DataFrame(X).copy().astype("category")
        return X_out

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Passenger feature
1. Coerce to float for uniformed nans.
2. Clip range to 1-6 people.
3. Impute with the mode.

In [34]:
passenger_pipeline = Pipeline([
    ('coerce', ToFloat64()),
    ('clip', ClipToValidRange()),
    ('impute', SimpleImputer(strategy='most_frequent')),
])

# Binary features
1. Coerce to float for numeric binarys, 0 1.

In [35]:
binary_pipeline = Pipeline([
    ('coerce', ToFloat64()),
])

# Cyclic features
1. Convert to sinusoidal compenents.

In [36]:
cyclic_pipeline = Pipeline([
    ('cyclic', CyclicFeatureTransformer()),
])

# Ordinal features
1. Pairs well with tree based models used in this stack.

In [37]:
categorical_pipeline = Pipeline([
    ("cast_cat", CategoryCaster()),
])

# Ratio pipeline

In [38]:
ratio_pipeline = Pipeline([
    ('inf', InfToNanConverter()),
    ('impute', SimpleImputer(strategy='most_frequent')),
])

# Drop low impact features

In [39]:
class DropFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features_to_drop = [

]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        drop_cols = [col for col in self.features_to_drop if col in X.columns]
        return X.drop(columns=drop_cols)

    def set_output(self, *, transform=None):
        return self

# Pipeline

In [40]:
passenger_features = ['passenger_count']
binary_features = ['is_weekend', 'is_rush_hour', 'pickup_missing', 'dropoff_missing', 'pickup_datetime_missing', 'same_cluster', "is_work_hour", "is_morning", "is_afternoon", "is_night",]
cyclic_features = ['hour', 'bearing', 'weekday', 'month']
categorical_features = ['pickup_cluster', 'dropoff_cluster', 'pickup_grid', 'dropoff_grid', 'cluster_interaction', 'grid_interaction']
ratio_features = [
    "distance_over_manhattan", "distance_ratio", "grid_to_manhattan_ratio", "grid_to_direct_ratio",
]

preprocessor = ColumnTransformer([
    ('pass', passenger_pipeline, passenger_features),
    ('cyclic', cyclic_pipeline, cyclic_features),
    ('bin', binary_pipeline, binary_features),
    ('ord', categorical_pipeline, categorical_features),
    ('rat', ratio_pipeline, ratio_features),
], remainder='passthrough')

base_learners = [
    ('lgbm', LGBMRegressor(random_state=42, n_jobs=-1)),
    ('xgb', xgb.XGBRegressor(
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        enable_categorical=True
    ))
]

meta_model = Ridge(random_state=42)

stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_model,
    passthrough=False,
    n_jobs=-1
)

full_pipeline = Pipeline([
    ('datetime_features',  DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
    ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
    ('feature_engineering', fe_pipeline),
    ('preprocessing',       preprocessor),
    ('model',               stacked_model)
])

full_pipeline.set_output(transform='pandas')

_ = full_pipeline

#Metrics

In [41]:
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

full_pipeline.fit(X_subtrain, y_subtrain)
y_val_pred_log = full_pipeline.predict(X_val)

y_val_true = np.expm1(y_val)
y_val_pred = np.expm1(y_val_pred_log)

val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
val_mse = mean_squared_error(y_val_true, y_val_pred)
val_mae = mean_absolute_error(y_val_true, y_val_pred)
val_r2 = r2_score(y_val_true, y_val_pred)

print("Validation Set Metrics (Un-logged, in dollars):")
print(f"RMSE: ${val_rmse:.4f}")
print(f"MSE:  ${val_mse:.4f}")
print(f"MAE:  ${val_mae:.4f}")
print(f"R²:   {val_r2:.4f}")

full_pipeline.fit(X_train, y_train)
y_holdout_pred_log = full_pipeline.predict(X_holdout)

y_holdout_true = np.expm1(y_holdout)
y_holdout_pred = np.expm1(y_holdout_pred_log)

holdout_rmse = np.sqrt(mean_squared_error(y_holdout_true, y_holdout_pred))
holdout_mse = mean_squared_error(y_holdout_true, y_holdout_pred)
holdout_mae = mean_absolute_error(y_holdout_true, y_holdout_pred)
holdout_r2 = r2_score(y_holdout_true, y_holdout_pred)

print("\nHold-Out Set Metrics (Un-logged, in dollars):")
print(f"RMSE: ${holdout_rmse:.4f}")
print(f"MSE:  ${holdout_mse:.4f}")
print(f"MAE:  ${holdout_mae:.4f}")
print(f"R²:   {holdout_r2:.4f}")

ValueError: could not convert string to float: '40.748_-73.977'

#Shap feature analysis

In [None]:
X_sampled = X_holdout.sample(n=1000, random_state=42)

X_sampled_transformed = full_pipeline[:-1].transform(X_sampled)

model = full_pipeline.named_steps['model']

explainer = shap.Explainer(model.predict, X_sampled_transformed)
shap_values = explainer(X_sampled_transformed)

mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': X_sampled_transformed.columns,
    'Mean |SHAP Value|': mean_abs_shap
}).sort_values(by='Mean |SHAP Value|', ascending=False).reset_index(drop=True)

print("\nSHAP Feature Importances (All Features):")
print(shap_importance_df)

PermutationExplainer explainer: 1001it [06:35,  2.46it/s]


SHAP Feature Importances (All Features):
                                Feature  Mean |SHAP Value|
0                     clip__distance_km           0.327966
1                     tar__dropoff_grid           0.063032
2                      tar__pickup_grid           0.047075
3          clip__cluster_bearing_offset           0.033566
4                      cyclic__hour_cos           0.032946
5                   cyclic__bearing_sin           0.025823
6            clip__distance_bearing_sin           0.021834
7                  ord__dropoff_cluster           0.019436
8                   cyclic__bearing_cos           0.018480
9                  clip__distance_ratio           0.016964
10                     cyclic__hour_sin           0.015312
11                  ord__pickup_cluster           0.012587
12                   clip__manhattan_km           0.010888
13                  cyclic__weekday_cos           0.010309
14    clip__distance_times_cluster_diff           0.008271
15            




# Summary
The model is generalising well, not overfitting or underfitting, and is accomplishing it's goal well. The top feature is distance followed manhatten distance which makes sense.