# Stack
1. linear regression
2. random forrest
3. xgboost
4. meta model - ridge

In [36]:
!pip install -q gdown
import gdown
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import shap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate,  cross_val_score, KFold
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
! pip install -q category_encoders
from category_encoders import TargetEncoder
!pip install category_encoders
import category_encoders as ce
! pip install optuna
import optuna
import xgboost as xgb



In [37]:
file_id = "1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="uber.csv", quiet=False)
df = pd.read_csv("uber.csv")
df.head()

Downloading...
From: https://drive.google.com/uc?id=1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1
To: /content/uber.csv
100%|██████████| 23.5M/23.5M [00:00<00:00, 152MB/s]


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


# Set up

In [38]:
df.drop_duplicates(inplace=True)

df['log_fare_amount'] = np.log1p(df['fare_amount'])

df = df.dropna(subset=[
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_datetime',
    'log_fare_amount'
])

X = df.drop(columns=['fare_amount', 'log_fare_amount', 'key', 'Unnamed: 0'])
y = df['log_fare_amount']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Extraction

In [39]:
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_column='pickup_datetime'):
        self.datetime_column = datetime_column
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        X[self.datetime_column] = pd.to_datetime(X[self.datetime_column], errors='coerce')

        X['hour'] = X[self.datetime_column].dt.hour
        X['weekday'] = X[self.datetime_column].dt.weekday
        X['month'] = X[self.datetime_column].dt.month
        X['is_weekend'] = X['weekday'] >= 5
        X['is_rush_hour'] = X['hour'].isin([7, 8, 9, 16, 17, 18, 19])

        X.drop(columns=[self.datetime_column], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CoordinateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=20):
        self.n_clusters = n_clusters
        self._output_config = {"transform": "default"}
        self.kmeans_pickup = None
        self.kmeans_dropoff = None

    def fit(self, X, y=None):
        pickup_coords = X[['pickup_latitude', 'pickup_longitude']]
        dropoff_coords = X[['dropoff_latitude', 'dropoff_longitude']]

        self.kmeans_pickup = KMeans(n_clusters=self.n_clusters, random_state=42).fit(pickup_coords)
        self.kmeans_dropoff = KMeans(n_clusters=self.n_clusters, random_state=42).fit(dropoff_coords)

        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        def haversine(lat1, lon1, lat2, lon2):
            R = 6371
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
            return R * 2 * np.arcsin(np.sqrt(a))

        def manhattan(lat1, lon1, lat2, lon2):
            return (
                haversine(lat1, lon1, lat2, lon1) +
                haversine(lat2, lon1, lat2, lon2)
            )

        def bearing(lat1, lon1, lat2, lon2):
            dlon = np.radians(lon2 - lon1)
            lat1, lat2 = np.radians(lat1), np.radians(lat2)
            x = np.sin(dlon) * np.cos(lat2)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
            return (np.degrees(np.arctan2(x, y)) + 360) % 360

        X['distance_km'] = haversine(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['manhattan_km'] = manhattan(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['bearing'] = bearing(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['pickup_cluster'] = self.kmeans_pickup.predict(X[['pickup_latitude', 'pickup_longitude']])
        X['dropoff_cluster'] = self.kmeans_dropoff.predict(X[['dropoff_latitude', 'dropoff_longitude']])

        X['pickup_grid'] = (
            X['pickup_latitude'].round(3).astype(str) + "_" +
            X['pickup_longitude'].round(3).astype(str)
        )

        X['dropoff_grid'] = (
            X['dropoff_latitude'].round(3).astype(str) + "_" +
            X['dropoff_longitude'].round(3).astype(str)
        )

        X.drop(columns=[
            'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude', 'dropoff_longitude'
        ], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Feature engineering

In [40]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        X['distance_diff'] = X['manhattan_km'] - X['distance_km']
        X['distance_ratio'] = X['manhattan_km'] / X['distance_km']
        X['distance_per_passenger'] = X['distance_km'] / X['passenger_count']
        X['is_peak_weekend'] = X['is_rush_hour'] * X['is_weekend']
        X['total_person_km'] = X['distance_km'] * X['passenger_count']

        return X

    def set_output(self, *, transform=None):
        return self

# Custom classes
1. to float
2. clip range
3. cyclic
4. target encode wrapper
5. inf to nan

In [41]:
class ToFloat64(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('float64')
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class ClipToValidRange(BaseEstimator, TransformerMixin):
    def __init__(self, valid_min=1, valid_max=6):
        self.valid_min = valid_min
        self.valid_max = valid_max
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            mask = X[col].between(self.valid_min, self.valid_max)
            X[col] = X[col].where(mask, np.nan)
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CyclicFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

        self.hour_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 24),
            np.cos(2 * np.pi * X / 24)
        ]), validate=False)

        self.bearing_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(np.radians(X)),
            np.cos(np.radians(X))
        ]), validate=False)

        self.weekday_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 7),
            np.cos(2 * np.pi * X / 7)
        ]), validate=False)

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        result = pd.DataFrame(index=X.index)

        if 'hour' in X.columns:
            hour_transformed = self.hour_transformer.transform(X[['hour']])
            result[['hour_sin', 'hour_cos']] = pd.DataFrame(hour_transformed, index=X.index)

        if 'bearing' in X.columns:
            bearing_transformed = self.bearing_transformer.transform(X[['bearing']])
            result[['bearing_sin', 'bearing_cos']] = pd.DataFrame(bearing_transformed, index=X.index)

        if 'weekday' in X.columns:
            weekday_transformed = self.weekday_transformer.transform(X[['weekday']])
            result[['weekday_sin', 'weekday_cos']] = pd.DataFrame(weekday_transformed, index=X.index)

        if 'hour' in X.columns:
            X.drop(columns=['hour'], inplace=True)
        if 'bearing' in X.columns:
            X.drop(columns=['bearing'], inplace=True)
        if 'weekday' in X.columns:
            X.drop(columns=['weekday'], inplace=True)

        X = pd.concat([X, result], axis=1)

        return X

    def set_output(self, *, transform=None):
        return self

class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):

        self.encoder = ce.TargetEncoder(cols=cols)
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):

        self.encoder.fit(X, y)
        return self

    def transform(self, X):

        X_transformed = self.encoder.transform(X)

        return pd.DataFrame(X_transformed)

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self


class InfToNaNTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X = X.copy()
        X.replace([np.inf, -np.inf], np.nan, inplace=True)
        return X

    def set_output(self, *, transform=None):

        return self

# Passenger feature

In [42]:
passenger_pipeline = Pipeline([
    ('convert_to_float64', ToFloat64()),
    ('clip_invalid', ClipToValidRange()),
    ('impute_missing', SimpleImputer(strategy='most_frequent'))
])

# Binary features

In [43]:
binary_pipeline = Pipeline([
    ('convert_to_float64', ToFloat64()),
    ('impute_missing', SimpleImputer(strategy='most_frequent'))
])

# Distance features

In [44]:
distance_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])

# Cyclic features

In [45]:
cyclic_pipeline = Pipeline([
    ('cyclic_features', CyclicFeatureTransformer()),
    ('scaler', StandardScaler())
])

# One hot features

In [46]:
onehot_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Target encode features

In [47]:
target_pipeline = Pipeline([
    ('target', ce.TargetEncoder())
])

# Ratio features

In [48]:
ratio_pipeline = Pipeline([
    ('inf_to_nan', InfToNaNTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Drop low impact features

In [49]:
class DropLowSHAPFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features_to_drop = [
    "oh__pickup_cluster_5", "oh__dropoff_cluster_4", "oh__pickup_cluster_2", "oh__pickup_cluster_3",
    "oh__pickup_cluster_7", "oh__pickup_cluster_6", "oh__pickup_cluster_4", "oh__pickup_cluster_14",
    "oh__pickup_cluster_15", "oh__pickup_cluster_8", "oh__pickup_cluster_9", "oh__pickup_cluster_10",
    "oh__dropoff_cluster_3", "oh__dropoff_cluster_2", "oh__pickup_cluster_17", "oh__pickup_cluster_19",
    "oh__pickup_cluster_12", "oh__pickup_cluster_13", "oh__dropoff_cluster_6", "oh__dropoff_cluster_5",
    "oh__dropoff_cluster_11", "oh__dropoff_cluster_10", "oh__dropoff_cluster_8", "oh__dropoff_cluster_7",
    "oh__dropoff_cluster_13", "oh__dropoff_cluster_17", "oh__dropoff_cluster_18", "oh__dropoff_cluster_19",
]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        drop_cols = [col for col in self.features_to_drop if col in X.columns]
        return X.drop(columns=drop_cols)

    def set_output(self, *, transform=None):
        return self

# Pipeline

In [50]:
passenger_features = ['passenger_count']
binary_features = ['is_weekend', 'is_rush_hour', 'is_peak_weekend']
distance_features = ['distance_km', 'manhattan_km', 'distance_diff', 'total_person_km']
cyclic_features = ['hour', 'bearing', 'weekday']
target_features = ['month', 'pickup_grid', 'dropoff_grid']
ratio_features = ['distance_ratio', 'distance_per_passenger']
one_hot = ['pickup_cluster', 'dropoff_cluster']

preprocessor = ColumnTransformer([
    ('pass', passenger_pipeline, passenger_features),
    ('cyclic', cyclic_pipeline, cyclic_features),
    ('tar', target_pipeline, target_features),
    ('bin', binary_pipeline, binary_features),
    ('dis', distance_pipeline, distance_features),
    ('rat', ratio_pipeline, ratio_features),
    ('ohe', onehot_pipeline, one_hot),
])

rf_model = RandomForestRegressor(
    n_estimators=182,
    max_depth=41,
    min_samples_split=15,
    min_samples_leaf=13,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

xgb_model = xgb.XGBRegressor(
    n_estimators=149,
    max_depth=3,
    learning_rate=0.16779455063297968,
    min_child_weight=12,
    subsample=0.8434805496582364,
    colsample_bytree=0.795643706710287,
    random_state=42,
    n_jobs=-1,
    tree_method='hist'
)

base_learners = [
    ('lr', LinearRegression()),
    ('rf', rf_model),
    ('xgb', xgb_model)
]

meta_model = Ridge(alpha=0.30288462605316574)

stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1
)

full_pipeline = Pipeline([
    ('datetime_features', DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
    ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor),
    ('drop_low_shap', DropLowSHAPFeatures()),
    ('model', stacked_model)
])

full_pipeline.set_output(transform='pandas')

_ = full_pipeline

#Metrics

In [51]:
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

full_pipeline.fit(X_subtrain, y_subtrain)
y_val_pred_log = full_pipeline.predict(X_val)

y_val_true = np.expm1(y_val)
y_val_pred = np.expm1(y_val_pred_log)

val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
val_mse = mean_squared_error(y_val_true, y_val_pred)
val_mae = mean_absolute_error(y_val_true, y_val_pred)
val_r2 = r2_score(y_val_true, y_val_pred)

print("Validation Set Metrics (Un-logged, in dollars):")
print(f"RMSE: ${val_rmse:.4f}")
print(f"MSE:  ${val_mse:.4f}")
print(f"MAE:  ${val_mae:.4f}")
print(f"R²:   {val_r2:.4f}")

full_pipeline.fit(X_train, y_train)
y_holdout_pred_log = full_pipeline.predict(X_holdout)

y_holdout_true = np.expm1(y_holdout)
y_holdout_pred = np.expm1(y_holdout_pred_log)

holdout_rmse = np.sqrt(mean_squared_error(y_holdout_true, y_holdout_pred))
holdout_mse = mean_squared_error(y_holdout_true, y_holdout_pred)
holdout_mae = mean_absolute_error(y_holdout_true, y_holdout_pred)
holdout_r2 = r2_score(y_holdout_true, y_holdout_pred)

print("\nHold-Out Set Metrics (Un-logged, in dollars):")
print(f"RMSE: ${holdout_rmse:.4f}")
print(f"MSE:  ${holdout_mse:.4f}")
print(f"MAE:  ${holdout_mae:.4f}")
print(f"R²:   {holdout_r2:.4f}")

Validation Set Metrics (Un-logged, in dollars):
RMSE: $4.9521
MSE:  $24.5231
MAE:  $2.1624
R²:   0.7368

Hold-Out Set Metrics (Un-logged, in dollars):
RMSE: $4.9465
MSE:  $24.4677
MAE:  $2.1529
R²:   0.7426


#Shap feature analysis

In [52]:
X_sampled = X_holdout.sample(n=1000, random_state=42)

X_sampled_transformed = full_pipeline[:-1].transform(X_sampled)

model = full_pipeline.named_steps['model']

explainer = shap.Explainer(model.predict, X_sampled_transformed)
shap_values = explainer(X_sampled_transformed)

mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': X_sampled_transformed.columns,
    'Mean |SHAP Value|': mean_abs_shap
}).sort_values(by='Mean |SHAP Value|', ascending=False).reset_index(drop=True)

print("\nSHAP Feature Importances (All Features):")
print(shap_importance_df)

PermutationExplainer explainer: 1001it [12:19,  1.33it/s]


SHAP Feature Importances (All Features):
                        Feature  Mean |SHAP Value|
0              dis__distance_km           0.213521
1         ohe__pickup_cluster_0           0.127894
2        ohe__pickup_cluster_16           0.103152
3             dis__manhattan_km           0.094766
4        ohe__pickup_cluster_18           0.069612
5        ohe__dropoff_cluster_9           0.043579
6        ohe__dropoff_cluster_0           0.042927
7             tar__dropoff_grid           0.040236
8              cyclic__hour_cos           0.034125
9           rat__distance_ratio           0.034084
10          cyclic__bearing_sin           0.029416
11             tar__pickup_grid           0.024656
12         dis__total_person_km           0.021056
13             cyclic__hour_sin           0.016639
14  rat__distance_per_passenger           0.013662
15          cyclic__bearing_cos           0.011680
16           dis__distance_diff           0.011322
17          cyclic__weekday_cos         




# Summary
The model is generalising well, not overfitting or underfitting, and is accomplishing it's goal well. The top feature is distance followed manhatten distance which makes sense.