# Stack
1. linear regression
2. random forrest
3. xgboost
4. meta model - ridge

In [133]:
''' !pip install -q gdown '''
import gdown
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import shap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
''' ! pip install -q category_encoders '''
from category_encoders import TargetEncoder
''' !pip install category_encoders '''
import category_encoders as ce
''' ! pip install optuna '''
import optuna

In [134]:
file_id = "1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="uber.csv", quiet=False)
df = pd.read_csv("uber.csv")
df.head()

Downloading...
From: https://drive.google.com/uc?id=1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1
To: /content/uber.csv
100%|██████████| 23.5M/23.5M [00:00<00:00, 55.4MB/s]


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


# Set up

In [135]:
df.drop_duplicates(inplace=True)

df['log_fare_amount'] = np.log1p(df['fare_amount'])

df = df.dropna(subset=[
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_datetime',
    'log_fare_amount'
])

X = df.drop(columns=['fare_amount', 'log_fare_amount', 'key', 'Unnamed: 0'])
y = df['log_fare_amount']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Extraction

In [136]:
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_column='pickup_datetime'):
        self.datetime_column = datetime_column
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        X[self.datetime_column] = pd.to_datetime(X[self.datetime_column], errors='coerce')

        X['hour'] = X[self.datetime_column].dt.hour
        X['weekday'] = X[self.datetime_column].dt.weekday
        X['month'] = X[self.datetime_column].dt.month
        X['is_weekend'] = X['weekday'] >= 5
        X['is_rush_hour'] = X['hour'].isin([7, 8, 9, 16, 17, 18, 19])

        X.drop(columns=[self.datetime_column], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CoordinateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=20):
        self.n_clusters = n_clusters
        self._output_config = {"transform": "default"}
        self.kmeans_pickup = None
        self.kmeans_dropoff = None

    def fit(self, X, y=None):
        pickup_coords = X[['pickup_latitude', 'pickup_longitude']]
        dropoff_coords = X[['dropoff_latitude', 'dropoff_longitude']]

        self.kmeans_pickup = KMeans(n_clusters=self.n_clusters, random_state=42).fit(pickup_coords)
        self.kmeans_dropoff = KMeans(n_clusters=self.n_clusters, random_state=42).fit(dropoff_coords)

        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        def haversine(lat1, lon1, lat2, lon2):
            R = 6371
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
            return R * 2 * np.arcsin(np.sqrt(a))

        def manhattan(lat1, lon1, lat2, lon2):
            return (
                haversine(lat1, lon1, lat2, lon1) +
                haversine(lat2, lon1, lat2, lon2)
            )

        def bearing(lat1, lon1, lat2, lon2):
            dlon = np.radians(lon2 - lon1)
            lat1, lat2 = np.radians(lat1), np.radians(lat2)
            x = np.sin(dlon) * np.cos(lat2)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
            return (np.degrees(np.arctan2(x, y)) + 360) % 360

        X['distance_km'] = haversine(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['manhattan_km'] = manhattan(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['bearing'] = bearing(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['pickup_cluster'] = self.kmeans_pickup.predict(X[['pickup_latitude', 'pickup_longitude']])
        X['dropoff_cluster'] = self.kmeans_dropoff.predict(X[['dropoff_latitude', 'dropoff_longitude']])

        X['pickup_grid'] = (
            X['pickup_latitude'].round(3).astype(str) + "_" +
            X['pickup_longitude'].round(3).astype(str)
        )

        X['dropoff_grid'] = (
            X['dropoff_latitude'].round(3).astype(str) + "_" +
            X['dropoff_longitude'].round(3).astype(str)
        )

        X.drop(columns=[
            'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude', 'dropoff_longitude'
        ], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Feature engineering

In [137]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['distance_diff'] = X['manhattan_km'] - X['distance_km']
        X['distance_ratio'] = X['manhattan_km'] / X['distance_km']

        X['pickup_lat_long'] = X['pickup_grid'].apply(lambda x: np.array([float(i) for i in x.split('_')]))
        X['dropoff_lat_long'] = X['dropoff_grid'].apply(lambda x: np.array([float(i) for i in x.split('_')]))

        X['grid_distance'] = [
            np.linalg.norm(pickup - dropoff) for pickup, dropoff in zip(X['pickup_lat_long'], X['dropoff_lat_long'])
        ]

        X['grid_distance_to_manhattan'] = X['grid_distance'] - X['manhattan_km']
        X['grid_distance_to_direct'] = X['grid_distance'] - X['distance_km']
        X['grid_to_manhattan_ratio'] = X['grid_distance'] / (X['manhattan_km'])
        X['grid_to_direct_ratio'] = X['grid_distance'] / (X['distance_km'])

        return X

    def set_output(self, *, transform=None):
        return self

# Custom classes
1. to float
2. clip range
3. cyclic
4. target encode wrapper
5. inf to nan

In [138]:
class ToFloat64(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('float64')
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class ClipToValidRange(BaseEstimator, TransformerMixin):
    def __init__(self, valid_min=1, valid_max=6):
        self.valid_min = valid_min
        self.valid_max = valid_max
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            mask = X[col].between(self.valid_min, self.valid_max)
            X[col] = X[col].where(mask, np.nan)
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CyclicFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

        self.hour_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 24),
            np.cos(2 * np.pi * X / 24)
        ]), validate=False)

        self.bearing_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(np.radians(X)),
            np.cos(np.radians(X))
        ]), validate=False)

        self.weekday_transformer = FunctionTransformer(lambda X: np.column_stack([
            np.sin(2 * np.pi * X / 7),
            np.cos(2 * np.pi * X / 7)
        ]), validate=False)

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        result = pd.DataFrame(index=X.index)

        if 'hour' in X.columns:
            hour_transformed = self.hour_transformer.transform(X[['hour']])
            result[['hour_sin', 'hour_cos']] = pd.DataFrame(hour_transformed, index=X.index)

        if 'bearing' in X.columns:
            bearing_transformed = self.bearing_transformer.transform(X[['bearing']])
            result[['bearing_sin', 'bearing_cos']] = pd.DataFrame(bearing_transformed, index=X.index)

        if 'weekday' in X.columns:
            weekday_transformed = self.weekday_transformer.transform(X[['weekday']])
            result[['weekday_sin', 'weekday_cos']] = pd.DataFrame(weekday_transformed, index=X.index)

        if 'hour' in X.columns:
            X.drop(columns=['hour'], inplace=True)
        if 'bearing' in X.columns:
            X.drop(columns=['bearing'], inplace=True)
        if 'weekday' in X.columns:
            X.drop(columns=['weekday'], inplace=True)

        X = pd.concat([X, result], axis=1)

        return X

    def set_output(self, *, transform=None):
        return self

class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):

        self.encoder = ce.TargetEncoder(cols=cols)
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):

        self.encoder.fit(X, y)
        return self

    def transform(self, X):

        X_transformed = self.encoder.transform(X)

        return pd.DataFrame(X_transformed)

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self


class InfToNaNTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X = X.copy()
        X.replace([np.inf, -np.inf], np.nan, inplace=True)
        return X

    def set_output(self, *, transform=None):

        return self

# Passenger feature

In [139]:
passenger_pipeline = Pipeline([
    ('convert_to_float64', ToFloat64()),
    ('clip_invalid', ClipToValidRange()),
    ('impute_missing', SimpleImputer(strategy='most_frequent'))
])

# Binary features

In [140]:
binary_pipeline = Pipeline([
    ('convert_to_float64', ToFloat64()),
    ('impute_missing', SimpleImputer(strategy='most_frequent'))
])

# Distance features

In [141]:
distance_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Cyclic features

In [142]:
cyclic_pipeline = Pipeline([
    ('cyclic_features', CyclicFeatureTransformer()),
    ('scaler', StandardScaler())
])

# Categorical features

In [143]:
onehot_pipeline = Pipeline([
    ('cat_clean', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# One hot

In [144]:
categorical_pipeline = Pipeline([
    ('target_encoder', ce.TargetEncoder())
])

# Ratio features

In [145]:
ratio_pipeline = Pipeline([
    ('inf_to_nan', InfToNaNTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Drop low impact or highly correlated features

In [146]:
class DropLowSHAPFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features_to_drop = [
    "bin_clean__is_rush_hour", "oh__pickup_cluster_5", "oh__dropoff_cluster_4",
    "oh__pickup_cluster_17", "oh__dropoff_cluster_13", "oh__dropoff_cluster_5",
    "oh__dropoff_cluster_11", "oh__pickup_cluster_7", "oh__dropoff_cluster_19",
    "oh__dropoff_cluster_18", "oh__pickup_cluster_8", "oh__pickup_cluster_6",
    "oh__dropoff_cluster_8", "oh__pickup_cluster_9", "oh__pickup_cluster_4",
    "oh__pickup_cluster_2", "oh__pickup_cluster_3", "oh__pickup_cluster_12",
    "oh__pickup_cluster_13", "oh__pickup_cluster_14", "oh__pickup_cluster_15",
    "oh__dropoff_cluster_3", "oh__dropoff_cluster_2", "oh__pickup_cluster_19",
    "oh__pickup_cluster_10", "oh__dropoff_cluster_10", "oh__dropoff_cluster_7",
    "oh__dropoff_cluster_6", "oh__dropoff_cluster_17"
]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        drop_cols = [col for col in self.features_to_drop if col in X.columns]
        return X.drop(columns=drop_cols)

    def set_output(self, *, transform=None):
        return self

# Pipeline

In [147]:

passenger_features = ['passenger_count']

binary_features = ['is_weekend', 'is_rush_hour']

distance_features = ['distance_km', 'manhattan_km', 'distance_diff', 'grid_distance_to_manhattan', 'grid_distance_to_direct']

cyclic_features = ['hour', 'bearing', 'weekday']

categorical_features = ['month', 'pickup_grid', 'dropoff_grid']

ratio_features = ['distance_diff', 'distance_ratio', 'grid_to_manhattan_ratio', 'grid_to_direct_ratio']

one_hot = ['pickup_cluster', 'dropoff_cluster']

preprocessor = ColumnTransformer([
    ('passenger_clean', passenger_pipeline, passenger_features),
    ('cyclic_clean', cyclic_pipeline, cyclic_features),
    ('cat_clean', categorical_pipeline, categorical_features),
    ('bin_clean', binary_pipeline, binary_features),
    ('dis_clean', distance_pipeline, distance_features),
    ('rat_clean', ratio_pipeline, ratio_features),
    ('oh', onehot_pipeline, one_hot),
])

full_pipeline = Pipeline([
    ('datetime_features', DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
    ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])
full_pipeline.set_output(transform='pandas')
_=full_pipeline

In [152]:
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    optuna_pipeline = Pipeline([
        ('datetime_features', DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
        ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
        ('feature_engineering', FeatureEngineer()),
        ('preprocessing', preprocessor),
        ('model', model)
    ])

    optuna_pipeline.fit(X_subtrain, y_subtrain)

    y_pred = optuna_pipeline.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)

    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

best_params = study.best_params
print("Best hyperparameters:", best_params)

[I 2025-07-31 17:14:55,627] A new study created in memory with name: no-name-87f97218-1e30-4323-8a56-cb982852f00b
[I 2025-07-31 17:15:27,650] Trial 0 finished with value: 0.06477979208672635 and parameters: {'n_estimators': 66, 'max_depth': 23, 'min_samples_split': 18, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.06477979208672635.
[I 2025-07-31 17:16:19,803] Trial 1 finished with value: 0.06527707446888133 and parameters: {'n_estimators': 153, 'max_depth': 23, 'min_samples_split': 19, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 0 with value: 0.06477979208672635.
[I 2025-07-31 17:23:35,852] Trial 2 finished with value: 0.06902773351685669 and parameters: {'n_estimators': 129, 'max_depth': 37, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 0 with value: 0.06477979208672635.
[I 2025-07-31 17:24:53,626] Trial 3 finished with value: 0.06426783940979597 and parameters: {'n_estimators': 182, 'max_depth'

Best hyperparameters: {'n_estimators': 182, 'max_depth': 41, 'min_samples_split': 15, 'min_samples_leaf': 13, 'max_features': 'sqrt'}


# rf hyperparam tuning
1. tuned for mse