In [60]:
!pip install -q gdown category_encoders optuna
import warnings
import numpy as np
import pandas as pd
from scipy.stats import skew
import gdown
import shap
import optuna
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from category_encoders import TargetEncoder
import category_encoders as ce
import joblib

In [61]:
# Google Drive id of the dataset; gdown saves it as uber.csv in the working directory.
file_id = "1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1"
dataset_url = f"https://drive.google.com/uc?id={file_id}"
dataset_csv = "uber.csv"

gdown.download(dataset_url, output=dataset_csv, quiet=False)
df = pd.read_csv(dataset_csv)

Downloading...
From: https://drive.google.com/uc?id=1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1
To: /content/uber.csv
100%|██████████| 23.5M/23.5M [00:00<00:00, 187MB/s]


In [62]:
# Rebind instead of mutating in place so the cell does not depend on how many
# times it has already been run against the same object.
df = df.drop_duplicates()

# Rows need all four coordinates, a pickup timestamp and a fare to be usable.
df = df.dropna(subset=[
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_datetime',
    'fare_amount'
])

# np.log1p returns NaN below -1 and -inf at exactly -1; -inf is not removed by
# dropna and would poison the target, so negative fares are filtered out
# explicitly before the transform (this also avoids the RuntimeWarning).
df = df[df['fare_amount'] >= 0].copy()

# The model is trained on log(1 + fare).
df['log_fare_amount'] = np.log1p(df['fare_amount'])

# Features: everything except the raw/log target and the two identifier columns.
X = df.drop(columns=['fare_amount', 'log_fare_amount', 'key', 'Unnamed: 0'])
y = df['log_fare_amount']

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [63]:
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a datetime column with calendar-derived features.

    ``transform`` parses ``datetime_column`` and adds ``hour``, ``weekday``,
    ``month``, ``is_weekend`` (weekday >= 5) and ``is_rush_hour`` (hour in
    7-9 or 16-19), then removes the original datetime column.

    Parameters
    ----------
    datetime_column : str, default='pickup_datetime'
        Name of the column holding the timestamps.
    """

    def __init__(self, datetime_column='pickup_datetime'):
        self.datetime_column = datetime_column
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with calendar features instead of the datetime column."""
        check_is_fitted(self)

        # Unparseable values become NaT, so their numeric features are NaN and
        # both boolean flags evaluate to False.
        # NOTE(review): hour/weekday are taken in whatever timezone the parsed
        # values carry; confirm that matches the intended local time.
        stamps = pd.to_datetime(X[self.datetime_column], errors='coerce')
        hour = stamps.dt.hour
        weekday = stamps.dt.weekday

        return X.drop(columns=[self.datetime_column]).assign(
            hour=hour,
            weekday=weekday,
            month=stamps.dt.month,
            is_weekend=weekday >= 5,
            is_rush_hour=hour.isin([7, 8, 9, 16, 17, 18, 19]),
        )

    def set_output(self, *, transform=None):
        """Store the requested output container; ``transform`` itself does not read it."""
        self._output_config["transform"] = transform
        return self

class CoordinateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn pickup/dropoff coordinates into distance, bearing, cluster and grid features.

    ``fit`` trains one KMeans model on the pickup points and one on the dropoff
    points. ``transform`` adds ``distance_km``, ``manhattan_km``, ``bearing``,
    ``pickup_cluster``, ``dropoff_cluster``, ``pickup_grid`` and
    ``dropoff_grid`` and removes the four raw coordinate columns.

    Parameters
    ----------
    n_clusters : int, default=20
        Number of clusters for each of the two KMeans models.
    """

    def __init__(self, n_clusters=20):
        self.n_clusters = n_clusters
        self._output_config = {"transform": "default"}
        self.kmeans_pickup = None
        self.kmeans_dropoff = None

    @staticmethod
    def _haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in kilometres between points given in degrees."""
        R = 6371  # Earth radius in km
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # Rounding error (or out-of-range coordinates) can push `a` outside
        # [0, 1], where sqrt/arcsin return NaN; clamp so the distance stays finite.
        a = np.clip(a, 0.0, 1.0)
        return R * 2 * np.arcsin(np.sqrt(a))

    @staticmethod
    def _manhattan(lat1, lon1, lat2, lon2):
        """Sum of the north-south leg and the east-west leg, each measured by haversine."""
        haversine = CoordinateFeatureExtractor._haversine
        return haversine(lat1, lon1, lat2, lon1) + haversine(lat2, lon1, lat2, lon2)

    @staticmethod
    def _bearing(lat1, lon1, lat2, lon2):
        """Initial bearing from point 1 to point 2, in degrees within [0, 360)."""
        dlon = np.radians(lon2 - lon1)
        lat1, lat2 = np.radians(lat1), np.radians(lat2)
        x = np.sin(dlon) * np.cos(lat2)
        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
        return (np.degrees(np.arctan2(x, y)) + 360) % 360

    def fit(self, X, y=None):
        """Fit the pickup and dropoff KMeans models and record the input columns."""
        pickup_coords = X[['pickup_latitude', 'pickup_longitude']]
        dropoff_coords = X[['dropoff_latitude', 'dropoff_longitude']]

        self.kmeans_pickup = KMeans(n_clusters=self.n_clusters, random_state=42).fit(pickup_coords)
        self.kmeans_dropoff = KMeans(n_clusters=self.n_clusters, random_state=42).fit(dropoff_coords)

        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features and without raw coordinates."""
        check_is_fitted(self)
        X = X.copy()

        pickup_lat, pickup_lon = X['pickup_latitude'], X['pickup_longitude']
        dropoff_lat, dropoff_lon = X['dropoff_latitude'], X['dropoff_longitude']

        X['distance_km'] = self._haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
        X['manhattan_km'] = self._manhattan(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
        X['bearing'] = self._bearing(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

        # Cluster ids come from the KMeans models fitted in `fit`.
        X['pickup_cluster'] = self.kmeans_pickup.predict(X[['pickup_latitude', 'pickup_longitude']])
        X['dropoff_cluster'] = self.kmeans_dropoff.predict(X[['dropoff_latitude', 'dropoff_longitude']])

        # Grid cell label: latitude and longitude rounded to 3 decimals, joined by "_".
        X['pickup_grid'] = (
            pickup_lat.round(3).astype(str) + "_" +
            pickup_lon.round(3).astype(str)
        )
        X['dropoff_grid'] = (
            dropoff_lat.round(3).astype(str) + "_" +
            dropoff_lon.round(3).astype(str)
        )

        return X.drop(columns=[
            'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude', 'dropoff_longitude'
        ])

    def set_output(self, *, transform=None):
        """Store the requested output container; ``transform`` itself does not read it."""
        self._output_config["transform"] = transform
        return self

In [64]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add interaction features built from distance, passenger and time columns.

    Expects ``manhattan_km``, ``distance_km``, ``passenger_count``,
    ``is_rush_hour`` and ``is_weekend`` to be present in the input frame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless: nothing is learned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with five derived columns appended.

        The two divisions are plain element-wise divisions, so a zero
        denominator yields inf/NaN; no handling is done here.
        """
        straight_km = X['distance_km']
        grid_km = X['manhattan_km']
        riders = X['passenger_count']

        return X.assign(
            distance_diff=grid_km - straight_km,
            distance_ratio=grid_km / straight_km,
            distance_per_passenger=straight_km / riders,
            is_peak_weekend=X['is_rush_hour'] * X['is_weekend'],
            total_person_km=straight_km * riders,
        )

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request."""
        return self

In [65]:
class ToFloat64(BaseEstimator, TransformerMixin):
    """Cast every column of a DataFrame to float64, coercing bad values to NaN."""

    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a float64 copy of ``X``; values that cannot be parsed become NaN."""
        check_is_fitted(self)
        converted = X.copy()
        for name in converted.columns:
            numeric = pd.to_numeric(converted[name], errors='coerce')
            converted[name] = numeric.astype('float64')
        return converted

    def set_output(self, *, transform=None):
        """Store the requested output container; ``transform`` itself does not read it."""
        self._output_config["transform"] = transform
        return self

class ClipToValidRange(BaseEstimator, TransformerMixin):
    """Blank out values that fall outside ``[valid_min, valid_max]``.

    Despite the name, values are not clamped to the bounds: anything outside
    the inclusive range is replaced with NaN.

    Parameters
    ----------
    valid_min : number, default=1
        Smallest value that is kept.
    valid_max : number, default=6
        Largest value that is kept.
    """

    def __init__(self, valid_min=1, valid_max=6):
        self.valid_min = valid_min
        self.valid_max = valid_max
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` where out-of-range entries are NaN."""
        check_is_fitted(self)
        cleaned = X.copy()
        for name in cleaned.columns:
            column = cleaned[name]
            out_of_range = ~column.between(self.valid_min, self.valid_max)
            cleaned[name] = column.mask(out_of_range, np.nan)
        return cleaned

    def set_output(self, *, transform=None):
        """Store the requested output container; ``transform`` itself does not read it."""
        self._output_config["transform"] = transform
        return self

def transform_hour(X):
    """Encode hour-of-day values on a 24-step circle.

    Returns a 2-D array whose columns are the sine followed by the cosine of
    ``2 * pi * X / 24``.
    """
    angle = 2 * np.pi * X / 24
    return np.column_stack((np.sin(angle), np.cos(angle)))

def transform_bearing(X):
    """Encode bearings given in degrees as sine/cosine pairs.

    Returns a 2-D array whose columns are the sine followed by the cosine of
    ``X`` converted to radians.
    """
    theta = np.radians(X)
    return np.column_stack((np.sin(theta), np.cos(theta)))

def transform_weekday(X):
    """Encode day-of-week values on a 7-step circle.

    Returns a 2-D array whose columns are the sine followed by the cosine of
    ``2 * pi * X / 7``.
    """
    angle = 2 * np.pi * X / 7
    return np.column_stack((np.sin(angle), np.cos(angle)))

class CyclicFeatureTransformer(BaseEstimator, TransformerMixin):
    """Replace ``hour``, ``bearing`` and ``weekday`` with sine/cosine pairs.

    Each column that is present is removed and replaced by ``<name>_sin`` and
    ``<name>_cos``; columns that are absent are skipped.
    """

    def __init__(self):
        self._output_config = {"transform": "default"}

        self.hour_transformer = FunctionTransformer(transform_hour, validate=False)
        self.bearing_transformer = FunctionTransformer(transform_bearing, validate=False)
        self.weekday_transformer = FunctionTransformer(transform_weekday, validate=False)

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return ``X`` with the cyclic columns swapped for their sin/cos encodings."""
        check_is_fitted(self)

        # (source column, encoder, output column names), processed in this order
        # so the encoded columns are appended as hour, bearing, weekday.
        plan = (
            ('hour', self.hour_transformer, ['hour_sin', 'hour_cos']),
            ('bearing', self.bearing_transformer, ['bearing_sin', 'bearing_cos']),
            ('weekday', self.weekday_transformer, ['weekday_sin', 'weekday_cos']),
        )

        encoded = pd.DataFrame(index=X.index)
        consumed = []
        for source, encoder, targets in plan:
            if source not in X.columns:
                continue
            sin_cos = encoder.transform(X[[source]])
            encoded[targets] = pd.DataFrame(sin_cos, index=X.index)
            consumed.append(source)

        remaining = X.drop(columns=consumed)
        return pd.concat([remaining, encoded], axis=1)

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request."""
        return self

class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    """Thin scikit-learn wrapper around ``category_encoders.TargetEncoder``.

    Parameters
    ----------
    cols : list of str or None, default=None
        Columns handed to ``ce.TargetEncoder(cols=...)``.

    Attributes
    ----------
    encoder : ce.TargetEncoder
        The wrapped encoder; rebuilt from ``cols`` on every ``fit``.
    """

    def __init__(self, cols=None):
        # `cols` must be stored under its own name: BaseEstimator.get_params
        # (used by clone, repr and set_params) reads every __init__ argument
        # back via getattr and raises AttributeError if it is missing.
        self.cols = cols
        self.encoder = ce.TargetEncoder(cols=cols)
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        """Fit a fresh target encoder on ``X`` and ``y``."""
        # Rebuild from the current parameter so a `set_params(cols=...)` issued
        # after construction is honored.
        self.encoder = ce.TargetEncoder(cols=self.cols)
        self.encoder.fit(X, y)
        return self

    def transform(self, X):
        """Return the encoded data as a DataFrame."""
        X_transformed = self.encoder.transform(X)

        return pd.DataFrame(X_transformed)

    def set_output(self, *, transform=None):
        """Store the requested output container; ``transform`` itself does not read it."""
        self._output_config["transform"] = transform
        return self


class InfToNaNTransformer(BaseEstimator, TransformerMixin):
    """Replace positive and negative infinity with NaN."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless: nothing is learned."""
        return self

    def transform(self, X):
        """Return a new object equal to ``X`` with +/-inf turned into NaN."""
        infinities = [np.inf, -np.inf]
        return X.replace(infinities, np.nan)

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request."""
        return self

In [66]:
# Passenger count: cast to float, turn values outside ClipToValidRange's default
# 1-6 window into NaN, then fill the gaps with the most frequent value.
passenger_pipeline = Pipeline(steps=[
    ('convert_to_float64', ToFloat64()),
    ('clip_invalid', ClipToValidRange()),
    ('impute_missing', SimpleImputer(strategy='most_frequent')),
])

In [67]:
# Flag columns: cast to float, then fill any missing entries with the most
# frequent value.
binary_pipeline = Pipeline(steps=[
    ('convert_to_float64', ToFloat64()),
    ('impute_missing', SimpleImputer(strategy='most_frequent')),
])


In [68]:
# Distance columns are only standardized; this branch does no imputation.
distance_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

In [69]:
# Cyclic columns: sine/cosine encoding followed by standardization.
cyclic_pipeline = Pipeline(steps=[
    ('cyclic_features', CyclicFeatureTransformer()),
    ('scaler', StandardScaler()),
])

In [70]:
# Cluster ids: dense one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising.
onehot_pipeline = Pipeline(steps=[
    ('cat_clean', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

In [71]:
# Target (mean) encoding of the columns routed to this branch.
# NOTE(review): with the default cols=None, category_encoders documents that
# only string columns are encoded, so a numeric `month` column presumably
# passes through unencoded. Confirm whether that is intended.
target_pipeline = Pipeline(steps=[
    ('target_encoder', ce.TargetEncoder()),
])

In [72]:
# Ratio columns: infinities become NaN, NaNs are filled with the most frequent
# value, and the result is standardized.
ratio_pipeline = Pipeline(steps=[
    ('inf_to_nan', InfToNaNTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
])

In [73]:
class DropLowSHAPFeatures(BaseEstimator, TransformerMixin):
    """Drop the one-hot cluster indicator columns listed in ``features_to_drop``.

    The names must match the column names produced by the preceding
    ColumnTransformer, which prefixes every output column with
    ``"<transformer name>__"``. The one-hot branch of this notebook's
    preprocessor is registered as ``'ohe'``, hence the ``ohe__`` prefix.
    """

    def __init__(self):
        self.features_to_drop = [
            "ohe__pickup_cluster_5", "ohe__dropoff_cluster_4", "ohe__pickup_cluster_2", "ohe__pickup_cluster_3",
            "ohe__pickup_cluster_7", "ohe__pickup_cluster_6", "ohe__pickup_cluster_4", "ohe__pickup_cluster_14",
            "ohe__pickup_cluster_15", "ohe__pickup_cluster_8", "ohe__pickup_cluster_9", "ohe__pickup_cluster_10",
            "ohe__dropoff_cluster_3", "ohe__dropoff_cluster_2", "ohe__pickup_cluster_17", "ohe__pickup_cluster_19",
            "ohe__pickup_cluster_12", "ohe__pickup_cluster_13", "ohe__dropoff_cluster_6", "ohe__dropoff_cluster_5",
            "ohe__dropoff_cluster_11", "ohe__dropoff_cluster_10", "ohe__dropoff_cluster_8", "ohe__dropoff_cluster_7",
            "ohe__dropoff_cluster_13", "ohe__dropoff_cluster_17", "ohe__dropoff_cluster_18", "ohe__dropoff_cluster_19",
        ]

    def fit(self, X, y=None):
        """Stateless: nothing is learned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the listed columns that are present.

        Listed columns that are missing are skipped. If none of them is
        present, a ``UserWarning`` is emitted, because that almost always means
        the names no longer match the upstream feature names and the step has
        become a no-op.
        """
        present = [name for name in self.features_to_drop if name in X.columns]
        if self.features_to_drop and not present:
            warnings.warn(
                "DropLowSHAPFeatures: none of the configured columns are present "
                "in the input; nothing was dropped.",
                UserWarning,
                stacklevel=2,
            )
        return X.drop(columns=present)

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container request."""
        return self

In [74]:
# Column groups, one per preprocessing branch.
passenger_features = ['passenger_count']
binary_features = ['is_weekend', 'is_rush_hour', 'is_peak_weekend']
distance_features = ['distance_km', 'manhattan_km', 'distance_diff', 'total_person_km']
cyclic_features = ['hour', 'bearing', 'weekday']
target_features = ['month', 'pickup_grid', 'dropoff_grid']
ratio_features = ['distance_ratio', 'distance_per_passenger']
one_hot = ['pickup_cluster', 'dropoff_cluster']

# Each entry is (branch name, pipeline, columns). The branch names become the
# "<name>__" prefixes of the output columns, so later steps that refer to
# columns by name depend on them.
preprocessor = ColumnTransformer(transformers=[
    ('pass', passenger_pipeline, passenger_features),
    ('cyclic', cyclic_pipeline, cyclic_features),
    ('tar', target_pipeline, target_features),
    ('bin', binary_pipeline, binary_features),
    ('dis', distance_pipeline, distance_features),
    ('rat', ratio_pipeline, ratio_features),
    ('ohe', onehot_pipeline, one_hot),
])


# Base learner 1: random forest with fixed hyperparameters.
rf_model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=182,
    max_depth=41,
    max_features='sqrt',
    min_samples_split=15,
    min_samples_leaf=13,
)

# Base learner 2: histogram-based XGBoost with fixed hyperparameters.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    n_estimators=149,
    max_depth=3,
    learning_rate=0.16779455063297968,
    min_child_weight=12,
    subsample=0.8434805496582364,
    colsample_bytree=0.795643706710287,
)

# Linear regression, the forest and XGBoost feed the stacking layer.
base_learners = [
    ('lr', LinearRegression()),
    ('rf', rf_model),
    ('xgb', xgb_model),
]

# Ridge combines the base predictions; passthrough=True also hands it the
# original features.
meta_model = Ridge(alpha=0.30288462605316574)

stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1,
)

# End-to-end pipeline: raw columns -> engineered features -> preprocessing ->
# feature pruning -> stacked regressor. set_output returns the pipeline itself,
# so it can be chained onto the constructor.
full_pipeline = Pipeline(steps=[
    ('datetime_features', DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
    ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor),
    ('drop_low_shap', DropLowSHAPFeatures()),
    ('model', stacked_model),
]).set_output(transform='pandas')

# Ending the cell on an assignment keeps the pipeline repr out of the output.
_ = full_pipeline

In [75]:
# Fit on all of X / y (fit returns the pipeline itself) and persist the result.
# The pickle refers to the custom transformer classes defined in this notebook
# by name, so whatever loads it needs the same definitions available.
joblib.dump(full_pipeline.fit(X, y), "model_pipeline.pkl")

['model_pipeline.pkl']