# Linear regression baseline for simple Interpretability

In [67]:
''' !pip install -q gdown '''
import gdown
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import shap
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [68]:
file_id = "1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="uber.csv", quiet=False)
df = pd.read_csv("uber.csv")
df.head()

Downloading...
From: https://drive.google.com/uc?id=1nvFRd8uiUV8OfoihMOXZ0Emle1ksxwW1
To: /content/uber.csv
100%|██████████| 23.5M/23.5M [00:00<00:00, 168MB/s]


Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [69]:
df.drop_duplicates(inplace=True)

df['log_fare_amount'] = np.log1p(df['fare_amount'])

df = df.dropna(subset=[
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_datetime',
    'log_fare_amount'
])

X = df.drop(columns=['fare_amount', 'log_fare_amount', 'key', 'Unnamed: 0'])
y = df['log_fare_amount']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Extraction
1. datetime
2. coordinate

In [70]:
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_column='pickup_datetime'):
        self.datetime_column = datetime_column
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        X[self.datetime_column] = pd.to_datetime(X[self.datetime_column], errors='coerce')

        X['hour'] = X[self.datetime_column].dt.hour
        X['weekday'] = X[self.datetime_column].dt.weekday
        X['month'] = X[self.datetime_column].dt.month
        X['is_weekend'] = X['weekday'] >= 5
        X['is_rush_hour'] = X['hour'].isin([7, 8, 9, 16, 17, 18, 19])

        X.drop(columns=[self.datetime_column], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CoordinateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=20):
        self.n_clusters = n_clusters
        self._output_config = {"transform": "default"}
        self.kmeans_pickup = None
        self.kmeans_dropoff = None

    def fit(self, X, y=None):
        pickup_coords = X[['pickup_latitude', 'pickup_longitude']]
        dropoff_coords = X[['dropoff_latitude', 'dropoff_longitude']]

        self.kmeans_pickup = KMeans(n_clusters=self.n_clusters, random_state=42).fit(pickup_coords)
        self.kmeans_dropoff = KMeans(n_clusters=self.n_clusters, random_state=42).fit(dropoff_coords)

        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()

        def haversine(lat1, lon1, lat2, lon2):
            R = 6371
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat = lat2 - lat1
            dlon = lon2 - lon1
            a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
            return R * 2 * np.arcsin(np.sqrt(a))

        def manhattan(lat1, lon1, lat2, lon2):
            return (
                haversine(lat1, lon1, lat2, lon1) +
                haversine(lat2, lon1, lat2, lon2)
            )

        def bearing(lat1, lon1, lat2, lon2):
            dlon = np.radians(lon2 - lon1)
            lat1, lat2 = np.radians(lat1), np.radians(lat2)
            x = np.sin(dlon) * np.cos(lat2)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
            return (np.degrees(np.arctan2(x, y)) + 360) % 360

        X['distance_km'] = haversine(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['manhattan_km'] = manhattan(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['bearing'] = bearing(
            X['pickup_latitude'], X['pickup_longitude'],
            X['dropoff_latitude'], X['dropoff_longitude']
        )

        X['pickup_cluster'] = self.kmeans_pickup.predict(X[['pickup_latitude', 'pickup_longitude']])
        X['dropoff_cluster'] = self.kmeans_dropoff.predict(X[['dropoff_latitude', 'dropoff_longitude']])

        X['pickup_grid'] = (
            X['pickup_latitude'].round(3).astype(str) + "_" +
            X['pickup_longitude'].round(3).astype(str)
        )

        X['dropoff_grid'] = (
            X['dropoff_latitude'].round(3).astype(str) + "_" +
            X['dropoff_longitude'].round(3).astype(str)
        )

        X.drop(columns=[
            'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude', 'dropoff_longitude'
        ], inplace=True)

        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Custom classes
1. To float
2. Clip range

In [71]:
class ToFloat64(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('float64')
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class ClipToValidRange(BaseEstimator, TransformerMixin):
    def __init__(self, valid_min=1, valid_max=6):
        self.valid_min = valid_min
        self.valid_max = valid_max
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            mask = X[col].between(self.valid_min, self.valid_max)
            X[col] = X[col].where(mask, np.nan)
        return X

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

class CyclicFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._output_config = {"transform": "default"}

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = X.copy()
        result = pd.DataFrame(index=X.index)

        for col in X.columns:
            if col == 'hour':
                result['hour_sin'] = np.sin(2 * np.pi * X[col] / 24)
                result['hour_cos'] = np.cos(2 * np.pi * X[col] / 24)
            elif col == 'bearing':
                radians = np.radians(X[col])
                result['bearing_sin'] = np.sin(radians)
                result['bearing_cos'] = np.cos(radians)

        return result

    def set_output(self, *, transform=None):
        self._output_config["transform"] = transform
        return self

# Clean passenger count

In [72]:
passenger_pipeline = Pipeline([
    ('convert_to_float64', ToFloat64()),
    ('clip_invalid', ClipToValidRange()),
    ('impute_missing', SimpleImputer(strategy='most_frequent'))
])

# Distance features

In [73]:
distance_pipeline = Pipeline([
    ('clipper', FunctionTransformer(lambda X: np.clip(X, 0, 50), feature_names_out='one-to-one')),
    ('scaler', StandardScaler())
])

#Cyclic features

In [74]:
cyclic_pipeline = Pipeline([
    ('cyclic_features', CyclicFeatureTransformer())
])

# Categorical feature

In [75]:
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Pipeline

In [76]:
distance_features = ['distance_km', 'manhattan_km']

cyclic_features = ['hour', 'bearing']

categorical_features = ['pickup_cluster', 'dropoff_cluster']

preprocessor = ColumnTransformer([
    ('passenger_clean', passenger_pipeline, ['passenger_count']),
    ('distance_clean', distance_pipeline, distance_features),
    ('cyclic_clean', cyclic_pipeline, cyclic_features),
    ('cat_clean', categorical_pipeline, categorical_features),
])
full_pipeline = Pipeline([
    ('datetime_features', DatetimeFeatureExtractor(datetime_column='pickup_datetime')),
    ('coordinate_features', CoordinateFeatureExtractor(n_clusters=20)),
    ('preprocessing', preprocessor),
     ('model', LinearRegression())
])
full_pipeline.set_output(transform='pandas')
_=full_pipeline

# Metircs

In [79]:
cv_results = cross_validate(
    full_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring={
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2'
    },
    return_train_score=False
)

print("Cross-Validation Metrics (5-Fold):")
print(f"RMSE: {-np.mean(cv_results['test_rmse']):.4f} ± {np.std(cv_results['test_rmse']):.4f}")
print(f"MAE:  {-np.mean(cv_results['test_mae']):.4f} ± {np.std(cv_results['test_mae']):.4f}")
print(f"R²:   {np.mean(cv_results['test_r2']):.4f} ± {np.std(cv_results['test_r2']):.4f}")

full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_holdout)

mse = mean_squared_error(y_holdout, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_holdout, y_pred)
r2 = r2_score(y_holdout, y_pred)

print("\nHold-Out Evaluation Metrics:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")

Cross-Validation Metrics (5-Fold):
RMSE: 0.3561 ± 0.0054
MAE:  0.2576 ± 0.0020
R²:   0.5786 ± 0.0122

Hold-Out Evaluation Metrics:
RMSE: 0.3556
MAE:  0.2566
R²:   0.5760


# Featrue improtance

In [80]:
full_pipeline.fit(X_train, y_train)

X_holdout_transformed = full_pipeline.named_steps['preprocessing'].transform(
    full_pipeline.named_steps['coordinate_features'].transform(
        full_pipeline.named_steps['datetime_features'].transform(X_holdout)
    )
)

model = full_pipeline.named_steps['model']

explainer = shap.Explainer(model.predict, X_holdout_transformed)
shap_values = explainer(X_holdout_transformed)

mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': X_holdout_transformed.columns,
    'Mean |SHAP Value|': mean_abs_shap
}).sort_values(by='Mean |SHAP Value|', ascending=False).reset_index(drop=True)

print("SHAP Feature Importances (from hold-out set):")
print(shap_importance_df)

PermutationExplainer explainer: 39998it [22:28, 29.49it/s]                           


SHAP Feature Importances (from hold-out set):
                             Feature  Mean |SHAP Value|
0        cat_clean__pickup_cluster_0           0.653760
1       cat_clean__pickup_cluster_16           0.648806
2       distance_clean__manhattan_km           0.603797
3       cat_clean__pickup_cluster_18           0.412646
4        distance_clean__distance_km           0.330749
5       cat_clean__dropoff_cluster_0           0.210309
6       cat_clean__dropoff_cluster_9           0.191740
7      cat_clean__dropoff_cluster_14           0.086844
8       cat_clean__pickup_cluster_11           0.053902
9      cat_clean__dropoff_cluster_16           0.052742
10     cat_clean__dropoff_cluster_15           0.050206
11       cat_clean__pickup_cluster_1           0.049541
12      cat_clean__dropoff_cluster_1           0.026848
13            cyclic_clean__hour_cos           0.018578
14     cat_clean__dropoff_cluster_12           0.018002
15         cyclic_clean__bearing_sin           0.015070
16

# Summary
1. For a baseline linear regression these metrics are solid, no sign of overfitting, no sign of data-leakage, and reasonable generalisation.