In [46]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Read data from the combined parquet files.

In [47]:
df = pd.read_parquet("tripdata_combined.parquet").sample(frac=0.10, random_state=42) # Sample 10% of points to save on storage
print(df.head().to_markdown())

|         |   VendorID | tpep_pickup_datetime   | tpep_dropoff_datetime   |   passenger_count |   trip_distance |   RatecodeID | store_and_fwd_flag   |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |   congestion_surcharge |   airport_fee |   Airport_fee |
|--------:|-----------:|:-----------------------|:------------------------|------------------:|----------------:|-------------:|:---------------------|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|-----------------------:|--------------:|--------------:|
| 3146451 |          2 | 2023-10-29 17:34:16    | 2023-10-29 17:45:31     |                 1 |            0.88 |            1 | N                    |            164 |            137 |              1 |          11.4 |     0   |       0.5 |         3.08 

In [48]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
count,2300000.0,2300000,2300000,2151064.0,2300000.0,2151064.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2300000.0,2151064.0,97773.0,2053291.0
mean,1.750393,2023-12-16 09:20:43.986089,2023-12-16 09:38:07.116801,1.353481,4.23947,1.961679,164.6753,163.7547,1.148153,19.33486,1.476817,0.4830315,3.4087,0.5764747,0.9719633,28.08585,2.249244,0.106727,0.145823
min,1.0,2001-01-01 00:28:40,2001-01-01 01:11:09,0.0,0.0,1.0,1.0,1.0,0.0,-999.0,-7.5,-0.5,-80.0,-71.0,-1.0,-1000.0,-2.5,-1.25,-1.75
25%,1.0,2023-06-22 21:54:16.750000,2023-06-22 22:08:50,1.0,1.03,1.0,132.0,113.0,1.0,9.3,0.0,0.5,0.0,0.0,1.0,15.8,2.5,0.0,0.0
50%,2.0,2023-12-14 18:52:07.500000,2023-12-14 19:12:13.500000,1.0,1.78,1.0,161.0,162.0,1.0,13.5,1.0,0.5,2.72,0.0,1.0,21.0,2.5,0.0,0.0
75%,2.0,2024-06-07 23:33:09,2024-06-07 23:50:19.750000,1.0,3.4,1.0,233.0,234.0,1.0,21.9,2.5,0.5,4.3,0.0,1.0,30.6,2.5,0.0,0.0
max,6.0,2024-11-30 23:59:25,2024-12-01 13:14:59,9.0,129871.8,99.0,265.0,265.0,4.0,2997.76,14.25,10.05,480.5,270.0,1.0,3012.45,2.5,1.25,1.75
std,0.4359384,,,0.8600717,234.4735,9.249702,64.18539,69.71409,0.6028825,19.12963,1.83168,0.1186797,4.101854,2.215643,0.226408,23.84616,0.8332922,0.35458,0.4905044


In [49]:
for col in df:
    print(col, ":", df[col].isna().sum(), "null values")

VendorID : 0 null values
tpep_pickup_datetime : 0 null values
tpep_dropoff_datetime : 0 null values
passenger_count : 148936 null values
trip_distance : 0 null values
RatecodeID : 148936 null values
store_and_fwd_flag : 148936 null values
PULocationID : 0 null values
DOLocationID : 0 null values
payment_type : 0 null values
fare_amount : 0 null values
extra : 0 null values
mta_tax : 0 null values
tip_amount : 0 null values
tolls_amount : 0 null values
improvement_surcharge : 0 null values
total_amount : 0 null values
congestion_surcharge : 148936 null values
airport_fee : 2202227 null values
Airport_fee : 246709 null values


In [50]:
# Remove all cols with null values 
df = df.dropna(axis=1)

In [51]:
# Convert all dates to float types, so that they can be scaled.
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].values.astype(np.float64)
df['tpep_dropoff_datetime'] = df['tpep_dropoff_datetime'].values.astype(np.float64)

In [52]:
print(df.head().to_markdown())

|         |   VendorID |   tpep_pickup_datetime |   tpep_dropoff_datetime |   trip_distance |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |
|--------:|-----------:|-----------------------:|------------------------:|----------------:|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|
| 3146451 |          2 |            1.6986e+15  |             1.6986e+15  |            0.88 |            164 |            137 |              1 |          11.4 |     0   |       0.5 |         3.08 |              0 |                       1 |          18.48 |
| 2682438 |          2 |            1.68494e+15 |             1.68494e+15 |            1.3  |            161 |            237 |              4 |          12.8 |     0   |       0.5 |         0    |              0 |            

In [53]:
print(df.shape)

(2300000, 14)


### Test Regression Models

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [55]:
# Create, scale, and split data

target = 'tip_amount'
X = df.drop(columns=[target])
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split data for 80% training,, 20% test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [56]:
# Define all models and their parameter grids
models = {
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'KMeansRegression': {
        'model': KMeans(),
        'param_grid': {
            'model__n_clusters': [5, 10, 15],
            'model__init': ['k-means++', 'random'],
            'model__n_init': [5, 10]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'model__fit_intercept': [True, False],
            'model__n_jobs': [-1]
        }
    },
    'XGBoostRegressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 6],
            'model__learning_rate': [0.01, 0.1],
            'model__subsample': [0.8, 1.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'param_grid': {
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.01, 0.1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'param_grid' : {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None],
            'model__min_samples_split': [5, 10],
            'model__max_features': ['sqrt'],
            'model__n_jobs': [-1] 
        }
    }
}

In [57]:
# Dictionary to store best parameters
best_params = {}
performances = {}

for name, config in models.items():
    print(f"\n=== Tuning {name} ===")
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', config['model'])
    ])
    
    # Grid search with reduced CV folds for speed
    search = GridSearchCV(
        pipeline,
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )
    
    # Fit on a subset of data for demonstration
    search.fit(X_train[:5000], y_train[:5000])  # Use full data in production
    
    # Store best parameters
    best_params[name] = search.best_params_
    
    test_pred = search.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_evs = explained_variance_score(y_test, test_pred)

    performances[name] = {
        'RMSE': test_rmse,
        'R²': test_r2,
        'Mean Absolute Error': test_mae,
        'Explained Variance Score': test_evs
    }
    
    print(f"Best parameters for {name}: {search.best_params_}")


=== Tuning KNeighborsRegressor ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for KNeighborsRegressor: {'model__algorithm': 'auto', 'model__n_neighbors': 7, 'model__weights': 'distance'}

=== Tuning KMeansRegression ===
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for KMeansRegression: {'model__init': 'k-means++', 'model__n_clusters': 5, 'model__n_init': 5}

=== Tuning LinearRegression ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'model__fit_intercept': True, 'model__n_jobs': -1}

=== Tuning XGBoostRegressor ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for XGBoostRegressor: {'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 200, 'model__subsample': 1.0}

=== Tuning SVR ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for SVR: {'model__C': 1, 'model__epsilon': 0.1, 'model_

In [58]:
# Print all best parameters
for name, params in best_params.items():
    print(f"\n=== {name} Optimal Parameters Found ===")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print()

    # Output results
    print("\nTest Set Performance: ")
    for score in performances[name]:
        print(f"{score}: {performances[name][score]:.4f}")


=== KNeighborsRegressor Optimal Parameters Found ===
  model__algorithm: auto
  model__n_neighbors: 7
  model__weights: distance


Test Set Performance: 
RMSE: 2.4550
R²: 0.6302
Mean Absolute Error: 1.2577
Explained Variance Score: 0.6303

=== KMeansRegression Optimal Parameters Found ===
  model__init: k-means++
  model__n_clusters: 5
  model__n_init: 5


Test Set Performance: 
RMSE: 4.7992
R²: -0.4134
Mean Absolute Error: 2.9125
Explained Variance Score: -0.3337

=== LinearRegression Optimal Parameters Found ===
  model__fit_intercept: True
  model__n_jobs: -1


Test Set Performance: 
RMSE: 2.9052
R²: 0.4821
Mean Absolute Error: 0.3072
Explained Variance Score: 0.4821

=== XGBoostRegressor Optimal Parameters Found ===
  model__learning_rate: 0.1
  model__max_depth: 6
  model__n_estimators: 200
  model__subsample: 1.0


Test Set Performance: 
RMSE: 1.4188
R²: 0.8765
Mean Absolute Error: 0.4676
Explained Variance Score: 0.8765

=== SVR Optimal Parameters Found ===
  model__C: 1
  mode

### Test Baseline Simple Model
(Simply Guesses Average Tip)

In [59]:
all_data = pd.read_parquet('tripdata_combined.parquet')
baseline_val = np.average(all_data['tip_amount'])
test_pred = [baseline_val] * len(y_test)


test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)
test_evs = explained_variance_score(y_test, test_pred)

print("Simple Model Test Set Performance: ")
print(f"RMSE: {test_rmse:.4f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"Explained Variance Score: {test_evs:.4f}")

Simple Model Test Set Performance: 
RMSE: 4.0368
R²: -0.0000
MAE: 2.6029
Explained Variance Score: 0.0000
