In [75]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Regression Using All Features

### Read data from the combined parquet files.

In [76]:
# df = pd.read_parquet("tripdata_combined.parquet").sample(frac=0.30, random_state=42) # Sample 10% of points to save on storage
df = pd.read_parquet("tripdata_combined.parquet")
print(df.head().to_markdown())

|         |   VendorID | tpep_pickup_datetime   | tpep_dropoff_datetime   |   passenger_count |   trip_distance |   RatecodeID | store_and_fwd_flag   |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |   congestion_surcharge |   airport_fee |   Airport_fee |
|--------:|-----------:|:-----------------------|:------------------------|------------------:|----------------:|-------------:|:---------------------|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|-----------------------:|--------------:|--------------:|
| 2790731 |          2 | 2023-01-29 17:52:02    | 2023-01-29 17:56:43     |                 1 |            1.17 |            1 | N                    |            262 |             74 |              2 |           7.2 |     0   |       0.5 |         0    

In [77]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
count,23000000.0,23000000,23000000,21507080.0,23000000.0,21507080.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,23000000.0,21507080.0,976660.0,20530420.0
mean,1.75062,2023-12-16 09:36:28.253082,2023-12-16 09:53:49.922349,1.352881,4.500603,1.954811,164.6832,163.6657,1.147765,19.37793,1.477126,0.4830539,3.410598,0.5769028,0.9721349,28.13297,2.249524,0.107368,0.1461886
min,1.0,2001-01-01 00:08:31,2001-01-01 01:11:09,0.0,0.0,1.0,1.0,1.0,0.0,-1349.8,-10.0,-0.5,-330.88,-84.0,-1.0,-1374.37,-2.5,-1.25,-1.75
25%,2.0,2023-06-23 02:41:01,2023-06-23 02:57:26.500000,1.0,1.03,1.0,132.0,113.0,1.0,9.3,0.0,0.5,0.0,0.0,1.0,15.8,2.5,0.0,0.0
50%,2.0,2023-12-14 17:46:30.500000,2023-12-14 18:07:53,1.0,1.78,1.0,161.0,162.0,1.0,13.5,1.0,0.5,2.72,0.0,1.0,21.0,2.5,0.0,0.0
75%,2.0,2024-06-08 00:37:25.750000,2024-06-08 00:52:18.250000,1.0,3.4,1.0,233.0,234.0,1.0,21.9,2.5,0.5,4.3,0.0,1.0,30.6,2.5,0.0,0.0
max,6.0,2026-06-26 23:53:12,2026-06-27 20:59:10,9.0,398608.6,99.0,265.0,265.0,5.0,335544.4,65.99,53.16,999.99,355.0,2.0,335550.9,2.75,1.25,1.75
std,0.4358652,,,0.8589388,339.7133,9.212094,64.14523,69.73993,0.6030052,100.6047,1.832403,0.1191817,4.092717,2.213326,0.2257728,101.6165,0.8325842,0.355538,0.4911651


In [78]:
for col in df:
    print(col, ":", df[col].isna().sum(), "null values")

VendorID : 0 null values
tpep_pickup_datetime : 0 null values
tpep_dropoff_datetime : 0 null values
passenger_count : 1492922 null values
trip_distance : 0 null values
RatecodeID : 1492922 null values
store_and_fwd_flag : 1492922 null values
PULocationID : 0 null values
DOLocationID : 0 null values
payment_type : 0 null values
fare_amount : 0 null values
extra : 0 null values
mta_tax : 0 null values
tip_amount : 0 null values
tolls_amount : 0 null values
improvement_surcharge : 0 null values
total_amount : 0 null values
congestion_surcharge : 1492922 null values
airport_fee : 22023340 null values
Airport_fee : 2469582 null values


In [79]:
# Remove all cols with null values 
df = df.dropna(axis=1)

In [80]:
# Convert all dates to float types, so that they can be scaled.
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].values.astype(np.float64)
df['tpep_dropoff_datetime'] = df['tpep_dropoff_datetime'].values.astype(np.float64)

In [81]:
print(df.head().to_markdown())

|         |   VendorID |   tpep_pickup_datetime |   tpep_dropoff_datetime |   trip_distance |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |
|--------:|-----------:|-----------------------:|------------------------:|----------------:|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|
| 2790731 |          2 |            1.67501e+15 |             1.67502e+15 |            1.17 |            262 |             74 |              2 |           7.2 |     0   |       0.5 |         0    |              0 |                       1 |          11.2  |
|  666153 |          1 |            1.67319e+15 |             1.67319e+15 |            0.9  |            229 |            237 |              2 |           6.5 |     2.5 |       0.5 |         0    |              0 |            

In [82]:
print(df.shape)

(23000000, 14)


### Test Regression Models

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [84]:
# Create, scale, and split data

target = 'tip_amount'
X = df.drop(columns=[target])
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split data for 80% training,, 20% test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [85]:
# Define all models and their parameter grids
models = {
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'KMeansRegression': {
        'model': KMeans(),
        'param_grid': {
            'model__n_clusters': [5, 10, 15],
            'model__init': ['k-means++', 'random'],
            'model__n_init': [5, 10]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'model__fit_intercept': [True, False],
            'model__n_jobs': [-1]
        }
    },
    'XGBoostRegressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 6],
            'model__learning_rate': [0.01, 0.1],
            'model__subsample': [0.8, 1.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'param_grid': {
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.01, 0.1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'param_grid' : {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None],
            'model__min_samples_split': [5, 10],
            'model__max_features': ['sqrt'],
            'model__n_jobs': [-1] 
        }
    }
}

In [86]:
# Dictionary to store best parameters
best_params = {}
performances = {}

for name, config in models.items():
    print(f"\n=== Tuning {name} ===")
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', config['model'])
    ])
    
    # Grid search with reduced CV folds for speed
    search = GridSearchCV(
        pipeline,
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )
    
    # Fit on a subset of data for demonstration
    search.fit(X_train[:5000], y_train[:5000])  # Use full data in production
    
    # Store best parameters
    best_params[name] = search.best_params_
    
    test_pred = search.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_evs = explained_variance_score(y_test, test_pred)

    performances[name] = {
        'RMSE': test_rmse,
        'R²': test_r2,
        'Mean Absolute Error': test_mae,
        'Explained Variance Score': test_evs
    }
    
    print(f"Best parameters for {name}: {search.best_params_}")


=== Tuning KNeighborsRegressor ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for KNeighborsRegressor: {'model__algorithm': 'auto', 'model__n_neighbors': 5, 'model__weights': 'distance'}

=== Tuning KMeansRegression ===
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for KMeansRegression: {'model__init': 'k-means++', 'model__n_clusters': 5, 'model__n_init': 10}

=== Tuning LinearRegression ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'model__fit_intercept': True, 'model__n_jobs': -1}

=== Tuning XGBoostRegressor ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for XGBoostRegressor: {'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 200, 'model__subsample': 0.8}

=== Tuning SVR ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for SVR: {'model__C': 1, 'model__epsilon': 0.1, 'model

In [87]:
# Print all best parameters
for name, params in best_params.items():
    print(f"\n=== {name} Optimal Parameters Found ===")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print()

    # Output results
    print("Test Set Performance: ")
    for score in performances[name]:
        print(f"{score}: {performances[name][score]:.4f}")


=== KNeighborsRegressor Optimal Parameters Found ===
  model__algorithm: auto
  model__n_neighbors: 5
  model__weights: distance

Test Set Performance: 
RMSE: 2.5171
R²: 0.6188
Mean Absolute Error: 1.2336
Explained Variance Score: 0.6188

=== KMeansRegression Optimal Parameters Found ===
  model__init: k-means++
  model__n_clusters: 5
  model__n_init: 10

Test Set Performance: 
RMSE: 4.7825
R²: -0.3763
Mean Absolute Error: 2.9838
Explained Variance Score: -0.2368

=== LinearRegression Optimal Parameters Found ===
  model__fit_intercept: True
  model__n_jobs: -1

Test Set Performance: 
RMSE: 1.6806
R²: 0.8301
Mean Absolute Error: 0.2831
Explained Variance Score: 0.8301

=== XGBoostRegressor Optimal Parameters Found ===
  model__learning_rate: 0.1
  model__max_depth: 6
  model__n_estimators: 200
  model__subsample: 0.8

Test Set Performance: 
RMSE: 1.5199
R²: 0.8610
Mean Absolute Error: 0.4511
Explained Variance Score: 0.8610

=== SVR Optimal Parameters Found ===
  model__C: 1
  model__

### Test Baseline Simple Model
(Simply Guesses Average Tip)

In [88]:
all_data = pd.read_parquet('tripdata_combined.parquet')
baseline_val = np.average(all_data['tip_amount'])
test_pred = [baseline_val] * len(y_test)


test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)
test_evs = explained_variance_score(y_test, test_pred)

print("Simple Model Test Set Performance: ")
print(f"RMSE: {test_rmse:.4f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"Explained Variance Score: {test_evs:.4f}")

Simple Model Test Set Performance: 
RMSE: 4.0766
R²: -0.0000
MAE: 2.5986
Explained Variance Score: 0.0000


## Regression Using Payment Method, Taxi Arrival Time, and Trip Length Only

In [100]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [101]:
df = pd.read_parquet("tripdata_combined.parquet")[['trip_distance', 'payment_type', 'trip_distance', 'tip_amount']]
print(df.head().to_markdown())

|         |   trip_distance |   payment_type |   trip_distance |   tip_amount |
|--------:|----------------:|---------------:|----------------:|-------------:|
| 2790731 |            1.17 |              2 |            1.17 |         0    |
|  666153 |            0.9  |              2 |            0.9  |         0    |
| 1985683 |            0.95 |              1 |            0.95 |         2.38 |
| 2154231 |            0.88 |              1 |            0.88 |         1.5  |
| 2493619 |            1.03 |              1 |            1.03 |         2.3  |


In [102]:
df.describe()

Unnamed: 0,trip_distance,payment_type,trip_distance.1,tip_amount
count,23000000.0,23000000.0,23000000.0,23000000.0
mean,4.500603,1.147765,4.500603,3.410598
std,339.7133,0.6030052,339.7133,4.092717
min,0.0,0.0,0.0,-330.88
25%,1.03,1.0,1.03,0.0
50%,1.78,1.0,1.78,2.72
75%,3.4,1.0,3.4,4.3
max,398608.6,5.0,398608.6,999.99


In [103]:
for col in df:
    print(col, ":", df[col].isna().sum(), "null values")

trip_distance : trip_distance    0
trip_distance    0
dtype: int64 null values
payment_type : 0 null values
trip_distance : trip_distance    0
trip_distance    0
dtype: int64 null values
tip_amount : 0 null values


In [104]:
# Remove all cols with null values 
df = df.dropna(axis=1)

In [105]:
print(df.shape)

(23000000, 4)


### Test Regression Models

In [106]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [107]:
# Create, scale, and split data
target = 'tip_amount'
X = df.drop(columns=[target])
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split data for 80% training,, 20% test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [108]:
# Define all models and their parameter grids
models = {
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'KMeansRegression': {
        'model': KMeans(),
        'param_grid': {
            'model__n_clusters': [5, 10, 15],
            'model__init': ['k-means++', 'random'],
            'model__n_init': [5, 10]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'model__fit_intercept': [True, False],
            'model__n_jobs': [-1]
        }
    },
    'XGBoostRegressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 6],
            'model__learning_rate': [0.01, 0.1],
            'model__subsample': [0.8, 1.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'param_grid': {
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.01, 0.1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'param_grid' : {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None],
            'model__min_samples_split': [5, 10],
            'model__max_features': ['sqrt'],
            'model__n_jobs': [-1] 
        }
    }
}

In [109]:
# Dictionary to store best parameters
best_params = {}
performances = {}

for name, config in models.items():
    print(f"\n=== Tuning {name} ===")
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', config['model'])
    ])
    
    # Grid search with reduced CV folds for speed
    search = GridSearchCV(
        pipeline,
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )
    
    # Fit on a subset of data for demonstration
    search.fit(X_train[:5000], y_train[:5000])  # Use full data in production
    
    # Store best parameters
    best_params[name] = search.best_params_
    
    test_pred = search.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_evs = explained_variance_score(y_test, test_pred)

    performances[name] = {
        'RMSE': test_rmse,
        'R²': test_r2,
        'Mean Absolute Error': test_mae,
        'Explained Variance Score': test_evs
    }
    
    print(f"Best parameters for {name}: {search.best_params_}")


=== Tuning KNeighborsRegressor ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for KNeighborsRegressor: {'model__algorithm': 'ball_tree', 'model__n_neighbors': 7, 'model__weights': 'uniform'}

=== Tuning KMeansRegression ===
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for KMeansRegression: {'model__init': 'random', 'model__n_clusters': 5, 'model__n_init': 5}

=== Tuning LinearRegression ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'model__fit_intercept': True, 'model__n_jobs': -1}

=== Tuning XGBoostRegressor ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for XGBoostRegressor: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 1.0}

=== Tuning SVR ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for SVR: {'model__C': 1, 'model__epsilon': 0.1, 'model

In [110]:
# Print all best parameters
for name, params in best_params.items():
    print(f"\n=== {name} Optimal Parameters Found ===")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print()

    # Output results
    print("Test Set Performance: ")
    for score in performances[name]:
        print(f"{score}: {performances[name][score]:.4f}")


=== KNeighborsRegressor Optimal Parameters Found ===
  model__algorithm: ball_tree
  model__n_neighbors: 7
  model__weights: uniform

Test Set Performance: 
RMSE: 2.8359
R²: 0.5161
Mean Absolute Error: 1.4019
Explained Variance Score: 0.5161

=== KMeansRegression Optimal Parameters Found ===
  model__init: random
  model__n_clusters: 5
  model__n_init: 5

Test Set Performance: 
RMSE: 4.6248
R²: -0.2871
Mean Absolute Error: 2.5831
Explained Variance Score: -0.2427

=== LinearRegression Optimal Parameters Found ===
  model__fit_intercept: True
  model__n_jobs: -1

Test Set Performance: 
RMSE: 136.8987
R²: -1126.7464
Mean Absolute Error: 2.4246
Explained Variance Score: -1126.7353

=== XGBoostRegressor Optimal Parameters Found ===
  model__learning_rate: 0.1
  model__max_depth: 3
  model__n_estimators: 100
  model__subsample: 1.0

Test Set Performance: 
RMSE: 2.6630
R²: 0.5733
Mean Absolute Error: 1.3209
Explained Variance Score: 0.5733

=== SVR Optimal Parameters Found ===
  model__C: 1

### Test Baseline Simple Model
(Simply Guesses Average Tip)

In [111]:
all_data = pd.read_parquet('tripdata_combined.parquet')
baseline_val = np.average(all_data['tip_amount'])
test_pred = [baseline_val] * len(y_test)


test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)
test_evs = explained_variance_score(y_test, test_pred)

print("Simple Model Test Set Performance: ")
print(f"RMSE: {test_rmse:.4f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"Explained Variance Score: {test_evs:.4f}")

Simple Model Test Set Performance: 
RMSE: 4.0766
R²: -0.0000
MAE: 2.5986
Explained Variance Score: 0.0000
