### Read data from the combined parquet files.

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# df = pd.read_parquet("tripdata_combined.parquet").sample(frac=0.30, random_state=42) # Sample 10% of points to save on storage
df = pd.read_parquet("tripdata_combined.parquet").iloc[:,:-3]
print(df.head().to_markdown())

|         |   VendorID | tpep_pickup_datetime   | tpep_dropoff_datetime   |   passenger_count |   trip_distance |   RatecodeID | store_and_fwd_flag   |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |   congestion_surcharge |   airport_fee |   Airport_fee |
|--------:|-----------:|:-----------------------|:------------------------|------------------:|----------------:|-------------:|:---------------------|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|-----------------------:|--------------:|--------------:|
| 2790731 |          2 | 2023-01-29 17:52:02    | 2023-01-29 17:56:43     |                 1 |            1.17 |            1 | N                    |            262 |             74 |              2 |           7.2 |     0   |       0.5 |         0    

In [22]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
count,22719360.0,22719358,22719358,21227490.0,22719360.0,21227490.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,22719360.0,21227490.0,968300.0,20259190.0
mean,1.747663,2023-12-15 20:51:24.882377,2023-12-15 21:08:48.967704,1.352564,3.416808,1.963403,164.7458,163.7303,1.120787,19.87479,1.507484,0.4948571,3.448991,0.5913178,0.9962025,28.79924,2.305407,0.10979,0.1516155
min,1.0,2001-01-01 00:08:31,2001-01-01 01:11:09,0.0,0.0,1.0,1.0,1.0,0.0,-48.91,-3.0,-0.5,0.0,0.0,-1.0,0.01,-2.5,0.0,0.0
25%,1.0,2023-06-22 16:45:06.250000,2023-06-22 17:07:31.250000,1.0,1.04,1.0,132.0,113.0,1.0,9.3,0.0,0.5,0.0,0.0,1.0,15.96,2.5,0.0,0.0
50%,2.0,2023-12-13 23:41:00,2023-12-13 23:57:26,1.0,1.79,1.0,161.0,162.0,1.0,13.5,1.0,0.5,2.74,0.0,1.0,21.0,2.5,0.0,0.0
75%,2.0,2024-06-07 14:06:48,2024-06-07 14:24:54.750000,1.0,3.4,1.0,233.0,234.0,1.0,22.6,2.5,0.5,4.34,0.0,1.0,30.72,2.5,0.0,0.0
max,6.0,2026-06-26 23:53:12,2026-06-27 20:59:10,9.0,99.91,99.0,265.0,265.0,4.0,335544.4,65.99,53.16,99.99,355.0,2.0,335550.9,2.75,1.25,1.75
std,0.4375836,,,0.8599776,4.487399,9.265057,64.16397,69.7338,0.5466269,101.0515,1.816023,0.05264071,4.024048,2.203933,0.06082273,101.9809,0.6697884,0.353814,0.486914


In [23]:
for col in df:
    print(col, ":", df[col].isna().sum(), "null values")

VendorID : 0 null values
tpep_pickup_datetime : 0 null values
tpep_dropoff_datetime : 0 null values
passenger_count : 1491871 null values
trip_distance : 0 null values
RatecodeID : 1491871 null values
store_and_fwd_flag : 1491871 null values
PULocationID : 0 null values
DOLocationID : 0 null values
payment_type : 0 null values
fare_amount : 0 null values
extra : 0 null values
mta_tax : 0 null values
tip_amount : 0 null values
tolls_amount : 0 null values
improvement_surcharge : 0 null values
total_amount : 0 null values
congestion_surcharge : 1491871 null values
airport_fee : 21751058 null values
Airport_fee : 2460171 null values


In [24]:
# Remove all cols with null values 
df = df.dropna(axis=1)

In [25]:
# Convert all dates to float types, so that they can be scaled.
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].values.astype(np.float64)
df['tpep_dropoff_datetime'] = df['tpep_dropoff_datetime'].values.astype(np.float64)

In [26]:
print(df.head().to_markdown())

|         |   VendorID |   tpep_pickup_datetime |   tpep_dropoff_datetime |   trip_distance |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |
|--------:|-----------:|-----------------------:|------------------------:|----------------:|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|
| 2790731 |          2 |            1.67501e+15 |             1.67502e+15 |            1.17 |            262 |             74 |              2 |           7.2 |     0   |       0.5 |         0    |              0 |                       1 |          11.2  |
|  666153 |          1 |            1.67319e+15 |             1.67319e+15 |            0.9  |            229 |            237 |              2 |           6.5 |     2.5 |       0.5 |         0    |              0 |            

In [27]:
print(df.shape)

(22719358, 14)


In [28]:
# Create, scale, and split data

target = 'tip_amount'
X = df.drop(columns=[target])
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split data for 80% training,, 20% test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## Regression Using All Features

### Test Regression Models

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

In [11]:
# Define all models and their parameter grids
models = {
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7, 10],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'KMeansRegression': {
        'model': KMeans(),
        'param_grid': {
            'model__n_clusters': [5, 10, 15, 20],
            'model__init': ['k-means++', 'random'],
            'model__n_init': [5, 10]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'model__fit_intercept': [True, False],
            'model__n_jobs': [-1]
        }
    },
    'XGBoostRegressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'model__n_estimators': [100, 200, 300, 400],
            'model__max_depth': [3, 6, 9],
            'model__learning_rate': [0.01, 0.1],
            'model__subsample': [0.8, 1.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'param_grid': {
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.01, 0.1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'param_grid' : {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None],
            'model__min_samples_split': [5, 10],
            'model__max_features': ['sqrt'],
            'model__n_jobs': [-1] 
        }
    },
     'Neural Network': {
        'model': MLPRegressor(random_state=42, early_stopping=True),
        'param_grid': {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'model__activation': ['relu', 'tanh'],
            'model__alpha': [0.0001, 0.001],  # L2 regularization
            'model__learning_rate_init': [0.001, 0.01],
            'model__batch_size': [32, 64],
            'model__max_iter': [200, 500]
        }
    }, 
    # 'LightGBM': {
    #     'model': LGBMRegressor(random_state=42, verbose=-1),  # verbose=-1 to suppress output
    #     'param_grid': {
    #         'model__n_estimators': [100, 200],
    #         'model__max_depth': [3, 5, 7],
    #         'model__learning_rate': [0.01, 0.1],
    #         'model__num_leaves': [31, 63],
    #         'model__subsample': [1.0],
    #         'model__colsample_bytree': [1.0],
    #         'model__reg_alpha': [0, 0.1],  # L1 regularization
    #         'model__reg_lambda': [0, 0.1],  # L2 regularization
    #         'model__n_jobs': [-1]
    #     }
    # }
}

In [12]:
# Dictionary to store best parameters
best_params = {}
performances = {}

for name, config in models.items():
    print(f"\n=== Tuning {name} ===")
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', config['model'])
    ])
    
    # Grid search with reduced CV folds for speed
    search = GridSearchCV(
        pipeline,
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )
    
    # Fit on a subset of data for demonstration
    search.fit(X_train[:5000], y_train[:5000])  # Use full data in production
    
    # Store best parameters
    best_params[name] = search.best_params_
    
    test_pred = search.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_evs = explained_variance_score(y_test, test_pred)

    performances[name] = {
        'RMSE': test_rmse,
        'R²': test_r2,
        'Mean Absolute Error': test_mae,
        'Explained Variance Score': test_evs
    }
    
    print(f"Best parameters for {name}: {search.best_params_}")


=== Tuning KNeighborsRegressor ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for KNeighborsRegressor: {'model__algorithm': 'auto', 'model__n_neighbors': 5, 'model__weights': 'distance'}

=== Tuning KMeansRegression ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for KMeansRegression: {'model__init': 'k-means++', 'model__n_clusters': 5, 'model__n_init': 5}

=== Tuning LinearRegression ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'model__fit_intercept': True, 'model__n_jobs': -1}

=== Tuning XGBoostRegressor ===
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters for XGBoostRegressor: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.8}

=== Tuning SVR ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for SVR: {'model__C': 1, 'model__epsilon': 0.1, 'mode

In [13]:
# Print all best parameters
for name, params in best_params.items():
    print(f"\n=== {name} Optimal Parameters Found ===")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print()

    # Output results
    print("Test Set Performance: ")
    for score in performances[name]:
        print(f"{score}: {performances[name][score]:.4f}")


=== KNeighborsRegressor Optimal Parameters Found ===
  model__algorithm: auto
  model__n_neighbors: 5
  model__weights: distance

Test Set Performance: 
RMSE: 2.5171
R²: 0.6188
Mean Absolute Error: 1.2336
Explained Variance Score: 0.6188

=== KMeansRegression Optimal Parameters Found ===
  model__init: k-means++
  model__n_clusters: 5
  model__n_init: 5

Test Set Performance: 
RMSE: 4.7747
R²: -0.3718
Mean Absolute Error: 2.9664
Explained Variance Score: -0.1505

=== LinearRegression Optimal Parameters Found ===
  model__fit_intercept: True
  model__n_jobs: -1

Test Set Performance: 
RMSE: 1.6806
R²: 0.8301
Mean Absolute Error: 0.2831
Explained Variance Score: 0.8301

=== XGBoostRegressor Optimal Parameters Found ===
  model__learning_rate: 0.1
  model__max_depth: 3
  model__n_estimators: 400
  model__subsample: 0.8

Test Set Performance: 
RMSE: 1.5457
R²: 0.8562
Mean Absolute Error: 0.5825
Explained Variance Score: 0.8563

=== SVR Optimal Parameters Found ===
  model__C: 1
  model__e

### Test Baseline Simple Model
(Simply Guesses Average Tip)

In [14]:
all_data = pd.read_parquet('tripdata_combined.parquet')
baseline_val = np.average(all_data['tip_amount'])
test_pred = [baseline_val] * len(y_test)


test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)
test_evs = explained_variance_score(y_test, test_pred)

print("Simple Model Test Set Performance: ")
print(f"RMSE: {test_rmse:.4f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"Explained Variance Score: {test_evs:.4f}")

Simple Model Test Set Performance: 
RMSE: 4.0766
R²: -0.0000
MAE: 2.5986
Explained Variance Score: 0.0000


## Regression Using Payment Method, Taxi Arrival Time, and Trip Length Only

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
df = pd.read_parquet("tripdata_combined.parquet")[['trip_distance', 'payment_type', 'trip_distance', 'tip_amount']]
print(df.head().to_markdown())

|         |   trip_distance |   payment_type |   trip_distance |   tip_amount |
|--------:|----------------:|---------------:|----------------:|-------------:|
| 2790731 |            1.17 |              2 |            1.17 |         0    |
|  666153 |            0.9  |              2 |            0.9  |         0    |
| 1985683 |            0.95 |              1 |            0.95 |         2.38 |
| 2154231 |            0.88 |              1 |            0.88 |         1.5  |
| 2493619 |            1.03 |              1 |            1.03 |         2.3  |


In [17]:
df.describe()

Unnamed: 0,trip_distance,payment_type,trip_distance.1,tip_amount
count,23000000.0,23000000.0,23000000.0,23000000.0
mean,4.500603,1.147765,4.500603,3.410598
std,339.7133,0.6030052,339.7133,4.092717
min,0.0,0.0,0.0,-330.88
25%,1.03,1.0,1.03,0.0
50%,1.78,1.0,1.78,2.72
75%,3.4,1.0,3.4,4.3
max,398608.6,5.0,398608.6,999.99


In [18]:
for col in df:
    print(col, ":", df[col].isna().sum(), "null values")

trip_distance : trip_distance    0
trip_distance    0
dtype: int64 null values
payment_type : 0 null values
trip_distance : trip_distance    0
trip_distance    0
dtype: int64 null values
tip_amount : 0 null values


In [19]:
# Remove all cols with null values 
df = df.dropna(axis=1)

In [20]:
print(df.shape)

(23000000, 4)


### Test Regression Models

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [22]:
# Create, scale, and split data
target = 'tip_amount'
X = df.drop(columns=[target])
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split data for 80% training,, 20% test 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [23]:
# Define all models and their parameter grids
models = {
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7, 10],
            'model__weights': ['uniform', 'distance'],
            'model__algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'KMeansRegression': {
        'model': KMeans(),
        'param_grid': {
            'model__n_clusters': [5, 10, 15, 20],
            'model__init': ['k-means++', 'random'],
            'model__n_init': [5, 10]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'param_grid': {
            'model__fit_intercept': [True, False],
            'model__n_jobs': [-1]
        }
    },
    'XGBoostRegressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'model__n_estimators': [100, 200, 300, 400],
            'model__max_depth': [3, 6, 9],
            'model__learning_rate': [0.01, 0.1],
            'model__subsample': [0.8, 1.0]
        }
    },
    'SVR': {
        'model': SVR(),
        'param_grid': {
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.01, 0.1]
        }
    },
    'Random Forest' : {
        'model' : RandomForestRegressor(random_state=42),
        'param_grid' : {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None],
            'model__min_samples_split': [5, 10],
            'model__max_features': ['sqrt'],
            'model__n_jobs': [-1] 
        }
    },
     'Neural Network': {
        'model': MLPRegressor(random_state=42, early_stopping=True),
        'param_grid': {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'model__activation': ['relu', 'tanh'],
            'model__alpha': [0.0001, 0.001],  # L2 regularization
            'model__learning_rate_init': [0.001, 0.01],
            'model__batch_size': [32, 64],
            'model__max_iter': [200, 500]
        }
    }, 
    # 'LightGBM': {
    #     'model': LGBMRegressor(random_state=42, verbose=-1),  # verbose=-1 to suppress output
    #     'param_grid': {
    #         'model__n_estimators': [100, 200],
    #         'model__max_depth': [3, 5, 7],
    #         'model__learning_rate': [0.01, 0.1],
    #         'model__num_leaves': [31, 63],
    #         'model__subsample': [1.0],
    #         'model__colsample_bytree': [1.0],
    #         'model__reg_alpha': [0, 0.1],  # L1 regularization
    #         'model__reg_lambda': [0, 0.1],  # L2 regularization
    #         'model__n_jobs': [-1]
    #     }
    # }
}

In [24]:
# Dictionary to store best parameters
best_params = {}
performances = {}

for name, config in models.items():
    print(f"\n=== Tuning {name} ===")
    
    # Create pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', config['model'])
    ])
    
    # Grid search with reduced CV folds for speed
    search = GridSearchCV(
        pipeline,
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=3
    )
    
    # Fit on a subset of data for demonstration
    search.fit(X_train[:5000], y_train[:5000])  # Use full data in production
    
    # Store best parameters
    best_params[name] = search.best_params_
    
    test_pred = search.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_evs = explained_variance_score(y_test, test_pred)

    performances[name] = {
        'RMSE': test_rmse,
        'R²': test_r2,
        'Mean Absolute Error': test_mae,
        'Explained Variance Score': test_evs
    }
    
    print(f"Best parameters for {name}: {search.best_params_}")


=== Tuning KNeighborsRegressor ===
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for KNeighborsRegressor: {'model__algorithm': 'ball_tree', 'model__n_neighbors': 10, 'model__weights': 'uniform'}

=== Tuning KMeansRegression ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for KMeansRegression: {'model__init': 'random', 'model__n_clusters': 5, 'model__n_init': 5}

=== Tuning LinearRegression ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'model__fit_intercept': True, 'model__n_jobs': -1}

=== Tuning XGBoostRegressor ===
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters for XGBoostRegressor: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 1.0}

=== Tuning SVR ===
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters for SVR: {'model__C': 1, 'model__epsilon': 0.1, 'm

In [25]:
# Print all best parameters
for name, params in best_params.items():
    print(f"\n=== {name} Optimal Parameters Found ===")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print()

    # Output results
    print("Test Set Performance: ")
    for score in performances[name]:
        print(f"{score}: {performances[name][score]:.4f}")


=== KNeighborsRegressor Optimal Parameters Found ===
  model__algorithm: ball_tree
  model__n_neighbors: 10
  model__weights: uniform

Test Set Performance: 
RMSE: 2.7579
R²: 0.5423
Mean Absolute Error: 1.3680
Explained Variance Score: 0.5424

=== KMeansRegression Optimal Parameters Found ===
  model__init: random
  model__n_clusters: 5
  model__n_init: 5

Test Set Performance: 
RMSE: 4.4914
R²: -0.2139
Mean Absolute Error: 2.6052
Explained Variance Score: -0.0633

=== LinearRegression Optimal Parameters Found ===
  model__fit_intercept: True
  model__n_jobs: -1

Test Set Performance: 
RMSE: 136.8987
R²: -1126.7464
Mean Absolute Error: 2.4246
Explained Variance Score: -1126.7353

=== XGBoostRegressor Optimal Parameters Found ===
  model__learning_rate: 0.01
  model__max_depth: 3
  model__n_estimators: 400
  model__subsample: 1.0

Test Set Performance: 
RMSE: 2.6690
R²: 0.5713
Mean Absolute Error: 1.3470
Explained Variance Score: 0.5714

=== SVR Optimal Parameters Found ===
  model__C:

### Test Baseline Simple Model
(Simply Guesses Average Tip)

In [26]:
all_data = pd.read_parquet('tripdata_combined.parquet')
baseline_val = np.average(all_data['tip_amount'])
test_pred = [baseline_val] * len(y_test)


test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)
test_evs = explained_variance_score(y_test, test_pred)

print("Simple Model Test Set Performance: ")
print(f"RMSE: {test_rmse:.4f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"Explained Variance Score: {test_evs:.4f}")

Simple Model Test Set Performance: 
RMSE: 4.0766
R²: -0.0000
MAE: 2.5986
Explained Variance Score: 0.0000


## Use Best Model to Verify RQ2 Claims

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
import joblib
import numpy as np

In [40]:
mlp_regressor = Pipeline([
    ('scaler', StandardScaler()),  
    ('model', MLPRegressor(
        activation='relu',
        alpha=0.001,              
        batch_size=32,
        hidden_layer_sizes=(50, 50),  
        learning_rate_init=0.01,
        max_iter=200
    ))
])

# Fit the model (replace with your actual data)
mlp_regressor.fit(X_train, y_train)

# Save to a .pkl file
joblib.dump(mlp_regressor, 'kevin_mlp_regressor.pkl')
print("Model saved to kevin_mlp_regressor.pkl")

KeyboardInterrupt: 

### To Test:

To get a higher tip ratio:

1. Avoid work late at night and early in the morning.
(23:00 - 6:00)
2. Since online orders can only be paid by credit card.
Therefore get more online orders can get more tips.
3. Try to get more orders for trips that are less than 30
kilometers long.

In [57]:
# Load the saved model
mlp_regressor = joblib.load('kevin_mlp_regressor.pkl')

In [None]:
test_samples = pd.read_parquet("tripdata_combined.parquet").sample(n=200, random_state=42).iloc[:,:-3]

# Remove all cols with null values 
test_samples = test_samples.dropna(axis=1)

print(test_samples.head().to_markdown())
print(len(test_samples))

|         |   VendorID | tpep_pickup_datetime   | tpep_dropoff_datetime   |   trip_distance |   PULocationID |   DOLocationID |   payment_type |   fare_amount |   extra |   mta_tax |   tip_amount |   tolls_amount |   improvement_surcharge |   total_amount |
|--------:|-----------:|:-----------------------|:------------------------|----------------:|---------------:|---------------:|---------------:|--------------:|--------:|----------:|-------------:|---------------:|------------------------:|---------------:|
| 1409269 |          2 | 2023-05-13 13:14:30    | 2023-05-13 13:26:49     |            2.74 |             87 |             25 |              1 |         15.6  |     0   |       0.5 |         0.4  |              0 |                       1 |          20    |
|  495957 |          2 | 2023-06-05 15:19:10    | 2023-06-06 00:00:00     |           17.3  |            132 |            163 |              1 |         70    |     0   |       0.5 |         7.58 |              0 |            

Testing 1.

In [59]:
test_late = test_samples[
    (test_samples['tpep_pickup_datetime'].dt.hour >= 23) | 
    (test_samples['tpep_pickup_datetime'].dt.hour < 6)
]

test_early = test_samples[
    ~(  # Invert the condition with NOT (~)
        (test_samples['tpep_pickup_datetime'].dt.hour >= 23) | 
        (test_samples['tpep_pickup_datetime'].dt.hour < 6)
    )
]

# Convert all dates to float types, so that they can be scaled.
test_late['tpep_pickup_datetime'] = test_late['tpep_pickup_datetime'].values.astype(np.float64)
test_late['tpep_dropoff_datetime'] = test_late['tpep_dropoff_datetime'].values.astype(np.float64)

test_early['tpep_pickup_datetime'] = test_early['tpep_pickup_datetime'].values.astype(np.float64)
test_early['tpep_dropoff_datetime'] = test_early['tpep_dropoff_datetime'].values.astype(np.float64)


print(test_late.shape[0], test_early.shape[1])

26 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_late['tpep_pickup_datetime'] = test_late['tpep_pickup_datetime'].values.astype(np.float64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_late['tpep_dropoff_datetime'] = test_late['tpep_dropoff_datetime'].values.astype(np.float64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_early[

In [60]:
for col in test_late:
    print(col, test_late[col].dtype)

VendorID int64
tpep_pickup_datetime float64
tpep_dropoff_datetime float64
trip_distance float64
PULocationID int64
DOLocationID int64
payment_type int64
fare_amount float64
extra float64
mta_tax float64
tip_amount float64
tolls_amount float64
improvement_surcharge float64
total_amount float64


In [61]:
test_late = test_late.drop(columns=['tip_amount'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_late)
X_scaled = pd.DataFrame(X_scaled, columns=test_late.columns, index=test_late.index)

late_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(late_predictions))  

Mean Tip Predicted: 8.955422955406398


In [62]:
test_early = test_early.drop(columns=['tip_amount'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_early)
X_scaled = pd.DataFrame(X_scaled, columns=test_early.columns, index=test_early.index)

early_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(early_predictions))  

Mean Tip Predicted: 8.760839908103048


Testing 2.

Payment_type values:
- 1=Credit Card
- 2=Cash
- 3=No charge
- 4=Dispute
- 5=Unknown
- 6=Voided Trip

[Source](https://medium.com/@liam.lim/nyc-yellow-taxi-trip-record-analysis-7eb389a0470c)

In [63]:
test_cc = test_samples[test_samples['payment_type']==1].drop(columns=['tip_amount'])
test_no_cc = test_samples[test_samples['payment_type'] != 1].drop(columns=['tip_amount'])


# Convert all dates to float types, so that they can be scaled.
test_cc['tpep_pickup_datetime'] = test_cc['tpep_pickup_datetime'].values.astype(np.float64)
test_cc['tpep_dropoff_datetime'] = test_cc['tpep_dropoff_datetime'].values.astype(np.float64)
test_no_cc['tpep_pickup_datetime'] = test_no_cc['tpep_pickup_datetime'].values.astype(np.float64)
test_no_cc['tpep_dropoff_datetime'] = test_no_cc['tpep_dropoff_datetime'].values.astype(np.float64)

print(test_cc.shape[0], test_no_cc.shape[1])

149 13


In [64]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_cc)
X_scaled = pd.DataFrame(X_scaled, columns=test_cc.columns, index=test_cc.index)

cc_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(cc_predictions)) 

Mean Tip Predicted: 7.440410961165445


In [65]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_no_cc)
X_scaled = pd.DataFrame(X_scaled, columns=test_no_cc.columns, index=test_no_cc.index)

no_cc_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(no_cc_predictions)) 

Mean Tip Predicted: 5.040474825240105


Testing 3.

Note that 30 km is roughly 18.6411 miles, and our data stores trip distance in miles. 

In [68]:
long_trips = test_samples[
    test_samples['trip_distance'] >= 18.6411
].drop(columns=['tip_amount'])

short_trips = test_samples[
    test_samples['trip_distance'] < 18.6411
].drop(columns=['tip_amount'])

# Convert datetime columns to float64 for scaling
long_trips['tpep_pickup_datetime'] = long_trips['tpep_pickup_datetime'].values.astype(np.float64)
long_trips['tpep_dropoff_datetime'] = long_trips['tpep_dropoff_datetime'].values.astype(np.float64)

short_trips['tpep_pickup_datetime'] = short_trips['tpep_pickup_datetime'].values.astype(np.float64)
short_trips['tpep_dropoff_datetime'] = short_trips['tpep_dropoff_datetime'].values.astype(np.float64)

print(long_trips.shape[0], short_trips.shape[0])

1 199


In [69]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(long_trips)
X_scaled = pd.DataFrame(X_scaled, columns=long_trips.columns, index=long_trips.index)

long_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(long_predictions)) 

Mean Tip Predicted: 3.305815605862838


In [70]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(short_trips)
X_scaled = pd.DataFrame(X_scaled, columns=short_trips.columns, index=short_trips.index)

short_predictions = mlp_regressor.predict(X_scaled)  
print("Mean Tip Predicted:", np.mean(short_predictions)) 

Mean Tip Predicted: 8.716635970686118
