In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the dataset into a pandas DataFrame (from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
taxi_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet')

In [4]:
# Display the first few rows of the dataset
taxi_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [5]:
# Drop rows with missing values.
taxi_df = taxi_df.dropna()

In [6]:
# Calculate trip duration in minutes
taxi_df['trip_duration'] = (taxi_df['tpep_dropoff_datetime'] - taxi_df['tpep_pickup_datetime']).dt.total_seconds() / 60

In [7]:
# Create a varible named 'target_variable' to store the name of the thing we're trying to predict, 'total_amount'.
target_variable = taxi_df['total_amount']

In [8]:
# Create a list called 'feature_cols' containing the feature names that we'll be using to predict our target variable. The list should contain 'VendorID', 'trip_distance', 'payment_type', 'PULocationID', 'DOLocationID', and 'trip_duration'.
feature_cols = ['VendorID', 'trip_distance', 'payment_type', 'PULocationID', 'DOLocationID', 'trip_duration']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(taxi_df[feature_cols], target_variable, test_size=0.2, random_state=42)

In [10]:
# Calculate the mean of the training target variable
y_mean = y_train.mean()

# Create a baseline model that always predicts the mean
y_pred_baseline = [y_mean] * len(y_test)

# Evaluate the baseline model using mean absolute error
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)

print(f"Baseline MAE (always predicting the mean): {baseline_mae}")

# Is it any good?  The baseline MAE provides a point of reference. Any reasonable model should perform better than this.
# If a model has a significantly lower MAE than this baseline, it's a sign that the model has learned some predictive power.
# If the model has a MAE close or higher than the baseline, the model is not adding much value over simply guessing the average fare.

Baseline MAE (always predicting the mean): 9.198227928516678


In [11]:
# Identify categorical and continuous features
categorical_cols = ['VendorID', 'payment_type', 'PULocationID', 'DOLocationID']
continuous_cols = ['trip_distance', 'trip_duration']

# Create transformers for categorical and continuous features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
continuous_transformer = StandardScaler()

# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('cont', continuous_transformer, continuous_cols)
    ])

# Example usage with a Linear Regression model (you can replace with any model)
model = LinearRegression()

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Model MAE: {mae}")

# Compare with baseline (you should have this baseline value from previous steps)
baseline_mae = 9.2  # Example baseline value, replace with your actual baseline value

# Check if the model beats the baseline
if mae < baseline_mae:
    print("The model beats the baseline!")
else:
    print("The model does not beat the baseline.")

Model MAE: 3.3854011161645885
The model beats the baseline!


In [12]:
# Create a pipeline with column transformations and RandomForestRegressor
pipeline_rf_optimized = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply the preprocessor defined earlier
    ('regressor', RandomForestRegressor(
        random_state=42,  # Ensure reproducibility
        n_estimators=30,  # Reduced number of trees for faster training
        max_depth=5,  # Limit the maximum depth of the trees to reduce complexity
        min_samples_split=10,  # Increase min samples required for splitting a node
        min_samples_leaf=4,  # Minimum number of samples required in each leaf node
        max_features='sqrt',  # Use the square root of the number of features for splits
        n_jobs=-1,  # Use all CPU cores for parallel processing
        bootstrap=True,  # Bootstrap sampling for tree construction
        max_samples=0.8  # Use 80% of the samples to train each tree
    ))  # RandomForestRegressor with optimized hyperparameters
])

# Fit the Random Forest model on the training data
pipeline_rf_optimized.fit(X_train, y_train)

# Make predictions on the test data using the fitted Random Forest Regressor pipeline
y_pred_rf_optimized = pipeline_rf_optimized.predict(X_test)

# Output the predictions (optional)
print(y_pred_rf_optimized)

from sklearn.metrics import mean_absolute_error

# Evaluate the performance of the Random Forest model using MAE
mae_rf_optimized = mean_absolute_error(y_test, y_pred_rf_optimized)

# Print the Mean Absolute Error for the Random Forest model
print(f"Mean Absolute Error of the Random Forest model: {mae_rf_optimized}")

[17.19394295 17.44641709 17.05805589 ... 17.62613778 17.59912835
 17.62613778]
Mean Absolute Error of the Random Forest model: 6.937271603153348


In [13]:
# Define the hyperparameters to tune
param_grid = {
    'regressor__n_estimators': [25, 50],  # Fewer trees
    'regressor__max_depth': [5, 10],  # Fewer depths
    'regressor__min_samples_split': [2, 5],  # Fewer splits
}

In [16]:
# Perform grid search to find the best hyperparameters. This could take a while.
grid_search = GridSearchCV(estimator = pipeline_rf_optimized, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




In [17]:
# Get the best Random Forest model with the best hyperparameters
best_rf_model = grid_search.best_estimator_

# Print the best hyperparameters found
print(f"Best hyperparameters: {grid_search.best_params_}")

# Print the best cross-validation score
print(f"Best cross-validation score: {grid_search.best_score_}")

Best hyperparameters: {'regressor__max_depth': 5, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Best cross-validation score: 0.2786029480894175


In [18]:
# Get the best Random Forest model with the best hyperparameters
best_rf_model = grid_search.best_estimator_

# Train the best Random Forest model on the full training data
best_rf_model.fit(X_train, y_train)

# Optionally, you can check how the model performs on the test data
y_pred_best_rf = best_rf_model.predict(X_test)

# Evaluate the performance using Mean Absolute Error (MAE)
from sklearn.metrics import mean_absolute_error
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)

print(f"Mean Absolute Error of the optimized Random Forest model: {mae_best_rf}")

Mean Absolute Error of the optimized Random Forest model: 6.55675727634028


In [19]:
# Make predictions on the test data using the fitted Random Forest model
y_pred_best_rf = best_rf_model.predict(X_test)

# Optionally, print or inspect the predictions
print(y_pred_best_rf)

[17.14243045 17.19953653 17.83214398 ... 17.36870904 17.34622723
 17.21742185]
