# NYC Taxi Fare Prediction | PyTorch Neural Network Model

Download Dataset

In [None]:
#!kaggle competitions download -c new-york-city-taxi-fare-prediction

Downloading new-york-city-taxi-fare-prediction.zip to /Users/liu_michael/Documents/UWaterloo1/PyTorch/NYC-Taxi
 87%|█████████████████████████████████▏    | 1.36G/1.56G [00:00<00:00, 2.89GB/s]
100%|██████████████████████████████████████| 1.56G/1.56G [00:00<00:00, 2.93GB/s]


In [None]:
#import zipfile
#with zipfile.ZipFile('new-york-city-taxi-fare-prediction.zip', 'r') as zip_ref:
    #zip_ref.extractall('data')

Get Data into DataFrames

In [1]:
import pandas as pd
import random
selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'uint8'
}
random.seed(42)
sample_fraction = 0.05
def skip_row(row_index):
    if row_index == 0:
        return False
    return random.random() > sample_fraction

random.seed(42)
df = pd.read_csv(
    'data/train.csv', 
    usecols = selected_cols,
    parse_dates = ['pickup_datetime'],
    date_format = '%Y-%m-%d %H:%M:%S %Z',
    dtype = dtypes,
    skiprows = skip_row
    )

test_df = pd.read_csv(
    'data/test.csv',
    dtype = dtypes,
    parse_dates = ['pickup_datetime'],
    date_format = '%Y-%m-%d %H:%M:%S %Z'
)

In [2]:
df.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2769960 entries, 0 to 2769959
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype              
---  ------             --------------    -----              
 0   fare_amount        2769960 non-null  float32            
 1   pickup_datetime    2769960 non-null  datetime64[ns, UTC]
 2   pickup_longitude   2769960 non-null  float32            
 3   pickup_latitude    2769960 non-null  float32            
 4   dropoff_longitude  2769944 non-null  float32            
 5   dropoff_latitude   2769944 non-null  float32            
 6   passenger_count    2769960 non-null  uint8              
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 76.6 MB


In [3]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,2769960.0,2769960.0,2769960.0,2769944.0,2769944.0,2769960.0
mean,11.35242,-72.50466,39.91626,-72.50387,39.91396,1.685441
std,9.850647,12.72167,10.37595,13.00545,10.44149,1.320807
min,-300.0,-3439.245,-3492.264,-3367.929,-3483.855,0.0
25%,6.0,-73.99205,40.73492,-73.99141,40.73399,1.0
50%,8.5,-73.9818,40.75264,-73.98017,40.75312,1.0
75%,12.5,-73.96708,40.76711,-73.96368,40.76808,2.0
max,1273.31,3442.185,3376.602,3442.185,3351.403,208.0


Remove Values that Make No Sense

In [4]:
df = df.loc[
    df['fare_amount'].between(0, 200) &
    df['passenger_count'].between(1, 6) &
    df['pickup_latitude'].between(40.5, 41.0) &
    df['dropoff_latitude'].between(40.5, 41.0) &
    df['pickup_longitude'].between(-74.3, -73.6) &
    df['dropoff_longitude'].between(-74.3, -73.6)
]

df = df.loc[df['fare_amount'] > 0]
df = df.loc[~(
    (df['pickup_latitude'] == df['dropoff_latitude']) &
    (df['pickup_longitude'] == df['dropoff_longitude'])
)]

In [5]:
pd.set_option("display.float_format", "{:.2f}".format)
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,2671019.0,2671019.0,2671019.0,2671019.0,2671019.0,2671019.0
mean,11.32,-73.98,40.75,-73.97,40.75,1.69
std,9.45,0.03,0.03,0.03,0.03,1.31
min,0.01,-74.3,40.5,-74.3,40.5,1.0
25%,6.0,-73.99,40.74,-73.99,40.74,1.0
50%,8.5,-73.98,40.75,-73.98,40.75,1.0
75%,12.5,-73.97,40.77,-73.97,40.77,2.0
max,200.0,-73.6,41.0,-73.6,41.0,6.0


Prepare Data for Training

In [6]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size = 0.2, random_state = 123)
len(train_df), len(valid_df)

(2136815, 534204)

In [14]:
X_columns = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
y_column = ['fare_amount']

In [75]:
X_train = train_df[X_columns]
y_train = train_df[y_column]

X_valid = valid_df[X_columns]
y_valid = valid_df[y_column]

X_test = test_df[X_columns]

Feature Engineering

In [76]:
import numpy as np
def add_date_features(df, col = 'pickup_datetime'):
    df['year'] = df[col].dt.year
    df['month'] = df[col].dt.month
    df['day_of_month'] = df[col].dt.day
    df['day_of_week'] = df[col].dt.weekday
    df['hour'] = df[col].dt.hour
    df['is_weekend'] = df[col].dt.weekday.isin([5,6]).astype('uint8')
    df['is_night'] = ((df[col].dt.hour < 6) | (df[col].dt.hour > 22)).astype('uint8')
    df['rush_hour'] = df[col].dt.hour.isin([7,8,9,16,17,18]).astype('uint8')
    df['week'] = df[col].dt.isocalendar().week.astype('int16')
    df['quarter'] = df[col].dt.quarter
    return df

def haversine_np(lat1, lon1, lat2, lon2):
    """
    Haversine distance in km
    lat/lon inputs can be scalars, pandas Series, or NumPy arrays
    """
    R = 6371.0  # Earth radius in km
    
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    return R * c

def add_trip_distance(df):
    df['trip_distance_km'] = haversine_np(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude']
    )
    return df

def add_is_manhattan(df):
    df['pickup_in_manhattan'] = (
        (df['pickup_longitude'] > -74.03) &
        (df['pickup_longitude'] < -73.93) &
        (df['pickup_latitude'] > 40.70) &
        (df['pickup_latitude'] < 40.85)
    ).astype('uint8')
    
    df['dropoff_in_manhattan'] = (
        (df['dropoff_longitude'] > -74.03) &
        (df['dropoff_longitude'] < -73.93) &
        (df['dropoff_latitude'] > 40.70) &
        (df['dropoff_latitude'] < 40.85)
    ).astype('uint8')
    return df

jfk_lonlat = (-73.7781, 40.6413)
lga_lonlat = (-73.8740, 40.7769)
ewr_lonlat = (-74.1745, 40.6895)
met_lonlat = (-73.9632, 40.7794)
wtc_lonlat = (-74.0099, 40.7126)

def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    lon, lat = landmark_lonlat
    df[landmark_name + '_drop_distance'] = haversine_np(
        df['dropoff_latitude'], df['dropoff_longitude'],
        lat, lon
    )

def add_landmarks(df):
    landmarks = [
        ('jfk', jfk_lonlat),
        ('lga', lga_lonlat),
        ('ewr', ewr_lonlat),
        ('met', met_lonlat),
        ('wtc', wtc_lonlat)
    ]
    for name, lonlat in landmarks:
        add_landmark_dropoff_distance(df, name, lonlat)
    return df

def cross_manhattan(df):
    df['trip_crosses_manhattan'] = (df['pickup_in_manhattan'] != df['dropoff_in_manhattan']).astype('uint8')
    return df

In [77]:
def apply_features(df, datetime_col = "pickup_datetime"):
    add_date_features(df, col = datetime_col)
    add_trip_distance(df)
    add_is_manhattan(df)
    add_landmarks(df)
    cross_manhattan(df)
    return df

In [78]:
X_train = apply_features(X_train)
X_valid = apply_features(X_valid)
X_test = apply_features(X_test)

X_train.shape, X_valid.shape, X_test.shape

((2136815, 25), (534204, 25), (9914, 25))

One Hot Encoding

In [79]:
numeric_columns = [
    'year', 'passenger_count',
    'is_weekend', 'is_night', 'rush_hour',
    'trip_distance_km', 'pickup_in_manhattan', 'dropoff_in_manhattan',
    'jfk_drop_distance', 'lga_drop_distance', 'ewr_drop_distance',
    'met_drop_distance', 'wtc_drop_distance', 'trip_crosses_manhattan'
]

categorical_columns = ['month', 'quarter', 'day_of_month', 'day_of_week', 'hour', 'week']

In [80]:
print(X_train.columns)
print(X_test.columns)

Index(['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'month', 'day_of_month', 'day_of_week', 'hour', 'is_weekend',
       'is_night', 'rush_hour', 'week', 'quarter', 'trip_distance_km',
       'pickup_in_manhattan', 'dropoff_in_manhattan', 'jfk_drop_distance',
       'lga_drop_distance', 'ewr_drop_distance', 'met_drop_distance',
       'wtc_drop_distance', 'trip_crosses_manhattan'],
      dtype='object')
Index(['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'month', 'day_of_month', 'day_of_week', 'hour', 'is_weekend',
       'is_night', 'rush_hour', 'week', 'quarter', 'trip_distance_km',
       'pickup_in_manhattan', 'dropoff_in_manhattan', 'jfk_drop_distance',
       'lga_drop_distance', 'ewr_drop_distance', 'met_drop_distance',
       'wtc_drop_distance', 'trip_crosses_manhattan'],
      dtype='objec

In [81]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), numeric_columns),   
        ("cat", OneHotEncoder(
            handle_unknown = "ignore",  
            sparse_output = False,      
            dtype = "uint8"        
        ), categorical_columns)
    ],
    verbose_feature_names_out = False,
)

preprocessor.set_output(transform = "pandas")

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,'uint8'
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [82]:
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_test  = preprocessor.transform(X_test)

In [83]:
X_train.shape, X_valid.shape, X_test.shape

((2136815, 145), (534204, 145), (9914, 145))

In [84]:
encoded_cols = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_columns)

In [85]:
encoded_columns = encoded_cols.tolist()

In [86]:
input_columns = encoded_columns + numeric_columns
X_train = X_train[input_columns]
X_valid = X_valid[input_columns]
X_test  = X_test[input_columns]
X_train.shape, X_valid.shape, X_test.shape

((2136815, 145), (534204, 145), (9914, 145))

Transform into PyTorch Tensors

In [87]:
import torch
from torch import nn

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'mps'

In [88]:
X_train_tensor = torch.tensor(X_train.to_numpy(dtype = "float32"), device = device)
y_train_tensor = torch.tensor(y_train.to_numpy(dtype = "float32").reshape(-1, 1), device = device)

X_valid_tensor = torch.tensor(X_valid.to_numpy(dtype = "float32"), device = device)
y_valid_tensor = torch.tensor(y_valid.to_numpy(dtype = "float32").reshape(-1, 1), device = device)

X_test_tensor  = torch.tensor(X_test.to_numpy(dtype = "float32"), device = device)

In [89]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

train_loader = DataLoader(train_dataset, batch_size = 8192, shuffle = True, drop_last = True)
valid_loader = DataLoader(valid_dataset, batch_size = 8192, shuffle = False)

Create Model

In [101]:
class TaxiFareModel(nn.Module):
    def __init__(self, input_features):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_features, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(p = 0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(p = 0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.layers(x)

In [102]:
input_size = X_train_tensor.shape[1]
torch.manual_seed(123)
model = TaxiFareModel(input_size).to(device)

In [103]:
model

TaxiFareModel(
  (layers): Sequential(
    (0): Linear(in_features=145, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.1, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): ReLU()
    (10): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [104]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

Training Loop

In [105]:
epochs = 18
torch.manual_seed(123)
for epoch in range(epochs + 1):

    # ----Training----
    model.train()
    total_train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        predictions = model(X_batch)
        loss = loss_fn(predictions, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)

    # ----Validation----
    model.eval()
    total_valid_loss = 0
    with torch.inference_mode():
        for X_batch, y_batch in valid_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions = model(X_batch)
            loss = loss_fn(predictions, y_batch)
            
            total_valid_loss += loss.item()
        avg_valid_loss = total_valid_loss / len(valid_loader)
        
    print(f"Epoch {epoch:02d} | Train MSE: {avg_train_loss:.6f} | Valid MSE: {avg_valid_loss:.6f}")

Epoch 00 | Train MSE: 33.415930 | Valid MSE: 13.531774
Epoch 01 | Train MSE: 13.903462 | Valid MSE: 13.190116
Epoch 02 | Train MSE: 13.529364 | Valid MSE: 12.974375
Epoch 03 | Train MSE: 13.234944 | Valid MSE: 12.741170
Epoch 04 | Train MSE: 12.902288 | Valid MSE: 12.358016
Epoch 05 | Train MSE: 12.709338 | Valid MSE: 12.388647
Epoch 06 | Train MSE: 12.582399 | Valid MSE: 12.182323
Epoch 07 | Train MSE: 12.474762 | Valid MSE: 12.116239
Epoch 08 | Train MSE: 12.351425 | Valid MSE: 12.109476
Epoch 09 | Train MSE: 12.267848 | Valid MSE: 12.011122
Epoch 10 | Train MSE: 12.166614 | Valid MSE: 11.979307
Epoch 11 | Train MSE: 12.111968 | Valid MSE: 12.009171
Epoch 12 | Train MSE: 12.002856 | Valid MSE: 11.894768
Epoch 13 | Train MSE: 11.951046 | Valid MSE: 12.046024
Epoch 14 | Train MSE: 11.890669 | Valid MSE: 11.784234
Epoch 15 | Train MSE: 11.801957 | Valid MSE: 11.871897
Epoch 16 | Train MSE: 11.694641 | Valid MSE: 11.819107
Epoch 17 | Train MSE: 11.643963 | Valid MSE: 11.803606
Epoch 18 |

In [121]:
model.eval()
with torch.inference_mode():
    y_test_pred = model(X_test_tensor)
y_test_pred_np = y_test_pred.cpu().numpy().squeeze()

In [122]:
results = pd.DataFrame(
    {
        'key': test_df['key'].values,
        'fare_amount': y_test_pred_np
    }
)
results.to_csv('pytorch_preds.csv', index = None)