# CZ4041 Machine Learning Project
## Kaggle Competition: New York City Taxi Trip Duration Prediction

## Source Code by Group 21
---

In this notebook, we have created a Neural Network model for NYC Taxi Trip Duration Prediction using TensorFlow 2 / Keras.

This notebook is provided separately to allow the notebook to be run independently, and to be run in parallel in multiple sessions (i.e. separate Google Colaboratory sessions for increased parallelization).

In [None]:
# Essential libraries
import pandas as pd  # Pandas for using dataframes and reading/writing CSVs 
import numpy as np   # Numpy for vector operations and basic math
import tensorflow as tf # For Deep Learning - to work with neural networks

# To split the original train dataset into train and validation sets
from sklearn.model_selection import train_test_split 

# Additional libraries
import warnings
warnings.filterwarnings('ignore')

### Read the complete train and test datasets after saving them in the main notebook (EDA_FeatureEngineering_XGBoost.ipynb).

In [None]:
# Load the engineered train/test tables produced by the main notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_complete.csv', 'test_complete.csv')
)
print(f'Shape of Train Set: {train.shape}\nShape of Test Set: {test.shape}')

In [None]:
# Preview the first rows to confirm the train data has the expected columns/features.
train.head(n=5)

In [None]:
# Selected input columns, one per line, in the order they are fed to the network.
# The section comments below group entries by column-name pattern only.
feature_names = [
    # --- trip attributes and coordinates ---
    'vendor_id',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'store_and_fwd_flag',
    # --- *_pca* columns ---
    'pickup_pca0',
    'pickup_pca1',
    'dropoff_pca0',
    'dropoff_pca1',
    # --- distance / direction / center columns ---
    'distance_haversine',
    'distance_dummy_manhattan',
    'direction',
    'pca_manhattan',
    'center_latitude',
    'center_longitude',
    # --- pickup_* time columns ---
    'pickup_weekday',
    'pickup_month',
    'pickup_hour_weekofyear',
    'pickup_hour',
    'pickup_minute',
    'pickup_dt',
    'pickup_week_hour',
    'pickup_day',
    'pickup_week',
    'pickup_minute_of_the_day',
    'pickup_dayofyear',
    # --- time-of-day / calendar indicator columns ---
    'pickup_am',
    'night_trip',
    'rush_hour',
    'weekday',
    'pickup_is_weekend',
    'pickup_holiday',
    'pickup_near_holiday',
    'pickup_businessday',
    # --- cluster columns ---
    'pickup_cluster',
    'dropoff_cluster',
    # --- *_gby_* aggregate columns ---
    'avg_speed_h_gby_pickup_hour',
    'avg_speed_m_gby_pickup_hour',
    'log_trip_duration_gby_pickup_hour',
    'avg_speed_h_gby_pickup_date',
    'avg_speed_m_gby_pickup_date',
    'log_trip_duration_gby_pickup_date',
    'avg_speed_h_gby_pickup_dt_bin',
    'avg_speed_m_gby_pickup_dt_bin',
    'log_trip_duration_gby_pickup_dt_bin',
    'avg_speed_h_gby_pickup_week_hour',
    'avg_speed_m_gby_pickup_week_hour',
    'log_trip_duration_gby_pickup_week_hour',
    'avg_speed_h_gby_pickup_cluster',
    'avg_speed_m_gby_pickup_cluster',
    'log_trip_duration_gby_pickup_cluster',
    'avg_speed_h_gby_dropoff_cluster',
    'avg_speed_m_gby_dropoff_cluster',
    'log_trip_duration_gby_dropoff_cluster',
    # --- avg_speed_h_* / cnt_* columns over combined keys ---
    'avg_speed_h_center_lat_bin_center_long_bin',
    'cnt_center_lat_bin_center_long_bin',
    'avg_speed_h_pickup_hour_center_lat_bin_center_long_bin',
    'cnt_pickup_hour_center_lat_bin_center_long_bin',
    'avg_speed_h_pickup_hour_pickup_cluster',
    'cnt_pickup_hour_pickup_cluster',
    'avg_speed_h_pickup_hour_dropoff_cluster',
    'cnt_pickup_hour_dropoff_cluster',
    'avg_speed_h_pickup_cluster_dropoff_cluster',
    'cnt_pickup_cluster_dropoff_cluster',
    # --- count columns ---
    'count_60min',
    'dropoff_cluster_count',
    'pickup_cluster_count',
    # --- route columns ---
    'total_distance',
    'total_travel_time',
    'number_of_steps',
]

In [None]:
# Report how many input features were selected.
n_features = len(feature_names)
print('We have %i features.' % n_features)

# Main Neural Networks Code

## Setting Hyperparameters

In [None]:
# Hyperparameters for this run -- edit the values below to try a different configuration.
# Note: the learning rate is NOT set here; it is set manually in the next cell.

# Network size
NUM_LAYERS = 40
NUM_NEURONS = 400

# Optimizer name; choose one of: sgd, momentum, rmsprop, adam
OPTIMIZER = "adam"

# Regularization
L2_REGULARIZATION = 5e-5  # L2 penalty factor for the dense-layer kernels
DROP_RATE = 0             # Rate for Dropout

# Training schedule
EPOCHS = 100
BATCH_SIZE = 128

In [None]:
# Code to choose optimizer
# Learning rates are set manually per optimizer below.
# The branches must form a single if/elif/else chain: with independent `if`
# statements the final `else` pairs only with the "rmsprop" test, so every other
# choice (e.g. "sgd" or "momentum") would be overwritten by the Adam fallback.
if OPTIMIZER == "adam":
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False)
elif OPTIMIZER == "sgd":
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0, nesterov=False)
elif OPTIMIZER == "momentum":
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=False) # Can vary momentum if needed
elif OPTIMIZER == "rmsprop":
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.9, momentum=0.0, epsilon=1e-7)
else:
    # Unrecognised name: keep the original fallback to Adam, but say so explicitly.
    print(f'Unknown OPTIMIZER {OPTIMIZER!r}; falling back to adam.')
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-7, amsgrad=False)

## Feature Scaling

In [None]:
# Standardize the selected feature columns of the train set.
# NOTE: the scaler is fitted on every train row here, i.e. before the
# train/validation split performed later in the notebook.
from sklearn.preprocessing import StandardScaler

train_ = train[feature_names].values
sc = StandardScaler()
sc.fit(train_)
train_normalized = sc.transform(train_)

## Splitting Training Set into Train and Validation sets

First, we create the "y" vector, which stores the natural log of (target trip duration + 1).

The train and validation sets are then wrapped into TensorFlow datasets of (features, target) pairs.

In [None]:
# Regression target: log(trip_duration + 1), computed with log1p (the numerically
# stable form of log(1 + x)).
y = np.log1p(train['trip_duration'].values)

In [None]:
# Training
# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(train_normalized, y, test_size=0.2, random_state=7) # Random state 7 to improve consistency over multiple runs

# Replace NaNs with 0 in place (copy=False) so no second copy of the matrices is made.
X_train = np.nan_to_num(X_train, copy=False, nan=0)
X_val = np.nan_to_num(X_val, copy=False, nan=0)

# Build (features, target) datasets straight from the NumPy arrays.
# Going through .tolist() would first create one Python float object per matrix
# element, which is very slow and memory-hungry at this data size. The float32
# cast keeps the element dtype the list-based construction produced.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train.astype(np.float32), y_train.astype(np.float32))
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (X_val.astype(np.float32), y_val.astype(np.float32))
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

## Batching the train and validation sets

In [None]:
# Group both datasets into mini-batches of BATCH_SIZE examples.
batched_train_ds = train_dataset.batch(batch_size=BATCH_SIZE)
batched_val_ds = val_dataset.batch(batch_size=BATCH_SIZE)

## Building the model using TensorFlow's Sequential API

In [None]:
# Network: BatchNormalization on the inputs, NUM_LAYERS ReLU Dense layers with
# L2 kernel regularization, then a single linear output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.BatchNormalization())

# Dropout Regularization is controlled by DROP_RATE: when it is > 0, a Dropout
# layer is added after every `dropout_every`-th hidden layer (about five Dropout
# layers across the stack). With DROP_RATE = 0 no Dropout layer is added.
dropout_every = max(1, NUM_LAYERS // 5)  # max(1, ...) guards against NUM_LAYERS < 5

for i in range(NUM_LAYERS):
    model.add(tf.keras.layers.Dense(NUM_NEURONS, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(L2_REGULARIZATION)))

    if DROP_RATE > 0 and (i + 1) % dropout_every == 0:
        model.add(tf.keras.layers.Dropout(rate=DROP_RATE))

model.add(tf.keras.layers.Dense(1, activation=None))

# Training the Neural Network Model!

In [None]:
# Configure training: mean squared error loss, with RMSE reported as a metric.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])

# Fit on the batched train set for EPOCHS epochs, validating on the batched validation set.
history = model.fit(batched_train_ds,
                    epochs=EPOCHS,
                    validation_data=batched_val_ds)

In [None]:
# Loss and RMSE of the trained model on the validation batches
model.evaluate(batched_val_ds)

# Producing the Final Predictions on the Original Kaggle NYC Test Set

In [None]:
# Scale the test features with the scaler that was already fitted on the train set
# (transform only -- the scaler is not re-fitted here).
test_features = test[feature_names].values
test_normalized = sc.transform(test_features)

In [None]:
# Preparing the test data for use with the neural network model
final_test = test_normalized
# Replace NaNs with 0 in place (copy=False), as was done for the train/validation features.
final_test = np.nan_to_num(final_test, copy=False, nan=0)

# Build the dataset straight from the NumPy array. Going through .tolist() would
# first create one Python float object per matrix element, which is very slow and
# memory-hungry. The float32 cast keeps the element dtype the list-based
# construction produced.
final_test_ds = tf.data.Dataset.from_tensor_slices(final_test.astype(np.float32))
batched_final_test = final_test_ds.batch(BATCH_SIZE)

In [None]:
# Producing the final Kaggle submission CSV for neural networks!
ytest = model.predict(batched_final_test)

# Stop here rather than write a malformed submission if the row counts disagree.
assert ytest.shape[0] == test.shape[0], (
    f'Expected {test.shape[0]} predictions, got {ytest.shape[0]}'
)
print('Test shape OK.')

# The ids must come from the loaded test table: no `test_original` frame is
# defined anywhere in this notebook, so referencing it fails on a fresh kernel.
if 'id' not in test.columns:
    raise KeyError(
        "'id' column not found in test_complete.csv; re-export the test set "
        "with its 'id' column before creating the submission."
    )

# The model ends in a single-unit Dense layer, so predictions carry a trailing
# axis of size 1; flatten to one value per row before assigning the column.
# expm1 inverts the log(trip_duration + 1) target transform.
test['trip_duration'] = np.expm1(ytest.ravel())
test[['id', 'trip_duration']].to_csv('submission_neural_network.csv', index=False)

## That's the end of the code for Neural Networks and this project as a whole! Thanks for reading!