# TabNet Modeling - Predicting Taxi Trip Durations in NYC

This notebook experiments with the TabNet neural network architecture for learning on tabular data.

## Set up

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# seaborn defaults for every plot in the notebook, with the white-grid axes style
sns.set_theme(style='whitegrid')

In [2]:
import pandas as pd
import numpy as np

from load_preprocess_data import load_train_data, load_test_data

In [3]:
# load data: both frames come from the project-local load_preprocess_data helpers;
# test_data is also read later by create_submission
train_data = load_train_data('data/W22P1_train.csv')
test_data = load_test_data('data/W22P1_test.csv')

In [4]:
# preview the first rows of the training frame
train_data.head()

Unnamed: 0_level_0,pickup_datetime,dayofweek,hour,passenger_count,distance_km,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,log_trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2016-01-07 19:32:15,3,19,1,1.2597,-73.986389,40.756615,-73.999794,40.761631,520,6.253829
1,2016-01-27 08:07:32,2,8,1,2.35665,-73.956039,40.767609,-73.968201,40.78669,989,6.896694
2,2016-01-31 13:52:55,6,13,1,2.806862,-73.975998,40.751137,-74.001854,40.735229,657,6.487684
3,2016-01-19 08:00:19,1,8,3,3.15551,-73.960121,40.781952,-73.97197,40.755039,1035,6.942157
4,2016-01-25 23:32:14,0,23,1,1.725446,-73.987434,40.760139,-73.990982,40.744862,621,6.431331


In [5]:
# every column present in the test frame
all_covariates = test_data.columns.tolist()

# subset that leaves out 'dayofweek' and 'distance_km'
original_covariates = [
    'hour', 'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]

# numerical covariates
numerical_covariates = [
    'hour', 'passenger_count', 'distance_km',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]

# categorical + numerical covariates: 'dayofweek' followed by the numerical ones
cat_numerical_covariates = ['dayofweek', *numerical_covariates]

print('covariates: ', all_covariates)

covariates:  ['pickup_datetime', 'dayofweek', 'hour', 'passenger_count', 'distance_km', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']


In [6]:
# train-test split the training data (so that we can evaluate without submitting);
# random_state is fixed so the hold-out split, and every score computed on it,
# is the same each time the notebook is re-run from a fresh kernel
from sklearn.model_selection import train_test_split
train_train_data, train_test_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [7]:
def create_X_y(train_data, test_data, covariates, label):
    '''Split a train frame and a test frame into features and target.

    Selects the `covariates` columns and the `label` column from each frame
    and returns them as (X_train, X_test, y_train, y_test).
    '''
    frames = (train_data, test_data)
    features = tuple(frame[covariates] for frame in frames)
    targets = tuple(frame[label] for frame in frames)

    return (*features, *targets)

In [8]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error

def eval_model(model, X, y, metric='rmsle', log=False):
    '''Evaluate `model` on the data (X, y) with the given metric.

    Parameters
    ----------
    model : object exposing `predict(X)`
    X : features passed unchanged to `model.predict`
    y : true target values
    metric : one of 'rmsle', 'msle', 'mse', 'rmse', 'mae'
    log : if True, both the predictions and `y` are passed through `np.exp`
        before scoring

    Returns
    -------
    The value of the requested metric.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names.
    '''

    y_pred = model.predict(X)
    if log:
        y_pred = np.exp(y_pred)
        y = np.exp(y)

    if metric == 'rmsle':
        return np.sqrt(mean_squared_log_error(y, y_pred))
    # the comparison must be against `metric`: a bare string literal is always
    # truthy and would swallow every branch below
    elif metric == 'msle':
        return mean_squared_log_error(y, y_pred)
    elif metric == 'mse':
        return mean_squared_error(y, y_pred)
    elif metric == 'rmse':
        return np.sqrt(mean_squared_error(y, y_pred))
    elif metric == 'mae':
        return mean_absolute_error(y, y_pred)
    else:
        raise ValueError(
            f"unknown metric {metric!r}; expected one of "
            "'rmsle', 'msle', 'mse', 'rmse', 'mae'"
        )

In [9]:
def create_submission(model, covariates, log=False):
    '''Predict on the notebook-level `test_data` and wrap the result in a frame.

    The `covariates` columns of `test_data` are passed to `model.predict`; when
    `log` is True the predictions are passed through `np.exp`. The returned
    DataFrame has a single 'trip_duration' column indexed like `test_data`.
    '''
    predictions = model.predict(test_data[covariates])
    if log:
        predictions = np.exp(predictions)

    return pd.DataFrame(
        data=predictions,
        index=test_data.index,
        columns=['trip_duration'],
    )

## TabNet

> Paper: https://arxiv.org/pdf/1908.07442.pdf

> TensorFlow implementation used below: https://github.com/titu1994/tf-TabNet

In [11]:
from tabnet import TabNetRegressor
import tensorflow as tf

# TabNet is fed the numerical covariates only
col_names = numerical_covariates

# one numeric feature column per covariate
feature_columns = [tf.feature_column.numeric_column(name) for name in col_names]

# the target is the log of the trip duration
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, col_names, 'log_trip_duration'
)

# batch size is 1/100 of the training rows (integer division)
BATCH_SIZE = len(X_train) // 100

# dict(frame) maps each column name to its column, so features are keyed by name
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((dict(X_train), y_train))
    .batch(BATCH_SIZE)
)
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((dict(X_test), y_test))
    .batch(BATCH_SIZE)
)

2022-03-23 14:08:46.965740: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-03-23 14:08:46.988063: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [12]:
# TabNet regressor with a single regression output
model = TabNetRegressor(
    feature_columns=feature_columns,
    num_regressors=1,
    feature_dim=256,
    output_dim=128,
)

[TabNet]: 128 features will be used for decision steps.


In [13]:
# alternatives kept for reference (not active):
# lr = tf.keras.optimizers.schedules.ExponentialDecay(0.01, decay_steps=100, decay_rate=0.9, staircase=False)
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
optimizer = tf.keras.optimizers.Adamax(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    name="Adamax",
)
rmse_metric = tf.keras.metrics.RootMeanSquaredError()

# train on MSE of the target and track RMSE alongside it
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mse', rmse_metric],
)

In [14]:
# train for 100 epochs, validating against the held-out split
model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=100,
    verbose=2,
)

Epoch 1/100
100/100 - 29s - loss: 2.4443 - mse: 2.4443 - root_mean_squared_error: 1.5634 - val_loss: 0.5713 - val_mse: 0.5713 - val_root_mean_squared_error: 0.7558 - 29s/epoch - 288ms/step
Epoch 2/100
100/100 - 15s - loss: 0.6208 - mse: 0.6208 - root_mean_squared_error: 0.7879 - val_loss: 0.5695 - val_mse: 0.5695 - val_root_mean_squared_error: 0.7546 - 15s/epoch - 145ms/step
Epoch 3/100
100/100 - 15s - loss: 0.6191 - mse: 0.6191 - root_mean_squared_error: 0.7868 - val_loss: 0.5667 - val_mse: 0.5667 - val_root_mean_squared_error: 0.7528 - 15s/epoch - 147ms/step
Epoch 4/100
100/100 - 15s - loss: 0.6167 - mse: 0.6167 - root_mean_squared_error: 0.7853 - val_loss: 0.5647 - val_mse: 0.5647 - val_root_mean_squared_error: 0.7515 - 15s/epoch - 152ms/step
Epoch 5/100
100/100 - 15s - loss: 0.6149 - mse: 0.6149 - root_mean_squared_error: 0.7841 - val_loss: 0.5625 - val_mse: 0.5625 - val_root_mean_squared_error: 0.7500 - 15s/epoch - 146ms/step
Epoch 6/100
100/100 - 16s - loss: 0.6128 - mse: 0.6128 

<keras.callbacks.History at 0x7f168dee71d0>

In [15]:
# train RMSLE: root-mean-square error between predicted and true log_trip_duration
np.sqrt(np.mean(np.square(np.ravel(model.predict(train_ds)) - y_train)))

0.6034718739799461

In [16]:
# test RMSLE: root-mean-square error between predicted and true log_trip_duration
np.sqrt(np.mean(np.square(np.ravel(model.predict(test_ds)) - y_test)))

0.562103271344189