In [1]:
import matplotlib
matplotlib.use("TkAgg")
import import_ipynb
import utils
from utils import preprocess, feature_engineer
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from sklearn.metrics import mean_squared_error

importing Jupyter notebook from utils.ipynb


Using TensorFlow backend.


In [2]:
# Read the first 500,000 rows of the taxi CSV, parsing 'pickup_datetime' as datetimes
df = pd.read_csv(
    'NYC_taxi.csv',
    nrows = 500000,
    parse_dates = ['pickup_datetime'],
)

In [3]:
# Run the preprocessing step, then the feature-engineering step (both helpers are
# imported from utils); the result replaces the raw frame under the same name.
df = feature_engineer(preprocess(df))

In [4]:
# Standardise the feature columns with sklearn's scale(); the target column
# 'fare_amount' is carried over unscaled.
df_prescaled = df.copy()  # unscaled snapshot, read later for trip details and the actual fare
df_scaled = df.drop(columns = 'fare_amount')
cols = df_scaled.columns.tolist()
# NOTE(review): scale() sees every row here, i.e. before the train/test split, so the
# test rows contribute to the scaling. Consider fitting a scaler on training rows only.
df_scaled = (
    pd.DataFrame(scale(df_scaled), columns = cols, index = df.index)
    .assign(fare_amount = df['fare_amount'])  # target appended as the last column
)
df = df_scaled.copy()

In [5]:
# Split the dataframe into a training and testing set (80% train / 20% test)
X = df.loc[:, df.columns != 'fare_amount']  # every column except the target
y = df.fare_amount
# A fixed random_state makes the same rows land in train and test on every
# Restart & Run All; without it the membership of the test set changes per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
# Build neural network in Keras: four ReLU hidden layers (128 -> 64 -> 32 -> 8)
# feeding a single output unit.
n_features = X_train.shape[1]  # one input per feature column
model = Sequential([
    Dense(128, activation = 'relu', input_dim = n_features),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(8, activation = 'relu'),
    Dense(1),  # no activation given: the unit outputs the fare estimate as-is
])

In [7]:
# Optimise mean squared error with Adam; MSE is additionally tracked as a metric
model.compile(
    optimizer = 'adam',
    loss = 'mse',
    metrics = ['mse'],
)

In [8]:
# Train on the training split for five epochs (batch size is left at the Keras default)
model.fit(x = X_train, y = y_train, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x20cd806b4e0>

In [11]:
# Results: root-mean-squared error of the fitted model on the train and test splits
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, train_pred), (y_test, test_pred))
)
print("Train RMSE: {:0.2f}".format(train_rmse))
print("Test RMSE: {:0.2f}".format(test_rmse))

Train RMSE: 3.34
Test RMSE: 3.29


In [10]:
def predict_random(df_prescaled, X_test, model, random_state = None):
    """Print the actual and predicted fare for one randomly chosen test trip.

    Parameters
    ----------
    df_prescaled : pandas.DataFrame
        Unscaled frame whose index labels match those of ``X_test``; the
        columns 'fare_amount', 'day_of_week' and 'hour' are read from it.
    X_test : pandas.DataFrame
        Feature rows to draw from; the sampled row is passed to the model as-is.
    model : object
        Fitted model whose ``predict(rows)`` returns a 2-D array-like; element
        ``[0][0]`` is taken as the predicted fare.
    random_state : int, optional
        Seed for the row draw. Pass a value to show the same trip on every
        call; when omitted, a seed is drawn from NumPy's global generator
        (the previous behaviour).

    Returns
    -------
    None
        The trip summary is printed, not returned.

    Raises
    ------
    ValueError
        If ``X_test`` has no rows to sample from (raised by ``DataFrame.sample``).
    """
    if random_state is None:
        random_state = np.random.randint(low = 0, high = 10000)
    sample = X_test.sample(n = 1, random_state = random_state)
    idx = sample.index[0]

    # Look the trip up in the unscaled frame so the printed values are human-readable.
    actual_fare = df_prescaled.loc[idx, 'fare_amount']
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # int(): list indexing rejects a float-typed day number such as 5.0
    day_of_week = day_names[int(df_prescaled.loc[idx, 'day_of_week'])]
    hour = df_prescaled.loc[idx, 'hour']
    predicted_fare = model.predict(sample)[0][0]
    # For a single trip the RMSE reduces to the absolute error.
    rmse = abs(predicted_fare - actual_fare)

    print("Trip Details: {}, {}:00hrs".format(day_of_week, hour))
    print("Actual fare: ${:0.2f}".format(actual_fare))
    print("Predicted fare: ${:0.2f}".format(predicted_fare))
    print("RMSE: ${:0.2f}".format(rmse))

# Show one randomly drawn test trip with its actual and predicted fare
predict_random(df_prescaled = df_prescaled, X_test = X_test, model = model)

Trip Details: Saturday, 21:00hrs
Actual fare: $16.50
Predicted fare: $16.94
RMSE: $0.44
