In [1]:
import pandas as pd
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.metrics import mean_squared_log_error
import keras

In [2]:
#Dataset load
ds_train = pd.read_csv("/clear_dataset.zip" , compression="zip")

In [3]:
ds_train.head()

Unnamed: 0,passenger_count,trip_duration,month_pickup,dow_pickup,hour_pickup,BoroCode,BoroName,dist_km,average_speed
0,1,455,3,0,17,1,Manhattan,1.498521,11.856428
1,1,663,6,6,0,1,Manhattan,1.805507,9.803659
2,1,2124,1,1,11,1,Manhattan,6.385098,10.822201
3,1,429,4,2,19,1,Manhattan,1.485498,12.465721
4,1,435,3,5,13,1,Manhattan,1.188588,9.836594


### One hot encoding categorical features

In [4]:
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('int64')):
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            #drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df
ds_train = oneHotEncode(ds_train, ["BoroCode"])

### Circular predictors for hours, month and day_of_week variables

In [5]:
ds_train['sin_hour_pickup'] = np.sin(2*np.pi*ds_train.hour_pickup/24)
ds_train['cos__hour_pickup'] = np.cos(2*np.pi*ds_train.hour_pickup/24)
ds_train.drop('hour_pickup', axis=1, inplace=True)

In [6]:
ds_train['sin_month_pickup'] = np.sin(2*np.pi*ds_train.month_pickup/12)
ds_train['cos__month_pickup'] = np.cos(2*np.pi*ds_train.month_pickup/12)
ds_train.drop('month_pickup', axis=1, inplace=True)

In [7]:
ds_train['sin_dow_pickup'] = np.sin(2*np.pi*ds_train.dow_pickup/7)
ds_train['cos__dow_pickup'] = np.cos(2*np.pi*ds_train.dow_pickup/7)
ds_train.drop('dow_pickup', axis=1, inplace=True)

## Feature selection


Removal average speed since it is calculated based on the predicted variable

In [8]:
ds_train.drop(['average_speed', "BoroName"], axis=1, inplace=True)

# Train neural network

In [9]:
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = ds_train.shape[1]-1, activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss=keras.losses.MeanSquaredLogarithmicError(), optimizer='adam', metrics=keras.losses.MeanSquaredLogarithmicError())
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1792      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 166,657
Trainable params: 166,657
Non-trainable params: 0
_________________________________________________________________


In [10]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [None]:
y_values = ds_train["trip_duration"]
x_values = ds_train.drop(["trip_duration"],axis=1)
NN_model.fit(x_values, y_values, epochs=200, batch_size=256, validation_split = 0.2, callbacks=callbacks_list)


Epoch 1/200
Epoch 1: val_loss did not improve from 0.19675
Epoch 2/200
Epoch 2: val_loss did not improve from 0.19675
Epoch 3/200
Epoch 3: val_loss did not improve from 0.19675
Epoch 4/200
Epoch 4: val_loss did not improve from 0.19675
Epoch 5/200
Epoch 5: val_loss did not improve from 0.19675
Epoch 6/200
Epoch 6: val_loss did not improve from 0.19675
Epoch 7/200
Epoch 7: val_loss did not improve from 0.19675
Epoch 8/200
Epoch 8: val_loss did not improve from 0.19675
Epoch 9/200
Epoch 9: val_loss did not improve from 0.19675
Epoch 10/200
Epoch 10: val_loss did not improve from 0.19675
Epoch 11/200
Epoch 11: val_loss did not improve from 0.19675
Epoch 12/200
Epoch 12: val_loss did not improve from 0.19675
Epoch 13/200
Epoch 13: val_loss did not improve from 0.19675
Epoch 14/200
Epoch 14: val_loss did not improve from 0.19675
Epoch 15/200
Epoch 15: val_loss did not improve from 0.19675
Epoch 16/200
Epoch 16: val_loss did not improve from 0.19675
Epoch 17/200
Epoch 17: val_loss did not im