# **In this notebook, I will build a baseline model for predicting the aggregated demand from T+1 to T+5**

In [3]:
import pandas as pd
import numpy as np
import geohash2
import os
import tensorflow as tf

import keras 
from keras import backend as K
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
tf.enable_eager_execution()

Using TensorFlow backend.


In [None]:
data_dir = '/home/angps/Documents/GrabChallenge/Traffic Management/Data'

In [None]:
df = pd.read_csv(os.path.join(data_dir,'cleaned_training.csv'))

In [5]:
df.head()

Unnamed: 0,geohash6,day,demand,latitude,longitude,latitude_error,longitude_error,Hour,Minute,Period
0,qp03wc,18,0.020072,-5.353088,90.653687,0.002747,0.005493,20,0,1712
1,qp03pn,10,0.024721,-5.413513,90.664673,0.002747,0.005493,14,30,922
2,qp09sw,9,0.102821,-5.325623,90.906372,0.002747,0.005493,6,15,793
3,qp0991,32,0.088755,-5.353088,90.752563,0.002747,0.005493,5,0,2996
4,qp090q,15,0.074468,-5.413513,90.719604,0.002747,0.005493,4,0,1360


## **Baseline Model**

**We will only use data from day 15 onwards so every data point has a previous full 14 day demand data. We would also split the dataset 80-20 for training and test set**

In [None]:
df = df[df['day'] >= 15]
train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
train_df = train_df.reset_index(drop=True, inplace=False)
val_df = val_df.reset_index(drop=True, inplace=False)

In [7]:
train_df.shape, val_df.shape

((2607932, 10), (651984, 10))

In the baseline model, we will use the past hour, day and week aggregated demand as the input to a simple LSTM model.

In [None]:
def dayhourmin_to_period(day, hour, minute):
    return ((day-1) * 24 * 4) + (hour * 4) + minute//15

def period_to_dayhourmin(period):
    day = period//96 + 1
    hour = (period - (day-1) * 96)//4
    minute = (period - ((day-1) * 96) - (hour*4)) * 15
    return (day, hour, minute)

def get_demand_from_period(df, geohash, period):
    day, hour, minute = period_to_dayhourmin(period)
    demand_queried = df[(df['geohash6'] == geohash) & (df['day'] == day)
                & (df['Hour'] == hour) & (df['Minute'] == minute)]['demand'].values
    if len(demand_queried) > 0:
        return demand_queried[0]
    else:
        return 0
    
def get_past_demand(df, geohash, day, hour, minute, num_periods=24):
    period = dayhourmin_to_period(day, hour, minute)
    X = [get_demand_from_period(df, geohash, period - i) for i in range(1, num_periods + 1)] \
            + [get_demand_from_period(df, geohash, period - 96), get_demand_from_period(df, geohash, period - 96*7), get_demand_from_period(df, geohash, period - 96*14)]
    return np.array(X)

def get_future_demand(df, geohash, day, hour, minute, num_periods=5):
    period = dayhourmin_to_period(day, hour, minute)
    return np.array([get_demand_from_period(df, geohash, period + i) for i in range(num_periods)])

In [9]:
def fc_layer(inputs, output_units, batch_norm=True):
    net = tf.keras.layers.Dense(output_units)(inputs)
    if batch_norm:
        net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Activation('relu')(net)
    return net

def baseline_model():
    _input = tf.keras.layers.Input(shape=(27,1))
    net = tf.keras.layers.CuDNNLSTM(units=8)(_input)
    #net = fc_layer(net, 16)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Activation('relu')(net)
    net = tf.keras.layers.Dense(5, kernel_initializer='normal', activation='relu')(net)
    model = tf.keras.models.Model(inputs=_input, outputs=net)
    return model

model = baseline_model()

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

model.compile(optimizer = "rmsprop", loss = root_mean_squared_error, 
              metrics =["mean_squared_error"])

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 27, 1)             0         
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 8)                 352       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 8)                 32        
_________________________________________________________________
activation (Activation)      (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 5)                 45        
Total params: 429
Trainable params: 413
Non-trainable params: 16
_________________________________________________________________


In [None]:

def data_gen(typ, batch_size, past_periods=27):
    import time
    if typ == 'train':
        df = train_df
    elif typ == 'val':
        df = val_df
    max_rows = len(df)
    while True:
        past_demands = np.zeros((batch_size, past_periods))
        future_demands = np.zeros((batch_size, 5))
        rows = np.random.choice(a=max_rows, size=batch_size, replace=False)
        for i in range(len(rows)):
            row = rows[i]
            geohash = df.loc[row, 'geohash6']
            day = df.loc[row, 'day']
            hour = df.loc[row, 'Hour']
            minute = df.loc[row, 'Minute']
            past_demands[i] = get_past_demand(df, geohash, day, hour, minute)
            future_demands[i] = get_future_demand(df, geohash, day, hour, minute)
        yield np.reshape(past_demands, (batch_size, past_periods,1)), future_demands


In [12]:
train_gen = data_gen('train', batch_size=4)
val_gen = data_gen('val', batch_size=4)

# train_gen = DataGenerator(df=train_df)
# val_gen = DataGenerator(df=val_df)
num_epochs = 2
base_lr = 0.001


def lr_linear_decay(epoch):
    return (base_lr * (1 - (epoch/num_epochs)))

history = model.fit_generator(generator=train_gen,
                              validation_data=val_gen,
                              steps_per_epoch=20,
                              validation_steps=20,
                              max_queue_size=10,
                              epochs=num_epochs,
                              verbose=1)

Epoch 1/2
Instructions for updating:
Use tf.cast instead.
Epoch 2/2


In [13]:
lat = -5.331116
long = 90.631714
day=24
hour=22
minute=0
feat = get_past_demand(df, geohash2.encode(lat,long), day, hour, minute)
model.predict(np.reshape(feat, (-1,27,1)))

array([[0.027203  , 0.0164283 , 0.01745857, 0.01381193, 0.01533502]],
      dtype=float32)

In [14]:
cor_demand = get_future_demand(df, geohash2.encode(lat,long), day, hour, minute)
root_mean_squared_error(cor_demand, model.predict(np.reshape(feat, (-1,27,1)))[0])

<tf.Tensor: id=1751, shape=(), dtype=float64, numpy=0.01865811462276602>