# World Data League 2021
## Notebook Template

This notebook is one of the mandatory deliverables when you submit your solution (alongside the video pitch). Its structure follows the WDL evaluation criteria and it has dedicated cells where you can add descriptions. Make sure your code is readable as it will be the only technical support the jury will have to evaluate your work.

The notebook must:

*   💻 have all the code that you want the jury to evaluate
*   🧱 follow the predefined structure
*   📄 have markdown descriptions where you find necessary
*   👀 be saved with all the output that you want the jury to see
*   🏃‍♂️ be runnable


## External links and resources
Paste here all the links to external resources that are necessary to understand and run your code. Add descriptions to make it clear how to use them during evaluation.

* http://158.101.207.63:8080/data/ for data
* http://158.101.207.63:8080/models/ for models

## Introduction
Describe how you framed the challenge by telling us what problem you are trying to solve and how your solution solves that problem.

## Development
Start coding here! 👩‍💻

Don't hesitate to create markdown cells to include descriptions of your work where you see fit, as well as commenting your code.

We know that you know exactly where to start when it comes to crunching data and building models, but don't forget that WDL is all about social impact...so take that into consideration as well.

In [None]:
import sys
import warnings
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model, load_model
from keras.utils.vis_utils import plot_model
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib as mpl
import matplotlib.pyplot as plt
import pydot_ng as pydot
import datetime as dt
import os

# Suppress every Python warning raised from here on, so that library
# warnings do not clutter the notebook output.
warnings.filterwarnings("ignore")

In [None]:
# Load one HDF file of traffic-flow observations per year.
# Each variable reads the file of its own year; loading the 2016 file under
# all four names would make the 2019 "test" set a copy of the training data.
# NOTE(review): the 2017-2019 file names are assumed to follow the 2016
# naming pattern -- confirm against the data directory.
traffic_flow_2016 = pd.read_hdf("../predicting-traffic-flow/data/traffic_flow_2016.hdf")
traffic_flow_2017 = pd.read_hdf("../predicting-traffic-flow/data/traffic_flow_2017.hdf")
traffic_flow_2018 = pd.read_hdf("../predicting-traffic-flow/data/traffic_flow_2018.hdf")
traffic_flow_2019 = pd.read_hdf("../predicting-traffic-flow/data/traffic_flow_2019.hdf")

def preprocess_data(train_df, test_df):
    """Group the train and test observations by sensor.

    Only the ``entity_id``, ``dateobservedfrom`` and ``intensity`` columns of
    each frame are kept; the rows are then grouped by ``entity_id``.

    Args:
        train_df: DataFrame with the training observations.
        test_df: DataFrame with the test observations.

    Returns:
        A ``(train_groups, test_groups)`` tuple of pandas GroupBy objects.
    """
    columns = ['entity_id', 'dateobservedfrom', 'intensity']

    # Group by the bare column name rather than a one-element list: iterating
    # then yields plain ``entity_id`` values as keys on every pandas version
    # (a one-element list yields 1-tuples on pandas >= 2.0, which breaks
    # string handling of the key downstream).
    train_d = train_df[columns].groupby('entity_id')
    test_d = test_df[columns].groupby('entity_id')

    return train_d, test_d

# The frames named 2016-2018 form the training set; the 2019 frame is held
# out as the test set. Both are grouped per sensor by preprocess_data.
training_years = [traffic_flow_2016, traffic_flow_2017, traffic_flow_2018]
train_df, test_df = preprocess_data(
    train_df=pd.concat(training_years),
    test_df=traffic_flow_2019,
)

# Drop every reference to the raw yearly frames; only the grouped data is
# used from here on.
del training_years
del traffic_flow_2016, traffic_flow_2017, traffic_flow_2018, traffic_flow_2019

In [None]:
def get_lstm(units):
    """Build an uncompiled two-layer LSTM network.

    Args:
        units: Sequence read as ``[window length, first LSTM width,
            second LSTM width, output width]``.

    Returns:
        A Keras ``Sequential`` model: LSTM -> LSTM -> Dropout(0.2) ->
        sigmoid Dense.
    """
    window, first_width, second_width, out_width = (
        units[0], units[1], units[2], units[3])

    return Sequential([
        # The first recurrent layer returns the full sequence so that the
        # second recurrent layer can consume it.
        LSTM(first_width, input_shape=(window, 1), return_sequences=True),
        LSTM(second_width),
        Dropout(0.2),
        Dense(out_width, activation='sigmoid'),
    ])


def get_gru(units):
    """Build an uncompiled two-layer GRU network.

    Args:
        units: Sequence read as ``[window length, first GRU width,
            second GRU width, output width]``.

    Returns:
        A Keras ``Sequential`` model: GRU -> GRU -> Dropout(0.2) ->
        sigmoid Dense.
    """
    window, first_width, second_width, out_width = (
        units[0], units[1], units[2], units[3])

    return Sequential([
        # The first recurrent layer returns the full sequence so that the
        # second recurrent layer can consume it.
        GRU(first_width, input_shape=(window, 1), return_sequences=True),
        GRU(second_width),
        Dropout(0.2),
        Dense(out_width, activation='sigmoid'),
    ])


def _get_sae(inputs, hidden, output):
    """Build one uncompiled single-hidden-layer network.

    Args:
        inputs: Input dimension.
        hidden: Width of the Dense layer named ``'hidden'``.
        output: Width of the final sigmoid Dense layer.

    Returns:
        A Keras ``Sequential`` model: Dense('hidden') -> sigmoid ->
        Dropout(0.2) -> sigmoid Dense.
    """
    return Sequential([
        # The layer is named so that its output and weights can be looked up
        # by name later.
        Dense(hidden, input_dim=inputs, name='hidden'),
        Activation('sigmoid'),
        Dropout(0.2),
        Dense(output, activation='sigmoid'),
    ])


def get_saes(layers):
    """Build the three single-layer networks plus the stacked network.

    Args:
        layers: Sequence read as ``[input dim, hidden1, hidden2, hidden3,
            output dim]``.

    Returns:
        ``[sae1, sae2, sae3, saes]``: three networks from ``_get_sae`` (one
        per consecutive pair of widths, each with ``layers[-1]`` outputs) and
        the stacked network whose Dense layers are named ``hidden1`` ..
        ``hidden3``.
    """
    # One single-layer network per consecutive pair of widths.
    single_layer_nets = [
        _get_sae(layers[depth], layers[depth + 1], layers[-1])
        for depth in range(3)
    ]

    stacked = Sequential()
    for depth in (1, 2, 3):
        layer_name = 'hidden%d' % depth
        if depth == 1:
            # Only the first layer declares the input dimension.
            stacked.add(Dense(layers[1], input_dim=layers[0], name=layer_name))
        else:
            stacked.add(Dense(layers[depth], name=layer_name))
        stacked.add(Activation('sigmoid'))
    stacked.add(Dropout(0.2))
    stacked.add(Dense(layers[4], activation='sigmoid'))

    return single_layer_nets + [stacked]


def _resample_flow(frame, attr):
    """Return column *attr* of *frame* as a gap-free 30-minute mean series."""
    # Work on a copy of just the two needed columns: the caller's frame is
    # left untouched, and non-numeric columns (such as a string identifier)
    # never reach ``.mean()``, which raises on them in pandas >= 2.0.
    flow = frame[['dateobservedfrom', attr]].copy()
    flow['dateobservedfrom'] = pd.to_datetime(flow['dateobservedfrom'], dayfirst=True)

    # Index by timestamp and average the observations in each 30-minute bin.
    flow = flow.set_index('dateobservedfrom').resample('30min').mean()

    # Order-1 polynomial interpolation fills the NaNs left by empty bins.
    return flow.interpolate('polynomial', order=1)


def _lagged_windows(flow, lags):
    """Return every run of ``lags + 1`` consecutive values of *flow* as rows."""
    windows = [flow[i - lags: i + 1] for i in range(lags, len(flow))]
    # The reshape keeps the result 2-D even when the series is too short to
    # produce a single window.
    return np.array(windows).reshape(-1, lags + 1)


def process_data(train, test, lags):
    """Turn raw train/test observations into scaled, lagged model inputs.

    Both frames are resampled to 30-minute means of ``intensity``, gaps are
    interpolated, and values are min-max scaled to [0, 1] with a scaler
    fitted on the training series only. Each sample consists of ``lags``
    consecutive values (the features) followed by the next value (the
    target). The training samples are shuffled in place with
    ``np.random.shuffle``; the test samples keep their time order.

    Args:
        train: DataFrame with ``dateobservedfrom`` and ``intensity`` columns.
        test: DataFrame with the same columns.
        lags: Number of past values used as features.

    Returns:
        ``(X_train, y_train, X_test, y_test, scaler)`` where the ``X`` arrays
        have shape ``(n_samples, lags)``, the ``y`` arrays have shape
        ``(n_samples,)`` and ``scaler`` is the fitted ``MinMaxScaler``.
    """
    attr = 'intensity'

    train_flow = _resample_flow(train, attr)
    test_flow = _resample_flow(test, attr)

    # Fit the scaler on the training data only and reuse it for the test data.
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_flow[attr].values.reshape(-1, 1))
    flow1 = scaler.transform(train_flow[attr].values.reshape(-1, 1)).reshape(1, -1)[0]
    flow2 = scaler.transform(test_flow[attr].values.reshape(-1, 1)).reshape(1, -1)[0]

    train_windows = _lagged_windows(flow1, lags)
    test_windows = _lagged_windows(flow2, lags)
    np.random.shuffle(train_windows)

    # The last column of each window is the target; the rest are the features.
    X_train = train_windows[:, :-1]
    y_train = train_windows[:, -1]
    X_test = test_windows[:, :-1]
    y_test = test_windows[:, -1]

    return X_train, y_train, X_test, y_test, scaler

class Pogbar(tf.keras.callbacks.Callback):
    """Keras callback that prints training progress on a single line.

    At the start of every epoch it rewrites the current console line with
    ``<sensor>: Training <model_name> [<epoch>/<epochs>]`` and prints a
    newline when training ends.

    The displayed epoch is derived from the ``epoch`` index Keras passes to
    ``on_epoch_begin``, so the same instance can be reused for several
    ``fit`` calls without the counter running past the total.
    """

    def __init__(self, sensor, model_name):
        """Store the sensor and model names shown in the progress line."""
        super().__init__()
        # 1-based number of the epoch most recently announced.
        self.epoch = 1
        # Total number of epochs; filled in by set_params.
        self.epochs = None
        self.sensor = sensor
        self.model_name = model_name

    def set_params(self, params):
        """Record the training parameters, including the epoch count."""
        # Keep the base-class bookkeeping (``self.params``) intact.
        super().set_params(params)
        self.epochs = params['epochs']

    def on_epoch_begin(self, epoch, logs=None):
        """Rewrite the progress line for the epoch that is starting."""
        # ``epoch`` is zero-based; show it one-based.
        self.epoch = epoch + 1
        print("\r{}: Training {} [{}/{}]"
                  .format(self.sensor,
                          self.model_name,
                          self.epoch,
                          self.epochs),
              end="")

    def on_train_end(self, logs=None):
        """Terminate the progress line."""
        print("")
        
def train_model(sensor, model, X_train, y_train, name, config):
    """Compile, fit and save a single model.

    The model is compiled with MSE loss, the ``rmsprop`` optimizer and a MAPE
    metric, then fitted with 5% of the data held out for validation.

    Files written:
        * ``./logs/<timestamp>-<name>.csv``: per-epoch log from ``CSVLogger``.
        * ``model/<sensor><name>.h5``: the trained model.
        * ``model/<sensor><name> loss.csv``: the fit history.

    Args:
        sensor: Sensor identifier used as the file-name prefix.
        model: Uncompiled Keras model.
        X_train: Training features.
        y_train: Training targets.
        name: Model name used in file names and progress output.
        config: Dict with ``"batch"`` (batch size) and ``"epochs"``.
    """
    # Make sure the output directories exist so a fresh checkout can write
    # its logs and models.
    os.makedirs("./logs", exist_ok=True)
    os.makedirs("model", exist_ok=True)

    training_callbacks = [
        Pogbar(sensor, name),
        tf.keras.callbacks.CSVLogger("./logs/{}-{}.csv".format(dt.datetime.now(), name), separator=",", append=False)
    ]

    model.compile(loss="mse", optimizer="rmsprop", metrics=['mape'])
    hist = model.fit(
        X_train, y_train,
        verbose=0,  # progress is reported by Pogbar instead
        batch_size=config["batch"],
        epochs=config["epochs"],
        validation_split=0.05,
        callbacks=training_callbacks)

    model.save('model/' + sensor + name + '.h5')
    history = pd.DataFrame.from_dict(hist.history)
    history.to_csv('model/' + sensor + name + ' loss.csv', encoding='utf-8', index=False)


def train_saes(sensor, models, X_train, y_train, name, config):
    """Pre-train the single-layer networks, then train the stacked network.

    Every model except the last is fitted in turn. From the second one on,
    the input is the previous model's ``'hidden'`` layer output computed on
    the previous input. The ``'hidden'`` weights of model ``i`` are then
    copied into layer ``hidden<i+1>`` of the last model, which is finally
    trained and saved through ``train_model``.

    Args:
        sensor: Sensor identifier used in progress output and file names.
        models: Sequence ``[sae1, ..., saeN, stacked]`` of uncompiled models.
        X_train: Training features.
        y_train: Training targets.
        name: Model name used in file names and progress output.
        config: Dict with ``"batch"`` (batch size) and ``"epochs"``.
    """
    # CSVLogger needs its directory to exist before the first fit.
    os.makedirs("./logs", exist_ok=True)

    training_callbacks = [
        Pogbar(sensor, name),
        tf.keras.callbacks.CSVLogger("./logs/{}-{}.csv".format(dt.datetime.now(), name), separator=",", append=False)
    ]

    pretrain_models = models[:-1]
    temp = X_train

    for i, m in enumerate(pretrain_models):
        if i > 0:
            # Feed this stage with the previous stage's 'hidden' layer output.
            # NOTE(review): in _get_sae the 'hidden' Dense layer is followed
            # by a separate Activation layer, so this is the pre-activation
            # output -- confirm that is intended.
            p = pretrain_models[i - 1]
            hidden_layer_model = Model(p.input,
                                       p.get_layer('hidden').output)
            temp = hidden_layer_model.predict(temp)

        m.compile(loss="mse", optimizer="rmsprop", metrics=['mape'])

        m.fit(temp, y_train, batch_size=config["batch"],
              epochs=config["epochs"],
              validation_split=0.05,
              verbose=0,
              callbacks=training_callbacks)

    # Initialise the stacked network with the pre-trained hidden weights.
    saes = models[-1]
    for depth, m in enumerate(pretrain_models, start=1):
        weights = m.get_layer('hidden').get_weights()
        saes.get_layer('hidden%d' % depth).set_weights(weights)

    train_model(sensor, saes, X_train, y_train, name, config)


# come here to configure the model
def train_main(sensor, train_data, test_data, config=None):
    """Train the LSTM, GRU and SAES models for one sensor, skipping cached ones.

    A model is trained only when its ``model/<sensor id><NAME>.h5`` file does
    not exist yet. The sensor id is the part of ``sensor`` after the last
    ``":"``, and the same id is used both for the existence check and for
    saving, so a cached model is always found again.

    Args:
        sensor: Sensor identifier (optionally colon-separated).
        train_data: Training DataFrame for this sensor.
        test_data: Test DataFrame for this sensor.
        config: Dict with ``"batch"`` and ``"epochs"``; defaults to
            ``{"batch": 256, "epochs": 30}``.
    """
    # Build the default per call instead of sharing one mutable dict.
    if config is None:
        config = {"batch": 256, "epochs": 30}

    lag = 12
    X_train, y_train, _, _, _ = process_data(train_data, test_data, lag)

    sensor_id = sensor.split(":")[-1]
    # Recurrent models take (samples, timesteps, 1); the SAES takes the flat
    # (samples, timesteps) matrix.
    X_flat = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]))
    X_seq = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    if not os.path.isfile('model/' + sensor_id + 'LSTM.h5'):
        m = get_lstm([lag, 64, 64, 1])
        train_model(sensor_id, m, X_seq, y_train, "LSTM", config)

    if not os.path.isfile('model/' + sensor_id + 'GRU.h5'):
        m = get_gru([lag, 64, 64, 1])
        train_model(sensor_id, m, X_seq, y_train, "GRU", config)

    if not os.path.isfile('model/' + sensor_id + 'SAES.h5'):
        m = get_saes([lag, 400, 400, 400, 1])
        train_saes(sensor_id, m, X_flat, y_train, "SAES", config)

        
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Only positions whose true value is strictly positive contribute, which
    avoids dividing by zero.

    Args:
        y_true: Sequence of true values.
        y_pred: Sequence of predictions, at least as long as ``y_true``.

    Returns:
        The MAPE over the positive true values, or ``nan`` when there are
        none (the metric is undefined in that case).

    Raises:
        ValueError: If ``y_pred`` is shorter than ``y_true``.
    """
    if len(y_pred) < len(y_true):
        raise ValueError("y_pred must be at least as long as y_true")

    pairs = [(true, pred) for true, pred in zip(y_true, y_pred) if true > 0]
    if not pairs:
        return float('nan')

    sums = sum(abs(true - pred) / true for true, pred in pairs)

    return sums * (100 / len(pairs))


def eva_regress(name, y_true, y_pred):
    """Print a regression report for one model.

    The report lists explained variance, MAPE, MAE, MSE, RMSE and R2 under a
    heading with the model name.

    Args:
        name: Label printed as the report heading.
        y_true: True values.
        y_pred: Predicted values.
    """
    mape = MAPE(y_true, y_pred)
    vs = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    rmse = math.sqrt(mse)

    # Assemble the whole report first, then emit it with a single print; the
    # empty entries produce the blank lines around the metric rows.
    report = [
        "                           {}".format(name),
        "",
        'explained_variance_score:  %f' % vs,
        '                    mape:  %f%%' % mape,
        '                     mae:  %f' % mae,
        '                     mse:  %f' % mse,
        '                    rmse:  %f' % rmse,
        '                      r2:  %f' % r2,
        "",
    ]
    print("\n".join(report))


def plot_results(y_true, y_preds, names):
    """Plot the true series and each model's predictions against time of day.

    The x axis is a 30-minute time grid starting at ``2016-12-31 00:00:00``
    with one tick per value of ``y_true``, so series of any length can be
    plotted (48 values span one day).

    Args:
        y_true: True values.
        y_preds: One prediction series per model, each as long as ``y_true``.
        names: Legend label for each entry of ``y_preds``.
    """
    d = '2016-12-31 00:00:00'
    # The frequency must match the resampling interval of the time series.
    x = pd.date_range(d, periods=len(y_true), freq='30min')

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.plot(x, y_true, label='True Data')
    for name, y_pred in zip(names, y_preds):
        ax.plot(x, y_pred, label=name)

    plt.legend()
    plt.grid(True)
    plt.xlabel('Time of Day')
    plt.ylabel('Flow')

    # Show only hours and minutes on the x axis.
    date_format = mpl.dates.DateFormatter("%H:%M")
    ax.xaxis.set_major_formatter(date_format)
    fig.autofmt_xdate()

    plt.show()


def main(train_data, test_data, sensor):
    """Evaluate and plot the saved LSTM, GRU and SAES models for one sensor.

    The models are loaded from ``model/<sensor id><NAME>.h5``, where the
    sensor id is the part of ``sensor`` after the last ``":"``. For each
    model an architecture diagram is written to ``images/<name>.png``, the
    test set is predicted, predictions are converted back to the original
    scale, and a regression report is printed. The first 48 true and
    predicted values are plotted together at the end.

    Args:
        train_data: Training DataFrame for this sensor (used to fit the scaler).
        test_data: Test DataFrame for this sensor.
        sensor: Sensor identifier (optionally colon-separated).
    """
    sensor_id = sensor.split(":")[-1]
    lstm = load_model('model/' + sensor_id + 'LSTM.h5')
    gru = load_model('model/' + sensor_id + 'GRU.h5')
    saes = load_model('model/' + sensor_id + 'SAES.h5')
    models = [lstm, gru, saes]
    names = ['LSTM', 'GRU', 'SAEs']

    lag = 12
    _, _, X_test, y_test, scaler = process_data(train_data, test_data, lag)
    y_test = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(1, -1)[0]

    # Make sure the diagram directory exists before plot_model writes to it.
    os.makedirs('images', exist_ok=True)

    y_preds = []
    for name, model in zip(names, models):
        # The SAES takes a flat (samples, lags) matrix; the recurrent models
        # take (samples, lags, 1).
        if name == 'SAEs':
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))
        else:
            X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
        file = 'images/' + name + '.png'
        plot_model(model, to_file=file, show_shapes=True)
        predicted = model.predict(X_test)
        predicted = scaler.inverse_transform(predicted.reshape(-1, 1)).reshape(1, -1)[0]
        y_preds.append(predicted[:48])
        eva_regress(name, y_test, predicted)

    plot_results(y_test[:48], y_preds, names)

In [None]:
# Train and evaluate the models sensor by sensor.
# The test data is looked up by sensor key instead of being paired by
# position: zipping the two groupbys would silently pair different sensors
# as soon as one appears in only one of the two sets.
for sensor, train_data in train_df:
    if sensor not in test_df.groups:
        print("{}: no test data, skipping".format(sensor))
        continue
    test_data = test_df.get_group(sensor)
    train_main(sensor.split(":")[-1], train_data, test_data, config={"batch": 256, "epochs": 60})
    main(train_data, test_data, sensor)

## Conclusions

### Scalability and Impact
Tell us how applicable and scalable your solution is if you were to implement it in a city. Identify possible limitations and measure the potential social impact of your solution.

### Future Work
Now picture the following scenario: imagine you could have access to any type of data that could help you solve this challenge even better. What would that data be and how would it improve your solution? 🚀