In [1]:
#!pip install pandarallel psycopg2-binary



In [2]:
# Use some functions from tensorflow_docs
#!pip install -q git+https://github.com/tensorflow/docs

In [18]:
from common.functions import *
import numpy as np
from pandarallel import pandarallel
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.model_selection import cross_val_score, GridSearchCV, learning_curve, train_test_split, validation_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sqlalchemy import create_engine
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
#import tensorflow_docs.plots
import tensorflow_docs.modeling

In [4]:
# Identifiers of the stop and the line analysed in this notebook; both are
# passed to get_crtm_poll in a later cell.
cod_stop = '8_06277'
cod_line = '8__658___'

In [5]:
# Initialise pandarallel with its progress bars enabled.
# NOTE(review): no parallel_apply call is visible in this notebook; presumably
# the helpers star-imported from common.functions rely on it -- confirm.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# Load the poll records for the selected stop and line. get_crtm_poll comes
# from the star import of common.functions; its query/filtering logic is not
# visible in this notebook.
crtm_poll_filtered = get_crtm_poll(cod_stop, cod_line)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=62), Label(value='0 / 62'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6796), Label(value='0 / 6796'))), …

In [7]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
samples_train, samples_test = train_test_split(crtm_poll_filtered, test_size=0.25, random_state=42)

In [8]:
# Fit the encoder on every weekday code (0-6) so that each sample is expanded
# into the same seven indicator columns, whichever days the training split
# happens to contain.
weekdays = [[day] for day in range(7)]
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the version this notebook is pinned to.
weekday_ohe = OneHotEncoder(sparse=False).fit(weekdays)

# One column vector of weekday codes, taken from each sample's `eta` timestamp.
train_weekday_codes = samples_train['eta'].dt.weekday.values.reshape(-1, 1)
weekday_ohe_df = pd.DataFrame(weekday_ohe.transform(train_weekday_codes))

In [9]:
# Target: the sign-flipped `error` column, shaped as a single column so it can
# be fed to a scaler.
y_train = (-samples_train['error']).values.reshape(-1, 1)

# Feature blocks, in the column order the model is trained on.
# get_day_time / get_day_type_bool come from common.functions (not shown here).
eta_train = samples_train['eta']
feature_parts = [
    samples_train['remaining_seconds_est'].dt.total_seconds(),
    eta_train.apply(get_day_time),
    weekday_ohe_df,
    eta_train.apply(get_day_type_bool),
    samples_train['static'],
]

# Every block gets a fresh 0..n-1 index so the column-wise concat aligns rows
# by position rather than by the shuffled index left by train_test_split.
X_train = pd.concat([part.reset_index(drop=True) for part in feature_parts],
                    axis=1)

In [10]:
# Preview the first rows of the assembled feature matrix.
X_train.head()

Unnamed: 0,remaining_seconds_est,eta,0,1,2,3,4,5,6,eta.1,static
0,1141.0,8.246111,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
1,1116.0,20.565,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,4191.0,11.3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
3,3970.0,16.277222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
4,3296.0,16.8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1


In [11]:
# Preview the first rows of the raw training samples the features were
# derived from.
samples_train.head()

Unnamed: 0,actual_date,cod_stop,cod_line,cod_issue,eta,destination_stop,remaining_seconds_est,eta_date,static,arrival_time,remaining_seconds,error
54005,2020-03-04 07:55:45+00:00,8_06277,8__658___,5303453,2020-03-04 08:14:46+00:00,8_11449,00:19:01,4,0,2020-03-04 08:15:25+00:00,1180.0,39.0
5676,2020-02-17 20:15:18+00:00,8_06277,8__658___,5303252,2020-02-17 20:33:54+00:00,8_11449,00:18:36,17,0,2020-02-17 20:36:17+00:00,1259.0,143.0
16446,2020-02-21 10:08:09+00:00,8_06277,8__658___,5306000,2020-02-21 11:18:00+00:00,8_11449,01:09:51,21,1,2020-02-21 11:09:33+00:00,3684.0,-507.0
65261,2020-03-07 15:10:28+00:00,8_06277,8__658___,5308471,2020-03-07 16:16:38+00:00,8_11449,01:06:10,7,0,2020-03-07 16:15:40+00:00,3912.0,-58.0
8354,2020-02-18 15:53:04+00:00,8_06277,8__658___,5305364,2020-02-18 16:48:00+00:00,8_11449,00:54:56,18,1,2020-02-18 16:39:55+00:00,2811.0,-485.0


In [12]:
# Standardise features and target separately. Both fitted scalers are kept so
# that later data can be transformed the same way and predictions mapped back.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

y_scaler = StandardScaler()
# Flatten the scaled target back to 1-D.
y_train_scaled = y_scaler.fit_transform(y_train).ravel()

In [89]:
def build_model(input_dim=None, layer_size=128, dropout_rate=0.2):
    """Build and compile the fully connected regression network.

    The network is five sigmoid-activated dense layers of ``layer_size`` units,
    with a dropout layer after the third one, followed by a single linear
    output unit. It is compiled with a default-configured Adam optimizer, MSE
    loss, and MAE/MSE as reported metrics.

    Args:
        input_dim: Number of input features. When ``None`` (the default) it is
            taken from the notebook-level ``X_train_scaled``, which keeps a
            bare ``build_model()`` call working as before.
        layer_size: Number of units in every hidden dense layer.
        dropout_rate: Fraction of units dropped by the dropout layer.

    Returns:
        The compiled ``keras.Sequential`` model (untrained).
    """
    if input_dim is None:
        # Backward-compatible default: size the input after the training matrix.
        input_dim = X_train_scaled.shape[1]

    model = keras.Sequential([
        layers.Dense(layer_size, activation='sigmoid',
                     input_shape=[input_dim]),
        layers.Dense(layer_size, activation='sigmoid'),
        layers.Dense(layer_size, activation='sigmoid'),
        layers.Dropout(dropout_rate),
        layers.Dense(layer_size, activation='sigmoid'),
        layers.Dense(layer_size, activation='sigmoid'),
        # Single linear unit: the regression output.
        layers.Dense(1)
    ])

    optimizer = tf.keras.optimizers.Adam()

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

In [90]:
# Instantiate a freshly initialised (not yet trained) network.
model = build_model()

In [91]:
# Print the layer-by-layer structure and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1536      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 1

In [92]:
# Smoke test: run the still-untrained model on the first ten scaled training
# rows to check that the input/output shapes line up.
model.predict(X_train_scaled[:10])

array([[-0.6150486 ],
       [-0.61453   ],
       [-0.615466  ],
       [-0.61539423],
       [-0.614752  ],
       [-0.61492   ],
       [-0.6153999 ],
       [-0.61523974],
       [-0.6150883 ],
       [-0.61525226]], dtype=float32)

In [93]:
EPOCHS = 1000

# `patience` is the number of epochs without `val_loss` improvement to wait
# before stopping. restore_best_weights rolls the model back to the epoch with
# the lowest `val_loss`; without it the model keeps the weights of the final
# epoch, i.e. `patience` epochs past the best one.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=100,
                                           restore_best_weights=True)

with tf.device("/gpu:0"):
    history = model.fit(
        X_train_scaled, y_train_scaled,
        # 80% of the rows matches the share kept for training by
        # validation_split=0.2, so each epoch is (approximately) a single
        # full-batch gradient step.
        batch_size=int(np.floor(0.8*X_train_scaled.shape[0])),
        epochs=EPOCHS, validation_split=0.2, verbose=0,
        callbacks=[early_stop, tfdocs.modeling.EpochDots()])


Epoch: 0, loss:1.3808,  mae:0.8768,  mse:1.3808,  val_loss:1.0756,  val_mae:0.8388,  val_mse:1.0756,  
....................................................................................................
Epoch: 100, loss:0.6408,  mae:0.6439,  mse:0.6408,  val_loss:0.6045,  val_mae:0.6173,  val_mse:0.6045,  
....................................................................................................
Epoch: 200, loss:0.4610,  mae:0.4860,  mse:0.4610,  val_loss:0.4498,  val_mae:0.4794,  val_mse:0.4498,  
....................................................................................................
Epoch: 300, loss:0.4568,  mae:0.4806,  mse:0.4568,  val_loss:0.4478,  val_mae:0.4757,  val_mse:0.4478,  
....................................................................................................
Epoch: 400, loss:0.4537,  mae:0.4766,  mse:0.4537,  val_loss:0.4455,  val_mae:0.4724,  val_mse:0.4455,  
........................................................................

In [94]:
# MAE is an absolute *difference*, so converting it from standardised units
# back to seconds only needs the target scaler's standard deviation.
# y_scaler.inverse_transform would also add the target mean (shifting the
# whole curve) and expects a 2-D array rather than the 1-D history list.
mae_to_seconds = y_scaler.scale_[0]
train_mae_seconds = np.asarray(history.history['mae']) * mae_to_seconds
val_mae_seconds = np.asarray(history.history['val_mae']) * mae_to_seconds

fig = go.Figure()
# fig.add_trace(go.Scatter(y=history.history['loss'],
#                     name='Trainning loss'))
# fig.add_trace(go.Scatter(y=history.history['val_loss'],
#                     name='Validation loss'))
fig.add_trace(go.Scatter(y=train_mae_seconds,
                    name='Trainning MAE'))
fig.add_trace(go.Scatter(y=val_mae_seconds,
                    name='Validation MAE'))
fig.update_layout(xaxis_title='Epoch',
                  yaxis_title='MAE (seconds)')
fig.show()