In [1]:
from common.functions import *
import numpy as np
from pandarallel import pandarallel
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sqlalchemy import create_engine, text

In [2]:
# Database engine shared by the SQL queries in this notebook.
# `engine_string` is not defined in this notebook; presumably it is provided by the
# star import of `common.functions` — TODO confirm.
engine = create_engine(engine_string)

In [3]:
# Stop and line analysed throughout the notebook.
cod_stop = '8_06297'
cod_line = '8__658___'

# Load the rows of `arrival_times` for that stop/line pair.
# The identifiers are passed as bound parameters instead of being pasted into the
# SQL text, so quoting/escaping is handled by the database driver.
arrival_times = pd.read_sql_query(
    text("SELECT * FROM arrival_times "
         "WHERE cod_stop = :cod_stop AND cod_line = :cod_line"),
    con=engine,
    params={'cod_stop': cod_stop, 'cod_line': cod_line})

In [4]:
# Load the rows of `crtm_poll` for the selected stop/line pair.
# The identifiers are passed as bound parameters instead of being pasted into the
# SQL text, so quoting/escaping is handled by the database driver.
crtm_poll = pd.read_sql_query(
    text("SELECT * FROM crtm_poll "
         "WHERE cod_stop = :cod_stop "
         "AND cod_line = :cod_line"),
    con=engine,
    params={'cod_stop': cod_stop, 'cod_line': cod_line})

# Estimated time left at each sample: `eta` minus `actual_date`.
# Despite the column name this is a Timedelta, not a number of seconds.
crtm_poll['remaining_seconds_est'] = crtm_poll['eta'] - \
    crtm_poll['actual_date']

# Day of the month of the ETA, used later as a grouping / matching key.
# NOTE(review): `.dt.day` is only the day-of-month, so samples from different
# months share the same value — confirm the data set spans less than a month.
crtm_poll['eta_date'] = crtm_poll['eta'].dt.day

In [5]:
# Initialization: set up pandarallel, which provides the `parallel_apply` calls
# used below; a progress bar is displayed for each parallel run.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# One group per (issue, stop, line, ETA day-of-month); `filter_static_values` is run
# on every group in parallel and the per-group results are stacked back into a single
# frame with a fresh 0..n-1 index.
# `filter_static_values` is not defined in this notebook — presumably it comes from
# `common.functions`; TODO confirm what exactly it filters.
group_keys = ['cod_issue', 'cod_stop', 'cod_line', 'eta_date']
crtm_poll_grouped = crtm_poll.groupby(group_keys)
crtm_poll_filtered = (
    crtm_poll_grouped
    .parallel_apply(filter_static_values)
    .reset_index(drop=True)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

In [7]:
def get_arrival_time(row):
    """Look up the arrival time that matches one polled sample.

    The lookup is done against the module-level `arrival_times` frame, matching
    on `cod_line`, `cod_issue` and `eta_date`.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'cod_line', 'cod_issue' and 'eta_date'.

    Returns
    -------
    The 'arrival_time' value of the matching row when exactly one row matches;
    None when there is no match or the match is ambiguous (several rows).
    """
    wanted_line, wanted_issue, wanted_day = (
        row['cod_line'], row['cod_issue'], row['eta_date'])

    same_line = arrival_times['cod_line'] == wanted_line
    same_issue = arrival_times['cod_issue'] == wanted_issue
    same_day = arrival_times['eta_date'] == wanted_day
    candidates = arrival_times.loc[same_line & same_issue & same_day,
                                   'arrival_time']

    # Only an unambiguous (single) match is trusted.
    if len(candidates.index) != 1:
        return None
    return candidates.iloc[0]

In [8]:
# Attach to every sample the arrival time found by `get_arrival_time`
# (row-wise lookup, executed in parallel; rows without a unique match get None).
arrival_time_per_sample = crtm_poll_filtered.parallel_apply(
    get_arrival_time, axis=1)
crtm_poll_filtered['arrival_time'] = arrival_time_per_sample

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2857), Label(value='0 / 2857'))), …

In [9]:
# Drop, in place, every row that contains a missing value; this includes the rows
# for which `get_arrival_time` returned None.
# NOTE(review): without `subset=`, rows with a missing value in any other column are
# dropped too — confirm that is intended, otherwise use subset=['arrival_time'].
crtm_poll_filtered.dropna(inplace=True)

In [10]:
# Observed time left until arrival and estimation error, both as float seconds.
#   remaining_seconds = arrival_time - actual_date
#   error             = arrival_time - eta   (positive: arrival later than the ETA)
# `.dt.total_seconds()` is used instead of `.astype('timedelta64[s]')`: since
# pandas 2.0 that astype keeps a timedelta dtype, which breaks the numeric
# comparisons and the `/60` conversions applied to these columns afterwards.
# Note: total_seconds() keeps fractional seconds, whereas the old astype truncated
# them to whole seconds.
crtm_poll_filtered['remaining_seconds'] = (
    crtm_poll_filtered['arrival_time'] - crtm_poll_filtered['actual_date']
).dt.total_seconds()
crtm_poll_filtered['error'] = (
    crtm_poll_filtered['arrival_time'] - crtm_poll_filtered['eta']
).dt.total_seconds()

In [11]:
# Keep only the samples that satisfy all three conditions (values are in seconds):
#   - error below 1500 s (25 min); only the upper side is bounded
#   - observed remaining time not negative
#   - observed remaining time of at most 90 minutes
error_below_25_min = crtm_poll_filtered['error'] < 1500
remaining_not_negative = crtm_poll_filtered['remaining_seconds'] >= 0
remaining_at_most_90_min = crtm_poll_filtered['remaining_seconds'] <= 90*60
crtm_poll_filtered = crtm_poll_filtered[
    error_below_25_min & remaining_not_negative & remaining_at_most_90_min]

In [12]:
# Bin the samples by whole minutes of observed remaining time.
# The key Series keeps the name 'remaining_seconds' although its values are minutes;
# that name becomes the column name after `reset_index` on aggregated results.
remaining_minute_bin = np.floor(crtm_poll_filtered['remaining_seconds'] / 60)
crtm_poll_filtered_grouped = crtm_poll_filtered.groupby(remaining_minute_bin)

In [13]:
# Number of samples in each minute bin, as a frame with columns
# 'remaining_seconds' (the bin, in minutes) and 'count'.
# `size()` is the built-in group counter: it avoids a Python-level call per group
# and always returns a Series, so `reset_index(name=...)` also works with no groups.
sample_count = crtm_poll_filtered_grouped.size().reset_index(name='count')

In [21]:
# Line chart: how many samples fall in each minute bin.
fig = (
    px.line(sample_count, x='remaining_seconds', y='count')
    # Edit the layout
    .update_layout(title='Sample count depending on the reported remaining time',
                   xaxis_title='Remaining time (minutes)',
                   yaxis_title='Count')
    # Reversed x axis: remaining time decreases from left to right
    .update_xaxes(autorange="reversed")
)

fig.show()

In [15]:
# Scatter plot: estimation error against observed remaining time, both in minutes.
remaining_minutes = crtm_poll_filtered['remaining_seconds']/60
error_minutes = crtm_poll_filtered['error']/60

fig = (
    px.scatter(crtm_poll_filtered,
               x=remaining_minutes,
               y=error_minutes,
               hover_data=['cod_issue'])
    # Edit the layout
    .update_layout(title='Estimation error',
                   xaxis_title='Remaining time (minutes)',
                   yaxis_title='Error (minutes)')
    # Reversed x axis: remaining time decreases from left to right
    .update_xaxes(autorange="reversed")
)
fig.show()

In [16]:
# Scatter plot: estimated against observed remaining time (minutes), one colour per
# `cod_issue`.
estimated_minutes = crtm_poll_filtered['remaining_seconds_est'].dt.total_seconds()/60
observed_minutes = crtm_poll_filtered['remaining_seconds']/60

fig = (
    px.scatter(crtm_poll_filtered,
               x=estimated_minutes,
               y=observed_minutes,
               color='cod_issue',
               hover_data=['eta_date'])
    # Edit the layout
    .update_layout(title='Remaining minutes: estimated vs observed',
                   xaxis_title='Estimated remaining time (minutes)',
                   yaxis_title='Observed remaining time (minutes)')
    # Reversed x axis: remaining time decreases from left to right
    .update_xaxes(autorange="reversed")
)
fig.show()

In [17]:
fig = go.Figure()

# One box per minute bin: every sample of the bin is placed at the bin's x position,
# with its error converted to minutes.
for minute_bin, samples in crtm_poll_filtered_grouped:
    box_positions = np.repeat(minute_bin, len(samples))
    box_values = samples['error']/60
    fig.add_trace(go.Box(x=box_positions, y=box_values))

# Edit the layout
fig.update_layout(title='Estimation error boxplot',
                  xaxis_title='Remaining time (minutes)',
                  yaxis_title='Error (minutes)')

# Reversed x axis: remaining time decreases from left to right
fig.update_xaxes(autorange="reversed")

fig.show()

In [18]:
def _mean_abs_error_minutes(group):
    """Mean absolute error of one minute bin, in minutes."""
    return np.mean(np.abs(group['error']/60))


def _mean_abs_pct_error(group):
    """Mean absolute error of one bin as a percentage of the bin's minute value.

    `group.name` is the group key, i.e. the minute bin. For the 0-minute bin the
    division is by zero, so the result is not finite.
    """
    return np.mean(np.abs(group['error']/60)/np.abs(group.name))*100


# Per-bin error metrics, each as a frame with the bin in 'remaining_seconds'.
mae = crtm_poll_filtered_grouped.apply(
    _mean_abs_error_minutes).reset_index(name='mae')
mape = crtm_poll_filtered_grouped.apply(
    _mean_abs_pct_error).reset_index(name='mape')

In [20]:
# Create figure with secondary y-axis: MAE on the primary axis, MAPE on the
# secondary one. (The former `fig = go.Figure()` was overwritten right away by
# `make_subplots`, so it has been removed.)
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=mae['remaining_seconds'], y=mae['mae'], name='MAE'),
    secondary_y=False)

fig.add_trace(
    go.Scatter(x=mape['remaining_seconds'], y=mape['mape'], name='MAPE'),
    secondary_y=True)

# Reversed x axis: remaining time decreases from left to right
fig.update_xaxes(autorange="reversed")

# Set y-axes titles
fig.update_yaxes(title_text="<b>MAE</b> (minutes)", secondary_y=False)
fig.update_yaxes(title_text="<b>MAPE</b> %", secondary_y=True)

# Edit the layout
fig.update_layout(title='CRTM API estimation error depending on the reported remaining time',
                  xaxis_title='Remaining time (minutes)')

fig.show()