# BUS HEADWAY ANALYSIS AND OUTLIER DETECTION

In [207]:
# Data handling
import pandas as pd
import json

# Date/time helpers
import datetime
from datetime import timedelta

# Basic statistics / math helpers
import statistics
from statistics import mean
import math

# Parallel-processing helpers
from pandarallel import pandarallel
from joblib import Parallel, delayed
import multiprocessing
# Number of CPUs reported by the system
num_cores = multiprocessing.cpu_count()
# Initialise pandarallel with its default settings
pandarallel.initialize()

# Plotting
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
# Use the 'plotly_white' template for every figure created afterwards
pio.templates.default = 'plotly_white'
# Display floating point values with 3 digits of precision in pandas output
pd.set_option("display.precision", 3)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Lines collected dictionary

In [10]:
# Load the dictionary of collected lines.
# It is indexed later as lines_collected_dict[line][str(direction)]['stops'],
# i.e. line id -> direction (as a string) -> ordered list of stop ids.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8).
with open('M6Data/lines_collected_dict.json', 'r', encoding='utf-8') as f:
    lines_collected_dict = json.load(f)

## Buses time intervals differentiation
From the data available for each line, we load a dictionary with the time intervals of each line
and the frequencies of the buses during those intervals.

In [11]:
# Load the frequency ranges dictionary.
# NOTE(review): presumably keyed by line, holding the time intervals of the
# day and the bus frequency in each one — confirm against the JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding (JSON text is expected to be UTF-8).
with open('M6Data/freq_ranges_dict.json', 'r', encoding='utf-8') as f:
    freq_ranges_dict = json.load(f)

## Times between stops for each time interval
Here we are going to calculate the mean time between stops for each time interval of the day, for the data recovered, making use of the preprocessed attribute 'arrival_time'. This data is calculated in a script called 'times_bt_stops.py' and loaded here.

In [16]:
# Weekday numbers (0 = Monday, 1 = Tuesday, ... 6 = Sunday) covered by each
# day-type code.
day_type_dict = dict(
    LA=list(range(0, 5)),  # working days
    LJ=list(range(0, 4)),  # Monday to Thursday
    VV=[4],                # Fridays
    SA=[5],                # Saturdays
    FE=[6],                # Sundays or public holidays
)

In [65]:
# Load the times-between-stops data: read the CSV with compact dtypes, keep
# the columns of interest in a fixed order and parse 'date' into datetimes.
times_bt_stops = (
    pd.read_csv(
        '../times_bt_stops.csv',
        dtype=dict(
            line='str',
            direction='uint16',
            st_hour='uint16',
            end_hour='uint16',
            stopA='uint16',
            stopB='uint16',
            bus='uint16',
            trip_time='float32',
            api_trip_time='int32',
        ),
    )
    .loc[:, ['line', 'direction', 'date', 'st_hour', 'end_hour',
             'stopA', 'stopB', 'bus', 'trip_time', 'api_trip_time']]
    # Replace the textual dates by parsed datetimes (column position is kept)
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'))
)
# Preview of the first rows (notebook cell output)
times_bt_stops.head()

Unnamed: 0,line,direction,date,st_hour,end_hour,stopA,stopB,bus,trip_time,api_trip_time
0,1,1,2020-02-25,20,21,4514,4022,123,111.875,114
1,1,1,2020-02-25,20,21,4514,4022,8324,108.465,97
2,1,1,2020-02-25,20,21,4022,3687,123,17.053,58
3,1,1,2020-02-25,20,21,4022,3687,8324,135.143,59
4,1,1,2020-02-25,20,21,3687,737,123,94.099,67


In [222]:
# Selection attributes: line, direction, day type and hour window
line, direction, day_type = '1', 1, 'LA'
time_int = 0
# Pair of stops to analyse: entries 1 and 2 of the line/direction stop list
stopA, stopB = (
    int(lines_collected_dict[line][str(direction)]['stops'][idx])
    for idx in (1, 2)
)
st_hour, end_hour = 7, 11

# Keep only the trips matching every selection attribute
selection_df = times_bt_stops[
    times_bt_stops['line'].eq(line)
    & times_bt_stops['direction'].eq(direction)
    & times_bt_stops['date'].dt.weekday.isin(day_type_dict[day_type])
    & times_bt_stops['stopA'].eq(stopA)
    & times_bt_stops['stopB'].eq(stopB)
    & times_bt_stops['st_hour'].ge(st_hour)
    & times_bt_stops['end_hour'].le(end_hour)
]

# Histogram of the trip times between the two stops, with a marginal box plot
fig = px.histogram(
    selection_df,
    x="trip_time",
    marginal="box",
    hover_data=selection_df.columns,
    nbins=100,
)
fig.show()

## Get the time interval between buses
We can build the time interval between the buses of a line and destination by taking the time remaining for each bus to reach the stop and computing the difference between consecutive buses. This data is computed in a script called 'headways.py' and loaded here.

In [248]:
# Load the headways data: read the CSV with compact dtypes, keep the columns
# of interest, parse the timestamps and order the rows by line, timestamp and
# direction with a fresh 0..n-1 index.
headways = (
    pd.read_csv(
        '../headways.csv',
        dtype=dict(
            line='str',
            direction='uint16',
            busA='uint16',
            busB='uint16',
            headway='uint16',
            busB_ttls='uint16',
        ),
    )
    .loc[:, ['line', 'direction', 'datetime', 'hw_pos',
             'busA', 'busB', 'headway', 'busB_ttls']]
    # Replace the textual timestamps by parsed datetimes
    .assign(
        datetime=lambda df: pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S.%f')
    )
    .sort_values(by=['line', 'datetime', 'direction'], ascending=True)
    .reset_index(drop=True)
)
# Preview of the first rows (notebook cell output)
headways.head()

Unnamed: 0,line,direction,datetime,hw_pos,busA,busB,headway,busB_ttls
0,1,1,2020-03-02 07:00:59.620835,0,0,118,0,655
1,1,1,2020-03-02 07:00:59.620835,1,118,121,1499,2155
2,1,2,2020-03-02 07:00:59.620835,0,0,117,0,390
3,1,2,2020-03-02 07:00:59.620835,1,117,113,1603,1993
4,1,2,2020-03-02 07:00:59.620835,2,113,116,662,2656


In [249]:
# Attributes for data selection: line and (exclusive) time window
line = '1'
start_date = datetime.datetime(2020, 3, 2, 7)
end_date = datetime.datetime(2020, 3, 2, 11)

# Rows of the chosen line strictly inside the time window
selected_df = headways[
    headways['line'].eq(line)
    & headways['datetime'].gt(start_date)
    & headways['datetime'].lt(end_date)
]

# Distinct timestamps (as a list of pandas Timestamps) and distinct buses,
# both in order of first appearance
times = pd.to_datetime(
    selected_df['datetime'].unique(), format='%Y-%m-%d %H:%M:%S.%f'
).to_list()
unique_buses = selected_df['busB'].unique().tolist()

# Largest busB_ttls value observed in each direction
max_dist1 = selected_df.loc[selected_df['direction'].eq(1), 'busB_ttls'].max()
max_dist2 = selected_df.loc[selected_df['direction'].eq(2), 'busB_ttls'].max()

## Buses inside the line animated plot

In [250]:
# BUILD ANIMATED FIGURE OF BUSES INSIDE THE LINE
# One scatter trace per bus in `unique_buses` and one animation frame per
# timestamp in `times`. Direction-1 buses are drawn at x = busB_ttls and
# buses of the other direction at x = max_dist2 - busB_ttls; y is the
# direction number.

size = 20  # marker size of a bus that has a row at the frame's timestamp
dur = 250  # value used for every frame/transition "duration" below

# Position given to a bus that has no row at a timestamp; both coordinates
# fall outside the axis ranges configured in the layout below.
_HIDDEN_X, _HIDDEN_Y = -5000, 5000


def _bus_trace(rows_at_time, bus, hidden_size):
    """Build the scatter-trace dict of one bus for one timestamp.

    rows_at_time -- rows of `selected_df` that share a single `datetime` value
    bus          -- bus id, matched against the `busB` column
    hidden_size  -- marker size used when the bus has no row in `rows_at_time`
    """
    bus_rows = rows_at_time.loc[rows_at_time.busB == bus]
    if bus_rows.empty:
        # Bus not reported at this timestamp: park it outside the plot area
        x, y, marker_size = _HIDDEN_X, _HIDDEN_Y, hidden_size
    else:
        bus_data = bus_rows.iloc[0]  # only the first matching row is used
        y = int(bus_data.direction)
        ttls = int(bus_data.busB_ttls)
        # The second direction is mirrored so both directions share the x axis
        x = ttls if y == 1 else int(max_dist2) - ttls
        marker_size = size
    return {
        "x": [x],
        "y": [y],
        "mode": "markers",
        "text": str(bus),
        "marker": {
            "sizemode": "area",
            "sizeref": 200000,
            "size": marker_size
        },
        "name": str(bus)
    }


def _traces_at(time, hidden_size):
    """Return one trace per bus of `unique_buses` for the timestamp `time`."""
    # Filter by timestamp once instead of once per bus
    rows_at_time = selected_df.loc[selected_df.datetime == time]
    return [_bus_trace(rows_at_time, bus, hidden_size) for bus in unique_buses]


# Upper bound of the x axis: the largest distance seen in either direction.
# A direction with no rows yields NaN and is ignored; 0 is used if both are.
_known_dists = [int(d) for d in (max_dist1, max_dist2) if pd.notna(d)]
_x_upper = max(_known_dists, default=0) + 50

fig_dict = {
    # Initial state = first timestamp. Absent buses keep the regular marker
    # size here but get size 0 in the frames; this asymmetry is kept as it was.
    # NOTE(review): presumably so every legend entry is drawn at normal size — confirm.
    "data": _traces_at(times[0], hidden_size=size) if times else [],
    "layout": {
        "xaxis": {
            "range": [-50, _x_upper],
            "title": "Headways (in seconds) of line {} buses".format(line)
        },
        "yaxis": {
            "range": [0.5, 2.5],
            "title": "Direction"
        },
        "hovermode": "closest",
        # Play / Pause buttons
        "updatemenus": [
            {
                "buttons": [
                    {
                        "args": [None, {"frame": {"duration": dur, "redraw": False},
                                        "fromcurrent": True,
                                        "transition": {"duration": dur,
                                                       "easing": "quadratic-in-out"}}],
                        "label": "Play",
                        "method": "animate"
                    },
                    {
                        "args": [[None], {"frame": {"duration": 0, "redraw": False},
                                          "mode": "immediate",
                                          "transition": {"duration": 0}}],
                        "label": "Pause",
                        "method": "animate"
                    }
                ],
                "direction": "left",
                "pad": {"r": 10, "t": 87},
                "showactive": False,
                "type": "buttons",
                "x": 0.1,
                "xanchor": "right",
                "y": 0,
                "yanchor": "top"
            }
        ]
    },
    "frames": []
}

# Time slider; one step per frame is appended in the loop below.
# (The layout previously received a throw-away "sliders" dict that was always
# overwritten by this one, so that assignment has been dropped.)
sliders_dict = {
    "active": 0,
    "yanchor": "top",
    "xanchor": "left",
    "currentvalue": {
        "font": {"size": 20},
        "prefix": "Time:",
        "visible": True,
        "xanchor": "right"
    },
    "transition": {"duration": dur, "easing": "cubic-in-out"},
    "pad": {"b": 10, "t": 50},
    "len": 0.9,
    "x": 0.1,
    "y": 0,
    "steps": []
}

# make frames and their slider steps; both are identified by the HH:MM:SS label
for frame_time in times:
    label = frame_time.strftime('%H:%M:%S')
    fig_dict["frames"].append({
        "data": _traces_at(frame_time, hidden_size=0),
        "name": label
    })
    sliders_dict["steps"].append({
        "args": [
            [label],
            {"frame": {"duration": dur, "redraw": False},
             "mode": "immediate",
             "transition": {"duration": dur}}
        ],
        "label": label,
        "method": "animate"
    })

fig_dict["layout"]["sliders"] = [sliders_dict]

fig = go.Figure(fig_dict)

fig.show()