# BUS HEADWAY ANALYSIS AND OUTLIER DETECTION

In [2]:
#Data handling
import pandas as pd
import json

#Date and time utilities
import datetime
from datetime import timedelta

#Statistics and math helpers
import statistics
import math

#Parallel processing helpers; pandarallel is initialized once for the whole notebook
from pandarallel import pandarallel
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()  #Number of CPUs reported by the machine
pandarallel.initialize()

#Plotting with plotly, using the 'plotly_white' template by default
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = 'plotly_white'

#Show floating point values with 9 decimal places when displaying dataframes
pd.set_option("display.precision", 9)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
## Load buses data
#Types used to parse the csv columns
column_types = {
    'line': 'str',
    'destination': 'str',
    'stop': 'uint16',
    'bus': 'uint16',
    'given_coords': 'bool',
    'pos_in_burst':'uint16',
    'estimateArrive': 'int32',
    'DistanceBus': 'int32',
    'request_time': 'int32',
    'lat':'float32',
    'lon':'float32'
}
#Columns kept for the analysis, in display order
kept_columns = ['line','destination','stop','bus','datetime','estimateArrive','DistanceBus','given_coords','lat','lon']
buses_data = pd.read_csv('../buses_data.csv', dtype=column_types)[kept_columns]

#Drop rows with non coherent values: the estimate must be below the 999999 sentinel
#and the distance can not be negative
in_valid_range = (buses_data.estimateArrive < 999999) & (buses_data.DistanceBus >= 0)
buses_data = buses_data.loc[in_valid_range]

#Distance and estimate must agree: both are zero or both are positive
both_zero = (buses_data.DistanceBus == 0) & (buses_data.estimateArrive == 0)
both_positive = (buses_data.DistanceBus > 0) & (buses_data.estimateArrive > 0)
buses_data = buses_data.loc[both_zero | both_positive]

#Parse the datetime strings into timestamps
buses_data['datetime'] = pd.to_datetime(buses_data['datetime'], format='%Y-%m-%d %H:%M:%S.%f')

#Keep only the rows of March 9 (the year is not checked), ordered by time
is_march_9 = (buses_data.datetime.dt.day == 9) & (buses_data.datetime.dt.month == 3)
buses_data = buses_data.loc[is_march_9].sort_values(by=['datetime']).reset_index(drop=True)

#Show first five rows
buses_data.head()

Unnamed: 0,line,destination,stop,bus,datetime,estimateArrive,DistanceBus,given_coords,lat,lon
0,132,MONCLOA,1650,4830,2020-03-09 07:01:00.067487,595,2692,True,40.480522156,-3.688912392
1,132,MONCLOA,1650,4829,2020-03-09 07:01:00.067487,16,50,True,40.47706604,-3.715248108
2,82,MONCLOA,4405,4711,2020-03-09 07:01:00.090464,38,215,False,40.493011475,-3.725033045
3,82,MONCLOA,4405,8299,2020-03-09 07:01:00.090464,680,3857,True,40.471019745,-3.728758097
4,132,HOSPITAL LA PAZ,1649,4824,2020-03-09 07:01:00.106063,637,3565,True,40.438598633,-3.718589306


In [6]:
#Lines collected dict. It is indexed later as [line]['destinations'] and [line][direction]['stops'].
#The encoding is given explicitly so the file is decoded the same way on every platform
#(open() would otherwise use the locale default, which is not always UTF-8).
with open('M6Data/lines_collected_dict.json', 'r', encoding='utf-8') as f:
    lines_collected_dict = json.load(f)

## Buses time intervals differentiation
From the data available for each line, we build a dictionary with the time intervals of the day for each line
and the bus frequencies for those intervals.

In [4]:
#Frequency ranges dict: for each line and day type, a list of time ranges with their frequency range.
#The encoding is given explicitly so the file is decoded the same way on every platform
#(open() would otherwise use the locale default, which is not always UTF-8).
with open('M6Data/freq_ranges_dict.json', 'r', encoding='utf-8') as f:
    freq_ranges_dict = json.load(f)
freq_ranges_dict

{'1': {'LA': [{'time_range': [7, 9], 'freq_range': [14, 18]},
   {'time_range': [9, 20], 'freq_range': [11, 14]},
   {'time_range': [20, 22], 'freq_range': [13, 15]},
   {'time_range': [22, 23], 'freq_range': [14, 18]}],
  'VV': [],
  'SA': [{'time_range': [7, 9], 'freq_range': [17, 29]},
   {'time_range': [9, 12], 'freq_range': [15, 20]},
   {'time_range': [12, 22], 'freq_range': [13, 17]},
   {'time_range': [22, 23], 'freq_range': [15, 20]}],
  'FE': [{'time_range': [7, 8], 'freq_range': [24, 36]},
   {'time_range': [8, 12], 'freq_range': [19, 25]},
   {'time_range': [12, 22], 'freq_range': [15, 19]},
   {'time_range': [22, 23], 'freq_range': [19, 25]}]},
 '44': {'LA': [{'time_range': [6, 8], 'freq_range': [8, 15]},
   {'time_range': [8, 20], 'freq_range': [8, 11]},
   {'time_range': [20, 23], 'freq_range': [11, 17]}],
  'VV': [],
  'SA': [{'time_range': [6, 10], 'freq_range': [15, 21]},
   {'time_range': [10, 22], 'freq_range': [11, 16]},
   {'time_range': [22, 23], 'freq_range': [1

## Get the time interval between buses
We can compute the headway (the time interval between consecutive buses) for a given line and destination by taking the estimated time remaining for each bus to reach a stop and subtracting the estimates of consecutive buses.

In [61]:
def _burst_headways(burst_df,stops_reversed) :
    '''
    Returns the ordered list of headways between consecutive buses found in one burst of rows.

    Parameters
    -----------------------------------
        burst_df : Dataframe
            Rows of a single time window, with the columns stop, bus and estimateArrive
        stops_reversed : list
            Stops of the line, from the last one to the first one
    '''
    #Headway of every 'bus1-bus2' pair. A dict keeps the position of the first insertion of a key,
    #so a pair seen again at a later iterated stop updates its value but keeps its position.
    headway_by_pair = {}
    for stop in stops_reversed :
        stop_df = burst_df.loc[burst_df.stop == int(stop)].sort_values(by='estimateArrive',ascending=True)
        #A bus can be reported several times for a stop inside the window, keep its lowest estimate
        stop_df = stop_df.drop_duplicates('bus',keep='first')

        #Work on plain arrays, building a row object per bus is much slower
        buses = stop_df.bus.values
        arrivals = stop_df.estimateArrive.values
        for i in range(len(buses)-1) :
            name = str(buses[i])+'-'+str(buses[i+1])
            headway_by_pair[name] = round(arrivals[i+1]-arrivals[i],3)

    return list(headway_by_pair.values())


def get_headways(df,line,dest,window_seconds=5) :
    '''
    Returns a list of lists with the headways between the buses of a line and destination.
    Each inner list holds the headways found in one time window (burst of rows), and only
    the first time range of the work days ('LA') of the line is analysed.

    The stops and the time range are read from the notebook level dicts
    lines_collected_dict and freq_ranges_dict.

    Parameters
    -----------------------------------
        df : Dataframe
            Dataframe with the info. The rows are expected to be sorted by datetime
        line : str
            Line whose buses we are interested in
        dest: str
            Destination whose buses we are interested in
        window_seconds : int or float
            Half width in seconds of the time window built around the first row of each burst

    Returns
    -----------------------------------
        list
            One list of headways per time window. It is empty when there are no rows for the
            line and destination, or when none of them falls inside the time range
    '''
    #List of time interval lists that we are going to return
    time_dists_list = []

    #Get the data for desired line and location
    df = df.loc[(df.line == line)&(df.destination == dest)].reset_index(drop=True)
    if df.empty :
        return time_dists_list

    #Get the date from the reduced df, it is only used to fix the day of the time range
    date = df.datetime.iloc[0]
    direction = '1' if dest == lines_collected_dict[line]['destinations'][1] else '2'
    stops = lines_collected_dict[line][direction]['stops']
    #We loop through all stops in reverse order, the reversed list is the same for every window
    stops_reversed = list(reversed(stops))

    #First day interval for work days (LA = LABORABLES)
    first_range = freq_ranges_dict[line]['LA'][0]['time_range']
    time_range_low = date.replace(hour=first_range[0],minute=0,second=0,microsecond=0)
    time_range_high = date.replace(hour=first_range[1],minute=0,second=0,microsecond=0)

    #Loc rows inside select day interval
    df = df.loc[(df.datetime > time_range_low) & (df.datetime < time_range_high)]
    if df.empty :
        return time_dists_list

    #The first window is built around the first row INSIDE the day interval. Using the first
    #row of the unfiltered data gives an empty window when the data starts before the interval
    half_window = timedelta(seconds=window_seconds)
    date = df.datetime.iloc[0]
    while True :
        start_interval = date - half_window
        end_interval = date + half_window
        interval_df = df.loc[(df.datetime > start_interval) & (df.datetime < end_interval)]
        time_dists_list.append(_burst_headways(interval_df,stops_reversed))

        #The next window is built around the first row after the current one
        next_df = df.loc[df.datetime > end_interval]
        if next_df.shape[0] == 0 :
            break
        date = next_df.datetime.iloc[0]

    #When the day interval is finished we return the list
    return time_dists_list

#Headways for line 132 towards MONCLOA: each inner list of the output corresponds to one time window
get_headways(buses_data,'132','MONCLOA')

[[622, 579, 398],
 [568, 540, 421],
 [571, 567, 458],
 [554, 527, 456, 416],
 [499, 592, 453, 398],
 [533, 591, 416, 377],
 [521, 620, 432, 376],
 [500, 621, 482, 330],
 [509, 562, 527, 380],
 [491, 534, 472, 297],
 [628, 451, 312],
 [651, 426, 355],
 [634, 446, 409],
 [715, 433, 371],
 [710, 487, 392],
 [652, 458, 456],
 [630, 527, 468, 773, 421],
 [669, 472, 477, 290, 424, 422],
 [563, 590, 505, 336, 412, 422],
 [536, 552, 508, 364, 401, 420],
 [549, 518, 237, 444, 420],
 [562, 469, 195, 441, 422],
 [584, 510, 244, 388, 411],
 [610, 505, 300, 462, 0],
 [622, 489, 236, 417, 397],
 [624, 507, 249, 413, 398],
 [635, 486, 193, 440, 425],
 [652, 450, 233, 439, 426],
 [626, 440, 233, 379, 422],
 [631, 455, 203, 425, 409],
 [681, 429, 245, 418, 348],
 [636, 440, 227, 438, 430],
 [650, 484, 192, 440, 412, 428],
 [595, 500, 218, 409, 407, 450, 419],
 [551, 180, 428, 397, 432, 420],
 [522, 226, 459, 380, 445, 421],
 [530, 135, 526, 346, 448, 421],
 [484, 124, 563, 337, 498, 420],
 [585, 118, 5