# ESTIMATORS ANALYSIS
In this notebook we compare the data provided by the API with the location data tracked with my own phone while I was riding bus 8607 of line 91 (F). We also assess the quality of the estimators built for the real arrival time and for the bus position.

In [74]:
import glob

import pandas as pd
import json

import gpxpy

import datetime
from datetime import timedelta

import statistics
import math

import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = 'plotly_white'

pd.set_option("display.precision", 9)

In [7]:
#Route and lines shape: load every line shape and keep line 91 split by direction
lines_shapes = pd.read_csv('M6Data/lines_shapes.csv')
line91_1, line91_2 = (
    lines_shapes.loc[(lines_shapes['line_id'] == 91) & (lines_shapes['direction'] == direction)]
    for direction in (1, 2)
)

In [8]:
#Stops table; later cells look a stop up by its `id` column and read its `lat`/`lon`
stops = pd.read_csv('M6Data/stops.csv')

In [9]:
#Line stops dict: indexed further down as line_stops_dict[line_id][direction]
#(both keys are strings there) to iterate over the stops of one line and direction
with open('M6Data/line_stops_dict.json', 'r') as f:
    line_stops_dict = json.load(f)

In [18]:
#High frequency line 91(F) data: load, clean and filter in a single chain
hf_buses_data = (
    pd.read_csv(
        'experiment-F.csv',
        dtype={
            'line': 'str',
            'destination': 'str',
            'stop': 'int32',
            'bus': 'int32',
            'isHead': 'bool',
            'given_coords': 'int32',
            'pos_in_burst':'int32',
            'deviation': 'int32',
            'estimateArrive': 'int32',
            'DistanceBus': 'int32',
            'request_time': 'int32',
            'lat':'float',
            'lon':'float'
        }
    )
    #Columns that are not used in this analysis
    .drop(['isHead','deviation','request_time','pos_in_burst'], axis=1)
    #Discard sentinel ETAs and negative distances ...
    .loc[lambda d: (d.estimateArrive < 999999) & (d.DistanceBus >= 0)]
    #... and rows where distance and ETA disagree about the bus being at the stop
    .loc[lambda d: ((d.DistanceBus == 0) & (d.estimateArrive == 0))
                   | ((d.DistanceBus > 0) & (d.estimateArrive > 0))]
    #Parse the datetime strings
    .assign(datetime=lambda d: pd.to_datetime(d['datetime'], format='%Y-%m-%d %H:%M:%S.%f'))
    #Keep the experiment day (only the day of the month is checked: 9)
    .loc[lambda d: d.datetime.dt.day == 9]
)

#Show first five rows
hf_buses_data.head()

Unnamed: 0,bus,line,stop,datetime,destination,estimateArrive,DistanceBus,given_coords,lat,lon
0,8606,F,1693,2020-03-09 10:15:14.413612,CIUDAD UNIVERSITARIA,347,1517,1,40.449985615,-3.734595387
1,8603,F,1693,2020-03-09 10:15:14.413612,CIUDAD UNIVERSITARIA,496,2293,1,40.44843293,-3.734188718
2,8606,F,3278,2020-03-09 10:15:14.417809,CIUDAD UNIVERSITARIA,94,427,1,40.449985615,-3.734595387
3,8603,F,3278,2020-03-09 10:15:14.417809,CIUDAD UNIVERSITARIA,316,1306,1,40.44843293,-3.734188718
4,8603,F,1418,2020-03-09 10:15:14.418781,CIUDAD UNIVERSITARIA,85,195,1,40.44843293,-3.734188718


In [42]:
#Normal retrieved data: same loading and cleaning steps as the high frequency data
buses_data = (
    pd.read_csv(
        '../buses_data.csv',
        dtype={
            'line': 'str',
            'destination': 'str',
            'stop': 'int32',
            'bus': 'int32',
            'isHead': 'bool',
            'given_coords': 'int32',
            'pos_in_burst':'int32',
            'deviation': 'int32',
            'estimateArrive': 'int32',
            'DistanceBus': 'int32',
            'request_time': 'int32',
            'lat':'float',
            'lon':'float'
        }
    )
    #Columns that are not used in this analysis
    .drop(['isHead','deviation','request_time','pos_in_burst'], axis=1)
    #Discard sentinel ETAs and negative distances ...
    .loc[lambda d: (d.estimateArrive < 999999) & (d.DistanceBus >= 0)]
    #... and rows where distance and ETA disagree about the bus being at the stop
    .loc[lambda d: ((d.DistanceBus == 0) & (d.estimateArrive == 0))
                   | ((d.DistanceBus > 0) & (d.estimateArrive > 0))]
    #Parse the datetime strings
    .assign(datetime=lambda d: pd.to_datetime(d['datetime'], format='%Y-%m-%d %H:%M:%S.%f'))
)

#Keep line F inside the (open) time window covered by the hf data, ordered by time
mask = buses_data.datetime.gt(hf_buses_data.datetime.min()) & buses_data.datetime.lt(hf_buses_data.datetime.max())
buses_data = (
    buses_data.loc[(buses_data.line=='F') & mask]
    .sort_values(by=['datetime'])
    .reset_index(drop=True)
)

#Show first five rows
buses_data.head()

Unnamed: 0,bus,line,stop,datetime,destination,estimateArrive,DistanceBus,given_coords,lat,lon
0,8603,F,1693,2020-03-09 10:15:20.338493,CIUDAD UNIVERSITARIA,490,2265,1,40.44843293,-3.734188718
1,8606,F,1693,2020-03-09 10:15:20.338493,CIUDAD UNIVERSITARIA,341,1491,1,40.449985615,-3.734595387
2,8608,F,3276,2020-03-09 10:15:20.844635,CUATRO CAMINOS,383,1579,1,40.447668581,-3.726714627
3,8605,F,3276,2020-03-09 10:15:20.844635,CUATRO CAMINOS,605,1264,1,40.446392521,-3.704028544
4,8606,F,4289,2020-03-09 10:15:20.852928,CIUDAD UNIVERSITARIA,658,2945,1,40.449985615,-3.734595387


In [43]:
#Stops that appear in the high frequency experiment data (unique values of its `stop` column)
stops_in_experiment = hf_buses_data.stop.unique().tolist()

In [44]:
#Buses in experiment, split by the given_coords flag:
#  buses_calc  -> buses with at least one row where given_coords == 0
#  buses_given -> every other bus in the data
#presumably given_coords == 0 marks rows whose coordinates were not supplied by the API - TODO confirm
buses_calc = hf_buses_data.loc[hf_buses_data.given_coords == 0].bus.unique().tolist()
buses_given = hf_buses_data.loc[~hf_buses_data.bus.isin(buses_calc)].bus.unique().tolist()

In [66]:
#FUNCTIONS
def haversine(coord1, coord2):
    '''
    Great-circle distance in meters between two (lat, lon) pairs given in degrees
    '''
    earth_radius_m = 6372800  # Earth radius in meters
    (lat_a, lon_a), (lat_b, lon_b) = coord1, coord2

    half_dlat = math.radians(lat_b - lat_a)/2
    half_dlon = math.radians(lon_b - lon_a)/2
    cos_product = math.cos(math.radians(lat_a))*math.cos(math.radians(lat_b))

    # Haversine term; the distance is the central angle scaled by the radius
    a = math.sin(half_dlat)**2 + cos_product*math.sin(half_dlon)**2
    central_angle = 2*math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return earth_radius_m*central_angle

def find_nearest_row_by_time(df, time):
    """
    Returns the row nearest to the time passed in the dataframe

        Parameters
        ----------
        df : dataframe
            Dataframe where we want to find the row. Needs a 'datetime' column
        time : datetime

        Returns
        -------
        The row (as yielded by df.itertuples()) whose datetime is closest to time.
        On ties the first such row is returned.

        Raises
        ------
        ValueError
            If no row of df has a datetime that can be compared (e.g. df is empty)
    """
    nearest_row = None
    # Start from infinity so the nearest row is found no matter how far away it is
    min_seconds_error = math.inf
    for row in df.itertuples() :
        error = abs((row.datetime-time).total_seconds())
        if  error < min_seconds_error :
            min_seconds_error = error
            nearest_row = row
    if nearest_row is None :
        raise ValueError('No row with a valid datetime to compare against {}'.format(time))
    return nearest_row

def find_nearest_row(df,lat,lon) :
    """
    Returns the row nearest to the coordinates passed in the dataframe

    Rows are ranked by squared distance on a local flat projection: longitude
    differences are scaled by cos(lat) so that a degree of longitude and a degree
    of latitude weigh the same on the ground.

        Parameters
        ----------
        df : dataframe
            Dataframe where we want to find the row. Needs 'lat' and 'lon' columns
        lat: float
        lon: float

        Returns
        -------
        The row (as yielded by df.itertuples()) closest to (lat, lon).
        On ties the first such row is returned.

        Raises
        ------
        ValueError
            If no row of df has valid coordinates (e.g. df is empty)
    """
    lon_scale = math.cos(math.radians(lat))
    nearest_row = None
    min_sq_error = math.inf
    for row in df.itertuples() :
        d_lat = row.lat - lat
        d_lon = (row.lon - lon)*lon_scale
        # Squared distance is enough to rank the rows; no square root needed
        sq_error = d_lat*d_lat + d_lon*d_lon
        if  sq_error < min_sq_error :
            min_sq_error = sq_error
            nearest_row = row
    if nearest_row is None :
        raise ValueError('No row with valid coordinates to compare against ({}, {})'.format(lat,lon))
    return nearest_row

def find_nearest_row_by_dist(df,dist_traveled) :
    """
    Returns the row nearest to the distance traveled passed in the dataframe

        Parameters
        ----------
        df : dataframe
            Dataframe where we want to find the row. Needs a 'dist_traveled' column
        dist_traveled : float

        Returns
        -------
        The row (as yielded by df.itertuples()) whose dist_traveled is closest to
        the value passed. On ties the first such row is returned.

        Raises
        ------
        ValueError
            If no row of df has a valid dist_traveled (e.g. df is empty)
    """
    nearest_row = None
    # Start from infinity so a row is found even for values far outside the line range
    min_dist_error = math.inf
    for row in df.itertuples() :
        error = abs(row.dist_traveled-dist_traveled)
        if  error < min_dist_error :
            min_dist_error = error
            nearest_row = row
    if nearest_row is None :
        raise ValueError('No row with a valid dist_traveled to compare against {}'.format(dist_traveled))
    return nearest_row

def nearest_point_on_line (line, bus_lat, bus_lon) :
    """
    Snaps a position onto the line: returns the coordinates of the line point
    closest to the position passed, as a (lon, lat) pair

        Parameters
        ----------
        line: DataFrame
            Points belonging to the requested line
        bus_lat : float
        bus_lon : float
    """
    closest = find_nearest_row(line,bus_lat,bus_lon)
    return closest.lon,closest.lat

def distance_on_line (line, lat1, lon1, lat2, lon2) :
    """
    Returns the distance along the line between two points: each point is snapped
    to its nearest line point and the absolute difference of their 'dist_traveled'
    values is returned

        Parameters
        ----------
        line: DataFrame
            Points belonging to the requested line
        lat1 : float
        lon1 : float
        lat2 : float
        lon2 : float
    """
    row_a, row_b = (
        find_nearest_row(line,point_lat,point_lon)
        for point_lat,point_lon in ((lat1,lon1),(lat2,lon2))
    )
    return abs(row_a.dist_traveled-row_b.dist_traveled)

#Get bus estimated coords on the line
def point_by_distance_on_line (line,distance,stop_lat,stop_lon) :
    """
    Returns the (lon, lat) of the line point located `distance` before the stop,
    measured with the line's 'dist_traveled' values

        Parameters
        ----------
        line: DataFrame
            Points belonging to the requested line
        distance : float
            Distance of the bus to the stop in meters
        stop_lat : float
        stop_lon : float
    """
    #Line point of the stop, then walk back `distance` along the line
    stop_row = find_nearest_row(line,stop_lat,stop_lon)
    bus_row = find_nearest_row_by_dist(line,stop_row.dist_traveled - distance)
    return bus_row.lon,bus_row.lat

## Phone tracked location data
To check how good the data given by the API is, I tracked my phone's GPS location every 1 second while I was inside the bus. The tracked data is loaded here to carry out various analyses.

In [46]:
#Read gpx file and turn it into a dataframe
def read_gpx_to_df (f) :
    """
    Reads a GPX file and returns its points as a dataframe

        Parameters
        ----------
        f : string
            Path of the GPX file

        Returns
        -------
        DataFrame
            - If the file has tracks: one row per point of the first segment of the
              first track, with columns datetime, lon, lat, alt and speed. Timestamps
              are converted to UTC, made timezone-naive and shifted one hour
              to get the Madrid time.
            - Otherwise: one row per waypoint, with columns stop, datetime, lon, lat
              and alt. Timestamps are converted to UTC and made timezone-naive, but
              NO hour is added.
    """
    #The context manager closes the file even if parsing fails
    with open(f, 'r') as gpx_file :
        gpx = gpxpy.parse(gpx_file)

    #Track type file
    if len(gpx.tracks) != 0 :
        segment = gpx.tracks[0].segments[0]
        #Collect all rows first and build the dataframe once (no row-by-row append)
        records = [
            {'datetime' : point.time, 'lon': point.longitude, 'lat' : point.latitude, 'alt' : point.elevation, 'speed' : segment.get_speed(point_idx)}
            for point_idx, point in enumerate(segment.points)
        ]
        df = pd.DataFrame(records, columns=['datetime','lon', 'lat', 'alt','speed'])
        #We remove timezone data and add one hour to get the Madrid time
        df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        df['datetime'] = df['datetime'].dt.tz_localize(None) + datetime.timedelta(hours=1)
    #Waypoints type file
    else :
        records = [
            {'stop': point.name, 'datetime' : point.time, 'lon': point.longitude, 'lat' : point.latitude, 'alt' : point.elevation}
            for point in gpx.waypoints
        ]
        df = pd.DataFrame(records, columns=['stop','datetime','lon', 'lat', 'alt'])
        #We only remove timezone data here
        #NOTE(review): unlike the track branch no hour is added - confirm the waypoint
        #times are already recorded in Madrid time
        df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        df['datetime'] = df['datetime'].dt.tz_localize(None)

    return df

#Track files recorded inside bus 8607
path = 'TrackedLocations'
#'**' only matches recursively when it is a whole path component, so it needs its own
#separators: path + "**/..." behaved like a plain 'TrackedLocations*' wildcard
files = glob.glob(path + "/**/8607-*.gpx", recursive=True)
dfs = [read_gpx_to_df(f) for f in files]

#One dataframe with every tracked point, ordered by time
bus_tracked = pd.concat(dfs).sort_values(by=['datetime']).reset_index(drop=True)
bus_tracked

Unnamed: 0,datetime,lon,lat,alt,speed
0,2020-03-09 10:55:02.470,-3.705201047,40.446766209,816.793459202,58.960653598
1,2020-03-09 10:55:04.000,-3.705089997,40.447005137,765.020535761,41.228519969
2,2020-03-09 10:55:06.000,-3.705314968,40.446730946,795.243950676,12.350089443
3,2020-03-09 10:55:08.000,-3.705333657,40.446714652,795.243950676,1.132441969
4,2020-03-09 10:55:10.000,-3.705327954,40.446733216,795.243950676,0.696652090
...,...,...,...,...,...
307,2020-03-09 11:05:40.000,-3.727766774,40.452113618,691.853407636,6.321058970
308,2020-03-09 11:05:42.000,-3.727766774,40.452113618,691.853407636,0.727502469
309,2020-03-09 11:05:44.000,-3.727757416,40.452124022,692.237337837,1.203284841
310,2020-03-09 11:05:46.000,-3.727751159,40.452150828,693.683540501,1.705913498


In [47]:
#Stops file of the tracked trip, loaded with the same reader as the track files
#(the output has a `stop` column, i.e. the reader took its waypoints branch)
stops_tracked = read_gpx_to_df('TrackedLocations/Stops-8607.gpx')
stops_tracked

Unnamed: 0,stop,datetime,lon,lat,alt
0,1418,2020-03-09 10:57:11,-3.707847145,40.44707172,712.786432503
1,1416,2020-03-09 10:58:15,-3.710908012,40.446661015,692.744076421
2,2712,2020-03-09 10:59:33,-3.713919237,40.446402323,699.295760576
3,3278,2020-03-09 11:01:36,-3.717195132,40.449164682,669.566603916
4,5140,2020-03-09 11:02:29,-3.720670464,40.448679015,657.647485598
5,3292,2020-03-09 11:03:13,-3.724208792,40.447888805,642.311820455
6,3274,2020-03-09 11:03:57,-3.725775755,40.448534444,627.487182634
7,1693,2020-03-09 11:04:57,-3.727532982,40.45003255,636.852775836
8,4285,2020-03-09 11:05:44,-3.727714455,40.452011667,636.014593012


In [75]:
#Token and styles for the mapbox api
#NOTE(review): the access token is hard-coded in the notebook; consider loading it from
#an environment variable or a config file that is not committed
mapbox_access_token = 'pk.eyJ1IjoiYWxlanAxOTk4IiwiYSI6ImNrNnFwMmM0dDE2OHYzZXFwazZiZTdmbGcifQ.k5qPtvMgar7i9cbQx1fP0w'
style_day = 'mapbox://styles/alejp1998/ck6z9mohb25ni1iod4sqvqa0d'

#Data: tracked path of the bus and tracked stop positions
#(lats, lons, the token and the style are reused by the path comparison map further down)
lats = bus_tracked.lat.tolist()
lons = bus_tracked.lon.tolist()
stop_lats = stops_tracked.lat.tolist()
stop_lons = stops_tracked.lon.tolist()
stop_names = stops_tracked.stop.tolist()

#We create the figure object
fig = go.Figure()

#Add the tracked path as a line (empty hover text)
fig.add_trace(go.Scattermapbox(
    lat=lats,
    lon=lons,
    mode='lines',
    line=dict(width=2, color='purple'),
    text='',
    hoverinfo='text'
))

#Add the tracked stops to the figure as markers labelled with the stop name
fig.add_trace(go.Scattermapbox(
    lat=stop_lats,
    lon=stop_lons,
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=5,
        color='green',
        opacity=0.7
    ),
    text=['Stop: {}'.format(stop) for stop in stop_names],
    hoverinfo='text'
))

#And set the figure layout; the map is centered on the mean tracked position
#NOTE(review): the title text does not describe this figure (tracked path and stops),
#it looks copied from another plot - confirm and reword
fig.update_layout(
    title='Pair bus coords when it arrives to stop vs the coords of that stop',
    height=500,
    margin=dict(r=0, l=0, t=0, b=0),
    hovermode='closest',
    showlegend=False,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=statistics.mean(lats),
            lon=statistics.mean(lons)
        ),
        pitch=0,
        zoom=14,
        style=style_day
    )
)

## Arrival time of the bus
We cannot determine the real moment when the bus arrived at each stop, but we can develop an estimator for that time. Here we are going to choose that estimator and try to assess its quality.

In [76]:
#ETAs Figures Building
selected_stop = 4285

fs_df = hf_buses_data.loc[hf_buses_data.stop == selected_stop]
fs_buses = fs_df['bus'].unique().tolist()

fig1 = go.Figure() #Distances of bus to stop
fig2 = go.Figure() #ETAs
fig3 = go.Figure() #Error in ETAs
for bus in fs_buses :
    fs_bus_df = fs_df.loc[fs_df['bus']==bus]
    line = fs_bus_df.iloc[0]['line']
    bus_times = fs_bus_df.datetime.tolist()
    bus_dists = [dist/1000 for dist in fs_bus_df['DistanceBus'].tolist()] #km
    bus_etas = [eta/60 for eta in fs_bus_df['estimateArrive'].tolist()] #minutes
    trace_name = '{}-{}'.format(line,bus)

    #A new trip starts wherever the remaining distance jumps up by more than 1 km
    trip_starts = [0] + [i for i in range(1,len(bus_dists)) if (bus_dists[i] - bus_dists[i-1])>1]
    #Each trip ends where the next one starts; the last trip runs to the end of the data
    trip_ends = trip_starts[1:] + [len(bus_dists)]

    for start,end in zip(trip_starts,trip_ends) :
        trip_times = bus_times[start:end]
        fig1.add_trace(go.Scatter(x=trip_times, y=bus_dists[start:end], name=trace_name,
                                 line=dict(width=1)))
        fig2.add_trace(go.Scatter(x=trip_times, y=bus_etas[start:end], name=trace_name,
                                 line=dict(width=1)))

        #The ETA error needs a reference arrival time: the last sample of the trip is used,
        #but only when the bus was at most 1 minute away from the stop at that sample.
        #Errors are plotted against the times of their OWN trip, so trips without errors
        #cannot shift or break the remaining traces.
        if bus_etas[end-1] <= 1 :
            last_time = bus_times[end-1]
            #error = (sample time + ETA) - reference arrival time, in minutes
            trip_errors = [
                ((bus_times[k] + timedelta(minutes=bus_etas[k])) - last_time).total_seconds()/60
                for k in range(start,end)
            ]
            fig3.add_trace(go.Scatter(x=trip_times, y=trip_errors, name=trace_name,
                                     line=dict(width=1)))

# Edit the layout
fig1.update_layout(title='Distance remaining for the buses heading stop {}'.format(selected_stop),
                   xaxis_title='Time',
                   yaxis_title='DISTANCE (km)')
# Edit the layout
fig2.update_layout(title='ETAs for the buses heading stop {}'.format(selected_stop),
                   xaxis_title='Time',
                   yaxis_title='ETA (minutes)')
# Edit the layout
#NOTE(review): as computed above the error is POSITIVE when the predicted arrival is later
#than the reference arrival (bus arrived sooner than predicted); confirm the title wording
fig3.update_layout(title='Error in ETAs for the buses heading stop {}. Positive or negative if it arrives later or sooner than expected, respectively'.format(selected_stop),
                   xaxis_title='Time',
                   yaxis_title='ETA ERROR (minutes)')

#Show distance remaining figure
fig1.show()

In [77]:
#Show ETA figure (fig2, built in the ETAs figures cell together with fig1 and fig3)
fig2.show()

In [78]:
#Show ETA error figure (fig3, built in the ETAs figures cell together with fig1 and fig2)
fig3.show()

In [79]:
#Destinations dictionary: line id (as a string) -> pair of destination names.
#add_arrival_time_estim maps the SECOND name of each pair to direction '1' and the
#first name to direction '2' when it looks the stops up in line_stops_dict.
destinations = {
    '1' : ('CRISTO REY', 'PROSPERIDAD'),
    '82' : ('MONCLOA', 'PITIS'),
    '132' : ('MONCLOA', 'HOSPITAL LA PAZ'),
    '91' : ('CUATRO CAMINOS', 'CIUDAD UNIVERSITARIA'),
    '92' : ('MONCLOA', 'CIUDAD UNIVERSITARIA'),
    '99' : ('AVENIDA DE SENECA', 'PARANINFO'),
    '502' : ('PLAZA DE CIBELES', 'VALDEBEBAS'),
    '506' : ('PLAZA DE CIBELES', 'LAS ROSAS')
}
#Quick lookup check: displays the first destination of line 91
destinations['91'][0]

'CUATRO CAMINOS'

In [183]:
def _tag_bus_trips(df_bus,threshold) :
    '''
    Splits the rows of one bus heading to one stop into trips and returns a list with one
    dataframe per trip, each with the columns day_trip and arrival_time added.
    '''
    distances = df_bus.DistanceBus.tolist()
    days = df_bus.datetime.dt.date.tolist()
    n_rows = len(distances)

    #A new trip starts when the remaining distance jumps up by more than 1000 m
    #or when the calendar day changes
    trip_starts = [0] + [
        i for i in range(1,n_rows)
        if (distances[i] - distances[i-1]) > 1000 or days[i] != days[i-1]
    ]
    #Each trip ends where the next one starts; the last one includes the last row
    trip_ends = trip_starts[1:] + [n_rows]

    tagged_trips = []
    current_day, day_trip = None, 0
    for start,end in zip(trip_starts,trip_ends) :
        df_trip = df_bus.iloc[start:end]

        #Trip number inside the day (restarts at 1 on every new day)
        if days[start] == current_day :
            day_trip = day_trip + 1
        else :
            current_day = days[start]
            day_trip = 1

        #Get first row with estimateArrive < threshold seconds, or the row with the
        #smallest estimateArrive if the bus was never seen that close
        df_close = df_trip.loc[df_trip.estimateArrive<threshold]
        if df_close.shape[0] != 0 :
            row = df_close.sort_values(by='datetime',ascending=True).iloc[0]
        else :
            row = df_trip.loc[df_trip.estimateArrive==df_trip.estimateArrive.min()].iloc[0]

        #Assign arrival time and trip day
        tagged_trips.append(df_trip.assign(
            day_trip=day_trip,
            arrival_time=row.datetime + timedelta(seconds=int(row.estimateArrive))
        ))
    return tagged_trips

def add_arrival_time_estim(df,threshold) :
    '''
    Returns the dataframe with a new column with the estimation of the time when the bus has arrived the stop
    that is giving the estimation, and another column with the trip index in the day. The estimation 
    is based on the value of ''estimateArrive'' for the first row that is less than threshold 
    seconds away from the stop (or, if there is none, for the row with the smallest ''estimateArrive'').

    Rows whose line, destination or stop are not found through lines_shapes, destinations and
    line_stops_dict are not part of the result.

    May take a long time for big dataframes

    Parameters
    ----------------------
    df : The dataframe where we wish to add the column
    threshold : Maximum ''estimateArrive'' (seconds) for a row to be used as arrival reference

    Returns
    ----------------------
    A new dataframe sorted by datetime with the columns line, destination, stop, bus, day_trip,
    datetime, estimateArrive, DistanceBus, arrival_time, given_coords, lat and lon
    (empty if no trip was found)
    '''
    columns = ['line','destination','stop','bus','day_trip','datetime','estimateArrive','DistanceBus','arrival_time','given_coords','lat','lon']
    #List to add the trip dataframes
    trips = []
    #For each line of the bus.
    for line in df.line.unique().tolist() :
        df_line = df.loc[df.line == line]
        line_id = str(lines_shapes.loc[lines_shapes.line_sn == line].iloc[0].line_id)
        #For each destination in that line
        for dest in destinations[line_id] :
            direction = '1' if dest == destinations[line_id][1] else '2'
            #The mask is built from df_line itself, the frame it is applied to
            df_dest = df_line.loc[df_line.destination == dest]
            #For each stop in that line and destination
            for stop in line_stops_dict[line_id][direction] :
                df_stop = df_dest.loc[df_dest.stop == int(stop)]
                #For each bus in that line destination and stop
                for bus in df_stop.bus.unique().tolist() :
                    trips.extend(_tag_bus_trips(df_stop.loc[df_stop.bus == bus],threshold))

    if len(trips) == 0 :
        return pd.DataFrame(columns=columns)
    return pd.concat(trips).sort_values(by='datetime',ascending=True)[columns]

#Threshold (seconds) is arbitrarily set to half the interval between requests
#NOTE(review): 2.5 implies about 5 s between high frequency requests - confirm against the collector settings
hf_buses_data_arrival_times = add_arrival_time_estim(hf_buses_data,2.5)
hf_buses_data_arrival_times.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon
46,F,CIUDAD UNIVERSITARIA,4285,8606,1,2020-03-09 10:15:14.291579,392,1730,2020-03-09 10:20:04.324307,1,40.449985615,-3.734595387
47,F,CIUDAD UNIVERSITARIA,4285,8603,1,2020-03-09 10:15:14.291579,538,2629,2020-03-09 10:25:06.495087,1,40.44843293,-3.734188718
24,F,CUATRO CAMINOS,5316,8608,1,2020-03-09 10:15:14.331733,436,1975,2020-03-09 10:22:44.477872,1,40.447668581,-3.726714627
25,F,CUATRO CAMINOS,5316,8605,1,2020-03-09 10:15:14.331733,657,1637,2020-03-09 10:25:59.420481,1,40.446392521,-3.704028544
48,F,CUATRO CAMINOS,4284,8607,1,2020-03-09 10:15:14.336624,66,208,2020-03-09 10:16:39.406235,1,40.449713112,-3.735205562


In [188]:
#Threshold (seconds) is arbitrarily set to half the interval between requests
#NOTE(review): 27.5 implies about 55 s between normal requests - confirm against the collector settings
buses_data_arrival_times = add_arrival_time_estim(buses_data,27.5)
buses_data_arrival_times.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon
0,F,CIUDAD UNIVERSITARIA,1693,8603,1,2020-03-09 10:15:20.338493,490,2265,2020-03-09 10:24:41.278622,1,40.44843293,-3.734188718
1,F,CIUDAD UNIVERSITARIA,1693,8606,1,2020-03-09 10:15:20.338493,341,1491,2020-03-09 10:19:21.218566,1,40.449985615,-3.734595387
2,F,CUATRO CAMINOS,3276,8608,1,2020-03-09 10:15:20.844635,383,1579,2020-03-09 10:21:59.537811,1,40.447668581,-3.726714627
3,F,CUATRO CAMINOS,3276,8605,1,2020-03-09 10:15:20.844635,605,1264,2020-03-09 10:25:25.788645,1,40.446392521,-3.704028544
5,F,CIUDAD UNIVERSITARIA,4289,8603,1,2020-03-09 10:15:20.852928,736,3757,2020-03-09 10:30:07.327394,1,40.44843293,-3.734188718


In [189]:
def get_estim_errors(df,stops_df,bus=8607) :
    '''
    Returns a list with the arrival time estimation errors in seconds, one per row of
    stops_df and in the same order. Each error is (estimated arrival time - real
    arrival time), so it is negative when the estimation is earlier than the real arrival.

    Parameters
    ----------------------
    df : The dataframe with the estimations (needs the columns bus, stop, datetime and arrival_time)
    stops_df : The dataframe with the real stop arriving times (needs the columns stop and datetime)
    bus : The bus whose estimations are evaluated (defaults to the tracked bus, 8607)

    Only rows of df strictly between the first and last real arrival times are used, and
    the first of them for each stop gives the estimation. If there is no row for a stop
    its error is NaN, so the list always lines up with stops_df.
    '''
    mask = (df.datetime > stops_df.datetime.min()) & (df.datetime < stops_df.datetime.max())
    bus_rows = df.loc[(df.bus == bus)&mask]
    estim_errors = []
    for row in stops_df.itertuples():
        stop_rows = bus_rows.loc[(bus_rows.stop == int(row.stop))]
        if stop_rows.shape[0] == 0 :
            #No estimation available for this stop inside the tracked window
            estim_errors.append(float('nan'))
            continue
        estim_error_seconds = (stop_rows.iloc[0].arrival_time - row.datetime).total_seconds()
        estim_errors.append(estim_error_seconds)

    return estim_errors

#Arrival time errors (seconds) at each tracked stop, for the high frequency and the normal data
estim_errors_hf = get_estim_errors(hf_buses_data_arrival_times,stops_tracked)
estim_errors = get_estim_errors(buses_data_arrival_times,stops_tracked)

In [190]:
#We create the figure object: one bar per tracked stop and data source
x = ['Stop: {}'.format(stop) for stop in stops_tracked.stop.tolist()]

fig4 = go.Figure()
#Errors obtained from the high frequency data
fig4.add_trace(go.Bar(x=x,
    y=estim_errors_hf,
    name='HF'
))
#Errors obtained from the normal data
fig4.add_trace(go.Bar(x=x,
    y=estim_errors,
    name='Orig'
))

fig4.update_layout(
    height=500,
    margin=dict(r=0, l=0, t=0, b=0),
    yaxis=dict(
        title='Error in seconds between estimated and real arrival times'
    )
)
#SIGN: get_estim_errors computes (estimated arrival - tracked arrival), so a bar is NEGATIVE
#when the estimated arrival is earlier than the tracked one (the bus arrived LATER than estimated)
#NOTE(review): the original note here said the opposite - confirm the intended convention

## Calculated coords of the bus
As the coordinates given officially by the API seem to be of very bad quality, we are going to build an estimator for them, and then check its quality.

In [127]:
def nearest_time_rows(df,df_tracked,stop,bus=8607) :
    """
    Pairs the rows of one bus heading to one stop with the tracked positions nearest in time

        Parameters
        ----------
        df : DataFrame
            API data (needs the columns bus, stop and datetime)
        df_tracked : DataFrame
            Tracked positions (needs the columns datetime, lat and lon)
        stop : int
            Stop whose rows are selected
        bus : int
            Bus whose rows are selected (defaults to the tracked bus, 8607)

        Returns
        -------
        DataFrame
            The selected rows that are the nearest in time to some tracked point, with the
            coordinates of that point in the columns tracked_lat and tracked_lon. When
            consecutive tracked points share the same nearest row, only the first is kept.

        Raises
        ------
        ValueError
            If df has no rows for that bus and stop inside the tracked time window
    """
    #Loc the values: only rows strictly inside the tracked time window
    mask = (df.datetime > df_tracked.datetime.min()) & (df.datetime < df_tracked.datetime.max())
    bus_rows = df.loc[(df.bus == bus) & (df.stop == stop) & mask].sort_values(by='datetime',ascending=True)
    if bus_rows.shape[0] == 0 :
        raise ValueError('No rows for bus {} and stop {} inside the tracked time window'.format(bus,stop))
    #Find the nearest rows
    nearest_rows,tracked_lats,tracked_lons = [],[],[]
    last_nearest_row = 0
    for row in df_tracked.itertuples():
        nearest_time_row = find_nearest_row_by_time(bus_rows,row.datetime)
        #Avoid adding duplicate nearest rows
        if last_nearest_row != nearest_time_row :
            tracked_lats.append(row.lat)
            tracked_lons.append(row.lon)
            nearest_rows.append(nearest_time_row)
            last_nearest_row = nearest_time_row

    #The rows come from itertuples(), so their 'Index' field is dropped
    new_df = pd.DataFrame(nearest_rows).drop('Index',axis=1)
    new_df['tracked_lat'] = tracked_lats
    new_df['tracked_lon'] = tracked_lons
    return new_df

#Get nearest rows for the specified bus and stop (high frequency data, stop 4285)
hf_buses_data_path = nearest_time_rows(hf_buses_data_arrival_times,bus_tracked,4285)
hf_buses_data_path.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon,tracked_lat,tracked_lon
0,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:04.581986,528,2470,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446766209,-3.705201047
1,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:09.645939,523,2447,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446714652,-3.705333657
2,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:14.528373,518,2423,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446738472,-3.705345736
3,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:19.624826,513,2400,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446714881,-3.705359969
4,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:24.663989,508,2377,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446733719,-3.705382652


In [128]:
#Same pairing with the tracked positions for the normal data (stop 4285)
buses_data_path = nearest_time_rows(buses_data_arrival_times,bus_tracked,4285)
buses_data_path.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon,tracked_lat,tracked_lon
0,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:56:36.842317,436,2439,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446766209,-3.705201047
1,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:57:31.573753,407,2277,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.447099769,-3.707907606
2,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:58:26.429938,352,1969,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446733384,-3.710678134
3,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:59:21.567477,292,1779,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446470511,-3.711878042
4,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 11:00:16.775696,264,1666,2020-03-09 11:05:46.442907,1,40.446784692,-3.703300035,40.447093914,-3.714205989


In [136]:
def get_distance_diff(df_tracked,df) :
    '''
    Returns a list with, for each row of df, the distance in meters between the coordinates
    given in the row (lat, lon) and the tracked coordinates paired with it
    (tracked_lat, tracked_lon). df_tracked is not used.
    '''
    return [
        haversine((row.lat,row.lon),(row.tracked_lat,row.tracked_lon))
        for row in df.itertuples()
    ]

#Distance (meters) between the coordinates given by the API and the tracked ones, high frequency data
hf_buses_data_path['dist_to_tracked'] = get_distance_diff(bus_tracked,hf_buses_data_path)
hf_buses_data_path.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon,tracked_lat,tracked_lon,dist_to_tracked
0,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:04.581986,528,2470,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446766209,-3.705201047,73.487511968
1,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:09.645939,523,2447,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446714652,-3.705333657,85.182672381
2,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:14.528373,518,2423,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446738472,-3.705345736,85.928128128
3,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:19.624826,513,2400,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446714881,-3.705359969,87.391626623
4,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:55:24.663989,508,2377,2020-03-09 11:05:30.699917,1,40.44680536,-3.70433438,40.446733719,-3.705382652,89.087265487


In [137]:
#Distance (meters) between the coordinates given by the API and the tracked ones, normal data
buses_data_path['dist_to_tracked'] = get_distance_diff(bus_tracked,buses_data_path)
buses_data_path.head()

Unnamed: 0,line,destination,stop,bus,day_trip,datetime,estimateArrive,DistanceBus,arrival_time,given_coords,lat,lon,tracked_lat,tracked_lon,dist_to_tracked
0,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:56:36.842317,436,2439,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446766209,-3.705201047,73.487511968
1,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:57:31.573753,407,2277,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.447099769,-3.707907606,304.219738447
2,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:58:26.429938,352,1969,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446733384,-3.710678134,537.022025875
3,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 10:59:21.567477,292,1779,2020-03-09 11:05:46.442907,1,40.44680536,-3.70433438,40.446470511,-3.711878042,639.614134121
4,F,CIUDAD UNIVERSITARIA,4285,8607,2,2020-03-09 11:00:16.775696,264,1666,2020-03-09 11:05:46.442907,1,40.446784692,-3.703300035,40.447093914,-3.714205989,923.764410596


In [139]:
#Distance between points of paths over time: dist_to_tracked of each data source
#plotted against the datetime of its rows
fig6 = go.Figure()
#High frequency data
fig6.add_trace(go.Scatter(
    x=hf_buses_data_path.datetime, 
    y=hf_buses_data_path.dist_to_tracked, 
    mode='lines',
    name='HF',
    line=dict(width=2)
))
#Normal data
fig6.add_trace(go.Scatter(
    x=buses_data_path.datetime, 
    y=buses_data_path.dist_to_tracked, 
    mode='lines',
    name='Orig',
    line=dict(width=2)
))

fig6.update_layout(
    height=500,
    margin=dict(r=0, l=0, t=0, b=0),
    yaxis=dict(
        title='Error in meters between real path and given path'
    )
)

In [145]:
#We create the figure object: tracked path against the paths given by the API
fig7 = go.Figure()
#Real path (lats/lons of the tracked bus, defined in the tracked path map cell)
fig7.add_trace(go.Scattermapbox(
    name='Tracked path',
    lat=lats,
    lon=lons,
    mode='lines',
    line=dict(width=2, color='purple'),
    text='Real path',
    hoverinfo='text'
))
#Exp path: coordinates given in the high frequency data
fig7.add_trace(go.Scattermapbox(
    name='HF path',
    lat=hf_buses_data_path.lat,
    lon=hf_buses_data_path.lon,
    mode='lines',
    line=dict(width=2, color='blue'),
    text='Experiment path',
    hoverinfo='text'
))
#Orig path: coordinates given in the normal data
fig7.add_trace(go.Scattermapbox(
    name='Original path',
    lat=buses_data_path.lat,
    lon=buses_data_path.lon,
    mode='lines',
    line=dict(width=2, color='green'),
    text='Original path',
    hoverinfo='text'
))

#And set the figure layout; the map is centered on the mean tracked position
fig7.update_layout(
    title='Tracked path compared to given paths',
    height=500,
    margin=dict(r=0, l=0, t=50, b=0),
    hovermode='closest',
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=statistics.mean(lats),
            lon=statistics.mean(lons)
        ),
        pitch=0,
        zoom=14,
        style=style_day
    )
)

### Now that we know how inaccurate the location given by the API is, we are going to compare the real path with the path obtained from the position estimator based on the DistanceBus attribute.

In [155]:
def get_calculated_coords(df, bus_tracked, stop_id=4285, line=None):
    """
    Add position-estimator coordinates and their error to every row of df.

    For each row, point_by_distance_on_line is called with the line shape,
    the row's DistanceBus and the coordinates of the reference stop. The
    returned point is stored, and haversine is evaluated between the row's
    tracked coordinates and that point.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns DistanceBus, tracked_lat and tracked_lon.
        It is modified in place (three columns are added).
    bus_tracked : any
        Unused by this function; kept in the signature so existing calls
        keep working.
    stop_id : int, optional
        Id of the reference stop looked up in `stops`. Defaults to 4285,
        the value that was previously hard-coded.
    line : DataFrame, optional
        Line shape handed to point_by_distance_on_line. Defaults to line91_1,
        the shape that was previously hard-coded.

    Returns
    -------
    DataFrame
        The same df object with the columns calc_lat, calc_lon and dist_calc.

    Raises
    ------
    IndexError
        If no row of `stops` has the requested id.
    """
    if line is None:
        line = line91_1
    stop = stops.loc[stops.id == stop_id].iloc[0]
    # The stop coordinates do not change between rows, so read them once
    stop_lat, stop_lon = stop.lat, stop.lon

    calculated_lats, calculated_lons = [], []
    dist_calc = []

    for row in df.itertuples():
        # Estimated coordinates; the helper's result is unpacked as (lon, lat)
        calc_point_lon, calc_point_lat = point_by_distance_on_line(line, row.DistanceBus, stop_lat, stop_lon)
        calculated_lats.append(calc_point_lat)
        calculated_lons.append(calc_point_lon)

        # Error of the estimate: distance between the tracked coords and the calculated coords
        dist_calc.append(haversine((row.tracked_lat, row.tracked_lon), (calc_point_lat, calc_point_lon)))

    df['calc_lat'] = calculated_lats
    df['calc_lon'] = calculated_lons
    df['dist_calc'] = dist_calc
    return df

def get_tracked_distance(df, stop_id=4285, line=None):
    """
    Add the distance from every row's coordinates to the reference stop.

    Two values are computed per row: haversine between the row's (lat, lon)
    and the stop's (lat, lon), and the result of distance_on_line for the
    same pair of points on the given line shape.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns lat and lon. It is modified in place
        (two columns are added).
    stop_id : int, optional
        Id of the reference stop looked up in `stops`. Defaults to 4285,
        the value that was previously hard-coded.
    line : DataFrame, optional
        Line shape handed to distance_on_line. Defaults to line91_1,
        the shape that was previously hard-coded.

    Returns
    -------
    DataFrame
        The same df object with the columns dist_to_stop and
        dist_to_stop_on_line.

    Raises
    ------
    IndexError
        If no row of `stops` has the requested id.
    """
    if line is None:
        line = line91_1
    stop = stops.loc[stops.id == stop_id].iloc[0]
    # The stop coordinates do not change between rows, so read them once
    stop_lat, stop_lon = stop.lat, stop.lon

    dist_to_stop, dist_to_stop_on_line = [], []
    for row in df.itertuples():
        # Point-to-point distance between the row's coords and the stop
        dist_to_stop.append(haversine((row.lat, row.lon), (stop_lat, stop_lon)))
        # Distance between the same two points as reported by distance_on_line for this line shape
        dist_to_stop_on_line.append(distance_on_line(line, row.lat, row.lon, stop_lat, stop_lon))

    df['dist_to_stop'] = dist_to_stop
    df['dist_to_stop_on_line'] = dist_to_stop_on_line
    return df

# Add the calculated-coordinate columns to both API frames (HF first, then original),
# then add the distance-to-stop columns to the tracked frame.
hf_buses_data_path, buses_data_path = (
    get_calculated_coords(api_frame, bus_tracked)
    for api_frame in (hf_buses_data_path, buses_data_path)
)
bus_tracked = get_tracked_distance(bus_tracked)

In [158]:
# Estimator error and distance-to-stop series over time
fig8 = go.Figure()

# One vertical gold dash-dot line per row of stops_tracked, placed at that row's datetime
for row in stops_tracked.itertuples():
    fig8.add_shape(dict(
        type='line',
        x0=row.datetime, x1=row.datetime,
        y0=0, y1=2750,
        line=dict(color='gold', width=2, dash='dashdot')
    ))

fig8.add_traces([
    go.Scatter(
        x=series_x,
        y=series_y,
        mode='lines',
        name=legend_label,
        line=dict(width=2)
    )
    # (x values, y values, legend name) for each line, in drawing order
    for series_x, series_y, legend_label in [
        (hf_buses_data_path.datetime, hf_buses_data_path.dist_calc, 'HF - Error in calculated coords'),
        (buses_data_path.datetime, buses_data_path.dist_calc, 'Orig - Error in calculated coords'),
        (hf_buses_data_path.datetime, hf_buses_data_path.DistanceBus, 'HF - Distance to stop given by the API '),
        (buses_data_path.datetime, buses_data_path.DistanceBus, 'Orig - Distance to stop given by the API '),
        (bus_tracked.datetime, bus_tracked.dist_to_stop_on_line, 'Real distance to stop on line'),
        (bus_tracked.datetime, bus_tracked.dist_to_stop, 'Real euclidean distance to stop'),
    ]
])

# Kept as the last expression so the notebook renders the figure
fig8.update_layout(
    height=500,
    margin=dict(r=0, l=0, t=0, b=0),
    xaxis=dict(title='Time')
)

In [160]:
# Map figure comparing the tracked path with the paths rebuilt by the position estimator.
# Colours and hover texts follow fig7: purple = tracked, blue = HF experiment, green = original data.
fig9 = go.Figure()
#Real path
fig9.add_trace(go.Scattermapbox(
    name='Tracked path',
    lat=lats,
    lon=lons,
    mode='lines',
    line=dict(width=2, color='purple'),
    text='Real path',
    hoverinfo='text'
))
#Exp path (calculated from the HF experiment data)
fig9.add_trace(go.Scattermapbox(
    name='HF - Calculated path',
    lat=hf_buses_data_path.calc_lat,
    lon=hf_buses_data_path.calc_lon,
    mode='lines',
    line=dict(width=2, color='blue'),
    text='Experiment path',
    hoverinfo='text'
))
#Orig path (calculated from the original data)
fig9.add_trace(go.Scattermapbox(
    name='Orig- Calculated path',
    lat=buses_data_path.calc_lat,
    lon=buses_data_path.calc_lon,
    mode='lines',
    line=dict(width=2, color='green'),
    text='Original path',
    hoverinfo='text'
))

#And set the figure layout
fig9.update_layout(
    # The title describes what is actually plotted: tracked path vs. calculated paths
    title='Tracked path compared to calculated paths',
    height=500,
    # Top margin leaves room for the title (t=0 left it no space)
    margin=dict(r=0, l=0, t=50, b=0),
    hovermode='closest',
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Centre the map on the mean of the tracked coordinates
        center=dict(
            lat=statistics.mean(lats),
            lon=statistics.mean(lons)
        ),
        pitch=0,
        zoom=14,
        style=style_day
    )
)