# CHARACTERIZATION OF COORDINATES AND REFRESH RATES OF THE GPS

In [64]:
import datetime
import glob
import math
import os
import statistics
from datetime import timedelta

import geopandas as gpd
import gpxpy
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import statsmodels.api as sm
from shapely.geometry import shape
from shapely.geometry import Point, LineString

pd.set_option("display.precision", 9)

In [24]:
# Route shapes: one row per itinerary, identified by line_id and direction
route_lines = gpd.read_file('M6Data/route_lines.json', geometry='geometry')
# line1 / line2 are row selections of the GeoDataFrame (not bare geometries),
# one per direction of line '91'
is_line_91 = route_lines['line_id'] == '91'
line1 = route_lines.loc[is_line_91 & (route_lines['direction'] == '1')]
line2 = route_lines.loc[is_line_91 & (route_lines['direction'] == '2')]
route_lines.head()

Unnamed: 0,itinerary_id,line_id,direction,orig_dist,dist,geometry
0,6_N_501____1__IT_1,501,1,84.91,20.429,"LINESTRING (-3.69234 40.41997, -3.69193 40.420..."
1,6_N_501____2__IT_1,501,2,42.932,13.472,"LINESTRING (-3.64667 40.48592, -3.64668 40.485..."
2,6_N_502____1__IT_1,502,1,60.927,19.403,"LINESTRING (-3.69269 40.41872, -3.69269 40.418..."
3,6_N_502____2__IT_1,502,2,96.684,21.195,"LINESTRING (-3.61127 40.49814, -3.61091 40.498..."
4,6_N_503____1__IT_1,503,1,31.595,14.011,"LINESTRING (-3.69269 40.41872, -3.69269 40.418..."


In [25]:
#Stops: loaded as a GeoDataFrame; the preview below shows one POINT geometry
#per stop together with its stop_code and descriptive fields
stops = gpd.read_file('M6Data/stops.json',geometry='geometry')
stops.head()

Unnamed: 0,stop_code,stop_name,stop_desc,zone_id,location_type,parent_station,wheelchair_boarding,geometry
0,161,Puerta de Alcalá,Plaza de la Independencia 3,A,0,,2,POINT (-3.68919 40.42069)
1,162,Retiro,Avda de Méjico SN,A,0,,2,POINT (-3.68826 40.41970)
2,164,Círculo de Bellas Artes,Calle Gran Vía 3,A,0,,2,POINT (-3.69715 40.41896)
3,168,Santo Domingo,Calle Gran Vía 56,A,0,,2,POINT (-3.70761 40.42159)
4,169,Santo Domingo,Calle Gran Vía 47,A,0,,2,POINT (-3.70746 40.42114)


In [26]:
# Experiment data: load, keep the columns of interest, filter and order by time
experiment = (
    pd.read_csv('experiment-F.csv')
    [['bus','line','stop','datetime','isHead','destination','request_time','estimateArrive','DistanceBus','given_coords','lat','lon']]
    # Drop rows with an ETA of 999999 or more, or with a negative distance
    .loc[lambda d: (d.estimateArrive < 999999) & (d.DistanceBus >= 0)]
    # Keep only coherent rows: distance and ETA are either both zero or both positive
    .loc[lambda d: ((d.DistanceBus == 0) & (d.estimateArrive == 0)) | ((d.DistanceBus > 0) & (d.estimateArrive > 0))]
    .assign(datetime=lambda d: pd.to_datetime(d['datetime'], format='%Y-%m-%d %H:%M:%S.%f'))
    .sort_values(by=['datetime'])
    # reset_index() without drop keeps the previous row labels in an 'index' column
    .reset_index()
)
experiment.head()

Unnamed: 0,index,bus,line,stop,datetime,isHead,destination,request_time,estimateArrive,DistanceBus,given_coords,lat,lon
0,47,8603,F,4285,2020-03-09 10:15:14.291579,False,CIUDAD UNIVERSITARIA,79,538,2629,1,40.44843293,-3.734188718
1,46,8606,F,4285,2020-03-09 10:15:14.291579,False,CIUDAD UNIVERSITARIA,79,392,1730,1,40.449985615,-3.734595387
2,25,8605,F,5316,2020-03-09 10:15:14.331733,False,CUATRO CAMINOS,101,657,1637,1,40.446392521,-3.704028544
3,24,8608,F,5316,2020-03-09 10:15:14.331733,False,CUATRO CAMINOS,101,436,1975,1,40.447668581,-3.726714627
4,49,8604,F,4284,2020-03-09 10:15:14.336624,False,CUATRO CAMINOS,90,186,844,1,40.447901752,-3.731329186


In [27]:
# Summary statistics of the numeric columns after filtering
experiment.describe()

Unnamed: 0,index,bus,stop,request_time,estimateArrive,DistanceBus,given_coords,lat,lon
count,53037.0,53037.0,53037.0,53037.0,53037.0,53037.0,53037.0,53037.0,53037.0
mean,27000.798668854,8606.9968324,3141.687369195,125.75584969,414.818786885,1398.682485812,0.936779984,40.44822114,-3.722790824
std,15597.665260748,5.266513948,1430.814592876,55.874756833,293.269266327,997.437848158,0.243360561,0.002015705,0.011793793
min,0.0,8603.0,185.0,55.0,0.0,0.0,0.0,40.423885752,-3.735309833
25%,13516.0,8605.0,1694.0,100.0,174.0,557.0,1.0,40.44682869,-3.733951549
50%,27001.0,8606.0,3276.0,118.0,389.0,1287.0,1.0,40.448202417,-3.726664463
75%,40499.0,8607.0,4288.0,141.0,617.0,2093.0,1.0,40.449713112,-3.711213798
max,54057.0,8631.0,5370.0,1179.0,1781.0,5753.0,1.0,40.453092765,-3.690537244


In [66]:
#Read gpx file and turn it into a dataframe
def read_gpx_to_df (f) :
    """
    Parse a GPX file into a DataFrame.

    Parameters
    ----------
    f : str
        Path of the GPX file.

    Returns
    -------
    pandas.DataFrame
        * Track files (at least one track): columns
          ``datetime, lon, lat, alt, speed`` built from the first segment of
          the first track; timestamps are made timezone-naive and shifted by
          one hour.
        * Waypoint files (no tracks): columns ``stop, datetime, lon, lat, alt``
          built from the waypoints; timestamps are made timezone-naive and
          NOT shifted.
    """
    # The context manager closes the handle even if parsing raises
    with open(f, 'r') as gpx_file :
        gpx = gpxpy.parse(gpx_file)

    #Track-type file
    if len(gpx.tracks) != 0 :
        segment = gpx.tracks[0].segments[0]
        columns = ['datetime','lon', 'lat', 'alt','speed']
        # Collect all rows first and build the frame once: appending row by row
        # is quadratic and DataFrame.append no longer exists in pandas 2.x
        records = [
            {'datetime' : point.time, 'lon': point.longitude, 'lat' : point.latitude,
             'alt' : point.elevation, 'speed' : segment.get_speed(point_idx)}
            for point_idx, point in enumerate(segment.points)
        ]
        #One hour is added to get the Madrid time
        offset = datetime.timedelta(hours=1)
    #Waypoint-type file
    else :
        columns = ['stop','datetime','lon', 'lat', 'alt']
        records = [
            {'stop': point.name, 'datetime' : point.time, 'lon': point.longitude,
             'lat' : point.latitude, 'alt' : point.elevation}
            for point in gpx.waypoints
        ]
        # NOTE(review): the original comment here also announced a one-hour
        # shift, but the code never applied it to waypoints. Behaviour is kept
        # as-is; confirm whether waypoint times should be shifted too.
        offset = datetime.timedelta(0)

    df = pd.DataFrame(records, columns=columns)
    # No explicit format: the original one ended in '%tz', which is not a valid
    # strptime directive. Presumably gpxpy already yields datetime objects, so
    # this call only normalises the column dtype - TODO confirm.
    df['datetime'] = pd.to_datetime(df['datetime'])
    #Remove the timezone information, then apply the per-branch offset
    df['datetime'] = df['datetime'].dt.tz_localize(None) + offset

    return df

path = 'TrackedLocations'
# The original pattern was path + "**/8607-*.gpx", i.e. 'TrackedLocations**/...'.
# '**' only recurses when it is a path component on its own, so that pattern
# behaved like 'TrackedLocations*/8607-*.gpx'. Joining the parts makes the
# recursive search actually happen; sorting keeps the file order deterministic.
files = sorted(glob.glob(os.path.join(path, '**', '8607-*.gpx'), recursive=True))
if not files :
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No GPX files matching '8607-*.gpx' found under '{}'".format(path))
dfs = [read_gpx_to_df(f) for f in files]


# Merge every file, order by time and keep only the track columns
bus_path = pd.concat(dfs).sort_values(by=['datetime']).reset_index(drop=True)[['datetime','lon','lat','alt','speed']]
# Time window covered by the tracked path
start_interval = bus_path.datetime.min()
end_interval = bus_path.datetime.max()
bus_path

Unnamed: 0,datetime,lon,lat,alt,speed
0,2020-03-09 10:55:02.470,-3.705201047,40.446766209,816.793459202,58.960653598
1,2020-03-09 10:55:04.000,-3.705089997,40.447005137,765.020535761,41.228519969
2,2020-03-09 10:55:06.000,-3.705314968,40.446730946,795.243950676,12.350089443
3,2020-03-09 10:55:08.000,-3.705333657,40.446714652,795.243950676,1.132441969
4,2020-03-09 10:55:10.000,-3.705327954,40.446733216,795.243950676,0.696652090
...,...,...,...,...,...
307,2020-03-09 11:05:40.000,-3.727766774,40.452113618,691.853407636,6.321058970
308,2020-03-09 11:05:42.000,-3.727766774,40.452113618,691.853407636,0.727502469
309,2020-03-09 11:05:44.000,-3.727757416,40.452124022,692.237337837,1.203284841
310,2020-03-09 11:05:46.000,-3.727751159,40.452150828,693.683540501,1.705913498


## GPS REFRESH RATE ANALYSIS

In [28]:
# Buses that have at least one row with given_coords == 0
buses_calc = experiment.loc[experiment['given_coords'] == 0, 'bus'].unique().tolist()
# Remaining buses: none of their rows has given_coords == 0
buses_given = experiment.loc[~experiment['bus'].isin(buses_calc), 'bus'].unique().tolist()

In [29]:
#Standard deviation of the location values
def std_buses (df) :
    """
    Standard deviation of the latitude and longitude of every bus.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``bus``, ``lat`` and ``lon``.

    Returns
    -------
    dict
        ``{bus: {'lat': std_of_lat, 'lon': std_of_lon}}`` with the buses in
        order of first appearance in ``df``.
    """
    def _spread(bus_id) :
        rows = df.loc[df.bus == bus_id]
        return {'lat': rows.lat.std(), 'lon': rows.lon.std()}

    return {bus_id: _spread(bus_id) for bus_id in df.bus.unique()}

# Per-bus standard deviation of lat/lon over the whole experiment
std_buses(experiment)

{8603: {'lat': 0.00022351454378411077, 'lon': 0.0022100123022477584},
 8606: {'lat': 0.0017144353065048283, 'lon': 0.007199309207092149},
 8605: {'lat': 0.0012394589646628768, 'lon': 0.01244060484221117},
 8608: {'lat': 0.0016839393040360205, 'lon': 0.009831825008321365},
 8604: {'lat': 0.0015192475686077963, 'lon': 0.012042683463117312},
 8607: {'lat': 0.0030549437449270164, 'lon': 0.013790367613385071},
 8631: {'lat': 0.001503898668693936, 'lon': 0.010153987450606715}}

In [30]:
#Number of times each latitude value is repeated for bus 8607
experiment.loc[experiment.bus == 8607].lat.value_counts()

40.449713112    5020
40.446805360    1231
40.446828690     555
40.446825857     492
40.446784692     368
                ... 
40.449151451       1
40.449974283       1
40.447158400       1
40.447195877       1
40.449190767       1
Name: lat, Length: 274, dtype: int64

In [31]:
#Mean time elapsed before a new location value is reported
def gps_update_time(df):
    """
    Measure, per bus, how long the reported position stays unchanged.

    Rows of each bus are walked in the order they appear in ``df``. Whenever
    the position (latitude OR longitude) differs from the last remembered one,
    the seconds elapsed since that position was first seen are recorded. Rows
    with ``DistanceBus == 0`` restart the timer without recording anything.

    Both coordinates are compared so that an update changing only the
    longitude is not missed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``bus``, ``datetime`` (timestamps), ``lat``, ``lon`` and
        ``DistanceBus``.

    Returns
    -------
    dict
        ``{bus: {'mean': mean_seconds, 'all': [seconds, ...]}}``; buses with
        no detected update are omitted.
    """
    update_time_buses_dict = {}
    for bus in df.bus.unique() :
        bus_df = df.loc[df.bus == bus]
        first_row = bus_df.iloc[0]
        last_lat, last_lon = first_row.lat, first_row.lon
        last_time = first_row.datetime
        update_times = []
        for row in bus_df[['datetime','lat','lon','DistanceBus']].itertuples(index=False) :
            if row.DistanceBus == 0 :
                # Zero distance to the stop: restart the measurement here
                last_lat, last_lon = row.lat, row.lon
                last_time = row.datetime
            elif (row.lat != last_lat) or (row.lon != last_lon) :
                update_times.append((row.datetime - last_time).total_seconds())
                last_lat, last_lon = row.lat, row.lon
                last_time = row.datetime
        if len(update_times) != 0 :
            update_time_buses_dict[bus] = {
                'mean': statistics.mean(update_times),
                'all': update_times,
            }

    return update_time_buses_dict

# Per-bus list (and mean) of the seconds elapsed between position updates
update_time_buses_dict = gps_update_time(experiment)

In [32]:
fig1 = go.Figure()
# One box per bus; boxmean='sd' also draws the mean and the standard deviation
for bus, update_stats in update_time_buses_dict.items() :
    fig1.add_trace(go.Box(y=update_stats['all'], name=f'Bus : {bus}', boxmean='sd'))

# Titles and axis labels
fig1.update_layout(
    title='Update times of gps (distance if the bus coords have been calculated)',
    xaxis_title='Update number',
    yaxis_title='Update time (seconds)',
)

fig1.show()

In [33]:
fig2 = go.Figure()
means = []
# Pool the update times of every bus listed in buses_given into a single sample
refresh_times = []
for bus in buses_given :
    if bus in update_time_buses_dict :
        refresh_times.extend(update_time_buses_dict[bus]['all'])

# Single box for the pooled sample; boxmean='sd' adds mean and standard deviation
pooled_box = go.Box(name='', y=refresh_times, boxmean='sd')
fig2.add_trace(pooled_box)

# Titles and axis labels
fig2.update_layout(
    title='Update times of gps (distance if the bus coords have been calculated)',
    xaxis_title='Update number',
    yaxis_title='Update time (seconds)',
)

fig2.show()

## Distance of the bus coordinates to the nearest point on the line

In [34]:
def haversine(coord1, coord2):
    '''
    Great-circle distance in meters between two coordinates.

    Both arguments are ``(latitude, longitude)`` pairs in degrees, in that
    order: the first element is used as the latitude in the formula.
    '''
    R = 6372800  # Earth radius in meters
    (lat1, lon1), (lat2, lon2) = coord1, coord2

    # Latitudes and the two angular differences, in radians
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)

    # Haversine term of the central angle
    lat_term = math.sin(dphi/2)**2
    lon_term = math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    a = lat_term + lon_term

    return 2*R*math.atan2(math.sqrt(a), math.sqrt(1 - a))

def nearest_point_on_line (line, bus_coords) :
    """
    Returns the coordinates of the point of the line closest to the bus
        Parameters
        ----------
        line : geometry
            The shape of the line that the bus belongs to
        bus_coords :
            The coordinates of the bus, in the same axis order as the line
            (they are passed unchanged to ``Point``)

        Returns
        -------
        tuple
            ``(x, y)`` of the projected point, i.e. the same axis order as
            ``bus_coords``
    """
    # Fraction of the line length (0..1) at which the bus projects onto the line
    fraction_along = line.project(Point(bus_coords), normalized=True)

    # Point located at that fraction of the line, measured from its start
    closest = line.interpolate(fraction_along, normalized=True)

    return (closest.x, closest.y)

In [35]:
destinations = ['CIUDAD UNIVERSITARIA','CUATRO CAMINOS']
def distance_to_line(df) :
    """
    Distance in meters from every reported bus position to its route line.

    For each row the route is chosen from that row's ``destination``
    ('CIUDAD UNIVERSITARIA' -> ``line1``, anything else -> ``line2``), the
    position is projected onto it and the haversine distance between the
    position and its projection is computed. Consecutive repeated distances
    are stored only once.

    Uses the notebook-level ``line1``, ``line2``, ``nearest_point_on_line``
    and ``haversine``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``bus``, ``destination``, ``lat`` and ``lon``.

    Returns
    -------
    dict
        ``{bus: {'mean': mean_distance, 'all': [distance, ...]}}``; buses
        without any distance are omitted.
    """
    # line1/line2 are GeoDataFrame selections. Extract the shapely geometry once
    # so the projection returns plain floats, not one-element Series.
    line_to_ciudad_universitaria = line1.iloc[0].geometry
    line_other_direction = line2.iloc[0].geometry

    distance_to_line_buses_dict = {}
    for bus in df.bus.unique() :
        bus_df = df.loc[df.bus == bus]
        distances_to_line = []
        # None (not 0) as the sentinel, so a genuine first distance of 0 is kept
        last_distance = None
        for index,row in bus_df.iterrows() :
            # A bus serves both directions, so the line is picked per row
            if row.destination == 'CIUDAD UNIVERSITARIA' :
                line = line_to_ciudad_universitaria
            else :
                line = line_other_direction
            #Calculate the nearest point of the line to the bus; shapely works in (x, y) = (lon, lat)
            nearest_lon, nearest_lat = nearest_point_on_line(line,(row.lon,row.lat))
            # haversine expects (lat, lon) pairs
            new_distance = haversine((row.lat,row.lon),(nearest_lat,nearest_lon))
            if last_distance != new_distance :
                distances_to_line.append(new_distance)
                last_distance = new_distance

        if len(distances_to_line) != 0 :
            distance_to_line_buses_dict[bus] = {
                'mean': statistics.mean(distances_to_line),
                'all': distances_to_line,
            }

    return distance_to_line_buses_dict

distance_to_line_buses_dict = distance_to_line(experiment)

In [37]:
fig3 = go.Figure()
# One histogram trace per bus, bins of size 10 from 0 up to that bus's maximum
for bus, line_stats in distance_to_line_buses_dict.items() :
    bus_distances = line_stats['all']
    fig3.add_trace(go.Histogram(
        name=f'{bus}',
        x=bus_distances,
        xbins=dict(start=0, end=max(bus_distances), size=10)
    ))

# Titles, axis labels and spacing between bars
fig3.update_layout(
    title='Histogram of distance to the line for each bus',
    xaxis_title='Distance in meters',
    yaxis_title='Number of instances inside the interval',
    bargap=0.1,       # gap between bars of adjacent location coordinates
    bargroupgap=0.05  # gap between bars of the same location coordinates
)
# Semi-transparent bars for every trace
fig3.update_traces(opacity=0.75)
fig3.show()

In [38]:
fig4 = go.Figure()
# Pool the distances of every bus into one sample
all_instances = [
    distance
    for line_stats in distance_to_line_buses_dict.values()
    for distance in line_stats['all']
]

fig4.add_trace(go.Histogram(
    name='All buses',  # was the unformatted placeholder '{}'
    x=all_instances,
    xbins=dict( # bins used for histogram
        start=0,
        end=max(all_instances),
        size=10
    )
))

# Titles, axis labels and spacing between bars
fig4.update_layout(
    # This figure aggregates every bus, unlike the per-bus figure
    title='Histogram of distance to the line for all buses',
    xaxis_title='Distance in meters',
    yaxis_title='Number of instances inside the interval',
    bargap=0.1, # gap between bars of adjacent location coordinates
    bargroupgap=0.05 # gap between bars of the same location coordinates
)
# Semi-transparent bars
fig4.update_traces(opacity=0.75)
fig4.show()

In [39]:
#Before applying the filter

#Sometimes when the distance is 0, the ETA is not 0
#experiment.loc[(experiment.DistanceBus == 0) & (experiment.estimateArrive != 0)].describe()
#It doesn't happen in the reverse case
#experiment.loc[(experiment.DistanceBus <= 0) & (experiment.estimateArrive == 0)].describe()

## Distance of the bus coordinates to the stop it has arrived at

In [98]:
def distance_to_stop(df, delay_rows=2000) :
    """
    Distance in meters between each bus and the stop it is reported to be at.

    Only rows with ``estimateArrive == 0`` and ``DistanceBus == 0`` whose
    ``datetime`` lies strictly between the notebook-level ``start_interval``
    and ``end_interval`` are used. The stop location is looked up in the
    notebook-level ``stops`` by ``stop_code``. Consecutive repeated distances
    are stored only once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``bus``, ``stop``, ``datetime``, ``estimateArrive``,
        ``DistanceBus``, ``lat`` and ``lon``.
    delay_rows : int, optional
        Number of rows of the same bus to look ahead when picking the bus
        coordinates stored for plotting ('bus_lons' / 'bus_lats'). When the
        shifted position falls outside the bus's rows, the row itself is used.
        Defaults to 2000, the value previously hard-coded.

    Returns
    -------
    dict
        ``{bus: {'mean', 'all', 'stop_lons', 'stop_lats', 'bus_lons',
        'bus_lats'}}``. 'all' holds distances computed from each row's own
        coordinates, while 'bus_lons'/'bus_lats' hold the coordinates of the
        delayed row. Buses without any distance are omitted.
    """
    distance_to_stop_buses_dict = {}
    for bus in df.bus.unique() :
        # drop=True gives positional labels without stacking 'index'/'level_0'
        # columns (reset_index() fails once both already exist)
        bus_df_all = df.loc[(df.bus == bus)].reset_index(drop=True)
        bus_df = bus_df_all.loc[(bus_df_all.estimateArrive == 0) & (bus_df_all.DistanceBus == 0)]
        #Get values between time interval
        mask = ((start_interval < bus_df['datetime']) & (bus_df['datetime'] < end_interval))
        bus_df = bus_df.loc[mask]

        distances_to_stop = []
        stops_points_lon,stops_points_lat = [],[]
        stopped_bus_points_lon,stopped_bus_points_lat = [],[]
        # None (not 0) as the sentinel, so a genuine first distance of 0 is kept
        last_distance = None
        for index,row in bus_df.iterrows() :
            stop_point = stops.loc[stops.stop_code == row.stop].iloc[0].geometry
            # haversine expects (lat, lon); shapely points expose x = lon, y = lat
            new_distance = haversine((row.lat,row.lon),(stop_point.y,stop_point.x))
            if last_distance != new_distance :
                distances_to_stop.append(new_distance)
                last_distance = new_distance
                stops_points_lon.append(stop_point.x)
                stops_points_lat.append(stop_point.y)

                # index is the row position inside bus_df_all (labels were reset
                # above); fall back to the row itself when the shift is out of range
                k = index
                if 0 <= (index + delay_rows) < bus_df_all.shape[0] :
                    k = index + delay_rows
                stopped_bus_points_lon.append(bus_df_all.iloc[k].lon)
                stopped_bus_points_lat.append(bus_df_all.iloc[k].lat)

        if len(distances_to_stop) != 0 :
            distance_to_stop_buses_dict[bus] = {
                'mean': statistics.mean(distances_to_stop),
                'all': distances_to_stop,
                'stop_lons': stops_points_lon,
                'stop_lats': stops_points_lat,
                'bus_lons': stopped_bus_points_lon,
                'bus_lats': stopped_bus_points_lat,
            }

    return distance_to_stop_buses_dict

distance_to_stop_buses_dict = distance_to_stop(experiment)

In [95]:
fig5 = go.Figure()
# One histogram trace per bus, bins of size 10 spanning that bus's own range
for bus, stop_stats in distance_to_stop_buses_dict.items() :
    bus_distances = stop_stats['all']
    fig5.add_trace(go.Histogram(
        name=f'{bus}',
        x=bus_distances,
        xbins=dict(start=min(bus_distances), end=max(bus_distances), size=10)
    ))

# Titles, axis labels and spacing between bars
fig5.update_layout(
    title='Histogram of distance to the stop when the bus has arrived it',
    xaxis_title='Distance in meters',
    yaxis_title='Number of instances inside the interval',
    bargap=0.1,       # gap between bars of adjacent location coordinates
    bargroupgap=0.05  # gap between bars of the same location coordinates
)
# Semi-transparent bars for every trace
fig5.update_traces(opacity=0.75)
fig5.show()

In [96]:
fig6 = go.Figure()
# Pool the distances of every bus into one sample
all_instances = [
    distance
    for stop_stats in distance_to_stop_buses_dict.values()
    for distance in stop_stats['all']
]

fig6.add_trace(go.Histogram(
    name='All buses',  # was the unformatted placeholder '{}'
    x=all_instances,
    xbins=dict( # bins used for histogram
        start=min(all_instances),
        end=max(all_instances),
        size=10
    )
))

# Titles, axis labels and spacing between bars
fig6.update_layout(
    title='Histogram of distance to the stop when the bus has arrived it',
    xaxis_title='Distance in meters',
    yaxis_title='Number of instances inside the interval',
    bargap=0.1, # gap between bars of adjacent location coordinates
    bargroupgap=0.05 # gap between bars of the same location coordinates
)
# Semi-transparent bars
fig6.update_traces(opacity=0.75)
fig6.show()

In [97]:
#Select bus for the figure
bus = 8607
stops_x = distance_to_stop_buses_dict[bus]['stop_lons']
stops_y = distance_to_stop_buses_dict[bus]['stop_lats']
buses_x = distance_to_stop_buses_dict[bus]['bus_lons']
buses_y = distance_to_stop_buses_dict[bus]['bus_lats']

#Token and styles for the mapbox api
# The token is read from the MAPBOX_ACCESS_TOKEN environment variable when set,
# so credentials do not have to live in the notebook. The previously hard-coded
# value is kept only as a fallback so existing runs keep working.
# NOTE(review): rotate this token and drop the literal once the variable is in use.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiYWxlanAxOTk4IiwiYSI6ImNrNnFwMmM0dDE2OHYzZXFwazZiZTdmbGcifQ.k5qPtvMgar7i9cbQx1fP0w'
)
style_day = 'mapbox://styles/alejp1998/ck6z9mohb25ni1iod4sqvqa0d'

#We create the figure object
fig7 = go.Figure()

#Marker layers: the stops first (green), then the buses (black)
for point_lats, point_lons, point_color in ((stops_y, stops_x, 'green'), (buses_y, buses_x, 'black')) :
    fig7.add_trace(go.Scattermapbox(
        lat=point_lats,
        lon=point_lons,
        mode='markers',
        marker=go.scattermapbox.Marker(size=5, color=point_color, opacity=0.7),
        text='',
        hoverinfo='text'
    ))

#One thin red segment joining every stop with its paired bus position
for stop_lon, stop_lat, bus_lon, bus_lat in zip(stops_x, stops_y, buses_x, buses_y) :
    fig7.add_trace(go.Scattermapbox(
        lat=[stop_lat, bus_lat],
        lon=[stop_lon, bus_lon],
        mode='lines',
        line=dict(width=0.5, color='red'),
        text='',
        hoverinfo='text'
    ))

#Map centred on the mean of all plotted coordinates
map_center = dict(
    lat=statistics.mean(stops_y + buses_y),
    lon=statistics.mean(stops_x + buses_x)
)

#And set the figure layout (last expression of the cell, so the figure is displayed)
fig7.update_layout(
    title='Pair bus coords when it arrives to stop vs the coords of that stop',
    height=500,
    margin=dict(r=0, l=0, t=0, b=0),
    hovermode='closest',
    showlegend=False,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=map_center,
        pitch=0,
        zoom=13,
        style=style_day
    )
)

In [48]:
def bus_different_coords(df) :
    """
    Sequence of distinct consecutive positions reported by every bus.

    Rows of each bus are walked in the order they appear in ``df``; a position
    is kept only when its longitude or latitude differs from the previously
    kept one. The comparison starts from ``(0, 0)``, so a leading ``(0, 0)``
    position is not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``bus``, ``lon`` and ``lat``.

    Returns
    -------
    dict
        ``{bus: {'lon': [...], 'lat': [...]}}``; buses without any kept
        position are omitted.
    """
    different_coords_bus_dict = {}
    for bus in df.bus.unique() :
        bus_df = df.loc[df.bus == bus]
        lons, lats = [], []
        last_lon, last_lat = 0, 0
        for new_lon, new_lat in zip(bus_df.lon, bus_df.lat) :
            unchanged = (last_lon == new_lon) and (last_lat == new_lat)
            if unchanged :
                continue
            lons.append(new_lon)
            lats.append(new_lat)
            last_lon, last_lat = new_lon, new_lat

        if lats :
            different_coords_bus_dict[bus] = {'lon': lons, 'lat': lats}

    return different_coords_bus_dict

# Per-bus lists of consecutive distinct (lon, lat) positions
different_coords_bus_dict = bus_different_coords(experiment)

In [57]:
#Select bus for the figure
bus = buses_calc[2]
x = different_coords_bus_dict[bus]['lon']
y = different_coords_bus_dict[bus]['lat']

# Bounding box of the bus positions, computed once instead of once per route vertex
x_min, x_max = min(x), max(x)
y_min, y_max = min(y), max(y)

# Keep only the route vertices (both directions) that fall inside that box
line_x, line_y = [],[]
for coords in list(line1.iloc[0].geometry.coords) + list(line2.iloc[0].geometry.coords) :
    if (x_min <= coords[0] <= x_max) and (y_min <= coords[1] <= y_max) :
        line_x.append(coords[0])
        line_y.append(coords[1])

#Plot figure
fig8 = go.Figure()

# Density contours of the bus positions on the main axes
fig8.add_trace(go.Histogram2dContour(
    x=x, y=y, colorscale='Blues', reversescale=True, xaxis='x', yaxis='y'
))
# The individual positions on top of the contours
fig8.add_trace(go.Scatter(
    name='Bus coords', x=x, y=y, xaxis='x', yaxis='y',
    mode='markers', marker=dict(color='gold', size=3)
))
# Route vertices inside the bounding box of the positions
fig8.add_trace(go.Scatter(
    name='Line 1', x=line_x, y=line_y, xaxis='x', yaxis='y',
    mode='lines', line=dict(color='white', width=1), opacity=0.5
))
# Marginal histograms: latitude on the secondary x axis, longitude on the secondary y axis
fig8.add_trace(go.Histogram(y=y, xaxis='x2', marker=dict(color='rgba(0,0,0,1)')))
fig8.add_trace(go.Histogram(x=x, yaxis='y2', marker=dict(color='rgba(0,0,0,1)')))

# Main axes take 85% of the figure, the marginal axes the remaining 15%
main_axis = dict(zeroline=False, domain=[0,0.85], showgrid=False)
marginal_axis = dict(zeroline=False, domain=[0.85,1], showgrid=False)

# Last expression of the cell, so the figure is displayed
fig8.update_layout(
    title='2D Histogram of bus {} given coordinates'.format(bus),
    xaxis_title='Longitude',
    yaxis_title='Latitude',
    autosize=False,
    xaxis=dict(main_axis),
    yaxis=dict(main_axis),
    xaxis2=dict(marginal_axis),
    yaxis2=dict(marginal_axis),
    height=800,
    width=800,
    bargap=0,
    hovermode='closest',
    showlegend=False
)

## Relationship between `DistanceBus` and `estimateArrive`

In [58]:
# Pearson correlation between the ETA (estimateArrive) and the distance to the stop (DistanceBus)
experiment[['estimateArrive','DistanceBus']].corr(method = 'pearson')

Unnamed: 0,estimateArrive,DistanceBus
estimateArrive,1.0,0.801597753
DistanceBus,0.801597753,1.0


In [52]:
# Ordinary least squares regression of DistanceBus on estimateArrive.
# No constant column is added to the regressor, so the fit has no intercept:
# the summary below reports a single coefficient and an uncentered R-squared.
model = sm.OLS(experiment.DistanceBus, experiment.estimateArrive).fit()
model.summary()

0,1,2,3
Dep. Variable:,DistanceBus,R-squared (uncentered):,0.871
Model:,OLS,Adj. R-squared (uncentered):,0.871
Method:,Least Squares,F-statistic:,359400.0
Date:,"Mon, 09 Mar 2020",Prob (F-statistic):,0.0
Time:,16:44:09,Log-Likelihood:,-415930.0
No. Observations:,53037,AIC:,831900.0
Df Residuals:,53036,BIC:,831900.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
estimateArrive,3.1567,0.005,599.501,0.000,3.146,3.167

0,1,2,3
Omnibus:,3820.543,Durbin-Watson:,1.597
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5485.609
Skew:,-0.607,Prob(JB):,0.0
Kurtosis:,4.005,Cond. No.,1.0


In [53]:
#Get the fitted coefficient of estimateArrive (the only regressor of the model)
param = model.params['estimateArrive']
param

3.156685660589025

In [54]:
fig9 = go.Figure()
# Every observation as a small black dot: ETA on x, distance on y
observations = go.Scatter(
    x=experiment.estimateArrive,
    y=experiment.DistanceBus,
    xaxis='x',
    yaxis='y',
    mode='markers',
    marker=dict(color='black', size=0.75)
)
fig9.add_trace(observations)

# Fitted line through the origin, drawn up to the largest observed ETA
max_eta = experiment.estimateArrive.max()
fig9.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=max_eta,
    y1=param*max_eta,
    line=dict(color='gold', width=2, dash="dashdot"),
)

# Titles and axis labels
fig9.update_layout(
    title='OLS Regression Line for Time vs Distance Remaining to stop',
    xaxis_title='ETA(seconds)',
    yaxis_title='Distance to stop(meters)',
)

fig9.show()