# Calculating On-Time %

## Loading data

In [19]:
import requests
import pandas as pd
from operator import itemgetter
from math import sqrt, cos
from time import time
import psycopg2 as pg

In [20]:
# Muting chained assignment warning; needs refactoring
# but doesn't affect performance

pd.set_option('mode.chained_assignment', None)

### A single day's (05/24/2020) full data pulled from api

In [55]:
url = 'http://sfmta-ds.eba-hqpuyrup.us-east-1.elasticbeanstalk.com/daily-general-json'

In [56]:
json_data = requests.get(url, params={'day': '2020-05-20'}).json()

In [57]:
# making df

full_data = pd.DataFrame(data=json_data).sort_values('timestamp')

In [58]:
# sub dfs for testing
# single route, all vehicles
# single route, single vehicle

cali1 = full_data[full_data['rid']=='1']
cali1_highest = cali1[cali1['vid']==(cali1['vid'].value_counts().index[0])].sort_values('timestamp')

In [59]:
# using stops gathered by Labs 22 for expediency
# all stops
# all stops in one route

stops = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/sfmta-data-analysis-ds/master/deprecated_assets/datasets/route_info.csv')
cali1_stops = stops[stops['route_id']=='1']

### Schedule Data From RDS

In [60]:
creds = {
  'user': 'lambdalabs24sfmta',
  'password': '3eDB79rHz3miocMz',
  'host': 'lambdalabs24sfmta.cykkiwxbfvpg.us-east-1.rds.amazonaws.com',
  'dbname': 'historicalTransitData'
}

In [61]:
cnx = pg.connect(**creds)
cursor = cnx.cursor()

query = """
        SELECT
        *
        FROM schedules
        WHERE rid = '1'
        """

cursor.execute(query)
rows = cursor.fetchall()

In [62]:
rows

[(1,
  '1',
  datetime.datetime(2020, 5, 20, 0, 0),
  None,
  {'route': [{'tr': [{'stop': [{'tag': '4277',
         'content': '--',
         'epochTime': '-1'},
        {'tag': '3825', 'content': '--', 'epochTime': '-1'},
        {'tag': '3893', 'content': '--', 'epochTime': '-1'},
        {'tag': '3848', 'content': '04:32:00', 'epochTime': '16320000'},
        {'tag': '6296', 'content': '04:37:00', 'epochTime': '16620000'},
        {'tag': '4026', 'content': '04:44:00', 'epochTime': '17040000'},
        {'tag': '4027', 'content': '04:50:00', 'epochTime': '17400000'},
        {'tag': '4028', 'content': '04:56:00', 'epochTime': '17760000'},
        {'tag': '34015', 'content': '04:59:00', 'epochTime': '17940000'}],
       'blockID': '0126'},
      {'stop': [{'tag': '4277', 'content': '--', 'epochTime': '-1'},
        {'tag': '3825', 'content': '--', 'epochTime': '-1'},
        {'tag': '3893', 'content': '--', 'epochTime': '-1'},
        {'tag': '3848', 'content': '04:40:00', 'epochTime'

## Engineering Probable Nearest Stop and Distance (For Confidence)

### Original helper function for wrangling

In [10]:
def wrangle_bus(df):
    """
    preps dataframe for a single bus
    gives accurate timestamps and naively calculates 
    dwell time as 1min per checkin with motion (kph <= 0)
    
    Largest bottleneck for time-cost in df prep
    currently not implemented until refactor
    """
    
    times = df['timestamp'].values
    ages = df['age'].values
    
    df['adjusted_timestamp'] = [pd.Timestamp(times[x]) - pd.Timedelta(seconds=ages[x]) for 
                                x in range(len(df['timestamp']))]
    
    df['timestamp'] = [pd.Timestamp(times[x]) for x in range(len(df['timestamp']))]

    dwell_count = 0
    dwell_totals = []

    for x in df['kph']:
        if x <= 0:
          dwell_count += 1
          dwell_totals.append(dwell_count)
        elif x > 0:
          dwell_totals.append(0)
          dwell_count = 0
            
    df['dwell'] = [dwell_totals[x] for x in range(len(df))]

    return df

### Function to calculate nearest stop within $X$ km by projected euclidean distance

In [11]:
def fcc_projection(loc1, loc2):
    """
    function to apply FCC recommended formulae
    for calculating distances on earth projected to a plane
    
    significantly faster computationally, negligible loss in accuracy
    
    Args: 
    loc1 - a tuple of lat/lon
    loc2 - a tuple of lat/lon
    """
    lat1, lat2 = loc1[0], loc2[0]
    lon1, lon2 = loc1[1], loc2[1]
    
    mean_lat = (lat1+lat2)/2
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    
    k1 = 111.13209 - 0.56605*cos(2*mean_lat) + .0012*cos(4*mean_lat)
    k2 = 111.41513*cos(mean_lat) - 0.09455*cos(3*mean_lat) + 0.00012*cos(5*mean_lat)
    
    distance = sqrt((k1*delta_lat)**2 + (k2*delta_lon)**2)
    
    return distance

In [12]:
def assign_stop(df, stops):
    """
    applies basic wrangling function
    calculates nearest stop from reported location in km
    returns dataframe with reported location, 
    nearest stop (coords and name), and distance between

    tested with single buses on single routes on a single day;
    technically route/vehicle/time agnostic
    don't foresee any issues generalizing
    
    implements FCC projection formulae for calculating distance
    
    Args:
    df - dataframe of transit data, requires 'latitude', 'longitude' columns
    stops - datafram of stops data, requires 'lat', 'lon', 'title' columns
    """
    
    # TO-DO: error handling for missing routes from either df or stops
    # Currently handling by intersecting sets during function call
    
    start = time()
    
    # wrangle_bus function is now largest time bottleneck - may be removed
    wrangle_bus(df)

    # creating list of lat/lon dictionaries for stops and reported bus locations
    stop_lats = stops['lat'].values
    stop_lons = stops['lon'].values

    reported_lats = df['latitude'].values
    reported_lons = df['longitude'].values

    stop_points = [{'latitude': stop_lats[x], 'longitude': stop_lons[x]} 
                 for x in range(len(stops))]

    reported_points = [{'latitude': reported_lats[x], 
                      'longitude': reported_lons[x]} 
                     for x in range(len(df))]

    # to minimize possible overlap between probable stops
    # 500 ft as km
    # upper end of previous range for minimum distance between stops according to sfmta
    # this value seems good but could use more testing
    radius = .1524

    # dict to tuples to play nice with geopy
    stop_point_tuples = [tuple(stop_points[x].values()) 
                       for x in range(len(stop_points))]

    reported_point_tuples = [tuple(reported_points[x].values()) 
                           for x in range(len(reported_points))]

    df['reported_location'] = reported_point_tuples
    
    print('Prep Complete')
    
    # generating ((lat/lon), distance) tuples for nearest stop within range
    # using FCC ellipsoidal earth projection
    distances = [{x: fcc_projection(location, x) 
                 for x in stop_point_tuples} 
                 for location in reported_point_tuples]
    
    print(f'Distances Generated => {len(distances)}')
    
    # sorting for nearest stop
    distances_sorted = [{k: v for k, v in sorted(distances[x].items(), 
                                                 key=itemgetter(1))}
                       for x in range(len(distances))]
    
    print(f'Distances Sorted => {len(distances_sorted)}')
    
    # creating list of nearest stops
    # nearest stop if nearest stop within radius, else None
    point_stops = [next(iter(distances_sorted[x].items())) 
                   if next(iter(distances_sorted[x].items()))[1] <= radius 
                   else None
                   for x in range(len(distances_sorted))]
    
    print(f'Stops Created => {len(point_stops)}')
    
    # assigning stop name from stops table based on lat/lon from previous step
    stop_tuples = list(zip(stops['lat'], stops['lon']))
    stop_titles = [stops['title'].iloc[stop_tuples.index(stop[0])] 
                   if stop != None
                   else None 
                   for stop in point_stops]
    
    print(f'Titles Created => {len(stop_titles)}')
    
    # pulling lat/lon and distance from tuples for df
    df['nearest_stop'] = [x[0] if x != None else None for x in point_stops]
    df['distance_in_km'] = [x[1] if x != None else None for x in point_stops]

    # pulling stop names from list for df
    df['stops'] = stop_titles

    # dropping columns of redundant information
    df = df.drop(columns=['age', 'rid', 'vid', 'latitude', 'longitude'])
    end = time()
    
    print(f'DF Complete\nTime Elapsed: {end-start} seconds\n')
    
    return df

## Function To Calculate On-Time %

## Results

### Full Day Test - Single Route, Single Bus

In [30]:
%%timeit -n 5 -r 10
%%capture

assign_stop(nbus_highest, nbus_stops)

383 ms ± 28.6 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


In [31]:
assign_stop(nbus_highest, nbus_stops)

Prep Complete
Distances Generated => 979
Distances Sorted => 979
Stops Created => 979
Titles Created => 979
DF Complete
Time Elapsed: 0.36827921867370605 seconds



Unnamed: 0,timestamp,kph,heading,direction,reported_location,nearest_stop,distance_in_km,stops,adjusted_timestamp,dwell
9458,2020-05-24 04:09:12,39,345,,"(37.7595, -122.508)","(37.7602999, -122.50818000000001)",0.090679,Judah St & La Playa St,2020-05-24 04:08:55,0
9516,2020-05-24 04:10:12,8,135,,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 04:09:55,0
9573,2020-05-24 04:11:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 04:10:43,1
9630,2020-05-24 04:12:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 04:11:43,2
9687,2020-05-24 04:13:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 04:12:42,3
...,...,...,...,...,...,...,...,...,...,...
207455,2020-05-24 21:29:12,26,315,NBUS_I_F00,"(37.7785, -122.392)",,,,2020-05-24 21:28:56,0
207522,2020-05-24 21:30:12,21,225,NBUS_I_F00,"(37.7799, -122.395)",,,,2020-05-24 21:29:58,0
207587,2020-05-24 21:31:12,27,210,NBUS_I_F00,"(37.777, -122.398)",,,,2020-05-24 21:31:01,0
207651,2020-05-24 21:32:12,13,210,NBUS_I_F00,"(37.7767, -122.399)",,,,2020-05-24 21:32:02,0


### Full Day Test - Single Route, All Buses

In [32]:
%%timeit
%%capture

assign_stop(nbus, nbus_stops)

6.26 s ± 652 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
assign_stop(nbus, nbus_stops)

Prep Complete
Distances Generated => 17186
Distances Sorted => 17186
Stops Created => 17186
Titles Created => 17186
DF Complete
Time Elapsed: 5.886007070541382 seconds



Unnamed: 0,timestamp,kph,heading,direction,reported_location,nearest_stop,distance_in_km,stops,adjusted_timestamp,dwell
8439,2020-05-24 03:49:12,6,30,,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 03:49:12,0
8480,2020-05-24 03:50:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 03:49:25,1
8524,2020-05-24 03:51:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 03:50:12,2
8567,2020-05-24 03:52:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 03:51:12,3
8612,2020-05-24 03:53:12,0,218,,"(37.7602, -122.509)","(37.7603599, -122.50900990000001)",0.017715,Judah & La Playa St,2020-05-24 03:52:12,4
...,...,...,...,...,...,...,...,...,...,...
208598,2020-05-24 21:49:12,0,267,NBUS_O_F00,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 21:48:49,1
208643,2020-05-24 21:50:12,0,267,NBUS_O_F00,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 21:49:50,2
208689,2020-05-24 21:51:12,0,267,NBUS_O_F00,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 21:50:52,3
208735,2020-05-24 21:52:12,0,267,NBUS_O_F00,"(37.7601, -122.509)","(37.7603599, -122.50900990000001)",0.028759,Judah & La Playa St,2020-05-24 21:51:54,4


### Full Day Test - All Lines, All Vehicles

In [34]:
start = time()

# Currently errors if either dataframe is missing a route contained in the other
# Handled by intersecting route ids from both dataframes in short term

# executes function on every route contained in daily data and stops
# creates dictionary of route_id: dataframe
stop_reports = {x: assign_stop(full_data[full_data.rid.eq(x)], 
                               stops[stops.route_id.eq(x)]) 
                for x in list(set(stops.route_id)&set(full_data.rid))}

end = time()

print(f'Total Time Elapsed: {end-start} seconds')

Prep Complete
Distances Generated => 3115
Distances Sorted => 3115
Stops Created => 3115
Titles Created => 3115
DF Complete
Time Elapsed: 3.123379945755005 seconds

Prep Complete
Distances Generated => 15765
Distances Sorted => 15765
Stops Created => 15765
Titles Created => 15765
DF Complete
Time Elapsed: 8.162233114242554 seconds

Prep Complete
Distances Generated => 8849
Distances Sorted => 8849
Stops Created => 8849
Titles Created => 8849
DF Complete
Time Elapsed: 2.237722873687744 seconds

Prep Complete
Distances Generated => 15800
Distances Sorted => 15800
Stops Created => 15800
Titles Created => 15800
DF Complete
Time Elapsed: 6.587782859802246 seconds

Prep Complete
Distances Generated => 1351
Distances Sorted => 1351
Stops Created => 1351
Titles Created => 1351
DF Complete
Time Elapsed: 0.5346200466156006 seconds

Prep Complete
Distances Generated => 10325
Distances Sorted => 10325
Stops Created => 10325
Titles Created => 10325
DF Complete
Time Elapsed: 2.7491261959075928 secon

## Generated Stop Dataframes

We now have a dictionary of route-specific dataframes with generated stops.

Function needs further refactoring for flexibility and simplified implementation.

In [35]:
# number of dataframes limited by number of valid route ids

len(stop_reports)

22

In [36]:
stop_reports.keys()

dict_keys(['91', '8', '38R', '49', '90', 'LBUS', 'L_OWL', '24', '44', 'N_OWL', '22', '5', 'NBUS', '19', '12', '14', '9', '1', '14R', '29', '25', '38'])

In [37]:
stop_reports['LBUS'].head()

Unnamed: 0,timestamp,kph,heading,direction,adjusted_timestamp,dwell,reported_location,nearest_stop,distance_in_km,stops
9873,2020-05-24 04:16:12,37,270,,2020-05-24 04:15:53,0,"(37.7356, -122.504)","(37.7361299, -122.50435)",0.070348,Wawona St & 46th Ave
9936,2020-05-24 04:17:12,11,60,,2020-05-24 04:16:53,0,"(37.7362, -122.504)","(37.7361299, -122.50435)",0.0397,Wawona St & 46th Ave
10003,2020-05-24 04:18:12,24,345,LBUS_I_F00,2020-05-24 04:18:04,0,"(37.7414, -122.505)","(37.7416899, -122.50452990000001)",0.061329,46th Ave & Taraval St
10071,2020-05-24 04:19:13,24,75,LBUS_I_F00,2020-05-24 04:19:05,0,"(37.742, -122.5)","(37.742019899999995, -122.5002)",0.022352,Taraval St & 42nd Ave
10143,2020-05-24 04:20:12,35,75,LBUS_I_F00,2020-05-24 04:19:53,0,"(37.7422, -122.496)",,,


### JSONify (WiP)

Pandas Timestamps don't play well with python;\
need to refactor to serve dataframes as JSON.

In [38]:
# convert dfs to dicts

data_dict = {
    key: stop_reports[key].to_dict(orient='records') 
    for key in stop_reports.keys()
}

In [39]:
data_dict.keys()

dict_keys(['91', '8', '38R', '49', '90', 'LBUS', 'L_OWL', '24', '44', 'N_OWL', '22', '5', 'NBUS', '19', '12', '14', '9', '1', '14R', '29', '25', '38'])

In [40]:
data_dict['N_OWL'][0:3]

[{'timestamp': Timestamp('2020-05-24 00:00:13'),
  'kph': 0,
  'heading': 267,
  'direction': 'N____O_N00',
  'adjusted_timestamp': Timestamp('2020-05-24 00:00:06'),
  'dwell': 1,
  'reported_location': (37.7642, -122.464),
  'nearest_stop': (37.76416, -122.46402990000001),
  'distance_in_km': 0.005531200293550708,
  'stops': 'Irving St & 7th Ave'},
 {'timestamp': Timestamp('2020-05-24 00:00:13'),
  'kph': 56,
  'heading': 0,
  'direction': 'N____O_N00',
  'adjusted_timestamp': Timestamp('2020-05-24 00:00:06'),
  'dwell': 0,
  'reported_location': (37.7841, -122.388),
  'nearest_stop': (37.7845499, -122.38795),
  'distance_in_km': 0.05005604239153493,
  'stops': 'The Embarcadero & Brannan St'},
 {'timestamp': Timestamp('2020-05-24 00:00:13'),
  'kph': 34,
  'heading': 75,
  'direction': 'N____I_N00',
  'adjusted_timestamp': Timestamp('2020-05-24 00:00:06'),
  'dwell': 0,
  'reported_location': (37.7641, -122.465),
  'nearest_stop': (37.76404, -122.46543),
  'distance_in_km': 0.04822613