# importing matplotlib for plotting at the bottom of this notebook

In [53]:
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
%matplotlib inline
%load_ext autoreload
%autoreload 2

!date
!whoami

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Wed Jun 12 22:41:22 PDT 2019
benjaminmalnor


# load in all functions from `route_shape_process_scripts.py` file

In [54]:
from route_shape_process_scripts import *

# General transformation flow in this notebook:
- import gtfs files
- pick a route of interest
- grab position files for all routes
- filter positions for only that route
- add time index (this is used to break up positions by datetime and join with gtfs)
- get popular shape in both route directions (direction_id)
- make a route_vertex geopandas dataframe - we'll use this to find "nearest" route node
- take all positions and find closest route node
- append route info (distance traveled / shape_pt_sequence)
- create timing metrics based on `trip_start_time` and `time_pct` <- position observation time - converted to Pacific

# import gtfs from 01_gtfs_transform notebook output

In [55]:
# If you ran notebook 01_gtfs, this path should be correct.
gtfs_merge_file_path = "../data/gtfs_merge/"


def _first_file_containing(directory, keyword):
    """Return the name of the first regular file in `directory` containing `keyword`.

    Entries are examined in the order `listdir` yields them, so when several
    file names match, the one returned depends on that listing order.

    Args:
        directory: path of the folder to scan.
        keyword: substring that must appear in the file name.

    Returns:
        The bare file name (not the full path) of the first match.

    Raises:
        FileNotFoundError: if no regular file in `directory` contains `keyword`.
            The previous `[...][0]` form surfaced this as a bare IndexError.
    """
    for entry in listdir(directory):
        if isfile(join(directory, entry)) and keyword in entry:
            return entry
    raise FileNotFoundError(
        f"no file containing '{keyword}' found in {directory!r}"
    )


# One merged GTFS file per table, identified by a substring of its file name.
agg_filename = _first_file_containing(gtfs_merge_file_path, 'agg')
routes_filename = _first_file_containing(gtfs_merge_file_path, 'routes')
shapes_filename = _first_file_containing(gtfs_merge_file_path, 'shapes')
trips_filename = _first_file_containing(gtfs_merge_file_path, 'trips')

In [41]:
def _read_merged_gtfs(csv_name):
    """Read one merged GTFS CSV located directly under `gtfs_merge_file_path`."""
    # Plain concatenation: the directory string and the file name are joined as-is.
    return pd.read_csv(gtfs_merge_file_path + csv_name, low_memory=False)


# Load the four merged GTFS tables found by the file-name lookup.
full_routes_gtfs = _read_merged_gtfs(routes_filename)
full_shapes_gtfs = _read_merged_gtfs(shapes_filename)
full_trips_gtfs = _read_merged_gtfs(trips_filename)
full_trip_stop_schedule = _read_merged_gtfs(agg_filename)

In [42]:
# One row per `trip_id`, carrying the minimum `trip_start_time` seen for that trip.
tripid_w_starttime = (
    full_trip_stop_schedule
    .groupby('trip_id')
    .agg({'trip_start_time': 'min'})
    .reset_index()
)

# break up the gtfs by `start date` and `end date`. TODO: there is a better way to do this. Right now, we want to make sure the vehicle position observation is joined with the `right` gtfs information. The simplest way to do that is to break up the position file by date and break up the gtfs by date and only join where the date windows match. 

In [43]:
# Split the schedule by GTFS validity window: each key is a
# (start_gtfs_date, end_gtfs_date) tuple, each value the rows for that window.
full_trip_stop_schedule_dict = {
    date_window: window_rows
    for date_window, window_rows in full_trip_stop_schedule.groupby(
        ['start_gtfs_date', 'end_gtfs_date']
    )
}

# analyze September --> November

## get route name -- id dictionary. Nathaniel has a better class for this in `/data_transformations` but I haven't incorporated it yet. the below dictionary works as a quick/dirty way to input `route_short_name` and output `route_id`

In [12]:
# Map `route_short_name` -> `route_id`. If a short name occurs more than once
# in `full_routes_gtfs`, the last occurrence wins.
route_name_to_id_dict = {
    short_name: route_id
    for short_name, route_id in zip(
        full_routes_gtfs['route_short_name'].tolist(),
        full_routes_gtfs['route_id'].tolist(),
    )
}

## select a route

In [30]:
# Choose the route by its `route_short_name`, then look up its `route_id`.
# Disabled alternative left by the original author:
#   get_select_routeid_name(full_routes_gtfs, ['E Line'])[2]
route_of_interest = '1'
# Raises KeyError if the short name is not present in the routes table.
route_of_interest_id = route_name_to_id_dict[route_of_interest]
# Bare last expression: shown as the cell output for a quick sanity check.
route_of_interest_id

100001

# get all position files for these months

In [22]:
# Folder that holds the intermediate vehicle-position files.
position_file_location = "../data/intermed/"
# Months to load, written as 'YYYYMM' strings (September through November 2018).
month_list = ['201809', '201810', '201811']
# `get_positions_months` comes from the star import of route_shape_process_scripts;
# presumably it reads and combines the position files for these months — confirm there.
full_route_positions = get_positions_months(month_list, position_file_location)

# take only the positions for the chosen route_id

In [23]:
# Keep only the position rows that belong to the selected route.
# This used to compare against `input_dict['route_id']`, but no `input_dict`
# is defined anywhere in this notebook, so the cell failed on a fresh kernel.
# `route_of_interest_id` is the id looked up from `route_of_interest`.
# NOTE(review): assumes the positions' `route_id` column has the same dtype as
# the GTFS `route_id` (an int in the displayed output) — confirm.
single_route_positions = full_route_positions[
    full_route_positions['route_id'] == route_of_interest_id
]

# make sure the route dataframe is not empty

In [25]:
# Displays True when the route filter matched no rows; the steps below need False.
single_route_positions.empty

False

# add time index columns

In [26]:
# Run the two time helpers back to back: `convert_index_to_pct` first, then
# `add_time_index_columns` on its result. Both come from the star import.
# NOTE(review): the name is rebound to the transformed frame, so re-running
# this cell feeds already-transformed data back in — confirm the helpers tolerate that.
single_route_positions = add_time_index_columns(
    convert_index_to_pct(single_route_positions)
)

# find the most popular shape on that route id going in one direction

In [32]:
# Direction of travel to analyse (the GTFS `direction_id`).
direction = 1

# The helper returns a pair: the shape id used for this route and direction,
# and the matching trip headsign.
shape_of_interest_id, trip_headsign = get_most_used_shape_id_per_direction(
    full_trip_stop_schedule, route_of_interest_id, direction
)

# make one route_vertex_geo from shape_id

In [33]:
# Build the route-vertex frame for the selected shape from the GTFS shapes table.
# Presumably a geopandas GeoDataFrame of the shape's points — confirm in
# route_shape_process_scripts.py. It is used later for the nearest-node search.
route_vertex_geo = make_geopandas_shape_df(full_shapes_gtfs, shape_of_interest_id)

# join position table with trip gtfs information

In [34]:
# For every GTFS validity window, join the route's positions with the trips
# that were valid in that window. Keys are (start_gtfs_date, end_gtfs_date).
positions_w_trips = {}
trips_by_feed_window = full_trips_gtfs.groupby(['start_gtfs_date', 'end_gtfs_date'])
for feed_window, feed_trips in trips_by_feed_window:
    print(feed_window)
    feed_start, feed_end = feed_window[0], feed_window[1]
    positions_w_trips[feed_window] = join_positions_with_gtfs_trips(
        single_route_positions, feed_trips, feed_start, feed_end
    )

('2018-08-15', '2018-09-23')
('2018-09-24', '2018-09-24')
('2018-09-25', '2018-11-01')
('2018-11-02', '2018-11-09')
('2018-11-10', '2018-11-28')
('2018-11-29', '2018-12-11')
('2018-12-12', '2019-01-01')


# merge all dictionaries into one

In [35]:
# Stack the per-window join results into one frame, skipping empty windows.
#
# The earlier version initialised the result only when the *first* window was
# non-empty and then grew it with `DataFrame.append`. An empty first window
# therefore raised NameError on the next append. `DataFrame.append` is also
# deprecated (removed in pandas 2.0) and copies the frame on every call.
non_empty_frames = []
for dict_group, window_positions in positions_w_trips.items():
    print(dict_group)
    if not window_positions.empty:
        non_empty_frames.append(window_positions)

if not non_empty_frames:
    # Fail here with a clear message rather than with an obscure error later on.
    raise ValueError(
        "no positions were joined with GTFS trips for any feed date window"
    )

# A single concat keeps the original row indices, as the repeated append did.
unpacked_positions_full = pd.concat(non_empty_frames)

('2018-08-15', '2018-09-23')
('2018-09-24', '2018-09-24')
('2018-09-25', '2018-11-01')
('2018-11-02', '2018-11-09')
('2018-11-10', '2018-11-28')
('2018-11-29', '2018-12-11')
('2018-12-12', '2019-01-01')


# only take positions along one `shape_id`

In [36]:
# Boolean mask of the rows recorded on the shape of interest, then the subset.
on_shape_of_interest = unpacked_positions_full['shape_id'] == shape_of_interest_id
unpacked_positions_one_shape = unpacked_positions_full[on_shape_of_interest]

# try parallel below - I don't think the parallel function will work on a Windows computer (I've had trouble in the past). It should work fine on a Mac.

### the below takes ~1-2 minutes on my computer depending on the shape/number of trips

In [37]:
# The `__main__` guard is kept from the original; presumably it is there because
# `get_close_node_process` runs in parallel — confirm in route_shape_process_scripts.py.
if __name__ == "__main__":
    t_start = time.time()
    positions_w_near_node_df = get_close_node_process(
        unpacked_positions_one_shape, route_vertex_geo
    )
    # Wall-clock seconds spent in the nearest-node search.
    elapsed_seconds = time.time() - t_start
    print(elapsed_seconds)

17.333288192749023


# convert `time_pct` to Pacific time for datetime transforms below

In [38]:
# Apply the datetime transform from route_shape_process_scripts to the
# nearest-node positions. Per the notebook text this converts `time_pct` to
# Pacific time; the exact columns it adds are not visible here — confirm in the module.
positions_w_near_node_datetime = datetime_transform_df(positions_w_near_node_df)

In [44]:
# Join the positions with the stop schedule and the per-trip start times.
# Later cells read `actual_time_from_scheduled_start` and
# `scheduled_time_from_scheduled_start` from the resulting frame.
position_w_node_schedule = join_tripstart(
    positions_w_near_node_datetime,
    full_trip_stop_schedule,
    tripid_w_starttime,
)

In [46]:
def _vehicle_to_shape_distance(row):
    """Distance between a row's vehicle position and its matched shape point.

    Passes (vehicle_lat, vehicle_long, shape_pt_lat, shape_pt_lon) to
    `calc_distance`. Units are whatever that helper returns — not visible here.
    """
    return calc_distance(
        row['vehicle_lat'],
        row['vehicle_long'],
        row['shape_pt_lat'],
        row['shape_pt_lon'],
    )


# Row-wise apply: one distance per observed position.
position_w_node_schedule['distance_btw_veh_and_shape'] = position_w_node_schedule.apply(
    _vehicle_to_shape_distance, axis=1
)

# below is an interactive graph if you installed `ipywidgets`

# importing interactive libraries for interactive plots at the bottom of this notebook, you can comment these out if you just want to do transformations

In [47]:
#comment out these libraries if you don't want/have interactive widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# look at one unique trip

In [50]:
@interact
def show_one_unique_trip(x=list(position_w_node_schedule['month_day_trip_veh'].unique())):
    """Plot actual vs. scheduled time from scheduled start for one vehicle run.

    Args:
        x: a `month_day_trip_veh` value. `@interact` turns the default list of
            unique values into a dropdown and calls this function with the
            selected entry.

    Draws one scatter figure with `shape_dist_traveled` on the x axis and the
    two time series on the y axis. Returns nothing.
    """
    # Columns kept for this view. Selecting them all also checks they exist.
    trip_columns = [
        'month_day_trip_veh',
        'trip_id',
        'route_id',
        'stop_name',
        'shape_pt_sequence',
        'shape_dist_traveled',
        'time_pct',
        'trip_start_time',
        'arrival_time',
        'actual_time_from_scheduled_start',
        'scheduled_time_from_scheduled_start',
    ]
    onetrip_df = position_w_node_schedule.loc[
        position_w_node_schedule['month_day_trip_veh'] == x, trip_columns
    ]

    # Separate names so the selected key `x` stays available for the title.
    distance = onetrip_df['shape_dist_traveled'].values
    actual = onetrip_df['actual_time_from_scheduled_start'].values
    scheduled = onetrip_df['scheduled_time_from_scheduled_start'].values

    fig, ax = plt.subplots(figsize=(15, 7))
    ax.scatter(distance, actual, label='actual_time', color='red')
    ax.scatter(distance, scheduled, label='sched_time', color='blue')
    # Label the figure so it can be read on its own.
    ax.set_xlabel('shape_dist_traveled')
    ax.set_ylabel('time from scheduled start')
    ax.set_title("month_day_trip_veh {}".format(x), fontsize=14)
    _ = ax.legend()

interactive(children=(Dropdown(description='x', options=('2018_8_31_39476797_4309', '2018_8_31_39476773_4331',…

# look at a particular `trip_id` (pretty messy right now- need to work on this graph)

In [52]:
@interact
def show_one_tripid(x=list(position_w_node_schedule['trip_id'].unique())):
    """Plot actual vs. scheduled time from scheduled start for one `trip_id`.

    Args:
        x: a `trip_id` value. `@interact` turns the default list of unique ids
            into a dropdown and calls this function with the selected entry.

    All rows with this `trip_id` are drawn on one scatter figure. The title
    lists the distinct `dow` and `hour` values found in those rows. Returns nothing.
    """
    # Columns kept for this view. Selecting them all also checks they exist.
    trip_columns = [
        'month_day_trip_veh',
        'trip_id',
        'hour',
        'dow',
        'route_id',
        'shape_pt_sequence',
        'shape_dist_traveled',
        'time_pct',
        'trip_start_time',
        'stop_name',
        'arrival_time',
        'actual_time_from_scheduled_start',
        'scheduled_time_from_scheduled_start',
    ]
    onetrip_df = position_w_node_schedule.loc[
        position_w_node_schedule['trip_id'] == x, trip_columns
    ]

    # Separate names so the parameter `x` is not overwritten by the distance array.
    distance = onetrip_df['shape_dist_traveled'].values
    actual = onetrip_df['actual_time_from_scheduled_start'].values
    scheduled = onetrip_df['scheduled_time_from_scheduled_start'].values

    fig, ax = plt.subplots(figsize=(15, 7))
    ax.scatter(distance, actual, label='actual_time', color='red')
    ax.scatter(distance, scheduled, label='sched_time', color='blue')
    # Label the axes so the figure can be read on its own.
    ax.set_xlabel('shape_dist_traveled')
    ax.set_ylabel('time from scheduled start')
    ax.legend()

    hours = str(onetrip_df['hour'].unique().tolist())
    dow = str(onetrip_df['dow'].unique().tolist())
    title_str = "day of week {}, hour(s) {}".format(dow, hours)
    _ = ax.set_title(title_str, fontsize=14)

interactive(children=(Dropdown(description='x', options=(39476797, 39476773, 39476829, 39476731, 39476726, 394…