# Import the interactive-widget libraries used by the interactive plots at the bottom of this notebook. You can comment these out if you only want to run the transformations.

In [43]:
#comment out these libraries if you don't want/have interactive widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# importing matplotlib for plotting at the bottom of this notebook

In [44]:
import matplotlib.pyplot as plt
%matplotlib inline

# load in all functions from `route_shape_process_scripts.py` file

In [45]:
from route_shape_process_scripts import *

# General transformation flow in this notebook:
- import gtfs files
- pick a route of interest
- grab position files for all routes
- filter positions for only that route
- add time index (this is used to break up positions by datetime and join with gtfs)
- get popular shape in both route directions (direction_id)
- make a route_vertex geopandas dataframe - we'll use this to find "nearest" route node
- take all positions and find closest route node
- append route info (distance traveled / shape_pt_sequence)
- create timing metrics based on `trip_start_time` and `time_pct` <- position observation time - converted to Pacific

# import gtfs from 01_gtfs_transform notebook output

In [46]:
# Load the four GTFS extracts covering 2018-08-15 through 2018-12-12.
# The generator yields one DataFrame per path, in order, and the tuple
# unpacking binds them to the names the rest of the notebook relies on.
# low_memory=False disables pandas' chunked parsing, so each column's dtype
# is inferred from the whole file rather than chunk by chunk.
(
    full_routes_gtfs,
    full_shapes_gtfs,
    full_trips_gtfs,
    full_trip_stop_schedule,
) = (
    pd.read_csv(gtfs_csv_path, low_memory=False)
    for gtfs_csv_path in (
        "input_gtfs/gtfs_routes_2018-08-15_2018-12-12.csv",
        "input_gtfs/gtfs_shapes_2018-08-15_2018-12-12.csv",
        "input_gtfs/gtfs_trips_2018-08-15_2018-12-12.csv",
        "input_gtfs/gtfs_2018-08-15_2018-12-12.csv",
    )
)

# Break up the GTFS by `start date` and `end date`. TODO: there is a better way to do this. Right now, we want to make sure each vehicle position observation is joined with the correct GTFS information. The simplest way to do that is to break up the position file by date, break up the GTFS by date, and only join where the date windows match.

In [47]:
# Index the stop schedule by its GTFS validity window.
# Keys are the (start_gtfs_date, end_gtfs_date) tuples produced by groupby;
# each value is the sub-DataFrame of schedule rows belonging to that window.
full_trip_stop_schedule_dict = {
    gtfs_window: window_schedule
    for gtfs_window, window_schedule in full_trip_stop_schedule.groupby(
        ['start_gtfs_date', 'end_gtfs_date']
    )
}

# analyze September --> November

## get route name -- id dictionary. Nathaniel has a better class for this in `/data_transformations` but I haven't incorporated it yet. the below dictionary works as a quick/dirty way to input `route_short_name` and output `route_id`

In [7]:
# Quick lookup from route_short_name -> route_id.
# If a short name occurs more than once, the last route_id seen wins.
route_name_to_id_dict = {
    short_name: route_id
    for short_name, route_id in zip(
        full_routes_gtfs['route_short_name'].tolist(),
        full_routes_gtfs['route_id'].tolist(),
    )
}

## select a route

In [49]:
# Commented-out alternative kept for reference (presumably a helper-based lookup):
#get_select_routeid_name(full_routes_gtfs, ['E Line'])[2]

# route_short_name of the route to analyze, and its route_id from the lookup dict.
route_of_interest = '1'
route_of_interest_id = route_name_to_id_dict[route_of_interest]

# input_dict collects the identifiers that later cells read (route_id here;
# shape_id is added once the most-used shape has been picked).
input_dict = dict(route_id=route_of_interest_id)
input_dict

{'route_id': 100001}

# get all position files for these months

In [50]:
# Months, written as YYYYMM strings, whose position files should be loaded.
month_list = [
    '201809',
    '201810',
    '201811',
]
full_route_positions = get_positions_months(month_list)

# take only the positions for the chosen route_id

In [79]:
# Keep only the position rows whose route_id equals the selected route's id.
single_route_positions = full_route_positions.loc[
    full_route_positions['route_id'] == input_dict['route_id']
]

# add time index columns

In [80]:
# Apply the two helper transforms in sequence: convert_index_to_pct first,
# then add_time_index_columns on its result.
# NOTE: this rebinds single_route_positions, so re-running the cell feeds the
# already-transformed frame back through both helpers.
single_route_positions = add_time_index_columns(
    convert_index_to_pct(single_route_positions)
)

# find the most popular shape on that route id going in one direction

In [83]:
# direction is passed through as the helper's third argument
# (the notebook narrative refers to it as the GTFS direction_id).
direction = 1

# The helper returns a pair: the chosen shape_id and its trip_headsign.
shape_id, trip_headsign = get_most_used_shape_id_per_direction(
    full_trip_stop_schedule,
    input_dict['route_id'],
    direction,
)

# add shape_id to the input_dict

In [85]:
# Record the selected shape alongside route_id so later cells can read both.
input_dict.update(shape_id=shape_id)

# make one route_vertex_geo from shape_id

In [88]:
# Build the route-vertex table for the selected shape; it is passed to
# get_close_node_process further down to match positions to route nodes.
route_vertex_geo = make_geopandas_shape_df(
    full_shapes_gtfs,
    input_dict['shape_id'],
)

# join position table with trip gtfs information

In [89]:
# Join the route's positions against the trips table one GTFS validity window
# at a time. Each groupby key is a (start_gtfs_date, end_gtfs_date) tuple; it
# is printed for progress and used as the key for that window's joined result.
positions_w_trips = {}
for gtfs_window, window_trips in full_trips_gtfs.groupby(['start_gtfs_date', 'end_gtfs_date']):
    print(gtfs_window)
    window_start, window_end = gtfs_window
    positions_w_trips[gtfs_window] = join_positions_with_gtfs_trips(
        single_route_positions, window_trips, window_start, window_end
    )

('2018-08-15', '2018-09-23')
('2018-09-24', '2018-09-24')
('2018-09-25', '2018-11-01')
('2018-11-02', '2018-11-09')
('2018-11-10', '2018-12-11')
('2018-12-12', '2019-01-07')


# merge all dictionaries into one

In [90]:
# Stack the per-window joined position tables into a single DataFrame.
#
# Empty windows are skipped. The frames are gathered first and concatenated
# once, rather than appended one by one, for three reasons:
#   * keying the "first frame" on idx == 0 breaks when the first window is
#     empty: the result is never initialised, so a later window either raises
#     NameError or silently appends to a stale variable from an earlier run;
#   * DataFrame.append was removed in pandas 2.0;
#   * a single concat avoids re-copying the accumulated rows on every step.
non_empty_position_frames = []
for dict_group, positions_in_window in positions_w_trips.items():
    print(dict_group)
    if not positions_in_window.empty:
        non_empty_position_frames.append(positions_in_window)

# Fail loudly here instead of letting a later cell trip over a missing (or
# stale) unpacked_positions_full.
if not non_empty_position_frames:
    raise ValueError(
        "positions_w_trips contains no non-empty frames; "
        "no positions were joined to GTFS trips for any date window"
    )

# pd.concat keeps each frame's original index and returns a new object, which
# matches what the previous .copy() / .append() chain produced.
unpacked_positions_full = pd.concat(non_empty_position_frames)

('2018-08-15', '2018-09-23')
('2018-09-24', '2018-09-24')
('2018-09-25', '2018-11-01')
('2018-11-02', '2018-11-09')
('2018-11-10', '2018-12-11')
('2018-12-12', '2019-01-07')


# only take positions along one `shape_id`

In [91]:
# Restrict the joined positions to rows on the selected shape_id.
unpacked_positions_one_shape = unpacked_positions_full.loc[
    unpacked_positions_full['shape_id'] == input_dict['shape_id']
]

# try parallel below - I don't think the parallel function will work on a Windows computer (I've had trouble in the past). It should work fine on a Mac.

### the below takes ~1-2 minutes on my computer depending on the shape/number of trips

In [92]:
# Match each position to a route node and print the wall-clock seconds taken.
# The __main__ guard is kept because the notebook narrative describes this
# helper as parallel.
if __name__ == "__main__":
    start = time.time()
    positions_w_near_node_df = get_close_node_process(
        unpacked_positions_one_shape,
        route_vertex_geo,
    )
    print(time.time() - start)

10.823527097702026


# convert `time_pct` to Pacific time for datetime transforms below

In [93]:
# Run the helper's datetime transform on the node-matched positions
# (the notebook narrative says it converts `time_pct` to Pacific time).
positions_w_near_node_datetime = datetime_transform_df(
    positions_w_near_node_df
)

In [96]:
# Compute the distance/time differences and print the wall-clock seconds taken.
if __name__ == "__main__":
    start = time.time()
    distance_time_list_df = get_distance_time_diffs(
        positions_w_near_node_datetime
    )
    print(time.time() - start)

24.720900774002075


# join with gtfs schedule on shape_pt_sequence & calculate times from `trip_start_time`

In [130]:
# Attach schedule information to the distance/time table via join_tripstart.
# The result is the frame that the plots and the CSV export below read from.
position_w_node_schedule = join_tripstart(
    distance_time_list_df,
    full_trip_stop_schedule,
)

# below is an interactive graph if you installed `ipywidgets`

# look at one unique trip

In [34]:
@interact
def show_one_unique_trip(x=list(position_w_node_schedule['month_day_trip_veh'].unique())):
    """Plot actual vs. scheduled time-from-start along the route for one trip.

    ``interact`` turns the list default into a dropdown and calls this
    function with the selected value. The options are taken from
    ``position_w_node_schedule``, the same frame that is filtered below, so
    every selectable value has rows to plot. This also matches the other
    widgets in this notebook.

    Parameters
    ----------
    x : value from the ``month_day_trip_veh`` column
        Identifier of the unique trip to display.
    """
    trip_columns = [
        'month_day_trip_veh',
        'trip_id',
        'route_id',
        'stop_name',
        'shape_pt_sequence',
        'shape_dist_traveled',
        'time_pct',
        'trip_start_time',
        'arrival_time',
        'actual_time_from_scheduled_start',
        'scheduled_time_from_scheduled_start',
    ]
    onetrip_df = position_w_node_schedule.loc[
        position_w_node_schedule['month_day_trip_veh'] == x, trip_columns
    ]

    # Separate local names so the selected trip id in `x` is not overwritten.
    distance_traveled = onetrip_df['shape_dist_traveled'].values
    actual_times = onetrip_df['actual_time_from_scheduled_start'].values
    scheduled_times = onetrip_df['scheduled_time_from_scheduled_start'].values
    stop_names = onetrip_df['stop_name'].values

    fig, ax = plt.subplots(figsize=(15,7))
    ax.plot(distance_traveled, actual_times, label='actual_time', color='red')
    ax.plot(distance_traveled, scheduled_times, label='sched_time', color='blue')
    ax.legend()
    ax.set_title(str(x), fontsize=14)
    ax.set_ylabel('time from scheduled start')
    # Ticks sit at each observation's distance along the shape and are
    # labelled with the stop name.
    ax.set_xticks(distance_traveled)
    ax.set_xticklabels(stop_names, rotation='vertical')

interactive(children=(Dropdown(description='x', options=('2018_10_10_40572056_8207', '2018_10_10_40572060_8241…

# look at a particular `trip_id` (pretty messy right now- need to work on this graph)

In [35]:
@interact
def show_one_tripid(x=list(position_w_node_schedule['trip_id'].unique())):
    """Plot actual vs. scheduled time-from-start for every row of one trip_id.

    ``interact`` renders the list default as a dropdown of trip ids and calls
    this function with the chosen one. All matching rows are drawn as a single
    line, and the title lists the distinct ``dow`` and ``hour`` values found.

    Parameters
    ----------
    x : value from the ``trip_id`` column
        The trip_id to display.
    """
    wanted_columns = [
        'month_day_trip_veh',
        'trip_id',
        'hour',
        'dow',
        'route_id',
        'shape_pt_sequence',
        'shape_dist_traveled',
        'time_pct',
        'trip_start_time',
        'stop_name',
        'arrival_time',
        'actual_time_from_scheduled_start',
        'scheduled_time_from_scheduled_start',
    ]
    is_selected_trip = position_w_node_schedule['trip_id'] == x
    onetrip_df = position_w_node_schedule[is_selected_trip][wanted_columns]

    distances = onetrip_df['shape_dist_traveled'].values
    actual_times = onetrip_df['actual_time_from_scheduled_start'].values
    scheduled_times = onetrip_df['scheduled_time_from_scheduled_start'].values
    stop_names = onetrip_df['stop_name'].values

    # Distinct hours / days-of-week present for this trip_id, as list reprs.
    hours = str(onetrip_df['hour'].unique().tolist())
    dow = str(onetrip_df['dow'].unique().tolist())
    title_str = "day of week {}, hour(s) {}".format(dow, hours)

    fig, ax = plt.subplots(figsize=(15,7))
    ax.plot(distances, actual_times, label='actual_time', color='red')
    ax.plot(distances, scheduled_times, label='sched_time', color='blue')
    ax.legend()
    ax.set_title(title_str, fontsize=14)
    # One tick per observation, labelled with the stop name.
    _ = plt.xticks(distances, stop_names, rotation='vertical')

interactive(children=(Dropdown(description='x', options=(40572056, 40572060, 40572061, 40572063, 40572064, 405…

# show travel speeds across one unique trip

In [36]:
@interact
def show_trip_speeds(x=list(position_w_node_schedule['month_day_trip_veh'].unique())):
    """Plot ``travel_rate_mph`` against distance along the shape for one trip.

    ``interact`` renders the list default as a dropdown and calls this
    function with the chosen ``month_day_trip_veh`` value. Rows whose
    ``travel_rate_mph`` is negative, 80 or higher, or missing are dropped
    before plotting.

    Parameters
    ----------
    x : value from the ``month_day_trip_veh`` column
        Identifier of the unique trip to display.
    """
    speed_columns = [
        'shape_pt_sequence',
        'shape_dist_traveled',
        'total_seconds_delta',
        'travel_rate_mph',
        'stop_name',
        'shape_pt_seq_tuple',
    ]
    is_selected_trip = position_w_node_schedule['month_day_trip_veh'] == x
    onetrip_df = position_w_node_schedule[is_selected_trip][speed_columns].reset_index()

    # Keep rates in the half-open range [0, 80).
    not_negative = onetrip_df['travel_rate_mph'] >= 0
    below_cutoff = onetrip_df['travel_rate_mph'] < 80
    onetrip_df = onetrip_df[not_negative & below_cutoff]

    distances = onetrip_df['shape_dist_traveled'].values
    speeds = onetrip_df['travel_rate_mph'].values
    stop_names = onetrip_df['stop_name'].values

    fig, ax = plt.subplots(figsize=(15,7))
    ax.plot(distances, speeds)
    # One tick per remaining observation, labelled with the stop name.
    _ = plt.xticks(distances, stop_names, rotation='vertical')

interactive(children=(Dropdown(description='x', options=('2018_10_10_40572056_8207', '2018_10_10_40572060_8241…

# export

In [37]:
# Write the final table to transformed/. The filename embeds the route short
# name, the headsign with its spaces removed, and the selected shape_id.
# The date range in the name is a fixed literal.
position_w_node_schedule.to_csv(
    'transformed/route_{}_{}_shape_{}_stopsonly_2018-08-15_2018-12-11.csv'.format(
        route_of_interest,
        trip_headsign.replace(" ", ""),
        input_dict['shape_id'],
    ),
    index=False,
)