# importing matplotlib for plotting at the bottom of this notebook

In [1]:
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import time
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Record when, and as which user, this notebook was run.
from datetime import datetime #added for Windows users
current_time = datetime.now() #added for Windows users
# `!date` was used here originally, but it did not work on Windows
print(current_time.strftime('%m/%d/%Y %H:%M'))
# shell escape: prints the name of the user running the kernel
!whoami

02/02/2020 23:40
root


# load in all functions from `route_shape_process_scripts.py` file

In [3]:
pwd

'/home/work/gtfs-realtime'

In [4]:
import route_shape_process_scripts as f1

# General transformation flow in this notebook:
- import gtfs files
- pick a route of interest
- grab position files for all routes
- filter positions for only that route
- add time index (this is used to break up positions by datetime and join with gtfs)
- get popular shape in both route directions (direction_id)
- make a route_vertex geopandas dataframe - we'll use this to find "nearest" route node
- take all positions and find closest route node
- append route info (distance traveled / shape_pt_sequence)
- create timing metrics based on `trip_start_time` and `time_pct` <- position observation time - converted to Pacific

# import gtfs from 01_gtfs_transform notebook output

In [5]:
#if you ran notebook 01_gtfs this path should be correct
gtfs_merge_file_path = "./data/gtfs_merge/"


def _find_gtfs_file(folder, keyword):
    """Return the name of the first regular file in `folder` whose name contains `keyword`.

    Entries are examined in the order `listdir` yields them, so when several
    files match, the one listed first is returned.

    Args:
        folder: directory to search (not searched recursively).
        keyword: substring that the file name must contain.

    Returns:
        The bare file name (without the folder prefix).

    Raises:
        FileNotFoundError: if no regular file in `folder` contains `keyword`.
    """
    for entry in listdir(folder):
        if isfile(join(folder, entry)) and keyword in entry:
            return entry
    raise FileNotFoundError(f"no file containing '{keyword}' found in {folder}")


# One merged GTFS file of each kind is expected in the folder.
agg_filename = _find_gtfs_file(gtfs_merge_file_path, 'agg')
routes_filename = _find_gtfs_file(gtfs_merge_file_path, 'routes')
shapes_filename = _find_gtfs_file(gtfs_merge_file_path, 'shapes')
trips_filename = _find_gtfs_file(gtfs_merge_file_path, 'trips')

In [6]:
# Load the four merged GTFS tables, in the same order as before:
# routes, shapes, trips, then the aggregated trip/stop schedule.
full_routes_gtfs, full_shapes_gtfs, full_trips_gtfs, full_trip_stop_schedule = (
    pd.read_csv(f"{gtfs_merge_file_path}{gtfs_filename}", low_memory=False)
    for gtfs_filename in (routes_filename, shapes_filename, trips_filename, agg_filename)
)

In [7]:
# Smallest `trip_start_time` recorded for each `trip_id`, with `trip_id` as a regular column.
tripid_w_starttime = (
    full_trip_stop_schedule
    .groupby('trip_id')
    .agg({'trip_start_time': 'min'})
    .reset_index()
)

# break up the gtfs by `start date` and `end date`. TODO: there is a better way to do this. Right now, we want to make sure each vehicle position observation is joined with the correct gtfs information. The simplest way to do that is to break up the position file by date, break up the gtfs by date, and only join where the date windows match.

In [8]:
# One schedule sub-frame per (start_gtfs_date, end_gtfs_date) pair, keyed by that pair.
full_trip_stop_schedule_dict = {
    gtfs_window: schedule_rows
    for gtfs_window, schedule_rows
    in full_trip_stop_schedule.groupby(['start_gtfs_date', 'end_gtfs_date'])
}

## get route name <-> id dictionaries. Nathaniel has a better class for this in `/data_transformations`, but I haven't incorporated it yet. The dictionaries below are a quick-and-dirty way to input `route_short_name` and output `route_id` (and the reverse).

In [9]:
# Lookup: route_short_name -> route_id (if a short name repeats, the last row wins).
route_name_to_id_dict = {
    short_name: route_id
    for short_name, route_id in zip(full_routes_gtfs.route_short_name.tolist(),
                                    full_routes_gtfs.route_id.tolist())
}

In [10]:
# Reverse lookup: route_id -> route_short_name (if a route_id repeats, the last row wins).
route_id_to_name_dict = {
    route_id: short_name
    for route_id, short_name in zip(full_routes_gtfs.route_id.tolist(),
                                    full_routes_gtfs.route_short_name.tolist())
}

## select a route

In [11]:
# alternative lookup, currently not used:
#get_select_routeid_name(full_routes_gtfs, ['E Line'])[2]
# `route_short_name` of the route to analyse; must be a key of route_name_to_id_dict (KeyError otherwise)
route_of_interest = '7'
route_of_interest_id = route_name_to_id_dict[route_of_interest]
# last expression of the cell: display the resolved route_id
route_of_interest_id

100263

# get all position files for these months

In [12]:
# Folder and month (YYYYMM) of the intermediate position file to load.
position_file_location = "./data/intermed/"
position_date = "201905"
# Multi-month alternative, currently disabled:
# month_list = ['201809', '201810', '201811']
# full_route_positions = get_positions_months(month_list, position_file_location)
# `low_memory` is a `pd.read_csv` parser option, not a `pd.read_hdf` argument
# (read_hdf would only forward it as an extra HDF store keyword), so it is not passed here.
full_route_positions = pd.read_hdf(f"{position_file_location}positions_{position_date}.h5")

# take only the positions for the chosen route_id

In [13]:
# Keep only the rows whose `route_id` equals the selected route; `.copy()` makes the
# result independent of `full_route_positions`.
single_route_positions = full_route_positions.loc[
    full_route_positions['route_id'].eq(route_of_interest_id)
].copy()

# make sure the route dataframe is not empty

In [14]:
# Displays True when no positions matched the selected route_id; this is a visual check only and does not stop execution.
single_route_positions.empty

False

# add time index columns

In [15]:
# earlier variant, currently not used:
#single_route_positions = f1.convert_index_to_pct(single_route_positions)
# Rebinds `single_route_positions` to the helper's return value, so re-running this cell
# feeds the already-processed frame back into the helper.
single_route_positions = f1.add_time_index_columns(single_route_positions)

# find the most popular shape on that route id going in one direction

In [16]:
# Direction to analyse; it is handed to the helper as its third argument.
direction = 1
shape_of_interest_id, trip_headsign = f1.get_most_used_shape_id_per_direction(
    full_trip_stop_schedule,
    route_of_interest_id,
    direction,
)
# last expression of the cell: display the selected shape_id
shape_of_interest_id

10007006

# make one route_vertex_geo from shape_id

In [17]:
# Build the route-vertex frame for the selected shape from the GTFS shapes table.
# (presumably a GeoDataFrame, going by the helper's name — confirm in route_shape_process_scripts)
route_vertex_geo = f1.make_geopandas_shape_df(full_shapes_gtfs, shape_of_interest_id)

# join position table with trip gtfs information

In [18]:
# Join the route's positions against each GTFS date window separately; results are
# keyed by the (start_gtfs_date, end_gtfs_date) tuple.
positions_w_trips = {}
gtfs_trip_windows = full_trips_gtfs.groupby(['start_gtfs_date', 'end_gtfs_date'])
for (window_start, window_end), window_trips in gtfs_trip_windows:
    window_key = (window_start, window_end)
    print(window_key)
    positions_w_trips[window_key] = f1.join_positions_with_gtfs_trips(
        single_route_positions, window_trips, window_start, window_end)

('2019-04-20', '2019-05-08')
('2019-05-09', '2019-06-18')


# merge all dictionaries into one

In [19]:
# Stack the per-window joins into one frame, skipping windows that produced no rows.
# The frames are collected first and concatenated once, because:
#  - the previous `idx == 0` initialisation was skipped when the first window was
#    empty, so the next window hit an undefined (or stale) `unpacked_positions_full`;
#  - `DataFrame.append` was removed in pandas 2.0.
non_empty_positions = []
for dict_group, window_positions in positions_w_trips.items():
    print(dict_group)
    if not window_positions.empty:
        non_empty_positions.append(window_positions)

if not non_empty_positions:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError("no positions matched any GTFS date window; nothing to merge")

# Row labels are kept as-is (no ignore_index), matching the append-based result.
unpacked_positions_full = pd.concat(non_empty_positions)

('2019-04-20', '2019-05-08')
('2019-05-09', '2019-06-18')


# only take positions along one `shape_id`

In [20]:
# Restrict to the rows recorded against the selected shape_id.
unpacked_positions_one_shape = unpacked_positions_full.loc[
    unpacked_positions_full['shape_id'] == shape_of_interest_id
]

# try parallel below - I don't think the parallel function will work on a Windows computer (I've had trouble in the past). It should work fine on a Mac.

### the below takes ~1-2 minutes on my computer depending on the shape/number of trips

In [21]:
# NOTE(review): the `__main__` guard is presumably here because the helper runs in parallel
# (see the markdown heading above) — confirm. The recorded output shows the branch does run in the kernel.
# If the guard were ever false, `positions_w_near_node_df` would be left undefined for the cells below.
if __name__ == "__main__":
    # wall-clock timing of the nearest-node search
    start = time.time()
    positions_w_near_node_df = f1.get_close_node_process(unpacked_positions_one_shape, route_vertex_geo)
    end = time.time()
    print(end - start)  # elapsed seconds

70.69993782043457


# convert `time_pct` to Pacific time for datetime transforms below

In [22]:
# Per the section heading, this converts `time_pct` to Pacific time; the conversion itself lives in f1.datetime_transform_df.
positions_w_near_node_datetime = f1.datetime_transform_df(positions_w_near_node_df)

In [23]:
# Combine the positions with the stop schedule and the per-trip start times;
# the join logic itself lives in f1.join_tripstart.
position_w_node_schedule = f1.join_tripstart(
    positions_w_near_node_datetime,
    full_trip_stop_schedule,
    tripid_w_starttime,
)

In [24]:
# Row-wise distance between the reported vehicle position and the shape point on the same row.
position_w_node_schedule['distance_btw_veh_and_shape'] = position_w_node_schedule.apply(
    lambda obs: f1.calc_distance(
        obs['vehicle_lat'],
        obs['vehicle_long'],
        obs['shape_pt_lat'],
        obs['shape_pt_lon'],
    ),
    axis=1,
)

## add in Nathaniel's code for finding closest point on route

In [25]:
import find_closest_route_point as f2

In [26]:
# Drop the leftover `index` column. Rebinding (instead of `inplace=True`) together with
# errors='ignore' keeps this cell safe to re-run: the in-place version raised KeyError
# on a second execution because the column was already gone.
position_w_node_schedule = position_w_node_schedule.drop(columns=['index'], errors='ignore')

In [27]:
import numpy as np

In [28]:
# (rows, columns) of the frame handed to the closest-point search below
position_w_node_schedule.shape

(65958, 34)

In [29]:
# NOTE(review): same `__main__` guard as the nearest-node cell; presumably the helper uses
# multiple processes — confirm in find_closest_route_point. The recorded run printed roughly 410 seconds.
if __name__ == "__main__":
    # wall-clock timing of the closest-point search
    start = time.time()
    # `get_closeset_point_process` is the helper's actual (misspelled) name in the imported module
    position_w_closest_df = f2.get_closeset_point_process(position_w_node_schedule, full_shapes_gtfs, shape_of_interest_id)
    end = time.time()
    print(end - start)  # elapsed seconds

410.31353735923767


## unpack the tuple

In [74]:
# Each `closest_pt_on_route_tuple` value is indexed at positions 0 and 1;
# split those two elements into their own columns.
closest_pt_tuples = position_w_closest_df['closest_pt_on_route_tuple']
position_w_closest_df['closest_pt_coords'] = closest_pt_tuples.apply(lambda pair: pair[0])
position_w_closest_df['shape_dist_traveled_to_closest_pt'] = closest_pt_tuples.apply(lambda pair: pair[1])

In [75]:
# The tuple column has been split into two columns, so remove it. Rebinding (instead of
# `inplace=True`) together with errors='ignore' keeps this cell safe to re-run: the
# in-place version raised KeyError on a second execution because the column was already gone.
position_w_closest_df = position_w_closest_df.drop(columns=['closest_pt_on_route_tuple'], errors='ignore')

In [76]:
# Sample standard deviation of the vehicle-to-shape distances.
# NOTE(review): this value is not read by any of the cells visible here.
veh_dist_to_shape_std_dev = position_w_closest_df['distance_btw_veh_and_shape'].std()

In [77]:
# Keep observations whose vehicle-to-shape distance is below 100
# (units are whatever f1.calc_distance returns — TODO confirm).
filtered_position_w_node_schedule = position_w_closest_df.loc[
    position_w_closest_df['distance_btw_veh_and_shape'].lt(100)
].copy()

In [78]:
#output folder
# NOTE: the folder must already exist; nothing in this cell creates it.
output_folder = "./data/transformed/"
# one CSV per route and month, e.g. route_7_201905.csv for the values set above; the row index is not written
filtered_position_w_node_schedule.to_csv(f"{output_folder}route_{route_of_interest}_{position_date}.csv",index=False)