# This notebook goes through building transit network from gtfs to network standard

1. extract represetative trips
2. snap stops to roadway nodes
3. route bus on roadway via osmnx routing
4. route bus on roadway via shst routing
5. build non-bus/rail links and nodes
6. complete network node list that each transit path traverses
7. frquence based stop time
8. write out to transit network standard
9. write out quick QA/QC transit route true shape
10. write out network standard with rail nodes and links

In [1]:
import partridge as ptg
import peartree as pt
#%matplotlib inline
import requests
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString
import networkx as nx
from shapely import wkt
from scipy.spatial import cKDTree
import osmnx as ox
from dbfread import DBF
from osgeo import ogr
import glob
import time
import json

In [None]:
step1_output_folder = "../data/processed/step1_roadway/"
data_interim_folder = "../data/interim/"
output_folder = "../tests/networkstandard/step2_transit/"

In [483]:
cd = "../gtfs_transit/"
muni_url = "https://transitfeeds.com/p/sfmta/60/20160125/download"

#osm drive file
link_file = step1_output_folder + "sf_link.json"
with open(link_file) as f:
    link_json = json.load(f)
link_df = pd.DataFrame(link_json)

#osm drive file
node_file = step1_output_folder + "sf_node.geojson"
node_gdf = gpd.read_file(node_file)

shape_gdf = gpd.read_file(step1_output_folder + "sf_shape.geojson")

In [485]:
link_df.info()
node_gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74352 entries, 0 to 74351
Data columns (total 41 columns):
A                     74352 non-null int64
B                     74352 non-null int64
LANES                 38653 non-null float64
access                74352 non-null object
area                  74352 non-null object
bike_access           74352 non-null int64
bridge                74352 non-null object
drive_access          74352 non-null int64
est_width             74352 non-null object
forward               41385 non-null float64
fromIntersectionId    74352 non-null object
highway               74352 non-null object
id                    74352 non-null object
junction              74352 non-null object
key                   74352 non-null object
landuse               74352 non-null object
lanes                 74352 non-null object
length                74352 non-null float64
link                  74352 non-null object
maxspeed              74352 non-null object
name        

In [486]:
print(node_gdf.crs)
print(node_gdf.columns)

{'init': 'epsg:4326'}
Index(['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access',
       'bike_access', 'model_node_id', 'geometry'],
      dtype='object')


In [487]:
node_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 27700 entries, 0 to 27699
Data columns (total 7 columns):
osm_node_id      27700 non-null int64
shst_node_id     27700 non-null object
drive_access     27700 non-null int64
walk_access      27700 non-null int64
bike_access      27700 non-null int64
model_node_id    27700 non-null int64
geometry         27700 non-null object
dtypes: int64(5), object(2)
memory usage: 1.5+ MB


In [3]:
# gfts shape shst match reference Id
muni_shst_df = gpd.read_file(data_interim_folder + "muni.transit.out.matched.geojson")

In [488]:
drive_node_gdf = node_gdf[node_gdf.drive_access == 1].copy()
drive_link_df = link_df[link_df.drive_access == 1].copy()

In [5]:
def ox_graph(nodes_df, links_df):
    """
        create an osmnx-flavored network graph
        osmnx doesn't like values that are arrays, so remove the variables
        that have arrays.  osmnx also requires that certain variables
        be filled in, so do that too.
        Parameters
        ----------
        nodes_df : GeoDataFrame
        link_df : GeoDataFrame
        Returns
        -------
        networkx multidigraph
    """
    try:
        graph_nodes = nodes_df.drop(
                ["inboundReferenceId", "outboundReferenceId"], axis=1
            )
    except:
        graph_nodes = nodes_df.copy()

    graph_nodes.gdf_name = "network_nodes"
    graph_nodes['id'] = graph_nodes['shst_node_id']

    graph_links = links_df.copy()
    graph_links['id'] = graph_links['shstReferenceId']
    graph_links['key'] = graph_links['shstReferenceId']

    G = ox.gdfs_to_graph(graph_nodes, graph_links)

    return G

In [489]:
# build network routing file for osmnx routing

G_drive_sf = ox_graph(drive_node_gdf,
                  drive_link_df)

In [490]:
nx.shortest_path(G_drive_sf, 293741891, 65284950, weight = "length")

[293741891,
 65290257,
 911547143,
 3593679267,
 423778249,
 65290252,
 5435466368,
 65290251,
 65290249,
 65290238,
 65290236,
 5435466213,
 5435466205,
 65290232,
 5435466333,
 5435466219,
 65290229,
 65290227,
 5435466158,
 65290225,
 5435466163,
 65281097,
 4911322443,
 5437055071,
 65284950]

In [10]:
def get_representative_feed_from_gtfs(work_dir, in_url, fetch = False):
    
    print('getting representative feed...')
    
    if fetch == True:
        #read and save zip from url
        resp = urlopen(in_url)
        zipfile = ZipFile(BytesIO(resp.read()))
    
    if fetch == True:
        zipfile.extractall(work_dir + "muni")
    
    file_loc = work_dir + "muni"
    
    # get feed for the busiest day
    feed = pt.get_representative_feed(file_loc)
    
    return feed

In [11]:
feed = get_representative_feed_from_gtfs(cd, muni_url, True)

getting representative feed...


In [627]:
feed.stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,6680994,25740.0,25740.0,4015,1,,,,
1,6680994,25808.0,25808.0,6294,2,,,,
2,6680994,25881.0,25881.0,6290,3,,,,
3,6680994,25920.0,25920.0,6314,4,,,,
4,6680994,25967.0,25967.0,6307,5,,,,
5,6680994,26024.0,26024.0,6302,6,,,,
6,6680994,26069.0,26069.0,6299,7,,,,
7,6680994,26109.0,26109.0,6316,8,,,,
8,6680994,26160.0,26160.0,6312,9,,,,
9,6680994,26228.0,26228.0,6315,10,,,,


In [655]:
# pick representatives for each route by direction, with most number of trip 
def get_representative_trip_for_route(feed):
    
    """
    get the representative trips for each route, by direction, tod
    
    """
    
    print('getting representative trip...')
    
    # get the first stop of each trip to determine the time period for each trip
    # process time
    stop_times_df = feed.stop_times.copy()
    stop_times_df['arrival_h'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.hour
    stop_times_df['arrival_m'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.minute
    stop_times_df['departure_h'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.hour
    stop_times_df['departure_m'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.minute
    
    # according to the gtfs reference, the stop sequence does not have to be consecutive, but has to always increase
    # so we can get the fisrt stop by the smallest stop sequence on the trip
    stop_times_df.sort_values(by = ["trip_id", "stop_sequence"], 
                              ascending = True, 
                              inplace = True)
    first_stop_df = stop_times_df.drop_duplicates(subset = ["trip_id"])
    
    ## identify peak, offpeak trips, based on the arrival time of first stop
    trip_df = feed.trips.copy()
    trip_df = pd.merge(trip_df, 
                       first_stop_df,
                       how = 'left',
                       on = 'trip_id')
    
    ## AM: 6-10am, MD: 10am-3pm, PM: 3-7pm, NT 7pm-3am, EA 3-6am
    trip_df['tod'] = np.where((trip_df['arrival_h'] >= 6) & (trip_df['arrival_h'] < 10),
                                 'AM',
                                 np.where((trip_df['arrival_h'] >= 10) & (trip_df['arrival_h'] < 15),
                                     'MD',
                                    np.where((trip_df['arrival_h'] >= 15) & (trip_df['arrival_h'] < 19),
                                         'PM',
                                        np.where((trip_df['arrival_h'] >= 3) & (trip_df['arrival_h'] < 6),
                                             'EA',
                                                 'NT'))))
  
    # get the most frequent trip for each route, by direction, by time of day
    ## trips share the same shape_id is considered being the same
    ## first get the trip count for each shape_id
    trip_freq_df = trip_df.groupby(['route_id', 'tod', 'direction_id', 'shape_id'])['trip_id'].count().\
                            to_frame().\
                            drop(index = 'other', level = 1).\
                            reset_index()
    
    ## then choose the most frequent shape_id for each route, frequency use the total number of trips
    def agg(x):
        m = x.shape_id.iloc[np.argmax(x.trip_id.values)]
        return pd.Series({'trip_num' : x.trip_id.sum(), 'shape_id' : m})
   
    trip_freq_df = trip_freq_df.reset_index().groupby(['route_id', 'tod', 'direction_id']).apply(agg)
    
    # retain the complete trip info of represent trip only
    trip_df = pd.merge(trip_df, trip_freq_df.reset_index(),
                      how = 'inner',
                      on = ['route_id', 'tod', 'direction_id', 'shape_id']).\
                drop_duplicates(['route_id', 'direction_id', 'tod'])
    
    return trip_df

In [656]:
trip_df = get_representative_trip_for_route(feed)

getting representative trip...


In [672]:
print(trip_df.info())
trip_df[trip_df.route_id == "1031"]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 10835
Data columns (total 21 columns):
route_id               600 non-null object
service_id             600 non-null object
trip_id                600 non-null object
trip_headsign          597 non-null object
direction_id           600 non-null int64
block_id               600 non-null object
shape_id               600 non-null object
arrival_time           600 non-null float64
departure_time         600 non-null float64
stop_id                600 non-null object
stop_sequence          600 non-null int64
stop_headsign          600 non-null object
pickup_type            0 non-null float64
drop_off_type          600 non-null object
shape_dist_traveled    0 non-null float64
arrival_h              600 non-null int64
arrival_m              600 non-null int64
departure_h            600 non-null int64
departure_m            600 non-null int64
tod                    600 non-null object
trip_num               600 non-null int

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id,arrival_time,departure_time,stop_id,...,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,arrival_h,arrival_m,departure_h,departure_m,tod,trip_num
321,1031,1,6682885,Downtown,1,5106,133152,25200.0,25200.0,3927,...,,,,,7,0,7,0,AM,14
456,1031,1,6777414,the Richmond District,0,5166,133146,60720.0,60720.0,4347,...,,,,,16,52,16,52,PM,14


In [491]:
def snap_stop_to_node(feed, node_gdf):
    
    """
    map gtfs stops to roadway nodes
    
    Parameters:
    ------------
    feed
    drive nodes
    
    return
    ------------
    stops with drive nodes id
    """
    
    print('snapping gtfs stops to roadway node osmid...')
    
    node_non_c_gdf = node_gdf.copy()
    node_non_c_gdf = node_non_c_gdf.to_crs({'init' : 'epsg:26915'})
    node_non_c_gdf['X'] = node_non_c_gdf.geometry.map(lambda g:g.x)
    node_non_c_gdf['Y'] = node_non_c_gdf.geometry.map(lambda g:g.y)
    inventory_node_ref = node_non_c_gdf[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    stop_df = feed.stops.copy()
    stop_df['geometry'] = [Point(xy) for xy in zip(stop_df['stop_lon'], stop_df['stop_lat'])]
    stop_df = gpd.GeoDataFrame(stop_df)
    stop_df.crs = {'init' : 'epsg:4326'}
    stop_df = stop_df.to_crs({'init' : 'epsg:26915'})
    stop_df['X'] = stop_df['geometry'].apply(lambda p: p.x)
    stop_df['Y'] = stop_df['geometry'].apply(lambda p: p.y)
   
    for i in range(len(stop_df)):
        point = stop_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_snap_gdf = gpd.GeoDataFrame(node_non_c_gdf.iloc[ii]).transpose().reset_index(drop = True)
        add_snap_gdf['stop_id'] = stop_df.iloc[i]['stop_id']
        if i == 0:
            stop_to_node_gdf = add_snap_gdf.copy()
        else:
            stop_to_node_gdf = stop_to_node_gdf.append(add_snap_gdf, ignore_index=True, sort=False)
    
    stop_df.drop(['X','Y'], axis = 1, inplace = True)
    stop_to_node_gdf = pd.merge(stop_df, stop_to_node_gdf, how = 'left', on = 'stop_id')
    
    column_list = feed.stops.columns.values.tolist() + ['osm_node_id', 'shst_node_id', "model_node_id"]
    
    return stop_to_node_gdf[column_list]

In [774]:
stop_df = snap_stop_to_node(feed, drive_node_gdf)

snapping gtfs stops to roadway node osmid...


In [775]:
stop_df.info()
stop_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3519 entries, 0 to 3518
Data columns (total 10 columns):
stop_id          3519 non-null object
stop_name        3519 non-null object
stop_desc        3519 non-null object
stop_lat         3519 non-null float64
stop_lon         3519 non-null float64
zone_id          3519 non-null object
stop_url         3519 non-null object
osm_node_id      3519 non-null object
shst_node_id     3519 non-null object
model_node_id    3519 non-null object
dtypes: float64(2), object(8)
memory usage: 302.4+ KB


Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,osm_node_id,shst_node_id,model_node_id
0,390,19th Avenue & Holloway St,,37.72119,-122.475096,,,3527108528,5014094a822c8a55290ed109a43d3687,1012489
1,913,DUBLIN ST & LAGRANDE AVE,,37.719192,-122.425802,,,4808718391,ac144f0c0a9977cf30163bf1a5309cc9,1017384
2,3003,2nd St & Brannan St,,37.781827,-122.391945,,,65315466,730a5b5ffca71fa58473cf5df382acae,1006787


In [675]:
def route_bus_link_osmnx(roadway_gdf, node_gdf, G, feed, trip, stop):
    
    """
    route bus with OSMNX routing
    
    Parameters
    ----------
    drive link
    drive node
    drive graph
    feed
    trip 
    stop
    
    return
    ----------
    dataframe of drive links bus trips traverses
    list of trips that could not be routed by OSMNX
    """
    
    trip_df = trip.copy()
    stop_df = stop.copy()
    stop_time_df = feed.stop_times.copy()
    
    chained_stop_df = stop_time_df[stop_time_df['trip_id'].isin(trip_df.trip_id.tolist())]
    chained_stop_to_node_df = pd.merge(chained_stop_df, 
                                       stop_df,
                                        how = 'left',
                                        on = 'stop_id')
    
    print('routing bus on roadway network with osmnx...')
    
    #osm_node_dict = dict(zip(node_gdf.osmid, node_gdf.N))
    
    trip_df = pd.merge(trip_df, feed.routes, how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df['route_type'] == 3]
    
    # to track trips that osmnx failed to route
    broken_shape_trip_list = []
    
    # output dataframe for osmnx success
    trip_link_shape_df = pd.DataFrame()
    
    # loop through for bus trips
    for trip_id in bus_trip_df.trip_id.unique():
        
        # get the stops on the trip
        trip_stop_df = chained_stop_to_node_df[chained_stop_to_node_df['trip_id'] == trip_id]

        try:
            print("routing" + trip_id)
            for s in range(len(trip_stop_df)-1):
                # from stop node OSM id
                closest_node_to_stop1 = int(trip_stop_df.osm_node_id.iloc[s])
                
                # to stop node OSM id
                closest_node_to_stop2 = int(trip_stop_df.osm_node_id.iloc[s+1])
                
                # osmnx routing btw from and to stops, return the list of nodes
                node_osmid_list = nx.shortest_path(G, closest_node_to_stop1, closest_node_to_stop2)
                
                # get the links
                if len(node_osmid_list) > 1:
                    osm_link_gdf = pd.DataFrame({'u' : node_osmid_list[:len(node_osmid_list)-1], 
                                            'v' : node_osmid_list[1:len(node_osmid_list)],
                                            'trip_id' : trip_id},
                                               )
                else:
                    continue
                
                trip_link_shape_df = trip_link_shape_df.append(osm_link_gdf, ignore_index = True, sort = False)
                
        

        except:
            broken_shape_trip_list = broken_shape_trip_list + [trip_id]
            print('  warning: cannot route bus: ' + trip_id)
            continue      
                
    trip_link_shape_df = pd.merge(trip_link_shape_df, trip_df[['trip_id', 'shape_id']], how = 'left', on = 'trip_id')

    trip_link_shape_df = pd.merge(trip_link_shape_df,
                                  drive_link_df[["u", "v", "wayId", "shstReferenceId", "shstGeometryId", "A", "B"]].\
                                      drop_duplicates(subset = ["u", "v"]),
                                  how = "left",
                                  on = ["u", "v"])
    
    
    return trip_link_shape_df, broken_shape_trip_list

In [676]:
stop_df[stop_df.stop_id.isin(["3042", "3071"])]

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,osm_node_id,shst_node_id,model_node_id
31,3042,Balboa St & 14th Ave,,37.77678,-122.4727,,,65349672,925fee006f9bc6713d7bd75ec1058c2e,1009533
58,3071,Balboa St & Park Presidio Blvd,,37.776788,-122.472411,,,65349672,925fee006f9bc6713d7bd75ec1058c2e,1009533


In [677]:
bus_osmnx_link_shape_df, bus_osmnx_broken_trip_list = route_bus_link_osmnx(drive_link_df, 
                                                                            drive_node_gdf, 
                                                                            G_drive_sf, 
                                                                            feed,
                                                                            trip_df, 
                                                                            stop_df)

routing bus on roadway network with osmnx...
routing6681228
routing6681219
routing6681318
routing6681305
routing6681390
routing6681070
routing6681042
routing6681038
routing6681034
routing6681049
routing6682885
routing6682961
routing6768113
routing6767752
routing6682823
routing6768226
routing6767277
routing6682952
routing6682951
routing6682850
routing6782020
routing6777414
routing6682866
routing6682864
routing6750403
routing6750367
routing6750360
routing6750337
routing6750336
routing6750475
routing6750483
routing6750661
routing6750633
routing6750613
routing6750520
routing6750519
routing6750587
routing6750582
routing6750546
routing6697427
routing6697425
routing6697413
routing6697398
routing6697387
routing6697454
routing6697443
routing6697495
routing6697481
routing6697469
routing6697866
routing6697858
routing6697844
routing6697841
routing6697892
routing6697786
routing6697771
routing6697839
routing6697823
routing6697807
routing7097415
routing6791152
routing6791146
routing6791142
routing679

routing7089892
routing7089934
routing7089771
routing7102336
routing7102408
routing7102399
routing7102386
routing7102364
routing7102360
routing7102327
routing7102437
routing7102426
routing7102384
routing7102122
routing7102120
routing7102096
routing7102145
routing7102108
routing7102081
routing7102068
routing7102161
routing7102324
routing7091231
routing7091228
routing7091197
routing7091140
routing7091128
routing7091410
routing7091398
routing7091373
routing7091305
routing7091291
routing7092288
routing7092317
routing7092259
routing7092184
routing7092343
routing7092262
routing7092207
routing7092380
routing7092410
routing7092409
routing7093220
routing7093202
routing7093196
routing7093284
routing7093239
routing7093096
routing7093093
routing7093144
routing7093143
routing7093138
routing7095116
routing7095091
routing7108203
routing7108197
routing7108138
routing7108187
routing7108186
routing7108258
routing7108256
routing7108280
routing7108350
routing7108325
routing7109115
routing7109113
routing710

In [678]:
bus_osmnx_link_shape_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71754 entries, 0 to 71753
Data columns (total 9 columns):
u                  71754 non-null int64
v                  71754 non-null int64
trip_id            71754 non-null object
shape_id           71754 non-null object
wayId              71754 non-null object
shstReferenceId    71754 non-null object
shstGeometryId     71754 non-null object
A                  71754 non-null int64
B                  71754 non-null int64
dtypes: int64(4), object(5)
memory usage: 5.5+ MB


In [679]:
# osmnx failed to route these trips: can be rail modes
print(bus_osmnx_broken_trip_list)

['6750403', '6750367', '6750360', '6750337', '6750336', '6750475', '6750483', '7072738', '7072726', '6702628', '6702626', '6702656', '6702754', '7072734', '6702699', '6702735', '6702719', '6703482', '6703480', '6703466', '6703446', '6703435', '6771948', '6704443', '6704417', '6704413', '6704335', '6771824', '6771815', '6771733', '6707301', '6707299', '6707257', '6707242', '6707498', '6707350', '6707223', '6707311', '6707282', '6707276', '6717535', '6717599', '6717575', '6717633', '6720616', '6720615', '6720568', '6720603', '6720588', '6794598', '6794585', '6794573', '6794558', '6794610', '6794609', '6794649', '6794636', '7072709', '7072707', '7089974', '7090049', '7090030', '7090124', '7089855', '7089823', '7089805', '7089943', '7089873', '7089771', '7102122', '7102120', '7102145', '7102161', '7092259', '7092184', '7092262']


In [680]:
# shapes that were not successfully routed by OSMNX

trip_df[trip_df.trip_id.isin(bus_osmnx_broken_trip_list)].shape_id.unique()

array(['133186', '133188', '133189', '133271', '133272', '133274',
       '133277', '134212', '133314', '133310', '133309', '133380',
       '133397', '133407', '133408', '133439', '138954', '138946',
       '138944', '138943', '138940', '138938', '138958', '138973'],
      dtype=object)

In [683]:
trip_df.shape_id.nunique()

208

In [684]:
bus_osmnx_link_shape_df.shape_id.nunique()

176

In [697]:
def route_bus_link_shst(drive_link, gtfs_shst_id):
    
    """
    route bus with shst match result
    
    parameter
    ---------
    drive link
    gtfs shst match return
    
    return
    ---------
    dataframe of drive links bus traverses
    list of imcomplete bus shapes
    
    """
    
    drive_link_df = drive_link.copy()
    shape_shst_df = gtfs_shst_id.copy()

    shape_shst_df = pd.merge(shape_shst_df, 
                             drive_link_df[
                                 ['shstReferenceId','wayId','u','v', "fromIntersectionId", "toIntersectionId", "A", "B"]
                             ],
                             how = 'left',
                             left_on = 'shstReferenceId',
                             right_on = 'shstReferenceId')
    
    shape_shst_df["u"] = shape_shst_df["u"].fillna(0).astype(np.int64)
    shape_shst_df["v"] = shape_shst_df["v"].fillna(0).astype(np.int64)
    shape_shst_df["A"] = shape_shst_df["A"].fillna(0).astype(np.int64)
    shape_shst_df["B"] = shape_shst_df["B"].fillna(0).astype(np.int64)
    
    """shape_shst_df.dropna(subset = ['u','v'], 
                         axis = 0, 
                         inplace = True)"""
    
    shape_shst_df = shape_shst_df.reset_index(drop=True)
    
    shape_shst_df['next_pp_shape_id'] = shape_shst_df['pp_shape_id'].\
                                            iloc[1:].\
                                            append(pd.Series(shape_shst_df['pp_shape_id'].iloc[-1])).\
                                            reset_index(drop=True)
    
    shape_shst_df['next_u'] = shape_shst_df['u'].\
                                iloc[1:].\
                                append(pd.Series(shape_shst_df['v'].iloc[-1])).\
                                reset_index(drop=True)
    
    incomplete_shape_list = shape_shst_df[\
                                   (shape_shst_df.pp_shape_id==shape_shst_df.next_pp_shape_id)\
                                   &(shape_shst_df.v!=shape_shst_df.next_u)\
                                  ].pp_shape_id.unique().\
                                    tolist()
    
    shape_shst_df = shape_shst_df[~shape_shst_df.pp_shape_id.isin(incomplete_shape_list)].copy()
    
    return shape_shst_df, incomplete_shape_list

In [698]:
bus_shst_link_shape_df, incomplete_shape_list = route_bus_link_shst(drive_link_df, muni_shst_df)

print(bus_shst_link_shape_df.shape)
print(bus_shst_link_shape_df.pp_shape_id.nunique())

(31715, 26)
313


In [699]:
print(incomplete_shape_list)

[133153, 133158, 133188, 133192, 133194, 133199, 133202, 133206, 133209, 133213, 133214, 133225, 133226, 133232, 133249, 133251, 133276, 133308, 133312, 133327, 133336, 133346, 133349, 133351, 133389, 133392, 133418, 133423, 133435, 133436, 133473, 133494, 133640, 133701, 135377, 137828, 137832, 137836, 137842, 137847, 138105, 138277, 138338, 138341, 138342, 138608, 138938, 138943, 138946, 138951, 138953, 138957, 138958, 138975, 138976, 138993, 138997, 139044, 139072, 139132, 139135, 139137, 139150, 139154, 139216, 139281, 139294]


In [29]:
# some of these buses has parts that are out side of SF county boundary, that's why they are labeled as incomplete shape
# some are due to the discrepency btw shst extraction and osmnx extraction

array([], dtype=int64)

In [700]:
bus_shst_link_shape_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 31715 entries, 0 to 41872
Data columns (total 26 columns):
shstReferenceId           31715 non-null object
shstGeometryId            31715 non-null object
shstFromIntersectionId    31715 non-null object
shstToIntersectionId      31715 non-null object
gisReferenceId            31715 non-null object
gisGeometryId             31715 non-null object
gisTotalSegments          31715 non-null int64
gisSegmentIndex           31715 non-null int64
gisFromIntersectionId     31715 non-null object
gisToIntersectionId       31715 non-null object
startSideOfStreet         31715 non-null object
endSideOfStreet           31715 non-null object
sideOfStreet              31715 non-null object
score                     31715 non-null float64
matchType                 31715 non-null object
pp_shape_id               31715 non-null int64
geometry                  31715 non-null object
wayId                     31715 non-null object
u                    

In [685]:
def bus_link(bus_link_osmnx, bus_link_shst, trip, incomplete_list):
    
    """
    combine bus links from OSMNX and SHST
    """
    
    bus_link_osmnx_df = bus_link_osmnx.copy()
    bus_link_shst_df = bus_link_shst.copy()
    
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, feed.routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df.route_type == 3].copy()
    
    bus_link_shst_df.pp_shape_id = bus_link_shst_df.pp_shape_id.astype(str)
    
    shape_id_list = bus_trip_df.shape_id.unique().tolist()
    
    incomplete_list = [str(x) for x in incomplete_list]
    
    print("Targeting number of bus shape IDs: " + str(bus_trip_df.shape_id.nunique()))
    
    #trip_id, shape_id, u, v, link_id, omsid, shstrefere
    
    shst_shape_list = list(set([str(x) for x in bus_link_shst_df.pp_shape_id]))
    
    shapes_replace_with_shst_list = [x for x in shst_shape_list if x in shape_id_list]
    
    print("\n There are " + str(len(shapes_replace_with_shst_list)) + 
          " shapes that are from shst gtfs matching: \n \t" + 
          str(shapes_replace_with_shst_list))

    bus_link_osmnx_df = bus_link_osmnx_df[~bus_link_osmnx_df.shape_id.isin(shapes_replace_with_shst_list)].copy()
    
    osmnx_shape_list = bus_link_osmnx_df.shape_id.unique().tolist()
    
    print("\n There are " + str(len(osmnx_shape_list)) + 
          " shapes that are from OSMNX routing: \n \t" + 
          str(osmnx_shape_list))
    
    not_routed_list = [x for x in shape_id_list if x not in (shst_shape_list + osmnx_shape_list)]
    
    print("\n There are " + str(len(not_routed_list)) + 
         " shapes that are not routed by either of the two methods: \n \t" + 
         str(not_routed_list))
    
    bus_link_shst_df = pd.merge(bus_link_shst_df,
                                bus_trip_df[['trip_id', 'shape_id']],
                                how = 'inner',
                                left_on = 'pp_shape_id',
                                right_on = 'shape_id')
    #bus_link_shst_df.drop(['pp_shape_id'], axis = 1, inplace = True)
    
    bus_link_df = pd.concat([bus_link_osmnx_df, bus_link_shst_df],
                            sort = False,
                           ignore_index = True)
    
    column_list = bus_link_osmnx.columns.values.tolist()
    
    return bus_link_df[column_list]

In [686]:
bus_link_df = bus_link(bus_osmnx_link_shape_df, bus_shst_link_shape_df, trip_df, incomplete_shape_list)

Targeting number of bus shape IDs: 178

 There are 58 shapes that are from shst gtfs matching: 
 	['139047', '133395', '133432', '133332', '133261', '139001', '138979', '133401', '133267', '133323', '133433', '133426', '139050', '139139', '133269', '138961', '133142', '138912', '133586', '139030', '133303', '139021', '133297', '138908', '133299', '139022', '133235', '133155', '133154', '139000', '133585', '139024', '133139', '133330', '133396', '133310', '138913', '133368', '133275', '138936', '139023', '133309', '133150', '138980', '133331', '133422', '133151', '133260', '133274', '138909', '133238', '138926', '133314', '133434', '139045', '138928', '133305', '133328']

 There are 118 shapes that are from OSMNX routing: 
 	['133138', '133152', '133153', '133149', '133144', '133145', '133146', '133147', '133148', '133186', '133188', '133189', '133190', '133192', '133193', '133236', '133237', '133247', '133251', '133254', '133253', '133257', '133255', '133256', '133272', '133276', '1332

In [701]:
bus_link_df.info()
bus_link_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73362 entries, 0 to 73361
Data columns (total 9 columns):
u                  73362 non-null int64
v                  73362 non-null int64
trip_id            73362 non-null object
shape_id           73362 non-null object
wayId              73362 non-null object
shstReferenceId    73362 non-null object
shstGeometryId     73362 non-null object
A                  73362 non-null int64
B                  73362 non-null int64
dtypes: int64(4), object(5)
memory usage: 5.0+ MB


Unnamed: 0,u,v,trip_id,shape_id,wayId,shstReferenceId,shstGeometryId,A,B
0,65295383,4017324918,6681049,133138,224200962,3eed7ea339699d7fde84fddc9bbb981f,fcd40c63e5f8e198d1176e67b76112bf,1009243,1018384
1,4017324918,65295385,6681049,133138,224200962,937ab168d15ad27afb9aa8aaa4c526be,e7614382af759f79352c1ebf7edccf09,1018384,1010935
2,65295385,65295388,6681049,133138,224316515,4c6fc46b0c1970e89b23161a97dae29b,f120b796e34c04f3d8a757abe5e555a5,1010935,1023127


In [801]:
# create rail links
def non_bus_link(feed, trip, stop):
    
    """
    create rail links and nodes
    
    nodes are based on rail stops, links are true shape between nodes
    
    return
    ---------
    complete rail link path for each rail service
    complete rail node path for each rail service
    
    """
    
    print('generating rail links...')
    
    #get rail trips
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, feed.routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    rail_trip_df = trip_df[trip_df.route_type != 3].copy()
    
    stop_df = stop.copy()
    stop_time_df = feed.stop_times.copy()
    
    #get rail trips with stops
    chained_stop_to_node_df = pd.merge(stop_time_df, 
                                       stop_df,
                                       how = 'left',
                                       on = 'stop_id')
    
    rail_stop_time_df = chained_stop_to_node_df[
                                                chained_stop_to_node_df['trip_id']\
                                                .isin(rail_trip_df.trip_id.tolist())
                                               ]\
                                                .copy()
    
    #get gtfs rail shapes
    rail_shape_df = feed.shapes[
                                feed.shapes['shape_id'].isin(rail_trip_df.shape_id.tolist())
                                ].copy()
    
    #gtfs shape-trip correspondence
    shape_trip_dict = dict(zip(rail_trip_df.shape_id, rail_trip_df.trip_id))
    
    print(rail_shape_df.shape_id.unique())
    #for each rail shape
    for i in rail_shape_df.shape_id.unique():
    
        trip_id = shape_trip_dict[i]
        
        #get chained stop
        trip_stop_df = rail_stop_time_df[rail_stop_time_df.trip_id == trip_id].copy()
        
        # get gtfs shape nodes for the shape
        trip_shape_df = rail_shape_df[rail_shape_df.shape_id == i].copy()
        # initialize columns
        trip_shape_df['is_stop'] = np.int(0)
        trip_shape_df['stop_id'] = np.nan
        
        # for each rail stop, find the closest node in the shape, and those are the stops and breakpoints of new rail links
        # return is a gtfs node shape dataframe with two columns indicating if the node is a stop and the stop id
        shape_inventory = trip_shape_df[['shape_pt_lon', 'shape_pt_lat']].values
        tree = cKDTree(shape_inventory)
        for s in range(len(trip_stop_df)):
            point = trip_stop_df.iloc[s][['stop_lon', 'stop_lat']].values
            dd, ii = tree.query(point, k = 1)
            trip_shape_df.shape_pt_lon.iloc[ii] = trip_stop_df.iloc[s]['stop_lon']
            trip_shape_df.shape_pt_lat.iloc[ii] = trip_stop_df.iloc[s]['stop_lat']
            trip_shape_df.is_stop.iloc[ii] = 1
            trip_shape_df.stop_id.iloc[ii] = trip_stop_df.iloc[s]['stop_id']
        
        # appending the gtfs shape for each route shape id
        if i == rail_shape_df.shape_id.unique()[0]:
            shape_flag_df = trip_shape_df.copy()
        else:
            shape_flag_df = shape_flag_df.append(trip_shape_df, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # starting to build new rail links true shape
    linestring_df = pd.DataFrame(columns = ['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id'])

    # rail links are based on the gtfs shape, with nodes being the shapes that are identified as rail stops.
    for i in shape_flag_df.shape_id.unique():
        # get gtfs shape for shape id
        shape_route_df = shape_flag_df[shape_flag_df.shape_id == i]
        
        # get rail nodes based on the stop flags
        break_list = shape_route_df.index[shape_route_df.is_stop == 1].tolist()
        stop_id_list = shape_route_df[shape_route_df.is_stop == 1]['stop_id'].tolist()
        
        # use the gtfs shape between "stop" shapes to build the rail true shape
        for j in range(len(break_list)-1):
            lon_list = shape_flag_df.shape_pt_lon.iloc[break_list[j]:break_list[j+1]+1].tolist()
            lat_list = shape_flag_df.shape_pt_lat.iloc[break_list[j]:break_list[j+1]+1].tolist()
            linestring = LineString([Point(xy) for xy in zip(lon_list,lat_list)])
            linestring_df = linestring_df.append({'shape_id':i, 
                                                  'u':break_list[j], 
                                                  'v':break_list[j+1],
                                                  'u_stop_id':stop_id_list[j], 
                                                  'v_stop_id':stop_id_list[j+1],
                                                  'geometry' : linestring}, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # add rail travel time between stops
    stop_time_df = pd.merge(
                            stop_time_df, 
                            rail_trip_df[['trip_id', 'shape_id']], 
                            how = 'left', 
                            on = 'trip_id')
    
    unique_stop_time_df = stop_time_df[
                                        stop_time_df.shape_id.notnull()
                                    ].groupby(['trip_id', 'shape_id'])\
                                    .count().reset_index()\
                                    .drop_duplicates(subset = ['shape_id']).copy()
    
    stop_time_df = stop_time_df[stop_time_df.trip_id.isin(unique_stop_time_df.trip_id.tolist())].copy()

    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id' , 'departure_time']].rename(
                                 columns = {"stop_id" : "u_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'u_stop_id'])
    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id', 'arrival_time']].rename(
                                 columns = {"stop_id" : "v_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'v_stop_id'])
    
    # travel time in minutes
    linestring_df['rail_traveltime'] = (linestring_df['arrival_time'] - linestring_df['departure_time'])/60
    
    rail_node_df = shape_flag_df[shape_flag_df.is_stop == 1].rename_axis('node_id').reset_index()

    
    return linestring_df, rail_node_df

In [802]:
%%time
rail_path_link_gdf, rail_path_node_df = non_bus_link(feed, trip_df, stop_df)

generating rail links...
['133409' '133410' '133411' '133412' '133418' '133419' '133421' '133445'
 '133449' '133450' '133455' '133459' '133479' '133484' '133490' '133492'
 '133493' '133495' '133498' '133501' '133508' '133510' '133512' '134625'
 '134626' '134627' '139233' '139236' '139239' '139240']
Wall time: 1h 31min 14s


In [803]:
print(rail_path_node_df.columns)
print(rail_path_link_gdf.columns)

print(rail_path_node_df.shape)
print(rail_path_link_gdf.shape)

Index(['node_id', 'shape_id', 'shape_pt_lon', 'shape_pt_lat',
       'shape_pt_sequence', 'shape_dist_traveled', 'is_stop', 'stop_id'],
      dtype='object')
Index(['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id',
       'departure_time', 'arrival_time', 'rail_traveltime'],
      dtype='object')
(629, 8)
(599, 9)


In [804]:
rail_path_link_gdf

Unnamed: 0,shape_id,u,v,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime
0,133409,1,2,"LINESTRING (-122.411516 37.795709, -122.411619...",5364,5366,21240.0,21266.0,0.433333
1,133409,2,4,"LINESTRING (-122.411619 37.7962, -122.41186 37...",5366,5355,21266.0,21300.0,0.566667
2,133409,4,5,"LINESTRING (-122.411825 37.797208, -122.41202 ...",5355,5370,21300.0,21332.0,0.533333
3,133409,5,6,"LINESTRING (-122.41202 37.798199, -122.412203 ...",5370,5362,21332.0,21363.0,0.516667
4,133409,6,9,"LINESTRING (-122.412203 37.7991, -122.412409 3...",5362,5368,21363.0,21394.0,0.516667
5,133409,9,11,"LINESTRING (-122.412397 37.800037, -122.412535...",5368,5358,21394.0,21425.0,0.516667
6,133409,11,13,"LINESTRING (-122.41258 37.80096500000001, -122...",5358,5360,21425.0,21457.0,0.533333
7,133409,13,15,"LINESTRING (-122.412775 37.801929, -122.412912...",5360,4113,21457.0,21489.0,0.533333
8,133409,15,16,"LINESTRING (-122.413531 37.802687, -122.414586...",4113,4104,21489.0,21525.0,0.600000
9,133409,16,18,"LINESTRING (-122.414586 37.80341, -122.415009 ...",4104,6646,21525.0,21562.0,0.616667


In [805]:
rail_path_link_gdf[rail_path_link_gdf.shape_id == "139233"]

Unnamed: 0,shape_id,u,v,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime
567,139233,2301,2304,"LINESTRING (-122.417366 37.807248, -122.417267...",5184,3092,48540.0,48690.0,2.5
568,139233,2304,2306,"LINESTRING (-122.414125 37.807407, -122.412336...",3092,3095,48690.0,48840.0,2.5
569,139233,2306,2310,"LINESTRING (-122.41081 37.807841, -122.40924 3...",3095,4502,48840.0,49037.0,3.283333
570,139233,2310,2314,"LINESTRING (-122.40603 37.806629, -122.405721 ...",4502,4529,49037.0,49164.0,2.116667
571,139233,2314,2316,"LINESTRING (-122.403314 37.80502199999999, -12...",4529,4516,49164.0,49293.0,2.15
572,139233,2316,2318,"LINESTRING (-122.401029 37.80296, -122.400586 ...",4516,4518,49293.0,49428.0,2.25
573,139233,2318,2319,"LINESTRING (-122.39892 37.800605, -122.397434 ...",4518,4504,49428.0,49525.0,1.616667
574,139233,2319,2321,"LINESTRING (-122.397434 37.798901, -122.396519...",4504,4534,49525.0,49671.0,2.433333
575,139233,2321,2322,"LINESTRING (-122.395181 37.796355, -122.393449...",4534,4726,49671.0,49740.0,1.15


In [808]:
rail_path_node_df[rail_path_node_df.stop_id == "5184"]

Unnamed: 0,node_id,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled,is_stop,stop_id
593,2301,139233,-122.417366,37.807248,1,0,1,5184
617,2392,139239,-122.417366,37.807248,23,3043,1,5184
628,2416,139240,-122.417366,37.807248,24,2936,1,5184


In [809]:
def combine_bus_and_rail_shape(rail_path_link, rail_path_node, link, node, shape):
    
    """
    add only unique rail links and nodes to roadway standard
    
    parameter
    -----------
    complete rail link path
    complete rail node path
    all roadway links
    all roadway nodes
    all roadway shapes
    
    return
    -----------
    all roadway and rail links
    all roadway and rail nodes
    all roadway and rail shapes
    unique rail links
    unique rail nodes
    complete rail link path with updated link ID
    
    """
    
    print('indexing rail links and nodes...')
    
    node_gdf = node.copy()
    link_df = link.copy()
    shape_gdf = shape.copy()
    
    # add unique rail nodes to roadway node dataframe
    rail_path_node_gdf = rail_path_node.copy()
    
    unique_rail_node_df = rail_path_node_gdf.drop_duplicates(['shape_pt_lat', 'shape_pt_lon']).copy()
    
    # http://bayareametro.github.io/travel-model-two/input/#roadway-network
    TAP_start_number = 90001 
    
    unique_rail_node_df['model_node_id'] = range(TAP_start_number, TAP_start_number + len(unique_rail_node_df))
    
    rail_path_node_gdf = pd.merge(rail_path_node_gdf, 
                            unique_rail_node_df[['shape_pt_lat', 'shape_pt_lon', 'model_node_id']], 
                            how = 'left', 
                            on = ['shape_pt_lat', 'shape_pt_lon'])
    
    # get unique rail nodes
    unique_rail_node_df['geometry'] = [Point(xy) for xy in zip(unique_rail_node_df.shape_pt_lon, 
                                                               unique_rail_node_df.shape_pt_lat)]
    
    unique_rail_node_df = gpd.GeoDataFrame(unique_rail_node_df)
    unique_rail_node_df.crs = {'init' : 'epsg:4326'}
    unique_rail_node_df = unique_rail_node_df.to_crs(node_gdf.crs)
    
    unique_rail_node_df['transit_access'] = int(1)
    unique_rail_node_df["walk_access"] = int(1)
    
    # combine rail nodes and roadway nodes
    node_gdf["transit_access"] = int(0)
    
    rail_node_columns = ["model_node_id", "geometry", "transit_access", "walk_access"]
    
    roadway_and_rail_node_gdf = node_gdf.append(unique_rail_node_df[rail_node_columns],
                                                ignore_index = True, 
                                                sort = False)
    
    
    rail_node_osmid_dict = dict(zip(rail_path_node_gdf.node_id, rail_path_node_gdf.model_node_id))
    
    rail_path_link_df = rail_path_link.copy()
    
    rail_path_link_df['A'] = rail_path_link_df.u.map(rail_node_osmid_dict)
    rail_path_link_df['B'] = rail_path_link_df.v.map(rail_node_osmid_dict)
    
    rail_path_link_df.drop(["u", "v"], axis = 1, inplace = True)
    
    rail_path_link_df = gpd.GeoDataFrame(rail_path_link_df)
    rail_path_link_df.crs = {'init' : 'epsg:4326'}
    
    # get unique rail links
    unique_rail_link_gdf = rail_path_link_df.drop_duplicates(['A', 'B']).copy()
    
    # fake rail link shst geom id
    unique_rail_link_gdf['shstGeometryId'] = range(1, 1 + len(unique_rail_link_gdf))
    unique_rail_link_gdf['shstGeometryId'] = unique_rail_link_gdf.shstGeometryId.apply(lambda x:'rail'+str(x))
    unique_rail_link_gdf['id'] = unique_rail_link_gdf['shstGeometryId']

    unique_rail_link_gdf['transit_access'] = int(1)
    
    rail_path_link_df = pd.merge(rail_path_link_df,
                                unique_rail_link_gdf[["A", "B", "shstGeometryId"]],
                                how = "left",
                                on = ["A", "B"])
    
    rail_link_columns = ['A', 'B', "shstGeometryId", "rail_traveltime","transit_access", "id"]
    rail_shape_columns = ["id", "geometry"]
    
    # combine rail and roadway links
    roadway_and_rail_link_df = link_df.append(unique_rail_link_gdf[rail_link_columns], 
                                              ignore_index = True, 
                                              sort = False)
    
    # combine rail and roadway shapes
    roadway_and_rail_shape_gdf = shape_gdf.append(unique_rail_link_gdf[rail_shape_columns],
                                                 ignore_index = True,
                                                 sort = False)
    
    """rail_path_link_df = pd.merge(rail_path_link_df[['shape_id', 'geometry', 'u_stop_id', 'v_stop_id']],
                            unique_rail_shape_gdf.drop(['geometry', 'shape_id'], axis = 1),
                            how = 'left',
                            on = ['u_stop_id', 'v_stop_id'])"""
    
    rail_path_link_df = rail_path_link_df.to_crs({'init' : 'epsg:4326'})
        
    return roadway_and_rail_link_df, roadway_and_rail_node_gdf, roadway_and_rail_shape_gdf, \
                unique_rail_link_gdf, unique_rail_node_df, \
                rail_path_link_df

In [810]:
roadway_and_rail_link_df, roadway_and_rail_node_gdf, roadway_and_rail_shape_gdf, unique_rail_link_gdf, unique_rail_node_gdf, \
                                            rail_link_gdf = combine_bus_and_rail_shape(
                                                                                      rail_path_link_gdf, 
                                                                                      rail_path_node_df,
                                                                                      link_df, 
                                                                                      node_gdf,
                                                                                      shape_gdf)

indexing rail links and nodes...


In [811]:
roadway_and_rail_shape_gdf.id.nunique()

41768

In [812]:
roadway_and_rail_link_df.id.nunique()

41768

In [813]:
unique_rail_node_gdf.shape

(381, 12)

In [814]:
unique_rail_link_gdf.shape

(383, 12)

In [815]:
print(link_df.shape)
print(node_gdf.shape)
print(shape_gdf.shape)

(74352, 41)
(27700, 7)
(41385, 6)


In [816]:
print(roadway_and_rail_node_gdf.shape)
print(roadway_and_rail_link_df.shape)
print(roadway_and_rail_shape_gdf.shape)

(28081, 8)
(74735, 43)
(41768, 6)


In [817]:
rail_link_gdf

Unnamed: 0,shape_id,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime,A,B,shstGeometryId
0,133409,"LINESTRING (-122.411516 37.795709, -122.411619...",5364,5366,21240.0,21266.0,0.433333,90001,90002,rail1
1,133409,"LINESTRING (-122.411619 37.7962, -122.41186 37...",5366,5355,21266.0,21300.0,0.566667,90002,90003,rail2
2,133409,"LINESTRING (-122.411825 37.797208, -122.41202 ...",5355,5370,21300.0,21332.0,0.533333,90003,90004,rail3
3,133409,"LINESTRING (-122.41202 37.798199, -122.412203 ...",5370,5362,21332.0,21363.0,0.516667,90004,90005,rail4
4,133409,"LINESTRING (-122.412203 37.7991, -122.412409 3...",5362,5368,21363.0,21394.0,0.516667,90005,90006,rail5
5,133409,"LINESTRING (-122.412397 37.800037, -122.412535...",5368,5358,21394.0,21425.0,0.516667,90006,90007,rail6
6,133409,"LINESTRING (-122.41258 37.80096500000001, -122...",5358,5360,21425.0,21457.0,0.533333,90007,90008,rail7
7,133409,"LINESTRING (-122.412775 37.801929, -122.412912...",5360,4113,21457.0,21489.0,0.533333,90008,90009,rail8
8,133409,"LINESTRING (-122.413531 37.802687, -122.414586...",4113,4104,21489.0,21525.0,0.600000,90009,90010,rail9
9,133409,"LINESTRING (-122.414586 37.80341, -122.415009 ...",4104,6646,21525.0,21562.0,0.616667,90010,90011,rail10


In [818]:
def create_freq_table(trip_df):
    
    """
    create frequency table for network standard
    """
    
    print('creating frequency reference...')
    
    tod_numhours_dict = {"AM" : 4, "MD" : 5, "PM" :4, "NT" : 8, "EA" : 3}
    
    freq_df = trip_df[['trip_id', 'tod', 'direction_id', 'trip_num']].copy()
    freq_df['headway_secs'] = freq_df.tod.map(tod_numhours_dict)
    freq_df['headway_secs'] = freq_df.apply(lambda x: int(x.headway_secs * 60 * 60 / x.trip_num),
                                           axis = 1)
    
    freq_enum_list = {'start_time' : {'AM' : '06:00:00', 
                                      'MD' : '10:00:00',
                                      "PM" : "15:00:00",
                                      "NT" : "19:00:00",
                                      "EA" : "03:00:00"},
                      'end_time' : {'AM' : '10:00:00', 
                                    'MD' : '15:00:00',
                                    "PM" : "19:00:00",
                                    "NT" : "03:00:00",
                                    "EA" : "06:00:00"}}
    
    freq_df['start_time'] = freq_df.tod.map(freq_enum_list.get("start_time"))
    freq_df['end_time'] = freq_df.tod.map(freq_enum_list.get("end_time"))
    
    return freq_df

In [819]:
freq_df = create_freq_table(trip_df)

creating frequency reference...


In [820]:
freq_df

Unnamed: 0,trip_id,tod,direction_id,trip_num,headway_secs,start_time,end_time
0,6681228,MD,1,60,300,10:00:00,15:00:00
60,6681219,AM,1,57,252,06:00:00,10:00:00
96,6681318,NT,1,37,778,19:00:00,03:00:00
117,6681305,PM,1,62,232,15:00:00,19:00:00
153,6681390,EA,1,5,2160,03:00:00,06:00:00
158,6681070,AM,0,60,240,06:00:00,10:00:00
192,6681042,PM,0,64,225,15:00:00,19:00:00
229,6681038,MD,0,60,300,10:00:00,15:00:00
287,6681034,NT,0,28,1028,19:00:00,03:00:00
315,6681049,EA,0,8,1350,03:00:00,06:00:00


In [821]:
# create new shape with complete node list the route passes
def create_new_node_shape(node, bus_link, rail_link = pd.DataFrame(columns = ["u", "v", "shape_id", "A", "B"])):
    
    """
    create complete node lists each transit traverses to replace the gtfs shape.txt
    """
    bus_link_df = bus_link.copy()
    bus_trip_list_with_unique_shape_id = bus_link_df.drop_duplicates(subset = ["shape_id"]).trip_id.tolist()
    
    bus_link_df = bus_link_df[bus_link_df.trip_id.isin(bus_trip_list_with_unique_shape_id)].copy()
    
    shape_link_df = pd.concat([bus_link_df[["u", "v", 'shape_id', "A", "B"]]
                                , rail_link[['shape_id', "A", "B"]]],
                               sort = False,
                               ignore_index = True)
    
    shape_link_df.u = shape_link_df.u.fillna(0).astype(np.int64)
    shape_link_df.v = shape_link_df.v.fillna(0).astype(np.int64)

    shape_point_df = gpd.GeoDataFrame()
    
    for shape_id in shape_link_df.shape_id.unique():
        shape_df = shape_link_df[shape_link_df.shape_id == shape_id]
        point_df = pd.DataFrame(data = {"shape_id" : shape_id,
                                         "shape_osm_node_id" : shape_df.u.tolist() + [shape_df.v.iloc[-1]],
                                        "shape_model_node_id" : shape_df.A.tolist() + [shape_df.B.iloc[-1]],
                                       "shape_pt_sequence" : range(1, 1+len(shape_df)+1)})
   
        shape_point_df = pd.concat([shape_point_df,
                                   point_df],
                                  sort = False,
                                  ignore_index = True)

    shape_point_df = pd.merge(shape_point_df,
                             node[["osm_node_id", "shst_node_id", "model_node_id", "geometry"]],
                             how = "left",
                             left_on = "shape_model_node_id",
                             right_on = "model_node_id")
    
    shape_point_df.crs = {'init' : 'epsg:4326'}
    #shape_point_df = shape_point_df.to_crs(epsg = 4326)
    
    shape_point_df["shape_pt_lat"] = shape_point_df.geometry.map(lambda g:g.y)
    shape_point_df["shape_pt_lon"] = shape_point_df.geometry.map(lambda g:g.x)
    
    shape_point_df.rename(columns = {"shst_node_id":"shape_shst_node_id"}, inplace = True)
        
    return shape_point_df[["shape_id", "shape_pt_sequence", "shape_osm_node_id", "shape_shst_node_id", "shape_model_node_id"]]

In [822]:
shape_point_df = create_new_node_shape(roadway_and_rail_node_gdf, bus_link_df, rail_link_gdf)



In [823]:
shape_point_df

Unnamed: 0,shape_id,shape_pt_sequence,shape_osm_node_id,shape_shst_node_id,shape_model_node_id
0,133138,1,65295383,c4465447a28ea4b7436f3440c89a129b,1009243
1,133138,2,4017324918,38e3f9543a5709dfae1fbb280e7b9d2b,1018384
2,133138,3,65295385,9ad4cd94046659a2f73ee1a740c1d03f,1010935
3,133138,4,65295388,0e5a4de9899560e10905713a1c5c92f0,1023127
4,133138,5,5433307475,b8e38e2b7a4c70a2d605b753337bd950,1003221
5,133138,6,65295392,89f5537dcf4b60065eabb88471f64665,1017051
6,133138,7,65295399,69c3d928f35809544b269fe8485fa9ba,1014506
7,133138,8,5433307472,a8ca78cfad62b266c1fde8a1e6379ae1,1010390
8,133138,9,5433307469,e8a3e7cd98d2a1b6391971c863eb0547,1004226
9,133138,10,5433307468,5044cab2352139dcc1ceaec2efaf0611,1027039


In [824]:
shape_point_df.shape_id.nunique()

206

In [830]:
def write_out_transit_standard(trip, stop, shape_point, freq, feed, url, rail_node = False):
    
    shape_point_df = shape_point.copy()
    trip_df = trip.copy()
    
    trip_df = trip_df[trip_df.shape_id.isin(shape_point_df.shape_id.unique().tolist())]
    
    final_trip_list = trip_df.trip_id.unique().tolist()
    
    freq_df = freq.copy()
    freq_df = freq_df[freq_df.trip_id.isin(final_trip_list)]
    
    stop_df = stop.copy()
    
    if type(rail_node) != bool:
        rail_node_df = rail_node.copy()
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist() 
                                               else x.model_node_id,
                                                axis = 1)
        stop_df['osm_node_id'] = stop_df.apply(lambda x: 0 
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.shst_node_id,
                                                axis = 1)
        stop_df['shst_node_id'] = stop_df.apply(lambda x: '' 
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.shst_node_id,
                                                axis = 1)
    

    stop_times_df = feed.stop_times.copy()
    stop_times_df = stop_times_df[stop_times_df.trip_id.isin(final_trip_list)]
    
    # update time to relative time for frequency based transit system
    stop_times_df['first_arrival'] = stop_times_df.groupby(['trip_id'])['arrival_time'].transform(min)
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'] - stop_times_df['first_arrival']
    stop_times_df['departure_time'] = stop_times_df['departure_time'] - stop_times_df['first_arrival']
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'].apply(lambda x : time.strftime('%H:%M:%S', 
                                                                                                 time.gmtime(x)))
    stop_times_df['departure_time'] = stop_times_df['departure_time'].apply(lambda x : time.strftime('%H:%M:%S', 
                                                                                                     time.gmtime(x)))
    stop_times_df.drop(['first_arrival'], axis = 1, inplace = True)
    
    route_df = feed.routes.copy()
    route_df = route_df[route_df.route_id.isin(trip_df.route_id.tolist())]
    
    route_df.to_csv(output_folder + "routes.txt", 
                    index = False, 
                    sep = ',')
    shape_point_df.to_csv(output_folder + "shapes.txt", 
                          index = False, 
                          sep = ',')
    trip_df[feed.trips.columns.values].to_csv(output_folder + "trips.txt", 
                                              index = False, 
                                              sep = ',')
    freq_df[['trip_id', 'headway_secs', 'start_time', 'end_time']].to_csv(output_folder + "frequencies.txt", 
                                                index = False, 
                                                sep = ',')
    stop_df.to_csv(output_folder + "stops.txt", 
                   index = False, 
                   sep = ',')
    stop_times_df.to_csv(output_folder + "stop_times.txt", 
                         index = False, 
                         sep = ',')


In [831]:
write_out_transit_standard(trip_df, 
                           stop_df, 
                           shape_point_df, 
                           freq_df, 
                           feed,
                           muni_url, 
                           unique_rail_node_gdf)

In [832]:
def create_transit_access_link(all_link, all_node, all_shape):
    
    """
    create rail walk access/egress links
    """
    
    tran_node_df = all_node[all_node.transit_access == 1].copy()
    walk_node_df = all_node[(all_node.walk_access == 1) & (all_node.transit_access == 0)].copy().reset_index(drop = True)
    
    walk_node_df = walk_node_df.to_crs({'init' : 'epsg:26915'})
    walk_node_df['X'] = walk_node_df.geometry.map(lambda g:g.x)
    walk_node_df['Y'] = walk_node_df.geometry.map(lambda g:g.y)
    inventory_node_ref = walk_node_df[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    tran_node_df = tran_node_df.to_crs({'init' : 'epsg:26915'})
    tran_node_df['X'] = tran_node_df.geometry.map(lambda g:g.x)
    tran_node_df['Y'] = tran_node_df.geometry.map(lambda g:g.y)
    
    for i in range(len(tran_node_df)):
        point = tran_node_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_node_gdf = gpd.GeoDataFrame(walk_node_df.iloc[ii]).transpose().reset_index(drop = True)
        add_node_gdf['tran_node'] = tran_node_df.iloc[i].model_node_id
        add_node_gdf['geometry_tran'] = tran_node_df.iloc[i].geometry
        
        if i == 0:
            rail_access_gdf = add_node_gdf.copy()
        else:
            rail_access_gdf = rail_access_gdf.append(add_node_gdf, ignore_index=True, sort=False)
    
    rail_access_gdf.rename(columns = {'geometry' : "geometry_walk"}, inplace = True)

    
    rail_access_gdf['geometry'] = [LineString(xy) for xy in zip(rail_access_gdf['geometry_walk'], 
                                                                rail_access_gdf['geometry_tran'])]
    
    # fake rail link shst geom id
    rail_access_gdf['shstGeometryId'] = range(1, 
                                     1 + len(rail_access_gdf))
    
    rail_access_gdf['shstGeometryId'] = rail_access_gdf.shstGeometryId.apply(lambda x:'walktorail'+str(x))
    rail_access_gdf['id'] = rail_access_gdf['shstGeometryId']
    
    rail_access_gdf["fromIntersectionId"] = rail_access_gdf.shst_node_id

    rail_access_gdf_copy = rail_access_gdf.copy()
    rail_access_gdf.rename(columns = {'model_node_id' : 'A', 'tran_node' : 'B'}, inplace = True)
    
    rail_access_gdf_copy.rename(columns = {'tran_node' : 'A', 'model_node_id' : 'B'}, inplace = True)
    
    rail_access_gdf = pd.concat(
                            [rail_access_gdf[['A', 'B', 'geometry', 'shstGeometryId', "id", "fromIntersectionId"]],
                            rail_access_gdf_copy[['A', 'B', 'geometry','shstGeometryId', "id", "fromIntersectionId"]]],
                               ignore_index = True,
                               sort = False)
    
    rail_access_gdf = gpd.GeoDataFrame(rail_access_gdf)
    rail_access_gdf.crs = {'init' : 'epsg:26915'}
    rail_access_gdf = rail_access_gdf.to_crs(all_node.crs)
    
    rail_access_gdf['walk_access'] = 1
    
    rail_access_link_columns = ["A", "B", "shstGeometryId", "walk_access", "id"]
    rail_access_shape_columns = ["id", "fromIntersectionId", "geometry"]
    
    all_link_df = all_link.copy()
    all_shape_gdf = all_shape.copy()
    
        
    all_shape_gdf = pd.concat([
                                all_shape_gdf,
                                rail_access_gdf[rail_access_shape_columns].drop_duplicates(
                                                                        subset = ["id"])
                              ],
                             sort = False,
                             ignore_index= True)

    
    all_link_df = pd.concat([all_link_df, 
                             rail_access_gdf[rail_access_link_columns]], 
                            ignore_index = True, 
                            sort = False)
    
    all_link_gdf = pd.merge(all_link_df,
                           all_shape_gdf,
                           how = "left",
                           left_on = "shstGeometryId",
                           right_on = "id")
    
    geom_length = gpd.GeoDataFrame(all_link_gdf[['geometry']])
    geom_length.crs = all_node.crs
    geom_length = geom_length.to_crs(epsg = 26915)
    geom_length["length"] = geom_length.length

    all_link_df["length"] = geom_length["length"]

    return all_link_df, all_shape_gdf

all_link_df, all_shape_gdf = create_transit_access_link(roadway_and_rail_link_df, 
                                                        roadway_and_rail_node_gdf,
                                                        roadway_and_rail_shape_gdf)

In [833]:
all_shape_gdf.id.nunique()

42149

In [834]:
all_link_df.id.nunique()

42149

In [835]:
# number geometry increse should be the number of transit nodes: 381
print(roadway_and_rail_link_df.shstGeometryId.nunique())
print(roadway_and_rail_shape_gdf.id.nunique())
print(roadway_and_rail_shape_gdf.shape)
print(all_shape_gdf.id.nunique())
print(all_shape_gdf.shape)
print(all_link_df.shstGeometryId.nunique())

41768
41768
(41768, 6)
42149
(42149, 6)
42149


In [836]:
# number of link increase should be 2 times of transit nodes : 762

print(roadway_and_rail_link_df.shape)
print(all_link_df.shape)

(74735, 43)
(75497, 43)


In [837]:
# true shapes for line record

from shapely import ops, geometry

def get_true_line_shape(trip_df, bus_link, roadway_and_rail_shape,
                        rail_link = pd.DataFrame(columns = ['LINK_ID','shape_id', 'u', 'v'])):
    
    """
    write out true shape for each trip
    """
    
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['shape_id', 'trip_id', "shstGeometryId"]], 
                                  rail_link_df[['shape_id', 'trip_id', "shstGeometryId"]]], 
                                 sort = False, ignore_index = True)
    
    transit_link_gdf = pd.merge(transit_link_gdf,
                                roadway_and_rail_shape[['id', 'geometry']],
                                how = 'left',
                                left_on = 'shstGeometryId',
                               right_on = "id")
    
    true_line_shape_df = transit_link_gdf.groupby(['trip_id', 'shape_id'])['geometry'].agg(
                                                                lambda x: 
                                                                ops.linemerge(geometry.MultiLineString(x.tolist())))\
                                        .reset_index()
    
    """true_line_shape_df = pd.merge(true_line_shape_df, 
                                  cube,
                                 how = 'left',
                                 on = ['shape_id', 'trip_id'])"""
    
    true_line_shape_gdf = gpd.GeoDataFrame(true_line_shape_df, 
                                           crs = roadway_and_rail_shape.crs, 
                                           geometry = 'geometry')
    
    return true_line_shape_gdf

In [838]:
true_line_shape_gdf = get_true_line_shape(trip_df, 
                                                bus_link_df, 
                                                roadway_and_rail_shape_gdf,
                                                rail_link_gdf)

In [839]:
true_line_shape_gdf.to_file(data_interim_folder + "transit_route.geojson",
                           driver = "GeoJSON")

In [840]:
true_line_shape_gdf.columns

Index(['trip_id', 'shape_id', 'geometry'], dtype='object')

In [841]:
def link_df_to_geojson(df, properties):
    """
    Author: Geoff Boeing:
    https://geoffboeing.com/2015/10/exporting-python-data-geojson/
    """
    geojson = {"type":"FeatureCollection", "features":[]}
    for _, row in df.iterrows():
        feature = {"type":"Feature",
                   "properties":{},
                   "geometry":{"type":"LineString",
                               "coordinates":[]}}
        feature["geometry"]["coordinates"] = [[x, y] for (x,y) in list(row["geometry"].coords)]
        for prop in properties:
            feature["properties"][prop] = row[prop]
        geojson["features"].append(feature)
    return geojson


def point_df_to_geojson(df: pd.DataFrame, properties: list):
    """
    Author: Geoff Boeing:
    https://geoffboeing.com/2015/10/exporting-python-data-geojson/
    """
    
    geojson = {"type": "FeatureCollection", "features": []}
    for _, row in df.iterrows():
        feature = {
            "type": "Feature",
            "properties": {},
            "geometry": {"type": "Point", "coordinates": []},
        }
        feature["geometry"]["coordinates"] = [row["geometry"].x, row["geometry"].y]
        for prop in properties:
            feature["properties"][prop] = row[prop]
        geojson["features"].append(feature)
    return geojson

def fill_na(df_na):
    """
    fill str NaN with ""
    fill numeric NaN with 0
    """
    df = df_na.copy()
    num_col = list(df.select_dtypes([np.number]).columns)
    print("numeric columns: ", num_col)
    object_col = list(df.select_dtypes(['object']).columns)
    print("str columns: ", object_col)
    
    for x in list(df.columns):
        if x in num_col:
            df[x].fillna(0, inplace = True)
        elif x in object_col:
            df[x].fillna("", inplace = True)
    
    return df

In [842]:
all_shape_gdf = fill_na(all_shape_gdf)

numeric columns:  []
str columns:  ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId', 'geometry']


In [843]:
int_col = ["bike_access", "walk_access", "drive_access", "transit_access", "LANES", "u", "v", "A", "B"]
for c in int_col:
    all_link_df[c] = all_link_df[c].fillna(0).astype(np.int64)
    
all_link_df = fill_na(all_link_df)

numeric columns:  ['A', 'B', 'LANES', 'bike_access', 'drive_access', 'forward', 'length', 'tomtom_f_jnctid', 'tomtom_id', 'tomtom_t_jnctid', 'u', 'v', 'walk_access', 'rail_traveltime', 'transit_access']
str columns:  ['access', 'area', 'bridge', 'est_width', 'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse', 'lanes', 'link', 'maxspeed', 'name', 'nodeIds', 'oneWay', 'oneway', 'ref', 'roadClass', 'roadway', 'roundabout', 'service', 'shstGeometryId', 'shstReferenceId', 'toIntersectionId', 'tunnel', 'wayId', 'width']


In [844]:
int_col = ["bike_access", "walk_access", "drive_access", "transit_access", "osm_node_id"]
for c in int_col:
    roadway_and_rail_node_gdf[c] = roadway_and_rail_node_gdf[c].fillna(0).astype(np.int64)
    
roadway_and_rail_node_gdf = fill_na(roadway_and_rail_node_gdf)

numeric columns:  ['osm_node_id', 'drive_access', 'walk_access', 'bike_access', 'model_node_id', 'transit_access']
str columns:  ['shst_node_id', 'geometry']


In [845]:
pd.crosstab(all_link_df.transit_access, all_link_df.walk_access)
pd.crosstab(roadway_and_rail_node_gdf.transit_access, roadway_and_rail_node_gdf.walk_access)

walk_access,0,1
transit_access,Unnamed: 1_level_1,Unnamed: 2_level_1
0,150,27550
1,0,381


In [846]:
all_link_df.info()
all_shape_gdf.info()
roadway_and_rail_node_gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75497 entries, 0 to 75496
Data columns (total 43 columns):
A                     75497 non-null int64
B                     75497 non-null int64
LANES                 75497 non-null int64
access                75497 non-null object
area                  75497 non-null object
bike_access           75497 non-null int64
bridge                75497 non-null object
drive_access          75497 non-null int64
est_width             75497 non-null object
forward               75497 non-null float64
fromIntersectionId    75497 non-null object
highway               75497 non-null object
id                    75497 non-null object
junction              75497 non-null object
key                   75497 non-null object
landuse               75497 non-null object
lanes                 75497 non-null object
length                75497 non-null float64
link                  75497 non-null object
maxspeed              75497 non-null object
name          

In [847]:
%%time

print("-------write out link shape geojson---------")

shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(all_shape_gdf, shape_prop)

with open(output_folder + "sf_shape.geojson", "w") as f:
    json.dump(shape_geojson, f)

-------write out link shape geojson---------
Wall time: 10.1 s


In [848]:
%%time

# write out link variable json
# link unique handle "shstReferenceId" + "shstGeometryId"

print("-------write out link json---------")

link_prop = all_link_df.drop(["forward", "roadClass", "oneway"], axis = 1).columns.tolist()

out = all_link_df[link_prop].to_json(orient = "records")

with open(output_folder + "sf_link.json", 'w') as f:
    f.write(out)

-------write out link json---------
Wall time: 2.34 s


In [849]:
%%time

print("-------write out node geojson---------")

node_prop = roadway_and_rail_node_gdf.drop("geometry", axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(roadway_and_rail_node_gdf, node_prop)

with open(output_folder + "sf_node.geojson", "w") as f:
    json.dump(node_geojson, f)

-------write out node geojson---------
Wall time: 6.29 s


# write out to CUBE .lin

In [850]:
feed.routes.route_type.value_counts()

3    70
0     7
5     3
Name: route_type, dtype: int64

In [851]:
trip_df.columns

Index(['route_id', 'service_id', 'trip_id', 'trip_headsign', 'direction_id',
       'block_id', 'shape_id', 'arrival_time', 'departure_time', 'stop_id',
       'stop_sequence', 'stop_headsign', 'pickup_type', 'drop_off_type',
       'shape_dist_traveled', 'arrival_h', 'arrival_m', 'departure_h',
       'departure_m', 'tod', 'trip_num'],
      dtype='object')

In [852]:
feed.routes

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,10867,SFMTA,1,CALIFORNIA,,3,,,
1,1033,SFMTA,1AX,CALIFORNIA A EXPRESS,,3,,,
2,1034,SFMTA,1BX,CALIFORNIA B EXPRESS,,3,,,
3,1031,SFMTA,31AX,BALBOA A EXPRESS,,3,,,
4,1032,SFMTA,31BX,BALBOA B EXPRESS,,3,,,
5,1038,SFMTA,38AX,GEARY A EXPRESS,,3,,,
6,1039,SFMTA,38BX,GEARY B EXPRESS,,3,,,
7,1002,SFMTA,2,Clement,,3,,,
8,1003,SFMTA,3,Jackson,,3,,,
9,11438,SFMTA,5,FULTON,,3,,,


In [853]:
def prepare_df_for_cube(feed, trip, bus_link, freq,
                        rail_link = pd.DataFrame(columns = ['shape_id'])):
    
    
    transit_link_gdf = pd.concat([bus_link[['shape_id']], rail_link[['shape_id']]], 
                                 sort = False, ignore_index = True)
    trip_df = trip.copy()
    
    trip_df = trip_df[trip_df.shape_id.isin(transit_link_gdf.shape_id.unique().tolist())]
    
    trip_df = pd.merge(trip_df, feed.routes, how = 'left', on = 'route_id')
    trip_df = pd.merge(trip_df, freq[['trip_id','headway_secs']], how = 'left', on = 'trip_id')
    
    #trip_df['tod'] = np.where(trip_df.tod == 'peak', 'pk', 'op')
    
    trip_df['NAME'] = trip_df.apply(lambda x: x.route_id + '_' + x.route_short_name + \
                                    '_' + x.tod + str(x.direction_id), 
                                    axis = 1)
    trip_df['LONGNAME'] = trip_df['route_long_name']
    trip_df['HEADWAY'] = (trip_df['headway_secs']/60).astype(int)
    
    def mode_gtfs_to_muni(x):
        if x.route_type == 0:
            return 15
        elif x.route_type == 5:
            return 14
        elif x.route_short_name[-1] in ["X", "R"]:
            return 12
        else:
            return 11
    
    trip_df['MODE'] = trip_df.apply(lambda x : mode_gtfs_to_muni(x),
                                   axis = 1)
    
    trip_df['MODE'].fillna(11, inplace = True)
    trip_df['MODE'] = trip_df['MODE'].astype(int)
    
    trip_df['ONEWAY'] = 'T'
    
    return trip_df

In [854]:
cube_trip_df = prepare_df_for_cube(feed, 
                                trip_df, 
                                bus_link_df, 
                                freq_df, 
                                rail_link_gdf)

In [855]:
cube_trip_df

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id,arrival_time,departure_time,stop_id,...,route_type,route_url,route_color,route_text_color,headway_secs,NAME,LONGNAME,HEADWAY,MODE,ONEWAY
0,10867,1,6681228,Downtown,1,113,133142,38400.0,38400.0,4277,...,3,,,,300,10867_1_MD1,CALIFORNIA,5,11,T
1,10867,1,6681219,Downtown,1,119,133142,35700.0,35700.0,4277,...,3,,,,252,10867_1_AM1,CALIFORNIA,4,11,T
2,10867,1,6681318,Downtown,1,116,133142,69780.0,69780.0,4277,...,3,,,,778,10867_1_NT1,CALIFORNIA,12,11,T
3,10867,1,6681305,Downtown,1,101,133142,64320.0,64320.0,4277,...,3,,,,232,10867_1_PM1,CALIFORNIA,3,11,T
4,10867,1,6681390,Downtown,1,105,133142,21180.0,21180.0,4277,...,3,,,,2160,10867_1_EA1,CALIFORNIA,36,11,T
5,10867,1,6681070,the Richmond District,0,117,133139,31740.0,31740.0,4015,...,3,,,,240,10867_1_AM0,CALIFORNIA,4,11,T
6,10867,1,6681042,the Richmond District,0,103,133139,57720.0,57720.0,4015,...,3,,,,225,10867_1_PM0,CALIFORNIA,3,11,T
7,10867,1,6681038,the Richmond District,0,107,133139,50400.0,50400.0,4015,...,3,,,,300,10867_1_MD0,CALIFORNIA,5,11,T
8,10867,1,6681034,the Richmond District,0,117,133139,90000.0,90000.0,4015,...,3,,,,1028,10867_1_NT0,CALIFORNIA,17,11,T
9,10867,1,6681049,the Richmond District,0,106,133138,20580.0,20580.0,3892,...,3,,,,1350,10867_1_EA0,CALIFORNIA,22,11,T


In [857]:
unique_rail_node_gdf

Unnamed: 0,node_id,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled,is_stop,stop_id,model_node_id,geometry,transit_access,walk_access
0,1,133409,-122.411516,37.795709,2,49,1,5364,90001,POINT (-122.411516 37.795709),1,1
1,2,133409,-122.411619,37.796200,3,99,1,5366,90002,POINT (-122.411619 37.7962),1,1
2,4,133409,-122.411825,37.797208,5,209,1,5355,90003,POINT (-122.411825 37.797208),1,1
3,5,133409,-122.412020,37.798199,6,310,1,5370,90004,POINT (-122.41202 37.798199),1,1
4,6,133409,-122.412203,37.799100,7,415,1,5362,90005,POINT (-122.412203 37.7991),1,1
5,9,133409,-122.412397,37.800037,10,520,1,5368,90006,POINT (-122.412397 37.800037),1,1
6,11,133409,-122.412580,37.800965,12,624,1,5358,90007,POINT (-122.41258 37.80096500000001),1,1
7,13,133409,-122.412775,37.801929,14,728,1,5360,90008,POINT (-122.412775 37.801929),1,1
8,15,133409,-122.413531,37.802687,16,856,1,4113,90009,POINT (-122.413531 37.802687),1,1
9,16,133409,-122.414586,37.803410,17,991,1,4104,90010,POINT (-122.414586 37.80341),1,1


In [858]:
def node_list(x, trip_df, stop_df, bus_link, feed, node_gdf, link_gdf, 
              rail_link = pd.DataFrame(columns = ['shape_id', 'u', 'v']), rail_node_df = False):
    print(x.trip_id)
    
    if type(rail_node_df) != bool:
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist()
                                               else x.model_node_id,
                                    axis = 1)
        
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['u', 'v', 'shape_id', 'trip_id' , "A", "B"]], 
                                  rail_link_df[['shape_id', 'trip_id', "A", "B"]]], 
                                 sort = False, 
                                 ignore_index = True)
    
    """transit_link_gdf = pd.merge(transit_link_gdf,
                               link_gdf[['LINK_ID',"u","v"]],
                               how = "left",
                               on = ["u","v"])"""
   
    stop_times_df = feed.stop_times.copy()
    stop_id_list = stop_times_df[stop_times_df.trip_id == x.trip_id]['stop_id'].tolist()
    stop_node_list = stop_df[stop_df['stop_id'].isin(stop_id_list)]['model_node_id'].tolist()
    
    node_list = transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['A'].tolist() + \
                    [transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['B'].iloc[-1]]
    
    #osmid_N_dict = dict(zip(node_gdf.OSMID, node_gdf.N))
    
    #node_list = list(map(osmid_N_dict.get, node_list))
    
    s = '\nLINE NAME=\"%s\",' % (x.NAME,)
    
    #line attribtes
    s += '\n LONGNAME=\"%s",' % (x.LONGNAME,)
    if x.tod == 'AM':
        s += '\n FREQ[1]=%s,' % (x.HEADWAY,)
    elif x.tod == "MD":
        s += '\n FREQ[2]=%s,' % (x.HEADWAY,)
    elif x.tod == "PM":
        s += '\n FREQ[3]=%s,' % (x.HEADWAY,)
    elif x.tod == "NT":
        s += '\n FREQ[4]=%s,' % (x.HEADWAY,)
    elif x.tod == "EA":
        s += '\n FREQ[5]=%s,' % (x.HEADWAY,)
    s += '\n MODE=%s,' % (x.MODE,)
    s += '\n ONEWAY=%s,' % (x.ONEWAY,)
    s += '\nNODES='
    
    circular = 0
    #node list
    trip_stop_list = []
    for nodeIdx in range(len(node_list)):
        # added condition to make sure stops only get stopped once
        if (node_list[nodeIdx] in stop_node_list) & (node_list[nodeIdx] not in trip_stop_list):
            s += '\n %s' % (node_list[nodeIdx])
            trip_stop_list += [node_list[nodeIdx]]
            if nodeIdx < (len(node_list)-1):
                s += ','
        else:
            s += '\n -%s' % (node_list[nodeIdx])
            if nodeIdx < (len(node_list)-1):
                s += ','
            if (node_list[nodeIdx] in trip_stop_list):
                circular = 1
    
    #if circular == 1:
    #    s += ','
    #    s += '\n CIRCULAR=T'                
            
    lines.append(s)

In [859]:
lines = [';;<<Trnbuild>>;;']

cube_trip_df.apply(lambda x: node_list(x, 
                                    trip_df,
                                    stop_df, 
                                    bus_link_df,
                                    feed,
                                    roadway_and_rail_node_gdf,
                                    roadway_and_rail_link_df,
                                    rail_link_gdf,
                                    unique_rail_node_gdf), 
                axis=1)

with open(output_folder + "muni.LIN", 'w') as f:
    f.write("\n".join(map(str, lines)))

6681228
6681219
6681318
6681305
6681390
6681070
6681042
6681038
6681034
6681049
6682885
6682961
6768113
6767752
6682823
6768226
6767277
6682952
6682951
6682850
6782020
6777414
6682866
6682864
6750403
6750367
6750360
6750337
6750336
6750475
6750483
6750661
6750633
6750613
6750520
6750519
6750587
6750582
6750546
6697427
6697425
6697413
6697398
6697387
6697454
6697443
6697495
6697481
6697469
6697866
6697858
6697844
6697841
6697892
6697786
6697771
6697839
6697823
6697807
7097415
6791152
6791146
6791142
6791088
7097423
6807612
6791349
7097433
6791336
6700945
6700972
6700971
6700958
6700998
6700933
6701017
6701026
6700984
6701011
7097163
6781891
6781876
6781967
6781933
7097146
6781748
6781746
6781830
6781754
6702754
7072734
6702699
6702735
6702719
6703482
6703480
6703466
6703446
6703435
6703487
6703555
6703553
6703539
6703518
6771948
6704443
6704417
6704413
6704335
6704559
6704537
6704521
6704513
6704501
6771824
6771832
6771815
6771733
6771828
6771751
6705662
6705658
6705657
6705636
6705626


In [860]:
stop_df[stop_df.stop_id == "5184"]

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,osm_node_id,shst_node_id,model_node_id
1711,5184,Jones St & Beach St,,37.807248,-122.417366,,,65297249,86e7a3a45b9e6e7a3fa01d63948d4260,90361


In [862]:
unique_rail_node_gdf[unique_rail_node_gdf.stop_id == "5184"]

Unnamed: 0,node_id,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled,is_stop,stop_id,model_node_id,geometry,transit_access,walk_access
593,2301,139233,-122.417366,37.807248,1,0,1,5184,90361,POINT (-122.417366 37.807248),1,1
