# This notebook goes through building transit network from gtfs to network standard

1. extract represetative trips
2. snap stops to roadway nodes
3. route bus on roadway via osmnx routing
4. route bus on roadway via shst routing
5. build non-bus/rail links and nodes
6. complete network node list that each transit path traverses
7. frequence based stop time
8. write out to transit network standard
9. write out quick QA/QC transit route true shape
10. write out network standard with rail nodes and links
11. write out travel model transit network

In [1]:
import partridge as ptg
import peartree as pt
#%matplotlib inline
import requests
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString
import networkx as nx
from shapely import wkt
from scipy.spatial import cKDTree
import osmnx as ox
from dbfread import DBF
from osgeo import ogr
import glob
import time
import json
import os

In [2]:
from methods import link_df_to_geojson
from methods import point_df_to_geojson
from methods import fill_na

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
data_interim_dir = "../../data/interim/"

In [5]:
output_folder = "../../data/processed/version_12/"

#  Read Network

In [6]:
%%time
link_file = data_interim_dir + "step5_tidy_roadway/link.feather"
link_df = pd.read_feather(link_file)

node_file = data_interim_dir + "step5_tidy_roadway/node.geojson"
node_gdf = gpd.read_file(node_file)

shape_file = data_interim_dir + "step5_tidy_roadway/shape.geojson"
shape_gdf = gpd.read_file(shape_file)

Wall time: 2min 38s


In [6]:
print('roadway network has {} links'.format(link_df.shape[0]))

(1632702, 36)

In [7]:
# network type correction at Transbay temporary terminal:

display(link_df[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"])][
        ["roadway", "drive_access", "walk_access", "bike_access"]])

Unnamed: 0,roadway,drive_access,walk_access,bike_access
524671,cycleway,0,1,1
863713,service,0,1,1


In [8]:
# set roadway type as 'service' and drive_access = 1
link_df.loc[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"]), 
            "roadway"] = "service"
link_df.loc[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"]), 
            "drive_access"] = 1

# check
link_df[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"])][
    ["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
524671,service,1,1,1
863713,service,1,1,1


In [9]:
# related nodes
display(node_gdf[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129])])

# set drive_access = 1
node_gdf.loc[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129]), "drive_access"] = 1

# check
node_gdf[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129])]

Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry
130712,5372055804,3291f7c2f15101c22abf554ce230343e,San Francisco,0,1,1,1006138,1000000,POINT (-122.39210 37.78934)
215683,890045140,4c0619714744bed10b7de965adc7048d,San Francisco,1,1,1,1010031,1000000,POINT (-122.39263 37.78966)
244340,890045129,490be8656a6428c6fc871a1f0e6432eb,San Francisco,1,1,1,1011380,1000000,POINT (-122.39203 37.78925)


Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry
130712,5372055804,3291f7c2f15101c22abf554ce230343e,San Francisco,1,1,1,1006138,1000000,POINT (-122.39210 37.78934)
215683,890045140,4c0619714744bed10b7de965adc7048d,San Francisco,1,1,1,1010031,1000000,POINT (-122.39263 37.78966)
244340,890045129,490be8656a6428c6fc871a1f0e6432eb,San Francisco,1,1,1,1011380,1000000,POINT (-122.39203 37.78925)


In [15]:
# build network routing file for osmnx routing

def ox_graph(nodes_df, links_df):
    """
        create an osmnx-flavored network graph
        osmnx doesn't like values that are arrays, so remove the variables
        that have arrays.  osmnx also requires that certain variables
        be filled in, so do that too.
        Parameters
        ----------
        nodes_df : GeoDataFrame
        link_df : GeoDataFrame
        Returns
        -------
        networkx multidigraph
    """
    try:
        graph_nodes = nodes_df.drop(
                ["inboundReferenceId", "outboundReferenceId"], axis=1
            )
    except:
        graph_nodes = nodes_df.copy()

    graph_nodes.gdf_name = "network_nodes"
    graph_nodes['id'] = graph_nodes['shst_node_id']

    graph_links = links_df.copy()
    graph_links['id'] = graph_links['shstReferenceId']
    graph_links['key'] = graph_links['shstReferenceId']

    # G = ox.gdfs_to_graph(graph_nodes, graph_links)
    # "ox.gdfs_to_graph()" was replaced by "ox.graph_from_gdfs()" in osmnx module v0.13.0
    G = ox.graph_from_gdfs(graph_nodes, graph_links)

    return G


drive_node_gdf = node_gdf[node_gdf.drive_access == 1].copy()
drive_link_df = link_df[link_df.drive_access == 1].copy()

G_drive = ox_graph(drive_node_gdf,
                   drive_link_df)

# Consolidate all gtfs into one

In [17]:
gtfs_agencies_list = os.listdir("../../data/external/gtfs/2015")
# print(gtfs_agencies_list)
gtfs_agencies_list.remove("Petaluma_2016_5_22")
gtfs_agencies_list.remove("WestCAT_2016_5_26")
gtfs_agencies_list.remove("GGFerries_2017_3_18")
gtfs_agencies_list.remove("commuteDOTorg_GTFSImportExport_20160127_final_mj")

gtfs_agencies_list = gtfs_agencies_list + ["commuteDOTorg_GTFSImportExport_20160127_final_mj"]

In [18]:
gtfs_agencies_list

['ACE_2017_3_20',
 'ACTransit_2015_8_14',
 'BART_2015_8_3',
 'Blue&Gold_gtfs_10_4_2017',
 'Caltrain_2015_5_13',
 'Capitol_2017_3_20',
 'CCTA_2015_8_11',
 'Emeryville_2016_10_26',
 'Fairfield_2015_10_14',
 'GGTransit_2015_9_3',
 'Marguerite_2016_10_10',
 'MarinTransit_2015_8_31',
 'MVGo_2016_10_26',
 'petalumatransit-petaluma-ca-us__11_12_15',
 'RioVista_2015_8_20',
 'SamTrans_2015_8_20',
 'SantaRosa_google_transit_08_28_15',
 'SFMTA_2015_8_11',
 'SF_Bay_Ferry2016_07_01',
 'Soltrans_2016_5_20',
 'SonomaCounty_2015_8_18',
 'TriDelta-GTFS-2018-05-24_21-43-17',
 'Union_City_Transit_Aug-01-2015 to Jun-30-2017',
 'vacavillecitycoach-2020-ca-us',
 'Vine_GTFS_PLUS_2015',
 'VTA_2015_8_27',
 'westcat-ca-us_9_17_2015',
 'Wheels_2016_7_13',
 'commuteDOTorg_GTFSImportExport_20160127_final_mj']

In [19]:
all_routes_df = pd.DataFrame()
all_trips_df = pd.DataFrame()
all_stops_df = pd.DataFrame()
all_shapes_df = pd.DataFrame()
all_stop_times_df = pd.DataFrame()
all_agency_df = pd.DataFrame()
all_fare_attributes_df = pd.DataFrame()
all_fare_rules_df = pd.DataFrame()

def get_representative_feed_from_gtfs(work_dir, in_url = "", fetch = False):
    
    print('getting representative feed...')
    
    if fetch == True:
        #read and save zip from url
        resp = urlopen(in_url)
        zipfile = ZipFile(BytesIO(resp.read()))
    
    if fetch == True:
        zipfile.extractall(work_dir + "muni")
    
    file_loc = work_dir
    
    # get feed for the busiest day
    feed = pt.get_representative_feed(file_loc)
    
    return feed

for name in gtfs_agencies_list:
    
    # exclude weekend only services
    if "calendar_orig.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        calendar_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt")
        
    elif "calendar.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        calendar_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt")
        calendar_df.to_csv("../../data/external/gtfs/2015/" + name + "/calendar_orig.txt",
                                                 index = False,
                                                 sep = ",")
    
        calendar_df["weekdays"] = calendar_df.apply(lambda x: x.monday + x.tuesday + x.wednesday + x.thursday + x.friday,
                                               axis = 1)
        calendar_df = calendar_df[calendar_df.weekdays > 0]
    
        calendar_df.drop("weekdays", axis = 1).to_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt",
                                                 index = False,
                                                 sep = ",")
    
    feed = get_representative_feed_from_gtfs("../../data/external/gtfs/2015/" + name)
    
    routes_df = feed.routes.copy()
    routes_df["agency_raw_name"] = name
    
    stops_df = feed.stops.copy()
    stops_df["agency_raw_name"] = name
    
    trips_df = feed.trips.copy()
    trips_df["agency_raw_name"] = name
    
    if "direction_id" not in trips_df.columns: # Marguerita
        trips_df["direction_id"] = 0
    
    trips_df["direction_id"].fillna(0, inplace = True)
   
    shapes_df = feed.shapes.copy()
    shapes_df["agency_raw_name"] = name
    
    stop_times_df = feed.stop_times.copy()
    stop_times_df["agency_raw_name"] = name
    
    agency_df = feed.agency.copy()
    agency_df["agency_raw_name"] = name
    
    # gtfs cannot read fare tables for all agencies
        
    if "fare_attributes.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        
        fare_attributes_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/fare_attributes.txt",
                                         dtype = {"fare_id" : str})
        fare_attributes_df["agency_raw_name"] = name
    
    else:
        
        fare_attributes_df = pd.DataFrame()
    
    if "fare_rules.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        
        fare_rules_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/fare_rules.txt",
                                    dtype = {"fare_id" : str, "route_id" : str, "origin_id" : str, "destination_id" : str,
                                             " route_id" : str, " origin_id" : str, " destination_id" : str,})
        fare_rules_df["agency_raw_name"] = name
        
    else:
        
        fare_rules_df = pd.DataFrame()
        
    # add agency_id in routes.txt if missing
    if "agency_id" not in routes_df.columns:
        if "agency_id" in agency_df.columns:
            routes_df["agency_id"] = agency_df.agency_id.iloc[0]
    
    if len(shapes_df) == 0: # ACE, CCTA, VINE
        print("missing shapes.txt for {}".format(name))
        group_df = trips_df.groupby(["route_id", "direction_id"])["trip_id"].first().reset_index().drop("trip_id", axis = 1)
        group_df["shape_id"] = range(1, len(group_df) + 1)
        if "shape_id" in trips_df.columns:
            trips_df.drop("shape_id", axis = 1, inplace = True)
        trips_df = pd.merge(trips_df, group_df, how = "left", on = ["route_id", "direction_id"])
        
    if len(trips_df[trips_df.shape_id.isnull()]) > 0:
        print("partial complete shape_id for {}".format(name))
        trips_missing_shape_df = trips_df[trips_df.shape_id.isnull()].copy()
        group_df = trips_missing_shape_df.groupby(["route_id", "direction_id"])["trip_id"].first().reset_index().drop("trip_id", axis = 1)
        group_df["shape_id"] = range(1, len(group_df) + 1)
        group_df["shape_id"] = group_df["shape_id"].apply(lambda x: "psudo" + str(x))
        trips_missing_shape_df = pd.merge(trips_missing_shape_df.drop("shape_id", axis = 1), 
                                          group_df, how = "left", on = ["route_id", "direction_id"])
        trips_df = pd.concat([trips_df[trips_df.shape_id.notnull()], trips_missing_shape_df],
                             ignore_index = True,
                             sort = False)
        
    all_routes_df = all_routes_df.append(routes_df, sort = False, ignore_index = True)
    all_trips_df = all_trips_df.append(trips_df, sort = False, ignore_index = True)
    all_stops_df = all_stops_df.append(stops_df, sort = False, ignore_index = True)
    all_shapes_df = all_shapes_df.append(shapes_df, sort = False, ignore_index = True)
    all_stop_times_df = all_stop_times_df.append(stop_times_df, sort = False, ignore_index = True)
    all_agency_df = all_agency_df.append(agency_df, sort = False, ignore_index = True)
    all_fare_attributes_df = all_fare_attributes_df.append(fare_attributes_df, sort = False, ignore_index = True)
    all_fare_rules_df = all_fare_rules_df.append(fare_rules_df, sort = False, ignore_index = True)

getting representative feed...
missing shapes.txt for ACE_2017_3_20
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
partial complete shape_id for Capitol_2017_3_20
getting representative feed...
missing shapes.txt for CCTA_2015_8_11
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
partial complete shape_id for SamTrans_2015_8_20
getting representative feed...
getting representative feed...
getting representative feed...
partial complete shape_id for SF_Bay_Ferry2016_07_01
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...


In [20]:
# clean up field names
all_fare_rules_df["origin_id"] = np.where(all_fare_rules_df["origin_id"].isnull(),
                                          all_fare_rules_df[" origin_id"],
                                          all_fare_rules_df["origin_id"])

all_fare_rules_df["destination_id"] = np.where(all_fare_rules_df["destination_id"].isnull(),
                                          all_fare_rules_df[" destination_id"],
                                          all_fare_rules_df["destination_id"])

all_fare_rules_df["contains_id"] = np.where(all_fare_rules_df["contains_id"].isnull(),
                                          all_fare_rules_df[" contains_id"],
                                          all_fare_rules_df["contains_id"])

all_fare_rules_df.drop(columns = [" origin_id", " destination_id", " contains_id"], inplace = True)

In [21]:
# examine transit agencies included
all_agency_df.astype(str).groupby(["agency_raw_name", "agency_name", "agency_id"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email
agency_raw_name,agency_name,agency_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACE_2017_3_20,ACE Altamont Corridor Express,CE,1,1,1,1,1,1
ACTransit_2015_8_14,AC Transit,AC Transit,1,1,1,1,1,1
BART_2015_8_3,Bay Area Rapid Transit,BART,1,1,1,1,1,1
Blue&Gold_gtfs_10_4_2017,Blue & Gold Fleet,BG,1,1,1,1,1,1
CCTA_2015_8_11,County Connection,,1,1,1,1,1,1
Caltrain_2015_5_13,Caltrain,caltrain-ca-us,1,1,1,1,1,1
Capitol_2017_3_20,Capitol Corridor,AM,1,1,1,1,1,1
Emeryville_2016_10_26,Emery Go-Round,573,1,1,1,1,1,1
Fairfield_2015_10_14,Fairfield and Suisun Transit,36,1,1,1,1,1,1
GGTransit_2015_9_3,Golden Gate Transit,,1,1,1,1,1,1


# Re-ID the consolidated gtfs

In [22]:
# route_id, shape_id, trip_id, stop_id, 

unique_route_id_df = all_routes_df[
    all_routes_df.agency_raw_name != "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
unique_route_id_df = unique_route_id_df.groupby(
    ["agency_raw_name", "route_id"])["agency_id"].count().reset_index().drop(["agency_id"], axis = 1)
unique_route_id_df["route_id_original"] = unique_route_id_df["route_id"]
unique_route_id_df["route_id"] = range(1,  len(unique_route_id_df) + 1)

unique_trip_id_df = all_trips_df[
    all_trips_df.agency_raw_name != "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
unique_trip_id_df = unique_trip_id_df.groupby(
    ["agency_raw_name", "trip_id"])["route_id"].count().reset_index().drop(["route_id"], axis = 1)
unique_trip_id_df["trip_id_original"] = unique_trip_id_df["trip_id"]
unique_trip_id_df["trip_id"] = range(1,  len(unique_trip_id_df) + 1)

unique_shape_id_df = all_trips_df[
    all_trips_df.agency_raw_name != "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
unique_shape_id_df = unique_shape_id_df.groupby(
    ["agency_raw_name", "shape_id"])["trip_id"].count().reset_index().drop(["trip_id"], axis = 1)
unique_shape_id_df["shape_id_original"] = unique_shape_id_df["shape_id"]
unique_shape_id_df["shape_id"] = range(1,  len(unique_shape_id_df) + 1)

unique_stop_id_df = all_stops_df[
    all_stops_df.agency_raw_name != "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
unique_stop_id_df = unique_stop_id_df.groupby(
    ["agency_raw_name", "stop_id"])["stop_lat"].count().reset_index().drop(["stop_lat"], axis = 1)
unique_stop_id_df["stop_id_original"] = unique_stop_id_df["stop_id"]
unique_stop_id_df["stop_id"] = range(1,  len(unique_stop_id_df) + 1)

## caltrain shuttle June 11th, 2021
# appending caltrain data without renumbering the existing transit

cs_unique_route_id_df = all_routes_df[
    all_routes_df.agency_raw_name == "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
cs_unique_route_id_df = cs_unique_route_id_df.groupby(
    ["agency_raw_name", "route_id"])["agency_id"].count().reset_index().drop(["agency_id"], axis = 1)
cs_unique_route_id_df["route_id_original"] = cs_unique_route_id_df["route_id"]
cs_unique_route_id_df["route_id"] = range(1 + max(unique_route_id_df.route_id),  
                                          len(cs_unique_route_id_df) + 1 + max(unique_route_id_df.route_id))

unique_route_id_df = pd.concat([unique_route_id_df, cs_unique_route_id_df], sort = False, ignore_index = True)

cs_unique_trip_id_df = all_trips_df[
    all_trips_df.agency_raw_name == "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
cs_unique_trip_id_df = cs_unique_trip_id_df.groupby(
    ["agency_raw_name", "trip_id"])["route_id"].count().reset_index().drop(["route_id"], axis = 1)
cs_unique_trip_id_df["trip_id_original"] = cs_unique_trip_id_df["trip_id"]
cs_unique_trip_id_df["trip_id"] = range(1 + max(unique_trip_id_df.trip_id),  
                                        len(cs_unique_trip_id_df) + 1 + max(unique_trip_id_df.trip_id))

unique_trip_id_df = pd.concat([unique_trip_id_df, cs_unique_trip_id_df], sort = False, ignore_index = True)

cs_unique_shape_id_df = all_trips_df[
    all_trips_df.agency_raw_name == "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
cs_unique_shape_id_df = cs_unique_shape_id_df.groupby(
    ["agency_raw_name", "shape_id"])["trip_id"].count().reset_index().drop(["trip_id"], axis = 1)
cs_unique_shape_id_df["shape_id_original"] = cs_unique_shape_id_df["shape_id"]
cs_unique_shape_id_df["shape_id"] = range(1 + max(unique_shape_id_df.shape_id),  
                                          len(cs_unique_shape_id_df) + 1 + max(unique_shape_id_df.shape_id))

unique_shape_id_df = pd.concat([unique_shape_id_df, cs_unique_shape_id_df], sort = False, ignore_index = True)

cs_unique_stop_id_df = all_stops_df[
    all_stops_df.agency_raw_name == "commuteDOTorg_GTFSImportExport_20160127_final_mj"].copy()
cs_unique_stop_id_df = cs_unique_stop_id_df.groupby(
    ["agency_raw_name", "stop_id"])["stop_lat"].count().reset_index().drop(["stop_lat"], axis = 1)
cs_unique_stop_id_df["stop_id_original"] = cs_unique_stop_id_df["stop_id"]
cs_unique_stop_id_df["stop_id"] = range(1 + max(unique_stop_id_df.stop_id),  
                                        len(cs_unique_stop_id_df) + 1 + max(unique_stop_id_df.stop_id))

unique_stop_id_df = pd.concat([unique_stop_id_df, cs_unique_stop_id_df], sort = False, ignore_index = True)

In [26]:
# merge unique shape id and stio id back to aggregated gtfs data

all_routes_df = pd.merge(
    all_routes_df.rename(columns = {"route_id" : "route_id_original"}),
    unique_route_id_df,
    how = "left",
    on = ["agency_raw_name", "route_id_original"]
)

all_trips_df = pd.merge(
    all_trips_df.rename(columns = {"route_id" : "route_id_original", "trip_id" : "trip_id_original",
                                 "shape_id" : "shape_id_original"}),
    unique_route_id_df,
    how = "left",
    on = ["agency_raw_name", "route_id_original"]
)
all_trips_df = pd.merge(
    all_trips_df,
    unique_trip_id_df,
    how = "left",
    on = ["agency_raw_name", "trip_id_original"]
)
all_trips_df = pd.merge(
    all_trips_df,
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)

all_stops_df = pd.merge(
    all_stops_df.rename(columns = {"stop_id" : "stop_id_original"}),
    unique_stop_id_df,
    how = "left",
    on = ["agency_raw_name", "stop_id_original"]
)

all_shapes_df = pd.merge(
    all_shapes_df.rename(columns = {"shape_id" : "shape_id_original"}),
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)


all_stop_times_df = pd.merge(
    all_stop_times_df.rename(columns = {"trip_id" : "trip_id_original", "stop_id" : "stop_id_original"}),
    unique_trip_id_df,
    how = "left",
    on = ["agency_raw_name", "trip_id_original"]
)
all_stop_times_df = pd.merge(
    all_stop_times_df,
    unique_stop_id_df,
    how = "left",
    on = ["agency_raw_name", "stop_id_original"]
)

all_fare_rules_df["route_id"] = all_fare_rules_df["route_id"].astype(str)
all_fare_rules_df = pd.merge(
    all_fare_rules_df.rename(columns = {"route_id" : "route_id_original"}),
    unique_route_id_df,
    how = "left",
    on = ["agency_raw_name", "route_id_original"]
)


In [27]:
# examine and export

print('all_routes_df field: \n{}'.format(all_routes_df.columns))
all_routes_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/routes.txt', index = False, sep = ',')

print('all_trips_df field: \n{}'.format(all_trips_df.columns))
all_trips_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/trips.txt', index = False, sep = ',')

print('all_stops_df fields: \n{}'.format(all_stops_df.columns))
all_stops_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/stops.txt', index = False, sep = ',')

print('all_shapes_df fields: \n{}'.format(all_shapes_df.columns))
all_shapes_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/shapes.txt', index = False, sep = ',')

print('all_stop_times_df fields: \n{}'.format(all_stop_times_df.columns))
all_stop_times_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/stop_times.txt', index = False, sep = ',')

print('all_agency_df fields: \n{}'.format(all_agency_df.columns))
all_agency_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/agency.txt', index = False, sep = ',')

print('all_fare_attributes_df fields: \n{}'.format(all_fare_attributes_df.columns))
all_fare_attributes_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/fare_attributes.txt', index = False, sep = ',')

print('all_fare_rules_df fields: \n{}'.format(all_fare_rules_df.columns))
all_fare_rules_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/fare_rules.txt', index = False, sep = ',')

all_routes_df field: 
Index(['route_id_original', 'agency_id', 'route_short_name', 'route_long_name',
       'route_type', 'route_color', 'route_text_color', 'agency_raw_name',
       'route_desc', 'route_url', 'route_sort_order', 'min_headway_minutes',
       'eligibility_restricted', 'continuous_pickup', 'continuous_drop_off',
       'route_id'],
      dtype='object')
all_trips_df field: 
Index(['route_id_original', 'service_id', 'trip_id_original', 'trip_headsign',
       'direction_id', 'block_id', 'trip_short_name', 'agency_raw_name',
       'shape_id_original', 'wheelchair_accessible', 'bikes_allowed',
       'original_trip_id', 'trip_bikes_allowed', 'trip_type',
       'drt_max_travel_time', 'drt_avg_travel_time', 'drt_advance_book_min',
       'drt_pickup_message', 'drt_drop_off_message',
       'continuous_pickup_message', 'continuous_drop_off_message', 'route_id',
       'trip_id', 'shape_id'],
      dtype='object')
all_stops_df fields: 
Index(['stop_id_original', 'stop_name'

In [28]:
# drop fields not needed
all_trips_df.drop(["route_id_original", "trip_id_original", "shape_id_original"], axis = 1, inplace = True)
all_stops_df.drop(["stop_id_original"], axis = 1, inplace = True)
all_shapes_df.drop(["agency_raw_name", "shape_id_original"], axis = 1, inplace = True)
all_stop_times_df.drop(["agency_raw_name", "trip_id_original", "stop_id_original"], axis = 1, inplace = True)

# Processing

### get representative trip for each route by direction

In [29]:
# pick representatives for each route by direction, with most number of trip 
def get_representative_trip_for_route(trips, stop_times):
    
    """
    get the representative trips for each route, by direction, tod
    
    """
    
    print('getting representative trip...')
    
    # get the first stop of each trip to determine the time period for each trip
    # process time
    stop_times_df = stop_times.copy()
    stop_times_df['arrival_h'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.hour
    stop_times_df['arrival_m'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.minute
    stop_times_df['departure_h'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.hour
    stop_times_df['departure_m'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.minute
    
    # according to the gtfs reference, the stop sequence does not have to be consecutive, but has to always increase
    # so we can get the fisrt stop by the smallest stop sequence on the trip
    stop_times_df.sort_values(by = ["trip_id", "stop_sequence"], 
                              ascending = True, 
                              inplace = True)
    first_stop_df = stop_times_df.drop_duplicates(subset = ["trip_id"])
    
    ## identify peak, offpeak trips, based on the arrival time of first stop
    trip_df = trips.copy()
    trip_df = pd.merge(trip_df, 
                       first_stop_df,
                       how = 'left',
                       on = 'trip_id')
    
    ## AM: 6-10am, MD: 10am-3pm, PM: 3-7pm, NT 7pm-3am, EA 3-6am
    trip_df['tod'] = np.where((trip_df['arrival_h'] >= 6) & (trip_df['arrival_h'] < 10),
                                 'AM',
                                 np.where((trip_df['arrival_h'] >= 10) & (trip_df['arrival_h'] < 15),
                                     'MD',
                                    np.where((trip_df['arrival_h'] >= 15) & (trip_df['arrival_h'] < 19),
                                         'PM',
                                        np.where((trip_df['arrival_h'] >= 3) & (trip_df['arrival_h'] < 6),
                                             'EA',
                                                 'NT'))))
  
    # calculate frequency for EA and NT period using 5-6am, and 7-10pm
    trip_EA_NT_df = trip_df.copy()
    trip_EA_NT_df["tod"] = np.where((trip_df['arrival_h'] >= 5) & (trip_df['arrival_h'] < 6),
                                  "EA",
                                  np.where((trip_df['arrival_h'] >= 19) & (trip_df['arrival_h'] < 22),
                                          "NT",
                                          "NA")
                                  )
    
    # get the most frequent trip for each route, by direction, by time of day
    ## trips share the same shape_id is considered being the same
    ## first get the trip count for each shape_id
#     trip_freq_df = trip_df.groupby(['route_id', 'tod', 'direction_id', 'shape_id'])['trip_id'].count().\
#                             to_frame().\
#                             drop(index = 'other', level = 1).\
#                             reset_index()
    trip_freq_df = trip_df.groupby(['route_id', 'tod', 'direction_id', 'shape_id'])['trip_id'].count()

    ## then choose the most frequent shape_id for each route, frequency use the total number of trips
    def agg(x):
        m = x.shape_id.iloc[np.argmax(x.trip_id.values)]
        return pd.Series({'trip_num' : x.trip_id.sum(), 'shape_id' : m})
   
    trip_freq_df = trip_freq_df.reset_index().groupby(['route_id', 'tod', 'direction_id']).apply(agg)
    
    # retain the complete trip info of represent trip only
    trip_df = pd.merge(trip_df, trip_freq_df.reset_index(),
                      how = 'inner',
                      on = ['route_id', 'tod', 'direction_id', 'shape_id']).\
                drop_duplicates(['route_id', 'direction_id', 'tod'])
        
    trip_EA_NT_df = pd.merge(trip_EA_NT_df, trip_freq_df.reset_index(),
                      how = 'inner',
                      on = ['route_id', 'tod', 'direction_id', 'shape_id'])
    
    trip_EA_NT_df = trip_EA_NT_df[trip_EA_NT_df.tod.isin(["EA", "NT"])].groupby(
        ["route_id", "tod", "direction_id", "shape_id"])["trip_id"].count().reset_index()
    
    trip_EA_NT_df.rename(columns = {"trip_id" : "trip_num"}, inplace = True)
    
    trip_df = pd.merge(
        trip_df,
        trip_EA_NT_df,
        how = "left",
        on = ["route_id", "tod", "direction_id", "shape_id"]
    )
    
    trip_df["trip_num"] = np.where(trip_df.trip_num_y.isnull(),
                                  trip_df.trip_num_x,
                                  trip_df.trip_num_y)
    
    return trip_df

In [30]:
# code departure/arrival hour and minute
stop_times_df = all_stop_times_df.copy()
stop_times_df['arrival_h'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.hour
stop_times_df['arrival_m'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.minute
stop_times_df['departure_h'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.hour
stop_times_df['departure_m'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.minute

# sort in order to get first stop of each trip
stop_times_df.sort_values(by = ["trip_id", "stop_sequence"], 
                          ascending = True, 
                          inplace = True)
first_stop_df = stop_times_df.drop_duplicates(subset = ["trip_id"])

In [31]:
# code type of day
trip_df = all_trips_df.copy()
trip_df = pd.merge(trip_df, 
                   first_stop_df,
                   how = 'left',
                   on = 'trip_id')

trip_df['tod'] = np.where((trip_df['arrival_h'] >= 6) & (trip_df['arrival_h'] < 10),
                                 'AM',
                                 np.where((trip_df['arrival_h'] >= 10) & (trip_df['arrival_h'] < 15),
                                     'MD',
                                    np.where((trip_df['arrival_h'] >= 15) & (trip_df['arrival_h'] < 19),
                                         'PM',
                                        np.where((trip_df['arrival_h'] >= 3) & (trip_df['arrival_h'] < 6),
                                             'EA',
                                                 'NT'))))

In [32]:
%%time
# obtain the representative trip of each route

trip_df = get_representative_trip_for_route(all_trips_df, all_stop_times_df)

getting representative trip...
Wall time: 3.49 s


### map gtfs stops to roadway nodes

In [34]:
def snap_stop_to_node(stops, node_gdf):
    
    """
    map gtfs stops to roadway nodes
    
    Parameters:
    ------------
    feed
    drive nodes
    
    return
    ------------
    stops with drive nodes id
    """
    
    print('snapping gtfs stops to roadway node osmid...')
    
    node_non_c_gdf = node_gdf.copy()
    node_non_c_gdf = node_non_c_gdf.to_crs({'init' : 'epsg:26915'})
    node_non_c_gdf['X'] = node_non_c_gdf.geometry.map(lambda g:g.x)
    node_non_c_gdf['Y'] = node_non_c_gdf.geometry.map(lambda g:g.y)
    inventory_node_ref = node_non_c_gdf[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    stop_df = stops.copy()
    stop_df['geometry'] = [Point(xy) for xy in zip(stop_df['stop_lon'], stop_df['stop_lat'])]
    stop_df = gpd.GeoDataFrame(stop_df)
    stop_df.crs = {'init' : 'epsg:4326'}
    stop_df = stop_df.to_crs({'init' : 'epsg:26915'})
    stop_df['X'] = stop_df['geometry'].apply(lambda p: p.x)
    stop_df['Y'] = stop_df['geometry'].apply(lambda p: p.y)
   
    for i in range(len(stop_df)):
        point = stop_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_snap_gdf = gpd.GeoDataFrame(node_non_c_gdf.iloc[ii]).transpose().reset_index(drop = True)
        add_snap_gdf['stop_id'] = stop_df.iloc[i]['stop_id']
        if i == 0:
            stop_to_node_gdf = add_snap_gdf.copy()
        else:
            stop_to_node_gdf = stop_to_node_gdf.append(add_snap_gdf, ignore_index=True, sort=False)
    
    stop_df.drop(['X','Y'], axis = 1, inplace = True)
    stop_to_node_gdf = pd.merge(stop_df, stop_to_node_gdf, how = 'left', on = 'stop_id')
    
    column_list = stops.columns.values.tolist() + ['osm_node_id', 'shst_node_id', "model_node_id"]
    
    return stop_to_node_gdf[column_list]

In [35]:
# get rid of motorway nodes. 
# Motorway and motorway are restricted-access freeway and freeway ramps, shouldn't have transit stops
non_motorway_links_df = drive_link_df[~drive_link_df.roadway.isin(["motorway", "motorway_link"])].copy()

nodes_for_stops_df = drive_node_gdf[drive_node_gdf.model_node_id.isin(non_motorway_links_df.A.tolist() + 
                                                                     non_motorway_links_df.B.tolist())].copy()

In [36]:
%%time
stop_df = snap_stop_to_node(all_stops_df, nodes_for_stops_df)

snapping gtfs stops to roadway node osmid...


  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


Wall time: 6min 1s


In [37]:
# examine the data
stop_df.info()
stop_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22153 entries, 0 to 22152
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stop_name            22153 non-null  object 
 1   stop_lat             22153 non-null  float64
 2   stop_lon             22153 non-null  float64
 3   zone_id              12267 non-null  object 
 4   agency_raw_name      22153 non-null  object 
 5   stop_code            15488 non-null  object 
 6   location_type        5558 non-null   float64
 7   parent_station       620 non-null    object 
 8   stop_desc            4319 non-null   object 
 9   stop_url             4168 non-null   object 
 10  stop_timezone        137 non-null    object 
 11  wheelchair_boarding  388 non-null    float64
 12  platform_code        58 non-null     object 
 13  position             0 non-null      object 
 14  direction            0 non-null      object 
 15  * used by routes     125 non-null   

Unnamed: 0,stop_name,stop_lat,stop_lon,zone_id,agency_raw_name,stop_code,location_type,parent_station,stop_desc,stop_url,stop_timezone,wheelchair_boarding,platform_code,position,direction,* used by routes,stop_id,osm_node_id,shst_node_id,model_node_id
0,FMT FREMONT STATION,37.558559,-122.007598,55644,ACE_2017_3_20,,,,,,,,,,,,7,53026824,0d6e8c51d77d4cbf23cf1150a4c73d22,2582218
1,GAC GREAT AMERICA STATION,37.406398,-121.966671,55645,ACE_2017_3_20,,,,,,,,,,,,8,312418202,4cfcb17d2cf046c6f55ce459194ebc0e,2019349
2,LAT LATHROP/MANTECA STATION,37.797908,-121.263664,55449,ACE_2017_3_20,,,,,,,,,,,,2,1832575219,584a0ca93779f09a42595e3d094b8151,2551601


### route bus on roadway via osmnx routing

In [38]:
def route_bus_link_osmnx(roadway_gdf, node_gdf, G, stop_times, routes, trip, stop):
    
    """
    route bus with OSMNX routing
    
    Parameters
    ----------
    drive link
    drive node
    drive graph
    feed
    trip 
    stop
    
    return
    ----------
    dataframe of drive links bus trips traverses
    list of trips that could not be routed by OSMNX
    """
    
    trip_df = trip.copy()
    stop_df = stop.copy()
    stop_time_df = stop_times.copy()
    
    chained_stop_df = stop_time_df[stop_time_df['trip_id'].isin(trip_df.trip_id.tolist())]
    chained_stop_to_node_df = pd.merge(chained_stop_df, 
                                       stop_df,
                                        how = 'left',
                                        on = 'stop_id')
    
    print('routing bus on roadway network with osmnx...')
    
    #osm_node_dict = dict(zip(node_gdf.osmid, node_gdf.N))
    
    trip_df = pd.merge(trip_df, routes, how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df['route_type'] == 3]
    
    # to track trips that osmnx failed to route
    broken_shape_trip_list = []
    
    # output dataframe for osmnx success
    trip_link_shape_df = pd.DataFrame()
    
    # loop through for bus trips
    for trip_id in bus_trip_df.trip_id.unique():
        
        # get the stops on the trip
        trip_stop_df = chained_stop_to_node_df[chained_stop_to_node_df['trip_id'] == trip_id].copy()
        
        trip_stop_df.sort_values(by = ["stop_sequence"], inplace = True)

        try:
            print("routing" + str(trip_id))
            for s in range(len(trip_stop_df)-1):
                # from stop node OSM id
                closest_node_to_stop1 = int(trip_stop_df.osm_node_id.iloc[s])
                
                # to stop node OSM id
                closest_node_to_stop2 = int(trip_stop_df.osm_node_id.iloc[s+1])
                
                # osmnx routing btw from and to stops, return the list of nodes
                node_osmid_list = nx.shortest_path(G, closest_node_to_stop1, closest_node_to_stop2, weight = "length")
                
                # get the links
                if len(node_osmid_list) > 1:
                    osm_link_gdf = pd.DataFrame({'u' : node_osmid_list[:len(node_osmid_list)-1], 
                                            'v' : node_osmid_list[1:len(node_osmid_list)],
                                            'trip_id' : trip_id},
                                               )
                else:
                    continue
                
                trip_link_shape_df = trip_link_shape_df.append(osm_link_gdf, ignore_index = True, sort = False)                

        except:
            broken_shape_trip_list = broken_shape_trip_list + [trip_id]
            print('  warning: cannot route bus: ' + str(trip_id))
            continue      
        
    trip_link_shape_df = pd.merge(trip_link_shape_df, trip_df[['trip_id', 'shape_id']], how = 'left', on = 'trip_id')

    trip_link_shape_df = pd.merge(trip_link_shape_df,
                                  drive_link_df[["u", "v", "wayId", "shstReferenceId", "shstGeometryId", "A", "B"]].\
                                      drop_duplicates(subset = ["u", "v"]),
                                  how = "left",
                                  on = ["u", "v"])
    
    return trip_link_shape_df, broken_shape_trip_list

In [39]:
%%time
bus_osmnx_link_shape_df, bus_osmnx_broken_trip_list = route_bus_link_osmnx(drive_link_df, 
                                                                           drive_node_gdf, 
                                                                           G_drive, 
                                                                           all_stop_times_df,
                                                                           all_routes_df,
                                                                           trip_df, 
                                                                           stop_df)

routing bus on roadway network with osmnx...
routing9
routing10
routing11
routing20
routing21
routing22
routing23
routing31
routing42
routing44
routing45
routing46
routing48
routing49
routing50
routing51
routing52
routing53
routing54
routing55
routing57
routing58
routing59
routing60
routing61
routing63
routing64
routing65
routing66
routing67
routing69
routing74
routing75
routing78
routing80
routing81
routing83
routing91
routing92
routing94
routing95
routing96
routing97
routing104
routing106
routing108
routing114
routing116
routing120
routing126
routing127
routing128
routing139
routing144
routing149
routing153
routing183
routing184
routing188
routing193
routing197
routing199
routing207
routing217
routing225
routing228
routing235
routing245
routing253
routing257
routing277
routing302
routing322
routing324
routing344
routing369
routing389
routing392
routing393
routing395
routing396
routing399
routing400
routing408
routing409
routing443
routing444
routing447
routing450
routing459
routing46

routing4645
routing4651
routing4652
routing4654
routing4655
routing4662
routing4672
routing4680
routing4685
routing4693
routing4703
routing4711
routing4716
routing4724
routing4734
routing4742
routing4748
routing4754
routing4764
routing4772
routing4779
routing4783
routing4788
routing4792
routing4793
routing4797
routing4802
routing4806
routing4807
routing4811
routing4816
routing4820
routing4822
routing4825
routing4830
routing4834
routing4836
routing4837
routing4841
routing4845
routing4849
routing4855
routing4859
routing4864
routing4868
routing4870
routing4874
routing4879
routing4883
routing4887
routing4888
routing4889
routing4890
routing4892
routing4893
routing4895
routing4896
routing4898
routing4899
routing4901
routing5003
routing5004
routing5005
routing5019
routing5020
routing5026
routing5032
routing5048
routing5052
routing5057
routing5061
routing5062
routing5065
routing5070
routing5074
routing5075
routing5077
routing5097
routing5122
routing5142
routing5160
routing5185
routing5205
rout

routing9725
routing9726
routing9729
routing9740
routing9745
routing9748
routing9751
routing9756
routing9822
routing9828
routing9829
routing9836
routing9840
routing9916
routing9925
routing9940
routing9951
routing9954
routing9958
routing9963
routing9967
routing9977
routing9991
routing9997
routing10007
routing10015
routing10019
routing10023
routing10028
routing10034
routing10035
routing10036
routing10054
routing10082
routing10112
routing10141
routing10179
routing10183
routing10201
routing10203
routing10222
routing10238
routing10259
routing10265
routing10276
routing10294
routing10297
routing10307
routing10317
routing10333
routing10356
routing10378
routing10389
routing10434
routing10494
routing10542
routing10549
routing10560
routing10572
routing10583
routing10584
routing10588
routing10595
routing10600
routing10604
routing10618
routing10624
routing10634
routing10642
routing10646
routing10649
routing10662
routing10664
routing10676
routing10685
routing10699
routing10706
routing10716
routing107

routing11362
routing11295
routing11258
routing11626
routing11625
routing11614
routing11541
routing11651
routing19420
routing21116
routing21113
routing19478
routing19378
routing19400
routing21110
routing19447
routing20324
routing19407
routing21173
routing19408
routing19404
routing11777
routing11766
routing11710
routing11761
routing11742
routing11700
routing11693
routing11782
routing11861
routing11853
routing11674
routing11832
routing11817
routing11800
routing11871
routing11869
routing11794
routing11904
routing12159
routing12143
routing12100
routing20549
routing20542
routing20512
routing12096
routing12113
routing11990
routing11984
routing12046
routing12045
routing20698
routing20676
routing12055
routing11920
routing11957
routing12298
routing12234
routing12288
routing12287
routing12328
routing12374
routing12367
routing12320
routing12363
routing12455
routing12541
routing12513
routing12509
routing12465
routing12467
routing12521
routing12536
routing12517
routing12511
routing12400
routing20749

routing26338
routing26541
routing26445
routing26526
routing26337
routing26453
routing26366
routing26390
routing26345
routing26612
routing26549
routing26623
routing26589
routing26463
routing26515
routing26426
routing26605
routing26499
routing26480
routing26561
routing26456
routing26347
routing26513
routing26335
routing26584
routing26496
routing26544
routing26517
routing26423
routing26491
routing26373
routing26510
routing26358
routing26462
routing26505
routing26432
routing26374
routing26350
routing26430
routing26372
routing26616
routing26452
routing26344
routing26477
routing26523
routing26402
routing26594
routing26351
routing26472
routing26355
routing26415
routing26635
routing26443
routing26592
routing26508
routing26610
routing26506
routing26537
routing26512
routing26447
routing26620
routing26636
routing26428
routing26574
routing26560
routing26343
routing26534
routing26387
routing26525
routing26547
routing26353
routing26450
routing26495
routing26536
routing26368
routing26470
routing26411

routing29546
routing29519
routing29568
routing29566
routing29584
routing29600
routing29615
routing29606
routing29613
routing29661
routing29660
routing29648
routing29634
routing29623
routing29681
routing29712
routing29700
routing29676
routing29687
routing29761
routing29759
routing29728
routing29741
routing29788
routing29809
routing29776
routing29793
routing29807
routing29840
routing29835
routing29824
routing29837
routing29902
routing29903
routing29886
routing29871
routing29893
routing29930
routing29913
routing29908
routing29947
routing29960
routing29977
routing29962
routing29957
routing29982
routing30003
routing30000
routing30004
routing30010
routing30011
routing30032
routing30019
routing30034
routing30053
routing30059
routing30045
routing30052
routing30087
routing30083
routing30070
routing30091
routing30098
routing30093
routing30107
routing30114
routing30116
routing30118
routing30121
routing30119
routing30128
routing30136
routing30135
routing30140
routing30139
routing30141
routing30144

In [41]:
# trips successfully routed by osmnx
bus_osmnx_link_shape_df.info()

print('# unique shapd_id of all trips: {}'.format(trip_df.shape_id.nunique()))
print('# unique shapd_id of routed trips: {}'.format(bus_osmnx_link_shape_df.shape_id.nunique()))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 934579 entries, 0 to 934578
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   u                934579 non-null  int64 
 1   v                934579 non-null  int64 
 2   trip_id          934579 non-null  int32 
 3   shape_id         934579 non-null  int32 
 4   wayId            934579 non-null  object
 5   shstReferenceId  934579 non-null  object
 6   shstGeometryId   934579 non-null  object
 7   A                934579 non-null  int64 
 8   B                934579 non-null  int64 
dtypes: int32(2), int64(4), object(3)
memory usage: 64.2+ MB
# unique shapd_id of all trips: 1598
# unique shapd_id of routed trips: 1499


In [42]:
# osmnx failed to route these trips: can be rail modes
print(bus_osmnx_broken_trip_list)

[8363, 8361, 8357, 8358, 8364, 11209, 11213, 11216, 11208, 11211, 11217, 11214, 23467]


In [43]:
# shapes that were not successfully routed by OSMNX
trip_df[trip_df.trip_id.isin(bus_osmnx_broken_trip_list)].shape_id.unique()

array([ 541,  502,  510,  513,  514,  786,  787,  788,  789, 1224])

### route bus on roadway via shst routing

In [44]:
# read shst match result

all_shst_df = pd.DataFrame()

for name in gtfs_agencies_list:
    try: 
        shst_df = gpd.read_file(data_interim_dir + "step6_gtfs/shst_match/" + name + ".transit.out.matched.geojson")
        shst_df["agency_raw_name"] = name
    
        all_shst_df = all_shst_df.append(shst_df, sort = False, ignore_index = True)
    except:
        print(name + ".out.matched.geojson not found.")

ACE_2017_3_20.out.matched.geojson not found.
BART_2015_8_3.out.matched.geojson not found.
Blue&Gold_gtfs_10_4_2017.out.matched.geojson not found.
Caltrain_2015_5_13.out.matched.geojson not found.
Capitol_2017_3_20.out.matched.geojson not found.
CCTA_2015_8_11.out.matched.geojson not found.
MVGo_2016_10_26.out.matched.geojson not found.
SF_Bay_Ferry2016_07_01.out.matched.geojson not found.
Union_City_Transit_Aug-01-2015 to Jun-30-2017.out.matched.geojson not found.
Vine_GTFS_PLUS_2015.out.matched.geojson not found.
commuteDOTorg_GTFSImportExport_20160127_final_mj.out.matched.geojson not found.


In [45]:
all_shst_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 118172 entries, 0 to 118171
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   shstReferenceId         118172 non-null  object  
 1   shstGeometryId          118172 non-null  object  
 2   shstFromIntersectionId  118172 non-null  object  
 3   shstToIntersectionId    118172 non-null  object  
 4   gisReferenceId          118172 non-null  object  
 5   gisGeometryId           118172 non-null  object  
 6   gisTotalSegments        118172 non-null  int64   
 7   gisSegmentIndex         118172 non-null  int64   
 8   gisFromIntersectionId   118172 non-null  object  
 9   gisToIntersectionId     118172 non-null  object  
 10  startSideOfStreet       118172 non-null  object  
 11  endSideOfStreet         118172 non-null  object  
 12  sideOfStreet            118172 non-null  object  
 13  score                   118172 non-null  float64 
 

In [46]:
# add consolidated IDs

all_shst_df.rename(columns = {"pp_shape_id" : "shape_id_original"}, inplace = True)

all_shst_df = pd.merge(
    all_shst_df,
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)

all_shst_df = all_shst_df[all_shst_df.shape_id.notnull()].copy()

In [48]:
def route_bus_link_shst(drive_link, gtfs_shst_id):
    
    """
    route bus with shst match result
    
    parameter
    ---------
    drive link
    gtfs shst match return
    
    return
    ---------
    dataframe of drive links bus traverses
    list of imcomplete bus shapes
    
    """
    
    drive_link_df = drive_link.copy()
    shape_shst_df = gtfs_shst_id.copy()

    shape_shst_df = pd.merge(shape_shst_df, 
                             drive_link_df[
                                 ['shstReferenceId','wayId','u','v', "fromIntersectionId", "toIntersectionId", "A", "B"]
                             ],
                             how = 'left',
                             left_on = 'shstReferenceId',
                             right_on = 'shstReferenceId')
    
    shape_shst_df["u"] = shape_shst_df["u"].fillna(0).astype(np.int64)
    shape_shst_df["v"] = shape_shst_df["v"].fillna(0).astype(np.int64)
    shape_shst_df["A"] = shape_shst_df["A"].fillna(0).astype(np.int64)
    shape_shst_df["B"] = shape_shst_df["B"].fillna(0).astype(np.int64)
    
    """shape_shst_df.dropna(subset = ['u','v'], 
                         axis = 0, 
                         inplace = True)"""
    
    shape_shst_df = shape_shst_df.reset_index(drop=True)
    
    shape_shst_df['next_shape_id'] = shape_shst_df['shape_id'].\
                                            iloc[1:].\
                                            append(pd.Series(shape_shst_df['shape_id'].iloc[-1])).\
                                            reset_index(drop=True)
    
    shape_shst_df['next_u'] = shape_shst_df['u'].\
                                iloc[1:].\
                                append(pd.Series(shape_shst_df['v'].iloc[-1])).\
                                reset_index(drop=True)
    
    incomplete_shape_list = shape_shst_df[\
                                   (shape_shst_df.shape_id==shape_shst_df.next_shape_id)\
                                   &(shape_shst_df.v!=shape_shst_df.next_u)\
                                  ].shape_id.unique().\
                                    tolist()
    
    shape_shst_df = shape_shst_df[~shape_shst_df.shape_id.isin(incomplete_shape_list)].copy()
    
    return shape_shst_df, incomplete_shape_list

In [49]:
%%time
bus_shst_link_shape_df, incomplete_shape_list = route_bus_link_shst(drive_link_df, all_shst_df)

print('total {} bus_shst links with {} shapes; {} links with {} shapes were successfully routed'.format(
    all_shst_df.shape[0],
    all_shst_df.shape_id.nunique(),
    bus_shst_link_shape_df.shape[0],
    bus_shst_link_shape_df.shape_id.nunique()))

total 69266 bus_shst links with 466 shapes; 49818 links with 351 shapes were successfully routed
Wall time: 2.09 s


In [50]:
bus_shst_link_shape_df.info()
bus_shst_link_shape_df.head(3)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 49818 entries, 0 to 69265
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   shstReferenceId         49818 non-null  object  
 1   shstGeometryId          49818 non-null  object  
 2   shstFromIntersectionId  49818 non-null  object  
 3   shstToIntersectionId    49818 non-null  object  
 4   gisReferenceId          49818 non-null  object  
 5   gisGeometryId           49818 non-null  object  
 6   gisTotalSegments        49818 non-null  int64   
 7   gisSegmentIndex         49818 non-null  int64   
 8   gisFromIntersectionId   49818 non-null  object  
 9   gisToIntersectionId     49818 non-null  object  
 10  startSideOfStreet       49818 non-null  object  
 11  endSideOfStreet         49818 non-null  object  
 12  sideOfStreet            49818 non-null  object  
 13  score                   49818 non-null  float64 
 14  matchType     

Unnamed: 0,shstReferenceId,shstGeometryId,shstFromIntersectionId,shstToIntersectionId,gisReferenceId,gisGeometryId,gisTotalSegments,gisSegmentIndex,gisFromIntersectionId,gisToIntersectionId,...,shape_id,wayId,u,v,fromIntersectionId,toIntersectionId,A,B,next_shape_id,next_u
0,bf94c0db40279cc5ef46d54d8be6139d,90d83676f262ccee0386f52a5167e6fd,47bfb418bcc178ad00fef4071aaf5a57,c31e4495db3607e2dc4dbed346f2e363,74b0e1ba27de4555ef113010eb16ed5e,743dd80bd02aca964d6783f584141692,247,1,4264bd2eb5f2270e1a9c2d34fa505f7d,7415e23dc4b24042982eb73250380539,...,7.0,6345240,53073691,53073689,47bfb418bcc178ad00fef4071aaf5a57,c31e4495db3607e2dc4dbed346f2e363,2549243,2551858,7.0,53073689
1,1315a47eae44cb25d4ba5b9bdaa2c04a,b9429a4f0520adc5db786e220ec76c43,c31e4495db3607e2dc4dbed346f2e363,1a1364a96b0a5bd7bd7a81079f3812ea,74b0e1ba27de4555ef113010eb16ed5e,743dd80bd02aca964d6783f584141692,247,2,4264bd2eb5f2270e1a9c2d34fa505f7d,7415e23dc4b24042982eb73250380539,...,7.0,562054606,53073689,5484284278,c31e4495db3607e2dc4dbed346f2e363,1a1364a96b0a5bd7bd7a81079f3812ea,2551858,2520680,7.0,5484284278
2,d8bcbf7a1eacd73c6dc94cdf86588477,eb5d72233b87c45388e9be68d7a577d5,1a1364a96b0a5bd7bd7a81079f3812ea,f813a2083833e119b39a3a4d443a8978,74b0e1ba27de4555ef113010eb16ed5e,743dd80bd02aca964d6783f584141692,247,3,4264bd2eb5f2270e1a9c2d34fa505f7d,7415e23dc4b24042982eb73250380539,...,7.0,"[417401349, 562054606]",5484284278,53085389,1a1364a96b0a5bd7bd7a81079f3812ea,f813a2083833e119b39a3a4d443a8978,2520680,2515655,7.0,53085389


In [51]:
# shst failed to route these trips

# some of these buses has parts that are out side of SF county boundary, that's why they are labeled as incomplete shape
# some are due to the discrepency btw shst extraction and osmnx extraction

print(incomplete_shape_list)

[116.0, 147.0, 148.0, 149.0, 158.0, 160.0, 161.0, 165.0, 167.0, 168.0, 172.0, 174.0, 175.0, 176.0, 188.0, 189.0, 192.0, 193.0, 194.0, 195.0, 205.0, 206.0, 207.0, 208.0, 213.0, 214.0, 217.0, 219.0, 220.0, 222.0, 230.0, 233.0, 237.0, 239.0, 240.0, 244.0, 269.0, 270.0, 274.0, 324.0, 326.0, 695.0, 714.0, 732.0, 733.0, 751.0, 760.0, 761.0, 765.0, 1185.0, 1186.0, 1196.0, 1197.0, 1198.0, 1217.0, 1233.0, 1237.0, 1238.0, 1255.0, 1256.0, 1257.0, 1258.0, 1275.0, 1281.0, 1295.0, 1306.0, 1307.0, 1314.0, 1338.0, 1340.0, 1344.0, 1345.0, 1346.0, 1604.0, 1618.0, 1619.0, 1625.0, 1633.0, 1635.0, 1641.0, 1663.0, 1666.0, 1667.0, 1677.0, 1678.0, 1679.0, 1680.0, 1702.0, 1703.0, 1704.0, 1712.0, 1730.0, 1737.0, 1738.0, 1744.0, 1751.0, 1752.0, 1753.0, 1759.0, 1827.0, 1832.0, 1833.0, 1846.0, 1850.0, 1858.0, 1866.0, 1873.0, 1953.0, 1967.0, 1987.0, 1988.0, 2007.0, 2008.0, 2013.0, 2014.0]


### combine routing results of the two approaches

In [52]:
def bus_link(bus_link_osmnx, bus_link_shst, routes, trip, incomplete_list):
    
    """
    combine bus links from OSMNX and SHST
    
    Prioritize SHST matching, for those failed to match through SHST, use OSMNX routing
    """
    
    bus_link_osmnx_df = bus_link_osmnx.copy()
    bus_link_shst_df = bus_link_shst.copy()
    
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df.route_type == 3].copy()
    
    shape_id_list = bus_trip_df.shape_id.unique().tolist()

    incomplete_list = [x for x in incomplete_list]
    
    print("Targeting number of bus shape IDs: " + str(bus_trip_df.shape_id.nunique()))
    
    shst_shape_list = list(set([x for x in bus_link_shst_df.shape_id]))
    
    shapes_replace_with_shst_list = [x for x in shst_shape_list if x in shape_id_list]
    
    print("\n There are " + str(len(shapes_replace_with_shst_list)) + 
          " shapes that are from shst gtfs matching: \n \t" + 
          str(shapes_replace_with_shst_list))

    bus_link_osmnx_df = bus_link_osmnx_df[~bus_link_osmnx_df.shape_id.isin(shapes_replace_with_shst_list)].copy()
    
    osmnx_shape_list = bus_link_osmnx_df.shape_id.unique().tolist()
    
    print("\n There are " + str(len(osmnx_shape_list)) + 
          " shapes that are from OSMNX routing: \n \t" + 
          str(osmnx_shape_list))
    
    not_routed_list = [x for x in shape_id_list if x not in (shst_shape_list + osmnx_shape_list)]
    
    print("\n There are " + str(len(not_routed_list)) + 
         " shapes that are not routed by either of the two methods: \n \t" + 
         str(not_routed_list))
    
    bus_link_shst_df = pd.merge(bus_link_shst_df,
                                bus_trip_df[['trip_id', 'shape_id']],
                                how = 'inner',
                                left_on = 'shape_id',
                                right_on = 'shape_id')
    
    bus_link_df = pd.concat([bus_link_osmnx_df, bus_link_shst_df],
                            sort = False,
                           ignore_index = True)
    
    column_list = bus_link_osmnx.columns.values.tolist()
    
    return bus_link_df[column_list]

In [53]:
bus_link_df = bus_link(bus_osmnx_link_shape_df, bus_shst_link_shape_df, all_routes_df, trip_df, incomplete_shape_list)

Targeting number of bus shape IDs: 1503

 There are 253 shapes that are from shst gtfs matching: 
 	[7.0, 10.0, 11.0, 13.0, 15.0, 16.0, 2069.0, 2070.0, 48.0, 52.0, 53.0, 61.0, 62.0, 65.0, 66.0, 73.0, 74.0, 77.0, 78.0, 84.0, 85.0, 86.0, 87.0, 94.0, 95.0, 97.0, 101.0, 102.0, 104.0, 105.0, 106.0, 107.0, 109.0, 110.0, 111.0, 114.0, 121.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 141.0, 144.0, 145.0, 150.0, 151.0, 171.0, 181.0, 185.0, 186.0, 190.0, 191.0, 196.0, 197.0, 199.0, 201.0, 202.0, 209.0, 210.0, 215.0, 216.0, 226.0, 229.0, 238.0, 241.0, 242.0, 245.0, 246.0, 249.0, 250.0, 251.0, 252.0, 253.0, 254.0, 255.0, 257.0, 258.0, 275.0, 298.0, 301.0, 304.0, 305.0, 306.0, 307.0, 312.0, 313.0, 318.0, 319.0, 323.0, 325.0, 327.0, 335.0, 336.0, 337.0, 350.0, 351.0, 374.0, 574.0, 589.0, 590.0, 591.0, 596.0, 597.0, 598.0, 599.0, 664.0, 737.0, 741.0, 743.0, 747.0, 750.0, 755.0, 759.0, 1156.0, 1158.0, 1166.0, 1167.0, 1222.0, 

In [54]:
bus_link_df.info()
bus_link_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920493 entries, 0 to 920492
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   u                920493 non-null  int64  
 1   v                920493 non-null  int64  
 2   trip_id          920493 non-null  int32  
 3   shape_id         920493 non-null  float64
 4   wayId            920493 non-null  object 
 5   shstReferenceId  920493 non-null  object 
 6   shstGeometryId   920493 non-null  object 
 7   A                920493 non-null  int64  
 8   B                920493 non-null  int64  
dtypes: float64(1), int32(1), int64(4), object(3)
memory usage: 59.7+ MB


Unnamed: 0,u,v,trip_id,shape_id,wayId,shstReferenceId,shstGeometryId,A,B
0,57808643,57808641,9,359.0,303291893,020aa1becdbef42ef4245a3c278e9d29,c0f97948d99c64e52ae3c463383dabbf,3047632,3052718
1,57808641,57808639,9,359.0,303291893,1f81b33f6a786a97a0ed79bd2074ba02,94987b6adba60bad1ce73f6ec491fdf0,3052718,3006601
2,57808639,57808637,9,359.0,303291893,1e505f22f056942e593ee134ec97e624,f1dc7acbc076866422e7f373d4b2b643,3006601,3070234


### build non-bus/rail links and nodes

In [55]:
# manual correction for Capitol Corridor
# the shape_id from GTFS are wrong
# use the trips that go to San Jose

trip_df.loc[(trip_df.shape_id==487)&(trip_df.tod=="AM"), 
                 "trip_id"] = 8042
trip_df.loc[(trip_df.shape_id==487)&(trip_df.tod=="MD"), 
                 "trip_id"] = 8049
trip_df.loc[(trip_df.shape_id==487)&(trip_df.tod=="PM"), 
                 "trip_id"] = 8054
trip_df.loc[(trip_df.shape_id==487)&(trip_df.tod=="NT"), 
                 "trip_id"] = 8063

In [56]:
# create rail links
def non_bus_link(stop_times, shapes, routes, trip, stop):
    
    """
    create rail links and nodes
    
    nodes are based on rail stops, links are true shape between nodes
    
    return
    ---------
    complete rail link path for each rail service
    complete rail node path for each rail service
    
    """
    
    print('generating rail links...')
    
    #get rail trips
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    rail_trip_df = trip_df[trip_df.route_type != 3].copy()
    
    stop_df = stop.copy()
    stop_time_df = stop_times.copy()
    
    #get rail trips with stops
    chained_stop_to_node_df = pd.merge(stop_time_df, 
                                       stop_df, 
                                       how = 'left', 
                                       on = 'stop_id')
    
    rail_stop_time_df = chained_stop_to_node_df[
                                                chained_stop_to_node_df['trip_id']\
                                                .isin(rail_trip_df.trip_id.tolist())
                                               ]\
                                                .copy()
    
    #get gtfs rail shapes
    rail_shape_df = shapes[shapes['shape_id'].isin(rail_trip_df.shape_id.tolist())
                                ].copy()
    
    #gtfs shape-trip correspondence
    shape_trip_dict = dict(zip(rail_trip_df.shape_id, rail_trip_df.trip_id))
    
    # manual correction
    shape_trip_dict[486] = 8039
    shape_trip_dict[487] = 8042
    
    print(rail_shape_df.shape_id.unique())
    #for each rail shape
    for i in rail_shape_df.shape_id.unique():
    
        trip_id = shape_trip_dict[i]
        
        #get chained stop
        trip_stop_df = rail_stop_time_df[rail_stop_time_df.trip_id == trip_id].copy()
        
        trip_stop_df.sort_values(by = ["stop_sequence"], inplace = True)
        
        # get gtfs shape nodes for the shape
        trip_shape_df = rail_shape_df[rail_shape_df.shape_id == i].copy()
        # initialize columns
        trip_shape_df['is_stop'] = np.int(0)
        trip_shape_df['stop_id'] = np.nan
        
        # for each rail stop, find the closest node in the shape, and those are the stops and breakpoints of new rail links
        # return is a gtfs node shape dataframe with two columns indicating if the node is a stop and the stop id
        shape_inventory = trip_shape_df[['shape_pt_lon', 'shape_pt_lat']].values
        tree = cKDTree(shape_inventory)
        for s in range(len(trip_stop_df)):
            point = trip_stop_df.iloc[s][['stop_lon', 'stop_lat']].values
            dd, ii = tree.query(point, k = 1)
            trip_shape_df.shape_pt_lon.iloc[ii] = trip_stop_df.iloc[s]['stop_lon']
            trip_shape_df.shape_pt_lat.iloc[ii] = trip_stop_df.iloc[s]['stop_lat']
            trip_shape_df.is_stop.iloc[ii] = 1
            trip_shape_df.stop_id.iloc[ii] = trip_stop_df.iloc[s]['stop_id']
        
        # appending the gtfs shape for each route shape id
        if i == rail_shape_df.shape_id.unique()[0]:
            shape_flag_df = trip_shape_df.copy()
        else:
            shape_flag_df = shape_flag_df.append(trip_shape_df, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # starting to build new rail links true shape
    linestring_df = pd.DataFrame(columns = ['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id'])

    # rail links are based on the gtfs shape, with nodes being the shapes that are identified as rail stops.
    for i in shape_flag_df.shape_id.unique():
        # get gtfs shape for shape id
        shape_route_df = shape_flag_df[shape_flag_df.shape_id == i].copy()
        
        # get rail nodes based on the stop flags
        break_list = shape_route_df.index[shape_route_df.is_stop == 1].tolist()
        stop_id_list = shape_route_df[shape_route_df.is_stop == 1]['stop_id'].tolist()
        
        # use the gtfs shape between "stop" shapes to build the rail true shape
        for j in range(len(break_list)-1):
            lon_list = shape_flag_df.shape_pt_lon.iloc[break_list[j]:break_list[j+1]+1].tolist()
            lat_list = shape_flag_df.shape_pt_lat.iloc[break_list[j]:break_list[j+1]+1].tolist()
            linestring = LineString([Point(xy) for xy in zip(lon_list,lat_list)])
            linestring_df = linestring_df.append({'shape_id':i, 
                                                  'u':break_list[j], 
                                                  'v':break_list[j+1],
                                                  'u_stop_id':stop_id_list[j], 
                                                  'v_stop_id':stop_id_list[j+1],
                                                  'geometry' : linestring}, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # add rail travel time between stops
    stop_time_df = pd.merge(
                            stop_time_df, 
                            rail_trip_df[['trip_id', 'shape_id']], 
                            how = 'left', 
                            on = 'trip_id')
    
    unique_stop_time_df = stop_time_df[
                                        stop_time_df.shape_id.notnull()
                                    ].groupby(['trip_id', 'shape_id'])\
                                    .count().reset_index()\
                                    .drop_duplicates(subset = ['shape_id']).copy()
    
    stop_time_df = stop_time_df[stop_time_df.trip_id.isin(unique_stop_time_df.trip_id.tolist())].copy()

    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id' , 'departure_time']].rename(
                                 columns = {"stop_id" : "u_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'u_stop_id'])
    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id', 'arrival_time']].rename(
                                 columns = {"stop_id" : "v_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'v_stop_id'])
    
    # travel time in minutes
    linestring_df['rail_traveltime'] = (linestring_df['arrival_time'] - linestring_df['departure_time'])/60
    
    rail_node_df = shape_flag_df[shape_flag_df.is_stop == 1].rename_axis('node_id').reset_index()

    
    return linestring_df, rail_node_df

In [57]:
%%time
rail_path_link_gdf, rail_path_node_df = non_bus_link(all_stop_times_df, all_shapes_df, all_routes_df, trip_df, stop_df)

generating rail links...
[ 404  405  406  407  408  409  410  411  412  413  414  415  416  417
  418  419  420  421  481  483  485  482  484  486  487  668  669  670
  671 1018 1019 1020 1021 1022 1024 1026 1027 1028 1030 1056 1059 1061
 1092 1097 1098 1101 1102 1103 1104 1105 1108 1111 1112 1117 1118 1119
 1120 1122 1123 1124 1126 1127 1129 1131 1132 1134 1137 1138 1139 1140
 1141 1143 1144 1145 1146 1905 1907 1908 1917 1919 1920 1924 1926 1929
 1930]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A v

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A v

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  trip_shape_df['is_stop'] = np.int(0)
A v

Wall time: 14 s


In [58]:
print('{} rail nodes'.format(rail_path_node_df.shape[0]))
print(rail_path_node_df.columns)

print('\n{} rail links with {} unique shapes'.format(rail_path_link_gdf.shape[0],
                                                     rail_path_link_gdf.shape_id.nunique()))
print(rail_path_link_gdf.columns)

1424 rail nodes
Index(['node_id', 'shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence',
       'shape_dist_traveled', 'shape_id', 'is_stop', 'stop_id'],
      dtype='object')

1339 rail links
Index(['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id',
       'departure_time', 'arrival_time', 'rail_traveltime'],
      dtype='object')


### add ACE whose gtfs data doesn't have 'shape' info

In [59]:
ACE_trips_df = trip_df[trip_df.agency_raw_name == "ACE_2017_3_20"]
shape_trip_dict = dict(zip(ACE_trips_df.shape_id, ACE_trips_df.trip_id))

ACE_linestring_df = pd.DataFrame(columns = ['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id'])

#get chained stop
chained_trip_stop_df = pd.merge(all_stop_times_df, all_stops_df, how = "left", on = "stop_id")
    
for i in ACE_trips_df.shape_id.unique():
    trip_id = shape_trip_dict[i]
    #get chained stop
    trip_stop_df = chained_trip_stop_df[chained_trip_stop_df.trip_id == trip_id].copy()
    
    trip_shape_df = trip_stop_df.copy()
    trip_shape_df["is_stop"] = 1
    trip_shape_df["shape_id"] = i
    
    break_list = trip_shape_df.index[trip_shape_df.is_stop == 1].tolist()
    stop_id_list = trip_shape_df[trip_shape_df.is_stop == 1]['stop_id'].tolist()
    
    for j in range(len(trip_stop_df)-1):
        lon_list = trip_shape_df.stop_lon.iloc[j:j+2].tolist()
        lat_list = trip_shape_df.stop_lat.iloc[j:j+2].tolist()
        linestring = LineString([Point(xy) for xy in zip(lon_list,lat_list)])
        ACE_linestring_df = ACE_linestring_df.append({'shape_id':i, 
                                                  'u':break_list[j], 
                                                  'v':break_list[j+1],
                                                  'u_stop_id':stop_id_list[j], 
                                                  'v_stop_id':stop_id_list[j+1],
                                                  'geometry' : linestring}, 
                                                 ignore_index = True, 
                                                 sort = False)
    if i == ACE_trips_df.shape_id.unique()[0]:
        ACE_rail_node_df = trip_shape_df
    else:
        ACE_rail_node_df = ACE_rail_node_df.append(trip_shape_df, ignore_index = False, sort = False)

stop_time_df = pd.merge(all_stop_times_df, 
                            ACE_trips_df[['trip_id', 'shape_id']], 
                            how = 'left', 
                            on = 'trip_id')

unique_stop_time_df = stop_time_df[stop_time_df.shape_id.notnull()
                                    ].groupby(['trip_id', 'shape_id'])\
                                    .count().reset_index()\
                                    .drop_duplicates(subset = ['shape_id']).copy()

stop_time_df = stop_time_df[stop_time_df.trip_id.isin(unique_stop_time_df.trip_id.tolist())].copy()
        
ACE_linestring_df = pd.merge(ACE_linestring_df, 
                        stop_time_df[['shape_id', 'stop_id' , 'departure_time']].rename(
                                 columns = {"stop_id" : "u_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'u_stop_id'])
    
ACE_linestring_df = pd.merge(ACE_linestring_df, 
                        stop_time_df[['shape_id', 'stop_id', 'arrival_time']].rename(
                                 columns = {"stop_id" : "v_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'v_stop_id'])
    
# travel time in minutes
ACE_linestring_df['rail_traveltime'] = (ACE_linestring_df['arrival_time'] - ACE_linestring_df['departure_time'])/60

ACE_rail_node_df = ACE_rail_node_df.rename_axis('node_id').reset_index()
ACE_rail_node_df.rename(columns = {"stop_lat" : "shape_pt_lat", 
                                   "stop_lon" : "shape_pt_lon", 
                                   "stop_sequence": "shape_pt_sequence"},
                       inplace = True)

In [60]:
# combine ACE with rest of rail

rail_path_link_with_ACE_gdf = pd.concat([rail_path_link_gdf, ACE_linestring_df], sort = False, ignore_index = True)
rail_path_node_with_ACE_df = pd.concat([rail_path_node_df, ACE_rail_node_df[rail_path_node_df.columns]], 
                                       sort = False, ignore_index = True)

In [62]:
print('including ACE, {} rail links with {} shapes, and {} rail nodes'.format(
    rail_path_link_with_ACE_gdf.shape[0],
    rail_path_link_with_ACE_gdf.shape_id.nunique(),
    rail_path_node_with_ACE_df.shape[0]))

including ACE, 1357 rail links with 87 shapes, and 1444 rail nodes


### combine bus and rail

In [63]:
def combine_bus_and_rail_shape(rail_path_link, rail_path_node, link, node, shape):
    
    """
    add only unique rail links and nodes to roadway standard
    
    parameter
    -----------
    complete rail link path
    complete rail node path
    all roadway links
    all roadway nodes
    all roadway shapes
    
    return
    -----------
    all roadway and rail links
    all roadway and rail nodes
    all roadway and rail shapes
    unique rail links
    unique rail nodes
    complete rail link path with updated link ID
    
    """
    
    print('indexing rail links and nodes...')
    
    node_gdf = node.copy()
    link_df = link.copy()
    shape_gdf = shape.copy()
    
    # add unique rail nodes to roadway node dataframe
    rail_path_node_gdf = rail_path_node.copy()
    
    unique_rail_node_df = rail_path_node_gdf.drop_duplicates(['shape_pt_lat', 'shape_pt_lon']).copy()
    
    # http://bayareametro.github.io/travel-model-two/input/#roadway-network
    TAP_start_number = 90001 
    
    unique_rail_node_df['model_node_id'] = range(TAP_start_number, TAP_start_number + len(unique_rail_node_df))
    
    rail_path_node_gdf = pd.merge(rail_path_node_gdf, 
                            unique_rail_node_df[['shape_pt_lat', 'shape_pt_lon', 'model_node_id']], 
                            how = 'left', 
                            on = ['shape_pt_lat', 'shape_pt_lon'])
    
    # get unique rail nodes
    unique_rail_node_df['geometry'] = [Point(xy) for xy in zip(unique_rail_node_df.shape_pt_lon, 
                                                               unique_rail_node_df.shape_pt_lat)]
    
    unique_rail_node_df = gpd.GeoDataFrame(unique_rail_node_df)
    unique_rail_node_df.crs = {'init' : 'epsg:4326'}
    unique_rail_node_df = unique_rail_node_df.to_crs(node_gdf.crs)
    
    unique_rail_node_df['rail_only'] = int(1)
    unique_rail_node_df["walk_access"] = int(1)
    
    # combine rail nodes and roadway nodes
    node_gdf["rail_only"] = int(0)
    
    rail_node_columns = ["model_node_id", "geometry", "rail_only", "walk_access"]
    
    roadway_and_rail_node_gdf = node_gdf.append(unique_rail_node_df[rail_node_columns],
                                                ignore_index = True, 
                                                sort = False)
    
    
    rail_node_osmid_dict = dict(zip(rail_path_node_gdf.node_id, rail_path_node_gdf.model_node_id))
    
    rail_path_link_df = rail_path_link.copy()
    
    rail_path_link_df['A'] = rail_path_link_df.u.map(rail_node_osmid_dict)
    rail_path_link_df['B'] = rail_path_link_df.v.map(rail_node_osmid_dict)
    
    rail_path_link_df.drop(["u", "v"], axis = 1, inplace = True)
    
    rail_path_link_df = gpd.GeoDataFrame(rail_path_link_df)
    rail_path_link_df.crs = {'init' : 'epsg:4326'}
    
    # get unique rail links
    unique_rail_link_gdf = rail_path_link_df.drop_duplicates(['A', 'B']).copy()
    
    # fake rail link shst geom id
    unique_rail_link_gdf['shstGeometryId'] = range(1, 1 + len(unique_rail_link_gdf))
    unique_rail_link_gdf['shstGeometryId'] = unique_rail_link_gdf.shstGeometryId.apply(lambda x:'rail'+str(x))
    unique_rail_link_gdf['id'] = unique_rail_link_gdf['shstGeometryId']

    unique_rail_link_gdf['rail_only'] = int(1)
    
    rail_path_link_df = pd.merge(rail_path_link_df,
                                unique_rail_link_gdf[["A", "B", "shstGeometryId"]],
                                how = "left",
                                on = ["A", "B"])
    
    rail_link_columns = ['A', 'B', "shstGeometryId", "rail_traveltime", "rail_only", "id"]
    rail_shape_columns = ["id", "geometry"]
    
    # combine rail and roadway links
    roadway_and_rail_link_df = link_df.append(unique_rail_link_gdf[rail_link_columns], 
                                              ignore_index = True, 
                                              sort = False)
    
    # combine rail and roadway shapes
    roadway_and_rail_shape_gdf = shape_gdf.append(unique_rail_link_gdf[rail_shape_columns],
                                                 ignore_index = True,
                                                 sort = False)
    
    """rail_path_link_df = pd.merge(rail_path_link_df[['shape_id', 'geometry', 'u_stop_id', 'v_stop_id']],
                            unique_rail_shape_gdf.drop(['geometry', 'shape_id'], axis = 1),
                            how = 'left',
                            on = ['u_stop_id', 'v_stop_id'])"""
    
    rail_path_link_df = rail_path_link_df.to_crs({'init' : 'epsg:4326'})
        
    return roadway_and_rail_link_df, roadway_and_rail_node_gdf, roadway_and_rail_shape_gdf, \
                unique_rail_link_gdf, unique_rail_node_df, \
                rail_path_link_df

In [64]:
roadway_and_rail_link_df, \
roadway_and_rail_node_gdf, \
roadway_and_rail_shape_gdf, \
unique_rail_link_gdf, \
unique_rail_node_gdf, \
rail_link_gdf = combine_bus_and_rail_shape(
                                           rail_path_link_with_ACE_gdf, 
                                           rail_path_node_with_ACE_df,
                                           link_df, 
                                           node_gdf,
                                           shape_gdf)

indexing rail links and nodes...


  return _prepare_from_string(" ".join(pjargs))


In [65]:
print('{} roadway links, {} links after adding transit gtfs'.format(link_df.shape[0],
                                                                    roadway_and_rail_link_df.shape[0]))
print('{} roadway nodes, {} nodes after adding transit gtfs'.format(node_gdf.shape[0],
                                                                    roadway_and_rail_node_gdf.shape[0]))
print('{} roadway shapes, {} shapes after adding transit gtfs'.format(shape_gdf.shape[0],
                                                                      roadway_and_rail_shape_gdf.shape[0]))

1632702 roadway links, 1633452 links after adding transit gtfs
643811 roadway nodes, 644480 nodes after adding transit gtfs
868567 roadway shapes, 869317 shapes after adding transit gtfs


### re-number rail nodes and links to be consistent with the county numbering ranges

In [66]:
# county_file = "../../data/external/county_boundaries/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"

# county_gdf = gpd.read_file(county_file)

# # convert to EPSG 4326
# county_gdf = county_gdf.to_crs(shape_gdf.crs)

# # double check
# print(unique_rail_link_gdf.crs)

+init=epsg:4326 +type=crs


In [68]:
# use nearest match for rail nodes

node_county_matched_gdf = node_gdf.copy()
node_county_unmatched_gdf = unique_rail_node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

node_county_unmatched_gdf = node_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
node_county_unmatched_gdf['X'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
node_county_unmatched_gdf['Y'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

node_county_rematch_gdf = pd.DataFrame()

for i in range(len(node_county_unmatched_gdf)):
    point = node_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['model_node_id'] = node_county_unmatched_gdf.iloc[i]['model_node_id']
    
    if i == 0:
        node_county_rematch_gdf = add_snap_gdf.copy()
    else:
        node_county_rematch_gdf = node_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)
        
        
# use nearest match for rail links
node_county_matched_gdf = node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

shape_county_unmatched_gdf = unique_rail_link_gdf.copy()
shape_county_unmatched_gdf["geometry"] = unique_rail_link_gdf.centroid

shape_county_unmatched_gdf = shape_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
shape_county_unmatched_gdf["geometry"] = shape_county_unmatched_gdf["geometry"].centroid
shape_county_unmatched_gdf['X'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
shape_county_unmatched_gdf['Y'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

shape_county_rematch_gdf = pd.DataFrame()

for i in range(len(shape_county_unmatched_gdf)):
    point = shape_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['id'] = shape_county_unmatched_gdf.iloc[i]['id']
    
    if i == 0:
        shape_county_rematch_gdf = add_snap_gdf.copy()
    else:
        shape_county_rematch_gdf = shape_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

  return _prepare_from_string(" ".join(pjargs))

  shape_county_unmatched_gdf["geometry"] = unique_rail_link_gdf.centroid
  return _prepare_from_string(" ".join(pjargs))


In [70]:
display(node_county_rematch_gdf)
display(shape_county_rematch_gdf)

Unnamed: 0,county,model_node_id
0,Contra Costa,90001
1,Contra Costa,90002
2,Contra Costa,90003
3,Contra Costa,90004
4,Contra Costa,90005
...,...,...
664,Alameda,90665
665,Alameda,90666
666,Santa Clara,90667
667,Santa Clara,90668


Unnamed: 0,county,id
0,Contra Costa,rail1
1,Contra Costa,rail2
2,Contra Costa,rail3
3,Contra Costa,rail4
4,Contra Costa,rail5
...,...,...
745,Alameda,rail746
746,Alameda,rail747
747,Alameda,rail748
748,Alameda,rail749


In [73]:
# get the last node and link number of counties

county_last_node_id_df = node_gdf.groupby("county")["model_node_id"].max().reset_index().rename(
    columns = {"model_node_id" : "county_last_id"})

county_last_link_id_df = link_df.groupby("county")["model_link_id"].max().reset_index().rename(
    columns = {"model_link_id" : "county_last_id"})

node_county_rematch_gdf = pd.merge(
    node_county_rematch_gdf.rename(columns = {"model_node_id" : "rail_node_id"}),
    county_last_node_id_df,
    how = "left",
    on = "county"
)

node_county_rematch_gdf["model_node_id"] = node_county_rematch_gdf.groupby(["county"]).cumcount() + 1

node_county_rematch_gdf["model_node_id"] = node_county_rematch_gdf["model_node_id"] + node_county_rematch_gdf["county_last_id"]

shape_county_rematch_gdf = pd.merge(
    shape_county_rematch_gdf,
    county_last_link_id_df,
    how = "left",
    on = "county"
)

shape_county_rematch_gdf["model_link_id"] = shape_county_rematch_gdf.groupby(["county"]).cumcount() + 1

shape_county_rematch_gdf["model_link_id"] = shape_county_rematch_gdf["model_link_id"] + shape_county_rematch_gdf["county_last_id"]

In [74]:
# update to model_node_id and model_link_id for rail

# create dictionary
rail_node_id_dict = dict(zip(node_county_rematch_gdf.rail_node_id, node_county_rematch_gdf.model_node_id))
rail_node_county_dict = dict(zip(node_county_rematch_gdf.model_node_id, node_county_rematch_gdf.county))
rail_link_id_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.model_link_id))
rail_link_county_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.county))

# map to model_node_id and model_link_id
unique_rail_node_gdf["model_node_id"] = unique_rail_node_gdf["model_node_id"].map(rail_node_id_dict)
unique_rail_node_gdf["county"] = unique_rail_node_gdf["model_node_id"].map(rail_node_county_dict)
unique_rail_link_gdf["model_link_id"] = unique_rail_link_gdf["id"].map(rail_link_id_dict)
unique_rail_link_gdf["county"] = unique_rail_link_gdf["id"].map(rail_link_county_dict)

# update A/B
unique_rail_link_gdf["A"] = unique_rail_link_gdf["A"].map(rail_node_id_dict)
unique_rail_link_gdf["B"] = unique_rail_link_gdf["B"].map(rail_node_id_dict)

rail_link_gdf["A"] = rail_link_gdf["A"].map(rail_node_id_dict)
rail_link_gdf["B"] = rail_link_gdf["B"].map(rail_node_id_dict)

In [75]:
# clean up and combine with roadway links and shapes

# combine rail and roadway nodes
rail_node_columns = ["model_node_id", "geometry", "rail_only", "walk_access", "county"]
    
roadway_and_rail_node_gdf = node_gdf.append(unique_rail_node_gdf[rail_node_columns],
                                            ignore_index = True, 
                                            sort = False)

roadway_and_rail_node_gdf["rail_only"].fillna(0, inplace = True)
roadway_and_rail_node_gdf["rail_only"] = roadway_and_rail_node_gdf["rail_only"].astype(int)
    
# combine rail and roadway links
rail_link_columns = ['A', 'B', "shstGeometryId", "rail_traveltime","rail_only", "id", "model_link_id", "county"]

roadway_and_rail_link_df = link_df.append(unique_rail_link_gdf[rail_link_columns], 
                                              ignore_index = True, 
                                              sort = False)

roadway_and_rail_link_df["rail_only"].fillna(0, inplace = True)
roadway_and_rail_link_df["rail_only"] = roadway_and_rail_link_df["rail_only"].astype(int)
    
# combine rail and roadway shapes
rail_shape_columns = ["id", "geometry"]

roadway_and_rail_shape_gdf = shape_gdf.append(unique_rail_link_gdf[rail_shape_columns],
                                                 ignore_index = True,
                                                 sort = False)

In [79]:
print('after reassigning model_node_id and model_link_id,\n roadway and rail networks have {} links, {} link shapes, {} nodes'.format(
        roadway_and_rail_link_df.shape[0],
        roadway_and_rail_shape_gdf.shape[0],
        roadway_and_rail_node_gdf.shape[0]))

after reassigning model_node_id and model_link_id,
 roadway and rail networks have 1633452 links, 869317 link shapes, 644480 nodes


# modify and write out Transit standard files

In [90]:
# transit frequence based stop time

def create_freq_table(trip_df):
    
    """
    create frequency table for network standard
    
    trip_df: representative trips from gtfs
    
    """
    
    print('creating frequency reference...')
    
    # calculate EA and NT frequency using 5-6am, and 7-10pm
    tod_numhours_dict = {"AM" : 4, "MD" : 5, "PM" :4, "NT" : 3, "EA" : 1}
    
    freq_df = trip_df[['trip_id', 'tod', 'direction_id', 'trip_num']].copy()
    freq_df['headway_secs'] = freq_df.tod.map(tod_numhours_dict)
    freq_df['headway_secs'] = freq_df.apply(lambda x: int(x.headway_secs * 60 * 60 / x.trip_num),
                                           axis = 1)
    
    freq_enum_list = {'start_time' : {'AM' : '06:00:00', 
                                      'MD' : '10:00:00',
                                      "PM" : "15:00:00",
                                      "NT" : "19:00:00",
                                      "EA" : "03:00:00"},
                      'end_time' : {'AM' : '10:00:00', 
                                    'MD' : '15:00:00',
                                    "PM" : "19:00:00",
                                    "NT" : "03:00:00",
                                    "EA" : "06:00:00"}}
    
    freq_df['start_time'] = freq_df.tod.map(freq_enum_list.get("start_time"))
    freq_df['end_time'] = freq_df.tod.map(freq_enum_list.get("end_time"))
    
    return freq_df

In [82]:
freq_df = create_freq_table(trip_df)

display(freq_df)

creating frequency reference...


Unnamed: 0,trip_id,tod,direction_id,trip_num,headway_secs,start_time,end_time
0,1,EA,0.0,1.0,3600,03:00:00,06:00:00
1,5,AM,0.0,2.0,7200,06:00:00,10:00:00
2,4,PM,1.0,4.0,3600,15:00:00,19:00:00
3,9,EA,1.0,1.0,3600,03:00:00,06:00:00
4,10,AM,1.0,5.0,2880,06:00:00,10:00:00
...,...,...,...,...,...,...,...
3975,33661,PM,1.0,3.0,4800,15:00:00,19:00:00
3976,33510,AM,0.0,5.0,2880,06:00:00,10:00:00
3977,33515,PM,0.0,6.0,2400,15:00:00,19:00:00
3978,33521,NT,0.0,1.0,10800,19:00:00,03:00:00


In [85]:
# create new shape with complete node list the route passes
def create_new_node_shape(node, bus_link, rail_link = pd.DataFrame(columns = ["u", "v", "shape_id", "A", "B"])):
    
    """
    create complete node lists each transit traverses to replace the gtfs shape.txt
    """
    bus_link_df = bus_link.copy()
    bus_trip_list_with_unique_shape_id = bus_link_df.drop_duplicates(subset = ["shape_id"]).trip_id.tolist()
    
    bus_link_df = bus_link_df[bus_link_df.trip_id.isin(bus_trip_list_with_unique_shape_id)].copy()
    
    shape_link_df = pd.concat([bus_link_df[["u", "v", 'shape_id', "A", "B"]]
                                , rail_link[['shape_id', "A", "B"]]],
                               sort = False,
                               ignore_index = True)
    
    shape_link_df.u = shape_link_df.u.fillna(0).astype(np.int64)
    shape_link_df.v = shape_link_df.v.fillna(0).astype(np.int64)

    shape_point_df = gpd.GeoDataFrame()
    
    for shape_id in shape_link_df.shape_id.unique():
        shape_df = shape_link_df[shape_link_df.shape_id == shape_id]
        point_df = pd.DataFrame(data = {"shape_id" : shape_id,
                                         "shape_osm_node_id" : shape_df.u.tolist() + [shape_df.v.iloc[-1]],
                                        "shape_model_node_id" : shape_df.A.tolist() + [shape_df.B.iloc[-1]],
                                       "shape_pt_sequence" : range(1, 1+len(shape_df)+1)})
   
        shape_point_df = pd.concat([shape_point_df,
                                   point_df],
                                  sort = False,
                                  ignore_index = True)

    shape_point_df = pd.merge(shape_point_df,
                             node[["osm_node_id", "shst_node_id", "model_node_id", "geometry"]],
                             how = "left",
                             left_on = "shape_model_node_id",
                             right_on = "model_node_id")
    
    shape_point_df.crs = {'init' : 'epsg:4326'}
    
    print('double check: points on route shapes missing geometry: \n{}'.format(shape_point_df[shape_point_df.geometry.isnull()]))
    
    shape_point_df["shape_pt_lat"] = shape_point_df.geometry.map(lambda g:g.y)
    shape_point_df["shape_pt_lon"] = shape_point_df.geometry.map(lambda g:g.x)
    
    shape_point_df["shape_id"] = shape_point_df["shape_id"].astype(int)
    
    shape_point_df.rename(columns = {"shst_node_id":"shape_shst_node_id"}, inplace = True)
        
    return shape_point_df[["shape_id", "shape_pt_sequence", "shape_osm_node_id", "shape_shst_node_id", "shape_model_node_id"]]

In [86]:
shape_point_df = create_new_node_shape(roadway_and_rail_node_gdf, bus_link_df, rail_link_gdf)

  shape_point_df.crs = {'init' : 'epsg:4326'}


double check: points on route shapes missing geometry: 
Empty DataFrame
Columns: [shape_id, shape_osm_node_id, shape_model_node_id, shape_pt_sequence, osm_node_id, shst_node_id, model_node_id, geometry]
Index: []


In [87]:
display(shape_point_df.head(10))

shape_point_df.info()

Unnamed: 0,shape_id,shape_pt_sequence,shape_osm_node_id,shape_shst_node_id,shape_model_node_id
0,359,1,57808643,e7d44e84d34760e4d8ea8359e0bfe7b8,3047632
1,359,2,57808641,acccdab7b5c416d0cb08cd3a454c133c,3052718
2,359,3,57808639,4dea96f8099eb6899778d359f0181aba,3006601
3,359,4,57808637,9ef8fb82ea20dc287ed1ebebb20de60e,3070234
4,359,5,57808634,7728d88555ea4a62cd09d1269e727e4c,3026926
5,359,6,4924860621,b19d9c7759ec8df17aa2b9a261efc09e,3027418
6,359,7,4924860622,2a96e6d6c5b8e32ac127ae8aeb94a596,3038051
7,359,8,57808632,e54f087c7f4ed83cd6ae2da9ce86dd01,3013244
8,359,9,57808630,4e948b323a1eb20998aafc5ac91850cb,3007700
9,359,10,57807822,df689924893e875ae12af8b7471f4a3c,3023434


<class 'pandas.core.frame.DataFrame'>
Int64Index: 350940 entries, 0 to 350939
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   shape_id             350940 non-null  int32 
 1   shape_pt_sequence    350940 non-null  int64 
 2   shape_osm_node_id    350940 non-null  int64 
 3   shape_shst_node_id   349496 non-null  object
 4   shape_model_node_id  350940 non-null  int64 
dtypes: int32(1), int64(3), object(1)
memory usage: 14.7+ MB


In [88]:
def write_out_transit_standard(trip, stop, shape_point, freq, stop_times, routes, trips, rail_node = None):
    
    shape_point_df = shape_point.copy()
    trip_df = trip.copy()
        
    trip_df = trip_df[~ trip_df.agency_raw_name.isin(["Petaluma_2016_5_22", "WestCAT_2016_5_26", "GGFerries_2017_3_18"])].copy()
    
    trip_df["shape_id"] = trip_df["shape_id"].astype(int)
    
    trip_df = trip_df[trip_df.shape_id.isin(shape_point_df.shape_id.unique().tolist())]
    
    final_trip_list = trip_df.trip_id.unique().tolist()
    
    freq_df = freq.copy()
    freq_df = freq_df[freq_df.trip_id.isin(final_trip_list)]
    
    stop_df = stop.copy()
    
    if len(rail_node) > 0:
        rail_node_df = rail_node.copy()
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist() 
                                               else x.model_node_id,
                                                axis = 1)
        stop_df['osm_node_id'] = stop_df.apply(lambda x: ""
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.osm_node_id,
                                                axis = 1)
        stop_df['shst_node_id'] = stop_df.apply(lambda x: '' 
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.shst_node_id,
                                                axis = 1)
    

    stop_times_df = stop_times.copy()
    stop_times_df = stop_times_df[stop_times_df.trip_id.isin(final_trip_list)]
    
    # update time to relative time for frequency based transit system
    stop_times_df['first_arrival'] = stop_times_df.groupby(['trip_id'])['arrival_time'].transform(min)
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'] - stop_times_df['first_arrival']
    stop_times_df['departure_time'] = stop_times_df['departure_time'] - stop_times_df['first_arrival']
    
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'].apply(
        lambda x : time.strftime('%H:%M:%S', time.gmtime(x)) if ~np.isnan(x) else x)
    stop_times_df['departure_time'] = stop_times_df['departure_time'].apply(
        lambda x : time.strftime('%H:%M:%S', time.gmtime(x)) if ~np.isnan(x) else x)

    
    stop_times_df.drop(['first_arrival'], axis = 1, inplace = True)
    
    route_df = routes.copy()
    route_df = route_df[route_df.route_id.isin(trip_df.route_id.tolist())]
    
    route_df.to_csv(output_folder + "routes.txt", 
                    index = False, 
                    sep = ',')
   
    shape_point_df.to_csv(output_folder + "shapes.txt", 
                          index = False, 
                          sep = ',')
  
    trip_df[trips.columns.values].to_csv(output_folder + "trips.txt", 
                                              index = False, 
                                              sep = ',')
  
    freq_df[['trip_id', 'headway_secs', 'start_time', 'end_time']].to_csv(output_folder + "frequencies.txt", 
                                                index = False, 
                                                sep = ',')
    
    stop_df.to_csv(output_folder + "stops.txt", 
                   index = False, 
                   sep = ',')
   
    stop_times_df.to_csv(output_folder + "stop_times.txt", 
                         index = False, 
                         sep = ',')


In [89]:
write_out_transit_standard(trip_df, 
                           stop_df, 
                           shape_point_df, 
                           freq_df, 
                           all_stop_times_df,
                           all_routes_df,
                           all_trips_df,
                           unique_rail_node_gdf)

# create rail-walk access links and add to the network

In [91]:
def create_transit_access_link(all_link, all_node, all_shape):
    
    """
    create rail walk access/egress links
    """
    
    tran_node_df = all_node[all_node.rail_only == 1].copy()
    walk_node_df = all_node[(all_node.walk_access == 1) & (all_node.rail_only == 0)].copy().reset_index(drop = True)
    
    walk_node_df = walk_node_df.to_crs({'init' : 'epsg:26915'})
    walk_node_df['X'] = walk_node_df.geometry.map(lambda g:g.x)
    walk_node_df['Y'] = walk_node_df.geometry.map(lambda g:g.y)
    inventory_node_ref = walk_node_df[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    tran_node_df = tran_node_df.to_crs({'init' : 'epsg:26915'})
    tran_node_df['X'] = tran_node_df.geometry.map(lambda g:g.x)
    tran_node_df['Y'] = tran_node_df.geometry.map(lambda g:g.y)
    
    for i in range(len(tran_node_df)):
        point = tran_node_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_node_gdf = gpd.GeoDataFrame(walk_node_df.iloc[ii]).transpose().reset_index(drop = True)
        add_node_gdf['tran_node'] = tran_node_df.iloc[i].model_node_id
        add_node_gdf['geometry_tran'] = tran_node_df.iloc[i].geometry
        
        if i == 0:
            rail_access_gdf = add_node_gdf.copy()
        else:
            rail_access_gdf = rail_access_gdf.append(add_node_gdf, ignore_index=True, sort=False)
    
    rail_access_gdf.rename(columns = {'geometry' : "geometry_walk"}, inplace = True)

    
    rail_access_gdf['geometry'] = [LineString(xy) for xy in zip(rail_access_gdf['geometry_walk'], 
                                                                rail_access_gdf['geometry_tran'])]
    
    # fake rail link shst geom id
    rail_access_gdf['shstGeometryId'] = range(1, 
                                     1 + len(rail_access_gdf))
    
    rail_access_gdf['shstGeometryId'] = rail_access_gdf.shstGeometryId.apply(lambda x:'walktorail'+str(x))
    rail_access_gdf['id'] = rail_access_gdf['shstGeometryId']
    
    rail_access_gdf["fromIntersectionId"] = rail_access_gdf.shst_node_id

    rail_access_gdf_copy = rail_access_gdf.copy()
    rail_access_gdf.rename(columns = {'model_node_id' : 'A', 'tran_node' : 'B'}, inplace = True)
    
    rail_access_gdf_copy.rename(columns = {'tran_node' : 'A', 'model_node_id' : 'B'}, inplace = True)
    
    rail_access_gdf = pd.concat(
                            [rail_access_gdf[['A', 'B', 'geometry', 'shstGeometryId', "id", "fromIntersectionId"]],
                            rail_access_gdf_copy[['A', 'B', 'geometry','shstGeometryId', "id", "fromIntersectionId"]]],
                               ignore_index = True,
                               sort = False)
    
    rail_access_gdf = gpd.GeoDataFrame(rail_access_gdf)
    rail_access_gdf.crs = {'init' : 'epsg:26915'}
    rail_access_gdf = rail_access_gdf.to_crs(all_node.crs)
    
    rail_access_gdf['walk_access'] = 1
    
    rail_access_link_columns = ["A", "B", "shstGeometryId", "walk_access", "id"]
    rail_access_shape_columns = ["id", "fromIntersectionId", "geometry"]
    
    all_link_df = all_link.copy()
    all_shape_gdf = all_shape.copy()
    
        
    all_shape_gdf = pd.concat([
                                all_shape_gdf,
                                rail_access_gdf[rail_access_shape_columns].drop_duplicates(
                                                                        subset = ["id"])
                              ],
                             sort = False,
                             ignore_index= True)

    
    all_link_df = pd.concat([all_link_df, 
                             rail_access_gdf[rail_access_link_columns]], 
                            ignore_index = True, 
                            sort = False)
    
    all_link_gdf = pd.merge(all_link_df,
                           all_shape_gdf,
                           how = "left",
                           left_on = "shstGeometryId",
                           right_on = "id")
    
    geom_length = gpd.GeoDataFrame(all_link_gdf[['geometry']])
    geom_length.crs = all_node.crs
    geom_length = geom_length.to_crs(epsg = 26915)
    geom_length["length"] = geom_length.length

    all_link_df["length"] = geom_length["length"]

    return all_link_df, all_shape_gdf


In [92]:
%%time
all_link_df, all_shape_gdf = create_transit_access_link(roadway_and_rail_link_df, 
                                                        roadway_and_rail_node_gdf,
                                                        roadway_and_rail_shape_gdf)

  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


Wall time: 1min 45s


In [96]:
print('before adding rail walk access links, the network has {} links, with {} shapes'.format(
        roadway_and_rail_link_df.shape[0],
        roadway_and_rail_shape_gdf.shape[0]))
print('after adding rail walk access links, the network has {} links, with {} shapes'.format(
        all_link_df.shape[0],
        all_shape_gdf.shape[0]))

before adding rail walk access links, the network has 1633452 links, with 869317 shapes
after adding rail walk access links, the network has 1634790 links, with 869986 shapes


In [97]:
# use nearest to get county match for rail walk access links
node_county_matched_gdf = node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

link_county_unmatched_gdf = all_link_df[all_link_df.model_link_id.isnull()].copy()
link_county_unmatched_gdf = pd.merge(link_county_unmatched_gdf, all_shape_gdf[["id", "geometry"]], how = "left", on = "id")
link_county_unmatched_gdf = gpd.GeoDataFrame(link_county_unmatched_gdf, 
                                             geometry = link_county_unmatched_gdf.geometry, 
                                             crs = all_shape_gdf.crs)

link_county_unmatched_gdf = link_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
link_county_unmatched_gdf['X'] = link_county_unmatched_gdf['geometry'].apply(lambda p: p.centroid.x)
link_county_unmatched_gdf['Y'] = link_county_unmatched_gdf['geometry'].apply(lambda p: p.centroid.y)

link_county_rematch_gdf = pd.DataFrame()

for i in range(len(link_county_unmatched_gdf)):
    point = link_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf = pd.concat([add_snap_gdf,
                              link_county_unmatched_gdf.drop("county", axis = 1).iloc[[i]].reset_index(drop = True)], 
                               axis = 1) 
    
    if i == 0:
        link_county_rematch_gdf = add_snap_gdf.copy()
    else:
        link_county_rematch_gdf = link_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

link_county_rematch_gdf.crs = link_county_unmatched_gdf.crs
link_county_rematch_gdf = link_county_rematch_gdf.to_crs(node_gdf.crs)

  return _prepare_from_string(" ".join(pjargs))


In [98]:
link_county_rematch_gdf.info()
link_county_rematch_gdf.county.value_counts()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   county                  1338 non-null   object  
 1   access                  0 non-null      object  
 2   area                    0 non-null      object  
 3   bike_access             0 non-null      float64 
 4   bridge                  0 non-null      object  
 5   drive_access            0 non-null      float64 
 6   est_width               0 non-null      object  
 7   fromIntersectionId      0 non-null      object  
 8   highway                 0 non-null      object  
 9   id                      1338 non-null   object  
 10  junction                0 non-null      object  
 11  key                     0 non-null      object  
 12  landuse                 0 non-null      object  
 13  lanes                   0 non-null      object  
 14  link            

San Francisco    866
Santa Clara      300
Alameda           70
San Mateo         58
Contra Costa      28
Marin             10
Solano             6
Name: county, dtype: int64

In [99]:
# assign model_link_id to train-walk access links

# get the last link number of counties
county_last_link_id_df = roadway_and_rail_link_df.groupby("county")["model_link_id"].max().reset_index().rename(
    columns = {"model_link_id" : "county_last_id"})

link_county_rematch_gdf = pd.merge(
    link_county_rematch_gdf,
    county_last_link_id_df,
    how = "left",
    on = "county"
)

# assign model_link_id
link_county_rematch_gdf["model_link_id"] = link_county_rematch_gdf.groupby(["county"]).cumcount() + 1
link_county_rematch_gdf["model_link_id"] = link_county_rematch_gdf["model_link_id"] + link_county_rematch_gdf["county_last_id"]

In [100]:
# combine rail+roadway links and rail-walk access links

all_link_df = roadway_and_rail_link_df.append(link_county_rematch_gdf, 
                                              ignore_index = True, 
                                              sort = False)

In [101]:
# double check total number of links
all_link_df.model_link_id.nunique()

1634790

In [102]:
# number geometry increse should be the number of transit nodes: 664+5
print(roadway_and_rail_link_df.shstGeometryId.nunique())
print(roadway_and_rail_shape_gdf.id.nunique())
print(roadway_and_rail_shape_gdf.shape)
print(all_shape_gdf.id.nunique())
print(all_shape_gdf.shape)
print(all_link_df.shstGeometryId.nunique())

869317
869317
(869317, 6)
869986
(869986, 6)
869986


In [103]:
# number of link increase should be 2 times of transit nodes : 1328+10

print(roadway_and_rail_link_df.shape)
print(all_link_df.shape)

(1633452, 38)
(1634790, 42)


In [104]:
# check number of links and model_link_id by county
print(all_link_df.county.value_counts(dropna=False))
print()
print(all_link_df.groupby("county")["model_link_id"].max())

Santa Clara      517014
Alameda          315462
Contra Costa     237498
San Mateo        141024
Sonoma           135042
Solano           117114
San Francisco     74758
Marin             63198
Napa              33680
Name: county, dtype: int64

county
Alameda          3315461
Contra Costa     4237497
Marin            8063197
Napa             6033679
San Francisco      74758
San Mateo        1141023
Santa Clara      2517013
Solano           5117113
Sonoma           7135041
Name: model_link_id, dtype: int64


# write out network standard with rail nodes and links

In [107]:
# clean up
all_shape_gdf = fill_na(all_shape_gdf)

int_col = ["bike_access", "walk_access", "drive_access", "rail_only"]
for c in int_col:
    all_link_df[c] = all_link_df[c].fillna(0).astype(np.int64)
    
int_col = ["bike_access", "walk_access", "drive_access", "rail_only"]
for c in int_col:
    roadway_and_rail_node_gdf[c] = roadway_and_rail_node_gdf[c].fillna(0).astype(np.int64)

numeric columns:  []
str columns:  ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']


In [108]:
# files to write out
all_link_df.info()
all_shape_gdf.info()
roadway_and_rail_node_gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634790 entries, 0 to 1634789
Data columns (total 42 columns):
 #   Column                  Non-Null Count    Dtype   
---  ------                  --------------    -----   
 0   access                  1632702 non-null  object  
 1   area                    1632702 non-null  object  
 2   bike_access             1634790 non-null  int64   
 3   bridge                  1632702 non-null  object  
 4   drive_access            1634790 non-null  int64   
 5   est_width               1632702 non-null  object  
 6   fromIntersectionId      1632702 non-null  object  
 7   highway                 1632702 non-null  object  
 8   id                      1634790 non-null  object  
 9   junction                1632702 non-null  object  
 10  key                     1632702 non-null  object  
 11  landuse                 1632702 non-null  object  
 12  lanes                   1632702 non-null  object  
 13  link                    1632702 non-null  

In [111]:
%%time

print("-------write out link shape geojson---------")

shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(all_shape_gdf, shape_prop)

with open(data_interim_dir + "step6_gtfs/version_12/shape.geojson", "w") as f:
    json.dump(shape_geojson, f)

-------write out link shape geojson---------
Wall time: 0 ns


In [None]:
%%time

# write out link variable json
# link unique handle "shstReferenceId" + "shstGeometryId"

print("-------write out link json---------")

link_prop = all_link_df.drop(["county_numbering_start", "X", "Y", "county_last_id", "geometry"], axis = 1).columns.tolist()

out = all_link_df[link_prop].to_json(orient = "records")

with open(data_interim_dir + "step6_gtfs/version_12/link.json", 'w') as f:
    f.write(out)

In [None]:
%%time

print("-------write out node geojson---------")

node_prop = roadway_and_rail_node_gdf.drop(["geometry", "county_numbering_start"], axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(roadway_and_rail_node_gdf, node_prop)

with open(data_interim_dir + "step6_gtfs/version_12/node.geojson", "w") as f:
    json.dump(node_geojson, f)

In [None]:
print("-------write out link feather---------")

link_feather = all_link_df.drop(["county_numbering_start", "X", "Y", "county_last_id", "geometry"], axis = 1).copy()

link_feather.to_feather(data_interim_dir + 'step6_gtfs/version_12/link.feather')

# write out to CUBE .lin

In [112]:
def prepare_df_for_cube(routes, mode_crosswalk, trip, bus_link, freq,
                        rail_link = pd.DataFrame(columns = ['shape_id'])):
    
    
    transit_link_gdf = pd.concat([bus_link[['shape_id']], rail_link[['shape_id']]], 
                                 sort = False, ignore_index = True)
    trip_df = trip.copy()
    
    trip_df = trip_df[trip_df.shape_id.isin(transit_link_gdf.shape_id.unique().tolist())]
    
    trip_df = pd.merge(trip_df.drop("agency_raw_name", axis = 1), routes, how = 'left', on = 'route_id')
    
    trip_df = trip_df[~ trip_df.agency_raw_name.isin(["Petaluma_2016_5_22", "WestCAT_2016_5_26", "GGFerries_2017_3_18"])].copy()
    
    trip_df = pd.merge(trip_df, freq[['trip_id','headway_secs']], how = 'left', on = 'trip_id')
    
    #trip_df['tod'] = np.where(trip_df.tod == 'peak', 'pk', 'op')
    
    trip_df['NAME'] = trip_df.apply(lambda x: str(x.agency_id) + '_' + str(x.route_id) + '_' 
                                    + str(x.route_short_name) 
                                    #+ '_'
                                    #+ x.tod 
                                    #+ str(x.direction_id)
                                    , 
                                    axis = 1)
    trip_df['LONGNAME'] = trip_df['route_long_name']
    trip_df['HEADWAY'] = (trip_df['headway_secs']/60).astype(int)
    
    """
    def mode_gtfs_to_muni(x):
        if x.route_type == 0:
            return 15
        elif x.route_type == 5:
   cube_trip_df = prepare_df_for_cube(all_routes_df, 
                                   gtfs_to_tm2_mode_crosswalk_df,
                                trip_df, 
                                bus_link_df, 
                                freq_df, 
                                rail_link_gdf)         return 14
        elif x.route_short_name[-1] in ["X", "R"]:
            return 12
        else:
            return 11
    
    trip_df['MODE'] = trip_df.apply(lambda x : mode_gtfs_to_muni(x),
                                   axis = 1)
    """
    
    trip_df = pd.merge(
        trip_df,
        mode_crosswalk.drop("agency_id", axis = 1),
        how = "left",
        on = ["agency_raw_name", "route_type"]
    )
    
    trip_df['TM2_mode'].fillna(11, inplace = True)
    trip_df['TM2_mode'] = trip_df['TM2_mode'].astype(int)
    
    trip_df['ONEWAY'] = 'T'
    
    return trip_df

In [113]:
# read gfts to TM2 mode crosswalk
gtfs_to_tm2_mode_crosswalk_df = pd.read_csv(data_interim_dir + "gtfs_to_tm2_mode_crosswalk.csv")

gtfs_to_tm2_mode_crosswalk_df.drop_duplicates(subset = ["agency_raw_name", "route_type"], inplace = True)

In [114]:
cube_trip_df = prepare_df_for_cube(all_routes_df, 
                                   gtfs_to_tm2_mode_crosswalk_df,
                                   trip_df, 
                                   bus_link_df, 
                                   freq_df, 
                                   rail_link_gdf)

In [115]:
cube_trip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3947 entries, 0 to 3946
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   service_id                    3947 non-null   object 
 1   trip_headsign                 3572 non-null   object 
 2   direction_id                  3947 non-null   float64
 3   block_id                      2780 non-null   object 
 4   trip_short_name               363 non-null    object 
 5   wheelchair_accessible         105 non-null    float64
 6   bikes_allowed                 610 non-null    float64
 7   original_trip_id              147 non-null    object 
 8   trip_bikes_allowed            8 non-null      object 
 9   trip_type                     0 non-null      object 
 10  drt_max_travel_time           18 non-null     object 
 11  drt_avg_travel_time           18 non-null     object 
 12  drt_advance_book_min          18 non-null     object 
 13  drt

In [116]:
def node_list(x, trip_df, stop_df, bus_link, stop_times, node_gdf, link_gdf, 
              rail_link = pd.DataFrame(columns = ['shape_id', 'u', 'v']), rail_node_df = None):
    print(x.trip_id)
    
    if len(rail_node_df) > 0:
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist()
                                               else x.model_node_id,
                                    axis = 1)
        
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['u', 'v', 'shape_id', 'trip_id' , "A", "B"]], 
                                  rail_link_df[['shape_id', 'trip_id', "A", "B"]]], 
                                 sort = False, 
                                 ignore_index = True)
    
    """transit_link_gdf = pd.merge(transit_link_gdf,
                               link_gdf[['LINK_ID',"u","v"]],
                               how = "left",
                               on = ["u","v"])"""
   
    stop_times_df = stop_times.copy()
    stop_id_list = stop_times_df[stop_times_df.trip_id == x.trip_id]['stop_id'].tolist()
    stop_node_list = stop_df[stop_df['stop_id'].isin(stop_id_list)]['model_node_id'].tolist()
    
    #print(stop_node_list)
    
    node_list = transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['A'].tolist() + \
                    [transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['B'].iloc[-1]]
    
    #osmid_N_dict = dict(zip(node_gdf.OSMID, node_gdf.N))
    
    #node_list = list(map(osmid_N_dict.get, node_list))
    
    s = '\nLINE NAME=\"%s\",' % (x.NAME,)
    
    #line attribtes
    s += '\n LONGNAME=\"%s",' % (x.LONGNAME,)
    s += '\n USERA1=\"%s",' % (x.agency_id,)
    s += '\n USERA2=\"%s",' % (x.TM2_line_haul_name,)
    if x.tod == 'AM':
        s += '\n HEADWAY[2]=%s,' % (x.HEADWAY,)
    elif x.tod == "MD":
        s += '\n HEADWAY[3]=%s,' % (x.HEADWAY,)
    elif x.tod == "PM":
        s += '\n HEADWAY[4]=%s,' % (x.HEADWAY,)
    elif x.tod == "NT":
        s += '\n HEADWAY[5]=%s,' % (x.HEADWAY,)
    elif x.tod == "EA":
        s += '\n HEADWAY[1]=%s,' % (x.HEADWAY,)
    s += '\n MODE=%s,' % (x.TM2_mode,)
    s += '\n ONEWAY=%s,' % (x.ONEWAY,)
    s += '\n OPERATOR=%s,' % (x.agency_id,)
    s += '\n SHORTNAME=%s,' % (x.route_short_name,)
    s += '\nN='
    
    circular = 0
    #node list
    trip_stop_list = []
    for nodeIdx in range(len(node_list)):
        # added condition to make sure stops only get stopped once
        if (node_list[nodeIdx] in stop_node_list) & (node_list[nodeIdx] not in trip_stop_list):
            s += '\n %s' % (node_list[nodeIdx])
            trip_stop_list += [node_list[nodeIdx]]
            if nodeIdx < (len(node_list)-1):
                s += ','
        else:
            s += '\n -%s' % (node_list[nodeIdx])
            if nodeIdx < (len(node_list)-1):
                s += ','
            if (node_list[nodeIdx] in trip_stop_list):
                circular = 1
    
    #if circular == 1:
    #    s += ','
    #    s += '\n CIRCULAR=T'                
            
    lines.append(s)

In [118]:
%%time

lines = [';;<<PT>><<LINE>>;;']

cube_trip_df.apply(lambda x: node_list(x, 
                                    trip_df,
                                    stop_df, 
                                    bus_link_df,
                                    all_stop_times_df,
                                    roadway_and_rail_node_gdf,
                                    roadway_and_rail_link_df,
                                    rail_link_gdf,
                                    unique_rail_node_gdf), 
                axis=1)

with open(output_folder + "transit.LIN", 'w') as f:
    f.write("\n".join(map(str, lines)))

# write out quick QA/QC transit route true shape

In [119]:
# true shapes for line record

from shapely import ops, geometry

def get_true_line_shape(trip_df, bus_link, roadway_and_rail_shape,
                        rail_link = pd.DataFrame(columns = ['LINK_ID','shape_id', 'u', 'v'])):
    
    """
    write out true shape for each trip
    """
    
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['shape_id', 'trip_id', "shstGeometryId"]], 
                                  rail_link_df[['shape_id', 'trip_id', "shstGeometryId"]]], 
                                 sort = False, ignore_index = True)
    
    transit_link_gdf = pd.merge(transit_link_gdf,
                                roadway_and_rail_shape[['id', 'geometry']],
                                how = 'left',
                                left_on = 'shstGeometryId',
                               right_on = "id")
    
    true_line_shape_df = transit_link_gdf.groupby(['trip_id', 'shape_id'])['geometry'].agg(
                                                                lambda x: 
                                                                ops.linemerge(geometry.MultiLineString(x.tolist())))\
                                        .reset_index()
    
    """true_line_shape_df = pd.merge(true_line_shape_df, 
                                  cube,
                                 how = 'left',
                                 on = ['shape_id', 'trip_id'])"""
    
    true_line_shape_gdf = gpd.GeoDataFrame(true_line_shape_df, 
                                           crs = roadway_and_rail_shape.crs, 
                                           geometry = 'geometry')
    
    return true_line_shape_gdf

In [120]:
true_line_shape_gdf = get_true_line_shape(trip_df,
                                          bus_link_df, 
                                          roadway_and_rail_shape_gdf,
                                          rail_link_gdf)

In [121]:
true_line_shape_gdf.to_file(data_interim_dir + "step6_gtfs/transit_route.geojson",
                            driver = "GeoJSON")