# This notebook goes through building transit network from gtfs to network standard

1. extract represetative trips
2. snap stops to roadway nodes
3. route bus on roadway via osmnx routing
4. route bus on roadway via shst routing
5. build non-bus/rail links and nodes
6. complete network node list that each transit path traverses
7. frequence based stop time
8. write out to transit network standard
9. write out quick QA/QC transit route true shape
10. write out network standard with rail nodes and links
11. write out travel model transit network

In [1]:
import partridge as ptg
import peartree as pt
#%matplotlib inline
import requests
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString
import networkx as nx
from shapely import wkt
from scipy.spatial import cKDTree
import osmnx as ox
from dbfread import DBF
from osgeo import ogr
import glob
import time
import json
import os

In [2]:
from methods import link_df_to_geojson
from methods import point_df_to_geojson

In [3]:
data_interim_dir = "../../data/interim/"

In [4]:
output_folder = "../../data/interim/step6_gtfs/"

#  Read Network

In [6]:
link_file = data_interim_dir + "step5_tidy_roadway/link.feather"
link_df = pd.read_feather(link_file)

node_file = data_interim_dir + "step5_tidy_roadway/node.geojson"
node_gdf = gpd.read_file(node_file)

shape_file = data_interim_dir + "step5_tidy_roadway/shape.geojson"
shape_gdf = gpd.read_file(shape_file)

In [16]:
# network type correction at Transbay temporary terminal:

link_df[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"])][
    ["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
524671,cycleway,0,1,1
863713,service,0,1,1


In [17]:
link_df.loc[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"]), 
            "roadway"] = "service"
link_df.loc[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"]), 
            "drive_access"] = 1

link_df[link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"])][
    ["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
524671,service,1,1,1
863713,service,1,1,1


In [18]:
node_gdf[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129])]

Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry
130712,5372055804,3291f7c2f15101c22abf554ce230343e,San Francisco,0,1,1,1006138,1000000,POINT (-122.3920956 37.7893448)
215684,890045140,4c0619714744bed10b7de965adc7048d,San Francisco,1,1,1,1010031,1000000,POINT (-122.3926305 37.7896628)
244341,890045129,490be8656a6428c6fc871a1f0e6432eb,San Francisco,1,1,1,1011380,1000000,POINT (-122.3920287 37.7892519)


In [19]:
# network type correction at Transbay temporary terminal:

node_gdf.loc[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129]), "drive_access"] = 1

node_gdf[node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129])]

Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry
130712,5372055804,3291f7c2f15101c22abf554ce230343e,San Francisco,1,1,1,1006138,1000000,POINT (-122.3920956 37.7893448)
215684,890045140,4c0619714744bed10b7de965adc7048d,San Francisco,1,1,1,1010031,1000000,POINT (-122.3926305 37.7896628)
244341,890045129,490be8656a6428c6fc871a1f0e6432eb,San Francisco,1,1,1,1011380,1000000,POINT (-122.3920287 37.7892519)


In [21]:
link_df.info()
node_gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632702 entries, 0 to 1632701
Data columns (total 36 columns):
access                    1632702 non-null object
area                      1632702 non-null object
bike_access               1632702 non-null int64
bridge                    1632702 non-null object
drive_access              1632702 non-null int64
est_width                 1632702 non-null object
fromIntersectionId        1632702 non-null object
highway                   1632702 non-null object
id                        1632702 non-null object
junction                  1632702 non-null object
key                       1632702 non-null object
landuse                   1632702 non-null object
lanes                     1632702 non-null object
link                      1632702 non-null object
maxspeed                  1632702 non-null object
name                      1632702 non-null object
oneWay                    1632702 non-null object
ref                       1632702 non-n

In [22]:
print(node_gdf.crs)
print(node_gdf.columns)

{'init': 'epsg:4326'}
Index(['osm_node_id', 'shst_node_id', 'county', 'drive_access', 'walk_access',
       'bike_access', 'model_node_id', 'county_numbering_start', 'geometry'],
      dtype='object')


In [23]:
bus_only_link_list = [
     "0c246e7620f35085a744138ded98bcc0",
     "0d18dfb09f3fade6e2ff628c5a687b11",
     "2aa3575c0c2e3ca0641cd4cb1e72948b",
     "2c2ea4bd3246004fa11069cb1efb840b",
     "3cab7f966f939208fc75a47f4c493751",
     "5a3f39cc85c2c3c44c0ff4982cbd2290",
     "5a65ce949f2477a9fd5e860efb824666",
     "5b4f08db83fda8d46f14df935bfda4d4",
     "5cdc5688d4330f2135a2628a902183ce",
     "6d7b91866c6fd11abd5144ba2b97cb5b",
     "7aa77386d90920bfedfa2fea214ae75e",
     "7da12913ab01dcf54be12621dd306b31",
     "8d28736a25806aa1434e2276bedf0841",
     "8ff2a5e76e9aa61944c5e751c673bcbe",
     "15b6821fe27d6803b44050f5140aebc4",
     "55f500211d2a56c6f80ea40aa3d615f2",
     "027cfff6c4d141d2abb2c6e0fc44739e",
     "541cf1dc777f65d3cb6c23c9088d4449",
     "694bc9238c7b37caa4ff57ecabc15b70",
     "0046fbaec4f136e61ae2d4d02169a150",
     "1326d18cb4c10421af5b25fefc7e6a9c",
     "3262ab95d00bda62eb5f65c9f1c10021",
     "3881cb98e78d3b2dd857e09ab6a679e4",
     "5552ee2ab36013a295fc0e4afe9bfe2a",
     "15048893fae1581c76a7261f89da9e88",
     "a8b4bbf3f56497377bb8ddb6371a32ec",
     "a396e808862ac29c36f27189d21e3cb3",
     "c3f3a85194115ab61746c7cc3cbf9e93",
     "c66b526e2968ccf62f157c92df5d5662",
     "d1998ed72acdf4a204777c33585db28a",
     "ddbeb602266c86602a5264cfa0423f1a",
     "e7af2d7bbc999a0488111460d188a1af",
     "e78179638b38d98ee48139cd0ec71edb",
     "f8c85e9ba005383f6a54ca4d528e40bb",
     "f306fdfe70e8d4c1f329993ac57fe821",
     "0ae155b7b483507e1f38376849e9dbdb",
     "1e23dc3dce4c3961dfc54fcf05f9f038",
     "2b625a39aa77a0870a98090d0bdde604",
     "4a1b379d0c179be3968356758cc6b469",
     "4da487b370aefb446e59602ec26a7c3c",
     "6b033d32fce65e1d1fb96ab7490cd777",
     "9fc6ae1f0b07c2488742933ce6c3be25",
     "65a2bb27ae9514171321c6e2b55a95eb",
     "518da54d198a20142fddf421781db0af",
     "913aade8c55e81dec27a4ff42f3f1e86",
     "2630a99ce8078e48eaa633eba4c1632f",
     "4404a553a2b51fd8efb247d8a69937b1",
     "7655f4b670a43a48ce09491389014226",
     "783645e4615580e8c79e1476a80fc8f2",
     "a43f37b125547fe05b199221a3dc9452",
     "aa654aa460971b1957c2d6df3d5fb8de",
     "bf5e9237c8fbb0e92d82f5accd53c182",
     "cb711b2cd73ebb6ac0548350ae9f226e",
     "ccc57d772b85d192f10f43fc99fc808b",
     "d26deeebe93b64192bc57bf87a914445",
     "febb16def7601a2c58fa9bdf1fd7643e",
]

In [26]:
np.setdiff1d(bus_only_link_list,link_df.shstReferenceId.tolist())

array(['cb711b2cd73ebb6ac0548350ae9f226e'], dtype='<U32')

In [24]:
link_df[link_df.shstReferenceId.isin(bus_only_link_list)].drive_access.value_counts()

1    55
Name: drive_access, dtype: int64

In [27]:
drive_node_gdf = node_gdf[node_gdf.drive_access == 1].copy()
drive_link_df = link_df[link_df.drive_access == 1].copy()

In [28]:
def ox_graph(nodes_df, links_df):
    """
        create an osmnx-flavored network graph
        osmnx doesn't like values that are arrays, so remove the variables
        that have arrays.  osmnx also requires that certain variables
        be filled in, so do that too.
        Parameters
        ----------
        nodes_df : GeoDataFrame
        link_df : GeoDataFrame
        Returns
        -------
        networkx multidigraph
    """
    try:
        graph_nodes = nodes_df.drop(
                ["inboundReferenceId", "outboundReferenceId"], axis=1
            )
    except:
        graph_nodes = nodes_df.copy()

    graph_nodes.gdf_name = "network_nodes"
    graph_nodes['id'] = graph_nodes['shst_node_id']

    graph_links = links_df.copy()
    graph_links['id'] = graph_links['shstReferenceId']
    graph_links['key'] = graph_links['shstReferenceId']

    G = ox.gdfs_to_graph(graph_nodes, graph_links)

    return G

In [29]:
# build network routing file for osmnx routing

G_drive = ox_graph(drive_node_gdf,
                  drive_link_df)

In [785]:
drive_node_gdf[["model_node_id", "osm_node_id", "geometry"]].to_file("../../data/interim/step6_gtfs/check_routeing_node.geojson", driver = "GeoJSON")
join_gdf = pd.merge(drive_link_df[["A", "B", "u", "v", "id", "length"]], shape_gdf[["id", "geometry"]], how= "left", on = "id")
join_gdf = gpd.GeoDataFrame(join_gdf, geometry = join_gdf.geometry, crs = shape_gdf.crs)
join_gdf.to_file("../../data/interim/step6_gtfs/check_routeing_link.geojson", driver = "GeoJSON")

In [30]:
nx.shortest_path(G_drive, 3955000020, 5000827322, weight = "length")

[3955000020, 890045139, 890045140, 5372055804, 890045129, 5000827322]

In [31]:
nx.shortest_path(G_drive, 26029616, 4550843070, weight = "length")

[26029616,
 240296607,
 201825904,
 2780987624,
 2780987626,
 276837953,
 276838006,
 3701773474,
 3695430845,
 276838025,
 3939260686,
 3701349495,
 3701349496,
 4915156912,
 279895969,
 4915158128,
 4915158123,
 4915158134,
 4915158133,
 26029613,
 26029634,
 26029635,
 2386411714,
 5367061316,
 5367061317,
 5367061318,
 5367061312,
 5367061319,
 2631423354,
 2631423356,
 2631423358,
 2631423361,
 2870159701,
 187643145,
 4915141312,
 5367061431,
 5367061433,
 187643164,
 671799114,
 448809367,
 292445500,
 1722505394,
 2338377988,
 5785770332,
 65662636,
 65658312,
 4550843070]

In [32]:
nx.shortest_path(G_drive, 293741891, 65284950, weight = "length")

[293741891,
 65290257,
 911547143,
 3593679267,
 423778249,
 65290252,
 5435466368,
 65290251,
 65290249,
 65290238,
 65290236,
 5435466213,
 5435466205,
 65290232,
 5435466333,
 5435466219,
 65290229,
 65290227,
 5435466158,
 65290225,
 5435466163,
 65281097,
 4911322443,
 5437055071,
 65284950]

# Consolidate all gtfs into one

In [35]:
gtfs_agencies_list = os.listdir("../../data/external/gtfs/2015")

gtfs_agencies_list.remove("Petaluma_2016_5_22")
gtfs_agencies_list.remove("WestCAT_2016_5_26")
gtfs_agencies_list.remove("GGFerries_2017_3_18")

In [36]:
gtfs_agencies_list

['BART_2015_8_3',
 'ACE_2017_3_20',
 'CCTA_2015_8_11',
 'Caltrain_2015_5_13',
 'Emeryville_2016_10_26',
 'Fairfield_2015_10_14',
 'GGTransit_2015_9_3',
 'MVGo_2016_10_26',
 'Marguerite_2016_10_10',
 'MarinTransit_2015_8_31',
 'RioVista_2015_8_20',
 'SFMTA_2015_8_11',
 'SamTrans_2015_8_20',
 'Soltrans_2016_5_20',
 'SonomaCounty_2015_8_18',
 'VTA_2015_8_27',
 'TriDelta-GTFS-2018-05-24_21-43-17',
 'Union_City_Transit_Aug-01-2015 to Jun-30-2017',
 'Vine_GTFS_PLUS_2015',
 'petalumatransit-petaluma-ca-us__11_12_15',
 'vacavillecitycoach-2020-ca-us',
 'westcat-ca-us_9_17_2015',
 'SF_Bay_Ferry2016_07_01',
 'ACTransit_2015_8_14',
 'Blue&Gold_gtfs_10_4_2017',
 'Capitol_2017_3_20',
 'SantaRosa_google_transit_08_28_15']

In [38]:
all_routes_df = pd.DataFrame()
all_trips_df = pd.DataFrame()
all_stops_df = pd.DataFrame()
all_shapes_df = pd.DataFrame()
all_stop_times_df = pd.DataFrame()
all_agency_df = pd.DataFrame()

def get_representative_feed_from_gtfs(work_dir, in_url = "", fetch = False):
    
    print('getting representative feed...')
    
    if fetch == True:
        #read and save zip from url
        resp = urlopen(in_url)
        zipfile = ZipFile(BytesIO(resp.read()))
    
    if fetch == True:
        zipfile.extractall(work_dir + "muni")
    
    file_loc = work_dir
    
    # get feed for the busiest day
    feed = pt.get_representative_feed(file_loc)
    
    return feed

for name in gtfs_agencies_list:
    
    # exclude weekend only services
    if "calendar_orig.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        calendar_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt")
        
    elif "calendar.txt" in os.listdir("../../data/external/gtfs/2015/" + name):
        calendar_df = pd.read_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt")
        calendar_df.to_csv("../../data/external/gtfs/2015/" + name + "/calendar_orig.txt",
                                                 index = False,
                                                 sep = ",")
    
        calendar_df["weekdays"] = calendar_df.apply(lambda x: x.monday + x.tuesday + x.wednesday + x.thursday + x.friday,
                                               axis = 1)
        calendar_df = calendar_df[calendar_df.weekdays > 0]
    
        calendar_df.drop("weekdays", axis = 1).to_csv("../../data/external/gtfs/2015/" + name + "/calendar.txt",
                                                 index = False,
                                                 sep = ",")
    
    feed = get_representative_feed_from_gtfs("../../data/external/gtfs/2015/" + name)
    
    routes_df = feed.routes.copy()
    routes_df["agency_raw_name"] = name
    
    stops_df = feed.stops.copy()
    stops_df["agency_raw_name"] = name
    
    trips_df = feed.trips.copy()
    trips_df["agency_raw_name"] = name
    
    if "direction_id" not in trips_df.columns: # Marguerita
        trips_df["direction_id"] = 0
    
    trips_df["direction_id"].fillna(0, inplace = True)
   
    shapes_df = feed.shapes.copy()
    shapes_df["agency_raw_name"] = name
    
    stop_times_df = feed.stop_times.copy()
    stop_times_df["agency_raw_name"] = name
    
    agency_df = feed.agency.copy()
    agency_df["agency_raw_name"] = name
    
    if len(shapes_df) == 0: # ACE, CCTA, VINE
        print("missing shapes.txt for {}".format(name))
        group_df = trips_df.groupby(["route_id", "direction_id"])["trip_id"].first().reset_index().drop("trip_id", axis = 1)
        group_df["shape_id"] = range(1, len(group_df) + 1)
        if "shape_id" in trips_df.columns:
            trips_df.drop("shape_id", axis = 1, inplace = True)
        trips_df = pd.merge(trips_df, group_df, how = "left", on = ["route_id", "direction_id"])
        
    if len(trips_df[trips_df.shape_id.isnull()]) > 0:
        print("partial complete shape_id for {}".format(name))
        trips_missing_shape_df = trips_df[trips_df.shape_id.isnull()].copy()
        group_df = trips_missing_shape_df.groupby(["route_id", "direction_id"])["trip_id"].first().reset_index().drop("trip_id", axis = 1)
        group_df["shape_id"] = range(1, len(group_df) + 1)
        group_df["shape_id"] = group_df["shape_id"].apply(lambda x: "psudo" + str(x))
        trips_missing_shape_df = pd.merge(trips_missing_shape_df.drop("shape_id", axis = 1), 
                                          group_df, how = "left", on = ["route_id", "direction_id"])
        trips_df = pd.concat([trips_df[trips_df.shape_id.notnull()], trips_missing_shape_df],
                             ignore_index = True,
                             sort = False)
        
    all_routes_df = all_routes_df.append(routes_df, sort = False, ignore_index = True)
    all_trips_df = all_trips_df.append(trips_df, sort = False, ignore_index = True)
    all_stops_df = all_stops_df.append(stops_df, sort = False, ignore_index = True)
    all_shapes_df = all_shapes_df.append(shapes_df, sort = False, ignore_index = True)
    all_stop_times_df = all_stop_times_df.append(stop_times_df, sort = False, ignore_index = True)
    all_agency_df = all_agency_df.append(agency_df, sort = False, ignore_index = True)

getting representative feed...
getting representative feed...
missing shapes.txt for ACE_2017_3_20
getting representative feed...
missing shapes.txt for CCTA_2015_8_11
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
partial complete shape_id for SamTrans_2015_8_20
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
missing shapes.txt for Vine_GTFS_PLUS_2015
getting representative feed...
getting representative feed...
getting representative feed...
getting representative feed...
partial complete shape_id for SF_Bay_Ferry2016_07_01
getting representative feed...
getting representative feed...
getti

In [39]:
all_trips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32789 entries, 0 to 32788
Data columns (total 21 columns):
route_id                       32789 non-null object
service_id                     32789 non-null object
trip_id                        32789 non-null object
trip_headsign                  30645 non-null object
direction_id                   32789 non-null float64
block_id                       25396 non-null object
shape_id                       32789 non-null object
wheelchair_accessible          1318 non-null float64
bikes_allowed                  4332 non-null float64
agency_raw_name                32789 non-null object
trip_short_name                2037 non-null object
original_trip_id               668 non-null object
trip_bikes_allowed             10 non-null object
trip_type                      0 non-null object
drt_max_travel_time            48 non-null object
drt_avg_travel_time            48 non-null object
drt_advance_book_min           48 non-null object
drt_pick

In [40]:
all_routes_df[(all_routes_df.agency_raw_name == "ACTransit_2015_8_14") & (all_routes_df.route_short_name == "H")]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,agency_raw_name,route_sort_order,min_headway_minutes,eligibility_restricted,continuous_pickup,continuous_drop_off
624,H-118,AC Transit,H,San Francisco - Richmond,,3,,,,ACTransit_2015_8_14,,,,,


In [41]:
all_stops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21260 entries, 0 to 21259
Data columns (total 16 columns):
stop_id                21260 non-null object
stop_name              21260 non-null object
stop_desc              4319 non-null object
stop_lat               21260 non-null float64
stop_lon               21260 non-null float64
zone_id                12267 non-null object
stop_url               4168 non-null object
location_type          5558 non-null float64
parent_station         620 non-null object
stop_timezone          137 non-null object
wheelchair_boarding    388 non-null float64
agency_raw_name        21260 non-null object
stop_code              15488 non-null object
platform_code          58 non-null object
position               0 non-null object
direction              0 non-null object
dtypes: float64(4), object(12)
memory usage: 2.6+ MB


In [42]:
all_agency_df[all_agency_df.agency_raw_name == "Marguerite_2016_10_10"]

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_raw_name,agency_phone,agency_fare_url,agency_email
8,,Stanford Marguerite Shuttle,http://marguerite.stanford.edu,America/Los_Angeles,en,Marguerite_2016_10_10,650-723-9362,,


In [43]:
all_agency_df.astype(str).groupby(["agency_raw_name", "agency_name", "agency_id"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url,agency_email
agency_raw_name,agency_name,agency_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACE_2017_3_20,ACE Altamont Corridor Express,CE,1,1,1,1,1,1
ACTransit_2015_8_14,AC Transit,AC Transit,1,1,1,1,1,1
BART_2015_8_3,Bay Area Rapid Transit,BART,1,1,1,1,1,1
Blue&Gold_gtfs_10_4_2017,Blue & Gold Fleet,BG,1,1,1,1,1,1
CCTA_2015_8_11,County Connection,,1,1,1,1,1,1
Caltrain_2015_5_13,Caltrain,caltrain-ca-us,1,1,1,1,1,1
Capitol_2017_3_20,Capitol Corridor,AM,1,1,1,1,1,1
Emeryville_2016_10_26,Emery Go-Round,573,1,1,1,1,1,1
Fairfield_2015_10_14,Fairfield and Suisun Transit,36,1,1,1,1,1,1
GGTransit_2015_9_3,Golden Gate Transit,,1,1,1,1,1,1


# Re-ID the consolidated gtfs

In [44]:
# route_id, shape_id, trip_id, stop_id, 

unique_route_id_df = all_routes_df.copy()
unique_route_id_df = unique_route_id_df.groupby(
    ["agency_raw_name", "route_id"])["agency_id"].count().reset_index().drop(["agency_id"], axis = 1)
unique_route_id_df["route_id_original"] = unique_route_id_df["route_id"]
unique_route_id_df["route_id"] = range(1,  len(unique_route_id_df) + 1)

unique_trip_id_df = all_trips_df.copy()
unique_trip_id_df = unique_trip_id_df.groupby(
    ["agency_raw_name", "trip_id"])["route_id"].count().reset_index().drop(["route_id"], axis = 1)
unique_trip_id_df["trip_id_original"] = unique_trip_id_df["trip_id"]
unique_trip_id_df["trip_id"] = range(1,  len(unique_trip_id_df) + 1)

unique_shape_id_df = all_trips_df.copy()
unique_shape_id_df = unique_shape_id_df.groupby(
    ["agency_raw_name", "shape_id"])["trip_id"].count().reset_index().drop(["trip_id"], axis = 1)
unique_shape_id_df["shape_id_original"] = unique_shape_id_df["shape_id"]
unique_shape_id_df["shape_id"] = range(1,  len(unique_shape_id_df) + 1)

unique_stop_id_df = all_stops_df.copy()
unique_stop_id_df = unique_stop_id_df.groupby(
    ["agency_raw_name", "stop_id"])["stop_lat"].count().reset_index().drop(["stop_lat"], axis = 1)
unique_stop_id_df["stop_id_original"] = unique_stop_id_df["stop_id"]
unique_stop_id_df["stop_id"] = range(1,  len(unique_stop_id_df) + 1)

In [45]:
unique_route_id_df[unique_route_id_df.route_id_original == "1031"]

Unnamed: 0,agency_raw_name,route_id,route_id_original
310,SFMTA_2015_8_11,311,1031
499,SonomaCounty_2015_8_18,500,1031


In [46]:
unique_stop_id_df[unique_stop_id_df.stop_id_original.isin(["3042", "3071"])]

Unnamed: 0,agency_raw_name,stop_id,stop_id_original
8420,SFMTA_2015_8_11,8421,3042
8447,SFMTA_2015_8_11,8448,3071
18153,VTA_2015_8_27,18154,3042
18174,VTA_2015_8_27,18175,3071


In [47]:
unique_shape_id_df[unique_shape_id_df.agency_raw_name == "Vine_GTFS_PLUS_2015"]

Unnamed: 0,agency_raw_name,shape_id,shape_id_original
1933,Vine_GTFS_PLUS_2015,1934,1
1934,Vine_GTFS_PLUS_2015,1935,2
1935,Vine_GTFS_PLUS_2015,1936,3
1936,Vine_GTFS_PLUS_2015,1937,4
1937,Vine_GTFS_PLUS_2015,1938,5
1938,Vine_GTFS_PLUS_2015,1939,6
1939,Vine_GTFS_PLUS_2015,1940,7
1940,Vine_GTFS_PLUS_2015,1941,8
1941,Vine_GTFS_PLUS_2015,1942,9
1942,Vine_GTFS_PLUS_2015,1943,10


In [48]:
all_routes_df = pd.merge(
    all_routes_df.rename(columns = {"route_id" : "route_id_original"}),
    unique_route_id_df,
    how = "left",
    on = ["agency_raw_name", "route_id_original"]
)

all_trips_df = pd.merge(
    all_trips_df.rename(columns = {"route_id" : "route_id_original", "trip_id" : "trip_id_original",
                                 "shape_id" : "shape_id_original"}),
    unique_route_id_df,
    how = "left",
    on = ["agency_raw_name", "route_id_original"]
)
all_trips_df = pd.merge(
    all_trips_df,
    unique_trip_id_df,
    how = "left",
    on = ["agency_raw_name", "trip_id_original"]
)
all_trips_df = pd.merge(
    all_trips_df,
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)

all_stops_df = pd.merge(
    all_stops_df.rename(columns = {"stop_id" : "stop_id_original"}),
    unique_stop_id_df,
    how = "left",
    on = ["agency_raw_name", "stop_id_original"]
)

all_shapes_df = pd.merge(
    all_shapes_df.rename(columns = {"shape_id" : "shape_id_original"}),
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)


all_stop_times_df = pd.merge(
    all_stop_times_df.rename(columns = {"trip_id" : "trip_id_original", "stop_id" : "stop_id_original"}),
    unique_trip_id_df,
    how = "left",
    on = ["agency_raw_name", "trip_id_original"]
)
all_stop_times_df = pd.merge(
    all_stop_times_df,
    unique_stop_id_df,
    how = "left",
    on = ["agency_raw_name", "stop_id_original"]
)

all_routes_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/routes.txt', index = False, sep = ',')
all_trips_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/trips.txt', index = False, sep = ',')
all_stops_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/stops.txt', index = False, sep = ',')
all_shapes_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/shapes.txt', index = False, sep = ',')
all_stop_times_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/stop_times.txt', index = False, sep = ',')
all_agency_df.to_csv('../../data/interim/step6_gtfs/consolidated_gtfs_input/agency.txt', index = False, sep = ',')

all_trips_df.drop(["route_id_original", "trip_id_original", "shape_id_original"], axis = 1, inplace = True)
all_stops_df.drop(["agency_raw_name", "stop_id_original"], axis = 1, inplace = True)
all_shapes_df.drop(["agency_raw_name", "shape_id_original"], axis = 1, inplace = True)
all_stop_times_df.drop(["agency_raw_name", "trip_id_original", "stop_id_original"], axis = 1, inplace = True)

In [49]:
all_trips_df[all_trips_df.direction_id.isnull()].agency_raw_name.unique()

array([], dtype=object)

# Processing

In [50]:
# pick representatives for each route by direction, with most number of trip 
def get_representative_trip_for_route(trips, stop_times):
    
    """
    get the representative trips for each route, by direction, tod
    
    """
    
    print('getting representative trip...')
    
    # get the first stop of each trip to determine the time period for each trip
    # process time
    stop_times_df = stop_times.copy()
    stop_times_df['arrival_h'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.hour
    stop_times_df['arrival_m'] = pd.to_datetime(stop_times_df['arrival_time'], unit = 's').dt.minute
    stop_times_df['departure_h'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.hour
    stop_times_df['departure_m'] = pd.to_datetime(stop_times_df['departure_time'], unit = 's').dt.minute
    
    # according to the gtfs reference, the stop sequence does not have to be consecutive, but has to always increase
    # so we can get the fisrt stop by the smallest stop sequence on the trip
    stop_times_df.sort_values(by = ["trip_id", "stop_sequence"], 
                              ascending = True, 
                              inplace = True)
    first_stop_df = stop_times_df.drop_duplicates(subset = ["trip_id"])
    
    ## identify peak, offpeak trips, based on the arrival time of first stop
    trip_df = trips.copy()
    trip_df = pd.merge(trip_df, 
                       first_stop_df,
                       how = 'left',
                       on = 'trip_id')
    
    ## AM: 6-10am, MD: 10am-3pm, PM: 3-7pm, NT 7pm-3am, EA 3-6am
    trip_df['tod'] = np.where((trip_df['arrival_h'] >= 6) & (trip_df['arrival_h'] < 10),
                                 'AM',
                                 np.where((trip_df['arrival_h'] >= 10) & (trip_df['arrival_h'] < 15),
                                     'MD',
                                    np.where((trip_df['arrival_h'] >= 15) & (trip_df['arrival_h'] < 19),
                                         'PM',
                                        np.where((trip_df['arrival_h'] >= 3) & (trip_df['arrival_h'] < 6),
                                             'EA',
                                                 'NT'))))
  
    # calculate frequency for EA and NT period using 5-6am, and 7-10pm
    trip_EA_NT_df = trip_df.copy()
    trip_EA_NT_df["tod"] = np.where((trip_df['arrival_h'] >= 5) & (trip_df['arrival_h'] < 6),
                                  "EA",
                                  np.where((trip_df['arrival_h'] >= 19) & (trip_df['arrival_h'] < 22),
                                          "NT",
                                          "NA")
                                  )
    
    # get the most frequent trip for each route, by direction, by time of day
    ## trips share the same shape_id is considered being the same
    ## first get the trip count for each shape_id
    trip_freq_df = trip_df.groupby(['route_id', 'tod', 'direction_id', 'shape_id'])['trip_id'].count().\
                            to_frame().\
                            drop(index = 'other', level = 1).\
                            reset_index()
    
    ## then choose the most frequent shape_id for each route, frequency use the total number of trips
    def agg(x):
        m = x.shape_id.iloc[np.argmax(x.trip_id.values)]
        return pd.Series({'trip_num' : x.trip_id.sum(), 'shape_id' : m})
   
    trip_freq_df = trip_freq_df.reset_index().groupby(['route_id', 'tod', 'direction_id']).apply(agg)
    
    # retain the complete trip info of represent trip only
    trip_df = pd.merge(trip_df, trip_freq_df.reset_index(),
                      how = 'inner',
                      on = ['route_id', 'tod', 'direction_id', 'shape_id']).\
                drop_duplicates(['route_id', 'direction_id', 'tod'])
        
    trip_EA_NT_df = pd.merge(trip_EA_NT_df, trip_freq_df.reset_index(),
                      how = 'inner',
                      on = ['route_id', 'tod', 'direction_id', 'shape_id'])
    
    trip_EA_NT_df = trip_EA_NT_df[trip_EA_NT_df.tod.isin(["EA", "NT"])].groupby(
        ["route_id", "tod", "direction_id", "shape_id"])["trip_id"].count().reset_index()
    
    trip_EA_NT_df.rename(columns = {"trip_id" : "trip_num"}, inplace = True)
    
    trip_df = pd.merge(
        trip_df,
        trip_EA_NT_df,
        how = "left",
        on = ["route_id", "tod", "direction_id", "shape_id"]
    )
    
    trip_df["trip_num"] = np.where(trip_df.trip_num_y.isnull(),
                                  trip_df.trip_num_x,
                                  trip_df.trip_num_y)
    
    return trip_df

In [51]:
trip_df = get_representative_trip_for_route(all_trips_df, all_stop_times_df)

getting representative trip...


In [52]:
all_routes_df.agency_raw_name.value_counts()

ACTransit_2015_8_14                              152
SFMTA_2015_8_11                                   81
VTA_2015_8_27                                     80
SamTrans_2015_8_20                                75
GGTransit_2015_9_3                                38
CCTA_2015_8_11                                    30
SonomaCounty_2015_8_18                            29
Marguerite_2016_10_10                             29
SantaRosa_google_transit_08_28_15                 17
Fairfield_2015_10_14                              16
Soltrans_2016_5_20                                14
TriDelta-GTFS-2018-05-24_21-43-17                 14
westcat-ca-us_9_17_2015                           14
Vine_GTFS_PLUS_2015                               13
petalumatransit-petaluma-ca-us__11_12_15          11
Union_City_Transit_Aug-01-2015 to Jun-30-2017      9
MarinTransit_2015_8_31                             8
Emeryville_2016_10_26                              7
SF_Bay_Ferry2016_07_01                        

In [53]:
all_routes_df[all_routes_df.agency_raw_name.isin(["CCTA_2015_8_11"])]

Unnamed: 0,route_id_original,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,agency_raw_name,route_sort_order,min_headway_minutes,eligibility_restricted,continuous_pickup,continuous_drop_off,route_id
7,34_merged_381003112,,93X,Kirker Pass Express,,3,,,,CCTA_2015_8_11,,,,,,182
8,28_merged_381003096,,5,Creekside/BART Walnut Creek,,3,,,,CCTA_2015_8_11,,,,,,175
9,26_merged_381003090,,36,San Ramon/BART Dublin,,3,,,,CCTA_2015_8_11,,,,,,173
10,13_merged_381003100,,21,BART Walnut Creek/San Ramon,,3,,,,CCTA_2015_8_11,,,,,,167
11,15_merged_381003102,,28,BART North Concord/Martinez,,3,,,,CCTA_2015_8_11,,,,,,169
12,11_merged_381003098,,2,Rudgear/BART Walnut Creek,,3,,,,CCTA_2015_8_11,,,,,,165
13,1,,1,Rossmoor/Shadelands,,3,,,,CCTA_2015_8_11,,,,,,163
14,3,,11,BART Concord/BART Pleasant Hi,,3,,,,CCTA_2015_8_11,,,,,,177
15,2,,10,BART Concord/Clayton,,3,,,,CCTA_2015_8_11,,,,,,171
16,5,,15,Treat Blvd,,3,,,,CCTA_2015_8_11,,,,,,188


In [54]:
all_trips_df[all_trips_df.trip_id == 4384]

Unnamed: 0,service_id,trip_headsign,direction_id,block_id,wheelchair_accessible,bikes_allowed,agency_raw_name,trip_short_name,original_trip_id,trip_bikes_allowed,...,drt_max_travel_time,drt_avg_travel_time,drt_advance_book_min,drt_pickup_message,drt_drop_off_message,continuous_pickup_message,continuous_drop_off_message,route_id,trip_id,shape_id
30921,1508FA-D6-Weekday-02,232 FREMONT BART,1.0,611281,,,ACTransit_2015_8_14,,,,...,,,,,,,,17,4384,47


In [55]:
print(trip_df.info())
trip_df[trip_df.agency_raw_name == "Caltrain_2015_5_13"]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 49 columns):
service_id                      3818 non-null object
trip_headsign                   3444 non-null object
direction_id                    3818 non-null float64
block_id                        2751 non-null object
wheelchair_accessible           125 non-null float64
bikes_allowed                   634 non-null float64
agency_raw_name                 3818 non-null object
trip_short_name                 363 non-null object
original_trip_id                147 non-null object
trip_bikes_allowed              8 non-null object
trip_type                       0 non-null object
drt_max_travel_time             18 non-null object
drt_avg_travel_time             18 non-null object
drt_advance_book_min            18 non-null object
drt_pickup_message              0 non-null object
drt_drop_off_message            0 non-null object
continuous_pickup_message       0 non-null object
continuous_dro

Unnamed: 0,service_id,trip_headsign,direction_id,block_id,wheelchair_accessible,bikes_allowed,agency_raw_name,trip_short_name,original_trip_id,trip_bikes_allowed,...,last_stop_on_trip,stop_id,arrival_h,arrival_m,departure_h,departure_m,tod,trip_num_x,trip_num_y,trip_num
271,CT-14OCT-Combo-Weekday-01,San Jose Caltrain Station,1.0,,,,Caltrain_2015_5_13,312,,,...,,6607,6.0,57.0,6.0,57.0,AM,5,,5.0
272,CT-14OCT-Combo-Weekday-01,San Francisco Caltrain Station,0.0,,,,Caltrain_2015_5_13,365,,,...,,6650,16.0,23.0,16.0,23.0,PM,5,,5.0
273,CT-14OCT-Combo-Weekday-01,San Francisco Caltrain Station,0.0,,,,Caltrain_2015_5_13,305,,,...,,6650,5.0,45.0,5.0,45.0,EA,2,1.0,1.0
274,CT-14OCT-Combo-Weekday-01,San Francisco Caltrain Station,0.0,,,,Caltrain_2015_5_13,313,,,...,,6650,6.0,45.0,6.0,45.0,AM,4,,4.0
275,CT-14OCT-Combo-Weekday-01,San Jose Caltrain Station,1.0,,,,Caltrain_2015_5_13,360,,,...,,6607,16.0,9.0,16.0,9.0,PM,6,,6.0
276,CT-14OCT-Combo-Weekday-01,Tamien Caltrain Station,1.0,,,,Caltrain_2015_5_13,208,,,...,,6607,6.0,24.0,6.0,24.0,AM,10,,10.0
277,CT-14OCT-Combo-Weekday-01,Tamien Caltrain Station,1.0,,,,Caltrain_2015_5_13,254,,,...,,6607,14.0,37.0,14.0,37.0,MD,1,,1.0
278,CT-14OCT-Combo-Weekday-01,San Jose Caltrain Station,1.0,,,,Caltrain_2015_5_13,262,,,...,,6607,16.0,19.0,16.0,19.0,PM,10,,10.0
279,CT-14OCT-Combo-Weekday-01,San Francisco Caltrain Station,0.0,,,,Caltrain_2015_5_13,211,,,...,,6650,6.0,20.0,6.0,20.0,AM,9,,9.0
280,CT-14OCT-Combo-Weekday-01,San Francisco Caltrain Station,0.0,,,,Caltrain_2015_5_13,257,,,...,,6652,14.0,33.0,14.0,33.0,MD,1,,1.0


In [56]:
trip_df[trip_df.trip_headsign == "22 EASTRIDGE"].trip_id

1979    27543
1980    27582
1981    27537
1982    27551
1983    27592
Name: trip_id, dtype: int32

In [57]:
all_shapes_df[all_shapes_df.shape_id == 1691]
all_stop_times_df[all_stop_times_df.trip_id == 27543]

Unnamed: 0,arrival_time,departure_time,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,start_service_area_id,end_service_area_id,...,end_service_area_radius,continuous_pickup,continuous_drop_off,pickup_area_id,drop_off_area_id,pickup_service_area_radius,drop_off_service_area_radius,last_stop_on_trip,trip_id,stop_id
655459,13800.0,13800.0,20,,,,,,,,...,,,,,,,,,27543,18542
655460,15720.0,15720.0,61,,,,,,,,...,,,,,,,,,27543,16621
663714,13020.0,13020.0,1,,,,,,,,...,,,,,,,,,27543,18370
663715,13380.0,13380.0,8,,,,,,,,...,,,,,,,,,27543,18412
663716,14520.0,14520.0,37,,,,,,,,...,,,,,,,,,27543,18716
663717,14880.0,14880.0,43,,,,,,,,...,,,,,,,,,27543,18776
663718,16020.0,16020.0,69,,,,,,,,...,,,,,,,,,27543,19000
663719,17040.0,17040.0,92,,,,,,,,...,,,,,,,,,27543,18569
663720,17280.0,17280.0,97,,,,,,,,...,,,,,,,,,27543,18999
663721,17820.0,17820.0,109,,,,,,,,...,,,,,,,,,27543,20048


In [58]:
def snap_stop_to_node(stops, node_gdf):
    
    """
    map gtfs stops to roadway nodes
    
    Parameters:
    ------------
    feed
    drive nodes
    
    return
    ------------
    stops with drive nodes id
    """
    
    print('snapping gtfs stops to roadway node osmid...')
    
    node_non_c_gdf = node_gdf.copy()
    node_non_c_gdf = node_non_c_gdf.to_crs({'init' : 'epsg:26915'})
    node_non_c_gdf['X'] = node_non_c_gdf.geometry.map(lambda g:g.x)
    node_non_c_gdf['Y'] = node_non_c_gdf.geometry.map(lambda g:g.y)
    inventory_node_ref = node_non_c_gdf[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    stop_df = stops.copy()
    stop_df['geometry'] = [Point(xy) for xy in zip(stop_df['stop_lon'], stop_df['stop_lat'])]
    stop_df = gpd.GeoDataFrame(stop_df)
    stop_df.crs = {'init' : 'epsg:4326'}
    stop_df = stop_df.to_crs({'init' : 'epsg:26915'})
    stop_df['X'] = stop_df['geometry'].apply(lambda p: p.x)
    stop_df['Y'] = stop_df['geometry'].apply(lambda p: p.y)
   
    for i in range(len(stop_df)):
        point = stop_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_snap_gdf = gpd.GeoDataFrame(node_non_c_gdf.iloc[ii]).transpose().reset_index(drop = True)
        add_snap_gdf['stop_id'] = stop_df.iloc[i]['stop_id']
        if i == 0:
            stop_to_node_gdf = add_snap_gdf.copy()
        else:
            stop_to_node_gdf = stop_to_node_gdf.append(add_snap_gdf, ignore_index=True, sort=False)
    
    stop_df.drop(['X','Y'], axis = 1, inplace = True)
    stop_to_node_gdf = pd.merge(stop_df, stop_to_node_gdf, how = 'left', on = 'stop_id')
    
    column_list = stops.columns.values.tolist() + ['osm_node_id', 'shst_node_id', "model_node_id"]
    
    return stop_to_node_gdf[column_list]

In [59]:
stop_df = snap_stop_to_node(all_stops_df, drive_node_gdf)

snapping gtfs stops to roadway node osmid...


In [60]:
stop_df.info()
stop_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21260 entries, 0 to 21259
Data columns (total 18 columns):
stop_name              21260 non-null object
stop_desc              4319 non-null object
stop_lat               21260 non-null float64
stop_lon               21260 non-null float64
zone_id                12267 non-null object
stop_url               4168 non-null object
location_type          5558 non-null float64
parent_station         620 non-null object
stop_timezone          137 non-null object
wheelchair_boarding    388 non-null float64
stop_code              15488 non-null object
platform_code          58 non-null object
position               0 non-null object
direction              0 non-null object
stop_id                21260 non-null int32
osm_node_id            21260 non-null object
shst_node_id           21260 non-null object
model_node_id          21260 non-null object
dtypes: float64(4), int32(1), object(13)
memory usage: 3.0+ MB


Unnamed: 0,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,stop_code,platform_code,position,direction,stop_id,osm_node_id,shst_node_id,model_node_id
0,12th St. Oakland City Center,,37.803664,-122.271604,12TH,http://www.bart.gov/stations/12TH/,0.0,,,1.0,,,,,5411,2389063554,903a4c75fc5e88338fc384411b8d596a,2501627
1,16th St. Mission,,37.765062,-122.419694,16TH,http://www.bart.gov/stations/16TH/,0.0,,,1.0,,,,,5412,65345359,b353cc71cd220cb5c98504c2ce04eced,1011071
2,19th St. Oakland,,37.80787,-122.269029,19TH,http://www.bart.gov/stations/19TH/,0.0,,,1.0,,,,,5413,53077120,b40a224e561bcb18b9c1c6ed21b6b6bb,2504171


In [61]:
all_routes_df[all_routes_df.route_type==5]

Unnamed: 0,route_id_original,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,agency_raw_name,route_sort_order,min_headway_minutes,eligibility_restricted,continuous_pickup,continuous_drop_off,route_id
191,1059,SFMTA,Powell-Mason,,,5,,,,SFMTA_2015_8_11,,,,,,319
192,1060,SFMTA,Powell-Hyde,,,5,,,,SFMTA_2015_8_11,,,,,,320
193,10807,SFMTA,CALIFORNIA,,,5,,,,SFMTA_2015_8_11,,,,,,352


In [62]:
all_routes_df.route_type.value_counts()

3    632
0     10
4      9
2      8
1      6
5      3
Name: route_type, dtype: int64

In [63]:
def route_bus_link_osmnx(roadway_gdf, node_gdf, G, stop_times, routes, trip, stop):
    
    """
    route bus with OSMNX routing
    
    Parameters
    ----------
    drive link
    drive node
    drive graph
    feed
    trip 
    stop
    
    return
    ----------
    dataframe of drive links bus trips traverses
    list of trips that could not be routed by OSMNX
    """
    
    trip_df = trip.copy()
    stop_df = stop.copy()
    stop_time_df = stop_times.copy()
    
    chained_stop_df = stop_time_df[stop_time_df['trip_id'].isin(trip_df.trip_id.tolist())]
    chained_stop_to_node_df = pd.merge(chained_stop_df, 
                                       stop_df,
                                        how = 'left',
                                        on = 'stop_id')
    
    print('routing bus on roadway network with osmnx...')
    
    #osm_node_dict = dict(zip(node_gdf.osmid, node_gdf.N))
    
    trip_df = pd.merge(trip_df, routes, how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df['route_type'] == 3]
    
    # to track trips that osmnx failed to route
    broken_shape_trip_list = []
    
    # output dataframe for osmnx success
    trip_link_shape_df = pd.DataFrame()
    
    # loop through for bus trips
    for trip_id in bus_trip_df.trip_id.unique():
        
        # get the stops on the trip
        trip_stop_df = chained_stop_to_node_df[chained_stop_to_node_df['trip_id'] == trip_id].copy()
        
        trip_stop_df.sort_values(by = ["stop_sequence"], inplace = True)
        """
        print("routing" + str(trip_id))
        for s in range(len(trip_stop_df)-1):
                # from stop node OSM id
            closest_node_to_stop1 = int(trip_stop_df.osm_node_id.iloc[s])
            print("from {}".format(closest_node_to_stop1))
                
                # to stop node OSM id
            closest_node_to_stop2 = int(trip_stop_df.osm_node_id.iloc[s+1])
            print("to {}".format(closest_node_to_stop2))
                
                # osmnx routing btw from and to stops, return the list of nodes
            node_osmid_list = nx.shortest_path(G, closest_node_to_stop1, closest_node_to_stop2)
            print("path {}".format(node_osmid_list))
                
                # get the links
            if len(node_osmid_list) > 1:
                osm_link_gdf = pd.DataFrame({'u' : node_osmid_list[:len(node_osmid_list)-1], 
                                            'v' : node_osmid_list[1:len(node_osmid_list)],
                                            'trip_id' : trip_id},
                                               )
            else:
                continue
                
            trip_link_shape_df = trip_link_shape_df.append(osm_link_gdf, ignore_index = True, sort = False)
                     
        """
        try:
            print("routing" + str(trip_id))
            for s in range(len(trip_stop_df)-1):
                # from stop node OSM id
                closest_node_to_stop1 = int(trip_stop_df.osm_node_id.iloc[s])
                
                # to stop node OSM id
                closest_node_to_stop2 = int(trip_stop_df.osm_node_id.iloc[s+1])
                
                # osmnx routing btw from and to stops, return the list of nodes
                node_osmid_list = nx.shortest_path(G, closest_node_to_stop1, closest_node_to_stop2, weight = "length")
                
                # get the links
                if len(node_osmid_list) > 1:
                    osm_link_gdf = pd.DataFrame({'u' : node_osmid_list[:len(node_osmid_list)-1], 
                                            'v' : node_osmid_list[1:len(node_osmid_list)],
                                            'trip_id' : trip_id},
                                               )
                else:
                    continue
                
                trip_link_shape_df = trip_link_shape_df.append(osm_link_gdf, ignore_index = True, sort = False)
                
        

        except:
            broken_shape_trip_list = broken_shape_trip_list + [trip_id]
            print('  warning: cannot route bus: ' + str(trip_id))
            continue      
       #"""        
    trip_link_shape_df = pd.merge(trip_link_shape_df, trip_df[['trip_id', 'shape_id']], how = 'left', on = 'trip_id')

    trip_link_shape_df = pd.merge(trip_link_shape_df,
                                  drive_link_df[["u", "v", "wayId", "shstReferenceId", "shstGeometryId", "A", "B"]].\
                                      drop_duplicates(subset = ["u", "v"]),
                                  how = "left",
                                  on = ["u", "v"])
    
    
    return trip_link_shape_df, broken_shape_trip_list

In [64]:
stop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21260 entries, 0 to 21259
Data columns (total 18 columns):
stop_name              21260 non-null object
stop_desc              4319 non-null object
stop_lat               21260 non-null float64
stop_lon               21260 non-null float64
zone_id                12267 non-null object
stop_url               4168 non-null object
location_type          5558 non-null float64
parent_station         620 non-null object
stop_timezone          137 non-null object
wheelchair_boarding    388 non-null float64
stop_code              15488 non-null object
platform_code          58 non-null object
position               0 non-null object
direction              0 non-null object
stop_id                21260 non-null int32
osm_node_id            21260 non-null object
shst_node_id           21260 non-null object
model_node_id          21260 non-null object
dtypes: float64(4), int32(1), object(13)
memory usage: 3.0+ MB


In [66]:
bus_route_list = all_routes_df[all_routes_df.route_type ==3].route_id.tolist()

In [67]:
trip_df.shape

(3818, 49)

In [68]:
trip_df[trip_df.route_id.isin(bus_route_list)]

Unnamed: 0,service_id,trip_headsign,direction_id,block_id,wheelchair_accessible,bikes_allowed,agency_raw_name,trip_short_name,original_trip_id,trip_bikes_allowed,...,last_stop_on_trip,stop_id,arrival_h,arrival_m,departure_h,departure_m,tod,trip_num_x,trip_num_y,trip_num
59,2_merged_381003124,BART Concord Via BART Pleasant Hill,0.0,,,,CCTA_2015_8_11,,50002035,,...,,6061,17.0,40.0,17.0,40.0,PM,4,,4.0
60,2_merged_381003124,BART Pleasant Hill,0.0,,,,CCTA_2015_8_11,,40002065,,...,,6532,20.0,5.0,20.0,5.0,NT,3,3.0,3.0
61,2_merged_381003124,BART Walnut Creek,1.0,,,,CCTA_2015_8_11,,361002053,,...,,5910,15.0,6.0,15.0,6.0,PM,12,,12.0
62,2_merged_381003124,Clockwise,0.0,,,,CCTA_2015_8_11,,,,...,,6186,16.0,52.0,16.0,52.0,PM,16,,16.0
63,2_merged_381003124,San Ramon Transit Center,0.0,,,,CCTA_2015_8_11,,360002041,,...,,6057,8.0,57.0,8.0,57.0,AM,13,,13.0
64,2_merged_381003124,BART Walnut Creek Via BART Pleasant Hill,0.0,,,,CCTA_2015_8_11,,310002002,,...,,6201,5.0,50.0,5.0,50.0,EA,1,1.0,1.0
65,2_merged_381003124,BART Walnut Creek,1.0,,,,CCTA_2015_8_11,,341002017,,...,,6213,7.0,7.0,7.0,7.0,AM,2,,2.0
66,2_merged_381003124,BART Dublin Via Windemere Pkwy,0.0,,,,CCTA_2015_8_11,,,,...,,6133,15.0,0.0,15.0,0.0,PM,8,,8.0
67,2_merged_381003124,Hillcrest Park and Ride,0.0,,,,CCTA_2015_8_11,,340002005,,...,,6056,16.0,6.0,16.0,6.0,PM,7,,7.0
68,2_merged_381003124,Diablo Valley College via BART Pleasant Hill,1.0,,,,CCTA_2015_8_11,,311002017,,...,,6054,11.0,40.0,11.0,40.0,MD,5,,5.0


In [69]:
link_df[link_df.u == 890045140][["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
235924,service,1,1,1
863713,service,1,1,1
1530026,service,1,1,1


In [70]:
bus_osmnx_link_shape_df, bus_osmnx_broken_trip_list = route_bus_link_osmnx(drive_link_df, 
                                                                            drive_node_gdf, 
                                                                            G_drive, 
                                                                            all_stop_times_df,
                                                                            all_routes_df,
                                                                            trip_df, 
                                                                            stop_df)

routing bus on roadway network with osmnx...
routing7809
routing7772
routing7682
routing7481
routing7648
routing7533
routing7618
routing7212
routing7607
routing7570
routing7861
routing6894
routing7900
routing7223
routing7087
routing7437
routing7412
routing7854
routing6887
routing7798
routing7180
routing7179
routing7465
routing7499
routing7494
routing7323
routing7622
routing7327
routing7276
routing7667
routing7121
routing7563
routing7137
routing7403
routing6917
routing7211
routing7228
routing7753
routing7205
routing7194
routing7868
routing7915
routing7732
routing7581
routing6932
routing7921
routing6940
routing7794
routing7420
routing7545
routing7167
routing7290
routing7777
routing7497
routing7156
routing6908
routing7832
routing7597
routing7364
routing6930
routing7428
routing6882
routing7909
routing7145
routing7784
routing7800
routing7837
routing7602
routing7604
routing7906
routing7074
routing7264
routing7926
routing7016
routing7914
routing6961
routing7118
routing6979
routing7357
routing

routing11053
routing11057
routing11058
routing11063
routing11064
routing11066
routing11115
routing11116
routing11121
routing11164
routing11179
routing11206
routing11209
routing11213
routing11216
routing11208
routing11211
routing11210
routing11217
routing11214
routing11273
routing11363
routing11362
routing11295
routing11258
routing11626
routing11625
routing11614
routing11541
routing11651
routing19420
routing21116
routing21113
routing19478
routing19378
routing19400
routing21110
routing19447
routing20324
routing19407
routing21173
routing19408
routing19404
routing11777
routing11766
routing11710
routing11761
routing11742
routing11700
routing11693
routing11782
routing11861
routing11853
routing11674
routing11832
routing11817
routing11800
routing11871
routing11869
routing11794
routing11904
routing12159
routing12143
routing12100
routing20549
routing20542
routing20512
routing12096
routing12113
routing11990
routing11984
routing12046
routing12045
routing20698
routing20676
routing12055
routing11920

routing23362
routing23363
routing23366
routing23367
routing23374
routing23375
routing23384
routing23385
routing23392
routing23393
routing23400
routing23401
routing23410
routing23411
routing23418
routing23419
routing23420
routing23421
routing23422
routing23432
routing23437
routing23453
routing23454
routing23458
routing23460
routing23461
routing23464
routing23465
routing23466
routing23467
routing23468
routing23469
routing23477
routing23479
routing23480
routing23481
routing23482
routing23485
routing23486
routing23487
routing23491
routing23492
routing23495
routing23496
routing23497
routing23498
routing23499
routing23501
routing23502
routing23504
routing23505
routing23507
routing23509
routing23510
routing23511
routing23512
routing23513
routing23514
routing23519
routing23520
routing23526
routing23527
routing23528
routing23530
routing23531
routing23533
routing23534
routing23535
routing23536
routing23537
routing23538
routing23539
routing23541
routing23542
routing23546
routing23547
routing23556

routing28302
routing28324
routing28320
routing28315
routing28346
routing28328
routing28341
routing28373
routing28372
routing28349
routing28368
routing28376
routing28402
routing28395
routing28385
routing28433
routing28427
routing28417
routing28409
routing31285
routing31289
routing31296
routing31297
routing31306
routing31312
routing31313
routing31316
routing31320
routing31325
routing31329
routing31336
routing31341
routing31331
routing28448
routing28461
routing28434
routing28488
routing28479
routing28465
routing28516
routing28510
routing28498
routing28492
routing28543
routing28536
routing28527
routing28519
routing28560
routing28551
routing28545
routing28565
routing28579
routing28576
routing28582
routing28591
routing28603
routing28585
routing28611
routing28620
routing28634
routing28610
routing28689
routing28688
routing28661
routing28642
routing28653
routing28709
routing28729
routing28734
routing28743
routing28717
routing28767
routing28762
routing28750
routing28752
routing28747
routing28785

routing32424
routing32409
routing32338
routing32737
routing32471
routing32494
routing32543
routing32545
routing32242
routing32249
routing32532
routing32564
routing32449
routing32306
routing32515
routing32453
routing32661
routing32275
routing32452
routing32687
routing32252
routing32769
routing32563
routing32421
routing32670
routing32763
routing32392
routing32418
routing32397
routing32370
routing32309
routing32391
routing32216
routing32432
routing32645
routing32199
routing32772
routing32696
routing32441
routing32310
routing32583
routing32642
routing32710
routing32633
routing32265
routing32288
routing32364
routing32607
routing32291
routing32193
routing32311
routing32613
routing32438
routing32605
routing32611
routing32386
routing32225
routing32557
routing32378
routing32404
routing32365
routing32406
routing32437
routing32498
routing32387
routing32496
routing32566
routing32782
routing32748
routing32403
routing32220
routing32208
routing9
routing10
routing11
routing20
routing21
routing22
routi

routing4378
routing4383
routing4386
routing4387
routing4391
routing4396
routing4400
routing4402
routing4405
routing4411
routing4414
routing4415
routing4419
routing4424
routing4428
routing4430
routing4433
routing4434
routing4438
routing4439
routing4443
routing4447
routing4448
routing4449
routing4453
routing4457
routing4461
routing4465
routing4470
routing4474
routing4476
routing4477
routing4479
routing4490
routing4492
routing4493
routing4494
routing4498
routing4503
routing4507
routing4509
routing4510
routing4514
routing4519
routing4523
routing4524
routing4525
routing4526
routing4530
routing4535
routing4539
routing4541
routing4545
routing4550
routing4554
routing4555
routing4559
routing4564
routing4568
routing4573
routing4575
routing4578
routing4579
routing4580
routing4584
routing4589
routing4596
routing4599
routing4602
routing4603
routing4608
routing4636
routing4637
routing4638
routing4639
routing4640
routing4644
routing4645
routing4651
routing4652
routing4654
routing4655
routing4662
rout

In [72]:
bus_osmnx_link_shape_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 906259 entries, 0 to 906258
Data columns (total 9 columns):
u                  906259 non-null int64
v                  906259 non-null int64
trip_id            906259 non-null int64
shape_id           906259 non-null int32
wayId              906259 non-null object
shstReferenceId    906259 non-null object
shstGeometryId     906259 non-null object
A                  906259 non-null int64
B                  906259 non-null int64
dtypes: int32(1), int64(5), object(3)
memory usage: 65.7+ MB


In [73]:
# osmnx failed to route these trips: can be rail modes
print(bus_osmnx_broken_trip_list)

[8363, 8361, 8357, 8358, 8364, 11209, 11213, 11216, 11208, 11211, 11217, 11214, 23467]


In [75]:
bus_osmnx_link_shape_df[bus_osmnx_link_shape_df.shape_id == 362]
all_routes_df[(all_routes_df.agency_raw_name == "ACTransit_2015_8_14") & (all_routes_df.route_short_name == "P")]
trip_df[trip_df.route_id == 147][["trip_id", "direction_id"]]

Unnamed: 0,trip_id,direction_id
3118,1962,0.0
3119,1968,0.0
3175,2080,1.0


In [76]:
# shapes that were not successfully routed by OSMNX

trip_df[trip_df.trip_id.isin(bus_osmnx_broken_trip_list)].shape_id.unique()

array([ 541,  502,  510,  513,  514,  786,  787,  788,  789, 1224],
      dtype=int64)

In [77]:
trip_df.shape_id.nunique()

1496

In [78]:
bus_osmnx_link_shape_df.shape_id.nunique()

1397

In [79]:
# read shst match result

all_shst_df = pd.DataFrame()

for name in gtfs_agencies_list:
    try: 
        shst_df = gpd.read_file(data_interim_dir + "step6_gtfs/shst_match/" + name + ".out.matched.geojson")
        shst_df["agency_raw_name"] = name
    
        all_shst_df = all_shst_df.append(shst_df, sort = False, ignore_index = True)
    except:
        print(name + ".out.matched_geojson not found.")

BART_2015_8_3.out.matched_geojson not found.
ACE_2017_3_20.out.matched_geojson not found.
CCTA_2015_8_11.out.matched_geojson not found.
Caltrain_2015_5_13.out.matched_geojson not found.
MVGo_2016_10_26.out.matched_geojson not found.
Vine_GTFS_PLUS_2015.out.matched_geojson not found.
SF_Bay_Ferry2016_07_01.out.matched_geojson not found.
Blue&Gold_gtfs_10_4_2017.out.matched_geojson not found.
Capitol_2017_3_20.out.matched_geojson not found.


In [80]:
all_shst_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 116231 entries, 0 to 116230
Data columns (total 18 columns):
shstReferenceId           116231 non-null object
shstGeometryId            116231 non-null object
shstFromIntersectionId    116231 non-null object
shstToIntersectionId      116231 non-null object
gisReferenceId            116231 non-null object
gisGeometryId             116231 non-null object
gisTotalSegments          116231 non-null int64
gisSegmentIndex           116231 non-null int64
gisFromIntersectionId     116231 non-null object
gisToIntersectionId       116231 non-null object
startSideOfStreet         116231 non-null object
endSideOfStreet           116231 non-null object
sideOfStreet              116231 non-null object
score                     116231 non-null float64
matchType                 116231 non-null object
pp_shape_id               116231 non-null object
geometry                  116231 non-null object
agency_raw_name           116231 non-null object


In [81]:
all_shst_df.rename(columns = {"pp_shape_id" : "shape_id_original"}, inplace = True)

all_shst_df = pd.merge(
    all_shst_df,
    unique_shape_id_df,
    how = "left",
    on = ["agency_raw_name", "shape_id_original"]
)

all_shst_df = all_shst_df[all_shst_df.shape_id.notnull()].copy()

In [82]:
all_shst_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 67395 entries, 746 to 116230
Data columns (total 19 columns):
shstReferenceId           67395 non-null object
shstGeometryId            67395 non-null object
shstFromIntersectionId    67395 non-null object
shstToIntersectionId      67395 non-null object
gisReferenceId            67395 non-null object
gisGeometryId             67395 non-null object
gisTotalSegments          67395 non-null int64
gisSegmentIndex           67395 non-null int64
gisFromIntersectionId     67395 non-null object
gisToIntersectionId       67395 non-null object
startSideOfStreet         67395 non-null object
endSideOfStreet           67395 non-null object
sideOfStreet              67395 non-null object
score                     67395 non-null float64
matchType                 67395 non-null object
shape_id_original         67395 non-null object
geometry                  67395 non-null object
agency_raw_name           67395 non-null object
shape_id         

In [83]:
all_shst_df.shape_id.nunique()

447

In [84]:
all_shst_df

Unnamed: 0,shstReferenceId,shstGeometryId,shstFromIntersectionId,shstToIntersectionId,gisReferenceId,gisGeometryId,gisTotalSegments,gisSegmentIndex,gisFromIntersectionId,gisToIntersectionId,startSideOfStreet,endSideOfStreet,sideOfStreet,score,matchType,shape_id_original,geometry,agency_raw_name,shape_id
746,3980867d25d325434be9a7e6b73d2603,046b767976f0ae8fdc9c5b2ce2012413,0853d2ce687380d910f165fe63509372,fbea383d5b37b5185c6b39f68a2f343f,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,1,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,LINESTRING (-122.6042002231446 38.000250039545...,GGTransit_2015_9_3,574.0
747,ef4671b87d7a63a9f7433e444d85077b,13e1f50f74f9c1d7e8c0e4e0d3453335,fbea383d5b37b5185c6b39f68a2f343f,50193abff640d477ce678539e92b4856,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,2,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.6041266 38.0003347, -122.6031...",GGTransit_2015_9_3,574.0
748,4171e0d8d27102c60bd3e5ffc73b5992,fc3156215a8a2035671aa5459c3dfa79,50193abff640d477ce678539e92b4856,863997b967c7b89883980ab60ff0d911,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,3,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.6031428 38.0002128, -122.6021...",GGTransit_2015_9_3,574.0
749,4f5ea777178c14b727a651b063ce0754,7bebbc1d69bf2579ff89ff142710ae03,863997b967c7b89883980ab60ff0d911,34eb61b0eae06b7309292a9a571fa485,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,4,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.6021601 38.000079, -122.60164...",GGTransit_2015_9_3,574.0
750,a56fa533c535750f4e20a3ab5616df17,f5b9ea5e21b1d8b359c3a4f463378f90,34eb61b0eae06b7309292a9a571fa485,5e2548aa513f729b44c5c0e4d9f9fd91,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,5,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.6002788 37.9990399, -122.6001...",GGTransit_2015_9_3,574.0
751,9c1347d8f55e360b2d67f2d149734f86,9f927dd4b4bc0e813f5374844b9dfa81,5e2548aa513f729b44c5c0e4d9f9fd91,c92ed11a4ad4ff38c15fa52d41d70fe8,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,6,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.5998647 37.9983542, -122.5998...",GGTransit_2015_9_3,574.0
752,444ce95dd643678f4461b42670109f82,8608e068cbca78c2d5e98ece3015b423,c92ed11a4ad4ff38c15fa52d41d70fe8,d9856e58cfb55ebce3ff492df540cafa,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,7,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.5998439 37.9983179, -122.5996...",GGTransit_2015_9_3,574.0
753,583597b4a5ae4dbb4b2b580dd88be0bb,2a7367edd6bd2cabfcfdb24aadc6ff0a,d9856e58cfb55ebce3ff492df540cafa,861b4646927232450fd27333ba76271d,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,8,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.5995858 37.9977663, -122.5994...",GGTransit_2015_9_3,574.0
754,393ca46afccac4442f7bdbaf4f73d317,58dc36cb3441ccc2a45f61f2bfdd9ae4,861b4646927232450fd27333ba76271d,f3ec75bbfe4baed0c1fe19f8d2537086,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,9,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.5994848 37.9976033, -122.5994...",GGTransit_2015_9_3,574.0
755,699df09dd11e71d1dea1b9df75c6b98a,d71a4a904abcc4de8234b690a35d0af1,f3ec75bbfe4baed0c1fe19f8d2537086,68945e68ccae799fe9dcfdae6efd1586,ee8d95854de9a07714f092b2e8d7d0d0,46f5826504e972c5c3ade370008140ed,191,10,7d1129480f925472a67fa7b46d085d6d,db02976314ee2037d2e7e49b71b8a54d,right,right,right,11.60,hmm,250038,"LINESTRING (-122.599228 37.9972532, -122.59920...",GGTransit_2015_9_3,574.0


In [85]:
def route_bus_link_shst(drive_link, gtfs_shst_id):
    
    """
    route bus with shst match result
    
    parameter
    ---------
    drive link
    gtfs shst match return
    
    return
    ---------
    dataframe of drive links bus traverses
    list of imcomplete bus shapes
    
    """
    
    drive_link_df = drive_link.copy()
    shape_shst_df = gtfs_shst_id.copy()

    shape_shst_df = pd.merge(shape_shst_df, 
                             drive_link_df[
                                 ['shstReferenceId','wayId','u','v', "fromIntersectionId", "toIntersectionId", "A", "B"]
                             ],
                             how = 'left',
                             left_on = 'shstReferenceId',
                             right_on = 'shstReferenceId')
    
    shape_shst_df["u"] = shape_shst_df["u"].fillna(0).astype(np.int64)
    shape_shst_df["v"] = shape_shst_df["v"].fillna(0).astype(np.int64)
    shape_shst_df["A"] = shape_shst_df["A"].fillna(0).astype(np.int64)
    shape_shst_df["B"] = shape_shst_df["B"].fillna(0).astype(np.int64)
    
    """shape_shst_df.dropna(subset = ['u','v'], 
                         axis = 0, 
                         inplace = True)"""
    
    shape_shst_df = shape_shst_df.reset_index(drop=True)
    
    shape_shst_df['next_shape_id'] = shape_shst_df['shape_id'].\
                                            iloc[1:].\
                                            append(pd.Series(shape_shst_df['shape_id'].iloc[-1])).\
                                            reset_index(drop=True)
    
    shape_shst_df['next_u'] = shape_shst_df['u'].\
                                iloc[1:].\
                                append(pd.Series(shape_shst_df['v'].iloc[-1])).\
                                reset_index(drop=True)
    
    incomplete_shape_list = shape_shst_df[\
                                   (shape_shst_df.shape_id==shape_shst_df.next_shape_id)\
                                   &(shape_shst_df.v!=shape_shst_df.next_u)\
                                  ].shape_id.unique().\
                                    tolist()
    
    shape_shst_df = shape_shst_df[~shape_shst_df.shape_id.isin(incomplete_shape_list)].copy()
    
    return shape_shst_df, incomplete_shape_list

In [86]:
bus_shst_link_shape_df, incomplete_shape_list = route_bus_link_shst(drive_link_df, all_shst_df)

print(bus_shst_link_shape_df.shape)
print(bus_shst_link_shape_df.shape_id.nunique())

(48720, 28)
339


In [87]:
print(incomplete_shape_list)

[695.0, 714.0, 732.0, 733.0, 751.0, 760.0, 761.0, 765.0, 1185.0, 1186.0, 1196.0, 1197.0, 1198.0, 1217.0, 1233.0, 1237.0, 1238.0, 1255.0, 1256.0, 1257.0, 1258.0, 1275.0, 1281.0, 1295.0, 1306.0, 1307.0, 1314.0, 1338.0, 1340.0, 1344.0, 1345.0, 1346.0, 1618.0, 1619.0, 1625.0, 1633.0, 1635.0, 1641.0, 1663.0, 1666.0, 1667.0, 1677.0, 1678.0, 1679.0, 1680.0, 1702.0, 1703.0, 1704.0, 1712.0, 1730.0, 1737.0, 1738.0, 1744.0, 1751.0, 1752.0, 1753.0, 1759.0, 1827.0, 1832.0, 1833.0, 1846.0, 1850.0, 1858.0, 1866.0, 1873.0, 1604.0, 1616.0, 116.0, 147.0, 148.0, 149.0, 158.0, 160.0, 161.0, 165.0, 167.0, 168.0, 172.0, 174.0, 175.0, 176.0, 188.0, 189.0, 192.0, 193.0, 194.0, 195.0, 205.0, 206.0, 207.0, 208.0, 213.0, 214.0, 217.0, 219.0, 220.0, 222.0, 230.0, 233.0, 237.0, 239.0, 240.0, 244.0, 269.0, 270.0, 274.0, 324.0, 326.0]


In [88]:
# some of these buses has parts that are out side of SF county boundary, that's why they are labeled as incomplete shape
# some are due to the discrepency btw shst extraction and osmnx extraction

In [89]:
bus_shst_link_shape_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 48720 entries, 0 to 67394
Data columns (total 28 columns):
shstReferenceId           48720 non-null object
shstGeometryId            48720 non-null object
shstFromIntersectionId    48720 non-null object
shstToIntersectionId      48720 non-null object
gisReferenceId            48720 non-null object
gisGeometryId             48720 non-null object
gisTotalSegments          48720 non-null int64
gisSegmentIndex           48720 non-null int64
gisFromIntersectionId     48720 non-null object
gisToIntersectionId       48720 non-null object
startSideOfStreet         48720 non-null object
endSideOfStreet           48720 non-null object
sideOfStreet              48720 non-null object
score                     48720 non-null float64
matchType                 48720 non-null object
shape_id_original         48720 non-null object
geometry                  48720 non-null object
agency_raw_name           48720 non-null object
shape_id            

In [90]:
def bus_link(bus_link_osmnx, bus_link_shst, routes, trip, incomplete_list):
    
    """
    combine bus links from OSMNX and SHST
    """
    
    bus_link_osmnx_df = bus_link_osmnx.copy()
    bus_link_shst_df = bus_link_shst.copy()
    
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    bus_trip_df = trip_df[trip_df.route_type == 3].copy()
    
    #bus_link_shst_df.shape_id = bus_link_shst_df.shape_id.astype(str)
    
    shape_id_list = bus_trip_df.shape_id.unique().tolist()
    
    #incomplete_list = [str(x) for x in incomplete_list]
    incomplete_list = [x for x in incomplete_list]
    
    print("Targeting number of bus shape IDs: " + str(bus_trip_df.shape_id.nunique()))
    
    #trip_id, shape_id, u, v, link_id, omsid, shstrefere
    
    shst_shape_list = list(set([x for x in bus_link_shst_df.shape_id]))
    
    shapes_replace_with_shst_list = [x for x in shst_shape_list if x in shape_id_list]
    
    print("\n There are " + str(len(shapes_replace_with_shst_list)) + 
          " shapes that are from shst gtfs matching: \n \t" + 
          str(shapes_replace_with_shst_list))

    bus_link_osmnx_df = bus_link_osmnx_df[~bus_link_osmnx_df.shape_id.isin(shapes_replace_with_shst_list)].copy()
    
    osmnx_shape_list = bus_link_osmnx_df.shape_id.unique().tolist()
    
    print("\n There are " + str(len(osmnx_shape_list)) + 
          " shapes that are from OSMNX routing: \n \t" + 
          str(osmnx_shape_list))
    
    not_routed_list = [x for x in shape_id_list if x not in (shst_shape_list + osmnx_shape_list)]
    
    print("\n There are " + str(len(not_routed_list)) + 
         " shapes that are not routed by either of the two methods: \n \t" + 
         str(not_routed_list))
    
    bus_link_shst_df = pd.merge(bus_link_shst_df,
                                bus_trip_df[['trip_id', 'shape_id']],
                                how = 'inner',
                                left_on = 'shape_id',
                                right_on = 'shape_id')
    #bus_link_shst_df.drop(['pp_shape_id'], axis = 1, inplace = True)
    
    bus_link_df = pd.concat([bus_link_osmnx_df, bus_link_shst_df],
                            sort = False,
                           ignore_index = True)
    
    column_list = bus_link_osmnx.columns.values.tolist()
    
    return bus_link_df[column_list]

In [91]:
bus_link_df = bus_link(bus_osmnx_link_shape_df, bus_shst_link_shape_df, all_routes_df, trip_df, incomplete_shape_list)

Targeting number of bus shape IDs: 1401

 There are 245 shapes that are from shst gtfs matching: 
 	[7.0, 10.0, 11.0, 13.0, 15.0, 16.0, 48.0, 52.0, 53.0, 61.0, 62.0, 65.0, 66.0, 73.0, 74.0, 77.0, 78.0, 84.0, 85.0, 86.0, 87.0, 94.0, 95.0, 97.0, 101.0, 102.0, 104.0, 105.0, 106.0, 107.0, 109.0, 110.0, 111.0, 114.0, 121.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 141.0, 144.0, 145.0, 150.0, 151.0, 171.0, 181.0, 185.0, 186.0, 190.0, 191.0, 196.0, 197.0, 199.0, 201.0, 202.0, 209.0, 210.0, 215.0, 216.0, 226.0, 229.0, 238.0, 241.0, 242.0, 245.0, 246.0, 249.0, 250.0, 251.0, 252.0, 253.0, 254.0, 255.0, 257.0, 258.0, 275.0, 298.0, 301.0, 304.0, 305.0, 306.0, 307.0, 312.0, 313.0, 318.0, 319.0, 323.0, 325.0, 327.0, 335.0, 336.0, 337.0, 350.0, 351.0, 374.0, 574.0, 589.0, 590.0, 591.0, 596.0, 597.0, 598.0, 599.0, 664.0, 737.0, 741.0, 743.0, 747.0, 750.0, 755.0, 759.0, 1156.0, 1158.0, 1166.0, 1167.0, 1222.0, 1224.0, 1225.0, 

In [92]:
bus_link_df.info()
bus_link_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891985 entries, 0 to 891984
Data columns (total 9 columns):
u                  891985 non-null int64
v                  891985 non-null int64
trip_id            891985 non-null int64
shape_id           891985 non-null float64
wayId              891985 non-null object
shstReferenceId    891985 non-null object
shstGeometryId     891985 non-null object
A                  891985 non-null int64
B                  891985 non-null int64
dtypes: float64(1), int64(5), object(3)
memory usage: 61.2+ MB


Unnamed: 0,u,v,trip_id,shape_id,wayId,shstReferenceId,shstGeometryId,A,B
0,1457417136,1457417138,7809,470.0,132500480,bdc3dba6875e8b5e55b462469a47f62b,15f0772c8788e76b2861aa965dc90bc3,3059583,3076350
1,1457417138,4924963339,7809,470.0,31571611,e14edc7080e17188f0e1be9420c4a7c9,a710eb8ba11925df6fd325cd6e2457ca,3076350,3054980
2,4924963339,353276628,7809,470.0,31571611,8737d21c2bb5a9fe5a62601d2886dc8d,f566c92c7cace766bb76918e032ed75d,3054980,3056254


In [131]:
# create rail links
def non_bus_link(stop_times, shapes, routes, trip, stop):
    
    """
    create rail links and nodes
    
    nodes are based on rail stops, links are true shape between nodes
    
    return
    ---------
    complete rail link path for each rail service
    complete rail node path for each rail service
    
    """
    
    print('generating rail links...')
    
    #get rail trips
    trip_df = trip.copy()
    trip_df = pd.merge(trip_df, routes[['route_id', 'route_type']], how = 'left', on = 'route_id')
    rail_trip_df = trip_df[trip_df.route_type != 3].copy()
    
    stop_df = stop.copy()
    stop_time_df = stop_times.copy()
    
    #get rail trips with stops
    chained_stop_to_node_df = pd.merge(stop_time_df, 
                                       stop_df,
                                       how = 'left',
                                       on = 'stop_id')
    
    rail_stop_time_df = chained_stop_to_node_df[
                                                chained_stop_to_node_df['trip_id']\
                                                .isin(rail_trip_df.trip_id.tolist())
                                               ]\
                                                .copy()
    
    #get gtfs rail shapes
    rail_shape_df = shapes[shapes['shape_id'].isin(rail_trip_df.shape_id.tolist())
                                ].copy()
    
    #gtfs shape-trip correspondence
    shape_trip_dict = dict(zip(rail_trip_df.shape_id, rail_trip_df.trip_id))
    
    print(rail_shape_df.shape_id.unique())
    #for each rail shape
    for i in rail_shape_df.shape_id.unique():
    
        trip_id = shape_trip_dict[i]
        
        #get chained stop
        trip_stop_df = rail_stop_time_df[rail_stop_time_df.trip_id == trip_id].copy()
        
        trip_stop_df.sort_values(by = ["stop_sequence"], inplace = True)
        
        # get gtfs shape nodes for the shape
        trip_shape_df = rail_shape_df[rail_shape_df.shape_id == i].copy()
        # initialize columns
        trip_shape_df['is_stop'] = np.int(0)
        trip_shape_df['stop_id'] = np.nan
        
        # for each rail stop, find the closest node in the shape, and those are the stops and breakpoints of new rail links
        # return is a gtfs node shape dataframe with two columns indicating if the node is a stop and the stop id
        shape_inventory = trip_shape_df[['shape_pt_lon', 'shape_pt_lat']].values
        tree = cKDTree(shape_inventory)
        for s in range(len(trip_stop_df)):
            point = trip_stop_df.iloc[s][['stop_lon', 'stop_lat']].values
            dd, ii = tree.query(point, k = 1)
            trip_shape_df.shape_pt_lon.iloc[ii] = trip_stop_df.iloc[s]['stop_lon']
            trip_shape_df.shape_pt_lat.iloc[ii] = trip_stop_df.iloc[s]['stop_lat']
            trip_shape_df.is_stop.iloc[ii] = 1
            trip_shape_df.stop_id.iloc[ii] = trip_stop_df.iloc[s]['stop_id']
        
        # appending the gtfs shape for each route shape id
        if i == rail_shape_df.shape_id.unique()[0]:
            shape_flag_df = trip_shape_df.copy()
        else:
            shape_flag_df = shape_flag_df.append(trip_shape_df, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # starting to build new rail links true shape
    linestring_df = pd.DataFrame(columns = ['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id'])

    # rail links are based on the gtfs shape, with nodes being the shapes that are identified as rail stops.
    for i in shape_flag_df.shape_id.unique():
        # get gtfs shape for shape id
        shape_route_df = shape_flag_df[shape_flag_df.shape_id == i].copy()
        
        # get rail nodes based on the stop flags
        break_list = shape_route_df.index[shape_route_df.is_stop == 1].tolist()
        stop_id_list = shape_route_df[shape_route_df.is_stop == 1]['stop_id'].tolist()
        
        # use the gtfs shape between "stop" shapes to build the rail true shape
        for j in range(len(break_list)-1):
            lon_list = shape_flag_df.shape_pt_lon.iloc[break_list[j]:break_list[j+1]+1].tolist()
            lat_list = shape_flag_df.shape_pt_lat.iloc[break_list[j]:break_list[j+1]+1].tolist()
            linestring = LineString([Point(xy) for xy in zip(lon_list,lat_list)])
            linestring_df = linestring_df.append({'shape_id':i, 
                                                  'u':break_list[j], 
                                                  'v':break_list[j+1],
                                                  'u_stop_id':stop_id_list[j], 
                                                  'v_stop_id':stop_id_list[j+1],
                                                  'geometry' : linestring}, 
                                                 ignore_index = True, 
                                                 sort = False)
    
    # add rail travel time between stops
    stop_time_df = pd.merge(
                            stop_time_df, 
                            rail_trip_df[['trip_id', 'shape_id']], 
                            how = 'left', 
                            on = 'trip_id')
    
    unique_stop_time_df = stop_time_df[
                                        stop_time_df.shape_id.notnull()
                                    ].groupby(['trip_id', 'shape_id'])\
                                    .count().reset_index()\
                                    .drop_duplicates(subset = ['shape_id']).copy()
    
    stop_time_df = stop_time_df[stop_time_df.trip_id.isin(unique_stop_time_df.trip_id.tolist())].copy()

    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id' , 'departure_time']].rename(
                                 columns = {"stop_id" : "u_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'u_stop_id'])
    
    linestring_df = pd.merge(linestring_df, 
                             stop_time_df[['shape_id', 'stop_id', 'arrival_time']].rename(
                                 columns = {"stop_id" : "v_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'v_stop_id'])
    
    # travel time in minutes
    linestring_df['rail_traveltime'] = (linestring_df['arrival_time'] - linestring_df['departure_time'])/60
    
    rail_node_df = shape_flag_df[shape_flag_df.is_stop == 1].rename_axis('node_id').reset_index()

    
    return linestring_df, rail_node_df

In [132]:
%%time
rail_path_link_gdf, rail_path_node_df = non_bus_link(all_stop_times_df, all_shapes_df, all_routes_df, trip_df, stop_df)

generating rail links...
[ 404  405  406  407  408  409  410  411  412  413  414  415  481  483
  485  482  484  668  669  670  671 1018 1019 1020 1021 1022 1024 1026
 1027 1028 1030 1056 1059 1061 1092 1097 1098 1101 1102 1103 1104 1105
 1108 1111 1112 1117 1118 1119 1120 1122 1123 1124 1126 1127 1129 1131
 1132 1134 1905 1907 1908 1917 1919 1920 1924 1926 1929 1930 1137 1138
 1139 1140 1141 1143 1144 1145 1146  416  417  418  419  420  421  486
  487]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

Wall time: 4h 57min 57s


In [133]:
print(rail_path_node_df.columns)
print(rail_path_link_gdf.columns)

print(rail_path_node_df.shape)
print(rail_path_link_gdf.shape)

Index(['node_id', 'shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence',
       'shape_dist_traveled', 'shape_id', 'is_stop', 'stop_id'],
      dtype='object')
Index(['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id',
       'departure_time', 'arrival_time', 'rail_traveltime'],
      dtype='object')
(1413, 8)
(1328, 9)


In [134]:
trip_df[trip_df.agency_raw_name == "Caltrain_2015_5_13"].shape_id

271    482
272    484
273    484
274    484
275    482
276    483
277    483
278    482
279    484
280    485
281    485
282    485
283    484
284    482
285    484
286    484
287    484
288    484
289    482
290    482
291    482
292    481
Name: shape_id, dtype: int32

In [135]:
rail_path_link_gdf[rail_path_link_gdf.shape_id == 484]

Unnamed: 0,shape_id,u,v,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime
280,484,26452,26466,"LINESTRING (-121.903011 37.329239, -121.903406...",6650.0,6646.0,58980.0,,
281,484,26466,26485,"LINESTRING (-121.93608 37.353238, -121.9382321...",6646.0,6644.0,,,
282,484,26485,26495,"LINESTRING (-121.997114 37.370598, -122.000094...",6644.0,6642.0,,,
283,484,26495,26507,"LINESTRING (-122.031372 37.378916, -122.033514...",6642.0,6640.0,,59700.0,
284,484,26507,26517,"LINESTRING (-122.075956 37.394459, -122.078039...",6640.0,6638.0,59700.0,,
285,484,26517,26533,"LINESTRING (-122.107069 37.407323, -122.108348...",6638.0,6636.0,,,
286,484,26533,26546,"LINESTRING (-122.141927 37.429365, -122.143056...",6636.0,6634.0,,60180.0,
287,484,26546,26555,"LINESTRING (-122.164614 37.443475, -122.165468...",6634.0,6632.0,60180.0,60360.0,3.0
288,484,26555,26579,"LINESTRING (-122.182297 37.454856, -122.183450...",6632.0,6630.0,60360.0,60720.0,6.0
289,484,26579,26593,"LINESTRING (-122.231936 37.486159, -122.234916...",6630.0,6628.0,60720.0,,


In [136]:
all_stop_times_df

Unnamed: 0,arrival_time,departure_time,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,start_service_area_id,end_service_area_id,...,end_service_area_radius,continuous_pickup,continuous_drop_off,pickup_area_id,drop_off_area_id,pickup_service_area_radius,drop_off_service_area_radius,last_stop_on_trip,trip_id,stop_id
0,14520.0,14520.0,1,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5444
1,14880.0,14880.0,2,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5440
2,15120.0,15120.0,3,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5423
3,15420.0,15420.0,4,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5443
4,15600.0,15600.0,5,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5455
5,15900.0,15900.0,6,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5433
6,16200.0,16200.0,7,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5442
7,16500.0,16500.0,8,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5448
8,16680.0,16680.0,9,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5436
9,16920.0,16920.0,10,San Francisco Int'l Airport,,,,1.0,,,...,,,,,,,,,5781,5413


In [137]:
ACE_trips_df = trip_df[trip_df.agency_raw_name == "ACE_2017_3_20"]
shape_trip_dict = dict(zip(ACE_trips_df.shape_id, ACE_trips_df.trip_id))

ACE_linestring_df = pd.DataFrame(columns = ['shape_id', 'u', 'v', 'geometry', 'u_stop_id', 'v_stop_id'])

#get chained stop
chained_trip_stop_df = pd.merge(all_stop_times_df, all_stops_df, how = "left", on = "stop_id")
    
for i in ACE_trips_df.shape_id.unique():
    trip_id = shape_trip_dict[i]
    #get chained stop
    trip_stop_df = chained_trip_stop_df[chained_trip_stop_df.trip_id == trip_id].copy()
    
    trip_shape_df = trip_stop_df.copy()
    trip_shape_df["is_stop"] = 1
    trip_shape_df["shape_id"] = i
    
    break_list = trip_shape_df.index[trip_shape_df.is_stop == 1].tolist()
    stop_id_list = trip_shape_df[trip_shape_df.is_stop == 1]['stop_id'].tolist()
    
    for j in range(len(trip_stop_df)-1):
        lon_list = trip_shape_df.stop_lon.iloc[j:j+2].tolist()
        lat_list = trip_shape_df.stop_lat.iloc[j:j+2].tolist()
        linestring = LineString([Point(xy) for xy in zip(lon_list,lat_list)])
        ACE_linestring_df = ACE_linestring_df.append({'shape_id':i, 
                                                  'u':break_list[j], 
                                                  'v':break_list[j+1],
                                                  'u_stop_id':stop_id_list[j], 
                                                  'v_stop_id':stop_id_list[j+1],
                                                  'geometry' : linestring}, 
                                                 ignore_index = True, 
                                                 sort = False)
    if i == ACE_trips_df.shape_id.unique()[0]:
        ACE_rail_node_df = trip_shape_df
    else:
        ACE_rail_node_df = ACE_rail_node_df.append(trip_shape_df, ignore_index = False, sort = False)

stop_time_df = pd.merge(all_stop_times_df, 
                            ACE_trips_df[['trip_id', 'shape_id']], 
                            how = 'left', 
                            on = 'trip_id')

unique_stop_time_df = stop_time_df[stop_time_df.shape_id.notnull()
                                    ].groupby(['trip_id', 'shape_id'])\
                                    .count().reset_index()\
                                    .drop_duplicates(subset = ['shape_id']).copy()

stop_time_df = stop_time_df[stop_time_df.trip_id.isin(unique_stop_time_df.trip_id.tolist())].copy()
        
ACE_linestring_df = pd.merge(ACE_linestring_df, 
                        stop_time_df[['shape_id', 'stop_id' , 'departure_time']].rename(
                                 columns = {"stop_id" : "u_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'u_stop_id'])
    
ACE_linestring_df = pd.merge(ACE_linestring_df, 
                        stop_time_df[['shape_id', 'stop_id', 'arrival_time']].rename(
                                 columns = {"stop_id" : "v_stop_id"}),
                            how = 'left',
                            on = ['shape_id', 'v_stop_id'])
    
# travel time in minutes
ACE_linestring_df['rail_traveltime'] = (ACE_linestring_df['arrival_time'] - ACE_linestring_df['departure_time'])/60

ACE_rail_node_df = ACE_rail_node_df.rename_axis('node_id').reset_index()
ACE_rail_node_df.rename(columns = {"stop_lat" : "shape_pt_lat", 
                                   "stop_lon" : "shape_pt_lon", 
                                   "stop_sequence": "shape_pt_sequence"},
                       inplace = True)

In [138]:
# combine ACE with rest of rail

rail_path_link_with_ACE_gdf = pd.concat([rail_path_link_gdf, ACE_linestring_df], sort = False, ignore_index = True)
rail_path_node_with_ACE_df = pd.concat([rail_path_node_df, ACE_rail_node_df[rail_path_node_df.columns]], 
                                       sort = False, ignore_index = True)

In [139]:
rail_path_link_with_ACE_gdf

Unnamed: 0,shape_id,u,v,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime
0,404,10,278,"LINESTRING (-121.945154 38.018914, -121.9456 3...",5444,5440,14520.0,14880.0,6.0
1,404,278,455,"LINESTRING (-122.024597 38.003275, -122.024759...",5440,5423,14880.0,15120.0,4.0
2,404,455,741,"LINESTRING (-122.029095 37.973737, -122.029177...",5423,5443,15120.0,15420.0,5.0
3,404,741,840,"LINESTRING (-122.056013 37.928403, -122.05647 ...",5443,5455,15420.0,15600.0,3.0
4,404,840,990,"LINESTRING (-122.067423 37.905628, -122.06768 ...",5455,5433,15600.0,15900.0,5.0
5,404,990,1169,"LINESTRING (-122.123801 37.893394, -122.12495 ...",5433,5442,15900.0,16200.0,5.0
6,404,1169,1242,"LINESTRING (-122.1837911 37.87836087, -122.184...",5442,5448,16200.0,16500.0,5.0
7,404,1242,1318,"LINESTRING (-122.251793 37.844601, -122.252371...",5448,5436,16500.0,16680.0,3.0
8,404,1318,1373,"LINESTRING (-122.267227 37.828415, -122.267504...",5436,5413,16680.0,16920.0,4.0
9,404,1373,1379,"LINESTRING (-122.269029 37.80787, -122.269489 ...",5413,5411,16920.0,16980.0,1.0


In [140]:
rail_path_node_with_ACE_df

Unnamed: 0,node_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,shape_id,is_stop,stop_id
0,10,38.018914,-121.945154,10,,404,1,5444.0
1,278,38.003275,-122.024597,278,,404,1,5440.0
2,455,37.973737,-122.029095,455,,404,1,5423.0
3,741,37.928403,-122.056013,741,,404,1,5443.0
4,840,37.905628,-122.067423,840,,404,1,5455.0
5,990,37.893394,-122.123801,990,,404,1,5433.0
6,1169,37.878361,-122.183791,1169,,404,1,5442.0
7,1242,37.844601,-122.251793,1242,,404,1,5448.0
8,1318,37.828415,-122.267227,1319,,404,1,5436.0
9,1373,37.807870,-122.269029,1376,,404,1,5413.0


In [141]:
unique_shape_id_df[unique_shape_id_df.shape_id_original == "132540"]

Unnamed: 0,agency_raw_name,shape_id,shape_id_original
1023,SFMTA_2015_8_11,1024,132540


In [142]:
rail_path_link_with_ACE_gdf[rail_path_link_with_ACE_gdf.shape_id == 1024]

Unnamed: 0,shape_id,u,v,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime
394,1024,27016,27019,"LINESTRING (-122.415307 37.805257, -122.415055...",11358,11359,53760.0,53818.0,0.966667
395,1024,27019,27020,"LINESTRING (-122.414998 37.803758, -122.413978...",11359,11686,53818.0,53861.0,0.716667
396,1024,27020,27022,"LINESTRING (-122.413978 37.80283, -122.412912 ...",11686,10277,53861.0,53911.0,0.833333
397,1024,27022,27024,"LINESTRING (-122.412924 37.80183, -122.412763 ...",10277,10275,53911.0,53944.0,0.55
398,1024,27024,27026,"LINESTRING (-122.412729 37.800902, -122.412535...",10275,10285,53944.0,53977.0,0.55
399,1024,27026,27029,"LINESTRING (-122.412546 37.799974, -122.41242 ...",10285,10279,53977.0,54011.0,0.566667
400,1024,27029,27030,"LINESTRING (-122.412352 37.79902, -122.412192 ...",10279,10287,54011.0,54039.0,0.466667
401,1024,27030,27031,"LINESTRING (-122.412192 37.798243, -122.412009...",10287,10272,54039.0,54071.0,0.533333
402,1024,27031,27033,"LINESTRING (-122.412009 37.797351, -122.41186 ...",10272,10283,54071.0,54115.0,0.733333
403,1024,27033,27035,"LINESTRING (-122.411769 37.79612, -122.411631 ...",10283,10281,54115.0,54134.0,0.316667


In [143]:
unique_stop_id_df[unique_stop_id_df.stop_id_original == "6644"]

Unnamed: 0,agency_raw_name,stop_id,stop_id_original
11357,SFMTA_2015_8_11,11358,6644


In [144]:
rail_path_node_with_ACE_df[rail_path_node_with_ACE_df.stop_id == 11358]

Unnamed: 0,node_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,shape_id,is_stop,stop_id
420,27016,37.805257,-122.415307,1,0.0,1024,1,11358.0


In [145]:
def combine_bus_and_rail_shape(rail_path_link, rail_path_node, link, node, shape):
    
    """
    add only unique rail links and nodes to roadway standard
    
    parameter
    -----------
    complete rail link path
    complete rail node path
    all roadway links
    all roadway nodes
    all roadway shapes
    
    return
    -----------
    all roadway and rail links
    all roadway and rail nodes
    all roadway and rail shapes
    unique rail links
    unique rail nodes
    complete rail link path with updated link ID
    
    """
    
    print('indexing rail links and nodes...')
    
    node_gdf = node.copy()
    link_df = link.copy()
    shape_gdf = shape.copy()
    
    # add unique rail nodes to roadway node dataframe
    rail_path_node_gdf = rail_path_node.copy()
    
    unique_rail_node_df = rail_path_node_gdf.drop_duplicates(['shape_pt_lat', 'shape_pt_lon']).copy()
    
    # http://bayareametro.github.io/travel-model-two/input/#roadway-network
    TAP_start_number = 90001 
    
    unique_rail_node_df['model_node_id'] = range(TAP_start_number, TAP_start_number + len(unique_rail_node_df))
    
    rail_path_node_gdf = pd.merge(rail_path_node_gdf, 
                            unique_rail_node_df[['shape_pt_lat', 'shape_pt_lon', 'model_node_id']], 
                            how = 'left', 
                            on = ['shape_pt_lat', 'shape_pt_lon'])
    
    # get unique rail nodes
    unique_rail_node_df['geometry'] = [Point(xy) for xy in zip(unique_rail_node_df.shape_pt_lon, 
                                                               unique_rail_node_df.shape_pt_lat)]
    
    unique_rail_node_df = gpd.GeoDataFrame(unique_rail_node_df)
    unique_rail_node_df.crs = {'init' : 'epsg:4326'}
    unique_rail_node_df = unique_rail_node_df.to_crs(node_gdf.crs)
    
    unique_rail_node_df['rail_only'] = int(1)
    unique_rail_node_df["walk_access"] = int(1)
    
    # combine rail nodes and roadway nodes
    node_gdf["rail_only"] = int(0)
    
    rail_node_columns = ["model_node_id", "geometry", "rail_only", "walk_access"]
    
    roadway_and_rail_node_gdf = node_gdf.append(unique_rail_node_df[rail_node_columns],
                                                ignore_index = True, 
                                                sort = False)
    
    
    rail_node_osmid_dict = dict(zip(rail_path_node_gdf.node_id, rail_path_node_gdf.model_node_id))
    
    rail_path_link_df = rail_path_link.copy()
    
    rail_path_link_df['A'] = rail_path_link_df.u.map(rail_node_osmid_dict)
    rail_path_link_df['B'] = rail_path_link_df.v.map(rail_node_osmid_dict)
    
    rail_path_link_df.drop(["u", "v"], axis = 1, inplace = True)
    
    rail_path_link_df = gpd.GeoDataFrame(rail_path_link_df)
    rail_path_link_df.crs = {'init' : 'epsg:4326'}
    
    # get unique rail links
    unique_rail_link_gdf = rail_path_link_df.drop_duplicates(['A', 'B']).copy()
    
    # fake rail link shst geom id
    unique_rail_link_gdf['shstGeometryId'] = range(1, 1 + len(unique_rail_link_gdf))
    unique_rail_link_gdf['shstGeometryId'] = unique_rail_link_gdf.shstGeometryId.apply(lambda x:'rail'+str(x))
    unique_rail_link_gdf['id'] = unique_rail_link_gdf['shstGeometryId']

    unique_rail_link_gdf['rail_only'] = int(1)
    
    rail_path_link_df = pd.merge(rail_path_link_df,
                                unique_rail_link_gdf[["A", "B", "shstGeometryId"]],
                                how = "left",
                                on = ["A", "B"])
    
    rail_link_columns = ['A', 'B', "shstGeometryId", "rail_traveltime", "rail_only", "id"]
    rail_shape_columns = ["id", "geometry"]
    
    # combine rail and roadway links
    roadway_and_rail_link_df = link_df.append(unique_rail_link_gdf[rail_link_columns], 
                                              ignore_index = True, 
                                              sort = False)
    
    # combine rail and roadway shapes
    roadway_and_rail_shape_gdf = shape_gdf.append(unique_rail_link_gdf[rail_shape_columns],
                                                 ignore_index = True,
                                                 sort = False)
    
    """rail_path_link_df = pd.merge(rail_path_link_df[['shape_id', 'geometry', 'u_stop_id', 'v_stop_id']],
                            unique_rail_shape_gdf.drop(['geometry', 'shape_id'], axis = 1),
                            how = 'left',
                            on = ['u_stop_id', 'v_stop_id'])"""
    
    rail_path_link_df = rail_path_link_df.to_crs({'init' : 'epsg:4326'})
        
    return roadway_and_rail_link_df, roadway_and_rail_node_gdf, roadway_and_rail_shape_gdf, \
                unique_rail_link_gdf, unique_rail_node_df, \
                rail_path_link_df

In [146]:
roadway_and_rail_link_df, roadway_and_rail_node_gdf, roadway_and_rail_shape_gdf, unique_rail_link_gdf, unique_rail_node_gdf, \
                                            rail_link_gdf = combine_bus_and_rail_shape(
                                                                                      rail_path_link_with_ACE_gdf, 
                                                                                      rail_path_node_with_ACE_df,
                                                                                      link_df, 
                                                                                      node_gdf,
                                                                                      shape_gdf)

indexing rail links and nodes...


In [147]:
shape_gdf.shape

(868567, 6)

In [148]:
unique_rail_link_gdf.shape

(739, 12)

In [149]:
link_df.id.nunique()

868567

In [150]:
roadway_and_rail_shape_gdf.id.nunique()

869306

In [151]:
roadway_and_rail_link_df.id.nunique()

869306

In [152]:
roadway_and_rail_link_df.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'length', 'model_link_id', 'county_numbering_start', 'A', 'B',
       'rail_traveltime', 'rail_only'],
      dtype='object')

In [153]:
unique_rail_node_gdf.shape

(664, 12)

In [154]:
unique_rail_link_gdf.shape

(739, 12)

In [155]:
print(link_df.shape)
print(node_gdf.shape)
print(shape_gdf.shape)

(1632702, 36)
(643811, 9)
(868567, 6)


In [156]:
print(roadway_and_rail_node_gdf.shape)
print(roadway_and_rail_link_df.shape)
print(roadway_and_rail_shape_gdf.shape)

(644475, 10)
(1633441, 38)
(869306, 6)


In [157]:
unique_rail_node_gdf

Unnamed: 0,node_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,shape_id,is_stop,stop_id,model_node_id,geometry,rail_only,walk_access
0,10,38.018914,-121.945154,10,,404,1,5444.0,90001,POINT (-121.945154 38.018914),1,1
1,278,38.003275,-122.024597,278,,404,1,5440.0,90002,POINT (-122.024597 38.003275),1,1
2,455,37.973737,-122.029095,455,,404,1,5423.0,90003,POINT (-122.029095 37.973737),1,1
3,741,37.928403,-122.056013,741,,404,1,5443.0,90004,POINT (-122.056013 37.928403),1,1
4,840,37.905628,-122.067423,840,,404,1,5455.0,90005,POINT (-122.067423 37.905628),1,1
5,990,37.893394,-122.123801,990,,404,1,5433.0,90006,POINT (-122.123801 37.893394),1,1
6,1169,37.878361,-122.183791,1169,,404,1,5442.0,90007,POINT (-122.1837911 37.87836087),1,1
7,1242,37.844601,-122.251793,1242,,404,1,5448.0,90008,POINT (-122.251793 37.844601),1,1
8,1318,37.828415,-122.267227,1319,,404,1,5436.0,90009,POINT (-122.267227 37.828415),1,1
9,1373,37.807870,-122.269029,1376,,404,1,5413.0,90010,POINT (-122.269029 37.80787),1,1


In [158]:
unique_rail_link_gdf

Unnamed: 0,shape_id,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime,A,B,shstGeometryId,id,rail_only
0,404,"LINESTRING (-121.945154 38.018914, -121.9456 3...",5444,5440,14520.0,14880.0,6.0,90001,90002,rail1,rail1,1
1,404,"LINESTRING (-122.024597 38.003275, -122.024759...",5440,5423,14880.0,15120.0,4.0,90002,90003,rail2,rail2,1
2,404,"LINESTRING (-122.029095 37.973737, -122.029177...",5423,5443,15120.0,15420.0,5.0,90003,90004,rail3,rail3,1
3,404,"LINESTRING (-122.056013 37.928403, -122.05647 ...",5443,5455,15420.0,15600.0,3.0,90004,90005,rail4,rail4,1
4,404,"LINESTRING (-122.067423 37.905628, -122.06768 ...",5455,5433,15600.0,15900.0,5.0,90005,90006,rail5,rail5,1
5,404,"LINESTRING (-122.123801 37.893394, -122.12495 ...",5433,5442,15900.0,16200.0,5.0,90006,90007,rail6,rail6,1
6,404,"LINESTRING (-122.1837911 37.87836087, -122.184...",5442,5448,16200.0,16500.0,5.0,90007,90008,rail7,rail7,1
7,404,"LINESTRING (-122.251793 37.844601, -122.252371...",5448,5436,16500.0,16680.0,3.0,90008,90009,rail8,rail8,1
8,404,"LINESTRING (-122.267227 37.828415, -122.267504...",5436,5413,16680.0,16920.0,4.0,90009,90010,rail9,rail9,1
9,404,"LINESTRING (-122.269029 37.80787, -122.269489 ...",5413,5411,16920.0,16980.0,1.0,90010,90011,rail10,rail10,1


In [159]:
roadway_and_rail_node_gdf

Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry,rail_only
0,2.401245e+09,505d64eb98f1da8d812a3b3801034308,Contra Costa,1.0,1,1.0,3000000,3000000.0,POINT (-122.3315542 37.9812044),0
1,5.783907e+07,473979c78435732f01ca5a168afb62e0,Contra Costa,1.0,1,1.0,3000001,3000000.0,POINT (-121.94477 37.953322),0
2,1.024389e+09,fc7b575d5d8c961d4a70fca846ae7f80,Marin,1.0,1,1.0,5000000,5000000.0,POINT (-122.5398278 37.8979989),0
3,6.556143e+07,6c60cf34e9dc3e123eefb829fe80c76a,Santa Clara,1.0,1,1.0,2000000,2000000.0,POINT (-122.031897 37.2673855),0
4,4.545576e+09,013e1f994fd86c1f226098f8364f7286,Santa Clara,1.0,1,1.0,2000001,2000000.0,POINT (-122.0118628 37.3784474),0
5,3.377850e+09,c180e4b9ba43a15fb576f5decab10cd0,Santa Clara,1.0,1,1.0,2000002,2000000.0,POINT (-122.0324237 37.2882627),0
6,4.305402e+09,e295e0bcdd7cb308302cd2ab782033b1,Alameda,1.0,1,1.0,2500000,2500000.0,POINT (-122.0877569 37.6706708),0
7,5.021129e+09,a3d4daf5130420a2c27d87892b718574,Santa Clara,0.0,1,1.0,2000003,2000000.0,POINT (-121.3351831 37.01963780000001),0
8,4.925259e+09,2003d46b59a6ff6d782d7e13623962d9,Contra Costa,1.0,1,1.0,3000002,3000000.0,POINT (-122.0267767 37.930987),0
9,6.530854e+07,55d370526a55a5d348e23751aad86ac0,San Francisco,1.0,1,1.0,1000000,1000000.0,POINT (-122.4888978 37.77797810000001),0


In [160]:
county_file = "../../data/external/county_boundaries/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"

county_gdf = gpd.read_file(county_file)

county_gdf = county_gdf.to_crs(shape_gdf.crs)

In [161]:
unique_rail_link_gdf.crs

{'init': 'epsg:4326'}

In [162]:
# re-number rail nodes and links

# use nearest match for nodes that did not get county match (e.g. in the Bay)
node_county_matched_gdf = node_gdf.copy()
node_county_unmatched_gdf = unique_rail_node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

node_county_unmatched_gdf = node_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
node_county_unmatched_gdf['X'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
node_county_unmatched_gdf['Y'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

node_county_rematch_gdf = pd.DataFrame()

for i in range(len(node_county_unmatched_gdf)):
    point = node_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['model_node_id'] = node_county_unmatched_gdf.iloc[i]['model_node_id']
    
    if i == 0:
        node_county_rematch_gdf = add_snap_gdf.copy()
    else:
        node_county_rematch_gdf = node_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)
        
        
# use nearest for links that did not get county match
node_county_matched_gdf = node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

shape_county_unmatched_gdf = unique_rail_link_gdf.copy()
shape_county_unmatched_gdf["geometry"] = unique_rail_link_gdf.centroid

shape_county_unmatched_gdf = shape_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
shape_county_unmatched_gdf["geometry"] = shape_county_unmatched_gdf["geometry"].centroid
shape_county_unmatched_gdf['X'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
shape_county_unmatched_gdf['Y'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

shape_county_rematch_gdf = pd.DataFrame()

for i in range(len(shape_county_unmatched_gdf)):
    point = shape_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['id'] = shape_county_unmatched_gdf.iloc[i]['id']
    
    if i == 0:
        shape_county_rematch_gdf = add_snap_gdf.copy()
    else:
        shape_county_rematch_gdf = shape_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

In [163]:
node_gdf.county.value_counts()

Santa Clara      192799
Alameda          125942
Contra Costa      97273
San Mateo         56363
Sonoma            56146
Solano            47318
San Francisco     27608
Marin             26529
Napa              13833
Name: county, dtype: int64

In [164]:
node_county_rematch_gdf
shape_county_rematch_gdf

Unnamed: 0,county,id
0,Contra Costa,rail1
1,Contra Costa,rail2
2,Contra Costa,rail3
3,Contra Costa,rail4
4,Contra Costa,rail5
5,Contra Costa,rail6
6,Alameda,rail7
7,Alameda,rail8
8,Alameda,rail9
9,Alameda,rail10


In [165]:
# get the last node and link number of counties

county_last_node_id_df = node_gdf.groupby("county")["model_node_id"].max().reset_index().rename(
    columns = {"model_node_id" : "county_last_id"})

county_last_link_id_df = link_df.groupby("county")["model_link_id"].max().reset_index().rename(
    columns = {"model_link_id" : "county_last_id"})

node_county_rematch_gdf = pd.merge(
    node_county_rematch_gdf.rename(columns = {"model_node_id" : "rail_node_id"}),
    county_last_node_id_df,
    how = "left",
    on = "county"
)

node_county_rematch_gdf["model_node_id"] = node_county_rematch_gdf.groupby(["county"]).cumcount() + 1

node_county_rematch_gdf["model_node_id"] = node_county_rematch_gdf["model_node_id"] + node_county_rematch_gdf["county_last_id"]

shape_county_rematch_gdf = pd.merge(
    shape_county_rematch_gdf,
    county_last_link_id_df,
    how = "left",
    on = "county"
)

shape_county_rematch_gdf["model_link_id"] = shape_county_rematch_gdf.groupby(["county"]).cumcount() + 1

shape_county_rematch_gdf["model_link_id"] = shape_county_rematch_gdf["model_link_id"] + shape_county_rematch_gdf["county_last_id"]

In [166]:
rail_node_id_dict = dict(zip(node_county_rematch_gdf.rail_node_id, node_county_rematch_gdf.model_node_id))
rail_node_county_dict = dict(zip(node_county_rematch_gdf.model_node_id, node_county_rematch_gdf.county))
rail_link_id_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.model_link_id))
rail_link_county_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.county))

In [167]:
unique_rail_node_gdf["model_node_id"] = unique_rail_node_gdf["model_node_id"].map(rail_node_id_dict)
unique_rail_node_gdf["county"] = unique_rail_node_gdf["model_node_id"].map(rail_node_county_dict)
unique_rail_link_gdf["model_link_id"] = unique_rail_link_gdf["id"].map(rail_link_id_dict)
unique_rail_link_gdf["county"] = unique_rail_link_gdf["id"].map(rail_link_county_dict)

In [168]:
unique_rail_link_gdf["A"] = unique_rail_link_gdf["A"].map(rail_node_id_dict)
unique_rail_link_gdf["B"] = unique_rail_link_gdf["B"].map(rail_node_id_dict)

In [169]:
rail_link_gdf["A"] = rail_link_gdf["A"].map(rail_node_id_dict)
rail_link_gdf["B"] = rail_link_gdf["B"].map(rail_node_id_dict)

In [170]:
rail_node_columns = ["model_node_id", "geometry", "rail_only", "walk_access", "county"]
    
roadway_and_rail_node_gdf = node_gdf.append(unique_rail_node_gdf[rail_node_columns],
                                            ignore_index = True, 
                                            sort = False)

roadway_and_rail_node_gdf["rail_only"].fillna(0, inplace = True)
roadway_and_rail_node_gdf["rail_only"] = roadway_and_rail_node_gdf["rail_only"].astype(int)

rail_link_columns = ['A', 'B', "shstGeometryId", "rail_traveltime","rail_only", "id", "model_link_id", "county"]
rail_shape_columns = ["id", "geometry"]
    
# combine rail and roadway links

roadway_and_rail_link_df = link_df.append(unique_rail_link_gdf[rail_link_columns], 
                                              ignore_index = True, 
                                              sort = False)

roadway_and_rail_link_df["rail_only"].fillna(0, inplace = True)
roadway_and_rail_link_df["rail_only"] = roadway_and_rail_link_df["rail_only"].astype(int)
    
# combine rail and roadway shapes
roadway_and_rail_shape_gdf = shape_gdf.append(unique_rail_link_gdf[rail_shape_columns],
                                                 ignore_index = True,
                                                 sort = False)

In [171]:
unique_rail_node_gdf

Unnamed: 0,node_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,shape_id,is_stop,stop_id,model_node_id,geometry,rail_only,walk_access,county
0,10,38.018914,-121.945154,10,,404,1,5444.0,3097273,POINT (-121.945154 38.018914),1,1,Contra Costa
1,278,38.003275,-122.024597,278,,404,1,5440.0,3097274,POINT (-122.024597 38.003275),1,1,Contra Costa
2,455,37.973737,-122.029095,455,,404,1,5423.0,3097275,POINT (-122.029095 37.973737),1,1,Contra Costa
3,741,37.928403,-122.056013,741,,404,1,5443.0,3097276,POINT (-122.056013 37.928403),1,1,Contra Costa
4,840,37.905628,-122.067423,840,,404,1,5455.0,3097277,POINT (-122.067423 37.905628),1,1,Contra Costa
5,990,37.893394,-122.123801,990,,404,1,5433.0,3097278,POINT (-122.123801 37.893394),1,1,Contra Costa
6,1169,37.878361,-122.183791,1169,,404,1,5442.0,3097279,POINT (-122.1837911 37.87836087),1,1,Contra Costa
7,1242,37.844601,-122.251793,1242,,404,1,5448.0,2625942,POINT (-122.251793 37.844601),1,1,Alameda
8,1318,37.828415,-122.267227,1319,,404,1,5436.0,2625943,POINT (-122.267227 37.828415),1,1,Alameda
9,1373,37.807870,-122.269029,1376,,404,1,5413.0,2625944,POINT (-122.269029 37.80787),1,1,Alameda


In [172]:
roadway_and_rail_node_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 644475 entries, 0 to 644474
Data columns (total 10 columns):
osm_node_id               643811 non-null float64
shst_node_id              643811 non-null object
county                    644475 non-null object
drive_access              643811 non-null float64
walk_access               644475 non-null int64
bike_access               643811 non-null float64
model_node_id             644475 non-null int64
county_numbering_start    643811 non-null float64
geometry                  644475 non-null object
rail_only                 644475 non-null int32
dtypes: float64(4), int32(1), int64(2), object(3)
memory usage: 46.7+ MB


In [173]:
roadway_and_rail_node_gdf.model_node_id.nunique()

644475

In [174]:
roadway_and_rail_link_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1633441 entries, 0 to 1633440
Data columns (total 38 columns):
access                    1632702 non-null object
area                      1632702 non-null object
bike_access               1632702 non-null float64
bridge                    1632702 non-null object
drive_access              1632702 non-null float64
est_width                 1632702 non-null object
fromIntersectionId        1632702 non-null object
highway                   1632702 non-null object
id                        1633441 non-null object
junction                  1632702 non-null object
key                       1632702 non-null object
landuse                   1632702 non-null object
lanes                     1632702 non-null object
link                      1632702 non-null object
maxspeed                  1632702 non-null object
name                      1632702 non-null object
oneWay                    1632702 non-null object
ref                       1632702 n

In [175]:
roadway_and_rail_link_df.rail_only.value_counts()

0    1632702
1        739
Name: rail_only, dtype: int64

In [176]:
roadway_and_rail_link_df.model_link_id.nunique()

1633441

In [177]:
roadway_and_rail_shape_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 869306 entries, 0 to 869305
Data columns (total 6 columns):
id                    869306 non-null object
fromIntersectionId    868567 non-null object
toIntersectionId      868567 non-null object
forwardReferenceId    868567 non-null object
backReferenceId       868567 non-null object
geometry              869306 non-null object
dtypes: object(6)
memory usage: 39.8+ MB


In [178]:
def create_freq_table(trip_df):
    
    """
    create frequency table for network standard
    """
    
    print('creating frequency reference...')
    
    # calculate EA and NT frequency using 5-6am, and 7-10pm
    tod_numhours_dict = {"AM" : 4, "MD" : 5, "PM" :4, "NT" : 3, "EA" : 1}
    
    freq_df = trip_df[['trip_id', 'tod', 'direction_id', 'trip_num']].copy()
    freq_df['headway_secs'] = freq_df.tod.map(tod_numhours_dict)
    freq_df['headway_secs'] = freq_df.apply(lambda x: int(x.headway_secs * 60 * 60 / x.trip_num),
                                           axis = 1)
    
    freq_enum_list = {'start_time' : {'AM' : '06:00:00', 
                                      'MD' : '10:00:00',
                                      "PM" : "15:00:00",
                                      "NT" : "19:00:00",
                                      "EA" : "03:00:00"},
                      'end_time' : {'AM' : '10:00:00', 
                                    'MD' : '15:00:00',
                                    "PM" : "19:00:00",
                                    "NT" : "03:00:00",
                                    "EA" : "06:00:00"}}
    
    freq_df['start_time'] = freq_df.tod.map(freq_enum_list.get("start_time"))
    freq_df['end_time'] = freq_df.tod.map(freq_enum_list.get("end_time"))
    
    return freq_df

In [179]:
freq_df = create_freq_table(trip_df)

creating frequency reference...


In [180]:
freq_df

Unnamed: 0,trip_id,tod,direction_id,trip_num,headway_secs,start_time,end_time
0,5781,EA,0.0,4.0,900,03:00:00,06:00:00
1,5877,AM,0.0,29.0,496,06:00:00,10:00:00
2,6391,MD,0.0,20.0,900,10:00:00,15:00:00
3,6619,PM,0.0,18.0,800,15:00:00,19:00:00
4,6765,NT,0.0,10.0,1080,19:00:00,03:00:00
5,5775,EA,0.0,4.0,900,03:00:00,06:00:00
6,5871,AM,0.0,16.0,900,06:00:00,10:00:00
7,6229,MD,0.0,20.0,900,10:00:00,15:00:00
8,6469,PM,0.0,16.0,900,15:00:00,19:00:00
9,6639,NT,0.0,8.0,1350,19:00:00,03:00:00


In [181]:
# create new shape with complete node list the route passes
def create_new_node_shape(node, bus_link, rail_link = pd.DataFrame(columns = ["u", "v", "shape_id", "A", "B"])):
    
    """
    create complete node lists each transit traverses to replace the gtfs shape.txt
    """
    bus_link_df = bus_link.copy()
    bus_trip_list_with_unique_shape_id = bus_link_df.drop_duplicates(subset = ["shape_id"]).trip_id.tolist()
    
    bus_link_df = bus_link_df[bus_link_df.trip_id.isin(bus_trip_list_with_unique_shape_id)].copy()
    
    shape_link_df = pd.concat([bus_link_df[["u", "v", 'shape_id', "A", "B"]]
                                , rail_link[['shape_id', "A", "B"]]],
                               sort = False,
                               ignore_index = True)
    
    shape_link_df.u = shape_link_df.u.fillna(0).astype(np.int64)
    shape_link_df.v = shape_link_df.v.fillna(0).astype(np.int64)

    shape_point_df = gpd.GeoDataFrame()
    
    for shape_id in shape_link_df.shape_id.unique():
        shape_df = shape_link_df[shape_link_df.shape_id == shape_id]
        point_df = pd.DataFrame(data = {"shape_id" : shape_id,
                                         "shape_osm_node_id" : shape_df.u.tolist() + [shape_df.v.iloc[-1]],
                                        "shape_model_node_id" : shape_df.A.tolist() + [shape_df.B.iloc[-1]],
                                       "shape_pt_sequence" : range(1, 1+len(shape_df)+1)})
   
        shape_point_df = pd.concat([shape_point_df,
                                   point_df],
                                  sort = False,
                                  ignore_index = True)

    shape_point_df = pd.merge(shape_point_df,
                             node[["osm_node_id", "shst_node_id", "model_node_id", "geometry"]],
                             how = "left",
                             left_on = "shape_model_node_id",
                             right_on = "model_node_id")
    
    shape_point_df.crs = {'init' : 'epsg:4326'}
    #shape_point_df = shape_point_df.to_crs(epsg = 4326)
    
    print(shape_point_df[shape_point_df.geometry.isnull()])
    
    shape_point_df["shape_pt_lat"] = shape_point_df.geometry.map(lambda g:g.y)
    shape_point_df["shape_pt_lon"] = shape_point_df.geometry.map(lambda g:g.x)
    
    shape_point_df["shape_id"] = shape_point_df["shape_id"].astype(int)
    
    shape_point_df.rename(columns = {"shst_node_id":"shape_shst_node_id"}, inplace = True)
        
    return shape_point_df[["shape_id", "shape_pt_sequence", "shape_osm_node_id", "shape_shst_node_id", "shape_model_node_id"]]

In [182]:
bus_link_df

Unnamed: 0,u,v,trip_id,shape_id,wayId,shstReferenceId,shstGeometryId,A,B
0,1457417136,1457417138,7809,470.0,132500480,bdc3dba6875e8b5e55b462469a47f62b,15f0772c8788e76b2861aa965dc90bc3,3059583,3076350
1,1457417138,4924963339,7809,470.0,31571611,e14edc7080e17188f0e1be9420c4a7c9,a710eb8ba11925df6fd325cd6e2457ca,3076350,3054980
2,4924963339,353276628,7809,470.0,31571611,8737d21c2bb5a9fe5a62601d2886dc8d,f566c92c7cace766bb76918e032ed75d,3054980,3056254
3,353276628,57879658,7809,470.0,7875248,d27338aecf12912296070f62fb642946,ccd62fe779122ac6c6c7a523c9369098,3056254,3057568
4,57879658,848729957,7809,470.0,448246247,903ab2c6f77b2e847d7c6f3ff6776c26,1ed198f66cbaf473179816c5005cd0c8,3057568,3060385
5,848729957,5480792155,7809,470.0,448246247,db96abb1593412bd6f828a353184db76,6c1907326ae4bacb689978859f9992fe,3060385,3043295
6,5480792155,1456102969,7809,470.0,448246247,ad8ade3ba10dadfd9f0e5a4512de2407,5bb9fc4ff93e88a8d11aa27ecf064b89,3043295,3042944
7,1456102969,1456102998,7809,470.0,448246247,628453e140b501d71fd91812bf1c589d,f5a6b7db764ac16d7633016be78f706e,3042944,3004569
8,1456102998,1457024295,7809,470.0,448246247,090cc82070e5ee4427f3743f251770a3,ec38601f00fab5c08bb8f3346807fa36,3004569,3048347
9,1457024295,57859934,7809,470.0,448246247,7257b3e831f072d916df0e0f2371ee62,ade1896ba78bddcc03c192e42ea2fda8,3048347,3026227


In [183]:
rail_link_gdf

Unnamed: 0,shape_id,geometry,u_stop_id,v_stop_id,departure_time,arrival_time,rail_traveltime,A,B,shstGeometryId
0,404,"LINESTRING (-121.945154 38.018914, -121.9456 3...",5444,5440,14520.0,14880.0,6.0,3097273,3097274,rail1
1,404,"LINESTRING (-122.024597 38.003275, -122.024759...",5440,5423,14880.0,15120.0,4.0,3097274,3097275,rail2
2,404,"LINESTRING (-122.029095 37.973737, -122.029177...",5423,5443,15120.0,15420.0,5.0,3097275,3097276,rail3
3,404,"LINESTRING (-122.056013 37.928403, -122.05647 ...",5443,5455,15420.0,15600.0,3.0,3097276,3097277,rail4
4,404,"LINESTRING (-122.067423 37.905628, -122.06768 ...",5455,5433,15600.0,15900.0,5.0,3097277,3097278,rail5
5,404,"LINESTRING (-122.123801 37.893394, -122.12495 ...",5433,5442,15900.0,16200.0,5.0,3097278,3097279,rail6
6,404,"LINESTRING (-122.1837911 37.87836087, -122.184...",5442,5448,16200.0,16500.0,5.0,3097279,2625942,rail7
7,404,"LINESTRING (-122.251793 37.844601, -122.252371...",5448,5436,16500.0,16680.0,3.0,2625942,2625943,rail8
8,404,"LINESTRING (-122.267227 37.828415, -122.267504...",5436,5413,16680.0,16920.0,4.0,2625943,2625944,rail9
9,404,"LINESTRING (-122.269029 37.80787, -122.269489 ...",5413,5411,16920.0,16980.0,1.0,2625944,2625945,rail10


In [184]:
shape_point_df = create_new_node_shape(roadway_and_rail_node_gdf, bus_link_df, rail_link_gdf)



Empty DataFrame
Columns: [shape_id, shape_osm_node_id, shape_model_node_id, shape_pt_sequence, osm_node_id, shst_node_id, model_node_id, geometry]
Index: []


In [185]:
shape_point_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333418 entries, 0 to 333417
Data columns (total 5 columns):
shape_id               333418 non-null int32
shape_pt_sequence      333418 non-null int64
shape_osm_node_id      333418 non-null int64
shape_shst_node_id     331985 non-null object
shape_model_node_id    333418 non-null int64
dtypes: int32(1), int64(3), object(1)
memory usage: 14.0+ MB


In [186]:
shape_point_df.shape_id.nunique()

1484

In [187]:
all_routes_df.agency_raw_name.value_counts()

ACTransit_2015_8_14                              152
SFMTA_2015_8_11                                   81
VTA_2015_8_27                                     80
SamTrans_2015_8_20                                75
GGTransit_2015_9_3                                38
CCTA_2015_8_11                                    30
SonomaCounty_2015_8_18                            29
Marguerite_2016_10_10                             29
SantaRosa_google_transit_08_28_15                 17
Fairfield_2015_10_14                              16
Soltrans_2016_5_20                                14
TriDelta-GTFS-2018-05-24_21-43-17                 14
westcat-ca-us_9_17_2015                           14
Vine_GTFS_PLUS_2015                               13
petalumatransit-petaluma-ca-us__11_12_15          11
Union_City_Transit_Aug-01-2015 to Jun-30-2017      9
MarinTransit_2015_8_31                             8
Emeryville_2016_10_26                              7
SF_Bay_Ferry2016_07_01                        

In [188]:
def write_out_transit_standard(trip, stop, shape_point, freq, stop_times, routes, trips, rail_node = None):
    
    shape_point_df = shape_point.copy()
    trip_df = trip.copy()
    
    #trip_df = pd.merge(trip_df, routes[["route_id", "agency_raw_name"]], how = "left", on = ["route_id"])
    
    trip_df = trip_df[~ trip_df.agency_raw_name.isin(["Petaluma_2016_5_22", "WestCAT_2016_5_26", "GGFerries_2017_3_18"])].copy()
    
    trip_df["shape_id"] = trip_df["shape_id"].astype(int)
    
    trip_df = trip_df[trip_df.shape_id.isin(shape_point_df.shape_id.unique().tolist())]
    
    final_trip_list = trip_df.trip_id.unique().tolist()
    
    freq_df = freq.copy()
    freq_df = freq_df[freq_df.trip_id.isin(final_trip_list)]
    
    stop_df = stop.copy()
    
    if len(rail_node) > 0:
        rail_node_df = rail_node.copy()
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist() 
                                               else x.model_node_id,
                                                axis = 1)
        stop_df['osm_node_id'] = stop_df.apply(lambda x: ""
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.osm_node_id,
                                                axis = 1)
        stop_df['shst_node_id'] = stop_df.apply(lambda x: '' 
                                                if x.stop_id in rail_node_df.stop_id.tolist() 
                                                else x.shst_node_id,
                                                axis = 1)
    

    stop_times_df = stop_times.copy()
    stop_times_df = stop_times_df[stop_times_df.trip_id.isin(final_trip_list)]
    
    # update time to relative time for frequency based transit system
    stop_times_df['first_arrival'] = stop_times_df.groupby(['trip_id'])['arrival_time'].transform(min)
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'] - stop_times_df['first_arrival']
    stop_times_df['departure_time'] = stop_times_df['departure_time'] - stop_times_df['first_arrival']
    
    stop_times_df['arrival_time'] = stop_times_df['arrival_time'].apply(
        lambda x : time.strftime('%H:%M:%S', time.gmtime(x)) if ~np.isnan(x) else x)
    stop_times_df['departure_time'] = stop_times_df['departure_time'].apply(
        lambda x : time.strftime('%H:%M:%S', time.gmtime(x)) if ~np.isnan(x) else x)

    
    stop_times_df.drop(['first_arrival'], axis = 1, inplace = True)
    
    route_df = routes.copy()
    route_df = route_df[route_df.route_id.isin(trip_df.route_id.tolist())]
    
    route_df.to_csv(output_folder + "routes.txt", 
                    index = False, 
                    sep = ',')
   
    shape_point_df.to_csv(output_folder + "shapes.txt", 
                          index = False, 
                          sep = ',')
  
    trip_df[trips.columns.values].to_csv(output_folder + "trips.txt", 
                                              index = False, 
                                              sep = ',')
  
    freq_df[['trip_id', 'headway_secs', 'start_time', 'end_time']].to_csv(output_folder + "frequencies.txt", 
                                                index = False, 
                                                sep = ',')
    
    stop_df.to_csv(output_folder + "stops.txt", 
                   index = False, 
                   sep = ',')
   
    stop_times_df.to_csv(output_folder + "stop_times.txt", 
                         index = False, 
                         sep = ',')


In [189]:
stop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21260 entries, 0 to 21259
Data columns (total 18 columns):
stop_name              21260 non-null object
stop_desc              4319 non-null object
stop_lat               21260 non-null float64
stop_lon               21260 non-null float64
zone_id                12267 non-null object
stop_url               4168 non-null object
location_type          5558 non-null float64
parent_station         620 non-null object
stop_timezone          137 non-null object
wheelchair_boarding    388 non-null float64
stop_code              15488 non-null object
platform_code          58 non-null object
position               0 non-null object
direction              0 non-null object
stop_id                21260 non-null int32
osm_node_id            21260 non-null object
shst_node_id           21260 non-null object
model_node_id          21260 non-null object
dtypes: float64(4), int32(1), object(13)
memory usage: 3.0+ MB


In [190]:
all_stop_times_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1140078 entries, 0 to 1140077
Data columns (total 21 columns):
arrival_time                    706584 non-null float64
departure_time                  706584 non-null float64
stop_sequence                   1140078 non-null int64
stop_headsign                   482983 non-null object
pickup_type                     168165 non-null float64
drop_off_type                   597183 non-null object
shape_dist_traveled             72888 non-null float64
timepoint                       36123 non-null float64
start_service_area_id           0 non-null object
end_service_area_id             0 non-null object
start_service_area_radius       0 non-null object
end_service_area_radius         0 non-null object
continuous_pickup               1092 non-null object
continuous_drop_off             1092 non-null object
pickup_area_id                  0 non-null object
drop_off_area_id                0 non-null object
pickup_service_area_radius      0 non-

In [191]:
stop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21260 entries, 0 to 21259
Data columns (total 18 columns):
stop_name              21260 non-null object
stop_desc              4319 non-null object
stop_lat               21260 non-null float64
stop_lon               21260 non-null float64
zone_id                12267 non-null object
stop_url               4168 non-null object
location_type          5558 non-null float64
parent_station         620 non-null object
stop_timezone          137 non-null object
wheelchair_boarding    388 non-null float64
stop_code              15488 non-null object
platform_code          58 non-null object
position               0 non-null object
direction              0 non-null object
stop_id                21260 non-null int32
osm_node_id            21260 non-null object
shst_node_id           21260 non-null object
model_node_id          21260 non-null object
dtypes: float64(4), int32(1), object(13)
memory usage: 3.0+ MB


In [192]:
unique_stop_id_df[unique_stop_id_df.stop_id_original == "1814"]

Unnamed: 0,agency_raw_name,stop_id,stop_id_original
5976,CCTA_2015_8_11,5977,1814
17164,VTA_2015_8_27,17165,1814


In [193]:
unique_trip_id_df[unique_trip_id_df.trip_id == 2040]

Unnamed: 0,agency_raw_name,trip_id,trip_id_original
2039,ACTransit_2015_8_14,2040,4174746-1508FA-D4-Weekday-04


In [194]:
np.isnan(all_stop_times_df[all_stop_times_df.arrival_time.isnull()].arrival_time.iloc[0])

True

In [195]:
write_out_transit_standard(trip_df, 
                           stop_df, 
                           shape_point_df, 
                           freq_df, 
                           all_stop_times_df,
                           all_routes_df,
                           all_trips_df,
                           unique_rail_node_gdf)

In [196]:
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3818 entries, 0 to 3817
Data columns (total 49 columns):
service_id                      3818 non-null object
trip_headsign                   3444 non-null object
direction_id                    3818 non-null float64
block_id                        2751 non-null object
wheelchair_accessible           125 non-null float64
bikes_allowed                   634 non-null float64
agency_raw_name                 3818 non-null object
trip_short_name                 363 non-null object
original_trip_id                147 non-null object
trip_bikes_allowed              8 non-null object
trip_type                       0 non-null object
drt_max_travel_time             18 non-null object
drt_avg_travel_time             18 non-null object
drt_advance_book_min            18 non-null object
drt_pickup_message              0 non-null object
drt_drop_off_message            0 non-null object
continuous_pickup_message       0 non-null object
continuous_dro

In [197]:
def create_transit_access_link(all_link, all_node, all_shape):
    
    """
    create rail walk access/egress links
    """
    
    tran_node_df = all_node[all_node.rail_only == 1].copy()
    walk_node_df = all_node[(all_node.walk_access == 1) & (all_node.rail_only == 0)].copy().reset_index(drop = True)
    
    walk_node_df = walk_node_df.to_crs({'init' : 'epsg:26915'})
    walk_node_df['X'] = walk_node_df.geometry.map(lambda g:g.x)
    walk_node_df['Y'] = walk_node_df.geometry.map(lambda g:g.y)
    inventory_node_ref = walk_node_df[['X', 'Y']].values
    tree = cKDTree(inventory_node_ref)
    
    tran_node_df = tran_node_df.to_crs({'init' : 'epsg:26915'})
    tran_node_df['X'] = tran_node_df.geometry.map(lambda g:g.x)
    tran_node_df['Y'] = tran_node_df.geometry.map(lambda g:g.y)
    
    for i in range(len(tran_node_df)):
        point = tran_node_df.iloc[i][['X', 'Y']].values
        dd, ii = tree.query(point, k = 1)
        add_node_gdf = gpd.GeoDataFrame(walk_node_df.iloc[ii]).transpose().reset_index(drop = True)
        add_node_gdf['tran_node'] = tran_node_df.iloc[i].model_node_id
        add_node_gdf['geometry_tran'] = tran_node_df.iloc[i].geometry
        
        if i == 0:
            rail_access_gdf = add_node_gdf.copy()
        else:
            rail_access_gdf = rail_access_gdf.append(add_node_gdf, ignore_index=True, sort=False)
    
    rail_access_gdf.rename(columns = {'geometry' : "geometry_walk"}, inplace = True)

    
    rail_access_gdf['geometry'] = [LineString(xy) for xy in zip(rail_access_gdf['geometry_walk'], 
                                                                rail_access_gdf['geometry_tran'])]
    
    # fake rail link shst geom id
    rail_access_gdf['shstGeometryId'] = range(1, 
                                     1 + len(rail_access_gdf))
    
    rail_access_gdf['shstGeometryId'] = rail_access_gdf.shstGeometryId.apply(lambda x:'walktorail'+str(x))
    rail_access_gdf['id'] = rail_access_gdf['shstGeometryId']
    
    rail_access_gdf["fromIntersectionId"] = rail_access_gdf.shst_node_id

    rail_access_gdf_copy = rail_access_gdf.copy()
    rail_access_gdf.rename(columns = {'model_node_id' : 'A', 'tran_node' : 'B'}, inplace = True)
    
    rail_access_gdf_copy.rename(columns = {'tran_node' : 'A', 'model_node_id' : 'B'}, inplace = True)
    
    rail_access_gdf = pd.concat(
                            [rail_access_gdf[['A', 'B', 'geometry', 'shstGeometryId', "id", "fromIntersectionId"]],
                            rail_access_gdf_copy[['A', 'B', 'geometry','shstGeometryId', "id", "fromIntersectionId"]]],
                               ignore_index = True,
                               sort = False)
    
    rail_access_gdf = gpd.GeoDataFrame(rail_access_gdf)
    rail_access_gdf.crs = {'init' : 'epsg:26915'}
    rail_access_gdf = rail_access_gdf.to_crs(all_node.crs)
    
    rail_access_gdf['walk_access'] = 1
    
    rail_access_link_columns = ["A", "B", "shstGeometryId", "walk_access", "id"]
    rail_access_shape_columns = ["id", "fromIntersectionId", "geometry"]
    
    all_link_df = all_link.copy()
    all_shape_gdf = all_shape.copy()
    
        
    all_shape_gdf = pd.concat([
                                all_shape_gdf,
                                rail_access_gdf[rail_access_shape_columns].drop_duplicates(
                                                                        subset = ["id"])
                              ],
                             sort = False,
                             ignore_index= True)

    
    all_link_df = pd.concat([all_link_df, 
                             rail_access_gdf[rail_access_link_columns]], 
                            ignore_index = True, 
                            sort = False)
    
    all_link_gdf = pd.merge(all_link_df,
                           all_shape_gdf,
                           how = "left",
                           left_on = "shstGeometryId",
                           right_on = "id")
    
    geom_length = gpd.GeoDataFrame(all_link_gdf[['geometry']])
    geom_length.crs = all_node.crs
    geom_length = geom_length.to_crs(epsg = 26915)
    geom_length["length"] = geom_length.length

    all_link_df["length"] = geom_length["length"]

    return all_link_df, all_shape_gdf

all_link_df, all_shape_gdf = create_transit_access_link(roadway_and_rail_link_df, 
                                                        roadway_and_rail_node_gdf,
                                                        roadway_and_rail_shape_gdf)

In [198]:
# use nearest for links that did not get county match
node_county_matched_gdf = node_gdf.copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

link_county_unmatched_gdf = all_link_df[all_link_df.model_link_id.isnull()].copy()
link_county_unmatched_gdf = pd.merge(link_county_unmatched_gdf, all_shape_gdf[["id", "geometry"]], how = "left", on = "id")
link_county_unmatched_gdf = gpd.GeoDataFrame(link_county_unmatched_gdf, 
                                             geometry = link_county_unmatched_gdf.geometry, 
                                             crs = all_shape_gdf.crs)
#link_county_unmatched_gdf["geometry"] = link_county_unmatched_gdf.centroid

link_county_unmatched_gdf = link_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
#link_county_unmatched_gdf["geometry"] = link_county_unmatched_gdf["geometry"].centroid
link_county_unmatched_gdf['X'] = link_county_unmatched_gdf['geometry'].apply(lambda p: p.centroid.x)
link_county_unmatched_gdf['Y'] = link_county_unmatched_gdf['geometry'].apply(lambda p: p.centroid.y)

link_county_rematch_gdf = pd.DataFrame()

for i in range(len(link_county_unmatched_gdf)):
    point = link_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["county"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf = pd.concat([add_snap_gdf,
                              link_county_unmatched_gdf.drop("county", axis = 1).iloc[[i]].reset_index(drop = True)], 
                               axis = 1) 
    
    if i == 0:
        link_county_rematch_gdf = add_snap_gdf.copy()
    else:
        link_county_rematch_gdf = link_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

link_county_rematch_gdf.crs = link_county_unmatched_gdf.crs
link_county_rematch_gdf = link_county_rematch_gdf.to_crs(node_gdf.crs)

In [199]:
link_county_rematch_gdf

Unnamed: 0,county,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,...,length,model_link_id,county_numbering_start,A,B,rail_traveltime,rail_only,geometry,X,Y
0,Contra Costa,,,,,,,,,walktorail1,...,77.785409,,,3047769,3097273,,,"LINESTRING (-121.9453851 38.0182924, -121.9451...",-2.063638e+06,4.626937e+06
1,Contra Costa,,,,,,,,,walktorail2,...,64.081815,,,3079878,3097274,,,"LINESTRING (-122.0242967 38.0027973, -122.0245...",-2.071361e+06,4.627598e+06
2,Contra Costa,,,,,,,,,walktorail3,...,33.210322,,,3052965,3097275,,,"LINESTRING (-122.0294236 37.9738307, -122.0290...",-2.072930e+06,4.624421e+06
3,Contra Costa,,,,,,,,,walktorail4,...,9.486918,,,3060241,3097276,,,"LINESTRING (-122.0560862 37.92834939999999, -1...",-2.077106e+06,4.620083e+06
4,Contra Costa,,,,,,,,,walktorail5,...,1.423180,,,3021886,3097277,,,"LINESTRING (-122.0674372 37.90563169999999, -1...",-2.079015e+06,4.617847e+06
5,Contra Costa,,,,,,,,,walktorail6,...,68.856506,,,3070076,3097278,,,"LINESTRING (-122.1236759 37.8939582, -122.1238...",-2.084553e+06,4.618224e+06
6,Contra Costa,,,,,,,,,walktorail7,...,22.270810,,,3060432,3097279,,,"LINESTRING (-122.1840235 37.87834160000003, -1...",-2.090574e+06,4.618340e+06
7,Alameda,,,,,,,,,walktorail8,...,60.299615,,,2588445,2625942,,,"LINESTRING (-122.2518515 37.84510009999999, -1...",-2.098008e+06,4.616633e+06
8,Alameda,,,,,,,,,walktorail9,...,40.383864,,,2620923,2625943,,,"LINESTRING (-122.267178 37.8287484, -122.26722...",-2.100032e+06,4.615260e+06
9,Alameda,,,,,,,,,walktorail10,...,11.035156,,,2504171,2625944,,,"LINESTRING (-122.2690232 37.80777839999999, -1...",-2.101009e+06,4.612955e+06


In [200]:
link_county_rematch_gdf.info()
link_county_rematch_gdf.county.value_counts()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1328 entries, 0 to 1327
Data columns (total 41 columns):
county                    1328 non-null object
access                    0 non-null object
area                      0 non-null object
bike_access               0 non-null float64
bridge                    0 non-null object
drive_access              0 non-null float64
est_width                 0 non-null object
fromIntersectionId        0 non-null object
highway                   0 non-null object
id                        1328 non-null object
junction                  0 non-null object
key                       0 non-null object
landuse                   0 non-null object
lanes                     0 non-null object
link                      0 non-null object
maxspeed                  0 non-null object
name                      0 non-null object
oneWay                    0 non-null object
ref                       0 non-null object
roadway                   0 non-null obje

San Francisco    866
Santa Clara      294
Alameda           66
San Mateo         58
Contra Costa      28
Marin             10
Solano             6
Name: county, dtype: int64

In [201]:
# get the last node and link number of counties

county_last_link_id_df = roadway_and_rail_link_df.groupby("county")["model_link_id"].max().reset_index().rename(
    columns = {"model_link_id" : "county_last_id"})

link_county_rematch_gdf = pd.merge(
    link_county_rematch_gdf,
    county_last_link_id_df,
    how = "left",
    on = "county"
)

link_county_rematch_gdf["model_link_id"] = link_county_rematch_gdf.groupby(["county"]).cumcount() + 1

link_county_rematch_gdf["model_link_id"] = link_county_rematch_gdf["model_link_id"] + link_county_rematch_gdf["county_last_id"]

In [202]:
link_county_rematch_gdf[link_county_rematch_gdf.id == "walktorail1"]

Unnamed: 0,county,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,...,model_link_id,county_numbering_start,A,B,rail_traveltime,rail_only,geometry,X,Y,county_last_id
0,Contra Costa,,,,,,,,,walktorail1,...,4237470,,3047769,3097273,,,"LINESTRING (-121.9453851 38.0182924, -121.9451...",-2063638.0,4626937.0,4237469
664,Contra Costa,,,,,,,,,walktorail1,...,4237484,,3097273,3047769,,,"LINESTRING (-121.9453851 38.0182924, -121.9451...",-2063638.0,4626937.0,4237469


In [203]:
# combine rail and roadway links

all_link_df = roadway_and_rail_link_df.append(link_county_rematch_gdf, 
                                              ignore_index = True, 
                                              sort = False)

In [204]:
print(roadway_and_rail_link_df.model_link_id.nunique())
print(roadway_and_rail_link_df.shape)
print(roadway_and_rail_link_df.county.value_counts())

1633441
(1633441, 38)
Santa Clara      516710
Alameda          315385
Contra Costa     237470
San Mateo        140966
Sonoma           135042
Solano           117108
San Francisco     73892
Marin             63188
Napa              33680
Name: county, dtype: int64


In [205]:
all_link_df.model_link_id.nunique()

1634769

In [206]:
all_shape_gdf.id.nunique()

869970

In [207]:
all_link_df.id.nunique()

869970

In [208]:
# number geometry increse should be the number of transit nodes: 664
print(roadway_and_rail_link_df.shstGeometryId.nunique())
print(roadway_and_rail_shape_gdf.id.nunique())
print(roadway_and_rail_shape_gdf.shape)
print(all_shape_gdf.id.nunique())
print(all_shape_gdf.shape)
print(all_link_df.shstGeometryId.nunique())

869306
869306
(869306, 6)
869970
(869970, 6)
869970


In [209]:
# number of link increase should be 2 times of transit nodes : 1328

print(roadway_and_rail_link_df.shape)
print(all_link_df.shape)

(1633441, 38)
(1634769, 42)


In [210]:
print(all_link_df.county.value_counts(dropna=False))
print(all_link_df.groupby("county")["model_link_id"].max())

Santa Clara      517004
Alameda          315451
Contra Costa     237498
San Mateo        141024
Sonoma           135042
Solano           117114
San Francisco     74758
Marin             63198
Napa              33680
Name: county, dtype: int64
county
Alameda          3315450
Contra Costa     4237497
Marin            8063197
Napa             6033679
San Francisco      74758
San Mateo        1141023
Santa Clara      2517003
Solano           5117113
Sonoma           7135041
Name: model_link_id, dtype: int64


# true shapes for line record

from shapely import ops, geometry

def get_true_line_shape(trip_df, bus_link, roadway_and_rail_shape,
                        rail_link = pd.DataFrame(columns = ['LINK_ID','shape_id', 'u', 'v'])):
    
    """
    write out true shape for each trip
    """
    
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['shape_id', 'trip_id', "shstGeometryId"]], 
                                  rail_link_df[['shape_id', 'trip_id', "shstGeometryId"]]], 
                                 sort = False, ignore_index = True)
    
    transit_link_gdf = pd.merge(transit_link_gdf,
                                roadway_and_rail_shape[['id', 'geometry']],
                                how = 'left',
                                left_on = 'shstGeometryId',
                               right_on = "id")
    
    true_line_shape_df = transit_link_gdf.groupby(['trip_id', 'shape_id'])['geometry'].agg(
                                                                lambda x: 
                                                                ops.linemerge(geometry.MultiLineString(x.tolist())))\
                                        .reset_index()
    
    """true_line_shape_df = pd.merge(true_line_shape_df, 
                                  cube,
                                 how = 'left',
                                 on = ['shape_id', 'trip_id'])"""
    
    true_line_shape_gdf = gpd.GeoDataFrame(true_line_shape_df, 
                                           crs = roadway_and_rail_shape.crs, 
                                           geometry = 'geometry')
    
    return true_line_shape_gdf

true_line_shape_gdf = get_true_line_shape(trip_df, 
                                                bus_link_df, 
                                                roadway_and_rail_shape_gdf,
                                                rail_link_gdf)

true_line_shape_gdf.to_file(data_interim_folder + "transit_route.geojson",
                           driver = "GeoJSON")

true_line_shape_gdf.columns

def link_df_to_geojson(df, properties):
    """
    Author: Geoff Boeing:
    https://geoffboeing.com/2015/10/exporting-python-data-geojson/
    """
    geojson = {"type":"FeatureCollection", "features":[]}
    for _, row in df.iterrows():
        feature = {"type":"Feature",
                   "properties":{},
                   "geometry":{"type":"LineString",
                               "coordinates":[]}}
        feature["geometry"]["coordinates"] = [[x, y] for (x,y) in list(row["geometry"].coords)]
        for prop in properties:
            feature["properties"][prop] = row[prop]
        geojson["features"].append(feature)
    return geojson


def point_df_to_geojson(df: pd.DataFrame, properties: list):
    """
    Author: Geoff Boeing:
    https://geoffboeing.com/2015/10/exporting-python-data-geojson/
    """
    
    geojson = {"type": "FeatureCollection", "features": []}
    for _, row in df.iterrows():
        feature = {
            "type": "Feature",
            "properties": {},
            "geometry": {"type": "Point", "coordinates": []},
        }
        feature["geometry"]["coordinates"] = [row["geometry"].x, row["geometry"].y]
        for prop in properties:
            feature["properties"][prop] = row[prop]
        geojson["features"].append(feature)
    return geojson

def fill_na(df_na):
    """
    fill str NaN with ""
    fill numeric NaN with 0
    """
    df = df_na.copy()
    num_col = list(df.select_dtypes([np.number]).columns)
    print("numeric columns: ", num_col)
    object_col = list(df.select_dtypes(['object']).columns)
    print("str columns: ", object_col)
    
    for x in list(df.columns):
        if x in num_col:
            df[x].fillna(0, inplace = True)
        elif x in object_col:
            df[x].fillna("", inplace = True)
    
    return df

all_shape_gdf = fill_na(all_shape_gdf)

In [211]:
int_col = ["bike_access", "walk_access", "drive_access", "rail_only"]
for c in int_col:
    all_link_df[c] = all_link_df[c].fillna(0).astype(np.int64)
    
#all_link_df = fill_na(all_link_df)

In [212]:
int_col = ["bike_access", "walk_access", "drive_access", "rail_only"]
for c in int_col:
    roadway_and_rail_node_gdf[c] = roadway_and_rail_node_gdf[c].fillna(0).astype(np.int64)
    
#roadway_and_rail_node_gdf = fill_na(roadway_and_rail_node_gdf)

In [213]:
pd.crosstab(all_link_df.rail_only, all_link_df.walk_access, dropna = False)
pd.crosstab(roadway_and_rail_node_gdf.rail_only, roadway_and_rail_node_gdf.walk_access, dropna = False)

walk_access,0,1
rail_only,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4819,638992
1,0,664


In [214]:
print(link_df.drive_access.value_counts())
print(all_link_df.drive_access.value_counts())

print(link_df.bike_access.value_counts())
print(all_link_df.bike_access.value_counts())

print(link_df.walk_access.value_counts())
print(all_link_df.walk_access.value_counts())

1    1001893
0     630809
Name: drive_access, dtype: int64
1    1001893
0     632876
Name: drive_access, dtype: int64
1    1371914
0     260788
Name: bike_access, dtype: int64
1    1371914
0     262855
Name: bike_access, dtype: int64
1    1619403
0      13299
Name: walk_access, dtype: int64
1    1620731
0      14038
Name: walk_access, dtype: int64


In [215]:
all_link_df.info()
all_shape_gdf.info()
roadway_and_rail_node_gdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634769 entries, 0 to 1634768
Data columns (total 42 columns):
access                    1632702 non-null object
area                      1632702 non-null object
bike_access               1634769 non-null int64
bridge                    1632702 non-null object
drive_access              1634769 non-null int64
est_width                 1632702 non-null object
fromIntersectionId        1632702 non-null object
highway                   1632702 non-null object
id                        1634769 non-null object
junction                  1632702 non-null object
key                       1632702 non-null object
landuse                   1632702 non-null object
lanes                     1632702 non-null object
link                      1632702 non-null object
maxspeed                  1632702 non-null object
name                      1632702 non-null object
oneWay                    1632702 non-null object
ref                       1632702 non-n

In [217]:
all_link_df[all_link_df.shstReferenceId.isin(["feab62cc90650bfc45dc453816782f9c", "9ab364b22d6b33ec158d8bc4008c1be7"])][
    ["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
524671,service,1,1,1
863713,service,1,1,1


In [218]:
roadway_and_rail_node_gdf[roadway_and_rail_node_gdf.osm_node_id.isin([890045140, 5372055804, 890045129])]

Unnamed: 0,osm_node_id,shst_node_id,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start,geometry,rail_only
130712,5372056000.0,3291f7c2f15101c22abf554ce230343e,San Francisco,1,1,1,1006138,1000000.0,POINT (-122.3920956 37.7893448),0
215684,890045100.0,4c0619714744bed10b7de965adc7048d,San Francisco,1,1,1,1010031,1000000.0,POINT (-122.3926305 37.7896628),0
244341,890045100.0,490be8656a6428c6fc871a1f0e6432eb,San Francisco,1,1,1,1011380,1000000.0,POINT (-122.3920287 37.7892519),0


In [219]:
%%time

print("-------write out link shape geojson---------")

shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(all_shape_gdf, shape_prop)

with open(data_interim_dir + "step6_gtfs/shape.geojson", "w") as f:
    json.dump(shape_geojson, f)

-------write out link shape geojson---------
Wall time: 3min 51s


In [220]:
%%time

# write out link variable json
# link unique handle "shstReferenceId" + "shstGeometryId"

print("-------write out link json---------")

link_prop = all_link_df.drop(["county_numbering_start", "X", "Y", "county_last_id", "geometry"], axis = 1).columns.tolist()

out = all_link_df[link_prop].to_json(orient = "records")

with open(data_interim_dir + "step6_gtfs/link.json", 'w') as f:
    f.write(out)

-------write out link json---------
Wall time: 40.1 s


In [221]:
%%time

print("-------write out node geojson---------")

node_prop = roadway_and_rail_node_gdf.drop(["geometry", "county_numbering_start"], axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(roadway_and_rail_node_gdf, node_prop)

with open(data_interim_dir + "step6_gtfs/node.geojson", "w") as f:
    json.dump(node_geojson, f)

-------write out node geojson---------
Wall time: 2min 34s


In [222]:
print("-------write out link feather---------")

link_feather = all_link_df.drop(["county_numbering_start", "X", "Y", "county_last_id", "geometry"], axis = 1).copy()

link_feather.to_feather(data_interim_dir + 'step6_gtfs/link.feather')

-------write out link feather---------


In [223]:
all_link_df.model_link_id

0          4000000
1          4000001
2          8000000
3          2000000
4          2000001
5          2000002
6          2000003
7          4000002
8                1
9          4000003
10         4000004
11         2000004
12               2
13         7000000
14         4000005
15         7000001
16         3000000
17         3000001
18         2000005
19         5000000
20         1000000
21         1000001
22         4000006
23         3000002
24         3000003
25         4000007
26         7000002
27         7000003
28         2000006
29         4000008
            ...   
1634739    2517000
1634740      74756
1634741    3315439
1634742    1141023
1634743    3315440
1634744    3315441
1634745      74757
1634746    8063195
1634747      74758
1634748    8063196
1634749    8063197
1634750    5117111
1634751    5117112
1634752    5117113
1634753    4237494
1634754    4237495
1634755    3315442
1634756    3315443
1634757    3315444
1634758    3315445
1634759    4237496
1634760    4

In [224]:
roadway_and_rail_node_gdf.model_node_id

0         3000000
1         3000001
2         5000000
3         2000000
4         2000001
5         2000002
6         2500000
7         2000003
8         3000002
9         1000000
10        3000003
11        3000004
12        2000004
13        1000001
14        4500000
15        3000005
16        4500001
17        2500001
18        2500002
19        2000005
20        3500000
21        1500000
22        1500001
23        3000006
24        2500003
25        2500004
26        3000007
27        4500002
28        4500003
29        2000006
           ...   
644445    2192942
644446    1028038
644447    2625963
644448    1556391
644449    2625964
644450    2625965
644451    1028039
644452    5026531
644453    1028040
644454    5026532
644455    5026533
644456    3547318
644457    3547319
644458    3547320
644459    3097283
644460    3097284
644461    2625966
644462    2625967
644463    2625968
644464    2625969
644465    3097285
644466    3097286
644467    2625970
644468    2625971
644469    

In [225]:
link_df.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'length', 'model_link_id', 'county_numbering_start', 'A',
       'B'],
      dtype='object')

# write out to CUBE .lin

In [226]:
all_routes_df.route_type.value_counts()

3    632
0     10
4      9
2      8
1      6
5      3
Name: route_type, dtype: int64

In [227]:
trip_df.columns

Index(['service_id', 'trip_headsign', 'direction_id', 'block_id',
       'wheelchair_accessible', 'bikes_allowed', 'agency_raw_name',
       'trip_short_name', 'original_trip_id', 'trip_bikes_allowed',
       'trip_type', 'drt_max_travel_time', 'drt_avg_travel_time',
       'drt_advance_book_min', 'drt_pickup_message', 'drt_drop_off_message',
       'continuous_pickup_message', 'continuous_drop_off_message', 'route_id',
       'trip_id', 'shape_id', 'arrival_time', 'departure_time',
       'stop_sequence', 'stop_headsign', 'pickup_type', 'drop_off_type',
       'shape_dist_traveled', 'timepoint', 'start_service_area_id',
       'end_service_area_id', 'start_service_area_radius',
       'end_service_area_radius', 'continuous_pickup', 'continuous_drop_off',
       'pickup_area_id', 'drop_off_area_id', 'pickup_service_area_radius',
       'drop_off_service_area_radius', 'last_stop_on_trip', 'stop_id',
       'arrival_h', 'arrival_m', 'departure_h', 'departure_m', 'tod',
       'trip_num_x

In [228]:
all_routes_df

Unnamed: 0,route_id_original,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,agency_raw_name,route_sort_order,min_headway_minutes,eligibility_restricted,continuous_pickup,continuous_drop_off,route_id
0,01,BART,,Pittsburg/Bay Point - SFIA/Millbrae,,1,http://www.bart.gov/schedules/bylineresults?ro...,ffff33,,BART_2015_8_3,,,,,,154
1,03,BART,,Fremont - Richmond,,1,http://www.bart.gov/schedules/bylineresults?ro...,ff9933,,BART_2015_8_3,,,,,,155
2,05,BART,,Fremont - Daly City,,1,http://www.bart.gov/schedules/bylineresults?ro...,339933,,BART_2015_8_3,,,,,,156
3,07,BART,,Richmond - Daly City/Millbrae,,1,http://www.bart.gov/schedules/bylineresults?ro...,ff0000,,BART_2015_8_3,,,,,,157
4,11,BART,,Dublin/Pleasanton - Daly City,,1,http://www.bart.gov/schedules/bylineresults?ro...,0099cc,,BART_2015_8_3,,,,,,158
5,19,BART,,Coliseum - Oakland Int'l Airport,,1,http://www.bart.gov/schedules/bylineresults?ro...,d5cfa3,,BART_2015_8_3,,,,,,159
6,ACE,CE,ACE,,,2,,,,ACE_2017_3_20,,,,,,1
7,34_merged_381003112,,93X,Kirker Pass Express,,3,,,,CCTA_2015_8_11,,,,,,182
8,28_merged_381003096,,5,Creekside/BART Walnut Creek,,3,,,,CCTA_2015_8_11,,,,,,175
9,26_merged_381003090,,36,San Ramon/BART Dublin,,3,,,,CCTA_2015_8_11,,,,,,173


In [229]:
def prepare_df_for_cube(routes, mode_crosswalk, trip, bus_link, freq,
                        rail_link = pd.DataFrame(columns = ['shape_id'])):
    
    
    transit_link_gdf = pd.concat([bus_link[['shape_id']], rail_link[['shape_id']]], 
                                 sort = False, ignore_index = True)
    trip_df = trip.copy()
    
    trip_df = trip_df[trip_df.shape_id.isin(transit_link_gdf.shape_id.unique().tolist())]
    
    trip_df = pd.merge(trip_df.drop("agency_raw_name", axis = 1), routes, how = 'left', on = 'route_id')
    
    trip_df = trip_df[~ trip_df.agency_raw_name.isin(["Petaluma_2016_5_22", "WestCAT_2016_5_26", "GGFerries_2017_3_18"])].copy()
    
    trip_df = pd.merge(trip_df, freq[['trip_id','headway_secs']], how = 'left', on = 'trip_id')
    
    #trip_df['tod'] = np.where(trip_df.tod == 'peak', 'pk', 'op')
    
    trip_df['NAME'] = trip_df.apply(lambda x: str(x.agency_id) + '_' + str(x.route_id) + '_' 
                                    + str(x.route_short_name) 
                                    #+ '_'
                                    #+ x.tod 
                                    #+ str(x.direction_id)
                                    , 
                                    axis = 1)
    trip_df['LONGNAME'] = trip_df['route_long_name']
    trip_df['HEADWAY'] = (trip_df['headway_secs']/60).astype(int)
    
    """
    def mode_gtfs_to_muni(x):
        if x.route_type == 0:
            return 15
        elif x.route_type == 5:
            return 14
        elif x.route_short_name[-1] in ["X", "R"]:
            return 12
        else:
            return 11
    
    trip_df['MODE'] = trip_df.apply(lambda x : mode_gtfs_to_muni(x),
                                   axis = 1)
    """
    
    trip_df = pd.merge(
        trip_df,
        mode_crosswalk.drop("agency_id", axis = 1),
        how = "left",
        on = ["agency_raw_name", "route_type"]
    )
    
    trip_df['TM2_mode'].fillna(11, inplace = True)
    trip_df['TM2_mode'] = trip_df['TM2_mode'].astype(int)
    
    trip_df['ONEWAY'] = 'T'
    
    return trip_df

In [230]:
gtfs_to_tm2_mode_crosswalk_df = pd.read_csv(data_interim_dir + "gtfs_to_tm2_mode_crosswalk.csv")

gtfs_to_tm2_mode_crosswalk_df.drop_duplicates(subset = ["agency_raw_name", "route_type"], inplace = True)

In [231]:
gtfs_to_tm2_mode_crosswalk_df

Unnamed: 0,agency_raw_name,agency_name,agency_id,route_type,TM2_mode,TM2_line_haul_name
0,ACE_2017_3_20,ACE Altamont Corridor Express,CE,2,133,Commuter rail
1,ACTransit_2015_8_14,AC Transit,AC Transit,3,30,Local bus
2,BART_2015_8_3,Bay Area Rapid Transit,BART,1,120,Heavy rail
3,Blue&Gold_gtfs_10_4_2017,Blue & Gold Fleet,BG,4,103,Ferry service
4,CCTA_2015_8_11,County Connection,,3,42,Local bus
5,Caltrain_2015_5_13,Caltrain,,2,130,Commuter rail
6,Caltrain_2015_5_13,Caltrain,,3,14,Local bus
7,Capitol_2017_3_20,Capitol Corridor,AM,2,131,Commuter rail
8,Emeryville_2016_10_26,Emery Go-Round,573,3,12,Local bus
9,Fairfield_2015_10_14,Fairfield and Suisun Transit,36,3,52,Local bus


In [232]:
cube_trip_df = prepare_df_for_cube(all_routes_df, 
                                   gtfs_to_tm2_mode_crosswalk_df,
                                trip_df, 
                                bus_link_df, 
                                freq_df, 
                                rail_link_gdf)

In [233]:
trip_df.shape

(3818, 49)

In [234]:
cube_trip_df.shape

(3785, 71)

In [235]:
cube_trip_df[cube_trip_df.agency_raw_name == "GGFerries_2017_3_18"]

Unnamed: 0,service_id,trip_headsign,direction_id,block_id,wheelchair_accessible,bikes_allowed,trip_short_name,original_trip_id,trip_bikes_allowed,trip_type,...,continuous_pickup_y,continuous_drop_off_y,headway_secs,NAME,LONGNAME,HEADWAY,agency_name,TM2_mode,TM2_line_haul_name,ONEWAY


In [236]:
cube_trip_df.TM2_mode.value_counts().sum()

3785

In [237]:
cube_trip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3785 entries, 0 to 3784
Data columns (total 71 columns):
service_id                      3785 non-null object
trip_headsign                   3411 non-null object
direction_id                    3785 non-null float64
block_id                        2751 non-null object
wheelchair_accessible           105 non-null float64
bikes_allowed                   610 non-null float64
trip_short_name                 363 non-null object
original_trip_id                147 non-null object
trip_bikes_allowed              8 non-null object
trip_type                       0 non-null object
drt_max_travel_time             18 non-null object
drt_avg_travel_time             18 non-null object
drt_advance_book_min            18 non-null object
drt_pickup_message              0 non-null object
drt_drop_off_message            0 non-null object
continuous_pickup_message       0 non-null object
continuous_drop_off_message     0 non-null object
route_id         

In [239]:
cube_trip_df.iloc[1]

service_id                                                                  WKDY
trip_headsign                                                     Montgomery St.
direction_id                                                                   0
block_id                                                                     NaN
wheelchair_accessible                                                          1
bikes_allowed                                                                  1
trip_short_name                                                              NaN
original_trip_id                                                             NaN
trip_bikes_allowed                                                           NaN
trip_type                                                                    NaN
drt_max_travel_time                                                          NaN
drt_avg_travel_time                                                          NaN
drt_advance_book_min        

In [240]:
def node_list(x, trip_df, stop_df, bus_link, stop_times, node_gdf, link_gdf, 
              rail_link = pd.DataFrame(columns = ['shape_id', 'u', 'v']), rail_node_df = None):
    print(x.trip_id)
    
    if len(rail_node_df) > 0:
        rail_node_dict = dict(zip(rail_node_df.stop_id, rail_node_df.model_node_id))
        stop_df['model_node_id'] = stop_df.apply(lambda x: rail_node_dict[x.stop_id] 
                                               if x.stop_id in rail_node_df.stop_id.tolist()
                                               else x.model_node_id,
                                    axis = 1)
        
    rail_link_df = rail_link.copy()
    rail_link_df = pd.merge(trip_df[['trip_id', 'shape_id']],
                            rail_link_df,
                           how = 'right',
                           on = 'shape_id')
    
    transit_link_gdf = pd.concat([bus_link[['u', 'v', 'shape_id', 'trip_id' , "A", "B"]], 
                                  rail_link_df[['shape_id', 'trip_id', "A", "B"]]], 
                                 sort = False, 
                                 ignore_index = True)
    
    """transit_link_gdf = pd.merge(transit_link_gdf,
                               link_gdf[['LINK_ID',"u","v"]],
                               how = "left",
                               on = ["u","v"])"""
   
    stop_times_df = stop_times.copy()
    stop_id_list = stop_times_df[stop_times_df.trip_id == x.trip_id]['stop_id'].tolist()
    stop_node_list = stop_df[stop_df['stop_id'].isin(stop_id_list)]['model_node_id'].tolist()
    
    #print(stop_node_list)
    
    node_list = transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['A'].tolist() + \
                    [transit_link_gdf[transit_link_gdf['trip_id'] == x.trip_id]['B'].iloc[-1]]
    
    #osmid_N_dict = dict(zip(node_gdf.OSMID, node_gdf.N))
    
    #node_list = list(map(osmid_N_dict.get, node_list))
    
    s = '\nLINE NAME=\"%s\",' % (x.NAME,)
    
    #line attribtes
    s += '\n LONGNAME=\"%s",' % (x.LONGNAME,)
    s += '\n USERA1=\"%s",' % (x.agency_id,)
    s += '\n USERA1=\"%s",' % (x.TM2_line_haul_name,)
    if x.tod == 'AM':
        s += '\n FREQ[2]=%s,' % (x.HEADWAY,)
    elif x.tod == "MD":
        s += '\n FREQ[3]=%s,' % (x.HEADWAY,)
    elif x.tod == "PM":
        s += '\n FREQ[4]=%s,' % (x.HEADWAY,)
    elif x.tod == "NT":
        s += '\n FREQ[5]=%s,' % (x.HEADWAY,)
    elif x.tod == "EA":
        s += '\n FREQ[1]=%s,' % (x.HEADWAY,)
    s += '\n MODE=%s,' % (x.TM2_mode,)
    s += '\n ONEWAY=%s,' % (x.ONEWAY,)
    s += '\n OPERATOR=%s,' % (x.agency_id,)
    s += '\n SHORTNAME=%s,' % (x.route_short_name,)
    s += '\nNODES='
    
    circular = 0
    #node list
    trip_stop_list = []
    for nodeIdx in range(len(node_list)):
        # added condition to make sure stops only get stopped once
        if (node_list[nodeIdx] in stop_node_list) & (node_list[nodeIdx] not in trip_stop_list):
            s += '\n %s' % (node_list[nodeIdx])
            trip_stop_list += [node_list[nodeIdx]]
            if nodeIdx < (len(node_list)-1):
                s += ','
        else:
            s += '\n -%s' % (node_list[nodeIdx])
            if nodeIdx < (len(node_list)-1):
                s += ','
            if (node_list[nodeIdx] in trip_stop_list):
                circular = 1
    
    #if circular == 1:
    #    s += ','
    #    s += '\n CIRCULAR=T'                
            
    lines.append(s)

In [241]:
%%time

lines = [';;<<Trnbuild>>;;']

cube_trip_df.apply(lambda x: node_list(x, 
                                    trip_df,
                                    stop_df, 
                                    bus_link_df,
                                    all_stop_times_df,
                                    roadway_and_rail_node_gdf,
                                    roadway_and_rail_link_df,
                                    rail_link_gdf,
                                    unique_rail_node_gdf), 
                axis=1)

with open(output_folder + "transit.LIN", 'w') as f:
    f.write("\n".join(map(str, lines)))

5781
5877
6391
6619
6765
5775
5871
6229
6469
6639
5776
5872
6230
6470
6640
5777
5825
6423
5881
6025
5782
5910
6296
6536
6770
5778
5826
6424
5882
6026
5780
5876
6234
6474
6644
5779
5863
6221
6461
6635
5772
5820
6178
6418
5771
6123
6369
5773
5869
6227
6467
5774
5810
6168
6408
6592
1
5
4
7809
7772
7682
7481
7648
7533
7618
7212
7607
7570
7861
6894
7900
7223
7087
7437
7412
7854
6887
7798
7180
7179
7465
7499
7494
7323
7622
7327
7276
7667
7121
7563
7137
7403
6917
7211
7228
7753
7205
7194
7868
7915
7732
7581
6932
7921
6940
7794
7420
7545
7167
7290
7777
7497
7156
6908
7832
7597
7364
6930
7428
6882
7909
7145
7784
7800
7837
7602
7604
7906
7074
7264
7926
7016
7914
6961
7118
6979
7357
7022
7091
7375
7553
6897
7082
7081
7392
7630
7890
7932
7114
7112
6904
7705
7160
7452
7889
7574
7446
7702
7681
6935
7898
7843
7700
7005
7313
7243
7105
7917
6970
7054
7802
7818
7876
6958
7652
7380
7369
6992
7888
7768
7069
6931
7050
7588
7882
7759
7536
7258
6918
7940
7250
7591
7120
7010
7824
7067
7453
7253
7650
7551
7517

23419
23420
23421
23422
23432
23437
23453
23454
23458
23460
23461
23464
23465
23466
23467
23468
23469
23477
23479
23480
23481
23482
23485
23486
23487
23491
23492
23495
23496
23497
23498
23499
23501
23502
23504
23505
23507
23509
23510
23511
23512
23513
23514
23519
23520
23526
23527
23528
23530
23531
23533
23534
23535
23536
23537
23538
23539
23541
23542
23546
23547
23556
23557
23568
23576
23604
23605
23613
23616
23624
23625
23631
23638
23640
23644
23645
23652
23655
23656
23662
23663
23667
23668
23680
23697
23740
23756
23757
23759
23760
23763
23779
23780
23784
23797
23798
23803
23804
23818
23823
23910
23934
23988
23989
24086
24162
24163
24190
24270
24271
24273
24283
24286
24297
24298
24313
24314
24318
24326
24327
24342
24343
24354
24355
24366
24382
24403
24419
24420
24430
24440
24448
24454
24457
24473
24493
24508
24517
24521
24522
24526
24527
24530
24531
24535
24536
24539
24540
24544
24545
24549
24565
24586
24599
24602
24617
24637
24669
24676
24681
24688
24694
24695
24700
24706
24713
2471

52
53
54
55
57
58
59
60
61
63
64
65
66
67
69
74
75
78
80
81
83
91
92
94
95
96
97
104
106
108
114
116
120
126
127
128
139
144
149
153
183
184
188
193
197
199
207
217
225
228
235
245
253
257
277
302
322
324
344
369
389
392
393
395
396
399
400
408
409
443
444
447
450
459
460
462
480
481
496
507
516
522
529
533
544
547
560
569
581
582
590
600
608
614
615
623
633
641
648
652
659
661
664
671
677
722
723
731
736
743
745
754
756
759
770
807
811
812
821
822
827
828
866
867
875
885
893
894
896
904
914
922
928
929
937
947
955
963
973
981
982
983
987
989
1011
1012
1013
1016
1019
1021
1022
1023
1036
1047
1071
1077
1087
1095
1104
1106
1115
1120
1121
1150
1151
1158
1159
1160
1161
1163
1164
1167
1168
1218
1220
1222
1230
1240
1250
1256
1264
1274
1283
1284
1286
1287
1297
1298
1299
1300
1302
1303
1306
1308
1309
1310
1433
1434
1435
1436
1437
1438
1439
1440
1443
1445
1446
1447
1448
1449
1450
1453
1461
1472
1475
1494
1579
1580
1581
1582
1590
1591
1600
1601
1620
1621
1780
1782
1786
1787
1798
1799
1816
1817
1

In [860]:
stop_df[stop_df.stop_id == "5184"]

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,osm_node_id,shst_node_id,model_node_id
1711,5184,Jones St & Beach St,,37.807248,-122.417366,,,65297249,86e7a3a45b9e6e7a3fa01d63948d4260,90361


In [862]:
unique_rail_node_gdf[unique_rail_node_gdf.stop_id == "5184"]

Unnamed: 0,node_id,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled,is_stop,stop_id,model_node_id,geometry,transit_access,walk_access
593,2301,139233,-122.417366,37.807248,1,0,1,5184,90361,POINT (-122.417366 37.807248),1,1
