# This notebook processes the SharedStreets (SHST) extraction data into the standard network format and conflates it with OSM and TomTom

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import glob
import json
import geojson
from shapely.geometry import Point
import osmnx as ox
import networkx as nx
import fiona

In [2]:
from methods import extract_osm_link_from_shst_shape
from methods import osm_link_with_shst_info
from methods import add_two_way_osm
from methods import consolidate_osm_way_to_shst_link
from methods import create_node_gdf
from methods import link_df_to_geojson
from methods import point_df_to_geojson
from methods import fill_na
from methods import ox_graph
from methods import identify_dead_end_nodes
from methods import highway_attribute_list_to_value
from methods import read_shst_extract

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
%load_ext autoreload
%autoreload 2

# I/O

In [5]:
# Input locations; everything lives under the external data directory.
data_external_dir = "../../data/external/"

# SharedStreets extraction output directory.
shst_extract_dir = "".join((data_external_dir, "sharedstreets_extract/"))

# OSMnx link / node extracts.
osm_link_file = "".join((data_external_dir, "osmnx_extract/link.geojson"))
osm_node_file = "".join((data_external_dir, "osmnx_extract/node.geojson"))

# read OSM data

In [6]:
# Load the OSMnx link and node GeoJSON files configured in the I/O cell.
print("-------reading osmnx data---------")

osmnx_link_gdf, osmnx_node_gdf = (
    gpd.read_file(osmnx_path) for osmnx_path in (osm_link_file, osm_node_file)
)

print("-------finished reading osmnx data---------")

# Preview the first few links.
display(osmnx_link_gdf.head(3))

-------reading osmnx data---------
-------finished reading osmnx data---------


# read the SHST extraction output and process it into the standard network format

In [8]:
%%time
# Read the "*.out.geojson" files found under shst_extract_dir through the project helper.
# NOTE(review): presumably the files are concatenated into a single frame — confirm in methods.py.
shst_link_gdf = read_shst_extract(shst_extract_dir, "*.out.geojson")

----------start reading shst extraction data-------------
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_1.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_11.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_2.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_3.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_4.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_5.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_6.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_8.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_9.out.geojson
reading shst extraction data :  ../../data/external/sharedstreets_extract\mtc_7.out.geojson
reading shst extracti

In [9]:
# List the columns available in the SHST extraction frame.
print(shst_link_gdf.columns)

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'roadClass', 'metadata', 'geometry', 'source'],
      dtype='object')


In [10]:
# Preview the first three SHST links.
shst_link_gdf.head(3)

Unnamed: 0,id,fromIntersectionId,toIntersectionId,forwardReferenceId,backReferenceId,roadClass,metadata,geometry,source
0,89d807e34d7630c619bbc23a64e1313e,fc62d018be0236d6b9e43536ebc51830,61c9208c38c5d942c222859cb7073e30,1f25555f0c2f4a6c1205f4f0079c818f,d0e1dddc808772de514b913b8f6875c7,Residential,"{'gisMetadata': [], 'geometryId': '89d807e34d7...","LINESTRING (-121.8172415 38.00969540000001, -1...",../../data/external/sharedstreets_extract\mtc_...
1,95451c235af17ca9cde815dfc3f4b6ce,1bf4572e933b7f6d81535ff217053926,d60ae663adb14804616d2ca06c3ff8fb,89dd7cf0f842c091241b9a139a5e9506,f38e31579534d3a5abb518ef31bb0517,Residential,"{'gisMetadata': [], 'geometryId': '95451c235af...","LINESTRING (-121.8171988 38.0083948, -121.8172...",../../data/external/sharedstreets_extract\mtc_...
2,6c28c78ee3ac710cf40194e2cac3730d,fc62d018be0236d6b9e43536ebc51830,d60ae663adb14804616d2ca06c3ff8fb,31529c3bf7a3c1a1aa9f76a607762487,80c4235b61bd0fe1b1538b8b36b40958,Residential,"{'gisMetadata': [], 'geometryId': '6c28c78ee3a...","LINESTRING (-121.8172415 38.00969540000001, -1...",../../data/external/sharedstreets_extract\mtc_...


In [18]:
# The shst geometry files contain duplicates because of the buffer area along
# the polygon boundaries; keep one row per combination of the key columns below.
shst_dedup_keys = [
    'id',
    'fromIntersectionId',
    'toIntersectionId',
    'forwardReferenceId',
    'backReferenceId',
]

print("--------removing duplicated shst extraction data---------")

geometry_count_before = shst_link_gdf.shape[0]
print("before removing duplicates, shst extraction has geometry # : ", geometry_count_before)

shst_link_non_dup_gdf = shst_link_gdf.drop_duplicates(subset = shst_dedup_keys)

geometry_count_after = shst_link_non_dup_gdf.shape[0]
print("after removing duplicates, shst extraction has geometry # : ", geometry_count_after)

--------removing duplicated shst extraction data---------
before removing duplicates, shst extraction has geometry # :  1237008
after removing duplicates, shst extraction has geometry # :  908281


In [19]:
%%time

# this step is needed for obtaining OSM data for SHST links

print("-------extracting single osm ways by every shst geometry----------")

# Filled in place by the extraction helper as it is called row by row.
shst_link_df_list = []


def _collect_osm_ways(shst_row):
    """Run the extraction helper on one SHST row, accumulating into shst_link_df_list."""
    return extract_osm_link_from_shst_shape(shst_row, shst_link_df_list)


# apply() only drives the row-wise calls here; its return value is not used afterwards.
temp = shst_link_non_dup_gdf.apply(_collect_osm_ways, axis = 1)

# Stack the collected frames into one OSM way table.
osm_link_df = pd.concat(shst_link_df_list)

-------extracting single osm ways by every shst geometry----------
Wall time: 32min 18s


In [20]:
# Number of frames collected by the extraction step (expected to match the number of de-duplicated SHST geometries).
len(shst_link_df_list)

908281

In [21]:
# Inspect the concatenated OSM way records extracted from the SHST geometries.
# NOTE(review): every row shows index 0 because the frames were concatenated without resetting the index.
osm_link_df

Unnamed: 0,link,name,nodeIds,oneWay,roadClass,roundabout,wayId,geometryId
0,False,,"[57927258, 2565726730, 57927260, 2565726729, 2...",False,Residential,False,7874755,89d807e34d7630c619bbc23a64e1313e
0,False,,"[4932505296, 57825298]",False,Residential,False,7857892,95451c235af17ca9cde815dfc3f4b6ce
0,False,,"[57927258, 57825298]",False,Residential,False,7879092,6c28c78ee3ac710cf40194e2cac3730d
0,False,,"[57825300, 57825302]",False,Residential,False,7857892,bcf240fee04a590969705fdaeea814ea
0,False,,"[2450165485, 351884044, 57825302]",False,Residential,False,7864815,2172fcefdf21e39f6810c4deaf9f89ee
0,False,,"[57872518, 57825300]",False,Residential,False,454841378,ca69fa9f3760f10ab6f6fae09fbd4bc1
0,False,,"[57825298, 57825300]",False,Residential,False,7857892,2477e97f737bcc11c2ba86d4eeef8ce1
0,False,,"[57825297, 4932505296]",False,Residential,False,7857892,fee066c4040395bc6b25667e12e073fd
0,False,,"[4932441355, 4932441353]",False,Service,False,502893079,a4ea005b4d92189d670e4cdfdd5b1e30
0,False,,"[4932441355, 4932505268]",False,Residential,False,7880774,c78d00321ccbb0f1d4e6ef465d5854d8


In [25]:
# Join the extracted OSM ways with the de-duplicated SHST link information.
# NOTE(review): the next cell repeats this exact call before adding two-way links,
# so this cell looks redundant — confirm and consider removing it.
osm_link_gdf = osm_link_with_shst_info(osm_link_df,
                                       shst_link_non_dup_gdf)

In [30]:
# note, the sharedstreets extraction using default tile osm/planet 181224

# Step 1: join SHST with OSM.
shst_osm_joined_gdf = osm_link_with_shst_info(osm_link_df, shst_link_non_dup_gdf)

# Step 2: add two way links, using the OSMnx link table.
osm_link_gdf = add_two_way_osm(shst_osm_joined_gdf, osmnx_link_gdf)

shst extraction has geometry:  908281
osm links from shst extraction:  974897
---joining osm shst with osmnx data---
which includes two way links: 843159
and they are geometrys:  798093
after join, osm links from shst extraction:  1818056  out of which there are  151836  links that do not have osm info, due to shst extraction (default tile 181224) contains  42539  osm ids that are not included in latest OSM extraction, e.g. private streets, closed streets.
after join, there are shst geometry # :  1706374


In [32]:
# fill NAs
# for shst links that do not have complete osm info
# (fill_na is a project helper; it reports the numeric and string columns it handles in its output)

osm_link_non_na_gdf = fill_na(osm_link_gdf)

numeric columns:  ['wayId', 'u', 'v', 'key', 'osmid', 'reverse_out']
str columns:  ['nodeIds', 'roadClass', 'shstGeometryId', 'id', 'fromIntersectionId', 'toIntersectionId', 'shstReferenceId', 'geometry', 'access', 'area', 'bridge', 'est_width', 'highway', 'junction', 'landuse', 'lanes', 'maxspeed', 'name', 'oneway', 'ref', 'service', 'tunnel', 'width']


In [33]:
%%time

# aggregate osm data back to shst geometry based links
link_gdf = consolidate_osm_way_to_shst_link(osm_link_non_na_gdf)

# Report how many links and distinct geometries the consolidated network holds.
link_count = len(link_gdf)
geometry_count = link_gdf.shstGeometryId.nunique()
print("after joining back to shst geometry, network has ", link_count, " links, which are based on ", 
      geometry_count, " geometries")

-----start aggregating osm segments to one shst link for forward links----------
-----start aggregating osm segments to one shst link for backward links----------
all
after joining back to shst geometry, network has  1706374  links, which are based on  908281  geometries
Wall time: 15min 2s


In [34]:
# simplify highway: derive one "roadway" value per link from its highway attribute
highway_to_roadway_df = pd.read_csv("../../data/interim/highway_to_roadway.csv").fillna("")

# lookup: highway value -> roadway value
highway_to_roadway_dict = highway_to_roadway_df.set_index("highway")["roadway"].to_dict()

# lookup: roadway value -> hierarchy value
roadway_hierarchy_dict = highway_to_roadway_df.set_index("roadway")["hierarchy"].to_dict()


def _roadway_for_link(link_row):
    """Resolve the roadway value of one link through the project helper and both lookups."""
    return highway_attribute_list_to_value(link_row,
                                           highway_to_roadway_dict,
                                           roadway_hierarchy_dict)


link_gdf["roadway"] = link_gdf.apply(_roadway_for_link, axis = 1)

In [35]:
# Roadway distribution over all links ...
roadway_counts_all = link_gdf.roadway.value_counts()
print(roadway_counts_all)

# ... and over the links whose highway attribute is empty.
roadway_counts_no_highway = link_gdf.loc[link_gdf.highway == "", "roadway"].value_counts()
print(roadway_counts_no_highway)

service           567900
residential       558684
footway           251136
cycleway          118781
tertiary           95320
secondary          69775
primary            27601
motorway_link       4988
trunk               4249
motorway            2871
secondary_link      1567
primary_link        1506
trunk_link          1384
tertiary_link        612
Name: roadway, dtype: int64
cycleway       56190
service        38015
residential    32902
tertiary        3463
secondary       2628
primary         1426
motorway         135
trunk            118
Name: roadway, dtype: int64


In [36]:
# Roadway distribution over all links.
# NOTE(review): this repeats the first print of the previous cell — consider removing one of them.
link_gdf.roadway.value_counts()

service           567900
residential       558684
footway           251136
cycleway          118781
tertiary           95320
secondary          69775
primary            27601
motorway_link       4988
trunk               4249
motorway            2871
secondary_link      1567
primary_link        1506
trunk_link          1384
tertiary_link        612
Name: roadway, dtype: int64

In [37]:
# There are links with different shstGeometryId but the same shstReferenceId
# (and the same to/from nodes). Collect them for inspection, then keep a single
# link per shstReferenceId.

# Name the count column explicitly. pd.DataFrame(series.value_counts()) labels the
# column after the series in pandas < 2.0 but "count" in pandas >= 2.0, so
# filtering through the implicit column name breaks across versions.
shst_refid_counts_df = link_gdf.shstReferenceId.value_counts().to_frame(name = "link_count")

# "> 1" rather than "== 2": a reference id shared by three or more links is a
# duplicate as well, and drop_duplicates below removes those too.
shst_refid_counts_df = shst_refid_counts_df[shst_refid_counts_df.link_count > 1]

# Kept for interactive inspection (e.g. sorted by shstReferenceId, or summarised
# by highway / roadway); nothing below depends on it.
links_with_diff_geomid_same_refid_df = link_gdf[link_gdf.shstReferenceId.isin(shst_refid_counts_df.index.values)]

# drop one of the links that have two shstGeomId (the first occurrence is kept);
# reassign instead of mutating in place so re-running the cell stays predictable
link_gdf = link_gdf.drop_duplicates(subset = ["shstReferenceId"])

link_gdf.shape

(1705772, 32)

In [38]:
# Summarise the final link table: link count and distinct SHST geometries.
final_link_count = len(link_gdf)
final_geometry_count = link_gdf.shstGeometryId.nunique()
print("In the end, network has ", final_link_count, " links, which are based on ", 
      final_geometry_count, " geometries")

In the end, network has  1705772  links, which are based on  908267  geometries


In [39]:
# add network type variables: left-join the indicator table onto the links by roadway
network_type_file = "../../data/interim/network_type_indicator.csv"
network_type_df = pd.read_csv(network_type_file)

link_gdf = pd.merge(left = link_gdf, right = network_type_df, on = "roadway", how = "left")

In [40]:
%%time

# create node gdf from the final link table
node_gdf = create_node_gdf(link_gdf)

node_count = len(node_gdf)
print("In the end, network has ", node_count, " nodes")

-------start creating shst nodes--------
In the end, network has  661159  nodes
Wall time: 1min 43s


In [41]:
# add network type variable for node:
# a node gets the maximum of each access flag over all links that start (u) or end (v) at it
access_cols = ["drive_access", "walk_access", "bike_access"]

A_B_df = pd.concat(
    [
        link_gdf[[end_col] + access_cols].rename(columns = {end_col: "osm_node_id"})
        for end_col in ("u", "v")
    ],
    sort = False,
    ignore_index = True,
)

A_B_df = A_B_df.drop_duplicates()

A_B_df = A_B_df.groupby("osm_node_id").max().reset_index()

node_gdf = pd.merge(left = node_gdf, right = A_B_df, on = "osm_node_id", how = "left")

In [44]:
# Number of distinct node ids in the node table.
node_gdf.osm_node_id.nunique()

661159

In [45]:
# Number of distinct node ids used as link endpoints (u or v); compare with the node table count.
len(set(link_gdf.u.tolist() + link_gdf.v.tolist()))

661159

In [46]:
# Sanity check: links whose end node v is missing from the node table (an empty frame means none).
link_gdf[~link_gdf.v.isin(node_gdf.osm_node_id.tolist())]

Unnamed: 0,shstReferenceId,id,shstGeometryId,fromIntersectionId,toIntersectionId,geometry,u,v,link,nodeIds,oneWay,roadClass,roundabout,wayId,access,area,bridge,est_width,highway,junction,key,landuse,lanes,maxspeed,name,oneway,ref,service,tunnel,width,forward,roadway,drive_access,walk_access,bike_access


In [47]:
# Inspect the de-duplicated SHST frame: columns, shape, and number of distinct geometry ids.
print(shst_link_non_dup_gdf.columns)
print(shst_link_non_dup_gdf.shape)
print(shst_link_non_dup_gdf.id.nunique())

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'roadClass', 'metadata', 'geometry', 'source'],
      dtype='object')
(908281, 9)
908281


In [48]:
# Keep only the SHST shapes whose id is still referenced by a link.
referenced_geometry_ids = link_gdf.shstGeometryId.tolist()
is_referenced = shst_link_non_dup_gdf.id.isin(referenced_geometry_ids)
shape_gdf = shst_link_non_dup_gdf[is_referenced].copy()

shape_count = len(shape_gdf)
print(" In the end, there are " + str(shape_count) + " geometries.")

 In the end, there are 908267 geometries.


In [49]:
%%time

print("-------write out link shape geojson---------")

# Properties carried on every shape feature.
shape_prop = [
    'id',
    'fromIntersectionId',
    'toIntersectionId',
    'forwardReferenceId',
    'backReferenceId',
]
shape_geojson = link_df_to_geojson(shape_gdf, shape_prop)

shape_out_path = "../../data/interim/step3_join_shst_extraction_with_osm/shape.geojson"
with open(shape_out_path, "w") as shape_out:
    json.dump(shape_geojson, shape_out)

-------write out link shape geojson---------
Wall time: 5min 23s


In [50]:
%%time

# write out link variable json
# link unique handle "shstReferenceId" + "shstGeometryId"

print("-------write out link json---------")

# Columns that are left out of the link attribute file.
link_excluded_cols = ["geometry", "nodeIds", "forward", "roadClass", "oneway"]
link_prop = link_gdf.columns.drop(link_excluded_cols).tolist()

# One JSON record per link.
out = link_gdf[link_prop].to_json(orient = "records")

link_out_path = '../../data/interim/step3_join_shst_extraction_with_osm/link.json'
with open(link_out_path, 'w') as link_out:
    link_out.write(out)

-------write out link json---------
Wall time: 36.7 s


In [51]:
# Count links by their drive_access flag.
link_gdf.drive_access.value_counts()

1    1336384
0     369388
Name: drive_access, dtype: int64

In [44]:
# NOTE(review): repeat of the previous cell. Its execution count (44) is out of sequence and its
# saved output differs slightly, so this looks like a leftover from an earlier run — confirm and remove.
link_gdf.drive_access.value_counts()

1    1336380
0     369392
Name: drive_access, dtype: int64

In [52]:
%%time

print("-------write out node geojson---------")

# Every non-geometry column becomes a feature property.
node_prop = node_gdf.columns.drop("geometry").tolist()
node_geojson = point_df_to_geojson(node_gdf, node_prop)

node_out_path = "../../data/interim/step3_join_shst_extraction_with_osm/node.geojson"
with open(node_out_path, "w") as node_out:
    json.dump(node_geojson, node_out)

-------write out node geojson---------
Wall time: 2min 48s
