In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
from scipy.spatial import cKDTree

In [2]:
from methods import link_df_to_geojson
from methods import point_df_to_geojson
from methods import identify_dead_end_nodes

In [3]:
# base directories (relative to this notebook) for intermediate pipeline files and external reference data;
# both end with "/" because later cells build file paths by string concatenation
data_interim_dir = "../../data/interim/"
data_external_dir = "../../data/external/"

# Read network

In [4]:
%%time
# load the link shape GeoJSON written by the step3_join_shst_extraction_with_osm step
shape_file = data_interim_dir + "step3_join_shst_extraction_with_osm/" + "shape.geojson"
shape_gdf = gpd.read_file(shape_file)

# quick look at the result: dimensions, column names, first rows
print(shape_gdf.shape)
print(shape_gdf.columns.tolist())
display(shape_gdf.head())

(908267, 6)
['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId', 'geometry']


Unnamed: 0,id,fromIntersectionId,toIntersectionId,forwardReferenceId,backReferenceId,geometry
0,89d807e34d7630c619bbc23a64e1313e,fc62d018be0236d6b9e43536ebc51830,61c9208c38c5d942c222859cb7073e30,1f25555f0c2f4a6c1205f4f0079c818f,d0e1dddc808772de514b913b8f6875c7,"LINESTRING (-121.81724 38.00970, -121.81667 38..."
1,95451c235af17ca9cde815dfc3f4b6ce,1bf4572e933b7f6d81535ff217053926,d60ae663adb14804616d2ca06c3ff8fb,89dd7cf0f842c091241b9a139a5e9506,f38e31579534d3a5abb518ef31bb0517,"LINESTRING (-121.81720 38.00839, -121.81726 38..."
2,6c28c78ee3ac710cf40194e2cac3730d,fc62d018be0236d6b9e43536ebc51830,d60ae663adb14804616d2ca06c3ff8fb,31529c3bf7a3c1a1aa9f76a607762487,80c4235b61bd0fe1b1538b8b36b40958,"LINESTRING (-121.81724 38.00970, -121.81726 38..."
3,bcf240fee04a590969705fdaeea814ea,3e8bf208fb1c0ead020efb50b80c55a2,41a9c2c8d1301064a72cedcb5f063361,a70e2e0a5ae6887d0c7b05ed4697c38b,5db4d3943495df8be0ec50e54142ba76,"LINESTRING (-121.81806 38.00838, -121.81887 38..."
4,2172fcefdf21e39f6810c4deaf9f89ee,cc73172a9282f3a8567c5d5303a61b42,41a9c2c8d1301064a72cedcb5f063361,91739e1e9ab6e5825162731d510ac4cb,adbcce138fec70e8b6c0abbc86918b1c,"LINESTRING (-121.81878 38.00941, -121.81883 38..."


Wall time: 1min 43s


In [5]:
%%time
# load the node GeoJSON written by the step3_join_shst_extraction_with_osm step
node_file = data_interim_dir + "step3_join_shst_extraction_with_osm/" + "node.geojson"
node_gdf = gpd.read_file(node_file)

# quick look at the result: dimensions, column names, first rows
print(node_gdf.shape)
print(node_gdf.columns.tolist())
display(node_gdf.head())

(661159, 6)
['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access', 'bike_access', 'geometry']


Unnamed: 0,osm_node_id,shst_node_id,drive_access,walk_access,bike_access,geometry
0,2401244716,505d64eb98f1da8d812a3b3801034308,1,1,1,POINT (-122.33155 37.98120)
1,57839068,473979c78435732f01ca5a168afb62e0,1,1,1,POINT (-121.94477 37.95332)
2,1024388950,fc7b575d5d8c961d4a70fca846ae7f80,1,1,1,POINT (-122.53983 37.89800)
3,65561433,6c60cf34e9dc3e123eefb829fe80c76a,1,1,1,POINT (-122.03190 37.26739)
4,4545575571,013e1f994fd86c1f226098f8364f7286,1,1,1,POINT (-122.01186 37.37845)


Wall time: 48 s


In [6]:
%%time
# load the link attribute table (no geometry) written by the step4_conflate_with_tomtom step
link_file = data_interim_dir + "step4_conflate_with_tomtom/" + "link.feather"
link_df = pd.read_feather(link_file)

# quick look at the result: dimensions, column names, first rows
print(link_df.shape)
print(link_df.columns.tolist())
display(link_df.head())

(1705772, 30)
['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width', 'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse', 'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway', 'roundabout', 'service', 'shstGeometryId', 'shstReferenceId', 'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width']


Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,service,shstGeometryId,shstReferenceId,toIntersectionId,tunnel,u,v,walk_access,wayId,width
0,,,1,,1,,505d64eb98f1da8d812a3b3801034308,service,208e093f10a62dcc6646a8efa0bde136,,...,parking_aisle,208e093f10a62dcc6646a8efa0bde136,00000461094d7f302e7afdcfc7ff5ba3,fb8b1bcb4bb81380f0dc83c1aa5006f4,,2401244716,2401244712,1,231794292,
1,,,1,,1,,473979c78435732f01ca5a168afb62e0,residential,5fe3056a5583474c0c898983cd6a638b,,...,,5fe3056a5583474c0c898983cd6a638b,00000b467d2e08f9abf13eeafee3ed46,5adfc2245bcc082b5b004dbc53d9dfb0,,57839068,57869731,1,7864473,
2,,,1,,1,,fc7b575d5d8c961d4a70fca846ae7f80,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,,38e962038ecf17c6c7394ba88bc3b4c1,000018062272093dbaa5d49303062454,7550e87fc64657a10282672d814ab3c5,,1024388950,110424978,1,12183318,
3,,,1,,1,,6c60cf34e9dc3e123eefb829fe80c76a,residential,f3558c6d4c7e3d48742ea3755186c20d,,...,,f3558c6d4c7e3d48742ea3755186c20d,000018a23d1330dcfbae79f44e9fca1a,cce854f3462409d6825c15ac6309f495,,65561433,1325928459,1,8951190,
4,,,1,,1,,013e1f994fd86c1f226098f8364f7286,service,550ef74674d67256659c5ab80d35d7a7,,...,road,550ef74674d67256659c5ab80d35d7a7,0000491ae6975893d350d75ed7aa5842,9513a9ae6406c5d62f9bf1a7bafd2b73,,4545575571,4545575563,1,458575920,


Wall time: 13.7 s


In [8]:
# show the coordinate reference system of the link shapes; the county boundaries are reprojected to it below
print(shape_gdf.crs)

epsg:4326


# Join county name to shapes and nodes

In [9]:
# read the county boundary shapefile and reproject it to the CRS of the link shapes,
# so the spatial joins below compare geometries in the same coordinate system
county_file = data_external_dir + "county_boundaries/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"
county_gdf = gpd.read_file(county_file).to_crs(shape_gdf.crs)

# quick look at the result: dimensions, column names, first rows
print(county_gdf.shape)
print(county_gdf.columns.tolist())
display(county_gdf.head())

(3233, 10)
['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'GEOID', 'NAME', 'LSAD', 'ALAND', 'AWATER', 'geometry']


Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,"POLYGON ((-89.18137 37.04630, -89.17938 37.053..."
1,21,17,516855,0500000US21017,21017,Bourbon,6,750439351,4829777,"POLYGON ((-84.44266 38.28324, -84.44114 38.283..."
2,21,31,516862,0500000US21031,21031,Butler,6,1103571974,13943044,"POLYGON ((-86.94486 37.07341, -86.94346 37.074..."
3,21,65,516879,0500000US21065,21065,Estill,6,655509930,6516335,"POLYGON ((-84.12662 37.64540, -84.12483 37.646..."
4,21,69,516881,0500000US21069,21069,Fleming,6,902727151,7182793,"POLYGON ((-83.98428 38.44549, -83.98246 38.450..."


In [10]:
%%time
# attach county attributes to every node by spatial join;
# how = "left" keeps nodes that intersect no county polygon (their county columns stay null)
node_county_gdf = gpd.sjoin(
    node_gdf,
    county_gdf,
    how = "left",
    op = "intersects"
)

Wall time: 4min 31s


In [11]:
# compare row counts before and after the join:
# more joined rows than unique node ids means some node was joined to more than one county
n_nodes_input = len(node_gdf)
n_nodes_joined = len(node_county_gdf)
n_nodes_joined_unique = node_county_gdf["shst_node_id"].nunique()

print('# of unique nodes: {}'.format(n_nodes_input))
print('# of nodes in spatial join result: {}'.format(n_nodes_joined))
print('# of unique nodes in spatial join result: {}'.format(n_nodes_joined_unique))

# of unique nodes: 661159
# of nodes in spatial join result: 661160
# of unique nodes in spatial join result: 661159


In [13]:
# use nearest match for nodes that did not get county match (e.g. in the Bay):
# each unmatched node takes the county NAME of the closest node that did get a match

node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()
node_county_unmatched_gdf = node_county_gdf[node_county_gdf.NAME.isnull()].copy()

# project to NAD83 / UTM zone 10N (EPSG:26910), the metre-based zone that covers the Bay Area,
# so that the Euclidean distances used by the KD-tree are meaningful
# (EPSG:26915 is UTM zone 15N, far from this data, and distorts distances here)
node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26910)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.x
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.y

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

# to_crs(epsg = ...) avoids the deprecated {'init' : 'epsg:...'} syntax and its FutureWarning
node_county_unmatched_gdf = node_county_unmatched_gdf.to_crs(epsg = 26910)
node_county_unmatched_gdf['X'] = node_county_unmatched_gdf.geometry.x
node_county_unmatched_gdf['Y'] = node_county_unmatched_gdf.geometry.y

# query the tree once for all unmatched nodes instead of appending one row per node in a loop
if len(node_county_unmatched_gdf) > 0:
    nearest_dist, nearest_pos = node_matched_tree.query(
        node_county_unmatched_gdf[['X', 'Y']].values, k = 1)
    nearest_county_names = node_county_matched_gdf['NAME'].values[nearest_pos]
else:
    # nothing to rematch: still build a frame with the expected columns so the next cell works
    nearest_county_names = []

# one row per unmatched node: the snapped county name and the node id it belongs to
node_county_rematch_gdf = pd.DataFrame({
    'NAME' : nearest_county_names,
    'shst_node_id' : node_county_unmatched_gdf['shst_node_id'].values
})

  return _prepare_from_string(" ".join(pjargs))


In [15]:
# fill out missing county names:
# look up the snapped county by node id and use it only where the spatial join left NAME null
node_county_rematch_dict = dict(zip(node_county_rematch_gdf["shst_node_id"],
                                    node_county_rematch_gdf["NAME"]))

node_rematched_names = node_county_gdf["shst_node_id"].map(node_county_rematch_dict)
node_county_gdf["NAME"] = node_county_gdf["NAME"].fillna(node_rematched_names)

print(node_county_gdf["NAME"].value_counts())

Santa Clara      193106
Alameda          126021
Contra Costa      97374
San Mateo         56430
Sonoma            56166
Solano            47305
San Francisco     27656
Marin             26543
Napa              13838
Santa Cruz         5232
Yolo               4238
Lake               2890
San Joaquin        2120
San Benito         1095
Sacramento          383
Mendocino           324
Stanislaus          268
Monterey            124
Merced               47
Name: NAME, dtype: int64


In [16]:
%%time
# tag each link shape with the county whose polygon intersects the shape's centroid

# replace every line geometry by its centroid (on a copy, shape_gdf keeps the lines)
shape_centroid_gdf = shape_gdf.copy()
shape_centroid_gdf["geometry"] = shape_centroid_gdf.geometry.centroid

# left join keeps shapes whose centroid falls in no county polygon (NAME stays null)
shape_centroid_gdf = gpd.sjoin(
    shape_centroid_gdf,
    county_gdf,
    how = "left",
    op = "intersects"
)

# bring the county NAME back onto the original line geometries
shape_county_names_df = shape_centroid_gdf[["id", "NAME"]]
shape_county_gdf = pd.merge(shape_gdf, shape_county_names_df, how = "left", on = "id")




Wall time: 6min 24s


In [17]:
# compare row counts before and after the join to spot shapes matched to more than one county
n_shapes_input = len(shape_gdf)
n_shapes_joined = len(shape_county_gdf)
n_shapes_joined_unique = shape_county_gdf["id"].nunique()

print('# of unique link shapes: {}'.format(n_shapes_input))
print('# of link shapes in spatial join result: {}'.format(n_shapes_joined))
print('# of unique link shapes in spatial join result: {}'.format(n_shapes_joined_unique))

# of unique link shapes: 908267
# of link shapes in spatial join result: 908267
# of unique link shapes in spatial join result: 908267


In [18]:
# use nearest for links that did not get county match:
# each unmatched link shape takes the county NAME of the node closest to the shape's centroid
node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()

# project to NAD83 / UTM zone 10N (EPSG:26910), the metre-based zone that covers the Bay Area,
# so that the Euclidean distances used by the KD-tree are meaningful
# (EPSG:26915 is UTM zone 15N, far from this data, and distorts distances here)
node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26910)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.x
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.y

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

shape_county_unmatched_gdf = shape_county_gdf[shape_county_gdf.NAME.isnull()].copy()

# to_crs(epsg = ...) avoids the deprecated {'init' : 'epsg:...'} syntax and its FutureWarning;
# the centroid is taken after projecting, i.e. in planar coordinates
shape_county_unmatched_gdf = shape_county_unmatched_gdf.to_crs(epsg = 26910)
shape_county_unmatched_gdf["geometry"] = shape_county_unmatched_gdf["geometry"].centroid
shape_county_unmatched_gdf['X'] = shape_county_unmatched_gdf.geometry.x
shape_county_unmatched_gdf['Y'] = shape_county_unmatched_gdf.geometry.y

# query the tree once for all unmatched shapes instead of appending one row per shape in a loop
if len(shape_county_unmatched_gdf) > 0:
    nearest_dist, nearest_pos = node_matched_tree.query(
        shape_county_unmatched_gdf[['X', 'Y']].values, k = 1)
    nearest_county_names = node_county_matched_gdf['NAME'].values[nearest_pos]
else:
    # nothing to rematch: still build a frame with the expected columns so the next cell works
    nearest_county_names = []

# one row per unmatched shape: the snapped county name and the shape id it belongs to
shape_county_rematch_gdf = pd.DataFrame({
    'NAME' : nearest_county_names,
    'id' : shape_county_unmatched_gdf['id'].values
})

print(shape_county_rematch_gdf.shape)

  return _prepare_from_string(" ".join(pjargs))


(275, 2)


In [20]:
# fill out missing county names:
# look up the snapped county by shape id and use it only where the spatial join left NAME null
shape_county_rematch_dict = dict(zip(shape_county_rematch_gdf["id"],
                                     shape_county_rematch_gdf["NAME"]))

shape_rematched_names = shape_county_gdf["id"].map(shape_county_rematch_dict)
shape_county_gdf["NAME"] = shape_county_gdf["NAME"].fillna(shape_rematched_names)

print(shape_county_gdf["NAME"].value_counts())

Santa Clara      278209
Alameda          173779
Contra Costa     130627
San Mateo         77557
Sonoma            70904
Solano            63237
San Francisco     41427
Marin             34603
Napa              17845
Santa Cruz         6053
Yolo               5072
Lake               3452
San Joaquin        2895
San Benito         1234
Sacramento          499
Mendocino           367
Stanislaus          319
Monterey            139
Merced               49
Name: NAME, dtype: int64


# Remove duplicate shape/node in county match, e.g. geometry on the boundary

In [21]:
# keep the first county match for every node id / shape id
node_county_gdf = node_county_gdf.drop_duplicates(subset = ["shst_node_id"])
shape_county_gdf = shape_county_gdf.drop_duplicates(subset = ["id"])

# Remove links and nodes outside of the 9 counties

In [22]:
# county names (as found in the county boundary NAME column) that are kept in the network
MPO_county_list = [
    'San Francisco',
    'Santa Clara',
    'Sonoma',
    'Marin',
    'San Mateo',
    'Contra Costa',
    'Solano',
    'Napa',
    'Alameda',
]

In [23]:
# keep only the link shapes whose county is one of the MPO counties
shape_in_MPO_mask = shape_county_gdf["NAME"].isin(MPO_county_list)
shape_MPO_gdf = shape_county_gdf[shape_in_MPO_mask].copy()

In [28]:
# add the matched county name to the node table as column "county"
node_county_name_df = node_county_gdf[["shst_node_id", "NAME"]].rename(columns = {"NAME": "county"})

node_gdf = pd.merge(node_gdf, node_county_name_df, how = "left", on = "shst_node_id")

In [30]:
# links are kept when their geometry id belongs to a shape inside the MPO counties
link_in_MPO_mask = link_df["shstGeometryId"].isin(shape_MPO_gdf["id"])
link_MPO_df = link_df[link_in_MPO_mask].copy()

# nodes are kept when they are the from- or to-intersection of any kept link
MPO_link_node_ids = pd.concat(
    [link_MPO_df["fromIntersectionId"], link_MPO_df["toIntersectionId"]],
    ignore_index = True
)
node_MPO_gdf = node_gdf[node_gdf["shst_node_id"].isin(MPO_link_node_ids)].copy()

In [31]:
# nodes that are outside of MPO but used by MPO links, need to give them the internal county names for node numbering

# nodes whose own county is not an MPO county
node_outside_MPO_mask = ~node_MPO_gdf.county.isin(MPO_county_list)
node_MPO_rename_county_gdf = node_MPO_gdf[node_outside_MPO_mask].copy()

# (node id, county) pairs taken from both ends of every MPO link shape;
# the "id" column holds the number of shapes behind each pair
node_link_county_names_df = pd.concat(
    [
        shape_MPO_gdf.groupby([end_col, "NAME"])["id"].count().reset_index().rename(
            columns = {end_col : "shst_node_id", "NAME" : "county"})
        for end_col in ["fromIntersectionId", "toIntersectionId"]
    ],
    sort = False,
    ignore_index = True
)

# swap the external county for the county of a link shape that uses the node
node_MPO_rename_county_gdf = pd.merge(
    node_MPO_rename_county_gdf.drop(["county"], axis = 1),
    node_link_county_names_df[["shst_node_id", "county"]],
    how = "left",
    on = "shst_node_id"
)

# a node can pick up several counties from the merge; keep the first one
node_MPO_rename_county_gdf = node_MPO_rename_county_gdf.drop_duplicates(
    subset = ["osm_node_id", "shst_node_id"])

In [32]:
# final node table: nodes already tagged with an MPO county, followed by the re-tagged external nodes
node_inside_MPO_gdf = node_MPO_gdf[node_MPO_gdf.county.isin(MPO_county_list)]

node_MPO_gdf = pd.concat(
    [node_inside_MPO_gdf, node_MPO_rename_county_gdf],
    sort = False,
    ignore_index = True
)

In [33]:
# attach the county name and the line geometry of the matching shape to every MPO link
link_MPO_gdf = pd.merge(
    link_MPO_df,
    shape_MPO_gdf[["id", "NAME", "geometry"]].rename(columns = {"NAME" : "county"}),
    how = "left", 
    on = "id"        # note: "id" is "shstGeometryId"
)

# turn the merged table into a GeoDataFrame; the geometries come from shape_MPO_gdf, so reuse its CRS
# instead of the deprecated {"init" : "epsg:4326"} dict, which raises a FutureWarning
link_MPO_gdf = gpd.GeoDataFrame(link_MPO_gdf, geometry = link_MPO_gdf["geometry"],
                                crs = shape_MPO_gdf.crs)

  return _prepare_from_string(" ".join(pjargs))


# Add length

In [34]:
# add length in meters

# measure in NAD83 / UTM zone 10N (EPSG:26910), the UTM zone that covers the Bay Area;
# EPSG:26915 is UTM zone 15N, whose central meridian is far from this data and inflates lengths
geom_length = link_MPO_gdf[['geometry']].to_crs(epsg = 26910)
geom_length["length"] = geom_length.geometry.length

# assignment aligns on the index, which geom_length shares with link_MPO_gdf
link_MPO_gdf["length"] = geom_length["length"]

# Drop circular links (u == v)

In [35]:
# a circular link starts and ends at the same OSM node
is_circular_link = link_MPO_gdf["u"].eq(link_MPO_gdf["v"])
circular_link_gdf = link_MPO_gdf[is_circular_link].copy()

print('{} circular links, {} unique id'.format(len(circular_link_gdf), circular_link_gdf["id"].nunique()))

5560 circular links, 4921 unique id


In [36]:
# remove every link whose shstReferenceId belongs to a circular link
is_circular_reference = link_MPO_gdf["shstReferenceId"].isin(circular_link_gdf["shstReferenceId"])
link_MPO_gdf = link_MPO_gdf[~is_circular_reference]

n_links_remaining = link_MPO_gdf.shape[0]
n_reference_ids_remaining = link_MPO_gdf["shstReferenceId"].nunique()
n_geometry_ids_remaining = link_MPO_gdf["id"].nunique()

print('after dropping circular links, {} links remain, with {} unique shstReferenceId, {} unique id'.format(
        n_links_remaining,
        n_reference_ids_remaining,
        n_geometry_ids_remaining))

after dropping circular links, 1661184 links remain, with 1661184 unique shstReferenceId, 883267 unique id


In [37]:
# keep only the shapes that are still referenced by a remaining link
shape_still_used_mask = shape_MPO_gdf["id"].isin(link_MPO_gdf["id"])
shape_MPO_gdf = shape_MPO_gdf[shape_still_used_mask]

print('after dropping circular links, {} shapes remain, with {} unique id'.format(
        len(shape_MPO_gdf),
        shape_MPO_gdf["id"].nunique()))

after dropping circular links, 883267 shapes remain, with 883267 unique id


In [38]:
# keep only the nodes that are still the start (u) or end (v) of a remaining link
node_is_link_start = node_MPO_gdf["osm_node_id"].isin(link_MPO_gdf["u"].tolist())
node_is_link_end = node_MPO_gdf["osm_node_id"].isin(link_MPO_gdf["v"].tolist())
node_MPO_gdf = node_MPO_gdf[node_is_link_start | node_is_link_end]

print('after dropping circular links, {} nodes remain'.format(len(node_MPO_gdf)))

after dropping circular links, 643811 nodes remain


# Flag drive dead end

In [39]:
# start from the u/v pairs of all drivable links
non_dead_end_link_handle_df = link_MPO_gdf[(link_MPO_gdf.drive_access == 1)][["u", "v"]]

# repeatedly ask identify_dead_end_nodes for dead-end nodes, remove every link touching one of them,
# and repeat on the remaining links until no further dead-end node is reported
cumulative_dead_end_node_list = []
dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

while len(dead_end_node_list) > 0:
    cumulative_dead_end_node_list = cumulative_dead_end_node_list + dead_end_node_list

    touches_dead_end = (non_dead_end_link_handle_df.u.isin(dead_end_node_list) |
                        non_dead_end_link_handle_df.v.isin(dead_end_node_list))
    non_dead_end_link_handle_df = non_dead_end_link_handle_df[~touches_dead_end].copy()

    dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

print('# of dead end nodes: {}'.format(len(cumulative_dead_end_node_list)))

# of dead end nodes: 142963


In [40]:
# update link drive access:
# a link with u or v in the dead end node list gets drive_access = 0,
# unless its roadway type is one of the higher-class types listed below (those keep their value)

print('drive access stats of links: \n {}\n'.format(link_MPO_gdf.drive_access.value_counts()))

roadway_types_kept_drivable = ['primary', 'secondary', 'motorway', 'primary_link',
                               'motorway_link', 'trunk_link', 'trunk', 'secondary_link',
                               'tertiary_link']

link_touches_dead_end = (link_MPO_gdf.u.isin(cumulative_dead_end_node_list) |
                         link_MPO_gdf.v.isin(cumulative_dead_end_node_list))
link_is_kept_drivable = link_MPO_gdf.roadway.isin(roadway_types_kept_drivable)

link_MPO_gdf['drive_access'] = np.where(link_touches_dead_end & ~link_is_kept_drivable,
                                        0,
                                        link_MPO_gdf.drive_access)

print('after flagging dead end links, drive access stats of links: \n {}'.format(link_MPO_gdf.drive_access.value_counts()))

drive access stats of links: 
 1    1309170
0     352014
Name: drive_access, dtype: int64
after flagging dead end links, drive access stats of links: 
 1    1019540
0     641644
Name: drive_access, dtype: int64


In [41]:
# recompute the node access flags from the links: a node gets, for each mode,
# the maximum access value over all links that start or end at it

print('drive access stats of nodes: \n {}\n'.format(node_MPO_gdf.drive_access.value_counts()))

node_access_cols = ["drive_access", "walk_access", "bike_access"]

# stack the start-node and end-node views of the links under common node id column names
link_start_node_df = link_MPO_gdf[["u", "fromIntersectionId"] + node_access_cols].rename(
    columns = {"u":"osm_node_id", "fromIntersectionId" : "shst_node_id"})
link_end_node_df = link_MPO_gdf[["v", "toIntersectionId"] + node_access_cols].rename(
    columns = {"v":"osm_node_id", "toIntersectionId" : "shst_node_id"})

A_B_df = pd.concat([link_start_node_df, link_end_node_df],
                   sort = False,
                   ignore_index = True)

A_B_df = A_B_df.drop_duplicates()

A_B_df = A_B_df.groupby(["osm_node_id", "shst_node_id"]).max().reset_index()

# replace the old node access columns with the link-derived ones
node_MPO_gdf = pd.merge(node_MPO_gdf.drop(node_access_cols, axis = 1),
                        A_B_df,
                        how = "left",
                        on = ["osm_node_id", "shst_node_id"])

print('after flagging dead end links, drive access stats of nodes: \n {}\n'.format(node_MPO_gdf.drive_access.value_counts()))

drive access stats of nodes: 
 1    545259
0     98552
Name: drive_access, dtype: int64

after flagging dead end links, drive access stats of nodes: 
 1    402361
0    241450
Name: drive_access, dtype: int64



In [42]:
# check: no link should be closed to every mode (drive, walk and bike access all 0); the count should be 0
link_no_access_mask = (
    link_MPO_gdf["drive_access"].eq(0)
    & link_MPO_gdf["walk_access"].eq(0)
    & link_MPO_gdf["bike_access"].eq(0)
)
link_MPO_gdf[link_no_access_mask].shape[0]

0

In [43]:
# double check: list all roadway types, then the roadway types of the links that are not drivable

all_roadway_types = link_df["roadway"].unique()
print('roadway types: \n{}\n'.format(all_roadway_types))

non_drivable_link_gdf = link_MPO_gdf[link_MPO_gdf["drive_access"] == 0]
print('roadway types of link with drive_access==0: \n{}\n'.format(
    non_drivable_link_gdf["roadway"].value_counts()))

roadway types: 
['service' 'residential' 'tertiary' 'primary' 'cycleway' 'footway'
 'secondary' 'motorway' 'primary_link' 'motorway_link' 'trunk_link'
 'trunk' 'secondary_link' 'tertiary_link']

roadway types of link with drive_access==0: 
footway        250682
service        156349
residential    132552
cycleway       101332
tertiary          729
Name: roadway, dtype: int64



# Drop duplicate links between same AB node pair

In [44]:
# count the links (shstReferenceId) sharing each u/v pair, smallest counts first
links_per_AB_pair = link_MPO_gdf.groupby(["u", "v"])["shstReferenceId"].count()
non_unique_AB_links_df = links_per_AB_pair.sort_values().reset_index()

display(non_unique_AB_links_df)

Unnamed: 0,u,v,shstReferenceId
0,281266,65358141,1
1,4913174652,4913174650,1
2,4913174652,4913174644,1
3,4913174650,4913174654,1
4,4913174650,4913174652,1
...,...,...,...
1632697,65409920,670714007,3
1632698,4913479606,4913479605,3
1632699,747649693,747649688,5
1632700,747649688,747649693,5


In [46]:
# if there are fewer u/v pairs than links, some pairs are served by more than one link
n_unique_reference_ids = link_MPO_gdf["shstReferenceId"].nunique()
n_unique_AB_pairs = len(non_unique_AB_links_df)

print('links has {} unique shstReferenceId, {} unique u/v pairs'.format(n_unique_reference_ids,
                                                                        n_unique_AB_pairs))

links has 1661184 unique shstReferenceId, 1632702 unique u/v pairs


In [47]:
# u/v pairs with multiple links
non_unique_AB_links_df = non_unique_AB_links_df[non_unique_AB_links_df.shstReferenceId > 1]
# report the number of such pairs (the placeholder was previously printed unfilled)
print('{} u/v pairs have multiple links'.format(non_unique_AB_links_df.shape[0]))

# get their link attributes: one row per link of every multi-link u/v pair
non_unique_AB_links_df = pd.merge(non_unique_AB_links_df[["u", "v"]],
                                  link_MPO_gdf[["u", "v", "highway", "roadway",
                                                "drive_access", "bike_access", "walk_access", "length",
                                                "wayId", "shstGeometryId", "shstReferenceId", "geometry"]],
                                  how = "left",
                                  on = ["u", "v"])

In [48]:
# read roadway hierarchy crosswalk from the interim data directory configured at the top of the notebook
roadway_hierarchy_df = pd.read_csv(data_interim_dir + "highway_to_roadway.csv")

# keep one row per roadway type so the merge below cannot multiply link rows
roadway_hierarchy_df = roadway_hierarchy_df.drop_duplicates(subset = "roadway")

# merge roadway hierarchy to u/v pairs with multiple links
non_unique_AB_links_df = pd.merge(non_unique_AB_links_df,
                                  roadway_hierarchy_df[["roadway", "hierarchy"]],
                                  how = "left",
                                  on = "roadway")

In [49]:
# rank the candidate links of every u/v pair: lowest hierarchy value first, then links open to
# drive / bike / walk before closed ones, then the shortest link
AB_sort_columns = ["hierarchy", "drive_access", "bike_access", "walk_access", "length"]
AB_sort_ascending = [True, False, False, False, True]

non_unique_AB_links_sorted_df = non_unique_AB_links_df.sort_values(
    by = AB_sort_columns,
    ascending = AB_sort_ascending)

# keep only one link for each u/v pair: the best-ranked one
unique_AB_links_df = non_unique_AB_links_sorted_df.drop_duplicates(
    subset = ["u", "v"],
    keep = "first")

In [50]:
# select links that should be dropped: every link of a multi-link u/v pair that was not picked as the one to keep
from_list = non_unique_AB_links_df.shstReferenceId.tolist()
to_list = unique_AB_links_df.shstReferenceId.tolist()

# membership is tested against a set; testing against the list rescans it for every candidate link
kept_link_id_set = set(to_list)
drop_link_model_link_id_list = [c for c in from_list if c not in kept_link_id_set]

In [51]:
# remove the duplicate links, then the shapes that no remaining link refers to
link_is_dropped = link_MPO_gdf["shstReferenceId"].isin(drop_link_model_link_id_list)
link_MPO_gdf = link_MPO_gdf[~link_is_dropped]

shape_still_used_mask = shape_MPO_gdf["id"].isin(link_MPO_gdf["id"])
shape_MPO_gdf = shape_MPO_gdf[shape_still_used_mask].copy()

# Numbering Nodes

In [52]:
# number ranges for nodes by county
# each value is the first model_node_id used for that county; the start values are 500,000 apart
# and the per-county running count is added to them in the node numbering cell
county_node_numbering_start_dict = {
    "San Francisco" : 1000000, 
    "San Mateo" : 1500000,
    "Santa Clara" : 2000000,
    "Alameda" : 2500000,
    "Contra Costa" : 3000000,
    "Solano" : 3500000,
    "Napa" : 4000000,
    "Sonoma" : 4500000,
    "Marin" : 5000000
}

In [54]:
# create model_node_id by county: running count of the node within its county (starting at 0)
# plus the first id of that county's number range
node_count_within_county = node_MPO_gdf.groupby(["county"]).cumcount()

node_MPO_gdf["model_node_id"] = node_count_within_county
node_MPO_gdf["county_numbering_start"] = node_MPO_gdf["county"].map(county_node_numbering_start_dict)
node_MPO_gdf["model_node_id"] += node_MPO_gdf["county_numbering_start"]

# nodes per county, including nodes without a county (NaN) if there are any
node_MPO_gdf["county"].value_counts(dropna = False)

Santa Clara      192799
Alameda          125942
Contra Costa      97273
San Mateo         56363
Sonoma            56146
Solano            47318
San Francisco     27608
Marin             26529
Napa              13833
Name: county, dtype: int64

In [56]:
# check consistency: the number of distinct model_node_id values should equal
# the number of nodes whose county has a number range
n_unique_model_node_ids = node_MPO_gdf["model_node_id"].nunique()
node_has_numbered_county = node_MPO_gdf["county"].isin(county_node_numbering_start_dict.keys())

print('{} unique model_node_id, {} nodes have county tagging'.format(
    n_unique_model_node_ids,
    node_MPO_gdf[node_has_numbered_county].shape[0]))

643811 unique model_node_id, 643811 nodes have county tagging


# Numbering Links

In [55]:
# number ranges for links by county
# each value is the first model_link_id used for that county; San Francisco starts at 1,
# the other counties start at successive multiples of 1,000,000
county_link_numbering_start_dict = {
    "San Francisco" : 1, 
    "San Mateo" : 1000000,
    "Santa Clara" : 2000000,
    "Alameda" : 3000000,
    "Contra Costa" : 4000000,
    "Solano" : 5000000,
    "Napa" : 6000000,
    "Sonoma" : 7000000,
    "Marin" : 8000000
}

In [57]:
# create model_link_id by county: running count of the link within its county (starting at 0)
# plus the first id of that county's number range
link_count_within_county = link_MPO_gdf.groupby(["county"]).cumcount()

link_MPO_gdf["model_link_id"] = link_count_within_county
link_MPO_gdf["county_numbering_start"] = link_MPO_gdf["county"].map(county_link_numbering_start_dict)
link_MPO_gdf["model_link_id"] += link_MPO_gdf["county_numbering_start"]

# links per county, including links without a county (NaN) if there are any
link_MPO_gdf["county"].value_counts(dropna = False)

Santa Clara      516565
Alameda          315314
Contra Costa     237448
San Mateo        140927
Sonoma           135042
Solano           117102
San Francisco     73440
Marin             63184
Napa              33680
Name: county, dtype: int64

In [58]:
# check consistency: the number of distinct model_link_id values should equal
# the number of links whose county has a number range
n_unique_model_link_ids = link_MPO_gdf["model_link_id"].nunique()
link_has_numbered_county = link_MPO_gdf["county"].isin(county_link_numbering_start_dict.keys())

print('{} unique model_link_id, {} links have county tagging'.format(
    n_unique_model_link_ids,
    link_MPO_gdf[link_has_numbered_county].shape[0]))

1632702 unique model_link_id, 1632702 links have county tagging


# Numbering Link A/B nodes

In [59]:
# lookup from shst_node_id to model_node_id, used to number the end nodes of the links
node_shst_model_id_dict = dict(
    zip(node_MPO_gdf["shst_node_id"], node_MPO_gdf["model_node_id"])
)

In [60]:
# A is the model_node_id of the link's from-intersection, B that of its to-intersection;
# an intersection id missing from the lookup gives a null A / B
link_from_node_ids = link_MPO_gdf["fromIntersectionId"]
link_to_node_ids = link_MPO_gdf["toIntersectionId"]

link_MPO_gdf["A"] = link_from_node_ids.map(node_shst_model_id_dict)
link_MPO_gdf["B"] = link_to_node_ids.map(node_shst_model_id_dict)

In [61]:
# check: all links should have A and B; both printed series should be empty,
# otherwise they list the counties of the links with a missing end node
links_missing_A = link_MPO_gdf[link_MPO_gdf["A"].isnull()]
links_missing_B = link_MPO_gdf[link_MPO_gdf["B"].isnull()]

print(links_missing_A.county.value_counts())
print(links_missing_B.county.value_counts())

Series([], Name: county, dtype: int64)
Series([], Name: county, dtype: int64)


# Write out

In [62]:
# link columns left out of the link json export
not_to_export_link_json = ["id", "link"]

# all step 5 outputs go under the interim data directory configured at the top of the notebook,
# the same base path the link feather export uses
step5_output_dir = data_interim_dir + "step5_tidy_roadway/"

print("-------write out link shape geojson---------")

# shape attributes passed to link_df_to_geojson as the properties to export
shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(shape_MPO_gdf, shape_prop)

with open(step5_output_dir + "shape_yq.geojson", "w") as f:
    json.dump(shape_geojson, f)


print("-------write out link json---------")

# export every link column except the geometry and the excluded columns
link_prop = link_MPO_gdf.drop(["geometry"] + not_to_export_link_json, axis = 1).columns.tolist()

out = link_MPO_gdf[link_prop].to_json(orient = "records")

with open(step5_output_dir + "link_yq.json", "w") as f:
    f.write(out)


print("-------write out node geojson---------")

# export every node column except the geometry as a property
node_prop = node_MPO_gdf.drop("geometry", axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(node_MPO_gdf, node_prop)

with open(step5_output_dir + "node_yq.geojson", "w") as f:
    json.dump(node_geojson, f)

-------write out link shape geojson---------
-------write out link json---------
-------write out node geojson---------


In [63]:
# list the columns of the final link table (still including geometry)
link_MPO_gdf.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'geometry', 'length', 'model_link_id',
       'county_numbering_start', 'A', 'B'],
      dtype='object')

In [64]:
print("-------write out link feather---------")

# write the link attributes without the geometry column;
# the index is reset to a fresh 0..n-1 range before calling to_feather
link_feather_file = data_interim_dir + 'step5_tidy_roadway/link_yq.feather'

link_without_geometry = link_MPO_gdf.drop("geometry", axis = 1)
link_feather = link_without_geometry.reset_index(drop = True).copy()

link_feather.to_feather(link_feather_file)

-------write out link feather---------



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  link_feather.to_feather(data_interim_dir + 'step5_tidy_roadway/link_yq.feather')


In [65]:
# list the columns written to the link feather file (link columns without geometry)
link_feather.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'length', 'model_link_id', 'county_numbering_start', 'A',
       'B'],
      dtype='object')

In [67]:
# list the columns of the final shape table
shape_MPO_gdf.columns

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'geometry', 'NAME'],
      dtype='object')

In [69]:
# list the columns of the final node table
node_MPO_gdf.columns

Index(['osm_node_id', 'shst_node_id', 'geometry', 'county', 'drive_access',
       'walk_access', 'bike_access', 'model_node_id',
       'county_numbering_start'],
      dtype='object')