In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
from scipy.spatial import cKDTree

In [2]:
from methods import link_df_to_geojson
from methods import point_df_to_geojson
from methods import identify_dead_end_nodes

In [3]:
data_interim_dir = "../../data/interim/"

data_external_dir = "../../data/external/"

# Read network

In [4]:
shape_gdf = gpd.read_file(data_interim_dir + "step3_join_shst_extraction_with_osm/" 
                          + "shape.geojson")

In [5]:
node_file = data_interim_dir + "step3_join_shst_extraction_with_osm/" + "node.geojson"
node_gdf = gpd.read_file(node_file)

In [102]:
link_file = data_interim_dir + "step4_conflate_with_tomtom/" + "link.feather"

link_df = pd.read_feather(link_file)

In [7]:
shape_gdf.crs

{'init': 'epsg:4326'}

# Join county name to shapes and nodes

In [8]:
county_file = data_external_dir + "county_boundaries/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"

county_gdf = gpd.read_file(county_file)

county_gdf = county_gdf.to_crs(shape_gdf.crs)

In [9]:
node_county_gdf = gpd.sjoin(node_gdf, county_gdf, how = "left", op = "intersects")

In [10]:
node_gdf.shape

(661159, 6)

In [11]:
node_county_gdf.shape

(661160, 16)

In [12]:
# use nearest match for nodes that did not get county match (e.g. in the Bay)

node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()
node_county_unmatched_gdf = node_county_gdf[node_county_gdf.NAME.isnull()].copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

node_county_unmatched_gdf = node_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
node_county_unmatched_gdf['X'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
node_county_unmatched_gdf['Y'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

node_county_rematch_gdf = pd.DataFrame()

for i in range(len(node_county_unmatched_gdf)):
    point = node_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["NAME"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['shst_node_id'] = node_county_unmatched_gdf.iloc[i]['shst_node_id']
    
    if i == 0:
        node_county_rematch_gdf = add_snap_gdf.copy()
    else:
        node_county_rematch_gdf = node_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

In [13]:
node_county_rematch_dict = dict(zip(node_county_rematch_gdf.shst_node_id, node_county_rematch_gdf.NAME))

In [14]:
node_county_gdf["NAME"] = node_county_gdf["NAME"].fillna(node_county_gdf.shst_node_id.map(node_county_rematch_dict))

In [15]:
node_county_gdf.NAME.value_counts()

Santa Clara      193106
Alameda          126021
Contra Costa      97374
San Mateo         56430
Sonoma            56166
Solano            47305
San Francisco     27656
Marin             26543
Napa              13838
Santa Cruz         5232
Yolo               4238
Lake               2890
San Joaquin        2120
San Benito         1095
Sacramento          383
Mendocino           324
Stanislaus          268
Monterey            124
Merced               47
Name: NAME, dtype: int64

In [16]:
shape_centroid_gdf = shape_gdf.copy()
shape_centroid_gdf["geometry"] = shape_centroid_gdf["geometry"].centroid

shape_centroid_gdf = gpd.sjoin(shape_centroid_gdf, county_gdf, how = "left", op = "intersects")

shape_county_gdf = pd.merge(
    shape_gdf,
    shape_centroid_gdf[["id", "NAME"]],
    how = "left",
    on = "id"
)

In [17]:
shape_gdf.shape

(908267, 6)

In [18]:
shape_county_gdf.shape

(908267, 7)

In [19]:
shape_county_gdf.NAME.value_counts().sum()

907992

In [20]:
shape_county_gdf[shape_county_gdf.NAME.isnull()]

Unnamed: 0,id,fromIntersectionId,toIntersectionId,forwardReferenceId,backReferenceId,geometry,NAME
61460,97923a12ab2e1af21bb937026a44901e,b20b93ee7a961d12df4aa1bff5b6c4b6,066f6941259de5b7cc4be841fdc6d3a8,57cf9e1a7845b106a711e48de0eec385,7d0c43c3eada56909feeb8d47778966c,"LINESTRING (-122.3246558 37.9022385, -122.3250...",
65176,d6c888449000f0fdd3173604d9d7c8d9,45f70acb7476543507028a89760646d0,4ff0b86166effcebe962e4026f822fbf,4e54403f45d454b0b918c5037d5ae6d8,bb468c2f3de7977e4fd9aa68012ebf14,"LINESTRING (-122.3166339 37.8992776, -122.3163...",
65177,566baa6eec4f1551b848cdf23809c285,e443e8c37267364776caa11fb2ee7c41,a33a208ba34710b941df306e3cb27d11,ded94f4618e7b4e0af379a3107650f44,9428a762479e5934d29757830491412f,"LINESTRING (-122.3259169 37.8882585, -122.3261...",
65178,c905f36cfeb26e4a0b7fc3333204aba7,e443e8c37267364776caa11fb2ee7c41,0091ac97067fac13570e1149b384c3c3,ada9b5c5d65cacd45d326d63f00cad22,10fef368bb66241d74392bcc475d9a81,"LINESTRING (-122.3259169 37.8882585, -122.3258...",
65644,97ecb5aed64c894e3bd577e56eac075d,d3021feb77c8cb6c2e6372d6ea4f51dd,dc6bcf18e1afe9fe32e56e3c509c5808,39e6256df670971c1ba99fa6760ede63,804776aac6f27d20c1b33a11fc154813,"LINESTRING (-122.3184502 37.8753084, -122.3184...",
65653,9c0b2ac0777952a69b41daecad325880,41e8cb6bfff6238a00cf8e7d0c486a68,d3021feb77c8cb6c2e6372d6ea4f51dd,e79b8ea26bda85f9bb004057957a8432,5a9469a291ed0fb2db55677368c336ef,"LINESTRING (-122.3179177 37.87479370000001, -1...",
65654,d85bdb1683f8ff4eb13097c3d6792eae,ea921f78797e19cf9faeebf6863f1bd6,dc6bcf18e1afe9fe32e56e3c509c5808,047c5e77015b2c78ac39892a85583da1,6b30fde2c028d293e204915969b93072,"LINESTRING (-122.3179297 37.8750054, -122.3180...",
65655,b62e38a28da207711aae4c9f43c5e7be,ea921f78797e19cf9faeebf6863f1bd6,67c3b7329c49a55bec57a9b418933854,abf53d41663279ba06174ba397a2ad93,38a18147af7fde7757af5cd56c8d4c49,"LINESTRING (-122.3179297 37.8750054, -122.3178...",
65656,ba1346bb33e4819c4dad428dd33632c6,41e8cb6bfff6238a00cf8e7d0c486a68,ea921f78797e19cf9faeebf6863f1bd6,b8abac2d71da7da9015acb96b030d12f,302aa6b40dc1c092cfe2d261e12d9cc9,"LINESTRING (-122.3179177 37.87479370000001, -1...",
65657,0dddafd372dc697d4f3b2d9ff856c33a,d3021feb77c8cb6c2e6372d6ea4f51dd,b27ef63d7b56ad095adf4a61fb3ae7b1,f8ba4f5c194ae0f2084011df535fdc7e,05c571e594d59c520e0442c80edac231,"LINESTRING (-122.3184502 37.8753084, -122.3186...",


In [21]:
# use nearest for links that did not get county match
node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

shape_county_unmatched_gdf = shape_county_gdf[shape_county_gdf.NAME.isnull()].copy()

shape_county_unmatched_gdf = shape_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
shape_county_unmatched_gdf["geometry"] = shape_county_unmatched_gdf["geometry"].centroid
shape_county_unmatched_gdf['X'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
shape_county_unmatched_gdf['Y'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

shape_county_rematch_gdf = pd.DataFrame()

for i in range(len(shape_county_unmatched_gdf)):
    point = shape_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["NAME"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['id'] = shape_county_unmatched_gdf.iloc[i]['id']
    
    if i == 0:
        shape_county_rematch_gdf = add_snap_gdf.copy()
    else:
        shape_county_rematch_gdf = shape_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

In [22]:
shape_county_rematch_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.NAME))

shape_county_gdf["NAME"] = shape_county_gdf["NAME"].fillna(shape_county_gdf.id.map(shape_county_rematch_dict))

# Remove duplicate shape/node in county match, e.g. geometry on the boundary

In [23]:
node_county_gdf.drop_duplicates(subset = ["shst_node_id"], inplace = True)

shape_county_gdf.drop_duplicates(subset = ["id"], inplace = True)

In [24]:
node_gdf.shape

(661159, 6)

In [25]:
node_county_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 661159 entries, 0 to 661158
Data columns (total 16 columns):
osm_node_id     661159 non-null int64
shst_node_id    661159 non-null object
drive_access    661159 non-null int64
walk_access     661159 non-null int64
bike_access     661159 non-null int64
geometry        661159 non-null object
index_right     660885 non-null float64
STATEFP         660885 non-null object
COUNTYFP        660885 non-null object
COUNTYNS        660885 non-null object
AFFGEOID        660885 non-null object
GEOID           660885 non-null object
NAME            661159 non-null object
LSAD            660885 non-null object
ALAND           660885 non-null float64
AWATER          660885 non-null float64
dtypes: float64(3), int64(4), object(9)
memory usage: 85.8+ MB


In [26]:
shape_gdf.shape

(908267, 6)

In [27]:
shape_county_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 908267 entries, 0 to 908266
Data columns (total 7 columns):
id                    908267 non-null object
fromIntersectionId    908267 non-null object
toIntersectionId      908267 non-null object
forwardReferenceId    908267 non-null object
backReferenceId       908267 non-null object
geometry              908267 non-null object
NAME                  908267 non-null object
dtypes: object(7)
memory usage: 55.4+ MB


# Remove links and nodes outside of the 9 counties

In [28]:
MPO_county_list = ['San Francisco', 'Santa Clara', 'Sonoma', 'Marin', 'San Mateo',
       'Contra Costa', 'Solano', 'Napa', 'Alameda']

In [29]:
node_gdf = pd.merge(
    node_gdf, 
    node_county_gdf[["shst_node_id", "NAME"]].rename(columns = {"NAME": "county"}), 
    how = "left", 
    on = "shst_node_id") 

In [30]:
node_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 661159 entries, 0 to 661158
Data columns (total 7 columns):
osm_node_id     661159 non-null int64
shst_node_id    661159 non-null object
drive_access    661159 non-null int64
walk_access     661159 non-null int64
bike_access     661159 non-null int64
geometry        661159 non-null object
county          661159 non-null object
dtypes: int64(4), object(3)
memory usage: 40.4+ MB


In [31]:
shape_MPO_gdf = shape_county_gdf[shape_county_gdf.NAME.isin(MPO_county_list)].copy()

In [32]:
shape_MPO_gdf.columns

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'geometry', 'NAME'],
      dtype='object')

In [103]:
link_MPO_df = link_df[link_df.shstGeometryId.isin(shape_MPO_gdf.id.tolist())].copy()

node_MPO_gdf = node_gdf[node_gdf.shst_node_id.isin(link_MPO_df.fromIntersectionId.tolist() + 
                                                   link_MPO_df.toIntersectionId.tolist())].copy()

In [104]:
# nodes that are outside of MPO but used by MPO links, need to give them the internal county names for node numbering

node_MPO_rename_county_gdf = node_MPO_gdf[~node_MPO_gdf.county.isin(MPO_county_list)].copy()

node_link_county_names_df = pd.concat(
    [
        shape_MPO_gdf.groupby(["fromIntersectionId", "NAME"])["id"].count().reset_index().rename(
            columns = {"fromIntersectionId" : "shst_node_id", "NAME" : "county"}),
        shape_MPO_gdf.groupby(["toIntersectionId", "NAME"])["id"].count().reset_index().rename(
            columns = {"toIntersectionId" : "shst_node_id", "NAME" : "county"})
    ],
    sort = False,
    ignore_index = True
)

node_MPO_rename_county_gdf = pd.merge(
    node_MPO_rename_county_gdf.drop(["county"], axis = 1),
    node_link_county_names_df[["shst_node_id", "county"]],
    how = "left",
    on = "shst_node_id"
)

node_MPO_rename_county_gdf.drop_duplicates(subset = ["osm_node_id", "shst_node_id"], inplace = True)

In [105]:
node_MPO_rename_county_gdf.head(3)

Unnamed: 0,osm_node_id,shst_node_id,drive_access,walk_access,bike_access,geometry,county
0,90535719,9ac4414af3095d8b2fd034dda4ac6330,1,1,1,POINT (-121.7016325 38.1050639),Solano
1,86017483,1af41a838d805cc4ef50c28aa508f6da,1,1,1,POINT (-123.231362 38.81535),Sonoma
2,5040098253,0be862b72977b1f6e8398628c22be835,1,1,1,POINT (-121.5568837 37.7856817),Alameda


In [106]:
node_MPO_gdf.columns

Index(['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access',
       'bike_access', 'geometry', 'county'],
      dtype='object')

In [107]:
node_MPO_rename_county_gdf.columns

Index(['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access',
       'bike_access', 'geometry', 'county'],
      dtype='object')

In [108]:
node_MPO_gdf = pd.concat(
    [
        node_MPO_gdf[node_MPO_gdf.county.isin(MPO_county_list)],
        node_MPO_rename_county_gdf
    ],
    sort = False,
    ignore_index = True
)

In [109]:
link_MPO_gdf = pd.merge(
    link_MPO_df,
    shape_MPO_gdf[["id", "NAME", "geometry"]].rename(columns = {"NAME" : "county"}),
    how = "left", 
    on = "id"
)

link_MPO_gdf = gpd.GeoDataFrame(link_MPO_gdf, geometry = link_MPO_gdf["geometry"],
                            crs={"init" : "epsg:4326"})

In [110]:
link_MPO_gdf.county.value_counts()

Santa Clara      524820
Alameda          321922
Contra Costa     243348
San Mateo        144181
Sonoma           138807
Solano           119966
San Francisco     74441
Marin             64637
Napa              34622
Name: county, dtype: int64

In [111]:
print(link_MPO_gdf.drive_access.value_counts())
print(link_MPO_gdf.walk_access.value_counts())
print(link_MPO_gdf.bike_access.value_counts())

1    1313212
0     353532
Name: drive_access, dtype: int64
1    1653436
0      13308
Name: walk_access, dtype: int64
1    1402514
0     264230
Name: bike_access, dtype: int64


In [112]:
node_MPO_gdf.county.value_counts()

Santa Clara      193195
Alameda          126035
Contra Costa      97375
San Mateo         56453
Sonoma            56183
Solano            47336
San Francisco     27656
Marin             26543
Napa              13841
Name: county, dtype: int64

In [113]:
print(node_MPO_gdf.drive_access.value_counts())
print(node_MPO_gdf.walk_access.value_counts())
print(node_MPO_gdf.bike_access.value_counts())

1    545387
0     99230
Name: drive_access, dtype: int64
1    639799
0      4818
Name: walk_access, dtype: int64
1    576552
0     68065
Name: bike_access, dtype: int64


In [114]:
len(set(link_MPO_gdf.u.tolist() + link_MPO_gdf.v.tolist()))

644617

In [115]:
node_MPO_gdf.shape

(644617, 7)

# Add length

In [116]:
# add length in meters

geom_length = link_MPO_gdf[['geometry']].copy()
geom_length = geom_length.to_crs(epsg = 26915)
geom_length["length"] = geom_length.length

link_MPO_gdf["length"] = geom_length["length"]

# Flag drive dead end

In [117]:
non_dead_end_link_handle_df = link_MPO_gdf[(link_MPO_gdf.drive_access == 1)][["u", "v"]]

dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

cumulative_dead_end_node_list = []

while len(dead_end_node_list) > 0:
    cumulative_dead_end_node_list = cumulative_dead_end_node_list + dead_end_node_list
    
    non_dead_end_link_handle_df = non_dead_end_link_handle_df[~(non_dead_end_link_handle_df.u.isin(dead_end_node_list)) & 
                                            ~(non_dead_end_link_handle_df.v.isin(dead_end_node_list))].copy()
    
    dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

In [118]:
len(cumulative_dead_end_node_list)

137963

In [119]:
cumulative_dead_end_node_list

[26117855,
 26408927,
 29409975,
 31354449,
 31845971,
 31866760,
 33242031,
 33947187,
 35719197,
 35719237,
 35719242,
 35725887,
 52154568,
 52978502,
 52978511,
 52978530,
 52978548,
 52978787,
 52978791,
 52979353,
 52979660,
 52979702,
 52979741,
 52979746,
 52979819,
 52980213,
 52980607,
 52981509,
 52981521,
 52981528,
 52982126,
 52982289,
 52982540,
 52982567,
 52982608,
 52982610,
 52982925,
 52982938,
 52982952,
 52983422,
 52983441,
 52983476,
 52983477,
 52983483,
 52983936,
 52983940,
 52984451,
 52984958,
 52985470,
 52986212,
 52986214,
 52986385,
 52986401,
 52986402,
 52986566,
 52986578,
 52986586,
 52986595,
 52986752,
 52987302,
 52987692,
 52987693,
 52987961,
 52988038,
 52988041,
 52988102,
 52988140,
 52988144,
 52988147,
 52988152,
 52988328,
 52988928,
 52989564,
 52989590,
 52989626,
 52990130,
 52990143,
 52990473,
 52990722,
 52990785,
 52991502,
 52991995,
 52992052,
 52992056,
 52992078,
 52992100,
 52992350,
 52992360,
 52992414,
 52993138,
 52993437,

In [120]:
non_dead_end_link_handle_df

Unnamed: 0,u,v
0,2401244716,2401244712
1,57839068,57869731
2,1024388950,110424978
3,65561433,1325928459
4,4545575571,4545575563
5,3377850399,4313871331
6,4305402275,4305402282
8,4925258551,4332522355
9,65308539,65318508
10,4932240373,4616479310


In [121]:
link_MPO_gdf.drive_access.value_counts()

1    1313212
0     353532
Name: drive_access, dtype: int64

In [122]:
node_MPO_gdf.drive_access.value_counts()

1    545387
0     99230
Name: drive_access, dtype: int64

In [123]:
# update node and link drive access
# if u/v in dead end node list, then drive access = 0
# if osm_node_id in dead end node list, then drive access = 0

link_MPO_gdf['drive_access'] = np.where(((link_MPO_gdf.u.isin(cumulative_dead_end_node_list)) | 
                                               (link_MPO_gdf.v.isin(cumulative_dead_end_node_list))) &
                                        ~(link_MPO_gdf.roadway.isin(['primary', 'secondary', 'motorway', 'primary_link',
                                               'motorway_link', 'trunk_link', 'trunk', 'secondary_link',
                                               'tertiary_link'])),
                                               0,
                                               link_MPO_gdf.drive_access)

In [124]:
# update network type variable for node

A_B_df = pd.concat([link_MPO_gdf[["u", "fromIntersectionId", "drive_access", "walk_access", "bike_access"]].rename(
                            columns = {"u":"osm_node_id", "fromIntersectionId" : "shst_node_id"}),
                  link_MPO_gdf[["v", "toIntersectionId", "drive_access", "walk_access", "bike_access"]].rename(
                            columns = {"v":"osm_node_id", "toIntersectionId" : "shst_node_id"})],
                  sort = False,
                  ignore_index = True)

A_B_df.drop_duplicates(inplace = True)

A_B_df = A_B_df.groupby(["osm_node_id", "shst_node_id"]).max().reset_index()

node_MPO_gdf = pd.merge(node_MPO_gdf.drop(["drive_access", "walk_access", "bike_access"], axis = 1),
                      A_B_df,
                      how = "left",
                      on = ["osm_node_id", "shst_node_id"])

In [125]:
link_MPO_gdf.drive_access.value_counts()

1    1033818
0     632926
Name: drive_access, dtype: int64

In [126]:
node_MPO_gdf.drive_access.value_counts()

1    407491
0    237126
Name: drive_access, dtype: int64

In [127]:
link_MPO_gdf[(link_MPO_gdf.drive_access == 0) & (link_MPO_gdf.walk_access == 0) & (link_MPO_gdf.bike_access == 0)].roadway

Series([], Name: roadway, dtype: object)

In [128]:
link_df.roadway.unique()

array(['service', 'residential', 'tertiary', 'primary', 'cycleway',
       'footway', 'secondary', 'motorway', 'primary_link',
       'motorway_link', 'trunk_link', 'trunk', 'secondary_link',
       'tertiary_link'], dtype=object)

In [129]:
link_MPO_gdf[(link_MPO_gdf.drive_access == 0)].roadway.value_counts()

footway        250922
service        149825
residential    128880
cycleway       102610
tertiary          689
Name: roadway, dtype: int64

In [130]:
node_MPO_gdf[node_MPO_gdf.shst_node_id.isin(node_MPO_rename_county_gdf.shst_node_id.tolist())]

Unnamed: 0,osm_node_id,shst_node_id,geometry,county,drive_access,walk_access,bike_access
644390,90535719,9ac4414af3095d8b2fd034dda4ac6330,POINT (-121.7016325 38.1050639),Solano,1,1,1
644391,86017483,1af41a838d805cc4ef50c28aa508f6da,POINT (-123.231362 38.81535),Sonoma,0,1,1
644392,5040098253,0be862b72977b1f6e8398628c22be835,POINT (-121.5568837 37.7856817),Alameda,0,1,1
644393,95722215,5d60a0cb1999ab51c0a043b8660a310a,POINT (-121.695261 38.514734),Solano,0,1,1
644394,624524794,5ce7982c8f26842d124adb37a07fbd2c,POINT (-121.7568325 37.0491931),Santa Clara,0,1,1
644395,89445717,c3faaf4edfb3666650660c48d8602251,POINT (-121.217535 37.122678),Santa Clara,0,1,1
644396,4003083628,acdea6b75887fafaf03ecd5341df6360,POINT (-121.5373077 37.8429623),Contra Costa,0,1,1
644397,4993578363,57e57e3ba4c52ba8948a03b870ad9aa1,POINT (-121.5564266 37.7588262),Alameda,0,1,1
644398,86452091,8069a166d33ca9002cc0e0d01f98e711,POINT (-123.220412 38.818201),Sonoma,0,1,1
644399,62394174,c0e3f6a6ad024fd6754e19228134509f,POINT (-122.0949798 38.5137149),Solano,1,1,1


In [131]:
link_MPO_gdf[link_MPO_gdf.v == 62394174]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,toIntersectionId,tunnel,u,v,walk_access,wayId,width,county,geometry,length
1252258,"['nan', 'nan']","['nan', 'nan']",1,"['yes', 'nan']",1,"['nan', 'nan']",b6e691d77bc53a92d4f6a62e91301ccc,"['secondary', 'secondary']",3d6d41eef3868e2d4448c5edd8703bc6,"['nan', 'nan']",...,c0e3f6a6ad024fd6754e19228134509f,"['nan', 'nan']",387990215,62394174,1,"[8713932, 25749628]","['nan', 'nan']",Solano,"LINESTRING (-122.0949798 38.5137149, -122.0952...",211.216796


# Rename attributes

In [132]:
node_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 644617 entries, 0 to 644616
Data columns (total 7 columns):
osm_node_id     644617 non-null int64
shst_node_id    644617 non-null object
geometry        644617 non-null object
county          644617 non-null object
drive_access    644617 non-null int64
walk_access     644617 non-null int64
bike_access     644617 non-null int64
dtypes: int64(4), object(3)
memory usage: 39.3+ MB


In [133]:
shape_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 888188 entries, 0 to 907403
Data columns (total 7 columns):
id                    888188 non-null object
fromIntersectionId    888188 non-null object
toIntersectionId      888188 non-null object
forwardReferenceId    888188 non-null object
backReferenceId       888188 non-null object
geometry              888188 non-null object
NAME                  888188 non-null object
dtypes: object(7)
memory usage: 54.2+ MB


In [134]:
link_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1666744 entries, 0 to 1666743
Data columns (total 33 columns):
access                1666744 non-null object
area                  1666744 non-null object
bike_access           1666744 non-null int64
bridge                1666744 non-null object
drive_access          1666744 non-null int64
est_width             1666744 non-null object
fromIntersectionId    1666744 non-null object
highway               1666744 non-null object
id                    1666744 non-null object
junction              1666744 non-null object
key                   1666744 non-null object
landuse               1666744 non-null object
lanes                 1666744 non-null object
link                  1666744 non-null object
maxspeed              1666744 non-null object
name                  1666744 non-null object
oneWay                1666744 non-null object
ref                   1666744 non-null object
roadway               1666744 non-null object
roundab

In [72]:
not_to_export_link_json = ["id", "link"]

In [135]:
link_MPO_gdf.roadway.value_counts()

service           563572
residential       543661
footway           250922
cycleway          102610
tertiary           93964
secondary          68532
primary            26492
motorway_link       4986
trunk               4160
motorway            2778
secondary_link      1566
primary_link        1506
trunk_link          1384
tertiary_link        611
Name: roadway, dtype: int64

# Numbering Nodes

In [136]:
county_node_numbering_start_dict = {
    "San Francisco" : 1000000, 
    "San Mateo" : 1500000,
    "Santa Clara" : 2000000,
    "Alameda" : 2500000,
    "Contra Costa" : 3000000,
    "Solano" : 3500000,
    "Napa" : 4000000,
    "Sonoma" : 4500000,
    "Marin" : 5000000
}

In [137]:
node_MPO_gdf["model_node_id"] = node_MPO_gdf.groupby(["county"]).cumcount()

In [138]:
node_MPO_gdf["county_numbering_start"] = node_MPO_gdf["county"].map(county_node_numbering_start_dict)

node_MPO_gdf["model_node_id"] = node_MPO_gdf["model_node_id"] + node_MPO_gdf["county_numbering_start"]

In [139]:
node_MPO_gdf[node_MPO_gdf.model_node_id.isnull()].county.unique()

array([], dtype=object)

In [140]:
node_MPO_gdf.county.value_counts()

Santa Clara      193195
Alameda          126035
Contra Costa      97375
San Mateo         56453
Sonoma            56183
Solano            47336
San Francisco     27656
Marin             26543
Napa              13841
Name: county, dtype: int64

In [141]:
node_MPO_gdf.model_node_id.nunique()

644617

In [142]:
node_MPO_gdf[node_MPO_gdf.county.isin(county_node_numbering_start_dict.keys())].shape

(644617, 9)

# Numbering Links

In [143]:
county_link_numbering_start_dict = {
    "San Francisco" : 1, 
    "San Mateo" : 1000000,
    "Santa Clara" : 2000000,
    "Alameda" : 3000000,
    "Contra Costa" : 4000000,
    "Solano" : 5000000,
    "Napa" : 6000000,
    "Sonoma" : 7000000,
    "Marin" : 8000000
}

In [144]:
link_MPO_gdf.county.value_counts()

Santa Clara      524820
Alameda          321922
Contra Costa     243348
San Mateo        144181
Sonoma           138807
Solano           119966
San Francisco     74441
Marin             64637
Napa              34622
Name: county, dtype: int64

In [145]:
link_MPO_gdf["model_link_id"] = link_MPO_gdf.groupby(["county"]).cumcount()

link_MPO_gdf["county_numbering_start"] = link_MPO_gdf["county"].map(county_link_numbering_start_dict)

link_MPO_gdf["model_link_id"] = link_MPO_gdf["model_link_id"] + link_MPO_gdf["county_numbering_start"]

In [146]:
link_MPO_gdf[link_MPO_gdf.model_link_id.isnull()].county.unique()

array([], dtype=object)

In [147]:
link_MPO_gdf.model_link_id.nunique()

1666744

In [148]:
link_MPO_gdf[link_MPO_gdf.county.isin(county_link_numbering_start_dict.keys())].shape

(1666744, 35)

# Numbering Link A/B nodes

In [149]:
node_shst_model_id_dict = dict(zip(node_MPO_gdf.shst_node_id, node_MPO_gdf.model_node_id))

In [150]:
link_MPO_gdf["A"] = link_MPO_gdf["fromIntersectionId"].map(node_shst_model_id_dict)
link_MPO_gdf["B"] = link_MPO_gdf["toIntersectionId"].map(node_shst_model_id_dict)

In [151]:
link_MPO_gdf.head(3)

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,walk_access,wayId,width,county,geometry,length,model_link_id,county_numbering_start,A,B
0,,,1,,1,,505d64eb98f1da8d812a3b3801034308,service,208e093f10a62dcc6646a8efa0bde136,,...,1,231794292,,Contra Costa,"LINESTRING (-122.3315542 37.9812044, -122.3315...",84.64525,4000000,4000000,3000000,3007839
1,,,1,,1,,473979c78435732f01ca5a168afb62e0,residential,5fe3056a5583474c0c898983cd6a638b,,...,1,7864473,,Contra Costa,"LINESTRING (-121.94477 37.953322, -121.9443904...",509.283331,4000001,4000000,3000001,3077548
2,,,1,,1,,fc7b575d5d8c961d4a70fca846ae7f80,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8000000,8000000,5000000,5014587


In [152]:
link_MPO_gdf[link_MPO_gdf.shstGeometryId == "38e962038ecf17c6c7394ba88bc3b4c1"]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,walk_access,wayId,width,county,geometry,length,model_link_id,county_numbering_start,A,B
2,,,1,,1,,fc7b575d5d8c961d4a70fca846ae7f80,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8000000,8000000,5000000,5014587
1413864,,,1,,1,,7550e87fc64657a10282672d814ab3c5,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8054842,8000000,5014587,5000000


In [153]:
link_MPO_gdf[link_MPO_gdf.A.isnull()].county.value_counts()

Series([], Name: county, dtype: int64)

In [154]:
node_MPO_gdf[node_MPO_gdf.osm_node_id == 322507978]

Unnamed: 0,osm_node_id,shst_node_id,geometry,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start
644400,322507978,dddd914e805e1676c74d20ad2111ec59,POINT (-121.3149569 37.1691465),Santa Clara,0,1,1,2193094,2000000


In [155]:
node_MPO_gdf.model_node_id.nunique()

644617

# Write out

In [156]:
print("-------write out link shape geojson---------")

shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(shape_MPO_gdf, shape_prop)

with open("../../data/interim/step5_tidy_roadway/shape.geojson", "w") as f:
    json.dump(shape_geojson, f)

    
print("-------write out link json---------")

link_prop = link_MPO_gdf.drop(["geometry"] + not_to_export_link_json, axis = 1).columns.tolist()

out = link_MPO_gdf[link_prop].to_json(orient = "records")

with open('../../data/interim/step5_tidy_roadway/link.json', 'w') as f:
    f.write(out)

    
print("-------write out node geojson---------")

node_prop = node_MPO_gdf.drop("geometry", axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(node_MPO_gdf, node_prop)

with open("../../data/interim/step5_tidy_roadway/node.geojson", "w") as f:
    json.dump(node_geojson, f)  

-------write out link shape geojson---------
-------write out link json---------
-------write out node geojson---------


In [157]:
print("-------write out link feather---------")

link_feather = link_MPO_gdf.drop("geometry", axis = 1).copy()

link_feather.to_feather(data_interim_dir + 'step5_tidy_roadway/link.feather')

-------write out link feather---------
