In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
from scipy.spatial import cKDTree

In [2]:
from methods import link_df_to_geojson
from methods import point_df_to_geojson
from methods import identify_dead_end_nodes

In [3]:
data_interim_dir = "../../data/interim/"

data_external_dir = "../../data/external/"

# Read network

In [4]:
shape_gdf = gpd.read_file(data_interim_dir + "step3_join_shst_extraction_with_osm/" 
                          + "shape.geojson")

In [5]:
node_file = data_interim_dir + "step3_join_shst_extraction_with_osm/" + "node.geojson"
node_gdf = gpd.read_file(node_file)

In [6]:
link_file = data_interim_dir + "step4_conflate_with_tomtom/" + "link.feather"

link_df = pd.read_feather(link_file)

In [7]:
shape_gdf.crs

{'init': 'epsg:4326'}

# Join county name to shapes and nodes

In [8]:
county_file = data_external_dir + "county_boundaries/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"

county_gdf = gpd.read_file(county_file)

county_gdf = county_gdf.to_crs(shape_gdf.crs)

In [9]:
node_county_gdf = gpd.sjoin(node_gdf, county_gdf, how = "left", op = "intersects")

In [10]:
node_gdf.shape

(661159, 6)

In [11]:
node_county_gdf.shape

(661160, 16)

In [12]:
# use nearest match for nodes that did not get county match (e.g. in the Bay)

node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()
node_county_unmatched_gdf = node_county_gdf[node_county_gdf.NAME.isnull()].copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

node_county_unmatched_gdf = node_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
node_county_unmatched_gdf['X'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
node_county_unmatched_gdf['Y'] = node_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

node_county_rematch_gdf = pd.DataFrame()

for i in range(len(node_county_unmatched_gdf)):
    point = node_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["NAME"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['shst_node_id'] = node_county_unmatched_gdf.iloc[i]['shst_node_id']
    
    if i == 0:
        node_county_rematch_gdf = add_snap_gdf.copy()
    else:
        node_county_rematch_gdf = node_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

In [13]:
node_county_rematch_dict = dict(zip(node_county_rematch_gdf.shst_node_id, node_county_rematch_gdf.NAME))

In [14]:
node_county_gdf["NAME"] = node_county_gdf["NAME"].fillna(node_county_gdf.shst_node_id.map(node_county_rematch_dict))

In [15]:
node_county_gdf.NAME.value_counts()

Santa Clara      193106
Alameda          126021
Contra Costa      97374
San Mateo         56430
Sonoma            56166
Solano            47305
San Francisco     27656
Marin             26543
Napa              13838
Santa Cruz         5232
Yolo               4238
Lake               2890
San Joaquin        2120
San Benito         1095
Sacramento          383
Mendocino           324
Stanislaus          268
Monterey            124
Merced               47
Name: NAME, dtype: int64

In [16]:
shape_centroid_gdf = shape_gdf.copy()
shape_centroid_gdf["geometry"] = shape_centroid_gdf["geometry"].centroid

shape_centroid_gdf = gpd.sjoin(shape_centroid_gdf, county_gdf, how = "left", op = "intersects")

shape_county_gdf = pd.merge(
    shape_gdf,
    shape_centroid_gdf[["id", "NAME"]],
    how = "left",
    on = "id"
)

In [17]:
shape_gdf.shape

(908267, 6)

In [18]:
shape_county_gdf.shape

(908267, 7)

In [19]:
shape_county_gdf.NAME.value_counts().sum()

907992

In [20]:
shape_county_gdf[shape_county_gdf.NAME.isnull()]

Unnamed: 0,id,fromIntersectionId,toIntersectionId,forwardReferenceId,backReferenceId,geometry,NAME
61460,97923a12ab2e1af21bb937026a44901e,b20b93ee7a961d12df4aa1bff5b6c4b6,066f6941259de5b7cc4be841fdc6d3a8,57cf9e1a7845b106a711e48de0eec385,7d0c43c3eada56909feeb8d47778966c,"LINESTRING (-122.3246558 37.9022385, -122.3250...",
65176,d6c888449000f0fdd3173604d9d7c8d9,45f70acb7476543507028a89760646d0,4ff0b86166effcebe962e4026f822fbf,4e54403f45d454b0b918c5037d5ae6d8,bb468c2f3de7977e4fd9aa68012ebf14,"LINESTRING (-122.3166339 37.8992776, -122.3163...",
65177,566baa6eec4f1551b848cdf23809c285,e443e8c37267364776caa11fb2ee7c41,a33a208ba34710b941df306e3cb27d11,ded94f4618e7b4e0af379a3107650f44,9428a762479e5934d29757830491412f,"LINESTRING (-122.3259169 37.8882585, -122.3261...",
65178,c905f36cfeb26e4a0b7fc3333204aba7,e443e8c37267364776caa11fb2ee7c41,0091ac97067fac13570e1149b384c3c3,ada9b5c5d65cacd45d326d63f00cad22,10fef368bb66241d74392bcc475d9a81,"LINESTRING (-122.3259169 37.8882585, -122.3258...",
65644,97ecb5aed64c894e3bd577e56eac075d,d3021feb77c8cb6c2e6372d6ea4f51dd,dc6bcf18e1afe9fe32e56e3c509c5808,39e6256df670971c1ba99fa6760ede63,804776aac6f27d20c1b33a11fc154813,"LINESTRING (-122.3184502 37.8753084, -122.3184...",
65653,9c0b2ac0777952a69b41daecad325880,41e8cb6bfff6238a00cf8e7d0c486a68,d3021feb77c8cb6c2e6372d6ea4f51dd,e79b8ea26bda85f9bb004057957a8432,5a9469a291ed0fb2db55677368c336ef,"LINESTRING (-122.3179177 37.87479370000001, -1...",
65654,d85bdb1683f8ff4eb13097c3d6792eae,ea921f78797e19cf9faeebf6863f1bd6,dc6bcf18e1afe9fe32e56e3c509c5808,047c5e77015b2c78ac39892a85583da1,6b30fde2c028d293e204915969b93072,"LINESTRING (-122.3179297 37.8750054, -122.3180...",
65655,b62e38a28da207711aae4c9f43c5e7be,ea921f78797e19cf9faeebf6863f1bd6,67c3b7329c49a55bec57a9b418933854,abf53d41663279ba06174ba397a2ad93,38a18147af7fde7757af5cd56c8d4c49,"LINESTRING (-122.3179297 37.8750054, -122.3178...",
65656,ba1346bb33e4819c4dad428dd33632c6,41e8cb6bfff6238a00cf8e7d0c486a68,ea921f78797e19cf9faeebf6863f1bd6,b8abac2d71da7da9015acb96b030d12f,302aa6b40dc1c092cfe2d261e12d9cc9,"LINESTRING (-122.3179177 37.87479370000001, -1...",
65657,0dddafd372dc697d4f3b2d9ff856c33a,d3021feb77c8cb6c2e6372d6ea4f51dd,b27ef63d7b56ad095adf4a61fb3ae7b1,f8ba4f5c194ae0f2084011df535fdc7e,05c571e594d59c520e0442c80edac231,"LINESTRING (-122.3184502 37.8753084, -122.3186...",


In [21]:
# use nearest for links that did not get county match
node_county_matched_gdf = node_county_gdf[node_county_gdf.NAME.notnull()].copy()

node_county_matched_gdf = node_county_matched_gdf.to_crs(epsg = 26915)
node_county_matched_gdf['X'] = node_county_matched_gdf.geometry.map(lambda g:g.x)
node_county_matched_gdf['Y'] = node_county_matched_gdf.geometry.map(lambda g:g.y)

node_matched_inventory_ref = node_county_matched_gdf[['X', 'Y']].values
node_matched_tree = cKDTree(node_matched_inventory_ref)

shape_county_unmatched_gdf = shape_county_gdf[shape_county_gdf.NAME.isnull()].copy()

shape_county_unmatched_gdf = shape_county_unmatched_gdf.to_crs({'init' : 'epsg:26915'})
shape_county_unmatched_gdf["geometry"] = shape_county_unmatched_gdf["geometry"].centroid
shape_county_unmatched_gdf['X'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.x)
shape_county_unmatched_gdf['Y'] = shape_county_unmatched_gdf['geometry'].apply(lambda p: p.y)

shape_county_rematch_gdf = pd.DataFrame()

for i in range(len(shape_county_unmatched_gdf)):
    point = shape_county_unmatched_gdf.iloc[i][['X', 'Y']].values
    dd, ii = node_matched_tree.query(point, k = 1)
    add_snap_gdf = gpd.GeoDataFrame(node_county_matched_gdf.iloc[ii][["NAME"]]).transpose().reset_index(drop = True)
    
    add_snap_gdf['id'] = shape_county_unmatched_gdf.iloc[i]['id']
    
    if i == 0:
        shape_county_rematch_gdf = add_snap_gdf.copy()
    else:
        shape_county_rematch_gdf = shape_county_rematch_gdf.append(add_snap_gdf, ignore_index=True, sort=False)

In [22]:
shape_county_rematch_dict = dict(zip(shape_county_rematch_gdf.id, shape_county_rematch_gdf.NAME))

shape_county_gdf["NAME"] = shape_county_gdf["NAME"].fillna(shape_county_gdf.id.map(shape_county_rematch_dict))

# Remove duplicate shape/node in county match, e.g. geometry on the boundary

In [23]:
node_county_gdf.drop_duplicates(subset = ["shst_node_id"], inplace = True)

shape_county_gdf.drop_duplicates(subset = ["id"], inplace = True)

In [24]:
node_gdf.shape

(661159, 6)

In [25]:
node_county_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 661159 entries, 0 to 661158
Data columns (total 16 columns):
osm_node_id     661159 non-null int64
shst_node_id    661159 non-null object
drive_access    661159 non-null int64
walk_access     661159 non-null int64
bike_access     661159 non-null int64
geometry        661159 non-null object
index_right     660885 non-null float64
STATEFP         660885 non-null object
COUNTYFP        660885 non-null object
COUNTYNS        660885 non-null object
AFFGEOID        660885 non-null object
GEOID           660885 non-null object
NAME            661159 non-null object
LSAD            660885 non-null object
ALAND           660885 non-null float64
AWATER          660885 non-null float64
dtypes: float64(3), int64(4), object(9)
memory usage: 85.8+ MB


In [26]:
shape_gdf.shape

(908267, 6)

In [27]:
shape_county_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 908267 entries, 0 to 908266
Data columns (total 7 columns):
id                    908267 non-null object
fromIntersectionId    908267 non-null object
toIntersectionId      908267 non-null object
forwardReferenceId    908267 non-null object
backReferenceId       908267 non-null object
geometry              908267 non-null object
NAME                  908267 non-null object
dtypes: object(7)
memory usage: 55.4+ MB


# Remove links and nodes outside of the 9 counties

In [28]:
MPO_county_list = ['San Francisco', 'Santa Clara', 'Sonoma', 'Marin', 'San Mateo',
       'Contra Costa', 'Solano', 'Napa', 'Alameda']

In [29]:
node_gdf = pd.merge(
    node_gdf, 
    node_county_gdf[["shst_node_id", "NAME"]].rename(columns = {"NAME": "county"}), 
    how = "left", 
    on = "shst_node_id") 

In [30]:
node_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 661159 entries, 0 to 661158
Data columns (total 7 columns):
osm_node_id     661159 non-null int64
shst_node_id    661159 non-null object
drive_access    661159 non-null int64
walk_access     661159 non-null int64
bike_access     661159 non-null int64
geometry        661159 non-null object
county          661159 non-null object
dtypes: int64(4), object(3)
memory usage: 40.4+ MB


In [174]:
shape_MPO_gdf = shape_county_gdf[shape_county_gdf.NAME.isin(MPO_county_list)].copy()

In [175]:
shape_MPO_gdf.columns

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'geometry', 'NAME'],
      dtype='object')

In [176]:
shape_MPO_gdf.shape

(888188, 7)

In [177]:
link_df[link_df.u == 5372055804][["roadway", "v", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,v,drive_access,walk_access,bike_access
142409,footway,5000827329,0,1,0
548904,cycleway,890045129,0,1,1
1221973,footway,5000826420,0,1,0


In [178]:
link_df[link_df.v == 5372055804][["roadway", "u", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,u,drive_access,walk_access,bike_access
338507,footway,5000826420,0,1,0
903556,service,890045140,1,1,1
926879,footway,5000827329,0,1,0


In [179]:
link_MPO_df = link_df[link_df.shstGeometryId.isin(shape_MPO_gdf.id.tolist())].copy()

node_MPO_gdf = node_gdf[node_gdf.shst_node_id.isin(link_MPO_df.fromIntersectionId.tolist() + 
                                                   link_MPO_df.toIntersectionId.tolist())].copy()

In [180]:
# nodes that are outside of MPO but used by MPO links, need to give them the internal county names for node numbering

node_MPO_rename_county_gdf = node_MPO_gdf[~node_MPO_gdf.county.isin(MPO_county_list)].copy()

node_link_county_names_df = pd.concat(
    [
        shape_MPO_gdf.groupby(["fromIntersectionId", "NAME"])["id"].count().reset_index().rename(
            columns = {"fromIntersectionId" : "shst_node_id", "NAME" : "county"}),
        shape_MPO_gdf.groupby(["toIntersectionId", "NAME"])["id"].count().reset_index().rename(
            columns = {"toIntersectionId" : "shst_node_id", "NAME" : "county"})
    ],
    sort = False,
    ignore_index = True
)

node_MPO_rename_county_gdf = pd.merge(
    node_MPO_rename_county_gdf.drop(["county"], axis = 1),
    node_link_county_names_df[["shst_node_id", "county"]],
    how = "left",
    on = "shst_node_id"
)

node_MPO_rename_county_gdf.drop_duplicates(subset = ["osm_node_id", "shst_node_id"], inplace = True)

In [181]:
node_MPO_rename_county_gdf.head(3)

Unnamed: 0,osm_node_id,shst_node_id,drive_access,walk_access,bike_access,geometry,county
0,90535719,9ac4414af3095d8b2fd034dda4ac6330,1,1,1,POINT (-121.7016325 38.1050639),Solano
1,86017483,1af41a838d805cc4ef50c28aa508f6da,1,1,1,POINT (-123.231362 38.81535),Sonoma
2,5040098253,0be862b72977b1f6e8398628c22be835,1,1,1,POINT (-121.5568837 37.7856817),Alameda


In [182]:
node_MPO_gdf.columns

Index(['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access',
       'bike_access', 'geometry', 'county'],
      dtype='object')

In [183]:
node_MPO_rename_county_gdf.columns

Index(['osm_node_id', 'shst_node_id', 'drive_access', 'walk_access',
       'bike_access', 'geometry', 'county'],
      dtype='object')

In [184]:
node_MPO_gdf = pd.concat(
    [
        node_MPO_gdf[node_MPO_gdf.county.isin(MPO_county_list)],
        node_MPO_rename_county_gdf
    ],
    sort = False,
    ignore_index = True
)

In [185]:
link_MPO_gdf = pd.merge(
    link_MPO_df,
    shape_MPO_gdf[["id", "NAME", "geometry"]].rename(columns = {"NAME" : "county"}),
    how = "left", 
    on = "id"
)

link_MPO_gdf = gpd.GeoDataFrame(link_MPO_gdf, geometry = link_MPO_gdf["geometry"],
                            crs={"init" : "epsg:4326"})

In [186]:
link_MPO_gdf.county.value_counts()

Santa Clara      524820
Alameda          321922
Contra Costa     243348
San Mateo        144181
Sonoma           138807
Solano           119966
San Francisco     74441
Marin             64637
Napa              34622
Name: county, dtype: int64

In [187]:
print(link_MPO_gdf.drive_access.value_counts())
print(link_MPO_gdf.walk_access.value_counts())
print(link_MPO_gdf.bike_access.value_counts())

1    1313212
0     353532
Name: drive_access, dtype: int64
1    1653436
0      13308
Name: walk_access, dtype: int64
1    1402514
0     264230
Name: bike_access, dtype: int64


In [188]:
node_MPO_gdf.county.value_counts()

Santa Clara      193195
Alameda          126035
Contra Costa      97375
San Mateo         56453
Sonoma            56183
Solano            47336
San Francisco     27656
Marin             26543
Napa              13841
Name: county, dtype: int64

In [189]:
print(node_MPO_gdf.drive_access.value_counts())
print(node_MPO_gdf.walk_access.value_counts())
print(node_MPO_gdf.bike_access.value_counts())

1    545387
0     99230
Name: drive_access, dtype: int64
1    639799
0      4818
Name: walk_access, dtype: int64
1    576552
0     68065
Name: bike_access, dtype: int64


In [190]:
len(set(link_MPO_gdf.u.tolist() + link_MPO_gdf.v.tolist()))

644617

In [191]:
node_MPO_gdf.shape

(644617, 7)

# Add length

In [192]:
# add length in meters

geom_length = link_MPO_gdf[['geometry']].copy()
geom_length = geom_length.to_crs(epsg = 26915)
geom_length["length"] = geom_length.length

link_MPO_gdf["length"] = geom_length["length"]

# drop circular links (u == v)

In [193]:
circular_link_gdf = link_MPO_gdf[link_MPO_gdf.u == link_MPO_gdf.v].copy()
circular_link_gdf.shape

(5560, 33)

In [194]:
circular_link_gdf.id.nunique()

4921

In [195]:
link_MPO_gdf.shape

(1666744, 33)

In [196]:
link_MPO_gdf.shstReferenceId.nunique()

1666744

In [197]:
link_MPO_gdf = link_MPO_gdf[~ link_MPO_gdf.shstReferenceId.isin(circular_link_gdf.shstReferenceId.tolist())]

In [198]:
link_MPO_gdf.shape

(1661184, 33)

In [199]:
link_MPO_gdf.shstReferenceId.nunique()

1661184

In [200]:
link_MPO_gdf.id.nunique()

883267

In [201]:
shape_MPO_gdf = shape_MPO_gdf[shape_MPO_gdf.id.isin(link_MPO_gdf.id)]

In [202]:
shape_MPO_gdf.id.nunique()

883267

In [203]:
node_MPO_gdf = node_MPO_gdf[(node_MPO_gdf.osm_node_id.isin(link_MPO_gdf.u.tolist())) | 
                            (node_MPO_gdf.osm_node_id.isin(link_MPO_gdf.v.tolist()))]

In [204]:
node_MPO_gdf.shape

(643811, 7)

# Flag drive dead end

In [205]:
non_dead_end_link_handle_df = link_MPO_gdf[(link_MPO_gdf.drive_access == 1)][["u", "v"]]

dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

cumulative_dead_end_node_list = []

while len(dead_end_node_list) > 0:
    cumulative_dead_end_node_list = cumulative_dead_end_node_list + dead_end_node_list
    
    non_dead_end_link_handle_df = non_dead_end_link_handle_df[~(non_dead_end_link_handle_df.u.isin(dead_end_node_list)) & 
                                            ~(non_dead_end_link_handle_df.v.isin(dead_end_node_list))].copy()
    
    dead_end_node_list = identify_dead_end_nodes(non_dead_end_link_handle_df)

In [206]:
len(cumulative_dead_end_node_list)

142850

In [207]:
5372055804 in cumulative_dead_end_node_list

True

In [208]:
link_df[link_df.u == 5372055804].v

142409     5000827329
548904      890045129
1221973    5000826420
Name: v, dtype: int64

In [209]:
non_dead_end_link_handle_df

Unnamed: 0,u,v
0,2401244716,2401244712
1,57839068,57869731
2,1024388950,110424978
3,65561433,1325928459
4,4545575571,4545575563
5,3377850399,4313871331
6,4305402275,4305402282
8,4925258551,4332522355
9,65308539,65318508
10,4932240373,4616479310


In [210]:
link_MPO_gdf.drive_access.value_counts()

1    1308672
0     352512
Name: drive_access, dtype: int64

In [211]:
node_MPO_gdf.drive_access.value_counts()

1    545062
0     98749
Name: drive_access, dtype: int64

In [212]:
# update node and link drive access
# if u/v in dead end node list, then drive access = 0
# if osm_node_id in dead end node list, then drive access = 0

link_MPO_gdf['drive_access'] = np.where(((link_MPO_gdf.u.isin(cumulative_dead_end_node_list)) | 
                                               (link_MPO_gdf.v.isin(cumulative_dead_end_node_list))) &
                                        ~(link_MPO_gdf.roadway.isin(['primary', 'secondary', 'motorway', 'primary_link',
                                               'motorway_link', 'trunk_link', 'trunk', 'secondary_link',
                                               'tertiary_link'])),
                                               0,
                                               link_MPO_gdf.drive_access)

In [213]:
# update network type variable for node

A_B_df = pd.concat([link_MPO_gdf[["u", "fromIntersectionId", "drive_access", "walk_access", "bike_access"]].rename(
                            columns = {"u":"osm_node_id", "fromIntersectionId" : "shst_node_id"}),
                  link_MPO_gdf[["v", "toIntersectionId", "drive_access", "walk_access", "bike_access"]].rename(
                            columns = {"v":"osm_node_id", "toIntersectionId" : "shst_node_id"})],
                  sort = False,
                  ignore_index = True)

A_B_df.drop_duplicates(inplace = True)

A_B_df = A_B_df.groupby(["osm_node_id", "shst_node_id"]).max().reset_index()

node_MPO_gdf = pd.merge(node_MPO_gdf.drop(["drive_access", "walk_access", "bike_access"], axis = 1),
                      A_B_df,
                      how = "left",
                      on = ["osm_node_id", "shst_node_id"])

In [214]:
A_B_df.shape

(643811, 5)

In [215]:
link_MPO_gdf.drive_access.value_counts()

1    1019260
0     641924
Name: drive_access, dtype: int64

In [216]:
node_MPO_gdf.shape

(643811, 7)

In [217]:
node_MPO_gdf.drive_access.value_counts()

1    402274
0    241537
Name: drive_access, dtype: int64

In [218]:
link_MPO_gdf[(link_MPO_gdf.drive_access == 0) & (link_MPO_gdf.walk_access == 0) & (link_MPO_gdf.bike_access == 0)].roadway

Series([], Name: roadway, dtype: object)

In [219]:
link_df.roadway.unique()

array(['service', 'residential', 'tertiary', 'primary', 'cycleway',
       'footway', 'secondary', 'motorway', 'primary_link',
       'motorway_link', 'trunk_link', 'trunk', 'secondary_link',
       'tertiary_link'], dtype=object)

In [220]:
link_MPO_gdf[(link_MPO_gdf.drive_access == 0)].roadway.value_counts()

footway        250682
service        156057
residential    132626
cycleway       101830
tertiary          729
Name: roadway, dtype: int64

In [221]:
node_MPO_gdf[node_MPO_gdf.shst_node_id.isin(node_MPO_rename_county_gdf.shst_node_id.tolist())]

Unnamed: 0,osm_node_id,shst_node_id,geometry,county,drive_access,walk_access,bike_access
643584,90535719,9ac4414af3095d8b2fd034dda4ac6330,POINT (-121.7016325 38.1050639),Solano,1,1,1
643585,86017483,1af41a838d805cc4ef50c28aa508f6da,POINT (-123.231362 38.81535),Sonoma,0,1,1
643586,5040098253,0be862b72977b1f6e8398628c22be835,POINT (-121.5568837 37.7856817),Alameda,0,1,1
643587,95722215,5d60a0cb1999ab51c0a043b8660a310a,POINT (-121.695261 38.514734),Solano,0,1,1
643588,624524794,5ce7982c8f26842d124adb37a07fbd2c,POINT (-121.7568325 37.0491931),Santa Clara,0,1,1
643589,89445717,c3faaf4edfb3666650660c48d8602251,POINT (-121.217535 37.122678),Santa Clara,0,1,1
643590,4003083628,acdea6b75887fafaf03ecd5341df6360,POINT (-121.5373077 37.8429623),Contra Costa,0,1,1
643591,4993578363,57e57e3ba4c52ba8948a03b870ad9aa1,POINT (-121.5564266 37.7588262),Alameda,0,1,1
643592,86452091,8069a166d33ca9002cc0e0d01f98e711,POINT (-123.220412 38.818201),Sonoma,0,1,1
643593,62394174,c0e3f6a6ad024fd6754e19228134509f,POINT (-122.0949798 38.5137149),Solano,1,1,1


In [222]:
link_MPO_gdf[link_MPO_gdf.v == 62394174]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,toIntersectionId,tunnel,u,v,walk_access,wayId,width,county,geometry,length
1252258,"['nan', 'nan']","['nan', 'nan']",1,"['yes', 'nan']",1,"['nan', 'nan']",b6e691d77bc53a92d4f6a62e91301ccc,"['secondary', 'secondary']",3d6d41eef3868e2d4448c5edd8703bc6,"['nan', 'nan']",...,c0e3f6a6ad024fd6754e19228134509f,"['nan', 'nan']",387990215,62394174,1,"[8713932, 25749628]","['nan', 'nan']",Solano,"LINESTRING (-122.0949798 38.5137149, -122.0952...",211.216796


# Drop duplicate links between same AB node pair

In [223]:
non_unique_AB_links_df = link_MPO_gdf.groupby(["u", "v"]).shstReferenceId.count().sort_values().reset_index()
non_unique_AB_links_df = non_unique_AB_links_df[non_unique_AB_links_df.shstReferenceId > 1]

non_unique_AB_links_df = pd.merge(non_unique_AB_links_df[["u", "v"]],
                              link_MPO_gdf[["u", "v", "highway", "roadway", "drive_access", "bike_access", "walk_access", "length",
                                      "wayId", "shstGeometryId", "shstReferenceId", "geometry"]],
                              how = "left",
                              on = ["u", "v"])

In [224]:
non_unique_AB_links_df.roadway.value_counts()

service          34100
residential      10503
footway           5318
cycleway          5032
tertiary           876
secondary          627
primary            266
trunk               20
motorway            10
trunk_link           4
primary_link         3
motorway_link        3
Name: roadway, dtype: int64

In [225]:
link_MPO_gdf[link_MPO_gdf.u == link_MPO_gdf.v]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,toIntersectionId,tunnel,u,v,walk_access,wayId,width,county,geometry,length


In [226]:
non_unique_AB_links_df.shape

(56762, 12)

In [227]:
link_MPO_gdf[link_MPO_gdf.shstGeometryId == "c6aa967814865bf0eec8ebd0e5b27d3c"]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,toIntersectionId,tunnel,u,v,walk_access,wayId,width,county,geometry,length
871576,,,1,,0,,40b7757ac264ef8fc3deeee11d25514f,residential,c6aa967814865bf0eec8ebd0e5b27d3c,,...,4319c448c6e983dc44395113accaae58,,55956915,1129773955,1,7699978,,Napa,"LINESTRING (-122.2577045 38.1849324, -122.2577...",113.890865
1386565,,,1,,0,,4319c448c6e983dc44395113accaae58,residential,c6aa967814865bf0eec8ebd0e5b27d3c,,...,40b7757ac264ef8fc3deeee11d25514f,,1129773955,55956915,1,7699978,,Napa,"LINESTRING (-122.2577045 38.1849324, -122.2577...",113.890865


In [228]:
roadway_hierarchy_df = pd.read_csv("../../data/interim/highway_to_roadway.csv")

In [229]:
roadway_hierarchy_df = roadway_hierarchy_df.drop_duplicates(subset = "roadway")

In [230]:
non_unique_AB_links_df = pd.merge(non_unique_AB_links_df,
                              roadway_hierarchy_df[["roadway", "hierarchy"]],
                              how = "left",
                              on = "roadway")

In [232]:
non_unique_AB_links_df.shape

(56762, 13)

In [233]:
# sort on hierarchy (ascending), drive_access(descending), bike_access(descending), walk_access(descending), length(ascending)

non_unique_AB_links_sorted_df =  non_unique_AB_links_df.sort_values(
    by = ["hierarchy", "drive_access", "bike_access", "walk_access", "length"],
    ascending = [True, False, False, False, True])

unique_AB_links_df = non_unique_AB_links_sorted_df.drop_duplicates(subset = ["u", "v"], keep = "first")

In [234]:
unique_AB_links_df

Unnamed: 0,u,v,highway,roadway,drive_access,bike_access,walk_access,length,wayId,shstGeometryId,shstReferenceId,geometry,hierarchy
45791,80574977,6143592274,,motorway,1,0,0,146.285563,25006974,4a4040e3db69d4cc7a24a3dc400ff2bf,eb648201446c48094c36d0739395b709,"LINESTRING (-122.2206949 37.8539139, -122.2204...",1
28267,2269022504,2269022489,motorway,motorway,1,0,0,159.864904,25031162,4278ae2c7bb28d101e56b7c8b3f5b3de,daefefc1234e53ea1953b20ddaa25827,"LINESTRING (-122.2615455 38.1328911, -122.2620...",1
5888,345570842,345570884,motorway,motorway,1,0,0,342.244801,48675482,1ab6fbf1f72344c1157aba42aa36ae3b,4c26658336f3c8e42f3a86bb5723a0e9,"LINESTRING (-122.5418763 38.0123289, -122.5418...",1
20336,242858467,242858487,motorway,motorway,1,0,0,449.214731,6319195,9bd1a82fa54b922d135f8fdebcda9905,6e38b9561161cd5d4ed3d0818d15fbc8,"LINESTRING (-122.1016067 37.6892574, -122.1001...",1
52006,4518155785,4518156092,"['motorway', 'motorway']",motorway,1,0,0,554.209424,"[455411594, 455410282]",85439a06751a946069d3a86973c9a64a,217253518a29aa69c40a4114d557d24d,"LINESTRING (-122.1120148 38.0255832, -122.1126...",1
34670,258157744,258773620,"['motorway', '', 'motorway']",motorway,1,0,0,1560.480741,"[83149248, 105429350, 49120137]",5fe5cb7bc973e1d0c92029d6b462adc9,0019760a2e3c8942dc68d10377285734,"LINESTRING (-122.2093825 37.863447, -122.20960...",1
44304,65350670,1674152019,motorway_link,motorway_link,1,0,0,47.795926,175354957,a60e501a60b7e5c38b33ba76aea03a86,3e2ad8cd124da74cf508ffe4a89c1838,"LINESTRING (-122.3931083 37.7839299, -122.3931...",2
14195,4930462776,4930462772,motorway_link,motorway_link,1,0,0,78.711176,393526122,9f0b1f0a67d0426dbe26aa6c82a09a44,c9d2ae75be374cbb7e895d46e4517e07,"LINESTRING (-122.4030107 37.9320856, -122.4021...",2
33730,5480616905,5480616902,trunk,trunk,1,0,0,15.376112,172673827,38523c53ce433a51601831139abdb3f1,1b859ef918a40fefc0f32dbcef87a82a,"LINESTRING (-121.8935434 38.0060455, -121.8935...",3
11849,1128776508,1128776403,trunk,trunk,1,0,0,18.407046,37354566,4ddec90ceca8a087530173d3b2e57121,962b1850c95a084980e3d07bf1002f47,"LINESTRING (-122.5030542 37.5966571, -122.5029...",3


In [235]:
from_list = non_unique_AB_links_df.shstReferenceId.tolist()
to_list = unique_AB_links_df.shstReferenceId.tolist()

drop_link_model_link_id_list = [c for c in from_list if c not in to_list]

In [236]:
link_MPO_gdf = link_MPO_gdf[~ link_MPO_gdf.shstReferenceId.isin(drop_link_model_link_id_list)]

In [238]:
shape_MPO_gdf = shape_MPO_gdf[shape_MPO_gdf.id.isin(link_MPO_gdf.id)].copy()

In [240]:
len(set(link_MPO_gdf.u.tolist() + link_MPO_gdf.v.tolist()))

643811

In [241]:
node_MPO_gdf.shape

(643811, 7)

In [249]:
link_MPO_gdf.id.nunique()

868567

In [242]:
shape_MPO_gdf.shape

(868567, 7)

In [243]:
link_MPO_gdf.shape

(1632702, 33)

In [269]:
link_MPO_gdf.groupby(["u", "v"]).shstReferenceId.count().shape

(1632702,)

In [246]:
node_MPO_gdf.osm_node_id.nunique()

643811

In [247]:
shape_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 868567 entries, 0 to 907403
Data columns (total 7 columns):
id                    868567 non-null object
fromIntersectionId    868567 non-null object
toIntersectionId      868567 non-null object
forwardReferenceId    868567 non-null object
backReferenceId       868567 non-null object
geometry              868567 non-null object
NAME                  868567 non-null object
dtypes: object(7)
memory usage: 53.0+ MB


In [248]:
link_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1632702 entries, 0 to 1666743
Data columns (total 33 columns):
access                1632702 non-null object
area                  1632702 non-null object
bike_access           1632702 non-null int64
bridge                1632702 non-null object
drive_access          1632702 non-null int64
est_width             1632702 non-null object
fromIntersectionId    1632702 non-null object
highway               1632702 non-null object
id                    1632702 non-null object
junction              1632702 non-null object
key                   1632702 non-null object
landuse               1632702 non-null object
lanes                 1632702 non-null object
link                  1632702 non-null object
maxspeed              1632702 non-null object
name                  1632702 non-null object
oneWay                1632702 non-null object
ref                   1632702 non-null object
roadway               1632702 non-null object
roundab

# Rename attributes

In [250]:
node_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 643811 entries, 0 to 643810
Data columns (total 7 columns):
osm_node_id     643811 non-null int64
shst_node_id    643811 non-null object
geometry        643811 non-null object
county          643811 non-null object
drive_access    643811 non-null int64
walk_access     643811 non-null int64
bike_access     643811 non-null int64
dtypes: int64(4), object(3)
memory usage: 39.3+ MB


In [251]:
shape_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 868567 entries, 0 to 907403
Data columns (total 7 columns):
id                    868567 non-null object
fromIntersectionId    868567 non-null object
toIntersectionId      868567 non-null object
forwardReferenceId    868567 non-null object
backReferenceId       868567 non-null object
geometry              868567 non-null object
NAME                  868567 non-null object
dtypes: object(7)
memory usage: 53.0+ MB


In [252]:
link_MPO_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1632702 entries, 0 to 1666743
Data columns (total 33 columns):
access                1632702 non-null object
area                  1632702 non-null object
bike_access           1632702 non-null int64
bridge                1632702 non-null object
drive_access          1632702 non-null int64
est_width             1632702 non-null object
fromIntersectionId    1632702 non-null object
highway               1632702 non-null object
id                    1632702 non-null object
junction              1632702 non-null object
key                   1632702 non-null object
landuse               1632702 non-null object
lanes                 1632702 non-null object
link                  1632702 non-null object
maxspeed              1632702 non-null object
name                  1632702 non-null object
oneWay                1632702 non-null object
ref                   1632702 non-null object
roadway               1632702 non-null object
roundab

In [253]:
not_to_export_link_json = ["id", "link"]

In [254]:
link_MPO_gdf.roadway.value_counts()

service           541489
residential       538666
footway           247489
cycleway           99108
tertiary           93947
secondary          68529
primary            26492
motorway_link       4983
trunk               4160
motorway            2774
secondary_link      1566
primary_link        1506
trunk_link          1382
tertiary_link        611
Name: roadway, dtype: int64

# Numbering Nodes

In [255]:
county_node_numbering_start_dict = {
    "San Francisco" : 1000000, 
    "San Mateo" : 1500000,
    "Santa Clara" : 2000000,
    "Alameda" : 2500000,
    "Contra Costa" : 3000000,
    "Solano" : 3500000,
    "Napa" : 4000000,
    "Sonoma" : 4500000,
    "Marin" : 5000000
}

In [256]:
node_MPO_gdf["model_node_id"] = node_MPO_gdf.groupby(["county"]).cumcount()

In [257]:
node_MPO_gdf["county_numbering_start"] = node_MPO_gdf["county"].map(county_node_numbering_start_dict)

node_MPO_gdf["model_node_id"] = node_MPO_gdf["model_node_id"] + node_MPO_gdf["county_numbering_start"]

In [258]:
node_MPO_gdf[node_MPO_gdf.model_node_id.isnull()].county.unique()

array([], dtype=object)

In [259]:
node_MPO_gdf.county.value_counts()

Santa Clara      192799
Alameda          125942
Contra Costa      97273
San Mateo         56363
Sonoma            56146
Solano            47318
San Francisco     27608
Marin             26529
Napa              13833
Name: county, dtype: int64

In [260]:
node_MPO_gdf.model_node_id.nunique()

643811

In [261]:
node_MPO_gdf[node_MPO_gdf.county.isin(county_node_numbering_start_dict.keys())].shape

(643811, 9)

# Numbering Links

In [262]:
county_link_numbering_start_dict = {
    "San Francisco" : 1, 
    "San Mateo" : 1000000,
    "Santa Clara" : 2000000,
    "Alameda" : 3000000,
    "Contra Costa" : 4000000,
    "Solano" : 5000000,
    "Napa" : 6000000,
    "Sonoma" : 7000000,
    "Marin" : 8000000
}

In [263]:
link_MPO_gdf.county.value_counts()

Santa Clara      516565
Alameda          315314
Contra Costa     237448
San Mateo        140927
Sonoma           135042
Solano           117102
San Francisco     73440
Marin             63184
Napa              33680
Name: county, dtype: int64

In [264]:
link_MPO_gdf["model_link_id"] = link_MPO_gdf.groupby(["county"]).cumcount()

link_MPO_gdf["county_numbering_start"] = link_MPO_gdf["county"].map(county_link_numbering_start_dict)

link_MPO_gdf["model_link_id"] = link_MPO_gdf["model_link_id"] + link_MPO_gdf["county_numbering_start"]

In [265]:
link_MPO_gdf[link_MPO_gdf.model_link_id.isnull()].county.unique()

array([], dtype=object)

In [266]:
link_MPO_gdf.model_link_id.nunique()

1632702

In [270]:
link_MPO_gdf[link_MPO_gdf.county.isin(county_link_numbering_start_dict.keys())].shape

(1632702, 35)

# Numbering Link A/B nodes

In [271]:
node_shst_model_id_dict = dict(zip(node_MPO_gdf.shst_node_id, node_MPO_gdf.model_node_id))

In [272]:
link_MPO_gdf["A"] = link_MPO_gdf["fromIntersectionId"].map(node_shst_model_id_dict)
link_MPO_gdf["B"] = link_MPO_gdf["toIntersectionId"].map(node_shst_model_id_dict)

In [273]:
link_MPO_gdf.head(3)

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,walk_access,wayId,width,county,geometry,length,model_link_id,county_numbering_start,A,B
0,,,1,,1,,505d64eb98f1da8d812a3b3801034308,service,208e093f10a62dcc6646a8efa0bde136,,...,1,231794292,,Contra Costa,"LINESTRING (-122.3315542 37.9812044, -122.3315...",84.64525,4000000,4000000,3000000,3007836
1,,,1,,1,,473979c78435732f01ca5a168afb62e0,residential,5fe3056a5583474c0c898983cd6a638b,,...,1,7864473,,Contra Costa,"LINESTRING (-121.94477 37.953322, -121.9443904...",509.283331,4000001,4000000,3000001,3077456
2,,,1,,1,,fc7b575d5d8c961d4a70fca846ae7f80,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8000000,8000000,5000000,5014577


In [274]:
link_MPO_gdf[link_MPO_gdf.shstGeometryId == "38e962038ecf17c6c7394ba88bc3b4c1"]

Unnamed: 0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id,junction,...,walk_access,wayId,width,county,geometry,length,model_link_id,county_numbering_start,A,B
2,,,1,,1,,fc7b575d5d8c961d4a70fca846ae7f80,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8000000,8000000,5000000,5014577
1413864,,,1,,1,,7550e87fc64657a10282672d814ab3c5,tertiary,38e962038ecf17c6c7394ba88bc3b4c1,,...,1,12183318,,Marin,"LINESTRING (-122.5398278 37.8979989, -122.5400...",128.106786,8053564,8000000,5014577,5000000


In [275]:
link_MPO_gdf[link_MPO_gdf.A.isnull()].county.value_counts()

Series([], Name: county, dtype: int64)

In [276]:
node_MPO_gdf[node_MPO_gdf.osm_node_id == 322507978]

Unnamed: 0,osm_node_id,shst_node_id,geometry,county,drive_access,walk_access,bike_access,model_node_id,county_numbering_start
643594,322507978,dddd914e805e1676c74d20ad2111ec59,POINT (-121.3149569 37.1691465),Santa Clara,0,1,1,2192698,2000000


In [277]:
node_MPO_gdf.model_node_id.nunique()

643811

# Write out

In [278]:
print("-------write out link shape geojson---------")

shape_prop = ['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId', 'backReferenceId']
shape_geojson = link_df_to_geojson(shape_MPO_gdf, shape_prop)

with open("../../data/interim/step5_tidy_roadway/shape.geojson", "w") as f:
    json.dump(shape_geojson, f)

    
print("-------write out link json---------")

link_prop = link_MPO_gdf.drop(["geometry"] + not_to_export_link_json, axis = 1).columns.tolist()

out = link_MPO_gdf[link_prop].to_json(orient = "records")

with open('../../data/interim/step5_tidy_roadway/link.json', 'w') as f:
    f.write(out)

    
print("-------write out node geojson---------")

node_prop = node_MPO_gdf.drop("geometry", axis = 1).columns.tolist()
node_geojson = point_df_to_geojson(node_MPO_gdf, node_prop)

with open("../../data/interim/step5_tidy_roadway/node.geojson", "w") as f:
    json.dump(node_geojson, f)  

-------write out link shape geojson---------
-------write out link json---------
-------write out node geojson---------


In [283]:
link_MPO_gdf.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'geometry', 'length', 'model_link_id',
       'county_numbering_start', 'A', 'B'],
      dtype='object')

In [285]:
print("-------write out link feather---------")

link_feather = link_MPO_gdf.reset_index(drop = True).drop("geometry", axis = 1).copy()

link_feather.to_feather(data_interim_dir + 'step5_tidy_roadway/link.feather')

-------write out link feather---------


In [286]:
link_feather.columns

Index(['access', 'area', 'bike_access', 'bridge', 'drive_access', 'est_width',
       'fromIntersectionId', 'highway', 'id', 'junction', 'key', 'landuse',
       'lanes', 'link', 'maxspeed', 'name', 'oneWay', 'ref', 'roadway',
       'roundabout', 'service', 'shstGeometryId', 'shstReferenceId',
       'toIntersectionId', 'tunnel', 'u', 'v', 'walk_access', 'wayId', 'width',
       'county', 'length', 'model_link_id', 'county_numbering_start', 'A',
       'B'],
      dtype='object')

In [287]:
link_df[link_df.u == 890045140][["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
246811,service,1,1,1
903556,service,1,1,1
1598733,service,1,1,1


In [288]:
link_MPO_gdf[link_MPO_gdf.u == 890045140][["roadway", "drive_access", "walk_access", "bike_access"]]

Unnamed: 0,roadway,drive_access,walk_access,bike_access
241376,service,1,1,1
883586,service,0,1,1
1562251,service,1,1,1
