## Purpose of this notebook

This notebook generates boolean indicators for whether an auto trip in BATS 2023 involves the use of a freeway. These booleans are used to support a request for information on the percentage of low-income households that do **not** regularly use highways during peak-hour periods ([Asana task link](https://app.asana.com/1/11860278793487/project/12291104512646/task/1210119087413706?focus=true)).

## Input file

`tds_conflation_results.gpkg` — created through the conflation process documented here: https://github.com/BayAreaMetro/Travel-Diary-Surveys/tree/master/trip-trace-conflation

In [1]:
import os
from datetime import datetime

import geopandas as gpd
import pandas as pd
# Allow up to 1000 rows to be shown in cell output before pandas truncates the display
pd.options.display.max_rows = 1000

In [2]:
# The matched_path_gdf layer takes a while to load, so record when the read begins;
# a later cell reports the elapsed time.
# (`datetime` is imported once, in the imports cell at the top of the notebook.)
start_time = datetime.now()
print("Started reading matched_path_gdf at:", start_time)

Started reading matched_path_gdf at: 2025-05-12 07:39:34.983302


In [3]:
# Read the "matched_path_gdf" layer of the conflation results GeoPackage.
# The GeoDataFrame deliberately reuses the layer's name.
conflation_gpkg_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Full Weighted 2023 Dataset\WeightedDataset_08092024\OSM_match_v2\tds_conflation_results.gpkg"
matched_path_gdf = gpd.read_file(conflation_gpkg_path, layer="matched_path_gdf")


In [4]:
# Report when the read finished and how long it took.
# The label previously said "Started ..." even though the value printed is the finish time.
end_time = datetime.now()
print("Finished reading matched_path_gdf at:", end_time)
print("Duration:", end_time - start_time)

Started reading matched_path_gdf at: 2025-05-12 08:21:11.104909
Duration: 0:41:36.121607


In [5]:
# Number of rows read from the layer
matched_path_gdf.shape[0]

8941049

In [6]:
# Note to self: never call matched_path_gdf.head() -- the rows may reveal PII.
# Listing only the column names is safe.

for column_name in list(matched_path_gdf.columns):
    print(column_name)

road_id
origin_junction_id
destination_junction_id
road_key
kilometers
travel_time
trip_id
osmid
ref
name
maxspeed
highway
bridge
tunnel
rownum
geometry


In [7]:
# Tally how often each `highway` value occurs
highway_values = matched_path_gdf["highway"]
highway_values.value_counts()

secondary                              2490183
primary                                1489085
motorway                               1475978
tertiary                               1325476
residential                            1097249
motorway_link                           473121
trunk                                   403204
unclassified                             66918
primary_link                             30613
secondary_link                           26954
trunk_link                               23624
busway                                   19463
tertiary_link                             7554
['motorway', 'trunk']                     3470
['secondary', 'tertiary']                 1519
['residential', 'tertiary']               1117
['motorway', 'motorway_link']              853
['secondary', 'motorway_link']             652
['motorway_link', 'primary']               569
living_street                              549
['unclassified', 'tertiary']               433
['secondary',

In [8]:
# Tally how often each `bridge` value occurs, keeping missing values as their own row
bridge_values = matched_path_gdf["bridge"]
bridge_values.value_counts(dropna=False)

None                  8069697
yes                    865588
['yes', 'movable']       2227
viaduct                  1883
['yes', 'viaduct']       1171
movable                   245
cantilever                142
no                         96
Name: bridge, dtype: int64

In [9]:
# Confirm that the "None" entry in the tally is a genuine null rather than the string "None":
# count the null entries in the column
matched_path_gdf["bridge"].isna().sum()

8069697

In [10]:
# A visual check of matched_path_gdf on a map showed that
# matched_path_gdf["highway"] == "motorway" is a reasonable (though imperfect)
# representation of the Bay Area's freeway system.
# The OSM bridge attribute, however, does not reliably represent the Bay Area's bridges.
# Tableau Online map: https://10ay.online.tableau.com/#/site/metropolitantransportationcommission/views/matched_path_gdf/bridge?:iid=2

# Shimon has previously hand-coded the bridges, so load that lookup.
osmid_bridge_csv = r"E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\Data\2023\Survey Conflation\osmid_facility_equivalence_lookup.csv"
osmid_bridge_df = pd.read_csv(filepath_or_buffer=osmid_bridge_csv)

osmid_bridge_df.head(n=5)

Unnamed: 0,osmid,Facility
0,"[24307457, 47245593, 24307484, 123867358]",i580_hayward_to_sanjoaquin
1,"[24307457, 24290323]",i580_hayward_to_sanjoaquin
2,"[123867360, 47245593, 47245597]",i580_hayward_to_sanjoaquin
3,"[496168690, 94008284]",i580_hayward_to_sanjoaquin
4,"[496168690, 496168691, 24307478, 32168615]",i580_hayward_to_sanjoaquin


In [11]:
# Tally the lookup rows per Facility code, keeping missing values as their own row
facility_codes = osmid_bridge_df["Facility"]
facility_codes.value_counts(dropna=False)

i880_baybridge_to_237         140
i680_80_to_580_portion        136
sr4_80_to_160                  98
i680_580_to_101                98
i580_hayward_to_sanjoaquin     85
i580_hayward_to_baybridge      74
i80_580_to_Carquinez           69
sr37_121_to_101                23
sr37_80_to_mare                19
i80_13_to_580                  12
sr37_mare_to_121                9
i80_680_to_12                   7
bay_bridge                      6
bm_bridge                       2
ant_bridge                      2
dum_bridge                      2
carq_bridge                     2
rsr_bridge                      2
gg_bridge                       2
sm_bridge                       2
Name: Facility, dtype: int64

In [12]:
# Count lookup rows whose osmid repeats an earlier row; 0 means every osmid is unique
osmid_bridge_df.duplicated(subset="osmid").sum()

0

In [13]:
# Attach the manual bridge/facility coding to the matched path rows.
# Call .merge() on the GeoDataFrame itself (not pd.merge()) so the result remains a gdf.
# validate="many_to_one" makes pandas raise a MergeError if an osmid appears more than
# once in the lookup, which would otherwise silently duplicate rows in this left join.

matched_path_bridge_gdf = matched_path_gdf.merge(
    osmid_bridge_df,
    on="osmid",
    how="left",
    validate="many_to_one",
)

In [14]:
# The join shouldn't add any rows. Fail loudly if it did, rather than relying on
# comparing the displayed count by eye against the expected 8941049.
assert len(matched_path_bridge_gdf) == len(matched_path_gdf), (
    "the join changed the row count: "
    f"{len(matched_path_gdf)} rows before, {len(matched_path_bridge_gdf)} after"
)
len(matched_path_bridge_gdf)

8941049

In [15]:
# Facility codes of the eight bridges
BayArea_bridges = [
    "sm_bridge",
    "dum_bridge",
    "ant_bridge",
    "bm_bridge",
    "bay_bridge",
    "carq_bridge",
    "rsr_bridge",
    "gg_bridge",
]

# 1 when the row's Facility is one of those bridges, 0 otherwise
on_listed_bridge = matched_path_bridge_gdf["Facility"].isin(BayArea_bridges)
matched_path_bridge_gdf["is_bridge"] = on_listed_bridge.astype(int)

In [16]:
# Boolean flag: the row's highway value is exactly "motorway"
highway_is_motorway = matched_path_bridge_gdf["highway"] == "motorway"
matched_path_bridge_gdf["is_motorway"] = highway_is_motorway

# Boolean flag: motorway AND not flagged as one of the bridges
not_on_bridge = matched_path_bridge_gdf["is_bridge"] != 1
matched_path_bridge_gdf["is_motorway_not_bridge"] = highway_is_motorway & not_on_bridge

In [17]:
# Collapse to one row per trip_id. Taking the max of each flag gives 1 when
# any row of the trip is flagged and 0 otherwise.
segment_flag_columns = ["is_motorway", "is_motorway_not_bridge"]
flags_by_trip = matched_path_bridge_gdf.groupby("trip_id")[segment_flag_columns]
trip_motorway_booleans_df = (
    flags_by_trip
    .max()
    .astype(int)
    .reset_index()
)


In [18]:
# Give the aggregated flags their trip-level names (both renames in a single call)
trip_level_names = {
    "is_motorway": "has_motorway",
    "is_motorway_not_bridge": "has_nonBridge_motorway",
}
trip_motorway_booleans_df.rename(columns=trip_level_names, inplace=True)


In [19]:
# Write the trip-level flags to a CSV in the dataset's derived_variables folder
# (index=False: the row index is not written)
BATS_data_location = r"E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\Data\2023\Full Weighted 2023 Dataset\WeightedDataset_02212025"
Output_location = os.path.join(
    BATS_data_location,
    "derived_variables",
    "trip_motorway_booleans.csv",
)
trip_motorway_booleans_df.to_csv(Output_location, index=False)

In [20]:
# Export the gdf results to a shapefile (so they can be visualized in Tableau).
# Start with only the first 10,000 rows (a smaller dataset to play with)
# and see how long the export takes.
# (`datetime` comes from the imports cell at the top; it is not re-imported here.)

start_time = datetime.now()
print("Started at:", start_time)

Started at: 2025-05-12 08:21:32.955975


In [21]:
# Write the first 10,000 rows as an ESRI Shapefile
sample_gdf = matched_path_bridge_gdf.head(10000)
sample_gdf.to_file(r"E:\matched_path_bridge_gdf_first10000.shp", driver="ESRI Shapefile")


  matched_path_bridge_gdf.head(10000).to_file(


In [22]:
# Report when the sample export finished and how long it took
end_time = datetime.now()
elapsed = end_time - start_time
print("Finished at:", end_time)
print("Duration:", elapsed)

Finished at: 2025-05-12 08:21:36.266277
Duration: 0:00:03.310302


In [23]:
# Export the gdf results to a shapefile (so they can be visualized in Tableau),
# this time with every row, and see how long the export takes.
# (`datetime` comes from the imports cell at the top; it is not re-imported here.)

start_time = datetime.now()
print("Started at:", start_time)

Started at: 2025-05-12 08:21:36.283273


In [24]:
# Write every row as an ESRI Shapefile
full_shapefile_path = r"E:\matched_path_bridge_gdf.shp"
matched_path_bridge_gdf.to_file(full_shapefile_path, driver="ESRI Shapefile")

  matched_path_bridge_gdf.to_file(


In [25]:
# Report when the full export finished and how long it took
end_time = datetime.now()
elapsed = end_time - start_time
print("Finished at:", end_time)
print("Duration:", elapsed)

Finished at: 2025-05-12 09:03:53.578084
Duration: 0:42:17.294811
