In [1]:
import geopandas as gpd
import os

In [2]:
# Reading the matched_path_gdf layer is slow, so capture a wall-clock
# timestamp right before the read begins; a later cell subtracts it from
# the finish time to report the total load duration.
from datetime import datetime

start_time = datetime.now()
print("Started reading matched_path_gdf at:", start_time)

Started reading matched_path_gdf at: 2025-05-06 10:47:49.267190


In [3]:
# Load the "matched_path_gdf" layer from the conflation results GeoPackage.
# The GeoDataFrame is deliberately given the same name as the layer.
conflation_results_gpkg = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Full Weighted 2023 Dataset\WeightedDataset_08092024\OSM_match_v2\tds_conflation_results.gpkg"

matched_path_gdf = gpd.read_file(conflation_results_gpkg, layer="matched_path_gdf")

# PRIVACY: never display raw rows (e.g. matched_path_gdf.head()) in this
# notebook -- the rendered output may reveal PII data!

In [4]:
# Capture the time at which the read completed and report the elapsed
# duration. `start_time` comes from the timing cell that runs before the read.
end_time = datetime.now()
# The label previously said "Started reading", which mislabeled the finish
# timestamp in the notebook output.
print("Finished reading matched_path_gdf at:", end_time)
print("Duration:", end_time - start_time)

Started reading matched_path_gdf at: 2025-05-06 11:30:07.907174
Duration: 0:42:18.639984


In [5]:
# Number of rows in the loaded layer.
matched_path_gdf.shape[0]

8941049

In [6]:
# Tally rows by their `highway` value (missing values are dropped by default).
highway_class_counts = matched_path_gdf["highway"].value_counts()
highway_class_counts

secondary                              2490183
primary                                1489085
motorway                               1475978
tertiary                               1325476
residential                            1097249
motorway_link                           473121
trunk                                   403204
unclassified                             66918
primary_link                             30613
secondary_link                           26954
trunk_link                               23624
busway                                   19463
tertiary_link                             7554
['motorway', 'trunk']                     3470
['secondary', 'tertiary']                 1519
['residential', 'tertiary']               1117
['motorway', 'motorway_link']              853
['secondary', 'motorway_link']             652
['motorway_link', 'primary']               569
living_street                              549
['unclassified', 'tertiary']               433
['secondary',

In [7]:
# Tally rows by their `bridge` value, keeping missing values as their own
# category so untagged rows are counted too.
bridge_tag_counts = matched_path_gdf["bridge"].value_counts(dropna=False)
bridge_tag_counts

None                  8069697
yes                    865588
['yes', 'movable']       2227
viaduct                  1883
['yes', 'viaduct']       1171
movable                   245
cantilever                142
no                         96
Name: bridge, dtype: int64

In [17]:
# Create boolean column for motorway.
# NOTE(review): this is an exact match on "motorway"; multi-valued tags that
# appear in the data (e.g. "['motorway', 'trunk']") are not counted -- confirm
# that this is the intended treatment.
is_motorway = matched_path_gdf["highway"] == "motorway"
matched_path_gdf["is_motorway"] = is_motorway

# Create boolean column for motorway and NOT a bridge.
# A row is treated as "not a bridge" when its `bridge` tag is missing or is
# explicitly "no"; every other value seen in the data (yes, viaduct, movable,
# cantilever, and list-like combinations of them) denotes a bridge.
# Previously this column repeated the plain motorway test, so it was always
# identical to `is_motorway`.
bridge_tag = matched_path_gdf["bridge"]
is_not_bridge = bridge_tag.isna() | (bridge_tag == "no")
matched_path_gdf["is_motorway_not_bridge"] = is_motorway & is_not_bridge


In [18]:
# Collapse the per-row flags to one row per trip. Taking the max of a boolean
# column within each trip_id yields True when any row of the trip is flagged;
# the result is then cast to 0/1 integers and trip_id is restored as a column.
motorway_flag_columns = ["is_motorway", "is_motorway_not_bridge"]
trip_motorway_booleans_df = (
    matched_path_gdf
    .groupby("trip_id")[motorway_flag_columns]
    .max()
    .astype(int)
    .reset_index()
)


In [19]:
# Rename the per-row "is_*" flags to trip-level "has_*" names, in place,
# using a single mapping for both columns.
trip_level_column_names = {
    "is_motorway": "has_motorway",
    "is_motorway_not_bridge": "has_nonBridge_motorway",
}
trip_motorway_booleans_df.rename(columns=trip_level_column_names, inplace=True)


In [20]:
# Write the trip-level motorway flags to a CSV file (without the index) in
# the "derived_variables" folder of the BATS dataset directory.
# NOTE(review): the input layer is read from a "WeightedDataset_08092024"
# folder while this output goes under "WeightedDataset_02212025" -- confirm
# the two dataset versions are meant to differ.
BATS_data_location = r"E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\Data\2023\Full Weighted 2023 Dataset\WeightedDataset_02212025"
Output_location = os.path.join(
    BATS_data_location,
    "derived_variables",
    "trip_motorway_booleans.csv",
)
trip_motorway_booleans_df.to_csv(Output_location, index=False)