In [23]:
# intial tabulation of linked trips from BATS 2023
# background:
# this analysis uses the linked trip analysis ran on 20250728
# which uses the tue to thu weighted data from February 2025 (WeightedDataset_02212025)
# the config for the trip linking process can be found in: https://github.com/ZephyrTransport/travel-diary-survey-tools/blob/df723a6376c0859b34e0fec89206c681b485353d/config/pipeline_config_mtc.toml

import shutil

# Make a copy of the linked trip file to a new analysis folder
TripLinking_csv = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\03b-assign_day\wt-wkday_3day\trip.csv"
TripLinking_csv_renamed = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\trip_linked.csv"

shutil.copy(TripLinking_csv, TripLinking_csv_renamed)

print(f"Linked trip file copied from:\n{TripLinking_csv}\nto:\n{TripLinking_csv_renamed}")

Linked trip file copied from:
M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\03b-assign_day\wt-wkday_3day\trip.csv
to:
M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\trip_linked.csv


In [24]:
import pandas as pd
from enum import Enum

# Read the linked trip file
LinkedTrips_filepath = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\trip_linked.csv"
LinkedTrips_df = pd.read_csv(LinkedTrips_filepath)

# Define enums for mode, path and purpose
class Mode(Enum):
    OTHER = 0
    WALK = 1
    BIKE = 2
    DA = 3
    HOV2 = 4
    HOV3 = 5
    WALKTRAN = 6
    DRIVETRAN = 7
    SCHBUS = 8
    TNC = 9

class Path(Enum):
    NONE = 0
    FULLNETWORK = 1
    NO_TOLL_NETWORK = 2 # not used in the current processing
    BUS = 3
    LRT = 4
    PREMIUM = 5
    BART = 6
    FERRY = 7

class Dpurp(Enum):
    HOME = 0
    WORK = 1
    SCHOOL = 2
    ESCORT = 3
    PERS_BUS = 4
    SHOP = 5
    MEAL = 6
    SOCREC = 7
    CHANGE_MODE = 10 # change mode is still in the dataset, but it is only a very small number of linked trips (23 cases)
    OTHER = 11
    MISSING = -1


# Map numeric codes to enum names
LinkedTrips_df["mode_enum"] = LinkedTrips_df["mode"].apply(
    lambda x: Mode(x).name if x in [m.value for m in Mode] else None
)

LinkedTrips_df["path_enum"] = LinkedTrips_df["pathtype"].apply(
    lambda x: Path(x).name if x in [p.value for p in Path] else None
)

LinkedTrips_df["dpurp_enum"] = LinkedTrips_df["dpurp"].apply(
    lambda x: Dpurp(x).name if x in [d.value for d in Dpurp] else None
)

# View the first few rows (but make sure no indivdiual records are shown before committing to GitHub)
# LinkedTrips_df.head()

In [38]:
# calculated weighted number of trips by purpose
# generate a person-day level trips by destination purpose file
trips_byDpurp_df = (
    LinkedTrips_df
    .groupby(['hhno', 'pno', 'day', 'dpurp_enum'])
    .agg(
        trip_count=('dpurp_enum', 'size'),  # count number of rows
        trexpfac_sum=('trexpfac', 'sum')     # sum trexpfac
    )
    .unstack(fill_value=0)
)

# Flatten the MultiIndex columns created by unstack
trips_byDpurp_df.columns = ['_'.join(map(str, col)).strip('_') for col in trips_byDpurp_df.columns.values]
trips_byDpurp_df = trips_byDpurp_df.reset_index()

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\trips_byDpurp.csv"
trips_byDpurp_df.to_csv(output_path, index=False)
print(f"trips_byDpurp_df saved to {output_path}")

trips_byDpurp_df saved to M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\trips_byDpurp.csv


In [39]:
# Read person file
person_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\03b-assign_day\wt-wkday_3day\person.csv"
person_df = pd.read_csv(person_path)

# Join person_df with trips_byDpurp_df (left join to keep all person records) 
# note that some people participated 1 day, some people participated multiple days, but this is taken care of by the weight
person_wTripCount_df = pd.merge(
    person_df,
    trips_byDpurp_df,
    on=['hhno', 'pno'],   
    how='left'           
)

# fill 0 for no trip
person_wTripCount_df = person_wTripCount_df.fillna(0)

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\person_wTripCount.csv"

person_wTripCount_df.to_csv(output_path, index=False)
print(f"person_wTripCount_df saved to {output_path}")



person_wTripCount_df saved to M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\person_wTripCount.csv


In [41]:
# list of columns to sum
cols_to_sum = [
    'trexpfac_sum_CHANGE_MODE', 'trexpfac_sum_ESCORT', 'trexpfac_sum_HOME',
    'trexpfac_sum_MEAL', 'trexpfac_sum_OTHER', 'trexpfac_sum_PERS_BUS',
    'trexpfac_sum_SCHOOL', 'trexpfac_sum_SHOP', 'trexpfac_sum_SOCREC', 'trexpfac_sum_WORK'
]

# group by pptyp and sum the selected columns
sumTrips_by_pptyp_df = (
    person_wTripCount_df
    .groupby('pptyp')[cols_to_sum]
    .sum()
    .reset_index()
)

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\sumTrips_by_pptyp.csv"

sumTrips_by_pptyp_df.to_csv(output_path, index=False)
print(f"sumTrips_by_pptyp_df saved to {output_path}")



sumTrips_by_pptyp_df saved to M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\sumTrips_by_pptyp.csv


In [33]:
# Read person file
person_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\03b-assign_day\wt-wkday_3day\person.csv"
person_df = pd.read_csv(person_path)

# Count the number of rows (unweighted count)
num_persons_unweighted = len(person_df)

# Sum the person expansion factor (weighted count)
num_persons_weighted = person_df['psexpfac'].sum()


print(f"Unweighted persons: {num_persons_unweighted}")
print(f"Weighted persons: {num_persons_weighted}")

Unweighted persons: 15985
Weighted persons: 7326898.264751358


In [None]:
# I might want to focus on pptyp=1, Full-time worker (age16+), to make the trip rates more intuitive to understand

num_pptyp1_unweighted = (person_df['pptyp'] == 1).sum()
num_pptyp1_weighted = person_df[person_df['pptyp'] == 1]['psexpfac'].sum()

print(f"Unweighted persons (pptyp=1): {num_pptyp1_unweighted}")
print(f"Weighted persons (pptyp=1): {num_pptyp1_weighted}")

Unweighted persons (pptyp=1): 8319
Weighted persons (pptyp=1): 3225037.287001975


In [45]:
# Group by pptyp and calculate num_person
num_persons_df = person_df.groupby('pptyp').agg(
    num_persons_unweighted=('pptyp', 'size'),
    num_persons_weighted=('psexpfac', 'sum')
).reset_index()

print(num_persons_df)

   pptyp  num_persons_unweighted  num_persons_weighted
0      1                    8319          3.225037e+06
1      2                    1018          7.404677e+05
2      3                    2224          9.105070e+05
3      4                    1818          8.427431e+05
4      5                     731          3.559589e+05
5      6                      35          1.251005e+04
6      7                    1260          8.891066e+05
7      8                     580          3.505676e+05


In [None]:
# join num_persons_df with sumTrips_by_pptyp_df on pptyp 

tripRates_by_pptyp_df = pd.merge(
    num_persons_df,
    sumTrips_by_pptyp_df,
    on='pptyp',
    how='left'   # keeps all rows from num_persons_df, adds matches from sumTrips_by_pptyp_df
)

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\tripRates_by_pptyp_df.csv"

tripRates_by_pptyp_df.to_csv(output_path, index=False)
print(f"tripRates_by_pptyp_df_df saved to {output_path}")

# the trip rates by person type seems pretty reasonable


tripRates_by_pptyp_df_df saved to M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\tripRates_by_pptyp_df.csv


In [None]:
# look at trip rate of telecommuters. Perhaps (work from home 3 or more days a week) vs trip rate of other workers (based on telework_freq).
# Read the person file with  more background information
person_background_file_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Full Weighted 2023 Dataset\WeightedDataset_02212025\person.csv"

# Read the CSV file
person_background_df = pd.read_csv(person_background_file_path)

# Keep only the desired columns and rename hh_id to hhno
person_background_df = person_background_df[['person_id', 'hh_id', 'telework_freq']].rename(columns={'hh_id': 'hhno'})


# Create pno as the last two digits of person_id and convert to numeric
person_background_df['pno'] = (
    person_background_df['person_id']
    .astype(str)
    .str[-2:]
    .astype(int)
)

# Preview the dataframe (but don't commit the preview to github)
#person_background_df.head()

In [57]:
#Join person_background_df to num_persons_df
person_wTeleworkFreq_df = pd.merge(
    person_df,
    person_background_df,
    on=['hhno', 'pno'],      
    how='left' 
)              

# Group by telework_freq and calculate num_person
num_persons_ByTelework_df = person_wTeleworkFreq_df.groupby('telework_freq').agg(
    num_persons_unweighted=('telework_freq', 'size'),
    num_persons_weighted=('psexpfac', 'sum')
).reset_index()

print(num_persons_ByTelework_df )

   telework_freq  num_persons_unweighted  num_persons_weighted
0              1                     180          8.694760e+04
1              2                     652          2.555789e+05
2              3                     564          1.776140e+05
3              4                     833          2.902242e+05
4              5                    1212          4.150588e+05
5              6                     601          2.166517e+05
6              7                     514          2.465600e+05
7              8                     427          1.932492e+05
8            995                    8760          4.007171e+06
9            996                    2242          1.437843e+06


In [65]:

#sum up the trips
# list of columns to sum
cols_to_sum = [
    'trexpfac_sum_CHANGE_MODE', 'trexpfac_sum_ESCORT', 'trexpfac_sum_HOME',
    'trexpfac_sum_MEAL', 'trexpfac_sum_OTHER', 'trexpfac_sum_PERS_BUS',
    'trexpfac_sum_SCHOOL', 'trexpfac_sum_SHOP', 'trexpfac_sum_SOCREC', 'trexpfac_sum_WORK'
]

#Join person_background_df to person_wTripCount_df
person_wTripCountwTeleworkFreq_df = pd.merge(
    person_wTripCount_df,
    person_background_df,
    on=['hhno', 'pno'],      
    how='left' 
)   

# group by telework_freq and sum the selected columns
sumTrips_by_telework_freq_df = (
    person_wTripCountwTeleworkFreq_df
    .groupby('telework_freq')[cols_to_sum]
    .sum()
    .reset_index()
)

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\sumTrips_by_telework_freq.csv"

sumTrips_by_telework_freq_df.to_csv(output_path, index=False)
print(f"sumTrips_by_telework_freq_df saved to {output_path}")




sumTrips_by_telework_freq_df saved to M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\sumTrips_by_telework_freq.csv


In [None]:
 #join num_persons_df with sumTrips_by_telework_freq_df on telework_freq

tripRates_by_teleworkFreq_df = pd.merge(
    num_persons_ByTelework_df,
    sumTrips_by_telework_freq_df,
    on='telework_freq',
    how='left'   
)

output_path = r"M:\Data\HomeInterview\Bay Area Travel Study 2023\Data\Processed\TripLinking_20250728\Linked_Trip_Analysis\tripRates_by_teleworkFreq_df.csv"

tripRates_by_teleworkFreq_df.to_csv(output_path, index=False)
print(f"tripRates_by_teleworkFreq_df_df saved to {output_path}")



KeyError: 'telework_freq'