This notebook implements the completeness checks laid out in draft TO7: https://mtcdrive.box.com/s/04x6zdoenhxin0n4p79ygfavsumo5262

In [1]:
import os
import pandas as pd

# Folder containing the survey tables read by the cells below
# (hh.csv, person.csv, day.csv, trip.csv).
# NOTE(review): this is a machine-specific absolute path; adjust to the local Box sync location.
input_dir = r"E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\Versioned_Data\PreWeight_PreLink_MonToSun_20250610"
# Folder that receives the review CSVs written by the later cells
# (trip_df_review.csv, person_day_completeness.csv, household_day_completeness.csv).
output_dir = r"E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\MTC_SFCTA_VTA Travel Diary Discussion\Data Review\Completeness Evaluation\Completeness_per_TO7"

In [2]:
# Household completeness
#
# A household counts as complete (hh_complete = 1) when every count field lies
# inside its accepted range and both home coordinates are present; otherwise
# hh_complete = 0.

households_path = os.path.join(input_dir, "hh.csv")
hh_df = pd.read_csv(households_path)

print(f"Total number of households: {len(hh_df)}")

# Accepted (lowest, highest) value per count field. Series.between is inclusive
# on both ends, and a missing value is never "between", so NaN fails the check.
hh_count_ranges = {
    'num_vehicles': (0, 8),    # Number of vehicles in the household
    'num_adults': (1, 13),     # Number of adults in the household
    'num_kids': (0, 12),       # Number of children in the household
    'num_people': (1, 13),     # Total number of people in the household
    'num_workers': (0, 13),    # Number of workers in the household
}

counts_in_range = pd.concat(
    [hh_df[field].between(lowest, highest) for field, (lowest, highest) in hh_count_ranges.items()],
    axis=1,
).all(axis=1)

# Home latitude and longitude must both be reported
home_located = hh_df[['home_lat', 'home_lon']].notna().all(axis=1)

# One boolean per household: True only when every criterion above holds
hh_conditions = counts_in_range & home_located

# Store the flag as 0/1 integers
hh_df['hh_complete'] = hh_conditions.astype('int64')

# Display results
print("\nhh_complete value counts:")
print(hh_df['hh_complete'].value_counts())


Total number of households: 8842

hh_complete value counts:
hh_complete
1    8842
Name: count, dtype: int64


In [3]:
# Person completeness
#
# person_complete = 1 when ANY of the rules below is satisfied, 0 otherwise.
# The employment rules and the student rules are OR-ed together, so one
# satisfied rule is enough.

person_df = pd.read_csv(os.path.join(input_dir, "person.csv"))

print(f"Total number of persons: {len(person_df)}")

employment_code = person_df['employment']
job_type_code = person_df['job_type']
student_code = person_df['student']

# --- Employment rules -------------------------------------------------------
# Employed persons are employment 1, 2, 3, or 7
is_employed = employment_code.isin([1, 2, 3, 7])

# Valid telework_freq answers are 1-8, plus 996 which means "Never"
telework_answered = person_df['telework_freq'].isin([*range(1, 9), 996])

# Work location information: coordinates present and work_park within 1-4
work_place_answered = (
    person_df['work_lat'].notna()
    & person_df['work_lon'].notna()
    & person_df['work_park'].between(1, 4)
)

employed_complete = is_employed & (
    # job_type 3 (work only from home): no telework_freq, so nothing further is required
    (job_type_code == 3)
    # job_type 2 (work location regularly varies) or 4 (drive/bike/travel for work):
    # telework_freq only
    | (job_type_code.isin([2, 4]) & telework_answered)
    # job_type 1 (one work location ONLY) or 5 (remote some days, on-site some days):
    # telework_freq AND work location information
    | (job_type_code.isin([1, 5]) & telework_answered & work_place_answered)
)

# employment = 4 is not in the codebook. It matches none of the employment
# rules here, so such a person can only become complete through a student rule.

# Unemployed persons (employment 5, 6, 8): a valid employment status is enough
unemployed_complete = employment_code.isin([5, 6, 8])

# --- Student rules ----------------------------------------------------------
# student 0 or 1: school latitude and longitude must both be present
student_located = (
    student_code.isin([0, 1])
    & person_df['school_lat'].notna()
    & person_df['school_lon'].notna()
)

# Non-students (or students who are online only), student 2, 3, 4:
# a valid student status is enough
non_student_complete = student_code.isin([2, 3, 4])

# --- Combine ----------------------------------------------------------------
# Being employed and being a student are not mutually exclusive.
# NOTE(review): because the rules are OR-ed, a person who fails the employment
# checks (or has a missing employment status) is still complete when a student
# rule passes, and vice versa. Confirm this is the intended reading of TO7.
person_df['person_complete'] = (
    employed_complete
    | unemployed_complete
    | student_located
    | non_student_complete
).astype('int64')

# Display results
print("\nperson_complete value counts:")
print(person_df['person_complete'].value_counts())


Total number of persons: 17188

person_complete value counts:
person_complete
1    15174
0     2014
Name: count, dtype: int64


In [4]:
# check telecommute time completeness in the day file
#
# A day record is complete (telecommute_time_complete = 1) when telecommute_time
# was reported, or when the person is unemployed (employment 5, 6, 8).

# Read the day.csv file
day_df = pd.read_csv(os.path.join(input_dir, "day.csv"))

print(f"Total number of days: {len(day_df)}")

# Create telecommute_time_complete variable defaulted to 0
day_df['telecommute_time_complete'] = 0

# Based on the questionnaire, telecommute_time is not asked unless the person is employed full/part/self/volunteer
# But the day file itself doesn't have employment status information,
# so merge it in from the person file.
# validate='m:1' makes pandas raise if person_id is duplicated in person_df,
# so the left join cannot silently multiply day rows.
day_df = day_df.merge(
    person_df[['person_id', 'employment']],
    on='person_id',
    how='left',
    validate='m:1'
)

# Set telecommute_time_complete to 1 when telecommute_time is not NA, or person is unemployed (5, 6, 8).
# Days whose person has any other (or missing) employment value and no
# telecommute_time keep the default 0.
day_df.loc[
    (day_df['telecommute_time'].notna()) |
    (day_df['employment'].isin([5, 6, 8])),
    'telecommute_time_complete'
] = 1

# Display results
print(f"\ntelecommute_time_complete value counts:")
print(day_df['telecommute_time_complete'].value_counts())


Total number of days: 91581

telecommute_time_complete value counts:
telecommute_time_complete
1    79082
0    12499
Name: count, dtype: int64


In [5]:
# Trip completeness
#
# A trip is complete (trip_complete = 1) when its origin/destination purpose
# categories and mode type are within range and every required time and
# location field is present; otherwise trip_complete = 0.

trip_df = pd.read_csv(os.path.join(input_dir, "trip.csv"))

# Fields that only need to be present (non-missing)
required_trip_columns = [
    'depart_hour', 'depart_minute', 'depart_seconds',
    'arrive_hour', 'arrive_minute', 'arrive_second',
    'o_lon', 'o_lat',
    'd_lon', 'd_lat',
]
has_required_values = trip_df[required_trip_columns].notna().all(axis=1)

# Coded fields must fall inside their range (bounds inclusive; NaN fails).
# TODO: or should this be o_purpose_reported and d_purpose_reported?
purposes_in_range = (
    trip_df['o_purpose_category'].between(1, 13)
    & trip_df['d_purpose_category'].between(1, 13)
)
mode_in_range = trip_df['mode_type'].between(1, 14)

# 1 when all conditions are met, 0 otherwise
trip_df['trip_complete'] = (
    purposes_in_range & mode_in_range & has_required_values
).astype('int64')

# Display results
print("\ntrip_complete value counts:")
print(trip_df['trip_complete'].value_counts())


trip_complete value counts:
trip_complete
1    363035
0     10371
Name: count, dtype: int64


In [6]:
# check person-trip completeness
#
# Attach the household, person and day-level completeness flags to every trip,
# then flag a trip as person-trip complete only when all four flags equal 1.


def _attach_completeness_flag(trips, lookup, keys, flag, lookup_name, key_label):
    """Left-join one completeness flag onto the trip table and print the match counts.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trip table; returned with `flag` appended as its last column.
    lookup : pandas.DataFrame
        Table holding `keys` and `flag`; must be unique on `keys`
        (enforced by validate='m:1', which raises otherwise).
    keys : list of str
        Join column name(s) present in both tables.
    flag : str
        Name of the completeness column to bring over.
    lookup_name, key_label : str
        Labels used only in the printed merge report.

    Returns
    -------
    pandas.DataFrame
        `trips` with `flag` added. Trips without a match in `lookup` get NaN.
    """
    merged = (
        trips
        # Drop a flag left over from a previous run of this cell so that a
        # re-run does not produce suffixed <flag>_x / <flag>_y columns.
        .drop(columns=[flag], errors='ignore')
        .merge(
            lookup[keys + [flag]],
            on=keys,
            how='left',
            validate='m:1',
            indicator=True,
        )
    )

    # Check merge status
    print(f"\nMerge status:")
    print(merged['_merge'].value_counts())
    print(f"\nMerge status breakdown:")
    print(f"  - both: trip matched with {lookup_name}")
    print(f"  - left_only: trip with {key_label} NOT found in {lookup_name}")
    print(f"  - right_only: This shouldn't appear with left join")

    # Done with the indicator column. Drop it
    return merged.drop(columns='_merge')


# Remove a previously computed result so column order is stable on re-run
trip_df = trip_df.drop(columns=['personTrip_complete'], errors='ignore')

# Add 'hh_complete' from hh_df to trip_df by joining on hh_id
trip_df = _attach_completeness_flag(
    trip_df, hh_df, ['hh_id'], 'hh_complete', 'hh_df', 'hh_id'
)

# Add 'person_complete' from person_df to trip_df by joining on person_id
trip_df = _attach_completeness_flag(
    trip_df, person_df, ['person_id'], 'person_complete', 'person_df', 'person_id'
)

# Add 'telecommute_time_complete' from day_df to trip_df by joining on the
# composite day key (hh_id, person_num, day_num)
trip_df = _attach_completeness_flag(
    trip_df,
    day_df,
    ['hh_id', 'person_num', 'day_num'],
    'telecommute_time_complete',
    'day_df',
    'day_id',
)

# personTrip_complete is 1 only when the household, person, telecommute-time
# and trip flags are all 1. For 0/1 flags this equals their product, but a trip
# that found no match in one of the merges (NaN flag) is explicitly counted as
# incomplete (0) instead of propagating NaN and turning the column into floats.
completeness_flags = [
    'hh_complete',
    'person_complete',
    'telecommute_time_complete',
    'trip_complete',
]
trip_df['personTrip_complete'] = (
    trip_df[completeness_flags].eq(1).all(axis=1).astype('int64')
)

# output trip_df for review
output_path = os.path.join(output_dir, "trip_df_review.csv")

trip_df.to_csv(
    output_path,
    index=False
)



Merge status:
_merge
both          373406
left_only          0
right_only         0
Name: count, dtype: int64

Merge status breakdown:
  - both: trip matched with hh_df
  - left_only: trip with hh_id NOT found in hh_df
  - right_only: This shouldn't appear with left join

Merge status:
_merge
both          373406
left_only          0
right_only         0
Name: count, dtype: int64

Merge status breakdown:
  - both: trip matched with person_df
  - left_only: trip with person_id NOT found in person_df
  - right_only: This shouldn't appear with left join

Merge status:
_merge
both          373405
left_only          1
right_only         0
Name: count, dtype: int64

Merge status breakdown:
  - both: trip matched with day_df
  - left_only: trip with day_id NOT found in day_df
  - right_only: This shouldn't appear with left join


In [7]:
# Person-day completeness
#
# Produces one row per (hh_id, person_id, day_num) that has trips, plus one row
# per day record that explicitly reported no travel, and writes the result to
# person_day_completeness.csv.

person_day_keys = ['hh_id', 'person_id', 'day_num']

# Per-trip booleans; a missing personTrip_complete is "not equal to 1",
# so it counts as an incomplete trip.
trip_flags = trip_df[person_day_keys].assign(
    is_complete=trip_df['personTrip_complete'].eq(1),
    is_incomplete=trip_df['personTrip_complete'].ne(1),
)

# Summarise the trips of each person-day:
#   allTrips_complete  - 1 when every trip of the person-day is complete, else 0
#   n_trips            - number of trip records
#   n_incomplete_trips - number of trips whose flag is not 1
person_day_df = (
    trip_flags
    .groupby(person_day_keys, as_index=False)
    .agg(
        allTrips_complete=('is_complete', 'all'),
        n_trips=('is_complete', 'size'),
        n_incomplete_trips=('is_incomplete', 'sum'),
    )
    .astype({'allTrips_complete': 'int64', 'n_incomplete_trips': 'int64'})
)

# Append the person-days that explicitly reported "did not make trips"
# (made_travel == 2) in day_df. These rows carry no trip summary, so the three
# summary columns are NaN for them.
# NOTE(review): day records with any other made_travel value and no rows in
# trip_df never enter person_day_df, and no-travel days are not checked against
# hh_complete / person_complete / telecommute_time_complete - confirm this
# matches the intended definition.
reported_no_trips = day_df['made_travel'] == 2
no_trip_day_df = day_df.loc[reported_no_trips, person_day_keys + ['made_travel']]
person_day_df = pd.concat([person_day_df, no_trip_day_df], ignore_index=True)

# A person-day is complete if either all of its trips are complete
# OR the person reported making no travel
every_trip_complete = person_day_df['allTrips_complete'].eq(1)
stayed_home = person_day_df['made_travel'].eq(2)
person_day_df['personDay_complete'] = (every_trip_complete | stayed_home).astype(int)

output_path = os.path.join(output_dir, "person_day_completeness.csv")
person_day_df.to_csv(output_path, index=False)

print(f"Person-day file written to {output_path}")

Person-day file written to E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\MTC_SFCTA_VTA Travel Diary Discussion\Data Review\Completeness Evaluation\Completeness_per_TO7\person_day_completeness.csv


In [8]:
# Household-day completeness
#
# A household-day (hh_id, day_num) is complete (householdDay_complete = 1) only
# when every person-day row recorded for it in person_day_df is complete.
person_day_is_complete = person_day_df['personDay_complete'].eq(1)

household_day_df = (
    person_day_is_complete
    .groupby([person_day_df['hh_id'], person_day_df['day_num']])
    .all()
    .astype('int64')
    .rename('householdDay_complete')
    .reset_index()
)

output_path = os.path.join(output_dir, "household_day_completeness.csv")
household_day_df.to_csv(output_path, index=False)

print(f"Household-day file written to {output_path}")

n_complete_household_days = household_day_df['householdDay_complete'].eq(1).sum()
print(f"Total number of complete household-days: {n_complete_household_days}")


Household-day file written to E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\MTC_SFCTA_VTA Travel Diary Discussion\Data Review\Completeness Evaluation\Completeness_per_TO7\household_day_completeness.csv
Total number of complete household-days: 36223
