In [None]:
import pandas as pd

# Load the day file into a dataframe
file_path = r'E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\Versioned_Data\PreWeight_PreLink_MonToSun_20250610\day.csv'
day_df = pd.read_csv(file_path)

# Distinct hh_id values present before any filtering is applied
unique_households_before = len(day_df[['hh_id']].drop_duplicates())

# Keep only the rows flagged hh_day_complete == 1
is_complete = day_df['hh_day_complete'] == 1
filtered_day_df = day_df.loc[is_complete]

# Distinct (hh_id, day_num) pairs and distinct hh_id values left after the filter
unique_household_days = len(filtered_day_df[['hh_id', 'day_num']].drop_duplicates())
unique_households_after = len(filtered_day_df[['hh_id']].drop_duplicates())

# Report the before/after counts, one line per metric
summary_lines = (
    f"Number of person-days before filtering: {len(day_df)}",
    f"Number of unique hh_id before filtering: {unique_households_before}",
    f"Number of person-days after filtering (hh_day_complete==1): {len(filtered_day_df)}",
    f"Number of unique hh_id after filtering: {unique_households_after}",
    f"Number of unique (hh_id, day_num) combinations: {unique_household_days}",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Collapse the filtered day records to one row per (hh_id, day_num) pair
hh_day_df = filtered_day_df[['hh_id', 'day_num']].drop_duplicates().reset_index(drop=True)

print(f"\nCollapsed day_df to hh_day_df: {hh_day_df.shape}")

# Read the incentives file
incentives_file_path = r'E:\Box\Modeling and Surveys\Surveys\Travel Diary Survey\BATS_2023\MTC_RSG_Partner Repository\5.Deliverables\Task 5 - Sample Plan\incentives_disaggregate.xlsx'
incentives_df = pd.read_excel(incentives_file_path, sheet_name='Sheet1')

print(f"Incentives dataframe shape: {incentives_df.shape}")
print(f"Incentives columns: {list(incentives_df.columns)}")

# Join the incentives data to the collapsed dataframe. The left join keeps
# every (hh_id, day_num) row whether or not its hh_id appears in the
# incentives file.
merged_df = hh_day_df.merge(incentives_df, on='hh_id', how='left')

print(f"\nMerged dataframe shape: {merged_df.shape}")

# A left join only adds rows when the right-hand side repeats a key, which
# would inflate the signup_status counts below, so surface it explicitly.
if len(merged_df) != len(hh_day_df):
    print(
        f"WARNING: merge changed the row count from {len(hh_day_df)} to {len(merged_df)}; "
        "the incentives file contains repeated hh_id values"
    )

# Because the left join retains every hh_id, counting ids in merged_df cannot
# distinguish matched from unmatched households, and scanning merged_df for
# nulls counts rows (not households) and also trips on blank incentive cells.
# Test membership in the incentives file directly, once per household.
household_ids = hh_day_df['hh_id'].drop_duplicates()
has_incentive_data = household_ids.isin(incentives_df['hh_id'])

print(f"Number of households with incentive data: {has_incentive_data.sum()}")
print(f"Number of households without incentive data (nulls): {(~has_incentive_data).sum()}")

# Count records grouped by signup_status; dropna=False keeps a NaN bucket so
# rows without a signup_status value stay visible in the tally
print("\n" + "="*50)
print("COUNT OF RECORDS BY SIGNUP_STATUS:")
print("="*50)
signup_counts = merged_df['signup_status'].value_counts(dropna=False)
print(signup_counts)

print(f"\nTotal records: {signup_counts.sum()}")


