In [11]:
# 4. CROSSWALK BUILDER (Robust Version)
def _parse_list_cell(cell):
    """Convert a stringified list read back from the crosswalk CSV into a list.

    Non-string cells (e.g. NaN) and strings that are not valid Python literals
    are returned unchanged, so an unexpected cache format degrades gracefully
    instead of crashing the loader.
    """
    import ast  # local import: the file's import section is not part of this cell

    if not isinstance(cell, str):
        return cell
    try:
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell


def _load_cached_crosswalk(crosswalk_path):
    """Read a saved crosswalk CSV and restore the types of a fresh build."""
    # dtype=str keeps the leading zeros of PUMA codes (e.g. "00101").
    df_cached = pd.read_csv(crosswalk_path, dtype=str)
    # to_csv stored these list-valued columns as their string representation.
    for list_col in ('tract_ids', 'counties', 'weights'):
        if list_col in df_cached.columns:
            df_cached[list_col] = df_cached[list_col].map(_parse_list_cell)
    return df_cached


def build_puma_crosswalk():
    """
    Build (or load from cache) the PUMA-County weight table.

    Reads the tract relationship file and the ACS tract demographics file from
    DATA_DIR, joins them on the 11-character tract GEOID, and computes, for
    every PUMA, the share of the PUMA's population that falls in each county.
    Column names are auto-detected for both IPUMS-style (STATEFIP, COUNTYFIP,
    TRACT, PUMA) and Census-style (STATEFP, COUNTYFP, TRACTCE, PUMA5CE) files.

    The result is cached at DATA_DIR/puma_county_crosswalk.csv. A cached file
    is returned with the same column types as a fresh build: 'PUMA' as a
    zero-padded string and 'tract_ids' / 'counties' / 'weights' as lists.

    Returns:
        A DataFrame with columns 'PUMA', 'tract_ids', 'counties', 'weights'
        (one row per PUMA; 'counties' and 'weights' are parallel lists), or
        None when the relationship file or the tract file is missing.

    Raises:
        KeyError: if the relationship file lacks the state/county/tract
            columns, or the tract file has no recognizable GEOID or
            total-population column.
    """
    crosswalk_path = os.path.join(DATA_DIR, "puma_county_crosswalk.csv")

    if os.path.exists(crosswalk_path):
        try:
            df_cached = _load_cached_crosswalk(crosswalk_path)
        except pd.errors.EmptyDataError:
            # A cache file without any columns cannot be used; rebuild it.
            print(f"⚠️ Cached crosswalk is empty, rebuilding: {crosswalk_path}")
        else:
            print(f"✅ Crosswalk already exists: {crosswalk_path}")
            return df_cached

    print("Building PUMA-County Crosswalk Table...")

    # 1. Load Relationship File
    rel_path = os.path.join(DATA_DIR, RELATIONSHIP_FILE)
    if not os.path.exists(rel_path):
        print(f"❌ Relationship file missing at {rel_path}. Cannot build crosswalk.")
        return None

    # Load with string types to preserve leading zeros
    df_rel = pd.read_csv(rel_path, dtype=str)

    # Auto-Rename: Handle both IPUMS (STATEFIP) and Census (STATEFP) formats
    rename_map = {
        'STATEFIP': 'STATEFP',
        'COUNTYFIP': 'COUNTYFP',
        'TRACT': 'TRACTCE',
        'PUMA': 'PUMA5CE',
    }
    df_rel.rename(columns=rename_map, inplace=True)

    # Create GEOID (State+County+Tract)
    # Ensure proper padding (State=2, County=3, Tract=6 chars)
    try:
        df_rel['GEOID'] = (
            df_rel['STATEFP'].str.zfill(2)
            + df_rel['COUNTYFP'].str.zfill(3)
            + df_rel['TRACTCE'].str.zfill(6)
        )
    except KeyError:
        print(f"❌ Column Error in Relationship File. Found: {df_rel.columns.tolist()}")
        raise

    # 2. Load Tract Data
    tract_path = os.path.join(DATA_DIR, "acs_tract_demographics_2023.csv")
    if not os.path.exists(tract_path):
        print(f"❌ Tract data missing at {tract_path}. Cannot build crosswalk.")
        return None

    df_tract = pd.read_csv(tract_path, dtype=str)  # Load as str to safely check GEOIDs

    print(f"   -> Raw Tract Columns: {df_tract.columns.tolist()[:5]}...")  # Debug print

    # Fix GEOID Column
    if 'GEO_ID' in df_tract.columns:
        # Census GEO_ID looks like "1400000US24031..." -> Clean to "24031..."
        df_tract['GEOID'] = df_tract['GEO_ID'].str.split('US').str[-1]
    elif 'id' in df_tract.columns:
        df_tract['GEOID'] = df_tract['id'].str.split('US').str[-1]

    # Fix Population Column (Look for common census codes for Total Pop)
    pop_cols = ['DP05_0001E', 'Total', 'Estimate!!Total', 'S0101_C01_001E', 'P001001']
    for col in pop_cols:
        if col in df_tract.columns:
            print(f"   -> Found population column: '{col}'")
            # Non-numeric cells become 0 so they do not poison the sums below.
            df_tract['Total_Population'] = pd.to_numeric(df_tract[col], errors='coerce').fillna(0)
            break

    # Check if we have what we need
    if 'GEOID' not in df_tract.columns or 'Total_Population' not in df_tract.columns:
        print("❌ ERROR: Could not identify 'GEOID' or 'Total_Population' columns in tract data.")
        print(f"Available columns: {df_tract.columns.tolist()}")
        raise KeyError("Missing required columns in acs_tract_demographics_2023.csv")

    # 3. Merge
    # Inner join filters out tracts not in the relationship file (or vice versa)
    df_merged = pd.merge(df_rel, df_tract[['GEOID', 'Total_Population']], on='GEOID', how='inner')

    # 4. Aggregate Logic
    # NOTE(review): grouping is by PUMA5CE (and, inside it, COUNTYFP) alone. If
    # the merged data spans several states, equal codes from different states
    # would be combined -- confirm the inputs are single-state.
    crosswalk_rows = []
    for puma, group in df_merged.groupby('PUMA5CE'):
        total_puma_pop = group['Total_Population'].sum()

        # Pop of each county *inside this PUMA*; sort=False keeps the counties
        # in order of first appearance.
        county_pop = group.groupby('COUNTYFP', sort=False)['Total_Population'].sum()

        # Avoid divide by zero. .tolist() yields plain Python floats, whose
        # string form in the cached CSV can be parsed back into a list.
        if total_puma_pop > 0:
            county_weights = (county_pop / total_puma_pop).tolist()
        else:
            county_weights = [0.0] * len(county_pop)

        crosswalk_rows.append({
            'PUMA': puma,
            'tract_ids': group['GEOID'].tolist(),
            'counties': county_pop.index.tolist(),
            'weights': county_weights,
        })

    df_crosswalk = pd.DataFrame(
        crosswalk_rows, columns=['PUMA', 'tract_ids', 'counties', 'weights']
    )

    if df_crosswalk.empty:
        # Do not cache an empty table: it would be picked up by every later
        # call and hide the underlying GEOID mismatch.
        print("⚠️ No tracts matched between the relationship file and tract data; crosswalk not saved.")
        return df_crosswalk

    df_crosswalk.to_csv(crosswalk_path, index=False)
    print(f"✅ Crosswalk built and saved to: {crosswalk_path}")
    return df_crosswalk