In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re
import os

# Read the ZIP-code-area reference table and keep its ZIP_Code column as a
# plain Python list.
CalGuide = pd.read_csv("../USA_ZIP_Code_Areas_anaylsis_1822069148502459445.csv")
CalZips = CalGuide["ZIP_Code"].tolist()

In [4]:
# One raw input file per year, 2017 through 2023 inclusive, in chronological
# order (e.g. 'WhitePop2017.csv').
file_paths = [f'WhitePop{yr}.csv' for yr in range(2017, 2024)]

In [7]:
def _read_county_percents(path):
    """Parse one raw county-level file into ``(county, percent)`` string pairs.

    The file is read with ``csv.reader``. A row whose first column contains
    both 'County' and 'California' starts a new county section; a row whose
    first column contains 'Percent' supplies the value (second column, with
    any '%' removed) for the current section. If a section has several
    'Percent' rows, the last one wins. Sections without a value are skipped.

    Parameters
    ----------
    path : str
        Path of the raw CSV file to parse.

    Returns
    -------
    list[tuple[str, str]]
        County name (with the 'County, California' suffix removed) and the
        percent value as text, in file order.
    """
    county_rows = []
    county_name = None
    percent_value = None

    with open(path, mode='r', newline='', encoding='utf-8') as infile:
        for row in csv.reader(infile):
            # Skip blank rows and rows with an empty label column.
            if not row or not row[0].strip():
                continue

            first_col = row[0].strip()

            if 'County' in first_col and 'California' in first_col:
                # A new county section begins: flush the previous one.
                if county_name and percent_value:
                    county_rows.append((county_name, percent_value))
                # Always reset, so a value seen before this header (or left
                # over from an earlier section) is never credited to this
                # county.
                percent_value = None
                county_name = re.sub(r'\s*County, California', '', first_col)

            elif 'Percent' in first_col and len(row) > 1:
                # The len() guard avoids an IndexError on one-column rows.
                percent_value = row[1].strip().replace('%', '')

    # Flush the final county section.
    if county_name and percent_value:
        county_rows.append((county_name, percent_value))

    return county_rows


# Load ZIP-level dataset
zip_df = pd.read_csv('../zip_code_database.csv')

# Normalize county names once, up front (drop the ' County' suffix, trim,
# lowercase) so they match the normalized names parsed from the raw files.
zip_df['county'] = (
    zip_df['county']
    .str.replace(' County', '', regex=False)
    .str.strip()
    .str.lower()
)

# Filter ZIPs to California
CalZips = zip_df.loc[zip_df['state'] == 'CA', 'zip'].unique()

# Only the California rows (and only the columns needed) take part in the
# per-year merge below; this is loop-invariant, so build it once.
ca_zip_df = zip_df.loc[zip_df['zip'].isin(CalZips), ['zip', 'county']]

# Collect cleaned data for all years
all_years_data = []

for input_file in file_paths:
    # Extract the 4-digit year from the file name.
    year_match = re.search(r'\d{4}', input_file)
    if year_match is None:
        raise ValueError(f"No 4-digit year found in file name: {input_file!r}")
    year = year_match.group()

    # Step 1: clean county-level percent_white from the raw file.
    # NOTE(review): percent_white is kept as text exactly as parsed; convert
    # with pd.to_numeric downstream if numeric dtype is required.
    county_data = _read_county_percents(input_file)
    county_df = pd.DataFrame(county_data, columns=['county', 'percent_white'])
    county_df['county'] = county_df['county'].str.strip().str.lower()

    # Step 2: attach each county's value to its ZIP codes. ``assign`` returns
    # a new frame, which avoids pandas' SettingWithCopyWarning.
    merged_df = (
        pd.merge(ca_zip_df, county_df, on='county', how='left')
        .loc[:, ['zip', 'percent_white']]
        .assign(year=int(year))
    )

    all_years_data.append(merged_df)

# Combine, pivot to one row per ZIP with one column per year (sorted), and
# turn the ZIP index back into a regular column.
combined_df = pd.concat(all_years_data, ignore_index=True)
pivoted_df = (
    combined_df
    .pivot(index='zip', columns='year', values='percent_white')
    .sort_index(axis=1)
    .reset_index()
)

# Save output
pivoted_df.to_csv('percent_white_population_yearly.csv', index=False)


year    zip  2017  2018  2019  2020  2021  2022  2023
2031  95214  55.9  56.3  56.5  51.5  46.5  42.1  37.1
