In [2]:
import pandas as pd
import geopandas as gpd
import maup
import numpy as np

# Current Working Directory
# Every input and output path in this notebook is built by plain string
# concatenation (working_directory + "<file name>"), so this value must end
# with a path separator.
# NOTE(review): absolute, machine-specific location - adjust before running
# the notebook on another machine.
working_directory = "/Users/stanleymui/Downloads/CSE 416 Preprocessing Data/"

In [3]:
# Density/Category Calculation Function
def region_category(row):
    """Label a row as 'Rural', 'Urban' or 'Suburban' from its 'Density' value.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['Density']`` (for example a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        'Rural' when Density is strictly below 0.1, 'Urban' when it is
        strictly above 3, and 'Suburban' for every other value (the two
        thresholds themselves, and values for which both comparisons are
        false, fall in this bucket).
    """
    density = row['Density']
    if density < 0.1:
        return 'Rural'
    if density > 3:
        return 'Urban'
    return 'Suburban'

# Population by Region Type
def fill_population(row):
    """Attribute a row's whole population to the region type in its 'Category'.

    The column named by ``row["Category"]`` ('Urban', 'Suburban' or 'Rural')
    receives ``row["TOT_POP"]``; the other two region columns are set to 0.0.
    A row whose category is none of those three is returned untouched.

    Parameters
    ----------
    row : mutable mapping-like
        Must provide 'Category' and, for a recognised category, 'TOT_POP'.
        It is modified in place.

    Returns
    -------
    The same ``row`` object that was passed in.
    """
    # (matching category, columns to zero) - the inner order mirrors the
    # order in which the columns are written for each category.
    layout = (
        ("Urban", ("Suburban", "Rural")),
        ("Suburban", ("Urban", "Rural")),
        ("Rural", ("Suburban", "Urban")),
    )
    category = row["Category"]
    for region, zeroed in layout:
        if category == region:
            row[region] = row["TOT_POP"]
            for other in zeroed:
                row[other] = 0.0
            break
    return row

In [None]:
# Precinct-level preprocessing.
# Builds one GeoDataFrame per precinct holding presidential vote counts,
# block-level census demographics, a density-based region category and
# block-group income figures, then writes it to
# "california_precinct_merged.geojson". The in-memory frames
# `merged_precinct_gdf` and `gdf_precinct_income` are reused by later cells.

# File paths
demographic_block_file = working_directory + "nhgis0015_ds258_2020_block.csv"
region_type_block_file = working_directory + "nhgis0016_ds258_2020_block.csv"
block_geojson_file = working_directory + "ca_cvap_2020_2020_b.json"
precinct_votes_file = working_directory + "state_g20_sov_data_by_g20_srprec.csv"
precinct_geojson_file = working_directory + "srprec_state_g20_v01_shp.json"
income_bg_file = working_directory + "ca_inc_2021_bg.json"
congressional_district_file = working_directory + 'ca_cvap_2020_cd.json'

# Load and process demographic block data
df_block = pd.read_csv(demographic_block_file)[['GEOID', 'U7R001', 'U7R002', 'U7R005', 'U7R006', 'U7R007', 'U7R008', 'U7R009', 'U7R010', 'U7R011']]
df_block = df_block.rename(columns={
    'GEOID': 'GEOID20', 'U7R001': 'TOT_POP', 'U7R002': 'POP_HISLAT', 'U7R005': 'POP_WHT',
    'U7R006': 'POP_BLK', 'U7R007': 'POP_AINDALK', 'U7R008': 'POP_ASN', 'U7R009': 'POP_HIPI',
    'U7R010': 'POP_OTH', 'U7R011': 'POP_TWOMOR'
})
# Strip the '1000000US' prefix so the key matches GEOID20 in the block geometry file
df_block['GEOID20'] = df_block['GEOID20'].str.replace('1000000US', '', regex=False)

# Load and process region type block data
df_block_region = pd.read_csv(region_type_block_file)[["GEOID", "AREALAND", "U7I002", "U7I003"]]
df_block_region = df_block_region.rename(columns={'GEOID': 'GEOID20', 'U7I002': 'Urban', 'U7I003': 'Rural'})
df_block_region['GEOID20'] = df_block_region['GEOID20'].str.replace('1000000US', '', regex=False)

# Load block geometry data
gdf_block = gpd.read_file(block_geojson_file)[['GEOID20', 'geometry']]

# Merge demographic and region type data with block geometry
merged_gdf_block = gdf_block.merge(df_block, on='GEOID20').merge(df_block_region, on='GEOID20')
merged_gdf_block.to_crs(crs="EPSG:3857", inplace=True)

# Load and process precinct vote data
df_precinct = pd.read_csv(precinct_votes_file)[['SRPREC_KEY', 'PRSDEM01', 'PRSREP01']]
df_precinct['TOT_VOTES'] = df_precinct['PRSDEM01'] + df_precinct['PRSREP01']
# Precincts with zero votes yield NaN here; the fillna(0) below turns them into 0
df_precinct['PCT_DEM'] = (df_precinct['PRSDEM01'] / df_precinct['TOT_VOTES']) * 100
df_precinct['PCT_REP'] = (df_precinct['PRSREP01'] / df_precinct['TOT_VOTES']) * 100

# Load precinct boundary data
gdf_precinct = gpd.read_file(precinct_geojson_file)
gdf_precinct = gdf_precinct.merge(df_precinct, on='SRPREC_KEY')
gdf_precinct = gdf_precinct[['SRPREC_KEY', 'PCT_DEM', 'PCT_REP', 'PRSDEM01', 'PRSREP01', 'TOT_VOTES', 'geometry']]
gdf_precinct.to_crs(crs="EPSG:3857", inplace=True)

# Assign blocks to precincts and aggregate demographic data
variables = ['TOT_POP', 'POP_HISLAT', 'POP_WHT', 'POP_BLK', 'POP_AINDALK', 'POP_ASN', 'POP_HIPI', 'POP_OTH', 'POP_TWOMOR', 'AREALAND', 'Urban', 'Rural']
blocks_to_precincts_assignment = maup.assign(merged_gdf_block, gdf_precinct)
gdf_precinct[variables] = merged_gdf_block[variables].groupby(blocks_to_precincts_assignment).sum()

# Precincts that received no blocks have NaN aggregates; treat them as zero
gdf_precinct.fillna(0, inplace=True)

# Density = population per unit of land area, scaled by 1000. Precincts with
# zero land area get 0.0 instead of NaN/inf, mirroring the POVERTY_PCT guard below.
gdf_precinct['Density'] = np.where(
    gdf_precinct['AREALAND'] == 0,
    0.0,
    (gdf_precinct['TOT_POP'] / gdf_precinct['AREALAND']) * 1000
)

# Classify every precinct and attribute its whole population to that region
# type. This is the vectorised equivalent of applying fill_population row by
# row, and it creates the 'Suburban' column that the congressional-district
# and state cells aggregate. It replaces the block-level 'Urban'/'Rural' sums
# computed above with the category-based split.
gdf_precinct['Category'] = gdf_precinct.apply(region_category, axis=1)
for region_type in ('Urban', 'Suburban', 'Rural'):
    gdf_precinct[region_type] = gdf_precinct['TOT_POP'].where(
        gdf_precinct['Category'] == region_type, 0.0
    )

# Load and process income block group data
gdf_income_bg = gpd.read_file(income_bg_file)
gdf_income_bg.to_crs(crs="EPSG:3857", inplace=True)

# Assign block groups to precincts and aggregate income data
income_variables = ['TOT_HOUS21', 'LESS_10K21', '10K_15K21', '15K_20K21', '20K_25K21', '25K_30K21', '30K_35K21', '35K_40K21', '40K_45K21', '45K_50K21', 
                    '50K_60K21', '60K_75K21', '75K_100K21', '100_125K21', '125_150K21', '150_200K21', '200K_MOR21']
# The precinct file is read once here (the original cell read it twice in a row)
gdf_precinct_income = gpd.read_file(precinct_geojson_file)[['SRPREC_KEY', 'geometry']]
gdf_precinct_income.to_crs(crs="EPSG:3857", inplace=True)
blocks_group_to_precinct_assignment = maup.assign(gdf_income_bg, gdf_precinct_income)
gdf_precinct_income[income_variables] = gdf_income_bg[income_variables].groupby(blocks_group_to_precinct_assignment).sum()
# Precinct value is the plain (unweighted) mean of the MEDN_INC21 values of its block groups
gdf_precinct_income['MEDN_INC21'] = gdf_income_bg['MEDN_INC21'].groupby(blocks_group_to_precinct_assignment).mean()

# Calculate poverty level and percentage
gdf_precinct_income['POVERTY'] = gdf_precinct_income[['LESS_10K21', '10K_15K21', '15K_20K21', '20K_25K21', '25K_30K21', '30K_35K21', '35K_40K21']].sum(axis=1)
gdf_precinct_income['POVERTY_PCT'] = np.where(gdf_precinct_income['TOT_HOUS21'] == 0.0, 0.0, (gdf_precinct_income['POVERTY'] / gdf_precinct_income['TOT_HOUS21']) * 100)

# Load congressional district boundaries and assign precincts to districts
gdf_congressional_district = gpd.read_file(congressional_district_file)
gdf_congressional_district.to_crs(crs="EPSG:3857", inplace=True)
precinct_to_cd_assignment = maup.assign(gdf_precinct, gdf_congressional_district)
# NOTE(review): CD_ID is the positional index of the district in the file plus
# one - presumably the file lists districts in ID order; confirm.
gdf_precinct['CD_ID'] = precinct_to_cd_assignment + 1
# NOTE(review): the assignment is indexed like gdf_precinct and is aligned to
# gdf_precinct_income by index label. This assumes the vote merge above kept
# every precinct of the boundary file in its original order - confirm.
gdf_precinct_income['CD_ID'] = precinct_to_cd_assignment + 1

# Final merge and save
merge_keys = ['SRPREC_KEY', 'CD_ID', 'geometry']
merged_precinct_gdf = gdf_precinct.merge(gdf_precinct_income, on=merge_keys, how='outer')
# Write in EPSG:4326 like the district and state outputs. The in-memory frame
# stays in EPSG:3857 because later cells pass it to maup.assign against
# EPSG:3857 boundaries.
merged_precinct_gdf.to_crs("EPSG:4326").to_file(working_directory + "california_precinct_merged.geojson", driver="GeoJSON")

In [None]:
# Congressional-district aggregation.
# Rolls the precinct frames built by the previous cell (merged_precinct_gdf,
# gdf_precinct_income) up to congressional districts and writes
# "california_congressional_district_merged.geojson".

# District boundaries for the voting/demographic frame: keep the district
# identifier (exposed as 'ID') and the geometry, projected to EPSG:3857.
gdf_cd_voting_demo = (
    gpd.read_file(working_directory + "ca_cvap_2020_cd.json")[['CD', 'geometry']]
    .rename(columns={'CD': 'ID'})
    .to_crs("EPSG:3857")
)

# Map every merged precinct to a district
precinct_to_cd_assignment = maup.assign(merged_precinct_gdf, gdf_cd_voting_demo)

voting_and_demo_vars = (
    ['PCT_DEM', 'PCT_REP', 'PRSDEM01', 'PRSREP01', 'TOT_VOTES']   # votes
    + ['TOT_POP', 'POP_HISLAT', 'POP_WHT', 'POP_BLK', 'POP_AINDALK',
       'POP_ASN', 'POP_HIPI', 'POP_OTH', 'POP_TWOMOR']            # population
    + ['AREALAND']                                                # land area
    + ['Urban', 'Rural', 'Suburban']                              # region split
)

# Per-district sums of the precinct values; districts without any precinct get 0
gdf_cd_voting_demo[voting_and_demo_vars] = (
    merged_precinct_gdf.loc[:, voting_and_demo_vars]
    .groupby(precinct_to_cd_assignment)
    .sum()
)
gdf_cd_voting_demo.fillna(0, inplace=True)

# The summed PCT_* columns are replaced by shares recomputed from district totals
gdf_cd_voting_demo['PCT_DEM'] = (
    gdf_cd_voting_demo['PRSDEM01'].div(gdf_cd_voting_demo['TOT_VOTES']).mul(100)
)
gdf_cd_voting_demo['PCT_REP'] = (
    gdf_cd_voting_demo['PRSREP01'].div(gdf_cd_voting_demo['TOT_VOTES']).mul(100)
)
# Density and the region category derived from it
gdf_cd_voting_demo['Density'] = (
    gdf_cd_voting_demo['TOT_POP'].div(gdf_cd_voting_demo['AREALAND']).mul(1000)
)
gdf_cd_voting_demo['Category'] = gdf_cd_voting_demo.apply(region_category, axis=1)

# A second copy of the district boundaries carries the income figures
gdf_cd_income = (
    gpd.read_file(working_directory + "ca_cvap_2020_cd.json")[['CD', 'geometry']]
    .rename(columns={'CD': 'ID'})
    .to_crs("EPSG:3857")
)

# Map the income precinct frame to districts (rebinds the assignment name)
precinct_to_cd_assignment = maup.assign(gdf_precinct_income, gdf_cd_income)

# Income brackets counted towards POVERTY
poverty_vars = [
    'LESS_10K21', '10K_15K21', '15K_20K21', '20K_25K21', '25K_30K21',
    '30K_35K21', '35K_40K21'
]
# Household total, the poverty brackets, then the remaining brackets
income_vars = [
    'TOT_HOUS21', *poverty_vars,
    '40K_45K21', '45K_50K21', '50K_60K21', '60K_75K21', '75K_100K21',
    '100_125K21', '125_150K21', '150_200K21', '200K_MOR21'
]

gdf_cd_income[income_vars] = (
    gdf_precinct_income.loc[:, income_vars]
    .groupby(precinct_to_cd_assignment)
    .sum()
)

# Plain mean of the precinct MEDN_INC21 values, ignoring precincts without households
gdf_cd_income['MEDN_INC21'] = (
    gdf_precinct_income.loc[gdf_precinct_income['TOT_HOUS21'].gt(0.0), 'MEDN_INC21']
    .groupby(precinct_to_cd_assignment)
    .mean()
)

# Poverty count and its share of households (0.0 where there are no households)
gdf_cd_income['POVERTY'] = gdf_cd_income[poverty_vars].sum(axis=1)
gdf_cd_income['POVERTY_PCT'] = np.where(
    gdf_cd_income['TOT_HOUS21'].eq(0.0),
    0.0,
    gdf_cd_income['POVERTY'].div(gdf_cd_income['TOT_HOUS21']).mul(100)
)
gdf_cd_income.fillna(0, inplace=True)

# Both frames go back to EPSG:4326 before being merged and written
gdf_cd_voting_demo = gdf_cd_voting_demo.to_crs("EPSG:4326")
gdf_cd_income = gdf_cd_income.to_crs("EPSG:4326")

# Join the two district frames on identifier and geometry
merge_keys = ['ID', 'geometry']
merged_cd_gdf = gdf_cd_voting_demo.merge(gdf_cd_income, how='outer', on=merge_keys)

# Persist the result as GeoJSON
merged_cd_gdf.to_file(
    working_directory + "california_congressional_district_merged.geojson",
    driver="GeoJSON"
)


In [None]:
# State-level aggregation.
# Rolls the precinct frames built earlier (merged_precinct_gdf,
# gdf_precinct_income) up to the state boundary and writes
# "california_state_merged.geojson".

# State boundary for the voting/demographic frame, projected to EPSG:3857
gdf_state = (
    gpd.read_file(working_directory + "CA_State_New.json")[['NAME', 'geometry']]
    .to_crs("EPSG:3857")
)

# Columns summed from the merged precinct frame
voting_and_demo_vars = (
    ['PCT_DEM', 'PCT_REP', 'PRSDEM01', 'PRSREP01', 'TOT_VOTES']   # votes
    + ['TOT_POP', 'POP_HISLAT', 'POP_WHT', 'POP_BLK', 'POP_AINDALK',
       'POP_ASN', 'POP_HIPI', 'POP_OTH', 'POP_TWOMOR']            # population
    + ['AREALAND']                                                # land area
    + ['Urban', 'Rural', 'Suburban']                              # region split
)

# Map every merged precinct onto the state boundary
precinct_to_state_assignment = maup.assign(merged_precinct_gdf, gdf_state)

# Sum of the precinct values; missing aggregates become 0
gdf_state[voting_and_demo_vars] = (
    merged_precinct_gdf.loc[:, voting_and_demo_vars]
    .groupby(precinct_to_state_assignment)
    .sum()
)

gdf_state.fillna(0, inplace=True)

# The summed PCT_* columns are replaced by shares recomputed from the totals,
# followed by density and the region category derived from it
gdf_state['PCT_DEM'] = gdf_state['PRSDEM01'].div(gdf_state['TOT_VOTES']).mul(100)
gdf_state['PCT_REP'] = gdf_state['PRSREP01'].div(gdf_state['TOT_VOTES']).mul(100)
gdf_state['Density'] = gdf_state['TOT_POP'].div(gdf_state['AREALAND']).mul(1000)
gdf_state['Category'] = gdf_state.apply(region_category, axis=1)

# A second copy of the state boundary carries the income figures
gdf_state_income = (
    gpd.read_file(working_directory + "CA_State_New.json")[['NAME', 'geometry']]
    .to_crs("EPSG:3857")
)

# Map the income precinct frame onto the state (rebinds the assignment name)
precinct_to_state_assignment = maup.assign(gdf_precinct_income, gdf_state_income)

# Income brackets counted towards POVERTY
poverty_vars = [
    'LESS_10K21', '10K_15K21', '15K_20K21', '20K_25K21', '25K_30K21',
    '30K_35K21', '35K_40K21'
]
# Household total, the poverty brackets, then the remaining brackets
income_vars = [
    'TOT_HOUS21', *poverty_vars,
    '40K_45K21', '45K_50K21', '50K_60K21', '60K_75K21', '75K_100K21',
    '100_125K21', '125_150K21', '150_200K21', '200K_MOR21'
]

gdf_state_income[income_vars] = (
    gdf_precinct_income.loc[:, income_vars]
    .groupby(precinct_to_state_assignment)
    .sum()
)

# Plain mean of the precinct MEDN_INC21 values, ignoring precincts without households
gdf_state_income['MEDN_INC21'] = (
    gdf_precinct_income.loc[gdf_precinct_income['TOT_HOUS21'].gt(0.0), 'MEDN_INC21']
    .groupby(precinct_to_state_assignment)
    .mean()
)

# Poverty count and its share of households (0.0 where there are no households)
gdf_state_income['POVERTY'] = gdf_state_income[poverty_vars].sum(axis=1)
gdf_state_income['POVERTY_PCT'] = np.where(
    gdf_state_income['TOT_HOUS21'].eq(0.0),
    0.0,
    gdf_state_income['POVERTY'].div(gdf_state_income['TOT_HOUS21']).mul(100)
)
gdf_state_income.fillna(0, inplace=True)

# Both frames go back to EPSG:4326 before being merged and written
gdf_state = gdf_state.to_crs("EPSG:4326")
gdf_state_income = gdf_state_income.to_crs("EPSG:4326")

# Join the two state frames on name and geometry
merge_keys = ['NAME', 'geometry']
merged_state_gdf = gdf_state.merge(gdf_state_income, how='outer', on=merge_keys)

# Persist the result as GeoJSON
merged_state_gdf.to_file(
    working_directory + "california_state_merged.geojson",
    driver="GeoJSON"
)