In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np

from covidcaremap.constants import state_name_to_abbreviation
from covidcaremap.geo import sum_per_county, sum_per_state, sum_per_hrr
from covidcaremap.data import (external_data_path,
                               processed_data_path,
                               read_census_data_df)

# Merge Region and Census Data

This notebook utilizes US Census data at the county and state level to merge population data into the county, state, and HRR region data.

Most of the logic is taken from [usa_beds_capacity_analysis_20200313_v2](https://github.com/daveluo/covid19-healthsystemcapacity/blob/9a45c424a23e7a15559527893ebeb28703f26422/nbs/usa_beds_capacity_analysis_20200313_v2.ipynb)

In [None]:
# Load the county-level census estimates that the population figures below are derived from.
county_census_df = pd.read_csv(
    external_data_path('us-census-cc-est2018-alldata.csv'),
    encoding='unicode_escape',
)

In [None]:
# Load the Puerto Rico estimates, keep only the 'Puerto Rico' row, and rename
# the label column to STNAME so it lines up with the state-level series.
puerto_rico_census_df = (
    pd.read_csv(
        external_data_path('PEP_2018_PEPAGESEX_with_ann.csv'),
        encoding='unicode_escape',
    )
    .loc[lambda census: census['GEO.display-label'] == 'Puerto Rico']
    .rename(columns={'GEO.display-label': 'STNAME'})
)

#### Format the FIPS code so it can be joined with the county geo data

In [None]:
# County FIPS code = state code zero-padded to 2 characters followed by the
# county code zero-padded to 3 characters.
county_census_df['fips_code'] = (
    county_census_df['STATE'].astype(str).str.zfill(2)
    + county_census_df['COUNTY'].astype(str).str.zfill(3)
)

#### Filter to 7/1/2018 population estimate

In [None]:
# YEAR code 11 selects the 7/1/2018 population estimate.
county_census2018_df = county_census_df.loc[county_census_df['YEAR'].eq(11)]

#### Filter by age groups

We will be looking at total population, adult population (20+ years old), 
and elderly population (65+ years old). These age groups match up with the
CDC groupings here: https://www.cdc.gov/mmwr/volumes/69/wr/mm6912e2.htm?s_cid=mm6912e2_w

From https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2018/cc-est2018-alldata.pdf, the key for AGEGRP is as follows:
- 0 = Total
- 1 = Age 0 to 4 years
- 2 = Age 5 to 9 years
- 3 = Age 10 to 14 years
- 4 = Age 15 to 19 years
- 5 = Age 20 to 24 years
- 6 = Age 25 to 29 years
- 7 = Age 30 to 34 years
- 8 = Age 35 to 39 years
- 9 = Age 40 to 44 years
- 10 = Age 45 to 49 years
- 11 = Age 50 to 54 years
- 12 = Age 55 to 59 years
- 13 = Age 60 to 64 years
- 14 = Age 65 to 69 years
- 15 = Age 70 to 74 years
- 16 = Age 75 to 79 years
- 17 = Age 80 to 84 years
- 18 = Age 85 years or older

In [None]:
# Per-county populations keyed by fips_code:
#   AGEGRP 0   -> total population
#   AGEGRP 5+  -> ages 20 and older
#   AGEGRP 14+ -> ages 65 and older
county_pop_all = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].eq(0)]
    .groupby(['fips_code'])['TOT_POP']
    .sum()
)
county_pop_adult = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].ge(5)]
    .groupby(['fips_code'])['TOT_POP']
    .sum()
)
county_pop_elderly = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].ge(14)]
    .groupby(['fips_code'])['TOT_POP']
    .sum()
)

In [None]:
# Sanity check: national totals for (all ages, 20+, 65+) summed over counties.
tuple(
    population.sum()
    for population in (county_pop_all, county_pop_adult, county_pop_elderly)
)

In [None]:
# Per-state populations keyed by state name, using the same AGEGRP cut-offs
# as the county series (0 = total, 5+ = ages 20+, 14+ = ages 65+).
state_pop_all = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].eq(0)]
    .groupby(['STNAME'])['TOT_POP']
    .sum()
)
state_pop_adult = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].ge(5)]
    .groupby(['STNAME'])['TOT_POP']
    .sum()
)
state_pop_elderly = (
    county_census2018_df
    .loc[county_census2018_df['AGEGRP'].ge(14)]
    .groupby(['STNAME'])['TOT_POP']
    .sum()
)

In [None]:
# Calculate populations for Puerto Rico
# 'age999' is used here as the all-ages total column.
pr_pop_all_columns = ['est72018sex0_age999']
# Adult (20+) population = the 5-year bands 20-24 through 60-64 plus the 65+
# aggregate. The range stops at 65 (exclusive) so the 60-64 band is included,
# matching the AGEGRP >= 5 definition used for the states.
pr_pop_adult_columns = [
    'est72018sex0_age{}to{}'.format(x, x+4)
    for x in range(20, 65, 5)
] + ['est72018sex0_age65plus']
pr_pop_edlerly_columns = ['est72018sex0_age65plus']

# Cast every column that gets summed below to int so the sums are numeric.
puerto_rico_census_df = puerto_rico_census_df.astype(dtype=dict(
    (n, int) for n in pr_pop_all_columns + pr_pop_adult_columns))

def get_pr_pop(columns):
    """Sum the given Puerto Rico estimate columns into a one-entry Series.

    Parameters
    ----------
    columns : list of str
        Names of columns in ``puerto_rico_census_df`` to add together.

    Returns
    -------
    pandas.Series
        Series named 'TOT_POP' with a single 'Puerto Rico' entry and an index
        named 'STNAME', i.e. the same layout as the per-state series so the
        two can be concatenated.
    """
    total = puerto_rico_census_df[list(columns)].sum(axis=1).sum()
    return pd.Series([total],
                     index=pd.Index(['Puerto Rico'], name='STNAME'),
                     name='TOT_POP')

state_pop_all_with_pr = pd.concat([state_pop_all, get_pr_pop(pr_pop_all_columns)])
state_pop_adult_with_pr = pd.concat([state_pop_adult, get_pr_pop(pr_pop_adult_columns)])
state_pop_elderly_with_pr = pd.concat([state_pop_elderly, get_pr_pop(pr_pop_edlerly_columns)])
get_pr_pop(pr_pop_all_columns)

In [None]:
# Inspect the per-state totals after Puerto Rico has been appended.
state_pop_all_with_pr

In [None]:
# Sanity check: totals for (all ages, 20+, 65+) across states plus Puerto Rico.
tuple(
    population.sum()
    for population in (state_pop_all_with_pr,
                       state_pop_adult_with_pr,
                       state_pop_elderly_with_pr)
)

In [None]:
# Population series keyed by the output column name each one is stored under.
county_pops = dict([
    ('Population', county_pop_all),
    ('Population (20+)', county_pop_adult),
    ('Population (65+)', county_pop_elderly),
])

state_pops = dict([
    ('Population', state_pop_all_with_pr),
    ('Population (20+)', state_pop_adult_with_pr),
    ('Population (65+)', state_pop_elderly_with_pr),
])

In [None]:
def set_population_field(target_df, pop_df, column_name, join_on):
    """Left-join a population series onto ``target_df`` as ``column_name``.

    Parameters
    ----------
    target_df : DataFrame
        Frame to enrich; it is not modified.
    pop_df : Series or DataFrame
        Population values stored under the name 'TOT_POP' and indexed by the
        key that ``join_on`` refers to.
    column_name : str
        Name given to the joined 'TOT_POP' column in the result.
    join_on : str
        Column (or index level) of ``target_df`` matched against the index
        of ``pop_df``.

    Returns
    -------
    DataFrame
        Copy of ``target_df`` with the extra column; rows without a match
        get 0 in that column.
    """
    return (
        target_df
        .join(pop_df, on=join_on, how='left')
        .rename(columns={'TOT_POP': column_name})
        .fillna({column_name: 0})
    )

### Merge census data into states

In [None]:
# Load the state boundaries.
state_gdf = gpd.read_file(
    external_data_path('us_states.geojson'),
    encoding='utf-8',
)

In [None]:
# Index the states by name so each population series (indexed by state name)
# can be joined on NAME as-is; no renaming of the series is needed.
enriched_state_df = state_gdf.set_index('NAME')
for column_name, pop_df in state_pops.items():
    enriched_state_df = set_population_field(enriched_state_df,
                                              pop_df,
                                              column_name,
                                              join_on='NAME')
enriched_state_df = enriched_state_df.reset_index()
enriched_state_df = enriched_state_df.rename(columns={'STATE': 'STATE_FIPS',
                                                      'NAME': 'State Name'})
# A direct dict lookup is kept on purpose: an unknown state name raises
# KeyError instead of silently producing a missing abbreviation.
enriched_state_df['State'] = enriched_state_df['State Name'].apply(
    lambda x: state_name_to_abbreviation[x])


In [None]:
# Persist the population-enriched states as GeoJSON.
enriched_state_df.to_file(
    processed_data_path('us_states_with_pop.geojson'),
    driver='GeoJSON',
)

### Merge census data into counties

In [None]:
# Load the county boundaries.
county_gdf = gpd.read_file(
    external_data_path('us_counties.geojson'),
    encoding='utf-8',
)


In [None]:
# Use the same STATE_FIPS column name as the enriched states frame, and make
# the generic NAME column explicit as the county name.
county_gdf = county_gdf.rename(
    {'STATE': 'STATE_FIPS', 'NAME': 'County Name'},
    axis='columns',
)

In [None]:
# Attach each county's state abbreviation via the shared STATE_FIPS code.
# This is an inner merge: counties whose STATE_FIPS is absent from
# enriched_state_df are dropped.
county_gdf = county_gdf.merge(
    enriched_state_df.loc[:, ['STATE_FIPS', 'State']],
    how='inner',
    on='STATE_FIPS',
)

In [None]:
# The county FIPS code is the last 5 characters of GEO_ID; the separate
# COUNTY column is dropped afterwards.
county_gdf['COUNTY_FIPS'] = county_gdf['GEO_ID'].map(lambda geo_id: geo_id[-5:])
county_gdf = county_gdf.drop('COUNTY', axis=1)

In [None]:
# Add one population column per entry in county_pops, matched on COUNTY_FIPS.
enriched_county_df = county_gdf
for population_label, population_series in county_pops.items():
    enriched_county_df = set_population_field(
        enriched_county_df,
        population_series,
        population_label,
        join_on='COUNTY_FIPS',
    )

In [None]:
# Persist the population-enriched counties as GeoJSON.
enriched_county_df.to_file(
    processed_data_path('us_counties_with_pop.geojson'),
    driver='GeoJSON',
)

## Generate population data for HRRs

Spatially join HRRs with counties. For each county that intersects an HRR, the share of the county's population assigned to that HRR is the area of the county–HRR intersection divided by the total area of the county.

In [None]:
# Load the HRR boundaries and reproject them to EPSG:5070 for the area
# computations below.
hrr_gdf = gpd.read_file(
    external_data_path('us_hrr.geojson'),
    encoding='utf-8',
).to_crs('EPSG:5070')
# Keep a copy of the HRR shape in an ordinary column so it is still available
# on the joined rows when the county/HRR overlap is computed.
hrr_gdf['hrr_geom'] = hrr_gdf['geometry']

In [None]:
# Reproject the enriched counties to the same CRS as the HRRs (EPSG:5070) and
# keep a copy of the county shape in its own column.
county_pop_gdf = enriched_county_df.to_crs('EPSG:5070')
county_pop_gdf['county_geom'] = county_pop_gdf['geometry']

In [None]:
# Pair every county with each HRR it intersects (counties without a match are
# kept, with missing HRR columns). 'intersects' is sjoin's default spatial
# test, so the version-specific keyword (`op` in older geopandas, `predicate`
# in newer releases) is deliberately not passed.
hrr_counties_joined_gpd = gpd.sjoin(county_pop_gdf, hrr_gdf, how='left')


In [None]:
def calculate_ratio(row):
    """Return the fraction of a county's area that lies inside its joined HRR.

    Parameters
    ----------
    row : mapping-like
        Must provide 'geometry' (the county shape) and 'hrr_geom' (the HRR
        shape, or a missing value when no HRR was joined to the county).

    Returns
    -------
    float
        Intersection area divided by the county area; 0.0 when no HRR is
        joined or when the county geometry has zero area.
    """
    hrr_geom = row['hrr_geom']
    # A missing HRR can surface as None or as a float NaN depending on how the
    # join filled the column; a float is never a geometry, so treat both alike.
    if hrr_geom is None or isinstance(hrr_geom, float):
        return 0.0
    county_geom = row['geometry']
    county_area = county_geom.area
    # Guard against a degenerate county shape instead of dividing by zero.
    if county_area == 0:
        return 0.0
    # buffer(0) is applied to both shapes before intersecting -- presumably to
    # repair invalid geometries that would otherwise make intersection fail.
    overlap = hrr_geom.buffer(0).intersection(county_geom.buffer(0))
    return overlap.area / county_area

# For every county/HRR pairing, store the share of the county covered by the HRR.
hrr_counties_joined_gpd['ratio'] = hrr_counties_joined_gpd.apply(
    calculate_ratio,
    axis='columns',
)


In [None]:
# Give each county/HRR pairing its share of the county's population.
# The unscaled values are re-read from county_pop_gdf (the left input of the
# spatial join, whose index labels the joined rows) rather than from the
# column being overwritten, so re-running this cell does not apply the ratio
# a second time.
for column in county_pops.keys():
    county_values = county_pop_gdf[column].reindex(
        hrr_counties_joined_gpd.index).to_numpy()
    hrr_counties_joined_gpd[column] = \
        (county_values * hrr_counties_joined_gpd['ratio']).round()


In [None]:
# Sum the apportioned county populations within each HRR.
hrr_pops = (
    hrr_counties_joined_gpd
    .groupby('HRR_BDRY_I')[[*county_pops]]
    .sum()
)
hrr_pops

In [None]:
# Attach the summed populations to each HRR. Only the population columns are
# filled with 0 (HRRs with no joined county); other attributes and the
# geometry columns are left untouched.
enriched_hrr_gdf = hrr_gdf.join(hrr_pops, on='HRR_BDRY_I').fillna(
    value={column: 0 for column in county_pops})
# Drop the helper geometry copy and convert back to EPSG:4326 for output.
enriched_hrr_gdf = enriched_hrr_gdf.drop('hrr_geom', axis=1).to_crs('EPSG:4326')

In [None]:
# Inspect the HRR frame with its population columns before writing it out.
enriched_hrr_gdf

In [None]:
# Persist the population-enriched HRRs as GeoJSON.
enriched_hrr_gdf.to_file(
    processed_data_path('us_hrr_with_pop.geojson'),
    driver='GeoJSON',
)