In [357]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
warnings.filterwarnings('ignore')

In [358]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

In [359]:
data_path = os.path.join(DATA_RESULTS, 'processed', 'age_cdc_pulmonary_data.csv')
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0,state,county_name,state_fips,res_countyfips,res_statefips,age_cat,mort_count,fileyear,filetype
0,Alabama,Autauga,1,1,AL,30 - 49 years,2,2005,US
1,Alabama,Autauga,1,1,AL,30 - 49 years,3,2006,US
2,Alabama,Autauga,1,1,AL,70 years or above,2,2006,US
3,Alabama,Autauga,1,1,AL,30 - 49 years,1,2007,US
4,Alabama,Autauga,1,1,AL,70 years or above,1,2007,US


Only select from the year 2012

In [360]:
df = df[df['fileyear'] >= 2012].copy()

Sum the mortality count for all the years.

In [361]:
df = (df.groupby(['state', 'county_name', 'state_fips', 
                  'res_countyfips', 'res_statefips', 'age_cat'], 
                 as_index = False).agg({'mort_count': 'sum'}))
df

Unnamed: 0,state,county_name,state_fips,res_countyfips,res_statefips,age_cat,mort_count
0,Alabama,Autauga,1,1,AL,30 - 49 years,2
1,Alabama,Autauga,1,1,AL,70 years or above,16
2,Alabama,Baldwin,1,3,AL,10 - 29 years,2
3,Alabama,Baldwin,1,3,AL,30 - 49 years,9
4,Alabama,Baldwin,1,3,AL,70 years or above,61
...,...,...,...,...,...,...,...
5990,Wyoming,Teton,56,39,WY,30 - 49 years,1
5991,Wyoming,Teton,56,39,WY,70 years or above,8
5992,Wyoming,Uinta,56,41,WY,70 years or above,8
5993,Wyoming,Washakie,56,43,WY,70 years or above,5


We want to pivot our DataFrame so that the "sex" column becomes wide columns, and the "total_mortality" values appear under those columns.

In [362]:
df = (df.pivot_table(index = ['state', 'county_name', 
                              'state_fips', 'res_countyfips', 
                              'res_statefips'], columns = 'age_cat', 
                     values = 'mort_count', aggfunc = 'sum').reset_index())
df = df[['state', 'county_name', 'res_statefips', '70 years or above', 
         '30 - 49 years', '10 - 29 years', ' 9 years or below']]

### Combine Mortality & County Shapefiles

We read and prepare the county shapefiles data

In [363]:
county_path = os.path.join(DATA_RAW, 'geodata', 'USCounties.shp')
gdf = gpd.read_file(county_path)
gdf = gdf[['NAME', 'STATE_NAME', 'FIPS', 'geometry']]
gdf.rename(columns = {'NAME': 'county_name', 'STATE_NAME': 'state', 
                      'FIPS': 'fips_new'}, inplace = True)
gdf

Unnamed: 0,county_name,state,fips_new,geometry
0,Siskiyou,California,06093,"POLYGON ((-2263443.001 2373524.342, -2263269.8..."
1,Big Horn,Montana,30003,"POLYGON ((-961891.533 2507738.015, -961894.81 ..."
2,Del Norte,California,06015,"POLYGON ((-2292563.856 2394672.986, -2292561.7..."
3,Linn,Oregon,41043,"POLYGON ((-2028816.907 2640191.92, -2029870.14..."
4,Benton,Oregon,41003,"POLYGON ((-2179108.518 2686663.84, -2179075.94..."
...,...,...,...,...
3064,Indian River,Florida,12061,"MULTIPOLYGON (((1545289.189 626576.115, 154513..."
3065,St. Lucie,Florida,12111,"MULTIPOLYGON (((1558142.384 592233.686, 155813..."
3066,Broward,Florida,12011,"MULTIPOLYGON (((1522198.511 446881.286, 152200..."
3067,Miami-Dade,Florida,12086,"MULTIPOLYGON (((1594417.097 444933.533, 159441..."


In [364]:
gdf1 = gdf.merge(df, on = ['state', 'county_name'], how = 'left')
#gdf1[['F','M']] = gdf1[['F','M']].fillna(0)

### Mortality vs Population

We now combine the pulmonary embolism data with the 2020 Census results

In [365]:
cen_path = os.path.join(DATA_RAW, 'geodata', 'pop_2020_censu_block.csv')
cs = pd.read_csv(cen_path, encoding = 'latin')
cs = cs[['county', 'tract', 'stab', 'CountyName', 'pop20']]

Next we drop unnecessary columns and then sum up the 2020 population by counties.

<div class="alert alert-block alert-warning">

<b>!! Attention !!</b> We are averaging the county fips but this is not a standard practice. We are only doing it because it will return the same value for each county.

</div>

In [366]:
cs = cs.drop(['tract'], axis = 1)

cs = cs.groupby(["stab", "CountyName"]).agg({"pop20": "sum",
    "county": "mean"}).reset_index()
cs.rename(columns={'county': 'county_fips'}, inplace = True)

For this data, the two-letter state abbreviation is added at each end of the county name. We therefore need to remove the abbreviations at the end of the names

In [367]:
cs["county_name"] = cs["CountyName"].str.replace(r'\s+[A-Z]{2}$', '', regex = True)
cs = cs.drop(['CountyName'], axis = 1)
cs.rename(columns={'stab': 'res_statefips'}, inplace = True)

In [368]:
gdf2 = gdf1.merge(cs, on = ['county_name', 'res_statefips'], how = 'left')

In [369]:
folder_out = os.path.join(DATA_RESULTS, 'final')
filename = 'pulmonary_census_age_data.csv'
path_out = os.path.join(folder_out, filename)
gdf2.to_csv(path_out, index = False)

## Prepare Data for GWR

In this section we prepare the sex pulmonary embolism data for running the Multiscale Geographically Weighted Regression

#### Convert County Geometries into points

First, let's calculate the centroid.

In [370]:
gdf2 = gdf2.to_crs(epsg = 4326)
gdf2['centroid'] = gdf2.geometry.centroid

And then extract longitudes and latitudes.

In [371]:
gdf2['longitude'] = gdf2.centroid.x
gdf2['latitude'] = gdf2.centroid.y

And then drop the redundant columns before saving the data.

In [372]:
gdf2 = gdf2.drop(columns=['centroid', 'geometry', 'fips_new'])
gdf2

Unnamed: 0,county_name,state,res_statefips,70 years or above,30 - 49 years,10 - 29 years,9 years or below,pop20,county_fips,longitude,latitude
0,Siskiyou,California,CA,29.0,3.0,,,44076.0,6093.0,-122.540339,41.592660
1,Big Horn,Montana,MT,3.0,1.0,1.0,,13124.0,30003.0,-107.489699,45.423456
2,Del Norte,California,CA,16.0,3.0,,,27743.0,6015.0,-123.896765,41.743053
3,Linn,Oregon,OR,26.0,3.0,,,128610.0,41043.0,-122.534212,44.488590
4,Benton,Oregon,OR,14.0,2.0,2.0,,95184.0,41003.0,-123.429203,44.492176
...,...,...,...,...,...,...,...,...,...,...,...
3064,Indian River,Florida,FL,49.0,2.0,,,159788.0,12061.0,-80.616658,27.692695
3065,St. Lucie,Florida,,,,,,,,-80.483686,27.376240
3066,Broward,Florida,FL,248.0,58.0,7.0,,1944375.0,12011.0,-80.487838,26.151847
3067,Miami-Dade,Florida,FL,408.0,58.0,12.0,,2701767.0,12086.0,-80.568319,25.613647


Next we convert our wide dataframe (F and M columns) into long format, so that there is:

    - A column sex (with values "F" and "M")

    - A column mortality_count with the corresponding values.

In [373]:
'''for val in gdf2['age_cat'].unique():
    print(val)'''

"for val in gdf2['age_cat'].unique():\n    print(val)"

**Age**

- 3: 70 years or above
- 2: 30 - 49 years
- 1: 10 - 29 years
- 0: 9 years or below

**Race**

- 2: Non-White/Black
- 1: White
- 0: Black

**Sex**

- 1: M
- 0: F

In [374]:
'''gdf2['race_recode3'] = gdf2['race_recode3'].map(
    {'Non-White/Black': 2, 'White': 1, 'Black': 0})
gdf2['age_cat'] = gdf2['age_cat'].map(
    {'70 years or above': 3, '30 - 49 years': 2, 
     '10 - 29 years': 1, '9 years or below': 0})'''

"gdf2['race_recode3'] = gdf2['race_recode3'].map(\n    {'Non-White/Black': 2, 'White': 1, 'Black': 0})\ngdf2['age_cat'] = gdf2['age_cat'].map(\n    {'70 years or above': 3, '30 - 49 years': 2, \n     '10 - 29 years': 1, '9 years or below': 0})"

In [375]:
gdf2 = gdf2.melt(id_vars = ['county_name', 'state', 'res_statefips', 
                            'pop20', 'county_fips', 'longitude', 'latitude'],
       value_vars = ['70 years or above', '30 - 49 years', '10 - 29 years', 
                     ' 9 years or below'], 
       var_name = 'age_cat', value_name = 'mortality_count')
gdf2['age_cat'] = gdf2['age_cat'].map(
    {'70 years or above': 3, '30 - 49 years': 2, 
     '10 - 29 years': 1, ' 9 years or below': 0})
gdf2['mortality_count'] = gdf2['mortality_count'].fillna(0)

In [376]:
folder_out = os.path.join(DATA_RESULTS, 'gwr_data')
filename = 'age_gwr_data.csv'
path_out = os.path.join(folder_out, filename)
gdf2.to_csv(path_out, index = False)