In [31]:
import os
import configparser
import tqdm
import pandas as pd
import geopandas as gpd

In [2]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

In [8]:
cnty_path = os.path.join(DATA_RAW, 'geodata', 'county_states.csv')
df = pd.read_csv(cnty_path)
#df1 = pd.read_csv("cdc_mort_2005to2022_race_recode3sum.csv")
#df2 = pd.merge(df, df1, on=["res_statefips", "res_countyfips"], how = "inner")
#df2

In [9]:
#df2.to_csv("race_cdc_pulmonary_data.csv", index = False)

## 1. Handling Census Data

First let us import census data, do some pre-processing and cleaning. 

In [10]:
census_path = os.path.join(DATA_RAW, 'geodata', 'pop_2020_censu_block.csv')
census = pd.read_csv(census_path, encoding = 'latin')

Next we drop unnecessary columns and then sum up the 2020 population by counties.

<div class="alert alert-block alert-warning">

<b>!! Attention !!</b> We are averaging the county fips but this is not a standard practice. We are only doing it because it will return the same value for each county.

</div>

In [11]:
census = census.drop(['tract', 'state'], axis = 1)

results = census.groupby(["stab", "CountyName"]).agg({"pop20": "sum",
    "county": "mean"}).reset_index()
results.rename(columns={'county': 'county_fips'}, inplace = True)

For this data, the two-letter state abbreviation is added at each end of the county name. We therefore need to remove the abbreviations at the end of the names.

In [12]:
results["county_name"] = results["CountyName"].str.replace(r'\s+[A-Z]{2}$', '', regex = True)
results = results.drop(['CountyName'], axis = 1)
results.rename(columns={'stab': 'res_statefips'}, inplace = True)

## 2. Merging Census with Pulmonary Embolism Data

<div class="alert alert-block alert-info">
    
<b>Note:</b> We explain in details how to merge the Census to the pulmonary embolism data by sex and then apply the same approach for age and race. So the subsections of age and race will not be detailed. 

</div>

In [35]:
county_path = os.path.join(DATA_RAW, 'geodata', 'USCounties.shp')
gdf = gpd.read_file(county_path)
gdf = gdf[['NAME', 'STATE_NAME', 'FIPS', 'geometry']]
gdf.rename(columns = {'NAME': 'county_name', 'STATE_NAME': 'state', 
                      'FIPS': 'fips_new'}, inplace = True)

df = df[['state', 'res_statefips']]
gdf1 = gdf.merge(df, on = "state", how = 'inner')
gdf1 = gdf1.drop_duplicates(subset = "geometry", keep = "first")

gdf1['centroid'] = gdf1.geometry.centroid
gdf1 = gdf1.drop('geometry', axis = 1)
gdf1

Unnamed: 0,county_name,state,fips_new,res_statefips,centroid
0,Siskiyou,California,06093,CA,POINT (-2167649.205 2370072.976)
58,Big Horn,Montana,30003,MT,POINT (-896823.436 2547053.597)
114,Del Norte,California,06015,CA,POINT (-2270465.19 2417785.501)
172,Linn,Oregon,41043,OR,POINT (-2078036.407 2680804.985)
208,Benton,Oregon,41003,OR,POINT (-2146117.679 2701037.647)
...,...,...,...,...,...
297067,Indian River,Florida,12061,FL,POINT (1517377.333 637127.521)
297134,St. Lucie,Florida,12111,FL,POINT (1536048.635 604816.767)
297201,Broward,Florida,12011,FL,POINT (1557530.467 471825.952)
297268,Miami-Dade,Florida,12086,FL,POINT (1559054.995 412183.615)


## Full Data

In [124]:
all_path = os.path.join(DATA_RESULTS, 'processed', 
                   'complete_cdc_pulmonary_data.csv')
df1 = pd.read_csv(all_path)
df1 = df1[df1['fileyear'] >= 2012]

In [125]:
df1 = (df1.groupby(['county_name', 'state_fips', 'res_countyfips', 
                    'res_statefips', 'sex', 'race_recode3', 'age_cat'], 
                   as_index = False).agg(sum = ("mort_count", "sum")))

In [126]:
df1 = gdf1.merge(df1, on = "res_statefips", how = 'inner')

In [127]:
df1.columns

Index(['county_name_x', 'state', 'fips_new', 'res_statefips', 'centroid',
       'county_name_y', 'state_fips', 'res_countyfips', 'sex', 'race_recode3',
       'age_cat', 'sum'],
      dtype='object')

In [128]:
df1.rename(columns = {'county_name_y': 'county_name', 
           'centroid': 'geometry', 'sum': 'mortality_count'}, 
           inplace = True)

df1 = df1.drop(['county_name_x', 'state'], axis = 1)

#df1.rename(columns={'fips_new_y': 'fips_new',
    #'centroid': 'geometry', 'sum': 'mortality_count'}, inplace=True)

folder_out = os.path.join(DATA_RESULTS, 'final')

filename = 'pulmonary_full_data.csv'
path_out = os.path.join(folder_out, filename)
df1.to_csv(path_out, index = False)
df1

Unnamed: 0,fips_new,res_statefips,geometry,county_name,state_fips,res_countyfips,sex,race_recode3,age_cat,mortality_count
0,06093,CA,POINT (-2167649.205 2370072.976),Alameda,6,1,F,Black,30 - 49 years,10
1,06093,CA,POINT (-2167649.205 2370072.976),Alameda,6,1,F,Black,70 years or above,23
2,06093,CA,POINT (-2167649.205 2370072.976),Alameda,6,1,F,Non-White/Black,10 - 29 years,1
3,06093,CA,POINT (-2167649.205 2370072.976),Alameda,6,1,F,Non-White/Black,30 - 49 years,3
4,06093,CA,POINT (-2167649.205 2370072.976),Alameda,6,1,F,Non-White/Black,70 years or above,5
...,...,...,...,...,...,...,...,...,...,...
1179092,12087,FL,POINT (1514598.479 385973.284),Washington,12,133,F,Black,10 - 29 years,1
1179093,12087,FL,POINT (1514598.479 385973.284),Washington,12,133,F,White,70 years or above,3
1179094,12087,FL,POINT (1514598.479 385973.284),Washington,12,133,M,Black,30 - 49 years,1
1179095,12087,FL,POINT (1514598.479 385973.284),Washington,12,133,M,White,30 - 49 years,1


### a. By Sex

First, let us import the pulmonary embolism data.

In [51]:
pulm_path = os.path.join(DATA_RESULTS, 'processed', 'sex_cdc_pulmonary_data.csv')
pulmonary = pd.read_csv(pulm_path)

We then calculate the total mortality counts per county, state and sex.

In [52]:
pulmonary_summ = (
    pulmonary.groupby(["res_statefips", "county_name", "sex", 
                       "fileyear"], as_index = False)
       .agg(sum=("mort_count", "sum")))

Next, we merge the two datasets using the state code and the county names. 

In [62]:
pulm_cens = pd.merge(results, pulmonary_summ, 
            on = ["res_statefips", "county_name"], how = "inner")
results.rename(columns = {'stab': 'res_statefips'}, inplace = True)
pulm_cens.rename(columns = {'sum': 'total_mortality'}, inplace = True)

In [63]:
pulm_cens = pulm_cens[['county_fips', 'res_statefips', 'county_name', 
                       'sex', 'fileyear', 'total_mortality']]
pulm_cens = pulm_cens[pulm_cens['fileyear'] >= 2012]
pulm_cens = (pulm_cens.groupby(['county_fips', 'res_statefips', 
    'county_name', 'sex'], as_index = False)['total_mortality'].sum())

In [64]:
pulm_cens = gdf1.merge(pulm_cens, on = "res_statefips", how = 'inner')
pulm_cens.rename(columns = {'county_name_y': 'county_name'}, inplace = True)
pulm_cens = pulm_cens.drop(['state','county_name_x'], axis = 1)

folder_out = os.path.join(DATA_RESULTS, 'final')

filename = 'pulmonary_census_sex_data.csv'
path_out = os.path.join(folder_out, filename)
pulm_cens.to_csv(path_out, index = False)

### b. By Race

In [68]:
pulm_race_path = os.path.join(DATA_RESULTS, 'processed', 'race_cdc_pulmonary_data.csv')
pulm_race = pd.read_csv(pulm_race_path)
pulm_race = pulm_race[pulm_race['fileyear'] >= 2012]

In [69]:
pulm_race_summ = (
    pulm_race.groupby(["res_statefips", "county_name", "race_recode3", 
                       "fileyear"], as_index = False)
       .agg(sum=("mort_count", "sum")))

In [70]:
pulm_race_cens = pd.merge(results, 
                 pulm_race_summ, on = ["res_statefips", "county_name"], 
                 how = "inner")
pulm_race_cens.rename(columns = {'sum': 'total_mortality'}, inplace = True)
pulm_race_cens = pulm_race_cens[['county_fips', 'res_statefips', 'county_name', 
                       'race_recode3', 'fileyear', 'total_mortality']]

pulm_race_cens = (pulm_race_cens.groupby(['county_fips', 'res_statefips', 
    'county_name', 'race_recode3'], as_index = False)['total_mortality'].sum())

pulm_race_cens = gdf1.merge(pulm_race_cens, on = "res_statefips", how = 'inner')
pulm_race_cens = pulm_race_cens.drop(['state','county_name_x'], axis = 1)
pulm_race_cens.rename(columns = {'county_name_y': 'county_name'}, inplace = True)

filename = 'pulmonary_census_race_data.csv'
path_out = os.path.join(folder_out, filename)
pulm_race_cens.to_csv(path_out, index = False)

### c. By Age

In [77]:
pulm_age_path = os.path.join(DATA_RESULTS, 'processed', 'age_cdc_pulmonary_data.csv')
pulm_age = pd.read_csv(pulm_age_path)
pulm_age

Unnamed: 0,state,county_name,state_fips,res_countyfips,res_statefips,age_cat,mort_count,fileyear,filetype
0,Alabama,Autauga,1,1,AL,30 - 49 years,2,2005,US
1,Alabama,Autauga,1,1,AL,30 - 49 years,3,2006,US
2,Alabama,Autauga,1,1,AL,70 years or above,2,2006,US
3,Alabama,Autauga,1,1,AL,30 - 49 years,1,2007,US
4,Alabama,Autauga,1,1,AL,70 years or above,1,2007,US
...,...,...,...,...,...,...,...,...,...
45830,Wyoming,Weston,56,45,WY,30 - 49 years,1,2008,US
45831,Wyoming,Weston,56,45,WY,70 years or above,1,2009,US
45832,Wyoming,Weston,56,45,WY,70 years or above,1,2011,US
45833,Wyoming,Weston,56,45,WY,70 years or above,1,2015,US


In [78]:
pulm_age_summ = (
    pulm_age.groupby(["res_statefips", "county_name", "age_cat", 
                       "fileyear"], as_index = False)
       .agg(sum=("mort_count", "sum")))

In [82]:
pulm_age_cens = pd.merge(results, 
                 pulm_age_summ, on = ["res_statefips", "county_name"], 
                 how = "inner")
pulm_age_cens.rename(columns = {'sum': 'total_mortality'}, inplace = True)
pulm_age_cens = pulm_age_cens[['county_fips', 'res_statefips', 'county_name', 
                       'age_cat', 'fileyear', 'total_mortality']]
pulm_age_cens = (pulm_age_cens.groupby(['county_fips', 'res_statefips', 
    'county_name', 'age_cat'], as_index = False)['total_mortality'].sum())

pulm_age_cens = gdf1.merge(pulm_age_cens, on = "res_statefips", how = 'inner')
pulm_age_cens = pulm_age_cens.drop(['state','county_name_x'], axis = 1)
pulm_age_cens.rename(columns = {'county_name_y': 'county_name'}, inplace = True)

filename = 'pulmonary_census_age_data.csv'
path_out = os.path.join(folder_out, filename)
pulm_age_cens.to_csv(path_out, index = False)