In [184]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
warnings.filterwarnings('ignore')

In [185]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

First we read all the required datasets

In [186]:
data_path = os.path.join(DATA_RESULTS, 'gwr_data', 'all_gwr_data.csv')

df = pd.read_csv(data_path)
df['mort_count'] = df['mort_count'].fillna(0)

Before we proceed, let us remove the US territories since we do not have their data. This includes American Samoa, Guam, Northern Mariana, Puerto Rico and Virgin Islands.

In [187]:
territories = ['AS', 'GU', 'MP', 'PR', 'VI']
df = df[~df['res_statefips'].isin(territories)]
df = df.reset_index(drop = True)

We are only going to keep the age and sex columns since the occurrence of pulmonary embolism differs with age and maybe sex.

In [188]:
group_cols = ['county_name', 'state', 'state_fips', 
              'res_countyfips', 'sex', 'county_fips', 
              'res_statefips', 'longitude', 'latitude', 
              'age_cat']

df = (df.groupby(group_cols, as_index = False).agg({
        'mort_count': 'sum',
        'pop20': 'first'}))

Next, we calculate the mortality counts per a 100,000 people.

In [189]:
df['mort_per_100k'] = ((df['mort_count'] * 100000) / df['pop20'])#.round().astype(int)

### Air Quality Data

We now include the air quality data

In [190]:
shp_path = os.path.join(DATA_RAW, 'geodata', 'air_quality.shp')
gdf = gpd.read_file(shp_path)
gdf = gdf[['pm25', 'aqi_value','STATE', 'STATEABBRV', 
           'STATEFIPS', 'COUNTY', 'COUNTYTYPE']]
gdf = gdf.rename(columns = {'pm25': 'pm25_mean',
    'aqi_value': 'air_quality_index', 'COUNTY': 'county_name',
    'STATE': 'state'})
gdf = gdf[~gdf['STATEABBRV'].isin(territories)]
gdf = gdf.reset_index(drop = True)

In [191]:
df1 = pd.merge(gdf, df, on =['state', 'county_name'], how = 'left')
#df1 = df1.drop(columns = ['state_fips', 'county_fips', 'mort_count'])https://catalog.data.gov/dataset/walkability-index8/resource/89b876c2-934f-4d06-b895-6949edfcb53c

### Walkability Index Data

We also include the walkability index data downloaded from https://catalog.data.gov/dataset/walkability-index8/resource/89b876c2-934f-4d06-b895-6949edfcb53c 

In [192]:
df1

Unnamed: 0,pm25_mean,air_quality_index,state,STATEABBRV,STATEFIPS,county_name,COUNTYTYPE,state_fips,res_countyfips,sex,county_fips,res_statefips,longitude,latitude,age_cat,mort_count,pop20,mort_per_100k
0,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,0.0,1.0,AL,-86.642759,32.534932,2.0,5.0,58764.0,8.508611
1,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,0.0,1.0,AL,-86.642759,32.534932,3.0,15.0,58764.0,25.525832
2,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,1.0,1.0,AL,-86.642759,32.534932,2.0,4.0,58764.0,6.806889
3,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,1.0,1.0,AL,-86.642759,32.534932,3.0,9.0,58764.0,15.315499
4,7.780374,39.967453,Alabama,AL,01,Baldwin,County,1.0,3.0,0.0,3.0,AL,-87.723352,30.736470,1.0,1.0,231365.0,0.432217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11114,0.000000,0.000000,Virginia,VA,51,Waynesboro,City,,,,,,,,,,,
11115,0.000000,0.000000,Virginia,VA,51,Williamsburg,City,,,,,,,,,,,
11116,0.000000,0.000000,Virginia,VA,51,Winchester,City,,,,,,,,,,,
11117,0.000000,0.000000,Wisconsin,WI,55,Pepin,County,55.0,91.0,0.0,91.0,WI,-92.001564,44.582933,3.0,2.0,7313.0,27.348557


In [173]:
walk_path = os.path.join(DATA_RAW, 'geodata', 'walkability_index.shp')
wki = gpd.read_file(walk_path)
wki = wki.rename(columns = {'STATEFP': 'county_name',
    'STATE_NAME': 'state', 'mean': 'walkability_index'})
wki

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,NatWalkInd,Shape_Leng,Shape_Area,geometry
0,1.0,48,113,14.000000,3110.360820,2.978361e+05,"POLYGON ((-6.9e+04 1.09e+06, -6.9e+04 1.09e+06..."
1,2.0,48,113,10.833333,3519.469110,4.849451e+05,"POLYGON ((-6.89e+04 1.09e+06, -6.89e+04 1.09e+..."
2,3.0,48,113,8.333333,1697.091802,1.067059e+05,"POLYGON ((-6.81e+04 1.09e+06, -6.81e+04 1.09e+..."
3,4.0,48,113,15.666667,2922.609204,4.818284e+05,"POLYGON ((-6.9e+04 1.09e+06, -6.9e+04 1.09e+06..."
4,5.0,48,113,10.166667,3731.971773,6.876848e+05,"POLYGON ((-6.9e+04 1.09e+06, -6.9e+04 1.09e+06..."
...,...,...,...,...,...,...,...
220734,220735.0,78,030,7.333333,3414.446949,3.355857e+05,"POLYGON ((3.35e+06 5.01e+04, 3.35e+06 5.01e+04..."
220735,220736.0,78,030,7.333333,2421.025608,2.924305e+05,"POLYGON ((3.35e+06 5.12e+04, 3.35e+06 5.12e+04..."
220736,220737.0,78,030,7.333333,1955.909418,1.619395e+05,"POLYGON ((3.35e+06 5.13e+04, 3.35e+06 5.13e+04..."
220737,220738.0,78,030,4.000000,16896.768872,1.038966e+07,"MULTIPOLYGON (((3.36e+06 5.04e+04, 3.36e+06 5...."


In [166]:
df2 = pd.merge(df1, wki, on =['state', 'county_name'], how = 'left')

### Poverty and Income Data

We also include poverty and income data https://www.census.gov/data/datasets/2023/demo/saipe/2023-state-and-county.html

In [167]:
inc_path = os.path.join(DATA_RAW, 'poverty_income.csv')
povin = pd.read_csv(inc_path)
povin = povin[['res_statefips', 'county_name', 
               'poverty_estimate', 'poverty_perc', 
               'median_income']]
povin['county_name'] = povin['county_name'].str.replace(r'\s*county\s*', '', 
                       case = False, regex = True).str.strip()

In [168]:
df3 = pd.merge(df2, povin, on =['res_statefips', 'county_name'], how = 'left')
df3

Unnamed: 0,pm25_mean,air_quality_index,state,STATEABBRV,STATEFIPS,county_name,COUNTYTYPE,res_countyfips,sex,res_statefips,longitude,latitude,age_cat,pop20,mort_per_100k,walkability_index,poverty_estimate,poverty_perc,median_income
0,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,0.0,AL,-86.642759,32.534932,2.0,58764.0,8.508611,4.826667,7004,11.7,68857
1,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,0.0,AL,-86.642759,32.534932,3.0,58764.0,25.525832,4.826667,7004,11.7,68857
2,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,AL,-86.642759,32.534932,2.0,58764.0,6.806889,4.826667,7004,11.7,68857
3,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,AL,-86.642759,32.534932,3.0,58764.0,15.315499,4.826667,7004,11.7,68857
4,7.780374,39.967453,Alabama,AL,01,Baldwin,County,3.0,0.0,AL,-87.723352,30.736470,1.0,231365.0,0.432217,6.038690,24942,10,74248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11114,0.000000,0.000000,Virginia,VA,51,Waynesboro,City,,,,,,,,,,,,
11115,0.000000,0.000000,Virginia,VA,51,Williamsburg,City,,,,,,,,,,,,
11116,0.000000,0.000000,Virginia,VA,51,Winchester,City,,,,,,,,,,,,
11117,0.000000,0.000000,Wisconsin,WI,55,Pepin,County,91.0,0.0,WI,-92.001564,44.582933,3.0,7313.0,27.348557,5.208333,762,10.3,70012


### Heart Disease Data

We also include heart disease mortality data from US Department of Health and Human Services https://catalog.data.gov/dataset/heart-disease-mortality-data-among-us-adults-35-by-state-territory-and-county-2019-2021

In [169]:
hd_path = os.path.join(DATA_RAW, 'heart_disease_mortality_geo.csv')
hd = pd.read_csv(hd_path)
hd = hd[['state', 'county_name', 'heart_mort_100k']]
df4 = pd.merge(df3, hd, on =['state', 'county_name'], how = 'left')
df4

Unnamed: 0,pm25_mean,air_quality_index,state,STATEABBRV,STATEFIPS,county_name,COUNTYTYPE,res_countyfips,sex,res_statefips,longitude,latitude,age_cat,pop20,mort_per_100k,walkability_index,poverty_estimate,poverty_perc,median_income,heart_mort_100k
0,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,0.0,AL,-86.642759,32.534932,2.0,58764.0,8.508611,4.826667,7004,11.7,68857,410.9
1,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,0.0,AL,-86.642759,32.534932,3.0,58764.0,25.525832,4.826667,7004,11.7,68857,410.9
2,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,AL,-86.642759,32.534932,2.0,58764.0,6.806889,4.826667,7004,11.7,68857,410.9
3,7.824958,39.653584,Alabama,AL,01,Autauga,County,1.0,1.0,AL,-86.642759,32.534932,3.0,58764.0,15.315499,4.826667,7004,11.7,68857,410.9
4,7.780374,39.967453,Alabama,AL,01,Baldwin,County,3.0,0.0,AL,-87.723352,30.736470,1.0,231365.0,0.432217,6.038690,24942,10,74248,358.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11138,0.000000,0.000000,Virginia,VA,51,Waynesboro,City,,,,,,,,,,,,,
11139,0.000000,0.000000,Virginia,VA,51,Williamsburg,City,,,,,,,,,,,,,
11140,0.000000,0.000000,Virginia,VA,51,Winchester,City,,,,,,,,,,,,,
11141,0.000000,0.000000,Wisconsin,WI,55,Pepin,County,91.0,0.0,WI,-92.001564,44.582933,3.0,7313.0,27.348557,5.208333,762,10.3,70012,311.7


In [170]:
folder_out = os.path.join(DATA_RESULTS, 'gwr_data')
filename = 'full_gwr_ready_data.csv'
path_out = os.path.join(folder_out, filename)
df4.to_csv(path_out, index = False)