In [10]:
import os
import configparser
import tqdm
import warnings
import pandas as pd
import geopandas as gpd
warnings.filterwarnings('ignore')

In [11]:
# Working directory of the notebook; the INI file is looked up here.
BASE_DIR = os.getcwd()

# ConfigParser.read() skips files it cannot find, so a missing
# script_config.ini simply leaves CONFIG empty.
CONFIG = configparser.ConfigParser()
config_file = os.path.join(BASE_DIR, 'script_config.ini')
CONFIG.read(config_file)

# The data tree sits in a `data` folder one level above the working directory.
BASE_PATH = os.path.abspath(os.path.join(BASE_DIR, '..', 'data'))

# Raw inputs live under data/raw; results live in a sibling folder of data/
# (the path is intentionally left un-normalized, i.e. it still contains '..').
DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

We load the daily air quality data downloaded from the EPA: https://www.epa.gov/outdoor-air-quality-data/download-daily-data

In [12]:
# Folder holding the raw air quality CSV downloads.
air_path = os.path.join(DATA_RAW, 'air_quality')

# os.listdir returns every entry of the folder in arbitrary order. Keep only
# the CSV files (a stray .DS_Store or checkpoint folder would otherwise break
# pd.read_csv) and sort them so the load order, and therefore the row order of
# the combined table, is the same on every run.
files = sorted(
    entry for entry in os.listdir(air_path)
    if entry.lower().endswith('.csv')
)

In [13]:
# Columns read from every raw file, the keys that identify one monitoring
# site, and the measures that are averaged per site.
kept_columns = ['Daily Mean PM2.5 Concentration',
                'Units', 'Daily AQI Value', 'State',
                'County', 'Site Latitude', 'Site Longitude']
site_keys = ['State', 'County', 'Units', 'Site Latitude', 'Site Longitude']
mean_of = {'Daily Mean PM2.5 Concentration': 'mean',
           'Daily AQI Value': 'mean'}

# One table of per-site averages is collected for each input file.
dfs = []

for csv_name in files:
    daily = pd.read_csv(os.path.join(air_path, csv_name))[kept_columns]
    # Collapse the daily records into one row per site (mean PM2.5 and mean AQI).
    site_means = daily.groupby(site_keys, as_index=False).agg(mean_of)
    dfs.append(site_means)

Combine air quality data

In [14]:
# Stack the per-file site averages into a single table with a fresh 0..n-1 index.
# NOTE: `dfs` is rebound from a list to a DataFrame here, so the loading cell
# must be re-run before this cell can be executed a second time.
dfs = pd.concat(dfs, ignore_index=True)

# to_csv does not create missing parent folders, so make sure the output
# folder exists before writing (no-op when it is already there).
folder_out = os.path.join(DATA_RESULTS, 'final')
os.makedirs(folder_out, exist_ok=True)

filename = 'air_quality_data.csv'
path_out = os.path.join(folder_out, filename)
dfs.to_csv(path_out, index = False)
dfs

Unnamed: 0,State,County,Units,Site Latitude,Site Longitude,Daily Mean PM2.5 Concentration,Daily AQI Value
0,Minnesota,Anoka,ug/m3 LC,45.137680,-93.207615,5.986638,31.491379
1,Minnesota,Becker,ug/m3 LC,46.851811,-95.846272,5.297953,28.149123
2,Minnesota,Beltrami,ug/m3 LC,47.878159,-95.029172,4.651156,25.300578
3,Minnesota,Carlton,ug/m3 LC,46.713694,-92.511722,1.348406,9.028986
4,Minnesota,Cass,ug/m3 LC,47.384430,-94.601660,5.858908,31.692529
...,...,...,...,...,...,...,...
1271,Montana,Sanders,ug/m3 LC,47.594395,-115.323746,8.119452,39.172603
1272,Montana,Sanders,ug/m3 LC,47.954771,-115.671925,4.029126,17.184466
1273,Montana,Sheridan,ug/m3 LC,48.487054,-104.476346,3.600000,19.337079
1274,Montana,Silver Bow,ug/m3 LC,46.002602,-112.501247,7.982512,34.815271


Next, we merge the air quality data with pulmonary embolism data.

In [6]:
# Read the pulmonary table stored in the `final` results folder.
pul_path = os.path.join(
    os.path.join(DATA_RESULTS, 'final'),
    'pulmonary_full_data.csv',
)
df2 = pd.read_csv(filepath_or_buffer=pul_path)

In [7]:
# Keep the merge keys and the air quality measures, and rename the keys to the
# column names used for the merge below. The rename is chained instead of done
# in place: renaming a column slice in place is the chained-assignment pattern
# pandas warns about (the warning is hidden by the notebook-wide warnings
# filter), and the chained form yields exactly the same frame.
dfs = (
    dfs[['State', 'County', 'Units', 'Daily Mean PM2.5 Concentration', 'Daily AQI Value']]
    .rename(columns={'County': 'county_name', 'State': 'state'})
)

# how='right' keeps every air quality row, including those without a matching
# pulmonary record.
# NOTE(review): `dfs` holds one row per monitoring site, so a county with
# several sites matches the same pulmonary rows several times - confirm that
# this site-level duplication is intended before using the output.
df2 = df2.merge(dfs, on=['state', 'county_name'], how='right')

Since pulmonary embolism is a rare condition, we calculate the mortality rates per 100,000 people.

In [8]:
# Mortality rate per 100,000: mortality_count divided by pop20, scaled by 100000.
# (pop20 is presumably the 2020 population - confirm against the pulmonary data.)
df2['mort_per_100k'] = df2['mortality_count'].div(df2['pop20']).mul(100000)

In [9]:
# Write the merged table into the output folder defined in the air quality
# export cell (`folder_out`), without the DataFrame index.
filename = 'pulmonary_air_quality_data.csv'
path_out = os.path.join(folder_out, filename)
df2.to_csv(path_or_buf=path_out, index=False)