In [1]:
import os
import configparser
import tqdm
import pandas as pd
import geopandas as gpd

In [2]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

We access the air quality data downloaded from https://www.epa.gov/outdoor-air-quality-data/download-daily-data

In [4]:
air_path = os.path.join(DATA_RAW, 'air_quality')
files = os.listdir(air_path)

In [24]:
dfs = []

for file in files:
    
    state_path = os.path.join(air_path, file)
    df = pd.read_csv(state_path)
    df = df[['Daily Mean PM2.5 Concentration', 
             'Units', 'Daily AQI Value', 'State', 
             'County', 'Site Latitude', 'Site Longitude']]
    grouped_df = (
        df.groupby(['State', 'County', 'Units', 'Site Latitude', 'Site Longitude'], as_index=False)
          .agg({'Daily Mean PM2.5 Concentration': 'mean',
              'Daily AQI Value': 'mean'}))
    dfs.append(grouped_df)

Combine air quality data

In [25]:
dfs = pd.concat(dfs, ignore_index=True)
folder_out = os.path.join(DATA_RESULTS, 'final')

filename = 'air_quality_data.csv'
path_out = os.path.join(folder_out, filename)
dfs.to_csv(path_out, index = False)
dfs

Unnamed: 0,State,County,Units,Site Latitude,Site Longitude,Daily Mean PM2.5 Concentration,Daily AQI Value
0,Minnesota,Anoka,ug/m3 LC,45.137680,-93.207615,5.986638,31.491379
1,Minnesota,Becker,ug/m3 LC,46.851811,-95.846272,5.297953,28.149123
2,Minnesota,Beltrami,ug/m3 LC,47.878159,-95.029172,4.651156,25.300578
3,Minnesota,Carlton,ug/m3 LC,46.713694,-92.511722,1.348406,9.028986
4,Minnesota,Cass,ug/m3 LC,47.384430,-94.601660,5.858908,31.692529
...,...,...,...,...,...,...,...
1260,Montana,Sanders,ug/m3 LC,47.594395,-115.323746,8.119452,39.172603
1261,Montana,Sanders,ug/m3 LC,47.954771,-115.671925,4.029126,17.184466
1262,Montana,Sheridan,ug/m3 LC,48.487054,-104.476346,3.600000,19.337079
1263,Montana,Silver Bow,ug/m3 LC,46.002602,-112.501247,7.982512,34.815271


In [29]:
pul_path = os.path.join(DATA_RESULTS, 'final', 'pulmonary_full_data.csv')
df2 = pd.read_csv(pul_path)

Unnamed: 0,fips_new,res_statefips,geometry,county_name,state_fips,res_countyfips,sex,race_recode3,age_cat,mortality_count
0,6093,CA,POINT (-2167649.2049579737 2370072.9761748626),Alameda,6,1,F,Black,30 - 49 years,10
1,6093,CA,POINT (-2167649.2049579737 2370072.9761748626),Alameda,6,1,F,Black,70 years or above,23
2,6093,CA,POINT (-2167649.2049579737 2370072.9761748626),Alameda,6,1,F,Non-White/Black,10 - 29 years,1
3,6093,CA,POINT (-2167649.2049579737 2370072.9761748626),Alameda,6,1,F,Non-White/Black,30 - 49 years,3
4,6093,CA,POINT (-2167649.2049579737 2370072.9761748626),Alameda,6,1,F,Non-White/Black,70 years or above,5
...,...,...,...,...,...,...,...,...,...,...
1179092,12087,FL,POINT (1514598.4786629865 385973.2843747351),Washington,12,133,F,Black,10 - 29 years,1
1179093,12087,FL,POINT (1514598.4786629865 385973.2843747351),Washington,12,133,F,White,70 years or above,3
1179094,12087,FL,POINT (1514598.4786629865 385973.2843747351),Washington,12,133,M,Black,30 - 49 years,1
1179095,12087,FL,POINT (1514598.4786629865 385973.2843747351),Washington,12,133,M,White,30 - 49 years,1


In [32]:
dfs = dfs[['County', 'Units', 'Daily Mean PM2.5 Concentration', 'Daily AQI Value']]
dfs.rename(columns = {'County': 'county_name'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs.rename(columns = {'County': 'county_name'}, inplace = True)


In [33]:
df2 = df2.merge(dfs, on = "county_name", how = 'inner')
filename = 'pulmonary_air_quality_data.csv'
path_out = os.path.join(folder_out, filename)
df2.to_csv(path_out, index = False)