# Cal-CRAI Metric Calculation for: Natural Systems / Air Quality
This notebook calculates one metric, sourced from the Environmental Protection Agency (EPA):

* Percentage of monitored days per county with air quality rated 'Unhealthy for Sensitive Groups' or worse, between 1980 and 2022

In [1]:
import pandas as pd
import os
import sys
import geopandas as gpd

# suppress pandas PerformanceWarning messages (informational only)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# Download the EPA air quality csv from the project's AWS bucket;
# the file is saved locally so the next cell can read it
aws_dir = '1_pull_data/natural_systems/ecosystem_condition/epa/'
bucket_name = 'ca-climate-index'
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    search_zipped=False,
)

Saved DataFrame as 'natural_epa_air_quality.csv'


In [3]:
# read in air quality data (already for state of CA)
air_quality = pd.read_csv('natural_epa_air_quality.csv')
# the local copy is no longer needed once the data is in memory
os.remove('natural_epa_air_quality.csv')
print(len(air_quality))
# keep the preview as the last expression so the notebook actually displays it
air_quality.head(5)

1337


In [4]:
# The original dataset repeats a year, so keep only the first row
# seen for each (Year, County) pair
is_repeat_row = air_quality.duplicated(subset=['Year', 'County'], keep='first')
filtered_air_quality = air_quality[~is_repeat_row]
filtered_air_quality.head(5)

Unnamed: 0.1,Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,0,California,Alameda,1999,365,258,81,19,5,2,0,212,87,41,4,142,166,52,1
1,1,California,Amador,1999,365,212,89,53,10,1,0,202,122,45,6,0,359,0,0
2,2,California,Butte,1999,365,237,98,23,7,0,0,187,97,44,7,84,223,29,22
3,3,California,Calaveras,1999,365,196,104,49,15,1,0,201,129,48,1,0,349,15,0
4,4,California,Colusa,1999,365,235,110,20,0,0,0,150,87,44,0,0,283,31,51


In [5]:
# Columns to total per county: days with an AQI reading plus each unhealthy category
columns_to_sum = [
    'Days with AQI',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days',
    'Very Unhealthy Days',
    'Hazardous Days',
]
# Total each column per county across every year present in the dataset
ca_county_unhealthy_days = (
    filtered_air_quality
    .groupby('County', as_index=False)[columns_to_sum]
    .sum()
)
# Number of counties with AQI records. The recorded run printed 57, while
# California has 58 counties -- verify which county is absent from the source data.
print(len(ca_county_unhealthy_days))
ca_county_unhealthy_days.head(5)

57


Unnamed: 0,County,Days with AQI,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days
0,Alameda,8979,256,64,9,0
1,Alpine,168,14,8,2,0
2,Amador,8840,350,50,4,0
3,Butte,8942,644,126,10,5
4,Calaveras,8902,511,114,5,1


In [6]:
# Add the four unhealthy AQI categories together, row by row, to get the
# total count of unhealthy air quality days for each county
unhealthy_day_columns = [
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days',
    'Very Unhealthy Days',
    'Hazardous Days',
]
ca_county_unhealthy_days['Total_Unhealthy_AQI_Days'] = (
    ca_county_unhealthy_days[unhealthy_day_columns].sum(axis=1)
)
ca_county_unhealthy_days.head(5)

Unnamed: 0,County,Days with AQI,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Total_Unhealthy_AQI_Days
0,Alameda,8979,256,64,9,0,329
1,Alpine,168,14,8,2,0,24
2,Amador,8840,350,50,4,0,404
3,Butte,8942,644,126,10,5,785
4,Calaveras,8902,511,114,5,1,631


In [7]:
# Calculate CRI metric: unhealthy days as a share of all days with an AQI reading
unhealthy_share = ca_county_unhealthy_days['Total_Unhealthy_AQI_Days'].div(
    ca_county_unhealthy_days['Days with AQI']
)
ca_county_unhealthy_days['unhealthy_to_total_AQI_days'] = unhealthy_share
# same ratio expressed as a percent
ca_county_unhealthy_days['percent_unhealthy_days'] = (
    ca_county_unhealthy_days['unhealthy_to_total_AQI_days'].mul(100)
)
ca_county_unhealthy_days.head(5)

Unnamed: 0,County,Days with AQI,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Total_Unhealthy_AQI_Days,unhealthy_to_total_AQI_days,percent_unhealthy_days
0,Alameda,8979,256,64,9,0,329,0.036641,3.664105
1,Alpine,168,14,8,2,0,24,0.142857,14.285714
2,Amador,8840,350,50,4,0,404,0.045701,4.570136
3,Butte,8942,644,126,10,5,785,0.087788,8.778797
4,Calaveras,8902,511,114,5,1,631,0.070883,7.088295


In [8]:
# Read the CA census tract -> county lookup table, drop the columns that are
# not needed here, and rename the tract identifier column to USCB_GEOID
tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = (
    gpd.read_file(tract_county_path)
    .drop(columns=['field_1', 'geometry', 'COUNTYFP'])
    .rename(columns={'TRACT': 'USCB_GEOID'})
)

In [9]:
# display the tract-to-county lookup table
ca_tract_county

Unnamed: 0,USCB_GEOID,County
0,06085504321,Santa Clara
1,06085504410,Santa Clara
2,06085507003,Santa Clara
3,06085507004,Santa Clara
4,06085502204,Santa Clara
...,...,...
9124,06059001303,Orange
9125,06059001304,Orange
9126,06059001401,Orange
9127,06013367200,Contra Costa


In [10]:
# Attach each county's air quality values to every census tract in that county.
# The left join keeps every tract, including any whose county has no AQI records.
ca_county_unhealthy_days_metric = ca_tract_county.merge(
    ca_county_unhealthy_days,
    how='left',
    on='County',
)
ca_county_unhealthy_days_metric

Unnamed: 0,USCB_GEOID,County,Days with AQI,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Total_Unhealthy_AQI_Days,unhealthy_to_total_AQI_days,percent_unhealthy_days
0,06085504321,Santa Clara,8978.0,293.0,82.0,0.0,0.0,375.0,0.041769,4.176877
1,06085504410,Santa Clara,8978.0,293.0,82.0,0.0,0.0,375.0,0.041769,4.176877
2,06085507003,Santa Clara,8978.0,293.0,82.0,0.0,0.0,375.0,0.041769,4.176877
3,06085507004,Santa Clara,8978.0,293.0,82.0,0.0,0.0,375.0,0.041769,4.176877
4,06085502204,Santa Clara,8978.0,293.0,82.0,0.0,0.0,375.0,0.041769,4.176877
...,...,...,...,...,...,...,...,...,...,...
9124,06059001303,Orange,8948.0,624.0,111.0,5.0,1.0,741.0,0.082812,8.281180
9125,06059001304,Orange,8948.0,624.0,111.0,5.0,1.0,741.0,0.082812,8.281180
9126,06059001401,Orange,8948.0,624.0,111.0,5.0,1.0,741.0,0.082812,8.281180
9127,06013367200,Contra Costa,8979.0,238.0,58.0,4.0,0.0,300.0,0.033411,3.341129


In [None]:
# Saving metric df to .csv file; index=False keeps the positional row index
# from being written as an extra unnamed column
ca_county_unhealthy_days_metric.to_csv('natural_air_quality_metric.csv', index=False)

### Function call for this metric

In [None]:
@append_metadata
def calc_unhealthy_days(input_csv, export=False, varname=''):
    '''
    Uploads the csv file containing the percentage of air quality days rated at or worse than
    'Unhealthy for Sensitive Groups' per California County between 1980-2022.
    Data is sourced from EPA's Air Quality Index Report: 
    https://www.epa.gov/outdoor-air-quality-data/air-quality-index-report

    Methods
    -------
    Number of days rated at or worse than 'Unhealthy for Sensitive Groups' were summed per county for years 1980-2022.
    Metric is calculated by dividing the number of unhealthy days by the total number of tested days.

    Parameters
    ----------
    input_csv: string
        name of the local csv file containing the calculated air quality metric;
        the same name is passed to upload_csv_aws as the file to upload
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI air quality metric to AWS
        True = will upload resulting df containing CAL CRAI air quality metric to AWS
    varname: string
        not used in the function body; presumably consumed by the append_metadata
        decorator to label the metadata entry (TODO confirm against write_metadata)

    Script
    ------
    natural_air_quality.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are 
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of file names
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
        # report the upload only after it has actually been requested
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} was not uploaded to AWS (export=False).')

    # the local csv file is not removed by this function

In [None]:
input_csv = 'natural_air_quality_metric.csv'
varname = 'natural_epa_air_quality'

# pass the variable name defined above instead of a placeholder value
calc_unhealthy_days(input_csv, export=True, varname=varname)