# Cal-CRAI Metric Calculation for: Natural Systems / Air Quality
This notebook calculates one metric, sourced from the Environmental Protection Agency (EPA):

* Percentage of days with air quality rated 'Unhealthy for Sensitive Groups' or worse, per county, between 1980 and 2022

In [1]:
import pandas as pd
import os
import sys
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [None]:
# Fetch the EPA air quality csv from the project's AWS bucket
# (search_zipped=False is passed through to the helper unchanged)
aws_dir = '1_pull_data/natural_systems/ecosystem_condition/epa/'
bucket_name = 'ca-climate-index'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

In [None]:
# read in air quality data (already for state of CA)
air_quality = pd.read_csv('natural_epa_air_quality.csv')
# the data is now held in memory, so the local copy of the csv can be removed right away
os.remove('natural_epa_air_quality.csv')
print(len(air_quality))
# keep the preview as the last expression of the cell so the notebook actually renders it
air_quality.head(5)

In [None]:
# The original dataset repeats a year, so keep only the first row seen for each (Year, County) pair
filtered_air_quality = air_quality[
    ~air_quality.duplicated(subset=['Year', 'County'])
]
filtered_air_quality.head(5)

In [None]:
# Variables of interest: the count of days with an AQI reading plus each of the
# four AQI categories from 'Unhealthy for Sensitive Groups' upwards
columns_to_sum = [
    'Days with AQI',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days',
    'Very Unhealthy Days',
    'Hazardous Days',
]
# Collapse the yearly records (1980-2022) into one row per county by summing each column;
# as_index=False keeps 'County' as a regular column
ca_county_unhealthy_days = filtered_air_quality.groupby('County', as_index=False)[columns_to_sum].sum()
print(len(ca_county_unhealthy_days)) # confirmed, number of counties in CA
ca_county_unhealthy_days.head(5)

In [None]:
# Add up the four unhealthy-or-worse categories row by row to get each county's total;
# skipna=False mirrors plain `+` arithmetic (a missing value stays missing)
ca_county_unhealthy_days['Total_Unhealthy_AQI_Days'] = ca_county_unhealthy_days[
    [
        'Unhealthy for Sensitive Groups Days',
        'Unhealthy Days',
        'Very Unhealthy Days',
        'Hazardous Days',
    ]
].sum(axis=1, skipna=False)
ca_county_unhealthy_days.head(5)

In [None]:
# CRI metric: fraction of days with an AQI reading that were unhealthy or worse
ca_county_unhealthy_days['unhealthy_to_total_AQI_days'] = ca_county_unhealthy_days['Total_Unhealthy_AQI_Days'].div(
    ca_county_unhealthy_days['Days with AQI']
)
# Same ratio expressed as a percent
ca_county_unhealthy_days['percent_unhealthy_days'] = ca_county_unhealthy_days['unhealthy_to_total_AQI_days'].mul(100)
ca_county_unhealthy_days.head(5)

In [8]:
# Load the CA census tract / county lookup file from the project bucket,
# discard the columns that are not needed here, and expose the tract id as 'USCB_GEOID'
ca_tract_county = (
    gpd.read_file("s3://ca-climate-index/0_map_data/ca_tracts_county.csv")
    .drop(columns=['field_1', 'geometry', 'COUNTYFP'])
    .rename(columns={'TRACT': 'USCB_GEOID'})
)

In [None]:
# Display the tract/county lookup table (tract id column renamed to 'USCB_GEOID' in the previous cell)
ca_tract_county

In [None]:
# Attach the county-level metric to every tract; the left join keeps all rows of the tract table
ca_county_unhealthy_days_metric = ca_tract_county.merge(
    ca_county_unhealthy_days,
    how='left',
    on='County',
)
ca_county_unhealthy_days_metric

In [None]:
# Write the tract-level metric table to a local csv (the row index is written as well, pandas' default)
ca_county_unhealthy_days_metric.to_csv(
    path_or_buf='natural_air_quality_metric.csv',
    index=True,
)

### Function call for this metric

In [None]:
@append_metadata
def calc_unhealthy_days(input_csv, export=False, varname=''):
    '''
    Calculates the percentage of air quality days rated 'Unhealthy for Sensitive Groups' or worse per California County
    between 1980-2022. Data is sourced from EPA's Air Quality Index Report: 
    https://www.epa.gov/outdoor-air-quality-data/air-quality-index-report

    Methods
    -------
    Number of days rated at or worse than 'Unhealthy for Sensitive Groups' were summed per county for years 1980-2022.
    Metric is calculated by dividing the number of unhealthy days by the total number of tested days.

    Parameters
    ----------
    input_csv: string
        name of the csv file containing the calculated air quality metric; this is the file uploaded to AWS
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI air quality metric to AWS
        True = will upload resulting df containing CAL CRAI air quality metric to AWS
    varname: string
        name of the metric variable; not used in the function body, presumably read by the
        append_metadata decorator
    
    Script
    ------
    natural_air_quality.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are 
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    # NOTE: the metric itself is computed in the notebook cells; this function only
    # handles the (optional) upload of the already-written csv.
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of file names, so wrap the single csv
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
        # report success only after the upload call has returned
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} not uploaded to AWS.')

    # The local csv is left on disk; the cleanup step that used to follow was disabled.

In [None]:
input_csv = 'natural_air_quality_metric.csv'
varname = 'natural_epa_air_quality'

# pass the variable name defined above instead of the hard-coded 'test' placeholder,
# which left `varname` assigned but never used
calc_unhealthy_days(input_csv, export=True, varname=varname)