# Cal-CRAI Metric Calculation for: Natural Systems / Air Quality
This notebook calculates one metric sourced from the Environmental Protection Agency (EPA):

* Percentage of days with an air quality index (AQI) value that were classified as "unhealthy for sensitive groups" or worse, per county, between 1980 and 2022

In [1]:
import pandas as pd
import os
import sys

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws

In [None]:
# Fetch the raw EPA air quality csv from the project's AWS bucket
aws_dir = '1_pull_data/natural_systems/ecosystem_condition/epa/'
bucket_name = 'ca-climate-index'

# search_zipped=False: passed through unchanged to the project helper
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    search_zipped=False,
)

In [None]:
# read in air quality data (already for state of CA)
air_quality = pd.read_csv('natural_epa_air_quality.csv')

# the data now lives in memory, so remove the local copy to keep the directory clean
os.remove('natural_epa_air_quality.csv')

print(len(air_quality))
# the preview must be the cell's last expression, otherwise the notebook does not display it
air_quality.head(5)

In [None]:
# The original dataset repeats a year, so keep a single row per (Year, County) pair
duplicate_keys = ['Year', 'County']
filtered_air_quality = air_quality.drop_duplicates(subset=duplicate_keys)
filtered_air_quality.head(5)

In [None]:
# Day-count columns that get totalled for every county
columns_to_sum = [
    'Days with AQI',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days',
    'Very Unhealthy Days',
    'Hazardous Days',
]

# One group per county; summing collapses every year in the dataset (1980-2022) into one row
rows_by_county = filtered_air_quality.groupby('County')
ca_county_unhealthy_days = rows_by_county[columns_to_sum].sum().reset_index()

print(len(ca_county_unhealthy_days)) # confirmed, number of counties in CA
ca_county_unhealthy_days.head(5)

In [None]:
# Total unhealthy days = sum of the four "unhealthy or worse" AQI categories
sensitive_group_days = ca_county_unhealthy_days['Unhealthy for Sensitive Groups Days']
unhealthy_days = ca_county_unhealthy_days['Unhealthy Days']
very_unhealthy_days = ca_county_unhealthy_days['Very Unhealthy Days']
hazardous_days = ca_county_unhealthy_days['Hazardous Days']

ca_county_unhealthy_days['Total Unhealthy AQI Days'] = (
    sensitive_group_days + unhealthy_days + very_unhealthy_days + hazardous_days
)
ca_county_unhealthy_days.head(5)

In [None]:
# CRI metric: fraction of AQI-monitored days that were unhealthy, per county
unhealthy_day_total = ca_county_unhealthy_days['Total Unhealthy AQI Days']
monitored_day_total = ca_county_unhealthy_days['Days with AQI']
ca_county_unhealthy_days['CRI Metric'] = unhealthy_day_total / monitored_day_total

# Same value expressed as a percentage
ca_county_unhealthy_days['CRI Metric Percentage'] = ca_county_unhealthy_days['CRI Metric'] * 100
ca_county_unhealthy_days.head(5)

In [None]:
# Write the metric table to a local .csv so it can be uploaded in the next step
metric_csv_name = 'natural_air_quality_metric.csv'
ca_county_unhealthy_days.to_csv(metric_csv_name)

In [None]:
# Send the finished metric csv to AWS, then delete the local copy
directory = '3_fair_data/index_data'
bucket_name = 'ca-climate-index'
file_name = ['natural_air_quality_metric.csv']  # passed to the upload helper as a list

#@append_metadata
upload_csv_aws(file_name, bucket_name, directory)

# local file is no longer needed once it has been uploaded
os.remove(file_name[0])

Example of a function definition and function call for this metric

In [2]:
def calc_unhealthy_days(df, export=False, export_filename=None):
    '''
    Calculate, per county, the share of days with an AQI value that were
    classified as unhealthy for sensitive groups or worse.

    Steps:
    1. Drop rows that duplicate a ('Year', 'County') pair.
    2. Sum the day-count columns per county across all remaining years.
    3. Add the four unhealthy categories into 'Total Unhealthy AQI Days'.
    4. Divide by 'Days with AQI' to get 'CRI Metric' (fraction) and
       'CRI Metric Percentage' (fraction * 100).

    Parameters
    ----------
    df : pandas.DataFrame
        Air quality data containing the columns 'Year', 'County',
        'Days with AQI', 'Unhealthy for Sensitive Groups Days',
        'Unhealthy Days', 'Very Unhealthy Days' and 'Hazardous Days'.
    export : bool, optional
        If True, write the result to `export_filename`, upload it to the
        'ca-climate-index' bucket under '3_fair_data/index_data' via
        `upload_csv_aws`, then delete the local export file and the local
        'natural_epa_air_quality.csv' input file (if it exists).
        Default is False (nothing is written, uploaded or deleted).
    export_filename : str, optional
        Name of the csv file to write and upload. Required when export is True.

    Returns
    -------
    pandas.DataFrame
        One row per county with the summed day counts,
        'Total Unhealthy AQI Days', 'CRI Metric' and 'CRI Metric Percentage'.

    Raises
    ------
    ValueError
        If export is True but no export_filename is given.
    '''
    # Fail before doing any work rather than midway through the export
    if export and export_filename is None:
        raise ValueError('export_filename must be provided when export=True')

    # Drop duplicate year
    print('Metadata statement - duplicate year dropped')
    # Use the dataframe passed in by the caller, not a notebook-level global
    filtered_air_quality = df.drop_duplicates(subset=['Year', 'County'])

    # Create df that holds desired data variables
    unhealthy_columns = [
        'Unhealthy for Sensitive Groups Days',
        'Unhealthy Days',
        'Very Unhealthy Days',
        'Hazardous Days',
    ]
    columns_to_sum = ['Days with AQI'] + unhealthy_columns

    # Group data by county and sum desired columns for the temporal range of the dataset (1980-2022)
    ca_county_unhealthy_days = filtered_air_quality.groupby('County')[columns_to_sum].sum().reset_index()

    # Create new column counting total unhealthy air quality days
    ca_county_unhealthy_days['Total Unhealthy AQI Days'] = (
        ca_county_unhealthy_days['Unhealthy for Sensitive Groups Days'] +
        ca_county_unhealthy_days['Unhealthy Days'] +
        ca_county_unhealthy_days['Very Unhealthy Days'] +
        ca_county_unhealthy_days['Hazardous Days'])

    # Calculate CRI metric
    ca_county_unhealthy_days['CRI Metric'] = (
        ca_county_unhealthy_days['Total Unhealthy AQI Days'] / ca_county_unhealthy_days['Days with AQI']
    )
    ca_county_unhealthy_days['CRI Metric Percentage'] = ca_county_unhealthy_days['CRI Metric'] * 100 # as a percent
    print('Metadata statement - metric calculated by summing the number of total unhealthy AQI days and dividing by total days with an AQI value')
    print('Metadata statement - Total unhealthy AQI days include "unhealthy for sensitive groups", "unhealthy", "very unhealthy", and "hazardous" classified days')

    # export to csv and upload to AWS
    if export:
        ca_county_unhealthy_days.to_csv(export_filename) # 'natural_air_quality_metric.csv'
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # the upload helper is called with a list of file names
        upload_csv_aws([export_filename], bucket_name, directory)

        # remove from local to clear up directory; the raw input file may
        # already have been deleted by the caller, so only remove it if present
        raw_input_csv = 'natural_epa_air_quality.csv'
        if os.path.exists(raw_input_csv):
            os.remove(raw_input_csv)
        os.remove(export_filename)

    return ca_county_unhealthy_days # returns df

In [3]:
# Re-download the raw EPA csv from AWS, since the function call below works from a local copy
aws_dir = '1_pull_data/natural_systems/ecosystem_condition/epa/'
bucket_name = 'ca-climate-index'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

# Load the downloaded file into memory
air_quality = pd.read_csv('natural_epa_air_quality.csv')

# Compute the metric, export it to AWS, and display the returned dataframe
calc_unhealthy_days(
    air_quality,
    export=True,
    export_filename='natural_air_quality_metric.csv',
)

Saved DataFrame as 'natural_epa_air_quality.csv'
Metadata statement - duplicate year dropped
Metadata statement - metric calculated by summing the number of total unhealthy AQI days and dividing by total days with an AQI value
Metadata statement - Total unhealthy AQI days include "unhealthy for sensitive groups", "unhealthy", "very unhealthy", and "hazardous" classified days
natural_air_quality_metric.csv uploaded to AWS


Unnamed: 0,County,Days with AQI,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Total Unhealthy AQI Days,CRI Metric,CRI Metric Percentage
0,Alameda,15585,529,168,31,0,728,0.046712,4.671158
1,Alpine,168,14,8,2,0,24,0.142857,14.285714
2,Amador,10942,614,125,10,0,749,0.068452,6.845184
3,Butte,15510,915,144,10,5,1074,0.069246,6.924565
4,Calaveras,10242,720,205,13,1,939,0.091681,9.168131
5,Colusa,12903,481,104,3,0,588,0.045571,4.55708
6,Contra Costa,15585,512,157,10,0,679,0.043568,4.356753
7,Del Norte,7044,5,6,2,0,13,0.001846,0.184554
8,El Dorado,15506,1496,481,57,1,2035,0.13124,13.123952
9,Fresno,15584,3319,2057,473,0,5849,0.375321,37.532084
