**Description:**

We collect data from the EPA AQS API for the years 2002 through 2022. From each JSON response we keep the records whose pollutant standard is the one with the longest available exposure, extract their 'arithmetic_mean' values, and average them for each state and year. The resulting dataset is used for our data analysis and predictive modeling efforts.

This dataset is sourced from the United States Environmental Protection Agency: [https://www.epa.gov/outdoor-air-quality-data]

Duration: 2002 - 2022

This dataset provides estimates of outdoor concentrations for five pollutants (four gases: O3, CO, SO2, NO2; one aerosol: PM2.5) throughout the contiguous U.S.

In [2]:
import pandas as pd
import requests
from collections import defaultdict

In [3]:
def get_data(years):
    """Fetch yearly per-state pollutant averages from the EPA AQS API.

    For every year in ``years`` and every entry of the module-level
    ``states`` mapping, one request per entry of ``param_codes`` is sent to
    ``base_url``. Records whose ``pollutant_standard`` appears in the
    module-level ``pollutant_standard`` list contribute their
    ``arithmetic_mean`` to a plain average, which is stored under the
    pollutant's name. One dict per (year, state) pair is appended to the
    module-level ``results`` list.

    Args:
        years: Iterable of four-digit year strings, e.g. ``["2002", "2003"]``.

    Returns:
        None. Output is accumulated in the module-level ``results`` list.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If a 200 response body is not valid JSON.
    """
    # Without a timeout a single stalled connection blocks the whole run.
    request_timeout_seconds = 120
    allowed_standards = set(pollutant_standard)

    for year in years:
        for state_name, state_code in states.items():
            state_avg_values = {"Year": year, "State": state_name}

            for param_code, param_name in param_codes.items():
                # Let requests build (and URL-encode) the query string.
                query = {
                    "email": email,
                    "key": key,
                    "param": param_code,
                    "bdate": f"{year}0101",
                    "edate": f"{year}1231",
                    "state": state_code,
                }
                response = requests.get(
                    base_url, params=query, timeout=request_timeout_seconds
                )

                if response.status_code != 200:
                    # Keep the best-effort behaviour (pollutant left unset for
                    # this row) but make the gap visible instead of silent.
                    print(
                        f"Skipping {param_name} for {state_name} {year}: "
                        f"HTTP {response.status_code}"
                    )
                    continue

                # A payload without a "Data" key is treated as "no records".
                records = response.json().get("Data") or []
                arith_mean_values = [
                    record["arithmetic_mean"]
                    for record in records
                    if record.get("pollutant_standard") in allowed_standards
                    # Null means would make sum() raise TypeError.
                    and record.get("arithmetic_mean") is not None
                ]

                state_avg_values[param_name] = (
                    sum(arith_mean_values) / len(arith_mean_values)
                    if arith_mean_values
                    else None
                )

            results.append(state_avg_values)

In [4]:
# Endpoint returning annual summary records for one state and one parameter.
base_url = "https://aqs.epa.gov/data/api/annualData/byState"

# NOTE(review): credentials are committed in plain text. Consider loading them
# from the environment and rotating this key.
email = "ayushee.tbmun@gmail.com"
key = "cobaltheron43"

# Parameter code sent to the API -> column name used in the output.
param_codes = {
    "42101": "CO",
    "42602": "NO2",
    "42401": "SO2",
    "44201": "O3",
    "88101": "PM2.5",
}

# State name -> two-digit FIPS state code expected by the API.
states = {
    "Alabama": "01",
    "Arizona": "04",
    "Arkansas": "05",
    "California": "06",
    "Colorado": "08",
    "Connecticut": "09",
    "Delaware": "10",
    # Was "12" (Florida's code), which made the DC rows a copy of Florida's.
    "District of Columbia": "11",
    "Florida": "12",
    "Georgia": "13",
    "Idaho": "16",
    "Illinois": "17",
    "Indiana": "18",
    "Iowa": "19",
    "Kansas": "20",
    "Kentucky": "21",
    "Louisiana": "22",
    "Maine": "23",
    "Maryland": "24",
    "Massachusetts": "25",
    "Michigan": "26",
    "Minnesota": "27",
    "Mississippi": "28",
    "Missouri": "29",
    "Montana": "30",
    "Nebraska": "31",
    "Nevada": "32",
    "New Hampshire": "33",
    "New Jersey": "34",
    "New Mexico": "35",
    "New York": "36",
    "North Carolina": "37",
    "North Dakota": "38",
    "Ohio": "39",
    "Oklahoma": "40",
    "Oregon": "41",
    "Pennsylvania": "42",
    "Rhode Island": "44",
    "South Carolina": "45",
    "South Dakota": "46",
    "Tennessee": "47",
    "Texas": "48",
    "Utah": "49",
    "Vermont": "50",
    "Virginia": "51",
    "Washington": "53",
    "West Virginia": "54",
    "Wisconsin": "55",
    "Wyoming": "56",
}

# Only records reported under one of these standards are averaged.
pollutant_standard = ["CO 8-hour 1971", "Ozone 8-Hour 1997", "SO2 24-hour 1971","PM25 24-hour 2012", "NO2 Annual 1971"]

average_results = defaultdict(dict)

# Accumulates one dict per (year, state) across successive get_data() calls.
results = []

In [5]:
# Batch 1: 2002 through 2006 (the range end is exclusive).
years = list(map(str, range(2002, 2007)))
get_data(years)

In [7]:
# Batch 2: 2007 through 2011 (the range end is exclusive).
years = list(map(str, range(2007, 2012)))
get_data(years)

In [9]:
# Batch 3: 2012 through 2016 (the range end is exclusive).
years = list(map(str, range(2012, 2017)))
get_data(years)

In [11]:
# Batch 4: 2017 through 2021 (the range end is exclusive).
years = list(map(str, range(2017, 2022)))
get_data(years)

In [None]:
# Final batch: the single remaining year, 2022.
get_data(years=["2022"])

In [12]:
# Build the frame directly from the accumulated row dicts. The explicit column
# list fixes the column order, and any pollutant missing from a row becomes NaN.
# This avoids DataFrame._append, which is a private pandas method.
df = pd.DataFrame(results, columns=["Year", "State"] + list(param_codes.values()))
df.to_csv("pollution_data.csv", index=False)