# Group 4 Project 2

In [88]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import json
import time

In [89]:
# Set environment variables from the .env in the local environment
load_dotenv()
epa_api_key = os.getenv("EPA_API_KEY")
email = os.getenv("EMAIL")

### API call for entire NY state  

Data for Counties:  
103 Suffolk (LI),  
005 Bronx,  
061 NYC,  
081 Queens  
085 Richmond  

Parameter Codes used:

42101 - Carbon monoxide (has data in four counties)  
44201 - Ozone (has data in all above five counties)  
88101 - PM2.5 (has data in all above five counties)  

81102 - PM10 (Bronx 117, Queens 58)  
42401 - Sulfer Dioxide (Bronx 3649, Queens 1822)  
42602 - Nitrogen Dioxide (Queens 1454, Bronx 1450, Suffolk 730)  

# Data Extract using API calls
Get Daily Summary Data by State  
API Source for Daily AQI data from EPA - https://aqs.epa.gov/aqsweb/documents/data_api.html#daily   

In [243]:
# Pick specific columns from larger dataset
selected_columns = ['county_code', 'parameter_code', 'latitude', 'longitude', 'sample_duration_code', 
                    'pollutant_standard','date_local','units_of_measure', 'observation_count', 
                    'validity_indicator', 'arithmetic_mean','first_max_value','first_max_hour', 'aqi', 'county', 'city']

In [244]:
# Set environment variables from the .env in the local environment
load_dotenv()
epa_api_key = os.getenv("EPA_API_KEY")
email = os.getenv("EMAIL")

In [245]:
# Build URL
state = "36"  # New York State code

url = "https://aqs.epa.gov/data/api/dailyData/byState?"
query_url = (
    f"{url}email={email}&key={epa_api_key}&param={param}&bdate={begin_date}&edate={end_date}"
    + f"&state={state}" )

# Years list
years_list = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
# List of parameter codes
parameter_codes = [42101, 44201, 88101] #, 88101, 81102, 42401, 42602]  

# List of counties
five_counties = ['103', '005', '061', '081', '085']

In [254]:
# Function to make API call and return selected columns as DataFrame
def get_data_for_parameter(parameter_code):
    # Build URL
    query_url = (f"{url}email={email}&key={epa_api_key}&param={parameter_code}&bdate={begin_date}&edate={end_date}"
               + f"&state={state}" )

    # API Call to get data for just one Parameter & for one year
    param_data = requests.get(query_url).json()
    #Add 2 second wait step
    time.sleep(2)
    #param_data.raise_for_status()  # Raise an error for bad status codes
    json.dumps(param_data, indent=4)
    # Convert data into DataFrame
    param_df = pd.json_normalize(param_data["Data"])
    # Sort by Dates
    param_df.sort_values(by='date_local',inplace=True)

    
    # Five counties dataframe
    five_counties_param_df = param_df[param_df['county_code'].isin(five_counties)]

    # Return data frame that has only selected columns
    five_counties_param_df = five_counties_param_df[selected_columns]

    print(begin_date, end_date, parameter_code)

    return five_counties_param_df

In [255]:
# Loop for years list (2013 - 2024)
for yrs_index, year in enumerate (years_list):
    
    #Build begin date and end date values
    begin_date = str(year) + "0101"
    end_date = str(year) + "1231"
    
    # Loop through the parameter codes, get data, and merge
    for param_index, code in enumerate (parameter_codes):

        # Call function to get data using API
        all_counties_param_df = get_data_for_parameter(code)  
        all_counties_param_df = all_counties_param_df.dropna()
        all_counties_param_df.sort_index(ascending=True, inplace=True)
        # Compress multiple rows of data for same day using groupby function
        counties_aggr_param_df = all_counties_param_df.groupby('date_local').agg({
                                   'county_code': 'first',
                                   'parameter_code': 'first', 
                                   'latitude': 'first',
                                   'longitude': 'first',
                                   'sample_duration_code': 'first',
                                   'pollutant_standard': 'first',
                                   'units_of_measure': 'first',
                                   'observation_count': 'sum',
                                   'validity_indicator': 'first',
                                   'arithmetic_mean': 'mean',
                                   'first_max_value': 'mean',
                                   'first_max_hour': 'mean',
                                   'aqi': 'mean',
                                   'county': 'first',
                                   'city': 'first'
                            }).reset_index()

        # Set Indexes
        counties_aggr_param_df = counties_aggr_param_df.set_index(["county_code","date_local"])

        # Dataframe for all parameters & for all specified counties
        if param_index == 0:
            counties_param_joined_df = counties_aggr_param_df 
        else:
            counties_param_joined_df = counties_param_joined_df.join(counties_aggr_param_df, on=["county_code","date_local"], how="outer",rsuffix=code)

    # Dataframe for All years & parameters
    if yrs_index == 0:          
        all_yrs_counties_params_df = counties_param_joined_df
    else:   
        all_yrs_counties_params_df = pd.concat([all_yrs_counties_params_df, counties_param_joined_df], axis='rows', join="outer")

all_yrs_counties_params_df.sort_index(ascending=True, inplace=True)
all_yrs_counties_params_df.head()

20130101 20131231 42101
20130101 20131231 44201
20130101 20131231 88101
20140101 20141231 42101
20140101 20141231 44201
20140101 20141231 88101
20150101 20151231 42101
20150101 20151231 44201
20150101 20151231 88101
20160101 20161231 42101
20160101 20161231 44201
20160101 20161231 88101
20170101 20171231 42101
20170101 20171231 44201
20170101 20171231 88101
20180101 20181231 42101
20180101 20181231 44201
20180101 20181231 88101
20190101 20191231 42101
20190101 20191231 44201
20190101 20191231 88101
20200101 20201231 42101
20200101 20201231 44201
20200101 20201231 88101
20210101 20211231 42101
20210101 20211231 44201
20210101 20211231 88101
20220101 20221231 42101
20220101 20221231 44201
20220101 20221231 88101
20230101 20231231 42101
20230101 20231231 44201
20230101 20231231 88101
20240101 20241231 42101
20240101 20241231 44201
20240101 20241231 88101


Unnamed: 0_level_0,Unnamed: 1_level_0,parameter_code,latitude,longitude,sample_duration_code,pollutant_standard,units_of_measure,observation_count,validity_indicator,arithmetic_mean,first_max_value,...,pollutant_standard88101,units_of_measure88101,observation_count88101,validity_indicator88101,arithmetic_mean88101,first_max_value88101,first_max_hour88101,aqi88101,county88101,city88101
county_code,date_local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5,2013-01-01,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),72.0,Y,9.922222,9.922222,0.0,52.0,Bronx,New York
5,2013-01-04,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),72.0,Y,16.088889,16.088889,0.0,63.888889,Bronx,New York
5,2013-01-07,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),72.0,Y,12.1,12.1,0.0,56.555556,Bronx,New York
5,2013-01-10,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),72.0,Y,6.177778,6.177778,0.0,34.444444,Bronx,New York
5,2013-01-13,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),72.0,Y,13.255556,13.255556,0.0,56.444444,Bronx,New York


In [256]:
all_yrs_counties_params_df.reset_index(inplace=True)
all_yrs_counties_params_df.columns

Index(['county_code', 'date_local', 'parameter_code', 'latitude', 'longitude',
       'sample_duration_code', 'pollutant_standard', 'units_of_measure',
       'observation_count', 'validity_indicator', 'arithmetic_mean',
       'first_max_value', 'first_max_hour', 'aqi', 'county', 'city',
       'parameter_code44201', 'latitude44201', 'longitude44201',
       'sample_duration_code44201', 'pollutant_standard44201',
       'units_of_measure44201', 'observation_count44201',
       'validity_indicator44201', 'arithmetic_mean44201',
       'first_max_value44201', 'first_max_hour44201', 'aqi44201',
       'county44201', 'city44201', 'parameter_code88101', 'latitude88101',
       'longitude88101', 'sample_duration_code88101',
       'pollutant_standard88101', 'units_of_measure88101',
       'observation_count88101', 'validity_indicator88101',
       'arithmetic_mean88101', 'first_max_value88101', 'first_max_hour88101',
       'aqi88101', 'county88101', 'city88101'],
      dtype='object')

In [257]:
all_yrs_counties_params_df.shape

(7145, 44)

In [258]:
# Add new column for Max AQI value
all_yrs_counties_params_df['aqi_max'] = all_yrs_counties_params_df[['aqi', 'aqi44201', 'aqi88101']].max(axis=1)

In [259]:
all_yrs_counties_params_df.head()

Unnamed: 0,county_code,date_local,parameter_code,latitude,longitude,sample_duration_code,pollutant_standard,units_of_measure,observation_count,validity_indicator,...,units_of_measure88101,observation_count88101,validity_indicator88101,arithmetic_mean88101,first_max_value88101,first_max_hour88101,aqi88101,county88101,city88101,aqi_max
0,5,2013-01-01,,,,,,,,,...,Micrograms/cubic meter (LC),72.0,Y,9.922222,9.922222,0.0,52.0,Bronx,New York,52.0
1,5,2013-01-04,,,,,,,,,...,Micrograms/cubic meter (LC),72.0,Y,16.088889,16.088889,0.0,63.888889,Bronx,New York,63.888889
2,5,2013-01-07,,,,,,,,,...,Micrograms/cubic meter (LC),72.0,Y,12.1,12.1,0.0,56.555556,Bronx,New York,56.555556
3,5,2013-01-10,,,,,,,,,...,Micrograms/cubic meter (LC),72.0,Y,6.177778,6.177778,0.0,34.444444,Bronx,New York,34.444444
4,5,2013-01-13,,,,,,,,,...,Micrograms/cubic meter (LC),72.0,Y,13.255556,13.255556,0.0,56.444444,Bronx,New York,56.444444


In [260]:
all_yrs_counties_params_df.shape

(7145, 45)

In [261]:
all_yrs_counties_params_df.to_csv("Resources/five_counties_aqi_data_2013_to_2024.csv", index=False)