# Group 4 Project 2

In [27]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import json
import time

### API call for entire NY state  

Data for Counties:  
103 Suffolk (LI),  
005 Bronx,  
061 New York (Manhattan),  
081 Queens,  
085 Richmond  

Parameter Codes used:

42101 - Carbon monoxide (has data in four counties)  
44201 - Ozone (has data in all above five counties)  
88101 - PM2.5 (has data in all above five counties)  

81102 - PM10 (Bronx 117, Queens 58)  
42401 - Sulfur Dioxide (Bronx 3649, Queens 1822)  
42602 - Nitrogen Dioxide (Queens 1454, Bronx 1450, Suffolk 730)  

# Data Extract using API calls
Get Daily Summary Data by State  
API Source for Daily AQI data from EPA - https://aqs.epa.gov/aqsweb/documents/data_api.html#daily   

In [43]:
# Set environment variables from the .env in the local environment
load_dotenv()
epa_api_key = os.getenv("EPA_API_KEY")
email = os.getenv("EMAIL")

# Fail fast: os.getenv returns None for an unset variable, and that None would
# otherwise be sent to the API as the literal text "None".
missing_env_vars = [
    name
    for name, value in (("EPA_API_KEY", epa_api_key), ("EMAIL", email))
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

In [147]:
# Query settings shared by every API request in this notebook

# Daily-summary-by-state endpoint; it ends with "?" so a query string can be appended directly
url = "https://aqs.epa.gov/data/api/dailyData/byState?"

# New York State code
state = "36"

# One entry per calendar year, 2013 through 2024 inclusive
years_list = [*range(2013, 2025)]

# Parameter codes to download: 42101 carbon monoxide, 44201 ozone, 88101 PM2.5
# (81102, 42401 and 42602 are described in the notes above but are not requested here)
parameter_codes = [42101, 44201, 88101]

# County codes kept after each download
# NOTE(review): 047 and 059 are not described in the notes above - confirm they are intended
county_list = ['005', '047', '059', '061', '081', '085', '103']

In [148]:
# Columns kept from each API response; every other field is discarded.
# The order here is the column order of the frames returned by the download step.
selected_columns = [
    # where / what was measured
    'county_code',
    'parameter_code',
    'latitude',
    'longitude',
    'sample_duration_code',
    'pollutant_standard',
    # when, and in which units
    'date_local',
    'units_of_measure',
    # measurement summary
    'observation_count',
    'validity_indicator',
    'arithmetic_mean',
    'first_max_value',
    'first_max_hour',
    'aqi',
    # human-readable location
    'county',
    'city',
]

In [149]:
# Function to make API call and return selected columns as DataFrame
def get_data_for_parameter(parameter_code):
    """Download one parameter's daily rows for the configured state and date window.

    Reads the notebook-level names ``url``, ``email``, ``epa_api_key``,
    ``state``, ``begin_date``, ``end_date``, ``county_list`` and
    ``selected_columns``. ``begin_date`` and ``end_date`` must be assigned
    before this function is called.

    Parameters
    ----------
    parameter_code : int or str
        Value sent as the ``param`` query argument.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``county_code`` is in ``county_list``, ordered by
        ``date_local`` and restricted to ``selected_columns``. When the
        response has no ``Data`` rows, an empty frame with those columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    KeyError
        If a non-empty response lacks one of the expected columns.
    """
    # Let requests build and URL-encode the query string; ``url`` already ends
    # with "?", which requests handles when appending parameters.
    query_params = {
        "email": email,
        "key": epa_api_key,
        "param": parameter_code,
        "bdate": begin_date,
        "edate": end_date,
        "state": state,
    }

    # API call to get data for just one parameter and one date window.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=query_params, timeout=120)
    # Add 2 second wait step between consecutive requests
    time.sleep(2)
    # Surface HTTP failures here instead of failing later on a missing "Data" key
    response.raise_for_status()

    records = response.json().get("Data") or []

    if records:
        # Convert data into DataFrame, sorted by date
        param_df = pd.json_normalize(records).sort_values(by='date_local')
        # Keep only the configured counties and the selected columns
        in_selected_counties = param_df['county_code'].isin(county_list)
        five_counties_param_df = param_df.loc[in_selected_counties, selected_columns]
    else:
        # No rows came back: return an empty frame with the expected columns
        # rather than raising KeyError when sorting a column-less frame.
        five_counties_param_df = pd.DataFrame(columns=selected_columns)

    print(begin_date, end_date, parameter_code)
    print(five_counties_param_df['county_code'].value_counts())

    return five_counties_param_df

In [150]:
# Download every (year, parameter) combination and combine into one frame
# indexed by (county_code, date_local).

# How rows sharing the same county and date are collapsed into one record:
# descriptive fields keep the first row's value, observation counts are summed,
# and the numeric measurements are averaged.
daily_aggregations = {
    'parameter_code': 'first',
    'latitude': 'first',
    'longitude': 'first',
    'sample_duration_code': 'first',
    'pollutant_standard': 'first',
    'units_of_measure': 'first',
    'observation_count': 'sum',
    'validity_indicator': 'first',
    'arithmetic_mean': 'mean',
    'first_max_value': 'mean',
    'first_max_hour': 'mean',
    'aqi': 'mean',
    'county': 'first',
    'city': 'first',
}

# One joined frame per year; concatenated once after the loop instead of
# re-copying the accumulated frame on every iteration.
yearly_frames = []

# Loop for years list (2013 - 2024)
for year in years_list:

    # Build begin date and end date values.
    # These stay notebook-level names because get_data_for_parameter reads them.
    begin_date = f"{year}0101"
    end_date = f"{year}1231"

    # Loop through the parameter codes, get data, and merge
    counties_param_joined_df = None
    for code in parameter_codes:

        # Call function to get data using API.
        # NOTE(review): dropna() removes a row when ANY selected column is
        # missing - confirm that is the intended filter.
        # sort_index() restores the original row order so that 'first' picks
        # the same row as before.
        all_counties_param_df = get_data_for_parameter(code).dropna().sort_index()

        # One record per county per day
        counties_aggr_param_df = (
            all_counties_param_df
            .groupby(['county_code', 'date_local'])
            .agg(daily_aggregations)
        )

        # The first parameter keeps the plain column names; each later parameter's
        # columns get the parameter code appended (e.g. 'aqi44201').
        if counties_param_joined_df is None:
            counties_param_joined_df = counties_aggr_param_df
        else:
            counties_param_joined_df = counties_param_joined_df.join(
                counties_aggr_param_df, how="outer", rsuffix=str(code)
            )

    yearly_frames.append(counties_param_joined_df)

# Dataframe for all years & parameters, sorted ascending by (county_code, date_local)
all_yrs_counties_params_df = pd.concat(yearly_frames, axis='rows', join="outer").sort_index()

all_yrs_counties_params_df.head()

20130101 20131231 42101
county_code
081    724
005    722
061    712
Name: count, dtype: int64
20130101 20131231 44201
county_code
103    3861
005    2904
061    1452
081    1440
085    1263
Name: count, dtype: int64
20130101 20131231 88101
county_code
061    3568
081    2664
005    1936
103     936
047     912
085     832
Name: count, dtype: int64
20140101 20141231 42101
county_code
081    728
005    712
061    706
Name: count, dtype: int64
20140101 20141231 44201
county_code
103    3963
005    2910
081    1438
061    1379
085    1022
Name: count, dtype: int64
20140101 20141231 88101
county_code
081    5793
005    4605
061    3584
103     912
047     888
085     824
Name: count, dtype: int64
20150101 20151231 42101
county_code
081    730
005    730
061    712
Name: count, dtype: int64
20150101 20151231 44201
county_code
103    3672
005    2920
081    1459
061    1453
085    1012
Name: count, dtype: int64
20150101 20151231 88101
county_code
081    6029
005    5165
061    3712
085     9

Unnamed: 0_level_0,Unnamed: 1_level_0,parameter_code,latitude,longitude,sample_duration_code,pollutant_standard,units_of_measure,observation_count,validity_indicator,arithmetic_mean,first_max_value,...,pollutant_standard88101,units_of_measure88101,observation_count88101,validity_indicator88101,arithmetic_mean88101,first_max_value88101,first_max_hour88101,aqi88101,county88101,city88101
county_code,date_local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5,2013-01-01,42101,40.8679,-73.87809,Z,CO 8-hour 1971,Parts per million,19.0,Y,0.547368,0.6,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),16.0,Y,9.9,9.9,0.0,52.0,Bronx,New York
5,2013-01-02,42101,40.8679,-73.87809,Z,CO 8-hour 1971,Parts per million,24.0,Y,0.416667,0.5,...,,,,,,,,,,
5,2013-01-03,42101,40.8679,-73.87809,Z,CO 8-hour 1971,Parts per million,24.0,Y,0.495833,0.6,...,,,,,,,,,,
5,2013-01-04,42101,40.8679,-73.87809,Z,CO 8-hour 1971,Parts per million,24.0,Y,0.575,0.6,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),16.0,Y,16.5,16.5,0.0,64.5,Bronx,New York
5,2013-01-05,42101,40.8679,-73.87809,Z,CO 8-hour 1971,Parts per million,24.0,Y,0.633333,0.8,...,,,,,,,,,,


In [152]:
# Show the last three rows of the index-sorted frame
all_yrs_counties_params_df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,parameter_code,latitude,longitude,sample_duration_code,pollutant_standard,units_of_measure,observation_count,validity_indicator,arithmetic_mean,first_max_value,...,pollutant_standard88101,units_of_measure88101,observation_count88101,validity_indicator88101,arithmetic_mean88101,first_max_value88101,first_max_hour88101,aqi88101,county88101,city88101
county_code,date_local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
103,2024-03-29,,,,,,,,,,,...,,,,,,,,,,
103,2024-03-30,,,,,,,,,,,...,,,,,,,,,,
103,2024-03-31,,,,,,,,,,,...,PM25 24-hour 2006,Micrograms/cubic meter (LC),8.0,Y,4.4,4.4,0.0,24.0,Suffolk,East Farmingdale


In [153]:
# Reset Index: turn the (county_code, date_local) index levels into ordinary columns.
# Guarded so that re-running this cell is a no-op; an unguarded second run would
# insert a spurious 'index' column that ends up in the exported CSV.
if isinstance(all_yrs_counties_params_df.index, pd.MultiIndex):
    all_yrs_counties_params_df = all_yrs_counties_params_df.reset_index()

In [154]:
# Dataframe dimensions (rows, columns) after the index reset
all_yrs_counties_params_df.shape

(21020, 44)

In [155]:
# Add new column for Max AQI value: the row-wise maximum over the three
# per-parameter AQI columns (missing values are skipped by max()).
all_yrs_counties_params_df['aqi_max'] = (
    all_yrs_counties_params_df
    .loc[:, ['aqi', 'aqi44201', 'aqi88101']]
    .max(axis='columns')
)

In [156]:
# List all column names, including the parameter-code-suffixed ones and aqi_max
all_yrs_counties_params_df.columns

Index(['county_code', 'date_local', 'parameter_code', 'latitude', 'longitude',
       'sample_duration_code', 'pollutant_standard', 'units_of_measure',
       'observation_count', 'validity_indicator', 'arithmetic_mean',
       'first_max_value', 'first_max_hour', 'aqi', 'county', 'city',
       'parameter_code44201', 'latitude44201', 'longitude44201',
       'sample_duration_code44201', 'pollutant_standard44201',
       'units_of_measure44201', 'observation_count44201',
       'validity_indicator44201', 'arithmetic_mean44201',
       'first_max_value44201', 'first_max_hour44201', 'aqi44201',
       'county44201', 'city44201', 'parameter_code88101', 'latitude88101',
       'longitude88101', 'sample_duration_code88101',
       'pollutant_standard88101', 'units_of_measure88101',
       'observation_count88101', 'validity_indicator88101',
       'arithmetic_mean88101', 'first_max_value88101', 'first_max_hour88101',
       'aqi88101', 'county88101', 'city88101', 'aqi_max'],
      dtype='ob

In [157]:
# Spot-check aqi_max against its three source columns on rows 1 through 9 (by position)
aqi_values = all_yrs_counties_params_df.iloc[1:10][
    ['county_code', 'date_local', 'aqi', 'aqi44201', 'aqi88101', 'aqi_max']
]
aqi_values

Unnamed: 0,county_code,date_local,aqi,aqi44201,aqi88101,aqi_max
1,5,2013-01-02,6.0,20.5,,20.5
2,5,2013-01-03,7.0,19.0,,19.0
3,5,2013-01-04,7.0,19.0,64.5,64.5
4,5,2013-01-05,9.0,25.0,,25.0
5,5,2013-01-06,9.0,18.0,,18.0
6,5,2013-01-07,9.0,23.5,57.0,57.0
7,5,2013-01-08,10.0,9.5,,10.0
8,5,2013-01-09,15.0,17.5,,17.5
9,5,2013-01-10,5.0,21.5,35.5,35.5


In [158]:
# Dataframe dimensions (rows, columns) after adding the aqi_max column
all_yrs_counties_params_df.shape

(21020, 45)

In [159]:
# List total records by county: number of rows (one per county and date) for each county_code
print(all_yrs_counties_params_df['county_code'].value_counts())

county_code
005    4109
081    4109
061    4098
103    4064
085    3190
047    1450
Name: count, dtype: int64


In [160]:
# Count missing values per column.
# These counts are per parameter, not per row: every field appears once per
# parameter (unsuffixed, ...44201, ...88101), so a row that is blank for one
# parameter may still hold values in the matching columns of another.
all_yrs_counties_params_df.isna().sum()

county_code                     0
date_local                      0
parameter_code               7842
latitude                     7842
longitude                    7842
sample_duration_code         7842
pollutant_standard           7842
units_of_measure             7842
observation_count            7842
validity_indicator           7842
arithmetic_mean              7842
first_max_value              7842
first_max_hour               7842
aqi                          7842
county                       7842
city                         7842
parameter_code44201          2003
latitude44201                2003
longitude44201               2003
sample_duration_code44201    2003
pollutant_standard44201      2003
units_of_measure44201        2003
observation_count44201       2003
validity_indicator44201      2003
arithmetic_mean44201         2003
first_max_value44201         2003
first_max_hour44201          2003
aqi44201                     2003
county44201                  2003
city44201     

In [161]:
# Write output file to the Resources folder (relative to the working directory)
output_csv_path = "Resources/selected_counties_features_42101_44201_88101_aqi_data_2013_to_2024.csv"
# Create the folder if it is missing; to_csv does not create parent directories
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
all_yrs_counties_params_df.to_csv(output_csv_path, index=False)