# Air Quality

In [1]:
# Import dependencies
import requests
import json
import os
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## EDA

### Data Collection using API

In [2]:
# Load variables from a local .env file (if present), then read the API
# credentials that the download requests send as `key` and `email`.
load_dotenv()
epa_api_key = os.environ.get("EPA_API_KEY")
my_email = os.environ.get("EMAIL")

In [3]:
def get_data(years, param):
    """Download daily AQS records for state code 36, one request per year.

    Args:
        years: Iterable of years; each is rendered with ``str()`` and used to
            build the ``YYYY0101``-``YYYY1231`` date window of one request.
        param: Comma-separated parameter codes, sent as the ``param`` query
            value.

    Returns:
        A DataFrame holding the rows of every year's ``Data`` payload, stacked
        in request order (each year's own row index is kept). An empty
        DataFrame is returned when ``years`` is empty or no rows came back.

    Raises:
        KeyError: If a response carries no ``Data`` field. The message
            includes the response's ``Header`` so the service's own
            explanation is visible.
    """
    url = "https://aqs.epa.gov/data/api/dailyData/byState"
    state = "36"
    # Collect one frame per year and concatenate once at the end
    frames = []
    for year in years:
        # Display year being downloaded
        print(f"Downloading data for {year}")
        # Let requests build and URL-encode the query string
        query = {
            "email": my_email,
            "key": epa_api_key,
            "param": param,
            "bdate": f"{year}0101",
            "edate": f"{year}1231",
            "state": state,
        }
        # Get a response and convert it into json
        response = requests.get(url, params=query).json()
        if "Data" not in response:
            # Surface whatever the service said instead of a bare KeyError
            header = response.get("Header") if isinstance(response, dict) else response
            raise KeyError(
                f"'Data' missing from the API response for {year}; Header: {header}"
            )
        frame = pd.DataFrame(response["Data"])
        # Years without rows contribute nothing to the result
        if not frame.empty:
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [4]:
# Download settings: the years to request (newest first, as strings) and the
# comma-separated parameter codes for each of the two download batches.
years = [str(year) for year in range(2024, 2012, -1)]
pm_parameters = ','.join(['88502', '88101', '81102'])
pollutant_parameters = ','.join(['42401', '42101', '42602', '44201'])

In [5]:
# Get PM data: one get_data call covering every entry in `years` for the
# parameter codes listed in `pm_parameters`
pm_df = get_data(years, pm_parameters)

Downloading data for 2024


KeyError: 'Data'

In [None]:
# Get Pollutant data: one get_data call covering every entry in `years` for
# the parameter codes listed in `pollutant_parameters`
pollutant_df = get_data(years, pollutant_parameters)

: 

In [None]:
# Stack the PM and pollutant frames into one dataframe with a fresh 0..n-1 index
combined_df = pd.concat([pm_df, pollutant_df], ignore_index=True)

: 

In [None]:
# View the first rows of the combined dataframe
combined_df.head()

: 

In [None]:
# View the shape (rows, columns) of the combined dataframe
combined_df.shape

: 

In [None]:
# Count the missing values in each column
combined_df.isna().sum()

: 

In [None]:
# Check the data type of each column
combined_df.dtypes

: 

### Clean the Data

In [None]:
# Columns kept for the rest of the analysis; every other column is discarded
relevant_columns = [
    'county_code', 'parameter_code', 'parameter', 'latitude', 'longitude',
    'sample_duration_code', 'pollutant_standard', 'date_local',
    'units_of_measure', 'observation_count', 'validity_indicator',
    'arithmetic_mean', 'first_max_value', 'first_max_hour', 'aqi',
    'county', 'city',
]
combined_df = combined_df[relevant_columns]
# Preview the first rows of the trimmed dataframe
combined_df.head()

: 

In [None]:
# Count the missing values per column after the column filter
combined_df.isna().sum()

: 

In [None]:
# Collect the distinct values of the 'county' column, in order of first appearance
unique_counties = pd.unique(combined_df['county'])
# Display them
unique_counties

: 

In [None]:
# Drop the counties that are not required
# County names (as they appear in the 'county' column) to exclude
names_to_drop = [
    'Monroe', 'Erie', 'Hamilton', 'St. Lawrence', 'Essex', 'Steuben',
    'Albany', 'Chautauqua', 'Dutchess', 'Putnam', 'Onondaga',
    'Herkimer', 'Tompkins', 'Seneca', 'Franklin', 'Rockland',
    'Westchester', 'Oneida', 'Orange', 'Jefferson',
    'Niagara', 'Oswego', 'Saratoga', 'Wayne',
]

# Keep only the rows whose 'county' is not in the exclusion list. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
combined_df = combined_df[~combined_df['county'].isin(names_to_drop)].copy()

# Preview the DataFrame after dropping rows
combined_df.head()

: 

In [None]:
# Convert date_local to datetime, then derive a 'year' column from it.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# in-place column assignment raises on a filtered frame (copying only the
# Series being converted does not prevent that warning).
combined_df = combined_df.assign(
    date_local=pd.to_datetime(combined_df['date_local']),
    # Evaluated after date_local above, so it sees the converted column
    year=lambda frame: frame['date_local'].dt.year,
)
combined_df

: 

In [None]:
# Use date_local as the row index
combined_df = combined_df.set_index('date_local')

: 

In [None]:
# Fill the missing AQI values with a sequence of imputation steps.
# The frame is indexed by date_local here; the time-based interpolation and
# the grouping in step 3 both rely on that.

# 1. Forward fill, then backward fill, in the frame's current row order.
#    Series.ffill()/bfill() replace fillna(method=...), which pandas has deprecated.
combined_df['aqi'] = combined_df['aqi'].ffill().bfill()

# NOTE(review): step 1 runs over all rows in their current order (not per
# county or parameter) and leaves gaps only when the whole column is missing,
# so steps 2-4 act purely as fallbacks - confirm that this is intended.

# 2. Interpolation:
#    For any remaining missing values, interpolate along the datetime index.
combined_df['aqi'] = combined_df['aqi'].interpolate(method='time')

# 3. Mean imputation by category:
#    Fill what is still missing with the mean of its ('county', 'date_local') group.
combined_df['aqi'] = combined_df.groupby(['county','date_local'])['aqi'].transform(lambda x: x.fillna(x.mean()))

# 4. Overall mean imputation:
#    As a last resort, fill any remaining missing values with the overall mean.
combined_df['aqi'] = combined_df['aqi'].fillna(combined_df['aqi'].mean())

# Report the missing values that remain now that imputation has run
print("Missing values after imputation:")
print(combined_df.isnull().sum())


: 

In [None]:
# 'pollutant_standard' is removed: the team decided it is irrelevant to the modelling dataset
combined_df = combined_df.drop(columns='pollutant_standard')
# Re-check the missing-value counts per column
combined_df.isnull().sum()

: 

In [None]:
# Move date_local from the index back to a regular column
combined_df = combined_df.reset_index()

: 

In [None]:
# Remove further columns the team agreed are irrelevant to the modelling dataset
irrelevant_columns = [
    'parameter', 'sample_duration_code', 'units_of_measure', 'validity_indicator',
    'first_max_value', 'first_max_hour', 'county', 'city',
]
combined_df = combined_df.drop(columns=irrelevant_columns)
# Re-check the missing-value counts per column
combined_df.isnull().sum()

: 

In [None]:
## Preprocess the data for modelling

: 

In [None]:
# Print a summary of the dataframe before the preprocessing steps below
combined_df.info()

: 

In [None]:
# Re-encode date_local as int64 so the model receives a purely numeric column
date_as_datetime = combined_df['date_local'].astype('datetime64[ns]')
combined_df['date_local'] = date_as_datetime.astype(np.int64)
combined_df.head()

: 

In [None]:
# Get the target variable: the 'aqi' column
y = combined_df['aqi']

: 

In [None]:
# Features: every column other than the 'aqi' target.
# drop() returns a new dataframe, so combined_df itself is left untouched.
X = combined_df.drop(columns=['aqi'])

: 

In [None]:
## Split the Data into Training and Testing Sets

: 

In [None]:
# Hold out a test set with a fixed seed so the split is repeatable.
# test_size=0.25 spells out scikit-learn's default when no size is given.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

: 

In [None]:
# Fit a random forest on the training split. The seed is pinned (matching the
# random_state=42 used for the train/test split) so that repeated runs produce
# the same model and therefore the same metrics.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)

: 

In [None]:
# Predict on the held-out features
predicted = regr.predict(X_test)

# Score the predictions against the held-out targets with MSE and R2
mse, r2 = mean_squared_error(y_test, predicted), r2_score(y_test, predicted)

# Report the scores, one per line
print(
    "All Features:",
    f"mean squared error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

: 

: 