# AQI prediction with historical pollutants concentration values

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays


## Read Datasets

In [2]:
# Read 9 datasets using a for loop
#Theres 3 locations, 2015 to 2023
locations = [7188,7191,8720]
start_years = [2015,2018,2020]
end_years = [2018,2020,2023]

for location, start_year, end_year in zip(locations, start_years, end_years):
    dataset_name = f"braga_{location}_{start_year}_{end_year}.csv"
    globals()[f"braga_{location}"] = pd.read_csv("OpenAQ/" + dataset_name, index_col=0)


## Proccess

In [3]:
# Concatenate all datasets into one
braga_air_data = pd.concat([braga_7188, braga_7191, braga_8720], axis=0)

# Drop duplicates
braga_air_data = braga_air_data.drop_duplicates()

In [4]:
# remove location column
braga_air_data = braga_air_data.drop(columns=["location"])

In [5]:
# check if utl column and local columns are the same
braga_air_data["utc"].equals(braga_air_data["local"])

False

In [6]:
# drop local column
braga_air_data = braga_air_data.drop(columns=["local"])

In [7]:
# drop unit column
braga_air_data = braga_air_data.drop(columns=["unit"])

In [8]:
# create column for each parameter
parameters = braga_air_data['parameter'].unique()

for param in parameters:
    braga_air_data[param] = None

# Preencher os valores nas colunas correspondentes
for index, row in braga_air_data.iterrows():
    parameter = row['parameter']
    value = row['value']
    braga_air_data.at[index, parameter] = value

# drop parameter column
braga_air_data = braga_air_data.drop(columns=["parameter"])

# drop value column
braga_air_data = braga_air_data.drop(columns=["value"])

# check head
braga_air_data.head()

Unnamed: 0_level_0,city,country,utc,latitude,longitude,pm10,no2,o3
locationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7188,Braga,PT,2017-12-29T04:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,Braga,PT,2017-12-29T04:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,Braga,PT,2017-12-29T03:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,Braga,PT,2017-12-29T03:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,Braga,PT,2017-12-29T02:00:00+00:00,41.549722,-8.405833,9.4,67.2,


In [9]:
# drop country column
braga_air_data = braga_air_data.drop(columns=["country"])
# drop cty column
braga_air_data = braga_air_data.drop(columns=["city"])

In [10]:
# check head
braga_air_data.head()

Unnamed: 0_level_0,utc,latitude,longitude,pm10,no2,o3
locationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7188,2017-12-29T04:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,2017-12-29T04:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,2017-12-29T03:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,2017-12-29T03:00:00+00:00,41.549722,-8.405833,9.4,67.2,
7188,2017-12-29T02:00:00+00:00,41.549722,-8.405833,9.4,67.2,


In [11]:
# Calcutate AQI for each row using columns pm10, no2, o3
braga_air_data['AQI'] = None

In [12]:
# calculate AQI for each row
def calculateAQI(data):

    # define breakpoints
    breakpoints = [
        {
            "pollutant": "no2",
            "conc": [0, 50, 100, 200, 400, 1000],
            "aqi": [0, 50, 100, 150, 200, 300, 400],
        },
        {
            "pollutant": "pm10",
            "conc": [0, 20, 40, 70, 100, 200],
            "aqi": [0, 50, 100, 150, 200, 300, 400],
        },
        {
            "pollutant": "o3",
            "conc": [0, 54, 70, 85, 105, 200],
            "aqi": [0, 50, 100, 150, 200, 300, 400],
        },
    ]

    # calculate AQI for each pollutant
    def aqiForConcentration(pollutant, conc):
        bp = next(bp for bp in breakpoints if bp["pollutant"] == pollutant)
        i = next(i for i, c in enumerate(bp["conc"]) if c > conc) - 1
        cLow = bp["conc"][i]
        cHigh = bp["conc"][i + 1]
        aqiLow = bp["aqi"][i]
        aqiHigh = bp["aqi"][i + 1]
        return round(((aqiHigh - aqiLow) / (cHigh - cLow)) * (conc - cLow) + aqiLow)

    # calculate AQI for each row
    for index, row in data.iterrows():
        aqi = None
        for pollutant in ["no2", "pm10", "o3"]:
            if row[pollutant] is not None:
                aqi = aqiForConcentration(pollutant, row[pollutant])
                break
        data.at[index, "AQI"] = aqi
    
    return data

In [13]:
braga_air_data = calculateAQI(braga_air_data)

In [14]:
# drop columns no2, pm10, o3
braga_air_data = braga_air_data.drop(columns=["no2", "pm10", "o3"])
braga_air_data.head()

Unnamed: 0_level_0,utc,latitude,longitude,AQI
locationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7188,2017-12-29T04:00:00+00:00,41.549722,-8.405833,67
7188,2017-12-29T04:00:00+00:00,41.549722,-8.405833,67
7188,2017-12-29T03:00:00+00:00,41.549722,-8.405833,67
7188,2017-12-29T03:00:00+00:00,41.549722,-8.405833,67
7188,2017-12-29T02:00:00+00:00,41.549722,-8.405833,67


In [16]:
# transform the utc column to datetime
braga_air_data.utc = pd.to_datetime(braga_air_data.utc)

# extract the year, month, day, hour, weekday, season, part of the day, is_weekend, is_holiday (holidays.CountryHolidays("PT")) from the utc column
braga_air_data['year'] = braga_air_data.utc.dt.year
braga_air_data['month'] = braga_air_data.utc.dt.month
braga_air_data['day'] = braga_air_data.utc.dt.day
braga_air_data['hour'] = braga_air_data.utc.dt.hour
braga_air_data['weekday'] = braga_air_data.utc.dt.weekday
braga_air_data['season'] = braga_air_data.utc.dt.month.apply(lambda x: 'winter' if x in [12, 1, 2] else 'spring' if x in [3, 4, 5] else 'summer' if x in [6, 7, 8] else 'fall')
braga_air_data['part_of_day'] = braga_air_data.utc.dt.hour.apply(lambda x: 'morning' if x in [6, 7, 8, 9, 10, 11] else 'afternoon' if x in [12, 13, 14, 15, 16, 17] else 'evening' if x in [18, 19, 20, 21, 22, 23] else 'night')
braga_air_data['is_weekend'] = braga_air_data.utc.dt.weekday.apply(lambda x: 1 if x in [5, 6] else 0)
braga_air_data['is_holiday'] = braga_air_data.utc.apply(lambda x: 1 if x in holidays.CountryHoliday('PT') else 0)

# convert seasons, part_of_day to numerical values
braga_air_data.season = braga_air_data.season.apply(lambda x: 1 if x == 'winter' else 2 if x == 'spring' else 3 if x == 'summer' else 4)
braga_air_data.part_of_day = braga_air_data.part_of_day.apply(lambda x: 1 if x == 'morning' else 2 if x == 'afternoon' else 3 if x == 'evening' else 4)

# drop the utc column
braga_air_data.drop('utc', axis=1, inplace=True)


AttributeError: 'DataFrame' object has no attribute 'utc'

## Export

In [None]:
braga_air_data.to_csv("OpenAQ/braga_air_data.csv")