In [118]:
# Load libraries, run before everything else
library(tidyverse)
library(repr)
library(tidymodels)
install.packages("kknn")
install.packages("con2aqi")
library(con2aqi)
library(zoo) # for moving averages

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [3]:
# Fetch the weather + pollution records for Beijing's Aotizhongxin station
# and keep a local copy of the CSV next to the notebook.
download.file(
    url = "https://raw.githubusercontent.com/DonkeyBlaster/dsci-100-2023w1-group43/main/PRSA_Data_Aotizhongxin_20130301-20170228.csv",
    destfile = "Aotizhongxin_data.csv"
)
# Two columns are dropped right away because they carry no information:
#   station - just says "Aotizhongxin"
#   No      - a continuously increasing counter
air_quality_data <- select(read_csv("Aotizhongxin_data.csv"), -c(station, No))
head(air_quality_data, n = 3)
tail(air_quality_data, n = 3)

[1mRows: [22m[34m35064[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): wd, station
[32mdbl[39m (16): No, year, month, day, hour, PM2.5, PM10, SO2, NO2, CO, O3, TEMP, P...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
2013,3,1,0,4,4,4,7,300,77,-0.7,1023.0,-18.8,0,NNW,4.4
2013,3,1,1,8,8,4,7,300,77,-1.1,1023.2,-18.2,0,N,4.7
2013,3,1,2,7,7,5,10,300,73,-1.1,1023.5,-18.2,0,NNW,5.6


year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
2017,2,28,21,16,37,10,66,700,58,10.8,1014.2,-13.3,0,NW,1.1
2017,2,28,22,21,44,12,87,700,35,10.5,1014.4,-12.9,0,NNW,1.2
2017,2,28,23,19,31,10,79,600,42,8.6,1014.1,-15.9,0,NNE,1.3


In [4]:
# AQI cannot be calculated from NA values, so keep only fully observed rows.
air_quality_data <- na.omit(air_quality_data)

In [5]:
# Convert the gas readings to mixing ratios (ppb for SO2/NO2, ppm for CO/O3)
# using the molar volume of an ideal gas at each row's temperature and pressure.
# NOTE(review): presumably the SO2/NO2/CO/O3 columns are mass concentrations
# (ug/m^3) - confirm against the dataset documentation.
R <- 0.082057366080960  # Gas constant for litres, atmospheres, kelvin, mols.
SO2_molecular_weight <- 64.07  # g/mol
NO2_molecular_weight <- 46.01  # g/mol
CO_molecular_weight <- 28.01  # g/mol
O3_molecular_weight <- 48.00  # g/mol
air_quality_data <- air_quality_data |>
    mutate(
        # PV = nRT rearranged to V = RT/P with n = 1.
        # TEMP is shifted to Kelvin and PRES is scaled to atmospheres.
        # NOTE(review): the exact constants are 273.15 and 1013.25; the rounded
        # 273.2 and 1013 are kept here so existing results do not change.
        volume = R * (273.2 + TEMP) / (PRES / 1013),
        so2_ppb = volume * SO2 / SO2_molecular_weight,
        no2_ppb = volume * NO2 / NO2_molecular_weight,
        # Divide by 1000 to go from ppb to ppm.
        co_ppm = volume * CO / CO_molecular_weight / 1000,
        o3_ppm = volume * O3 / O3_molecular_weight / 1000
    )
head(air_quality_data, 3)
tail(air_quality_data, 3)

year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,⋯,PRES,DEWP,RAIN,wd,WSPM,volume,so2_ppb,no2_ppb,co_ppm,o3_ppm
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2013,3,1,0,4,4,4,7,300,77,⋯,1023.0,-18.8,0,NNW,4.4,22.14205,1.382366,3.368711,0.2371516,0.03551954
2013,3,1,1,8,8,4,7,300,77,⋯,1023.2,-18.2,0,N,4.7,22.10523,1.380067,3.363108,0.2367572,0.03546047
2013,3,1,2,7,7,5,10,300,73,⋯,1023.5,-18.2,0,NNW,5.6,22.09875,1.724579,4.803032,0.2366878,0.03360852


year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,⋯,PRES,DEWP,RAIN,wd,WSPM,volume,so2_ppb,no2_ppb,co_ppm,o3_ppm
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2017,2,28,21,16,37,10,66,700,58,⋯,1014.2,-13.3,0,NW,1.1,23.27672,3.633014,33.38977,0.5817102,0.02812603
2017,2,28,22,21,44,12,87,700,35,⋯,1014.4,-12.9,0,NNW,1.2,23.24755,4.354152,43.95863,0.5809812,0.01695134
2017,2,28,23,19,31,10,79,600,42,⋯,1014.1,-15.9,0,NNE,1.3,23.09868,3.605226,39.66086,0.4947951,0.02021135


In [6]:
# Trailing moving averages: each value is the mean of the current row and the
# rows before it (24 rows for PM2.5/PM10, 8 rows for CO/O3). Rows that do not
# yet have a full window behind them are filled with NA.
# For o3 specifically, con2aqi allows us to choose 1 or 8 hours. We're using
# 8 hours as the 1-hour window does not allow for reporting of AQI values
# less than 101.
air_quality_data <- mutate(
    air_quality_data,
    pm2.5_24hour = zoo::rollmeanr(PM2.5, k = 24, fill = NA),
    pm10_24hour = zoo::rollmeanr(PM10, k = 24, fill = NA),
    co_8hour = zoo::rollmeanr(co_ppm, k = 8, fill = NA),
    o3_8hour = zoo::rollmeanr(o3_ppm, k = 8, fill = NA)
)
head(air_quality_data, n = 26)

year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,⋯,WSPM,volume,so2_ppb,no2_ppb,co_ppm,o3_ppm,pm2.5_24hour,pm10_24hour,co_8hour,o3_8hour
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2013,3,1,0,4,4,4,7,300,77,⋯,4.4,22.14205,1.382366,3.368711,0.2371516,0.03551954,,,,
2013,3,1,1,8,8,4,7,300,77,⋯,4.7,22.10523,1.380067,3.363108,0.2367572,0.03546047,,,,
2013,3,1,2,7,7,5,10,300,73,⋯,5.6,22.09875,1.724579,4.803032,0.2366878,0.03360852,,,,
2013,3,1,3,6,6,11,11,300,72,⋯,3.1,22.05284,3.786191,5.272359,0.2361961,0.03307926,,,,
2013,3,1,4,3,3,12,12,300,72,⋯,2.0,21.98913,4.118458,5.735049,0.2355137,0.0329837,,,,
2013,3,1,5,5,5,18,18,400,66,⋯,3.7,21.96435,6.170723,8.592876,0.3136644,0.03020098,,,,
2013,3,1,6,3,3,18,32,500,50,⋯,2.5,21.9127,6.156213,15.240303,0.3911585,0.02282573,,,,
2013,3,1,7,3,6,19,41,500,43,⋯,3.8,21.97441,6.516526,19.58163,0.3922601,0.01968541,,,0.2849237,0.03042045
2013,3,1,8,3,6,16,43,500,45,⋯,4.1,22.0926,5.517115,20.647289,0.3943699,0.02071181,,,0.3045759,0.02856948
2013,3,1,9,3,8,12,28,400,59,⋯,2.6,22.17721,4.153683,13.496235,0.3167041,0.02725948,,,0.3145693,0.02754436


In [7]:
# Slow cell: con2aqi is evaluated once per pollutant column.
air_quality_data <- mutate(
    # Rows containing NA are removed before any AQI is computed.
    na.omit(air_quality_data),
    pm2.5_aqi = con2aqi(pollutant = "pm25", con = pm2.5_24hour),
    pm10_aqi = con2aqi(pollutant = "pm10", con = pm10_24hour),
    so2_aqi = con2aqi(pollutant = "so2", con = so2_ppb),
    no2_aqi = con2aqi(pollutant = "no2", con = no2_ppb),
    co_aqi = con2aqi(pollutant = "co", con = co_8hour),
    # Ozone uses the 8-hour breakpoints to match the o3_8hour average.
    o3_aqi = con2aqi(pollutant = "o3", con = o3_8hour, type = "8h")
)
air_quality_data
    

year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,⋯,pm2.5_24hour,pm10_24hour,co_8hour,o3_8hour,pm2.5_aqi,pm10_aqi,so2_aqi,no2_aqi,co_aqi,o3_aqi
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2013,3,1,23,24,24,26,54,600,36,⋯,7.125000,10.75000,0.4168641,0.0263273591,30,10,13,25,5,25
2013,3,2,0,22,24,24,44,500,44,⋯,7.875000,11.58333,0.4255614,0.0243705119,33,11,12,20,5,23
2013,3,2,1,14,17,21,36,400,50,⋯,8.125000,11.95833,0.4246043,0.0228959608,34,12,11,17,5,22
2013,3,2,2,13,13,20,37,400,47,⋯,8.375000,12.20833,0.4138305,0.0215023218,35,12,10,17,5,20
2013,3,2,3,3,9,13,34,400,52,⋯,8.250000,12.33333,0.4032704,0.0209912119,35,12,7,16,5,20
2013,3,2,4,3,7,18,43,400,43,⋯,8.250000,12.50000,0.3829259,0.0202643194,35,12,9,20,5,19
2013,3,2,5,9,11,19,70,500,20,⋯,8.416667,12.75000,0.3824938,0.0185281120,36,12,10,32,5,18
2013,3,2,6,4,10,28,46,500,39,⋯,8.458333,13.04167,0.3621904,0.0189052425,36,13,14,21,5,18
2013,3,2,7,3,11,34,58,500,27,⋯,8.458333,13.25000,0.3521262,0.0183816590,36,13,17,27,5,18
2013,3,2,8,3,7,21,49,500,43,⋯,8.458333,13.29167,0.3521930,0.0183277308,36,13,11,23,5,17


In [8]:
# Collapse the table to one row per (year, month, day): weather columns are
# averaged, each pollutant's AQI takes its daily maximum, and the overall
# `aqi` is the largest of the six pollutant AQIs for that day.
air_quality_data <- air_quality_data |>
    # Keep only the needed columns. This also removes the character column
    # `wd`, which would otherwise sit inside the TEMP:WSPM range used below.
    select(
        year, month, day, hour, TEMP, PRES, DEWP, RAIN, WSPM,
        pm2.5_aqi, pm10_aqi, so2_aqi, no2_aqi, co_aqi, o3_aqi
    ) |>
    group_by(year, month, day) |>
    summarize(
        across(TEMP:WSPM, mean),
        across(pm2.5_aqi:o3_aqi, max),
        # Return an ungrouped tibble so later steps are not silently grouped.
        .groups = "drop"
    ) |>
    # pmax() takes the row-wise maximum across all six pollutant columns.
    # The previous max(pm2.5_aqi:o3_aqi) used the numeric `:` operator, so it
    # only ever compared pm2.5_aqi with o3_aqi and ignored the other four.
    mutate(
        aqi = round(pmax(
            pm2.5_aqi, pm10_aqi, so2_aqi, no2_aqi, co_aqi, o3_aqi
        ))
    )
head(air_quality_data, 3)
tail(air_quality_data, 3)
    

[1m[22m`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.


year,month,day,TEMP,PRES,DEWP,RAIN,WSPM,pm2.5_aqi,pm10_aqi,so2_aqi,no2_aqi,co_aqi,o3_aqi,aqi
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2013,3,1,-0.2,1030.5,-17.4,0,1.4,30,10,13,25,5,25,30
2013,3,2,0.6166667,1026.85,-15.9375,0,1.479167,91,39,44,50,12,23,91
2013,3,3,5.5666667,1014.608,-12.31667,0,1.658333,166,85,66,67,22,24,166


year,month,day,TEMP,PRES,DEWP,RAIN,WSPM,pm2.5_aqi,pm10_aqi,so2_aqi,no2_aqi,co_aqi,o3_aqi,aqi
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2017,2,26,7.208333,1020.337,-8.258333,0,1.566667,85,38,10,48,10,39,85
2017,2,27,7.809524,1017.143,-7.395238,0,1.52381,157,69,21,66,17,36,157
2017,2,28,10.958333,1014.888,-12.783333,0,2.058333,158,69,12,58,11,57,158


In [107]:
# Split the daily table 75% / 25% into training and testing sets,
# stratifying on the overall aqi column.
# NOTE(review): no set.seed() call is visible before this random split, so the
# partition presumably differs between runs - confirm whether that is intended.
air_quality_data_split <- air_quality_data |>
    initial_split(prop = 0.75, strata = aqi)
aqd_test <- testing(air_quality_data_split)
aqd_train <- training(air_quality_data_split)

In [126]:
# Preprocessing: predict co_aqi from the five weather columns, with every
# predictor scaled and then centered.
aqd_recipe <- step_center(
    step_scale(
        recipe(co_aqi ~ TEMP + PRES + DEWP + RAIN + WSPM, data = aqd_train),
        all_predictors()
    ),
    all_predictors()
)
# Model: linear regression fitted with the "lm" engine.
aqd_spec <- linear_reg(mode = "regression") |>
    set_engine("lm")

In [127]:
# Fit the recipe + model on the training set and score the predictions against
# that same training set (rmse / rsq / mae as printed below).
# local() keeps the intermediate objects out of the notebook's global scope.
aqd_results <- local({
    fitted_workflow <- workflow() |>
        add_model(aqd_spec) |>
        add_recipe(aqd_recipe) |>
        fit(data = aqd_train)
    train_predictions <- bind_cols(
        predict(fitted_workflow, new_data = aqd_train),
        aqd_train
    )
    metrics(train_predictions, truth = co_aqi, estimate = .pred)
})
aqd_results

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,10.6141883
rsq,standard,0.3339546
mae,standard,7.534676


In [115]:
# Unfitted workflow that bundles the preprocessing recipe with the model spec.
aqd_workflow <- workflow() |>
    add_model(aqd_spec) |>
    add_recipe(aqd_recipe)
# 5-fold cross-validation folds over the training set, stratified on aqi.
aqd_vfold <- aqd_train |>
    vfold_cv(v = 5, strata = aqi)

In [119]:
# Evaluate the workflow on the cross-validation folds for each candidate
# value of `neighbors` (1 through 10) and collect the averaged metrics.
# NOTE(review): the aqd_spec defined earlier in this file is linear_reg() with
# no tune() placeholder, so a `neighbors` grid presumably has nothing to tune
# here - confirm whether a nearest-neighbor spec was intended for this step.
gridvals <- tibble(neighbors = seq_len(10))
aqd_results <- collect_metrics(
    tune_grid(aqd_workflow, resamples = aqd_vfold, grid = gridvals)
)
aqd_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,84.79752130,5,2.19028745,Preprocessor1_Model001
1,rsq,standard,0.09037377,5,0.01401985,Preprocessor1_Model001
2,rmse,standard,73.74079248,5,1.40331002,Preprocessor1_Model002
2,rsq,standard,0.11619471,5,0.01281299,Preprocessor1_Model002
3,rmse,standard,68.12994383,5,1.12215408,Preprocessor1_Model003
3,rsq,standard,0.16265399,5,0.01660734,Preprocessor1_Model003
4,rmse,standard,66.23361069,5,0.81807367,Preprocessor1_Model004
4,rsq,standard,0.17391364,5,0.01930675,Preprocessor1_Model004
5,rmse,standard,64.86948363,5,1.20547611,Preprocessor1_Model005
5,rsq,standard,0.19418967,5,0.02956506,Preprocessor1_Model005


In [121]:
# Show only the RMSE rows, ordered from lowest to highest mean RMSE.
# The ranked table is displayed but not stored; aqd_results itself is unchanged.
# arrange() has no `by` argument, so the sort column is passed directly.
aqd_results |>
    filter(.metric == "rmse") |>
    arrange(mean)
# Full, unfiltered metrics table for reference.
aqd_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
18,rmse,standard,61.81579,5,1.440105,Preprocessor1_Model018
17,rmse,standard,61.82326,5,1.414631,Preprocessor1_Model017
16,rmse,standard,61.93849,5,1.424480,Preprocessor1_Model016
13,rmse,standard,61.99487,5,1.451063,Preprocessor1_Model013
15,rmse,standard,62.00015,5,1.290795,Preprocessor1_Model015
19,rmse,standard,62.07878,5,1.405028,Preprocessor1_Model019
14,rmse,standard,62.08673,5,1.451722,Preprocessor1_Model014
12,rmse,standard,62.17899,5,1.464611,Preprocessor1_Model012
20,rmse,standard,62.30800,5,1.279892,Preprocessor1_Model020
11,rmse,standard,62.34615,5,1.422439,Preprocessor1_Model011


neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,84.79752130,5,2.19028745,Preprocessor1_Model001
1,rsq,standard,0.09037377,5,0.01401985,Preprocessor1_Model001
2,rmse,standard,73.74079248,5,1.40331002,Preprocessor1_Model002
2,rsq,standard,0.11619471,5,0.01281299,Preprocessor1_Model002
3,rmse,standard,68.12994383,5,1.12215408,Preprocessor1_Model003
3,rsq,standard,0.16265399,5,0.01660734,Preprocessor1_Model003
4,rmse,standard,66.23361069,5,0.81807367,Preprocessor1_Model004
4,rsq,standard,0.17391364,5,0.01930675,Preprocessor1_Model004
5,rmse,standard,64.86948363,5,1.20547611,Preprocessor1_Model005
5,rsq,standard,0.19418967,5,0.02956506,Preprocessor1_Model005
