# **Preprocessing**

In preprocessing the data, the following steps are taken:

> * Prepare packages and setup  
> * Load in the data  
> * Tidy the data and store metadata  
> * Inspect data with various metrics  
> * Inspect data with visualisations 
> * Select locations
> * Select timeframe
> * Feature engineering
> * Perform train-validation-test-split  
> * Normalisation  
> * Create big combined normalised dataframe

### **Prepare packages and setup**

In [1]:
from pipeline import read_meteo_csv_from_data_raw
from pipeline import read_four_contaminants
from pipeline import get_metadata
from pipeline import tidy_raw_contaminant_data
from pipeline import tidy_raw_meteo_data
from pipeline import print_aggegrated_sensor_metrics
from pipeline import subset_sensors
from pipeline import perform_data_split
from pipeline import perform_data_split_without_train
from pipeline import print_split_ratios
from pipeline import calc_combined_min_max_params
from pipeline import normalise_linear
from pipeline import print_pollutant_extremes
from pipeline import export_minmax
from pipeline import plot_distributions_KDE
from pipeline import concat_frames_horizontally
from pipeline import delete_timezone_from_index
from pipeline import assert_equal_shape
from pipeline import assert_equal_index
from pipeline import assert_no_NaNs
from pipeline import assert_range


Running __init__.py for data pipeline...
Pipeline initialized



In [2]:
# --- Month subsetting ---------------------------------------------------------
# When SUBSET_MONTHS is true, only the window given by START_MON and END_MON
# is used for the training, validation and testing sets.
SUBSET_MONTHS = True
START_MON, END_MON = '08', '12'         # starting / ending month for the data

# --- Sensor locations (Utrecht area) -----------------------------------------
DE_BILT = 'S260'                        # starting (and only used) location for meteorological data
TUINDORP = 'NL10636'                    # starting location for contamination data
BREUKELEN = 'NL10641'                   # 'goal' location for contamination data

# --- Pollutants ---------------------------------------------------------------
contaminants = ['PM25', 'PM10', 'O3', 'NO2']

# --- Split sizes --------------------------------------------------------------
# Days used for the validation and testing splits; the *_final_yrs values are
# the ones passed to the split that produces no training part.
days_vali = days_test = 21
days_vali_final_yrs = days_test_final_yrs = 63

# At multiple locations, a sys.exit() can be used to halt the script

# Gates the assertion / progress-message cells further down.
LOG = True

In [3]:
# Pollutants handled by the pipeline and the years that are loaded.
# 2016 and 2019 are deliberately absent: per the original note, 2016 was
# excluded due to missing data for TUINDORP and 2019 as the COVID year, as
# discussed in Valentijn's thesis.
# NOTE(review): confirm that 2019 (rather than 2020) is the intended exclusion.
contaminants = 'PM25 PM10 O3 NO2'.split()
years = [2017, 2018, *range(2020, 2024)]


### **Load in the data**

In [4]:
# Explicit variable declaration: every raw / tidy pollutant frame and every
# raw meteorological frame starts out as None, so the names exist before the
# loading and tidying cells fill them in dynamically.

# 2017
df_NO2_2017_raw = df_NO2_2017_tidy = None
df_PM25_2017_raw = df_PM25_2017_tidy = None
df_PM10_2017_raw = df_PM10_2017_tidy = None
df_O3_2017_raw = df_O3_2017_tidy = None
df_meteo_2017_raw = None

# 2018
df_NO2_2018_raw = df_NO2_2018_tidy = None
df_PM25_2018_raw = df_PM25_2018_tidy = None
df_PM10_2018_raw = df_PM10_2018_tidy = None
df_O3_2018_raw = df_O3_2018_tidy = None
df_meteo_2018_raw = None

# 2020
df_NO2_2020_raw = df_NO2_2020_tidy = None
df_PM25_2020_raw = df_PM25_2020_tidy = None
df_PM10_2020_raw = df_PM10_2020_tidy = None
df_O3_2020_raw = df_O3_2020_tidy = None
df_meteo_2020_raw = None

# 2021
df_NO2_2021_raw = df_NO2_2021_tidy = None
df_PM25_2021_raw = df_PM25_2021_tidy = None
df_PM10_2021_raw = df_PM10_2021_tidy = None
df_O3_2021_raw = df_O3_2021_tidy = None
df_meteo_2021_raw = None

# 2022
df_NO2_2022_raw = df_NO2_2022_tidy = None
df_PM25_2022_raw = df_PM25_2022_tidy = None
df_PM10_2022_raw = df_PM10_2022_tidy = None
df_O3_2022_raw = df_O3_2022_tidy = None
df_meteo_2022_raw = None

# 2023
df_NO2_2023_raw = df_NO2_2023_tidy = None
df_PM25_2023_raw = df_PM25_2023_tidy = None
df_PM10_2023_raw = df_PM10_2023_tidy = None
df_O3_2023_raw = df_O3_2023_tidy = None
df_meteo_2023_raw = None



In [5]:
# Loading raw data: one read per year; the returned frames are paired
# positionally with `contaminants` and each one is published as the
# module-level variable df_<contaminant>_<year>_raw.
for year in years:
    globals().update({
        f'df_{contaminant}_{year}_raw': frame
        for contaminant, frame in zip(
            contaminants, read_four_contaminants(year, contaminants)
        )
    })

In [6]:
# Loading meteorological data: the raw table of each year ends up in the
# module-level variable df_meteo_<year>_raw.
for year in years:
    meteo_key = f'df_meteo_{year}_raw'
    globals()[meteo_key] = read_meteo_csv_from_data_raw(year)

In [7]:
# Display the raw 2017 O3 frame as the cell output (visual check of the load).
df_O3_2017_raw

Unnamed: 0,component,matrix,meetduur,eenheid,begindatumtijd,einddatumtijd,NL01485_O3_lucht,NL01489_O3_lucht,NL01493_O3_lucht,NL01494_O3_lucht,...,NL10934_O3_lucht,NL10938_O3_lucht,NL49003_O3_lucht,NL49012_O3_lucht,NL49014_O3_lucht,NL49564_O3_lucht,NL50002_O3_lucht,NL50004_O3_lucht,NL50006_O3_lucht,NL50007_O3_lucht
0,O3,lucht,uur,µg/m³,2017-01-01T00:00:00+01:00,2017-01-01T01:00:00+01:00,1.3,6.5,1.1,1.2,...,4.45,4.78,-0.2,0.6,1.2,0.4,11.40,6.7,5.17,10.7
1,O3,lucht,uur,µg/m³,2017-01-01T01:00:00+01:00,2017-01-01T02:00:00+01:00,2.6,4.9,1.2,2.2,...,3.89,3.95,0.1,0.9,1.5,0.8,14.80,9.2,13.90,16.1
2,O3,lucht,uur,µg/m³,2017-01-01T02:00:00+01:00,2017-01-01T03:00:00+01:00,3.2,6.5,1.3,3.2,...,5.04,5.40,0.6,0.9,1.7,1.0,17.50,0.8,13.20,9.9
3,O3,lucht,uur,µg/m³,2017-01-01T03:00:00+01:00,2017-01-01T04:00:00+01:00,3.4,9.8,1.2,3.8,...,2.90,6.69,0.0,0.4,1.3,0.8,17.80,-0.5,14.30,4.0
4,O3,lucht,uur,µg/m³,2017-01-01T04:00:00+01:00,2017-01-01T05:00:00+01:00,11.0,12.4,4.1,6.3,...,1.36,4.24,-0.4,0.4,1.8,1.8,7.34,0.9,16.80,1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,O3,lucht,uur,µg/m³,2017-12-31T19:00:00+01:00,2017-12-31T20:00:00+01:00,70.4,61.7,39.5,47.9,...,72.45,71.24,41.4,39.9,58.1,,65.20,57.2,72.30,64.1
8756,O3,lucht,uur,µg/m³,2017-12-31T20:00:00+01:00,2017-12-31T21:00:00+01:00,75.5,63.1,62.0,49.3,...,71.07,67.82,22.9,61.2,65.8,,77.40,70.7,71.60,77.3
8757,O3,lucht,uur,µg/m³,2017-12-31T21:00:00+01:00,2017-12-31T22:00:00+01:00,78.8,69.8,63.4,59.4,...,68.78,69.86,46.5,63.5,74.8,,77.60,72.5,76.90,78.5
8758,O3,lucht,uur,µg/m³,2017-12-31T22:00:00+01:00,2017-12-31T23:00:00+01:00,81.1,73.2,64.8,52.6,...,73.95,67.33,29.7,59.6,67.3,,76.00,72.5,80.00,79.2


In [8]:
# Display the raw 2017 NO2 frame as the cell output (visual check of the load).
df_NO2_2017_raw

Unnamed: 0,component,matrix,meetduur,eenheid,begindatumtijd,einddatumtijd,NL01485_NO2_lucht,NL01487_NO2_lucht,NL01488_NO2_lucht,NL01489_NO2_lucht,...,NL49704_NO2_lucht,NL50002_NO2_lucht,NL50003_NO2_lucht,NL50004_NO2_lucht,NL53001_NO2_lucht,NL53004_NO2_lucht,NL53015_NO2_lucht,NL53016_NO2_lucht,NL53020_NO2_lucht,NL54004_NO2_lucht
0,NO2,lucht,uur,µg/m³,2017-01-01T00:00:00+01:00,2017-01-01T01:00:00+01:00,38.2,37.1,33.4,33.4,...,43.0,39.3,35.0,30.684,40.0,31.8,41.8,36.3,39.1,39.5
1,NO2,lucht,uur,µg/m³,2017-01-01T01:00:00+01:00,2017-01-01T02:00:00+01:00,37.8,35.9,30.8,38.8,...,51.2,33.9,28.7,25.984,35.0,30.6,40.4,34.7,37.4,40.9
2,NO2,lucht,uur,µg/m³,2017-01-01T02:00:00+01:00,2017-01-01T03:00:00+01:00,36.9,34.0,31.1,36.3,...,48.6,34.5,27.8,34.184,38.0,30.9,38.5,33.0,34.5,43.4
3,NO2,lucht,uur,µg/m³,2017-01-01T03:00:00+01:00,2017-01-01T04:00:00+01:00,36.5,32.1,28.1,31.1,...,42.0,39.9,18.4,36.285,40.0,27.4,36.2,32.4,33.0,37.0
4,NO2,lucht,uur,µg/m³,2017-01-01T04:00:00+01:00,2017-01-01T05:00:00+01:00,24.8,24.4,21.8,26.2,...,42.9,32.9,30.9,35.585,39.0,29.3,36.2,33.8,34.1,33.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,NO2,lucht,uur,µg/m³,2017-12-31T19:00:00+01:00,2017-12-31T20:00:00+01:00,9.4,22.9,15.3,19.5,...,20.7,5.1,16.9,4.300,19.0,33.1,10.0,4.7,2.4,7.1
8756,NO2,lucht,uur,µg/m³,2017-12-31T20:00:00+01:00,2017-12-31T21:00:00+01:00,7.1,13.0,12.4,17.6,...,7.4,4.2,12.7,5.100,15.0,29.1,5.8,4.3,1.7,8.5
8757,NO2,lucht,uur,µg/m³,2017-12-31T21:00:00+01:00,2017-12-31T22:00:00+01:00,6.3,11.7,11.3,12.4,...,8.9,12.3,7.5,5.600,18.0,23.0,5.7,3.4,1.9,7.6
8758,NO2,lucht,uur,µg/m³,2017-12-31T22:00:00+01:00,2017-12-31T23:00:00+01:00,5.3,8.8,8.4,9.6,...,9.9,6.7,11.2,3.700,20.0,24.5,4.4,2.7,1.7,8.1


In [9]:
# Display the raw 2017 meteorological frame as the cell output.
df_meteo_2017_raw

Unnamed: 0,STN,YYYYMMDD,HH,DD,FH,FF,FX,T,T10N,TD,...,VV,N,U,WW,IX,M,R,S,O,Y
0,260,20170701,1,280,30,40,70,146,,138,...,60,8,94,61.0,7,0,1,0,0,0
1,260,20170701,2,260,30,30,70,143,,136,...,56,8,95,61.0,7,0,1,0,0,0
2,260,20170701,3,290,30,20,60,142,,136,...,64,8,95,23.0,7,0,1,0,0,0
3,260,20170701,4,270,20,20,50,143,,136,...,65,8,95,81.0,7,0,1,0,0,0
4,260,20170701,5,270,20,20,50,144,,141,...,42,8,97,62.0,7,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4411,260,20171231,20,220,50,70,110,100,,85,...,69,8,90,57.0,7,0,1,0,0,0
4412,260,20171231,21,230,70,60,120,96,,80,...,62,7,89,57.0,7,0,1,0,0,0
4413,260,20171231,22,220,70,70,150,96,,66,...,70,8,81,23.0,7,0,1,0,0,0
4414,260,20171231,23,220,70,60,150,89,,52,...,75,6,77,,5,0,0,0,0,0


In [10]:
# Progress marker for step 1 of 8; only printed when logging is switched on.
if LOG:
    print('(1/8): Data read successfully')

(1/8): Data read successfully


### **Tidy Pollutants data**

In [11]:
# Metadata extraction and tidying of the raw pollutant frames.
#
# Every df_<contaminant>_<year>_raw name is pre-declared as None earlier in
# this notebook, so a plain `name in globals()` test can never detect a frame
# that failed to load. Looking the value up and testing it for None covers
# both the "never declared" and the "declared but never loaded" case, and
# keeps None from being handed to the pipeline helpers.

# Metadata extraction: results are stored as <contaminant>_<year>_meta.
for year in years:
    for contaminant in contaminants:
        raw_data_var = f'df_{contaminant}_{year}_raw'
        raw_frame = globals().get(raw_data_var)
        if raw_frame is None:
            print(f"Warning: {raw_data_var} is not defined.")
            continue
        globals()[f'{contaminant}_{year}_meta'] = get_metadata(raw_frame)

# Tidying data: results are stored as df_<contaminant>_<year>_tidy.
for year in years:
    for contaminant in contaminants:
        raw_data_var = f'df_{contaminant}_{year}_raw'
        raw_frame = globals().get(raw_data_var)
        if raw_frame is None:
            print(f"Warning: {raw_data_var} is not defined.")
            continue
        globals()[f'df_{contaminant}_{year}_tidy'] = tidy_raw_contaminant_data(
            raw_frame, str(year), SUBSET_MONTHS, START_MON, END_MON
        )



In [12]:
# Display the raw 2017 NO2 frame again after the tidying cell
# (presumably for comparison with the tidy frame shown in the next cell).
df_NO2_2017_raw

Unnamed: 0,component,matrix,meetduur,eenheid,begindatumtijd,einddatumtijd,NL01485_NO2_lucht,NL01487_NO2_lucht,NL01488_NO2_lucht,NL01489_NO2_lucht,...,NL49704_NO2_lucht,NL50002_NO2_lucht,NL50003_NO2_lucht,NL50004_NO2_lucht,NL53001_NO2_lucht,NL53004_NO2_lucht,NL53015_NO2_lucht,NL53016_NO2_lucht,NL53020_NO2_lucht,NL54004_NO2_lucht
0,NO2,lucht,uur,µg/m³,2017-01-01T00:00:00+01:00,2017-01-01T01:00:00+01:00,38.2,37.1,33.4,33.4,...,43.0,39.3,35.0,30.684,40.0,31.8,41.8,36.3,39.1,39.5
1,NO2,lucht,uur,µg/m³,2017-01-01T01:00:00+01:00,2017-01-01T02:00:00+01:00,37.8,35.9,30.8,38.8,...,51.2,33.9,28.7,25.984,35.0,30.6,40.4,34.7,37.4,40.9
2,NO2,lucht,uur,µg/m³,2017-01-01T02:00:00+01:00,2017-01-01T03:00:00+01:00,36.9,34.0,31.1,36.3,...,48.6,34.5,27.8,34.184,38.0,30.9,38.5,33.0,34.5,43.4
3,NO2,lucht,uur,µg/m³,2017-01-01T03:00:00+01:00,2017-01-01T04:00:00+01:00,36.5,32.1,28.1,31.1,...,42.0,39.9,18.4,36.285,40.0,27.4,36.2,32.4,33.0,37.0
4,NO2,lucht,uur,µg/m³,2017-01-01T04:00:00+01:00,2017-01-01T05:00:00+01:00,24.8,24.4,21.8,26.2,...,42.9,32.9,30.9,35.585,39.0,29.3,36.2,33.8,34.1,33.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,NO2,lucht,uur,µg/m³,2017-12-31T19:00:00+01:00,2017-12-31T20:00:00+01:00,9.4,22.9,15.3,19.5,...,20.7,5.1,16.9,4.300,19.0,33.1,10.0,4.7,2.4,7.1
8756,NO2,lucht,uur,µg/m³,2017-12-31T20:00:00+01:00,2017-12-31T21:00:00+01:00,7.1,13.0,12.4,17.6,...,7.4,4.2,12.7,5.100,15.0,29.1,5.8,4.3,1.7,8.5
8757,NO2,lucht,uur,µg/m³,2017-12-31T21:00:00+01:00,2017-12-31T22:00:00+01:00,6.3,11.7,11.3,12.4,...,8.9,12.3,7.5,5.600,18.0,23.0,5.7,3.4,1.9,7.6
8758,NO2,lucht,uur,µg/m³,2017-12-31T22:00:00+01:00,2017-12-31T23:00:00+01:00,5.3,8.8,8.4,9.6,...,9.9,6.7,11.2,3.700,20.0,24.5,4.4,2.7,1.7,8.1


In [13]:
# Display the tidied 2017 NO2 frame as the cell output.
df_NO2_2017_tidy

Unnamed: 0_level_0,NL01485,NL01487,NL01488,NL01489,NL01491,NL01493,NL01494,NL01495,NL01496,NL01908,...,NL49703,NL49704,NL50002,NL50003,NL53001,NL53004,NL53015,NL53016,NL53020,NL54004
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-01 00:00:00,65.7,57.1,46.0,33.4,51.0,66.3,66.3,55.0,39.3,24.8,...,20.6,56.1,22.8,17.0,11.0,22.5,22.5,28.7,34.8,29.0
2017-08-01 01:00:00,59.6,53.1,46.2,25.0,47.9,64.4,67.6,34.6,46.8,16.4,...,20.6,48.1,21.3,16.8,11.0,18.8,35.2,31.4,40.1,21.1
2017-08-01 02:00:00,53.3,47.8,34.6,32.5,47.4,52.3,58.1,29.2,19.9,26.2,...,23.1,52.2,21.3,18.3,12.0,17.1,34.6,22.6,31.3,22.4
2017-08-01 03:00:00,45.8,48.5,45.1,41.4,49.3,49.3,62.5,38.6,17.2,32.9,...,28.3,41.8,28.1,27.5,10.0,12.1,23.5,19.4,25.8,27.7
2017-08-01 04:00:00,31.1,52.1,49.3,44.3,48.5,59.6,59.0,51.2,37.2,49.9,...,30.4,33.5,21.8,19.4,19.0,17.9,25.8,30.6,19.7,40.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-30 19:00:00,9.4,19.7,15.9,23.9,23.9,38.8,21.8,25.2,43.4,11.7,...,9.3,21.5,19.1,18.4,9.0,33.4,8.2,9.4,0.8,18.7
2017-12-30 20:00:00,11.3,19.7,15.1,21.4,26.7,41.8,22.9,19.9,35.7,9.7,...,10.2,18.8,7.7,12.0,8.0,35.7,7.9,9.6,1.1,19.4
2017-12-30 21:00:00,9.4,24.8,21.8,16.4,21.2,25.2,22.5,18.3,37.6,8.2,...,10.2,18.5,5.5,13.9,6.0,30.2,10.2,9.3,2.2,17.6
2017-12-30 22:00:00,8.8,26.9,25.0,16.0,19.1,29.8,22.3,21.8,33.0,10.7,...,12.2,22.5,14.1,13.1,8.0,22.8,8.2,10.4,1.1,18.8


In [14]:
if LOG:
    # Gather the tidy pollutant frames year by year, in the order
    # PM25, PM10, O3, NO2 within each year.
    tidy_pollutant_frames = [
        globals()[f'df_{pollutant}_{year}_tidy']
        for year in years
        for pollutant in ('PM25', 'PM10', 'O3', 'NO2')
    ]
    # Check for equal row length, not column length: the number of locations
    # measuring each component varies, so the column count is unequal.
    assert_equal_shape(
        tidy_pollutant_frames, True, False, 'Tidying of pollutant data'
    )
    print('(2/8): Pollutant data tidied successfully')

(2/8): Pollutant data tidied successfully


In [15]:
# Display the tidied 2017 NO2 frame once more, after the shape assertion.
df_NO2_2017_tidy

Unnamed: 0_level_0,NL01485,NL01487,NL01488,NL01489,NL01491,NL01493,NL01494,NL01495,NL01496,NL01908,...,NL49703,NL49704,NL50002,NL50003,NL53001,NL53004,NL53015,NL53016,NL53020,NL54004
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-01 00:00:00,65.7,57.1,46.0,33.4,51.0,66.3,66.3,55.0,39.3,24.8,...,20.6,56.1,22.8,17.0,11.0,22.5,22.5,28.7,34.8,29.0
2017-08-01 01:00:00,59.6,53.1,46.2,25.0,47.9,64.4,67.6,34.6,46.8,16.4,...,20.6,48.1,21.3,16.8,11.0,18.8,35.2,31.4,40.1,21.1
2017-08-01 02:00:00,53.3,47.8,34.6,32.5,47.4,52.3,58.1,29.2,19.9,26.2,...,23.1,52.2,21.3,18.3,12.0,17.1,34.6,22.6,31.3,22.4
2017-08-01 03:00:00,45.8,48.5,45.1,41.4,49.3,49.3,62.5,38.6,17.2,32.9,...,28.3,41.8,28.1,27.5,10.0,12.1,23.5,19.4,25.8,27.7
2017-08-01 04:00:00,31.1,52.1,49.3,44.3,48.5,59.6,59.0,51.2,37.2,49.9,...,30.4,33.5,21.8,19.4,19.0,17.9,25.8,30.6,19.7,40.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-30 19:00:00,9.4,19.7,15.9,23.9,23.9,38.8,21.8,25.2,43.4,11.7,...,9.3,21.5,19.1,18.4,9.0,33.4,8.2,9.4,0.8,18.7
2017-12-30 20:00:00,11.3,19.7,15.1,21.4,26.7,41.8,22.9,19.9,35.7,9.7,...,10.2,18.8,7.7,12.0,8.0,35.7,7.9,9.6,1.1,19.4
2017-12-30 21:00:00,9.4,24.8,21.8,16.4,21.2,25.2,22.5,18.3,37.6,8.2,...,10.2,18.5,5.5,13.9,6.0,30.2,10.2,9.3,2.2,17.6
2017-12-30 22:00:00,8.8,26.9,25.0,16.0,19.1,29.8,22.3,21.8,33.0,10.7,...,12.2,22.5,14.1,13.1,8.0,22.8,8.2,10.4,1.1,18.8


## **Extract and tidy meteorological data**

In [16]:
only_DeBilt = True  # True: only De Bilt is used
variables = ['T', 'TD', 'DD', 'FH', 'FX', 'DR', 'P', 'RH', 'SQ', 'Q']

# Tidy every meteorological variable of every year. Results are collected in
# `dataframes`, keyed by the variable name each one is published under:
# df_<variable>_<year>_tidy.
dataframes = {}
for year in years:
    meteo_raw = globals()[f'df_meteo_{year}_raw']
    year_label = str(year)
    for var in variables:
        # 'P' is stored under 'P_' to match the original naming.
        name_part = 'P_' if var == 'P' else var
        dataframes[f"df_{name_part}_{year}_tidy"] = tidy_raw_meteo_data(
            meteo_raw, var, only_DeBilt, year_label,
            SUBSET_MONTHS, START_MON, END_MON,
        )

# Publish every tidy frame as a module-level variable.
globals().update(dataframes)


In [17]:
# List the names of all tidy meteorological frames, one per line.
if dataframes:
    print(*dataframes, sep='\n')


df_T_2017_tidy
df_TD_2017_tidy
df_DD_2017_tidy
df_FH_2017_tidy
df_FX_2017_tidy
df_DR_2017_tidy
df_P__2017_tidy
df_RH_2017_tidy
df_SQ_2017_tidy
df_Q_2017_tidy
df_T_2018_tidy
df_TD_2018_tidy
df_DD_2018_tidy
df_FH_2018_tidy
df_FX_2018_tidy
df_DR_2018_tidy
df_P__2018_tidy
df_RH_2018_tidy
df_SQ_2018_tidy
df_Q_2018_tidy
df_T_2020_tidy
df_TD_2020_tidy
df_DD_2020_tidy
df_FH_2020_tidy
df_FX_2020_tidy
df_DR_2020_tidy
df_P__2020_tidy
df_RH_2020_tidy
df_SQ_2020_tidy
df_Q_2020_tidy
df_T_2021_tidy
df_TD_2021_tidy
df_DD_2021_tidy
df_FH_2021_tidy
df_FX_2021_tidy
df_DR_2021_tidy
df_P__2021_tidy
df_RH_2021_tidy
df_SQ_2021_tidy
df_Q_2021_tidy
df_T_2022_tidy
df_TD_2022_tidy
df_DD_2022_tidy
df_FH_2022_tidy
df_FX_2022_tidy
df_DR_2022_tidy
df_P__2022_tidy
df_RH_2022_tidy
df_SQ_2022_tidy
df_Q_2022_tidy
df_T_2023_tidy
df_TD_2023_tidy
df_DD_2023_tidy
df_FH_2023_tidy
df_FX_2023_tidy
df_DR_2023_tidy
df_P__2023_tidy
df_RH_2023_tidy
df_SQ_2023_tidy
df_Q_2023_tidy


In [18]:
# Display the tidied 2017 'T' frame as the cell output.
df_T_2017_tidy

Unnamed: 0_level_0,S260
DateTime,Unnamed: 1_level_1
2017-08-01 00:00:00,149.0
2017-08-01 01:00:00,152.0
2017-08-01 02:00:00,140.0
2017-08-01 03:00:00,127.0
2017-08-01 04:00:00,136.0
...,...
2017-12-30 19:00:00,111.0
2017-12-30 20:00:00,108.0
2017-12-30 21:00:00,103.0
2017-12-30 22:00:00,101.0


In [19]:
# Sanity checks on the tidy meteorological frames held in `dataframes`.
datasets = [*dataframes.values()]

# There has to be something to check.
assert datasets, "Error: No datasets found!"

# 1. Every frame must have the shape of the first one.
reference_shape = datasets[0].shape
assert not any(df.shape != reference_shape for df in datasets), "Error: Not all datasets have the same shape!"

# 2. No frame may contain a NaN.
assert not any(df.isnull().values.any() for df in datasets), "Error: Some datasets contain NaN values!"

print("All datasets have the same shape and contain no NaN values.")
print('(3/8): Meteorological data tidied successfully')


All datasets have the same shape and contain no NaN values.
(3/8): Meteorological data tidied successfully


### **Inspect data with various metrics**

pipeline/statistics.py contains more functions

In [20]:
print("Printing some basic statistics for the pollutants:")
print("(Sensor NL10636 is TUINDORP)\n")

# One aggregated report per pollutant: the tidy frames of all years are
# passed together with the TUINDORP sensor id and the 2017 metadata of
# that pollutant.
for contaminant in ('PM25', 'PM10', 'O3', 'NO2'):
    frames_all_years = [
        globals()[f'df_{contaminant}_{year}_tidy'] for year in years
    ]
    print_aggegrated_sensor_metrics(
        frames_all_years, TUINDORP, globals()[f'{contaminant}_2017_meta']
    )

Printing some basic statistics for the pollutants:
(Sensor NL10636 is TUINDORP)

[min, mean, max] for sensor NL10636 measuring PM2.5 µg/m³
aggregated over multiple years:
[-4.8670, 10.4811, 85.7280] with n = 21888

[min, mean, max] for sensor NL10636 measuring PM10 µg/m³
aggregated over multiple years:
[-19.8900, 16.0351, 111.9500] with n = 21888



[min, mean, max] for sensor NL10636 measuring O3 µg/m³
aggregated over multiple years:
[-2.2200, 33.6199, 180.5100] with n = 21888

[min, mean, max] for sensor NL10636 measuring NO2 µg/m³
aggregated over multiple years:
[0.7600, 23.3401, 107.4500] with n = 21888



In [21]:
pollutants = ['PM25', 'PM10', 'O3', 'NO2']

# Drop the raw frames from the module namespace; names that are already
# gone are skipped silently.

# Delete pollutant data
for year in years:
    for pollutant in pollutants:
        globals().pop(f"df_{pollutant}_{year}_raw", None)

# Delete meteorological data
for year in years:
    globals().pop(f"df_meteo_{year}_raw", None)


### **Inspect data with visualisations**

plots.py contains plotting functions

### **Select locations**

In [22]:
# Location selection: every tidy pollutant frame is reduced to the sensors
# listed in sensors_1D. The results are stored as
# df_<pollutant>_<year>_tidy_subset_1D.
sensors_1D = [TUINDORP, BREUKELEN]
pollutants = ['PM25', 'PM10', 'O3', 'NO2']

tidy_names = [
    f"df_{pollutant}_{year}_tidy"
    for year in years
    for pollutant in pollutants
]

# Phase 1: create all subset frames (tidy frames that do not exist are skipped).
for raw_var in tidy_names:
    if raw_var not in globals():
        continue
    globals()[f"{raw_var}_subset_1D"] = subset_sensors(globals()[raw_var], sensors_1D)

# Phase 2: only after all subsets exist, drop the original tidy frames.
for raw_var in tidy_names:
    globals().pop(raw_var, None)



In [23]:
# Display the 2017 NO2 frame after subsetting to the selected sensors.
df_NO2_2017_tidy_subset_1D

Unnamed: 0_level_0,NL10636,NL10641
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-01 00:00:00,23.785,22.08
2017-08-01 01:00:00,22.030,14.84
2017-08-01 02:00:00,16.730,26.92
2017-08-01 03:00:00,14.550,40.60
2017-08-01 04:00:00,15.840,45.69
...,...,...
2017-12-30 19:00:00,17.910,21.44
2017-12-30 20:00:00,14.810,23.19
2017-12-30 21:00:00,20.300,23.58
2017-12-30 22:00:00,15.600,22.29


In [24]:
# Print the shape of every 1D subset frame, one line per year, in the
# column order NO2, O3, PM25, PM10. Iterating over `years` keeps this
# overview in step with the configured years; the hand-written version
# stopped at 2022 and left out 2023.
for year in years:
    print(*(
        globals()[f"df_{pollutant}_{year}_tidy_subset_1D"].shape
        for pollutant in ('NO2', 'O3', 'PM25', 'PM10')
    ))



(3648, 2) (3648, 2) (3648, 2) (3648, 2)
(3648, 2) (3648, 2) (3648, 2) (3648, 2)
(3648, 2) (3648, 2) (3648, 2) (3648, 2)
(3648, 2) (3648, 2) (3648, 2) (3648, 2)
(3648, 2) (3648, 2) (3648, 2) (3648, 2)


### **Perform assertions for pollutant tidy 1D data**

In [25]:
if LOG:
    pollutants = ['NO2', 'O3', 'PM25', 'PM10']

    # Names of all 1D subset frames, year by year.
    subset_names = [
        f"df_{pollutant}_{year}_tidy_subset_1D"
        for year in years
        for pollutant in pollutants
    ]

    # Both checks run over the same frames; names that do not exist in the
    # module namespace are left out.
    shape_check_dfs = [globals()[name] for name in subset_names if name in globals()]
    nan_check_dfs = list(shape_check_dfs)

    # Perform assertions
    label = 'Location-wise subsetting of pollutant data'
    assert_equal_shape(shape_check_dfs, True, True, label)
    assert_no_NaNs(nan_check_dfs, label)

    print('(4/8): Location-wise subsetting of pollutant data successful')


(4/8): Location-wise subsetting of pollutant data successful


### **Select timeframe**

Timeframe selection (excluding 2016 and 2019) was done iteratively and manually by inspecting the data and its distributions. It is discussed in Section 3.1 of Valentijn's thesis.

This selection has already been applied at the beginning of the script (see the `years` list).


### **Feature Engineering**

This is discussed in Section 3.2 of the thesis and done mainly with a plain correlation matrix.

-------------------------------------------------------------------------------------------------------------------
PROBLEM: It is unclear whether we should follow the feature selection he used, since there will then not be enough physics features for the PEML model.

### **Perform train-validation-test-split**

In [17]:
# Splitting the data into train, validation and test sets.
# Each component is split separately. (All data remains segregated for now,
# so that it can be normalised properly later.)
#
# Results are published as module-level variables:
#   pollutants:  df_<pollutant>_<year>_{train,val,test}_1D
#   meteorology: df_<alias>_<year>_{train,val,test}
#
# The meteorological split variables use descriptive aliases, whereas the
# tidy frames created earlier in this notebook are named after the variable
# code (df_T_<year>_tidy, df_P__<year>_tidy, ...). The previous version read
# df_temp_<year>_tidy etc., which no cell creates, and therefore failed with
# a NameError.
# NOTE(review): the alias -> code mapping below is inferred from the KNMI
# variable codes (T temperature, TD dew point, DD wind direction, FH hourly
# mean wind speed, P pressure, SQ sunshine duration) - confirm, 'Wvh' -> 'FH'
# in particular.
meteo_aliases = {
    'temp': 'T',
    'dewP': 'TD',
    'WD': 'DD',
    'Wvh': 'FH',
    'P': 'P_',      # the tidy pressure frame is stored as df_P__<year>_tidy
    'SQ': 'SQ',
}
split_pollutants = ['PM25', 'PM10', 'NO2', 'O3']

train_only_years = [2017, 2018, 2020]   # used for training in full
three_way_years = [2021, 2022]          # split into train / val / test
val_test_only_year = 2023               # split into val / test only


def _frames_to_split(year):
    """Yield (name_stem, name_suffix, frame) for every frame of `year`.

    Pollutant frames come first (suffix '_1D'), followed by the
    meteorological frames (no suffix).
    """
    for pollutant in split_pollutants:
        yield (f'df_{pollutant}_{year}', '_1D',
               globals()[f'df_{pollutant}_{year}_tidy_subset_1D'])
    for alias, code in meteo_aliases.items():
        yield f'df_{alias}_{year}', '', globals()[f'df_{code}_{year}_tidy']


# Years that go into the training set as a whole: keep an independent copy.
for year in train_only_years:
    for stem, suffix, frame in _frames_to_split(year):
        globals()[f'{stem}_train{suffix}'] = frame.copy()

# Years that are split three ways.
for year in three_way_years:
    for stem, suffix, frame in _frames_to_split(year):
        train_part, val_part, test_part = perform_data_split(
            frame, days_vali, days_test)
        globals()[f'{stem}_train{suffix}'] = train_part
        globals()[f'{stem}_val{suffix}'] = val_part
        globals()[f'{stem}_test{suffix}'] = test_part

# Final year: validation and test only, using the *_final_yrs day counts.
for stem, suffix, frame in _frames_to_split(val_test_only_year):
    val_part, test_part = perform_data_split_without_train(
        frame, days_vali_final_yrs, days_test_final_yrs)
    globals()[f'{stem}_val{suffix}'] = val_part
    globals()[f'{stem}_test{suffix}'] = test_part

In [18]:
if LOG:
    # Table of sanity checks on the freshly split frames. Every row is handed
    # positionally to assert_equal_shape as (frames, flag, flag, label).
    # Judging by the notes in this cell, (True, True) is used where the whole
    # shape has to agree and (True, False) where only the number of rows has
    # to agree -- confirm the flag semantics in pipeline.assert_equal_shape.
    _split_shape_checks = [
        # 1) Pollutant frames of the years that are not split (train only)
        (
            [
                df_PM25_2017_train_1D, df_PM10_2017_train_1D,
                df_NO2_2017_train_1D, df_O3_2017_train_1D,
                df_PM25_2018_train_1D, df_PM10_2018_train_1D,
                df_NO2_2018_train_1D, df_O3_2018_train_1D,
                df_PM25_2020_train_1D, df_PM10_2020_train_1D,
                df_NO2_2020_train_1D, df_O3_2020_train_1D,
            ],
            True, True,
            'Split of pollutant train set for 2017, 2018 and 2020',
        ),
        # 2) Meteorological frames of the years that are not split
        (
            [
                df_temp_2017_train, df_dewP_2017_train, df_WD_2017_train,
                df_Wvh_2017_train, df_P_2017_train, df_SQ_2017_train,
                df_temp_2018_train, df_dewP_2018_train, df_WD_2018_train,
                df_Wvh_2018_train, df_P_2018_train, df_SQ_2018_train,
                df_temp_2020_train, df_dewP_2020_train, df_WD_2020_train,
                df_Wvh_2020_train, df_P_2020_train, df_SQ_2020_train,
            ],
            True, True,
            'Split of meteorological train set for 2017, 2018 and 2020',
        ),
        # 3) Training part of 2021 and 2022: all frames must cover the same
        #    timeframe (same length). The column count may differ, because
        #    no meteorological data is used for the prediction location
        #    (Breukelen).
        (
            [
                df_PM25_2021_train_1D, df_PM10_2021_train_1D,
                df_NO2_2021_train_1D, df_O3_2021_train_1D,
                df_temp_2021_train, df_dewP_2021_train, df_WD_2021_train,
                df_Wvh_2021_train, df_P_2021_train, df_SQ_2021_train,
                df_PM25_2022_train_1D, df_PM10_2022_train_1D,
                df_NO2_2022_train_1D, df_O3_2022_train_1D,
                df_temp_2022_train, df_dewP_2022_train, df_WD_2022_train,
                df_Wvh_2022_train, df_P_2022_train, df_SQ_2022_train,
            ],
            True, False,
            'Split of training data for 2021 and 2022',
        ),
        # 4) Validation part of 2021 and 2022
        (
            [
                df_PM25_2021_val_1D, df_PM10_2021_val_1D,
                df_NO2_2021_val_1D, df_O3_2021_val_1D,
                df_temp_2021_val, df_dewP_2021_val, df_WD_2021_val,
                df_Wvh_2021_val, df_P_2021_val, df_SQ_2021_val,
                df_PM25_2022_val_1D, df_PM10_2022_val_1D,
                df_NO2_2022_val_1D, df_O3_2022_val_1D,
                df_temp_2022_val, df_dewP_2022_val, df_WD_2022_val,
                df_Wvh_2022_val, df_P_2022_val, df_SQ_2022_val,
            ],
            True, False,
            'Split of validation data for 2021 and 2022',
        ),
        # 5) Test part of 2021 and 2022
        (
            [
                df_PM25_2021_test_1D, df_PM10_2021_test_1D,
                df_NO2_2021_test_1D, df_O3_2021_test_1D,
                df_temp_2021_test, df_dewP_2021_test, df_WD_2021_test,
                df_Wvh_2021_test, df_P_2021_test, df_SQ_2021_test,
                df_PM25_2022_test_1D, df_PM10_2022_test_1D,
                df_NO2_2022_test_1D, df_O3_2022_test_1D,
                df_temp_2022_test, df_dewP_2022_test, df_WD_2022_test,
                df_Wvh_2022_test, df_P_2022_test, df_SQ_2022_test,
            ],
            True, False,
            'Split of test data for 2021 and 2022',
        ),
        # 6) Validation part of 2023
        (
            [
                df_PM25_2023_val_1D, df_PM10_2023_val_1D,
                df_NO2_2023_val_1D, df_O3_2023_val_1D,
                df_temp_2023_val, df_dewP_2023_val, df_WD_2023_val,
                df_Wvh_2023_val, df_P_2023_val, df_SQ_2023_val,
            ],
            True, False,
            'Split of validation data for 2023',
        ),
        # 7) Test part of 2023
        (
            [
                df_PM25_2023_test_1D, df_PM10_2023_test_1D,
                df_NO2_2023_test_1D, df_O3_2023_test_1D,
                df_temp_2023_test, df_dewP_2023_test, df_WD_2023_test,
                df_Wvh_2023_test, df_P_2023_test, df_SQ_2023_test,
            ],
            True, False,
            'Split of test data for 2023',
        ),
    ]
    # Run the checks in the order listed above
    for _frames, _first_flag, _second_flag, _label in _split_shape_checks:
        assert_equal_shape(_frames, _first_flag, _second_flag, _label)
    print('(5/8): Train-validation-test split successful')


(5/8): Train-validation-test split successful


In [19]:
# Report the train/validation/test ratio, using the PM2.5 frames of every
# year as the representative for each part of the split.
_ratio_train_frames = [
    df_PM25_2017_train_1D,
    df_PM25_2018_train_1D,
    df_PM25_2020_train_1D,
    df_PM25_2021_train_1D,
    df_PM25_2022_train_1D,
]
_ratio_val_frames = [
    df_PM25_2021_val_1D,
    df_PM25_2022_val_1D,
    df_PM25_2023_val_1D,
]
_ratio_test_frames = [
    df_PM25_2021_test_1D,
    df_PM25_2022_test_1D,
    df_PM25_2023_test_1D,
]
# The last argument is a free-form string that ends up in the printed
# message; the pollutant names or any other text would work as well.
print_split_ratios(
    _ratio_train_frames, _ratio_val_frames, _ratio_test_frames, 'the')

[train/validation/test] %-ratio for the data is: [76.3/11.9/11.9]


### **Normalisation**

In [20]:
# Every component gets its own pair of normalisation extremes. Only the
# training frames (2017, 2018, 2020, 2021 and 2022) are passed in, so the
# validation and test frames do not influence these values.

# --- Pollutants ---
PM25_min_train, PM25_max_train = calc_combined_min_max_params([
    df_PM25_2017_train_1D, df_PM25_2018_train_1D, df_PM25_2020_train_1D,
    df_PM25_2021_train_1D, df_PM25_2022_train_1D,
])
PM10_min_train, PM10_max_train = calc_combined_min_max_params([
    df_PM10_2017_train_1D, df_PM10_2018_train_1D, df_PM10_2020_train_1D,
    df_PM10_2021_train_1D, df_PM10_2022_train_1D,
])
O3_min_train, O3_max_train = calc_combined_min_max_params([
    df_O3_2017_train_1D, df_O3_2018_train_1D, df_O3_2020_train_1D,
    df_O3_2021_train_1D, df_O3_2022_train_1D,
])
NO2_min_train, NO2_max_train = calc_combined_min_max_params([
    df_NO2_2017_train_1D, df_NO2_2018_train_1D, df_NO2_2020_train_1D,
    df_NO2_2021_train_1D, df_NO2_2022_train_1D,
])

# --- Meteorological variables ---
temp_min_train, temp_max_train = calc_combined_min_max_params([
    df_temp_2017_train, df_temp_2018_train, df_temp_2020_train,
    df_temp_2021_train, df_temp_2022_train,
])
dewP_min_train, dewP_max_train = calc_combined_min_max_params([
    df_dewP_2017_train, df_dewP_2018_train, df_dewP_2020_train,
    df_dewP_2021_train, df_dewP_2022_train,
])
WD_min_train, WD_max_train = calc_combined_min_max_params([
    df_WD_2017_train, df_WD_2018_train, df_WD_2020_train,
    df_WD_2021_train, df_WD_2022_train,
])
Wvh_min_train, Wvh_max_train = calc_combined_min_max_params([
    df_Wvh_2017_train, df_Wvh_2018_train, df_Wvh_2020_train,
    df_Wvh_2021_train, df_Wvh_2022_train,
])
P_min_train, P_max_train = calc_combined_min_max_params([
    df_P_2017_train, df_P_2018_train, df_P_2020_train,
    df_P_2021_train, df_P_2022_train,
])
SQ_min_train, SQ_max_train = calc_combined_min_max_params([
    df_SQ_2017_train, df_SQ_2018_train, df_SQ_2020_train,
    df_SQ_2021_train, df_SQ_2022_train,
])

# Print the pollutant extremes (min/max pairs in the order NO2, O3, PM10,
# PM25) and hand the resulting frame to export_minmax under the name
# 'contaminant_minmax'.
print()
_pollutant_extremes = [
    NO2_min_train, NO2_max_train,
    O3_min_train, O3_max_train,
    PM10_min_train, PM10_max_train,
    PM25_min_train, PM25_max_train,
]
df_minmax = print_pollutant_extremes(_pollutant_extremes)
print()
export_minmax(df_minmax, 'contaminant_minmax')


         min      max
NO2   -0.280  107.450
O3    -2.220  180.510
PM10 -19.890  379.470
PM25  -4.934   79.711



In [21]:
# Normalise every split frame of every component with normalise_linear,
# always passing the min/max pair that was derived from the training frames
# of that component. Validation and test frames reuse the training extremes.
#
# NOTE on naming: the 2017-2021 frames and the 2022 training frame are named
# df_<var>_<year>_<split>_norm[_1D], whereas the 2022 validation/test frames
# and all 2023 frames are named df_<var>_<split>_<year>_norm[_1D]. Later
# cells refer to these exact names, so keep them in sync when renaming.

# NO2
df_NO2_2017_train_norm_1D = normalise_linear(df_NO2_2017_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2018_train_norm_1D = normalise_linear(df_NO2_2018_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2020_train_norm_1D = normalise_linear(df_NO2_2020_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_train_norm_1D = normalise_linear(df_NO2_2021_train_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_val_norm_1D = normalise_linear(df_NO2_2021_val_1D, NO2_min_train, NO2_max_train)
df_NO2_2021_test_norm_1D = normalise_linear(df_NO2_2021_test_1D, NO2_min_train, NO2_max_train)
df_NO2_2022_train_norm_1D = normalise_linear(df_NO2_2022_train_1D, NO2_min_train, NO2_max_train)
df_NO2_val_2022_norm_1D = normalise_linear(df_NO2_2022_val_1D, NO2_min_train, NO2_max_train)
df_NO2_test_2022_norm_1D = normalise_linear(df_NO2_2022_test_1D, NO2_min_train, NO2_max_train)
df_NO2_val_2023_norm_1D = normalise_linear(df_NO2_2023_val_1D, NO2_min_train, NO2_max_train)
df_NO2_test_2023_norm_1D = normalise_linear(df_NO2_2023_test_1D, NO2_min_train, NO2_max_train)

# O3
df_O3_2017_train_norm_1D = normalise_linear(df_O3_2017_train_1D, O3_min_train, O3_max_train)
df_O3_2018_train_norm_1D = normalise_linear(df_O3_2018_train_1D, O3_min_train, O3_max_train)
df_O3_2020_train_norm_1D = normalise_linear(df_O3_2020_train_1D, O3_min_train, O3_max_train)
df_O3_2021_train_norm_1D = normalise_linear(df_O3_2021_train_1D, O3_min_train, O3_max_train)
df_O3_2021_val_norm_1D = normalise_linear(df_O3_2021_val_1D, O3_min_train, O3_max_train)
df_O3_2021_test_norm_1D = normalise_linear(df_O3_2021_test_1D, O3_min_train, O3_max_train)
df_O3_2022_train_norm_1D = normalise_linear(df_O3_2022_train_1D, O3_min_train, O3_max_train)
df_O3_val_2022_norm_1D = normalise_linear(df_O3_2022_val_1D, O3_min_train, O3_max_train)
df_O3_test_2022_norm_1D = normalise_linear(df_O3_2022_test_1D, O3_min_train, O3_max_train)
df_O3_val_2023_norm_1D = normalise_linear(df_O3_2023_val_1D, O3_min_train, O3_max_train)
df_O3_test_2023_norm_1D = normalise_linear(df_O3_2023_test_1D, O3_min_train, O3_max_train)

# PM10
df_PM10_2017_train_norm_1D = normalise_linear(df_PM10_2017_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2018_train_norm_1D = normalise_linear(df_PM10_2018_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2020_train_norm_1D = normalise_linear(df_PM10_2020_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_train_norm_1D = normalise_linear(df_PM10_2021_train_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_val_norm_1D = normalise_linear(df_PM10_2021_val_1D, PM10_min_train, PM10_max_train)
df_PM10_2021_test_norm_1D = normalise_linear(df_PM10_2021_test_1D, PM10_min_train, PM10_max_train)
df_PM10_2022_train_norm_1D = normalise_linear(df_PM10_2022_train_1D, PM10_min_train, PM10_max_train)
df_PM10_val_2022_norm_1D = normalise_linear(df_PM10_2022_val_1D, PM10_min_train, PM10_max_train)
df_PM10_test_2022_norm_1D = normalise_linear(df_PM10_2022_test_1D, PM10_min_train, PM10_max_train)
df_PM10_val_2023_norm_1D = normalise_linear(df_PM10_2023_val_1D, PM10_min_train, PM10_max_train)
df_PM10_test_2023_norm_1D = normalise_linear(df_PM10_2023_test_1D, PM10_min_train, PM10_max_train)

# PM2.5
df_PM25_2017_train_norm_1D = normalise_linear(df_PM25_2017_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2018_train_norm_1D = normalise_linear(df_PM25_2018_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2020_train_norm_1D = normalise_linear(df_PM25_2020_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_train_norm_1D = normalise_linear(df_PM25_2021_train_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_val_norm_1D = normalise_linear(df_PM25_2021_val_1D, PM25_min_train, PM25_max_train)
df_PM25_2021_test_norm_1D = normalise_linear(df_PM25_2021_test_1D, PM25_min_train, PM25_max_train)
df_PM25_2022_train_norm_1D = normalise_linear(df_PM25_2022_train_1D, PM25_min_train, PM25_max_train)
df_PM25_val_2022_norm_1D = normalise_linear(df_PM25_2022_val_1D, PM25_min_train, PM25_max_train)
df_PM25_test_2022_norm_1D = normalise_linear(df_PM25_2022_test_1D, PM25_min_train, PM25_max_train)
df_PM25_val_2023_norm_1D = normalise_linear(df_PM25_2023_val_1D, PM25_min_train, PM25_max_train)
df_PM25_test_2023_norm_1D = normalise_linear(df_PM25_2023_test_1D, PM25_min_train, PM25_max_train)

# Temperature
df_temp_2017_train_norm = normalise_linear(df_temp_2017_train, temp_min_train, temp_max_train)
df_temp_2018_train_norm = normalise_linear(df_temp_2018_train, temp_min_train, temp_max_train)
df_temp_2020_train_norm = normalise_linear(df_temp_2020_train, temp_min_train, temp_max_train)
df_temp_2021_train_norm = normalise_linear(df_temp_2021_train, temp_min_train, temp_max_train)
df_temp_2021_val_norm = normalise_linear(df_temp_2021_val, temp_min_train, temp_max_train)
df_temp_2021_test_norm = normalise_linear(df_temp_2021_test, temp_min_train, temp_max_train)
df_temp_2022_train_norm = normalise_linear(df_temp_2022_train, temp_min_train, temp_max_train)
df_temp_val_2022_norm = normalise_linear(df_temp_2022_val, temp_min_train, temp_max_train)
df_temp_test_2022_norm = normalise_linear(df_temp_2022_test, temp_min_train, temp_max_train)
df_temp_val_2023_norm = normalise_linear(df_temp_2023_val, temp_min_train, temp_max_train)
df_temp_test_2023_norm = normalise_linear(df_temp_2023_test, temp_min_train, temp_max_train)

# Dew point
df_dewP_2017_train_norm = normalise_linear(df_dewP_2017_train, dewP_min_train, dewP_max_train)
df_dewP_2018_train_norm = normalise_linear(df_dewP_2018_train, dewP_min_train, dewP_max_train)
df_dewP_2020_train_norm = normalise_linear(df_dewP_2020_train, dewP_min_train, dewP_max_train)
df_dewP_2021_train_norm = normalise_linear(df_dewP_2021_train, dewP_min_train, dewP_max_train)
df_dewP_2021_val_norm = normalise_linear(df_dewP_2021_val, dewP_min_train, dewP_max_train)
df_dewP_2021_test_norm = normalise_linear(df_dewP_2021_test, dewP_min_train, dewP_max_train)
df_dewP_2022_train_norm = normalise_linear(df_dewP_2022_train, dewP_min_train, dewP_max_train)
df_dewP_val_2022_norm = normalise_linear(df_dewP_2022_val, dewP_min_train, dewP_max_train)
df_dewP_test_2022_norm = normalise_linear(df_dewP_2022_test, dewP_min_train, dewP_max_train)
df_dewP_val_2023_norm = normalise_linear(df_dewP_2023_val, dewP_min_train, dewP_max_train)
df_dewP_test_2023_norm = normalise_linear(df_dewP_2023_test, dewP_min_train, dewP_max_train)

# Wind direction
df_WD_2017_train_norm = normalise_linear(df_WD_2017_train, WD_min_train, WD_max_train)
df_WD_2018_train_norm = normalise_linear(df_WD_2018_train, WD_min_train, WD_max_train)
df_WD_2020_train_norm = normalise_linear(df_WD_2020_train, WD_min_train, WD_max_train)
df_WD_2021_train_norm = normalise_linear(df_WD_2021_train, WD_min_train, WD_max_train)
df_WD_2021_val_norm = normalise_linear(df_WD_2021_val, WD_min_train, WD_max_train)
df_WD_2021_test_norm = normalise_linear(df_WD_2021_test, WD_min_train, WD_max_train)
df_WD_2022_train_norm = normalise_linear(df_WD_2022_train, WD_min_train, WD_max_train)
df_WD_val_2022_norm = normalise_linear(df_WD_2022_val, WD_min_train, WD_max_train)
df_WD_test_2022_norm = normalise_linear(df_WD_2022_test, WD_min_train, WD_max_train)
df_WD_val_2023_norm = normalise_linear(df_WD_2023_val, WD_min_train, WD_max_train)
df_WD_test_2023_norm = normalise_linear(df_WD_2023_test, WD_min_train, WD_max_train)

# Wind velocity
df_Wvh_2017_train_norm = normalise_linear(df_Wvh_2017_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2018_train_norm = normalise_linear(df_Wvh_2018_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2020_train_norm = normalise_linear(df_Wvh_2020_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_train_norm = normalise_linear(df_Wvh_2021_train, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_val_norm = normalise_linear(df_Wvh_2021_val, Wvh_min_train, Wvh_max_train)
df_Wvh_2021_test_norm = normalise_linear(df_Wvh_2021_test, Wvh_min_train, Wvh_max_train)
df_Wvh_2022_train_norm = normalise_linear(df_Wvh_2022_train, Wvh_min_train, Wvh_max_train)
df_Wvh_val_2022_norm = normalise_linear(df_Wvh_2022_val, Wvh_min_train, Wvh_max_train)
df_Wvh_test_2022_norm = normalise_linear(df_Wvh_2022_test, Wvh_min_train, Wvh_max_train)
df_Wvh_val_2023_norm = normalise_linear(df_Wvh_2023_val, Wvh_min_train, Wvh_max_train)
df_Wvh_test_2023_norm = normalise_linear(df_Wvh_2023_test, Wvh_min_train, Wvh_max_train)

# Pressure
df_P_2017_train_norm = normalise_linear(df_P_2017_train, P_min_train, P_max_train)
df_P_2018_train_norm = normalise_linear(df_P_2018_train, P_min_train, P_max_train)
df_P_2020_train_norm = normalise_linear(df_P_2020_train, P_min_train, P_max_train)
df_P_2021_train_norm = normalise_linear(df_P_2021_train, P_min_train, P_max_train)
df_P_2021_val_norm = normalise_linear(df_P_2021_val, P_min_train, P_max_train)
df_P_2021_test_norm = normalise_linear(df_P_2021_test, P_min_train, P_max_train)
df_P_2022_train_norm = normalise_linear(df_P_2022_train, P_min_train, P_max_train)
df_P_val_2022_norm = normalise_linear(df_P_2022_val, P_min_train, P_max_train)
df_P_test_2022_norm = normalise_linear(df_P_2022_test, P_min_train, P_max_train)
df_P_val_2023_norm = normalise_linear(df_P_2023_val, P_min_train, P_max_train)
df_P_test_2023_norm = normalise_linear(df_P_2023_test, P_min_train, P_max_train)

# SQ
df_SQ_2017_train_norm = normalise_linear(df_SQ_2017_train, SQ_min_train, SQ_max_train)
df_SQ_2018_train_norm = normalise_linear(df_SQ_2018_train, SQ_min_train, SQ_max_train)
df_SQ_2020_train_norm = normalise_linear(df_SQ_2020_train, SQ_min_train, SQ_max_train)
df_SQ_2021_train_norm = normalise_linear(df_SQ_2021_train, SQ_min_train, SQ_max_train)
df_SQ_2021_val_norm = normalise_linear(df_SQ_2021_val, SQ_min_train, SQ_max_train)
df_SQ_2021_test_norm = normalise_linear(df_SQ_2021_test, SQ_min_train, SQ_max_train)
df_SQ_2022_train_norm = normalise_linear(df_SQ_2022_train, SQ_min_train, SQ_max_train)
df_SQ_val_2022_norm = normalise_linear(df_SQ_2022_val, SQ_min_train, SQ_max_train)
df_SQ_test_2022_norm = normalise_linear(df_SQ_2022_test, SQ_min_train, SQ_max_train)
df_SQ_val_2023_norm = normalise_linear(df_SQ_2023_val, SQ_min_train, SQ_max_train)
df_SQ_test_2023_norm = normalise_linear(df_SQ_2023_test, SQ_min_train, SQ_max_train)

In [22]:
if LOG:
    # Only the normalised training frames are checked against [0, 1]: the
    # extremes come from the training data, so validation and test frames
    # may, at least in theory, contain values outside that interval.
    _range_checks = [
        (
            [
                df_NO2_2017_train_norm_1D, df_NO2_2018_train_norm_1D,
                df_NO2_2020_train_norm_1D, df_NO2_2021_train_norm_1D,
                df_NO2_2022_train_norm_1D,
            ],
            'Normalisation of NO2 data',
        ),
        (
            [
                df_O3_2017_train_norm_1D, df_O3_2018_train_norm_1D,
                df_O3_2020_train_norm_1D, df_O3_2021_train_norm_1D,
                df_O3_2022_train_norm_1D,
            ],
            'Normalisation of O3 data',
        ),
        (
            [
                df_PM10_2017_train_norm_1D, df_PM10_2018_train_norm_1D,
                df_PM10_2020_train_norm_1D, df_PM10_2021_train_norm_1D,
                df_PM10_2022_train_norm_1D,
            ],
            'Normalisation of PM10 data',
        ),
        (
            [
                df_PM25_2017_train_norm_1D, df_PM25_2018_train_norm_1D,
                df_PM25_2020_train_norm_1D, df_PM25_2021_train_norm_1D,
                df_PM25_2022_train_norm_1D,
            ],
            'Normalisation of PM25 data',
        ),
        (
            [
                df_temp_2017_train_norm, df_temp_2018_train_norm,
                df_temp_2020_train_norm, df_temp_2021_train_norm,
                df_temp_2022_train_norm,
            ],
            'Normalisation of temperature data',
        ),
        (
            [
                df_dewP_2017_train_norm, df_dewP_2018_train_norm,
                df_dewP_2020_train_norm, df_dewP_2021_train_norm,
                df_dewP_2022_train_norm,
            ],
            'Normalisation of dew point data',
        ),
        (
            [
                df_WD_2017_train_norm, df_WD_2018_train_norm,
                df_WD_2020_train_norm, df_WD_2021_train_norm,
                df_WD_2022_train_norm,
            ],
            'Normalisation of wind direction data',
        ),
        (
            [
                df_Wvh_2017_train_norm, df_Wvh_2018_train_norm,
                df_Wvh_2020_train_norm, df_Wvh_2021_train_norm,
                df_Wvh_2022_train_norm,
            ],
            'Normalisation of wind velocity data',
        ),
        (
            [
                df_P_2017_train_norm, df_P_2018_train_norm,
                df_P_2020_train_norm, df_P_2021_train_norm,
                df_P_2022_train_norm,
            ],
            'Normalisation of pressure data',
        ),
        (
            [
                df_SQ_2017_train_norm, df_SQ_2018_train_norm,
                df_SQ_2020_train_norm, df_SQ_2021_train_norm,
                df_SQ_2022_train_norm,
            ],
            'Normalisation of solar radiation data',
        ),
    ]
    # Same bounds (0 and 1) for every component, checked in the order above
    for _normalised_frames, _label in _range_checks:
        assert_range(_normalised_frames, 0, 1, _label)
    print('(6/8): Normalisation successful')

(6/8): Normalisation successful


### **Create big combined normalised dataframe**

In [23]:
# Assemble, per year and per split, the list of normalised frames that make
# up the model input (u).

keys = ['PM25', 'PM10', 'O3', 'NO2',
        'temp', 'dewP', 'WD', 'Wvh', 'p', 'SQ']


def _select_tuindorp(pollutant_frames):
    """Reduce every given frame to its TUINDORP column via ``.loc[:, [TUINDORP]]``."""
    return [frame.loc[:, [TUINDORP]] for frame in pollutant_frames]


# The pollutant frames hold both Utrecht (Tuindorp) and Breukelen data.
# For u only the Tuindorp column is taken; the Breukelen column is used for
# the targets (y). Each list is ordered as: PM25, PM10, O3, NO2 (Tuindorp
# only), followed by temp, dewP, WD, Wvh, P, SQ.
frames_train_2017_1D_u = _select_tuindorp([
    df_PM25_2017_train_norm_1D, df_PM10_2017_train_norm_1D,
    df_O3_2017_train_norm_1D, df_NO2_2017_train_norm_1D,
]) + [
    df_temp_2017_train_norm, df_dewP_2017_train_norm,
    df_WD_2017_train_norm, df_Wvh_2017_train_norm,
    df_P_2017_train_norm, df_SQ_2017_train_norm,
]
frames_train_2018_1D_u = _select_tuindorp([
    df_PM25_2018_train_norm_1D, df_PM10_2018_train_norm_1D,
    df_O3_2018_train_norm_1D, df_NO2_2018_train_norm_1D,
]) + [
    df_temp_2018_train_norm, df_dewP_2018_train_norm,
    df_WD_2018_train_norm, df_Wvh_2018_train_norm,
    df_P_2018_train_norm, df_SQ_2018_train_norm,
]
frames_train_2020_1D_u = _select_tuindorp([
    df_PM25_2020_train_norm_1D, df_PM10_2020_train_norm_1D,
    df_O3_2020_train_norm_1D, df_NO2_2020_train_norm_1D,
]) + [
    df_temp_2020_train_norm, df_dewP_2020_train_norm,
    df_WD_2020_train_norm, df_Wvh_2020_train_norm,
    df_P_2020_train_norm, df_SQ_2020_train_norm,
]
frames_train_2021_1D_u = _select_tuindorp([
    df_PM25_2021_train_norm_1D, df_PM10_2021_train_norm_1D,
    df_O3_2021_train_norm_1D, df_NO2_2021_train_norm_1D,
]) + [
    df_temp_2021_train_norm, df_dewP_2021_train_norm,
    df_WD_2021_train_norm, df_Wvh_2021_train_norm,
    df_P_2021_train_norm, df_SQ_2021_train_norm,
]
frames_val_2021_1D_u = _select_tuindorp([
    df_PM25_2021_val_norm_1D, df_PM10_2021_val_norm_1D,
    df_O3_2021_val_norm_1D, df_NO2_2021_val_norm_1D,
]) + [
    df_temp_2021_val_norm, df_dewP_2021_val_norm,
    df_WD_2021_val_norm, df_Wvh_2021_val_norm,
    df_P_2021_val_norm, df_SQ_2021_val_norm,
]
frames_test_2021_1D_u = _select_tuindorp([
    df_PM25_2021_test_norm_1D, df_PM10_2021_test_norm_1D,
    df_O3_2021_test_norm_1D, df_NO2_2021_test_norm_1D,
]) + [
    df_temp_2021_test_norm, df_dewP_2021_test_norm,
    df_WD_2021_test_norm, df_Wvh_2021_test_norm,
    df_P_2021_test_norm, df_SQ_2021_test_norm,
]
frames_train_2022_1D_u = _select_tuindorp([
    df_PM25_2022_train_norm_1D, df_PM10_2022_train_norm_1D,
    df_O3_2022_train_norm_1D, df_NO2_2022_train_norm_1D,
]) + [
    df_temp_2022_train_norm, df_dewP_2022_train_norm,
    df_WD_2022_train_norm, df_Wvh_2022_train_norm,
    df_P_2022_train_norm, df_SQ_2022_train_norm,
]
frames_val_2022_1D_u = _select_tuindorp([
    df_PM25_val_2022_norm_1D, df_PM10_val_2022_norm_1D,
    df_O3_val_2022_norm_1D, df_NO2_val_2022_norm_1D,
]) + [
    df_temp_val_2022_norm, df_dewP_val_2022_norm,
    df_WD_val_2022_norm, df_Wvh_val_2022_norm,
    df_P_val_2022_norm, df_SQ_val_2022_norm,
]
frames_val_2023_1D_u = _select_tuindorp([
    df_PM25_val_2023_norm_1D, df_PM10_val_2023_norm_1D,
    df_O3_val_2023_norm_1D, df_NO2_val_2023_norm_1D,
]) + [
    df_temp_val_2023_norm, df_dewP_val_2023_norm,
    df_WD_val_2023_norm, df_Wvh_val_2023_norm,
    df_P_val_2023_norm, df_SQ_val_2023_norm,
]
frames_test_2022_1D_u = _select_tuindorp([
    df_PM25_test_2022_norm_1D, df_PM10_test_2022_norm_1D,
    df_O3_test_2022_norm_1D, df_NO2_test_2022_norm_1D,
]) + [
    df_temp_test_2022_norm, df_dewP_test_2022_norm,
    df_WD_test_2022_norm, df_Wvh_test_2022_norm,
    df_P_test_2022_norm, df_SQ_test_2022_norm,
]
frames_test_2023_1D_u = _select_tuindorp([
    df_PM25_test_2023_norm_1D, df_PM10_test_2023_norm_1D,
    df_O3_test_2023_norm_1D, df_NO2_test_2023_norm_1D,
]) + [
    df_temp_test_2023_norm, df_dewP_test_2023_norm,
    df_WD_test_2023_norm, df_Wvh_test_2023_norm,
    df_P_test_2023_norm, df_SQ_test_2023_norm,
]

In [24]:
# Target frames (y): the four pollutants, restricted to the Breukelen column.


def _select_breukelen(pollutant_frames):
    """Reduce every given frame to its BREUKELEN column via ``.loc[:, [BREUKELEN]]``."""
    return [frame.loc[:, [BREUKELEN]] for frame in pollutant_frames]


# Each list is ordered as: PM25, PM10, O3, NO2.
frames_train_2017_1D_y = _select_breukelen([
    df_PM25_2017_train_norm_1D, df_PM10_2017_train_norm_1D,
    df_O3_2017_train_norm_1D, df_NO2_2017_train_norm_1D,
])
frames_train_2018_1D_y = _select_breukelen([
    df_PM25_2018_train_norm_1D, df_PM10_2018_train_norm_1D,
    df_O3_2018_train_norm_1D, df_NO2_2018_train_norm_1D,
])
frames_train_2020_1D_y = _select_breukelen([
    df_PM25_2020_train_norm_1D, df_PM10_2020_train_norm_1D,
    df_O3_2020_train_norm_1D, df_NO2_2020_train_norm_1D,
])
frames_train_2021_1D_y = _select_breukelen([
    df_PM25_2021_train_norm_1D, df_PM10_2021_train_norm_1D,
    df_O3_2021_train_norm_1D, df_NO2_2021_train_norm_1D,
])
frames_val_2021_1D_y = _select_breukelen([
    df_PM25_2021_val_norm_1D, df_PM10_2021_val_norm_1D,
    df_O3_2021_val_norm_1D, df_NO2_2021_val_norm_1D,
])
frames_test_2021_1D_y = _select_breukelen([
    df_PM25_2021_test_norm_1D, df_PM10_2021_test_norm_1D,
    df_O3_2021_test_norm_1D, df_NO2_2021_test_norm_1D,
])
frames_train_2022_1D_y = _select_breukelen([
    df_PM25_2022_train_norm_1D, df_PM10_2022_train_norm_1D,
    df_O3_2022_train_norm_1D, df_NO2_2022_train_norm_1D,
])
frames_val_2022_1D_y = _select_breukelen([
    df_PM25_val_2022_norm_1D, df_PM10_val_2022_norm_1D,
    df_O3_val_2022_norm_1D, df_NO2_val_2022_norm_1D,
])
frames_val_2023_1D_y = _select_breukelen([
    df_PM25_val_2023_norm_1D, df_PM10_val_2023_norm_1D,
    df_O3_val_2023_norm_1D, df_NO2_val_2023_norm_1D,
])
frames_test_2022_1D_y = _select_breukelen([
    df_PM25_test_2022_norm_1D, df_PM10_test_2022_norm_1D,
    df_O3_test_2022_norm_1D, df_NO2_test_2022_norm_1D,
])
frames_test_2023_1D_y = _select_breukelen([
    df_PM25_test_2023_norm_1D, df_PM10_test_2023_norm_1D,
    df_O3_test_2023_norm_1D, df_NO2_test_2023_norm_1D,
])

In [25]:
# Labels passed along with the frame lists to concat_frames_horizontally.
# The targets are the four pollutants; the inputs are the same four
# pollutants followed by the six meteorological variables.
target_keys = ['PM25', 'PM10', 'O3', 'NO2']
input_keys = target_keys + ['temp', 'dewP', 'WD', 'Wvh', 'p', 'SQ']

In [26]:
def _concat_inputs(input_frames):
    """Join a list of input (u) frames horizontally, labelled with input_keys."""
    return concat_frames_horizontally(input_frames, input_keys)


def _concat_targets(target_frames):
    """Join a list of target (y) frames horizontally, labelled with target_keys."""
    return concat_frames_horizontally(target_frames, target_keys)


# One combined input frame (u) per year and split
df_train_2017_horizontal_u = _concat_inputs(frames_train_2017_1D_u)
df_train_2018_horizontal_u = _concat_inputs(frames_train_2018_1D_u)
df_train_2020_horizontal_u = _concat_inputs(frames_train_2020_1D_u)
df_train_2021_horizontal_u = _concat_inputs(frames_train_2021_1D_u)
df_val_2021_horizontal_u = _concat_inputs(frames_val_2021_1D_u)
df_test_2021_horizontal_u = _concat_inputs(frames_test_2021_1D_u)
df_train_2022_horizontal_u = _concat_inputs(frames_train_2022_1D_u)
df_val_2022_horizontal_u = _concat_inputs(frames_val_2022_1D_u)
df_val_2023_horizontal_u = _concat_inputs(frames_val_2023_1D_u)
df_test_2022_horizontal_u = _concat_inputs(frames_test_2022_1D_u)
df_test_2023_horizontal_u = _concat_inputs(frames_test_2023_1D_u)

# One combined target frame (y) per year and split
df_train_2017_horizontal_y = _concat_targets(frames_train_2017_1D_y)
df_train_2018_horizontal_y = _concat_targets(frames_train_2018_1D_y)
df_train_2020_horizontal_y = _concat_targets(frames_train_2020_1D_y)
df_train_2021_horizontal_y = _concat_targets(frames_train_2021_1D_y)
df_val_2021_horizontal_y = _concat_targets(frames_val_2021_1D_y)
df_test_2021_horizontal_y = _concat_targets(frames_test_2021_1D_y)
df_train_2022_horizontal_y = _concat_targets(frames_train_2022_1D_y)
df_val_2022_horizontal_y = _concat_targets(frames_val_2022_1D_y)
df_val_2023_horizontal_y = _concat_targets(frames_val_2023_1D_y)
df_test_2022_horizontal_y = _concat_targets(frames_test_2022_1D_y)
df_test_2023_horizontal_y = _concat_targets(frames_test_2023_1D_y)

In [27]:
# Final sanity check before exporting: compare the shapes of the combined
# dataframes group by group.

if LOG:
    # Each entry is (dataframes to compare, second flag, message). The two
    # boolean flags are forwarded positionally to assert_equal_shape; the
    # first one is True for every group, the second differs for 2023 only.
    _shape_checks = [
        # 1. u-dataframes of the years that were not split
        (
            [df_train_2017_horizontal_u, df_train_2018_horizontal_u,
             df_train_2020_horizontal_u],
            True,
            'Shape of u-dataframes of 2017, 2018 and 2020',
        ),
        # 2. y-dataframes of the years that were not split
        (
            [df_train_2017_horizontal_y, df_train_2018_horizontal_y,
             df_train_2020_horizontal_y],
            True,
            'Shape of y-dataframes of 2017, 2018 and 2020',
        ),
        # 3. validation/test u-dataframes of the split years
        (
            [df_val_2021_horizontal_u, df_test_2021_horizontal_u,
             df_val_2022_horizontal_u, df_test_2022_horizontal_u],
            True,
            'Shape of u-dataframes of 2021 and 2022',
        ),
        # 4. validation/test y-dataframes of the split years
        (
            [df_val_2021_horizontal_y, df_test_2021_horizontal_y,
             df_val_2022_horizontal_y, df_test_2022_horizontal_y],
            True,
            'Shape of y-dataframes of 2021 and 2022',
        ),
        # 5. all 2023 dataframes (u and y together)
        (
            [df_val_2023_horizontal_u, df_test_2023_horizontal_u,
             df_val_2023_horizontal_y, df_test_2023_horizontal_y],
            False,
            'Shape of 2023 dataframes',
        ),
    ]
    for _checked_frames, _second_flag, _check_label in _shape_checks:
        assert_equal_shape(_checked_frames, True, _second_flag, _check_label)

    print('(7/8): All data concatenations successful')

(7/8): All data concatenations successful


In [28]:
# Write every combined dataframe to the data_combined/ folder as CSV.
# Windowing is not done here; it will be performed by a PyTorch Dataset class
# in the model scripts.

# Options shared by all exports: keep the index, ';' as field separator,
# '.' as decimal mark, UTF-8 encoded.
_csv_options = dict(index = True, sep = ';', decimal = '.', encoding = 'utf-8')

# (dataframe, destination) pairs, written in the listed order:
# first all input (u) frames, then all target (y) frames.
_csv_exports = [
    (df_train_2017_horizontal_u, "../data/data_combined/train_2017_combined_u.csv"),
    (df_train_2018_horizontal_u, "../data/data_combined/train_2018_combined_u.csv"),
    (df_train_2020_horizontal_u, "../data/data_combined/train_2020_combined_u.csv"),
    (df_train_2021_horizontal_u, "../data/data_combined/train_2021_combined_u.csv"),
    (df_val_2021_horizontal_u, "../data/data_combined/val_2021_combined_u.csv"),
    (df_test_2021_horizontal_u, "../data/data_combined/test_2021_combined_u.csv"),
    (df_train_2022_horizontal_u, "../data/data_combined/train_2022_combined_u.csv"),
    (df_val_2022_horizontal_u, "../data/data_combined/val_2022_combined_u.csv"),
    (df_val_2023_horizontal_u, "../data/data_combined/val_2023_combined_u.csv"),
    (df_test_2022_horizontal_u, "../data/data_combined/test_2022_combined_u.csv"),
    (df_test_2023_horizontal_u, "../data/data_combined/test_2023_combined_u.csv"),

    (df_train_2017_horizontal_y, "../data/data_combined/train_2017_combined_y.csv"),
    (df_train_2018_horizontal_y, "../data/data_combined/train_2018_combined_y.csv"),
    (df_train_2020_horizontal_y, "../data/data_combined/train_2020_combined_y.csv"),
    (df_train_2021_horizontal_y, "../data/data_combined/train_2021_combined_y.csv"),
    (df_val_2021_horizontal_y, "../data/data_combined/val_2021_combined_y.csv"),
    (df_test_2021_horizontal_y, "../data/data_combined/test_2021_combined_y.csv"),
    (df_train_2022_horizontal_y, "../data/data_combined/train_2022_combined_y.csv"),
    (df_val_2022_horizontal_y, "../data/data_combined/val_2022_combined_y.csv"),
    (df_val_2023_horizontal_y, "../data/data_combined/val_2023_combined_y.csv"),
    (df_test_2022_horizontal_y, "../data/data_combined/test_2022_combined_y.csv"),
    (df_test_2023_horizontal_y, "../data/data_combined/test_2023_combined_y.csv"),
]

for _export_frame, _export_path in _csv_exports:
    _export_frame.to_csv(_export_path, **_csv_options)

In [29]:
# Report the last pipeline step and the end of data preparation (only when
# logging is enabled). Both messages are printed on their own line.
if LOG:
    print(
        '(8/8): Data exported successfully',
        'Data preparation finished',
        sep='\n',
    )

(8/8): Data exported successfully
Data preparation finished
