# Dataset preparation
Dataset source:
[EEA Air Quality e-Reporting (AQ e-Reporting)](https://www.eea.europa.eu/data-and-maps/data/aqereporting-9)

The dataset will combine PM10, PM2.5, CO and NO2 measurements taken in Bratislava (Slovakia) in 2020.

Searches:
- PM10, BA, 2020-2020, 4 csv files
- PM2.5, BA, 2020-2020, 4 csv files
- CO, BA, 2020-2020, 1 csv file
- NO2, BA, 2020-2020, 3 csv files

In [71]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [72]:
# Read the PM10 time-series export into a DataFrame and preview its first rows.
df_pm10 = pd.read_csv(filepath_or_buffer='raw_data/SK_5_27294_2020_timeseries_PM10_1.csv')
df_pm10.head(n=5)

Unnamed: 0,Countrycode,Namespace,AirQualityNetwork,AirQualityStation,AirQualityStationEoICode,SamplingPoint,SamplingProcess,Sample,AirPollutant,AirPollutantCode,AveragingTime,Concentration,UnitOfMeasurement,DatetimeBegin,DatetimeEnd,Validity,Verification
0,SK,SK.SHMU.AQ,NET-SK001A,STA-SK0001A,SK0001A,SPO-SK0001A_00005_100,SPP-SK_A_BETA_BAM1020_SK0001A_00005_100_101,SPO_F-SK0001A_00005_100_101,PM10,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,16.07,µg/m3,2020-01-01 00:00:00 +01:00,2020-01-01 01:00:00 +01:00,1,1
1,SK,SK.SHMU.AQ,NET-SK001A,STA-SK0001A,SK0001A,SPO-SK0001A_00005_100,SPP-SK_A_BETA_BAM1020_SK0001A_00005_100_101,SPO_F-SK0001A_00005_100_101,PM10,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,120.15,µg/m3,2020-01-01 01:00:00 +01:00,2020-01-01 02:00:00 +01:00,1,1
2,SK,SK.SHMU.AQ,NET-SK001A,STA-SK0001A,SK0001A,SPO-SK0001A_00005_100,SPP-SK_A_BETA_BAM1020_SK0001A_00005_100_101,SPO_F-SK0001A_00005_100_101,PM10,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,52.513,µg/m3,2020-01-01 02:00:00 +01:00,2020-01-01 03:00:00 +01:00,1,1
3,SK,SK.SHMU.AQ,NET-SK001A,STA-SK0001A,SK0001A,SPO-SK0001A_00005_100,SPP-SK_A_BETA_BAM1020_SK0001A_00005_100_101,SPO_F-SK0001A_00005_100_101,PM10,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,40.058,µg/m3,2020-01-01 03:00:00 +01:00,2020-01-01 04:00:00 +01:00,1,1
4,SK,SK.SHMU.AQ,NET-SK001A,STA-SK0001A,SK0001A,SPO-SK0001A_00005_100,SPP-SK_A_BETA_BAM1020_SK0001A_00005_100_101,SPO_F-SK0001A_00005_100_101,PM10,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,26.224,µg/m3,2020-01-01 04:00:00 +01:00,2020-01-01 05:00:00 +01:00,1,1


In [73]:
# Give the generic measurement columns pollutant-specific names so the PM10 and
# PM2.5 values stay distinguishable once both frames are merged.
df_pm10 = df_pm10.rename(
    columns={'Concentration': 'PM10 Concentration', 'UnitOfMeasurement': 'PM10 Unit'}
)

# From the PM2.5 export keep only the interval bounds and the renamed
# measurement columns.
pm2_5_columns = ['DatetimeBegin', 'DatetimeEnd', 'PM2.5 Concentration', 'PM2.5 Unit']
df_pm2_5 = (
    pd.read_csv('raw_data/SK_6001_35900_2020_timeseries_PM25_1.csv')
    .rename(columns={'Concentration': 'PM2.5 Concentration', 'UnitOfMeasurement': 'PM2.5 Unit'})
    .loc[:, pm2_5_columns]
)

In [74]:
# Pair PM10 and PM2.5 readings that cover the same measurement interval.
# The join keys are spelled out instead of being inferred from whichever column
# names the two frames happen to share, so the join cannot silently change if
# either frame gains or loses a column. how='inner' keeps only the intervals
# present in both files.
df = pd.merge(df_pm10, df_pm2_5, how='inner', on=['DatetimeBegin', 'DatetimeEnd'])

# Remove the station / sampling metadata carried over from the PM10 file; the
# timestamps, concentrations and units are what remains.
# NOTE(review): 'Validity' and 'Verification' are dropped without being used to
# filter rows - confirm that unvalidated measurements are meant to be kept.
metadata_columns = [
    'Countrycode', 'Namespace', 'AirQualityNetwork', 'AirQualityStation',
    'SamplingPoint', 'SamplingProcess', 'Sample', 'AveragingTime',
    'AirPollutantCode', 'AirPollutant', 'AirQualityStationEoICode',
    'Validity', 'Verification',
]
df = df.drop(columns=metadata_columns)
df

Unnamed: 0,PM10 Concentration,PM10 Unit,DatetimeBegin,DatetimeEnd,PM2.5 Concentration,PM2.5 Unit
0,16.070,µg/m3,2020-01-01 00:00:00 +01:00,2020-01-01 01:00:00 +01:00,14.328,µg/m3
1,120.150,µg/m3,2020-01-01 01:00:00 +01:00,2020-01-01 02:00:00 +01:00,118.520,µg/m3
2,52.513,µg/m3,2020-01-01 02:00:00 +01:00,2020-01-01 03:00:00 +01:00,45.789,µg/m3
3,40.058,µg/m3,2020-01-01 03:00:00 +01:00,2020-01-01 04:00:00 +01:00,38.463,µg/m3
4,26.224,µg/m3,2020-01-01 04:00:00 +01:00,2020-01-01 05:00:00 +01:00,18.477,µg/m3
...,...,...,...,...,...,...
8447,30.728,µg/m3,2020-12-31 19:00:00 +01:00,2020-12-31 20:00:00 +01:00,29.426,µg/m3
8448,33.860,µg/m3,2020-12-31 20:00:00 +01:00,2020-12-31 21:00:00 +01:00,29.544,µg/m3
8449,34.019,µg/m3,2020-12-31 21:00:00 +01:00,2020-12-31 22:00:00 +01:00,31.430,µg/m3
8450,42.901,µg/m3,2020-12-31 22:00:00 +01:00,2020-12-31 23:00:00 +01:00,36.028,µg/m3


In [75]:
# Load the CO export, keeping only the interval start and the measurement
# columns (renamed so they stay identifiable after the merge).
co_columns = ['CO Concentration', 'CO Unit']
df_co = (
    pd.read_csv('raw_data/SK_10_27210_2020_timeseries_CO.csv')
    .rename(columns={'Concentration': 'CO Concentration', 'UnitOfMeasurement': 'CO Unit'})
    .loc[:, ['DatetimeBegin'] + co_columns]
)

# Left-join on the interval start: all rows of df are kept, and intervals with
# no CO reading get NaN in the CO columns.
# The key is explicit, and CO columns left over from an earlier execution of
# this cell are dropped first. Otherwise re-running the cell would make pandas
# infer the measurement columns as additional join keys.
df = pd.merge(
    df.drop(columns=co_columns, errors='ignore'),
    df_co,
    how='left',
    on='DatetimeBegin',
)

In [76]:
# Load the NO2 export, keeping only the interval start and the measurement
# columns (renamed so they stay identifiable after the merge).
no2_columns = ['NO2 Concentration', 'NO2 Unit']
df_no2 = (
    pd.read_csv('raw_data/SK_8_27303_2020_timeseries_NO2_1.csv')
    .rename(columns={'Concentration': 'NO2 Concentration', 'UnitOfMeasurement': 'NO2 Unit'})
    .loc[:, ['DatetimeBegin'] + no2_columns]
)

# Left-join on the interval start: all rows of df are kept, and intervals with
# no NO2 reading get NaN in the NO2 columns.
# The key is explicit, and NO2 columns left over from an earlier execution of
# this cell are dropped first. Otherwise re-running the cell would make pandas
# infer the measurement columns as additional join keys.
df = pd.merge(
    df.drop(columns=no2_columns, errors='ignore'),
    df_no2,
    how='left',
    on='DatetimeBegin',
)


In [77]:
# Display every row that is missing at least one value after the merges.
rows_with_gaps = df.isnull().any(axis='columns')
df.loc[rows_with_gaps]

Unnamed: 0,PM10 Concentration,PM10 Unit,DatetimeBegin,DatetimeEnd,PM2.5 Concentration,PM2.5 Unit,CO Concentration,CO Unit,NO2 Concentration,NO2 Unit
6,18.4540,µg/m3,2020-01-01 06:00:00 +01:00,2020-01-01 07:00:00 +01:00,12.4750,µg/m3,0.243600,mg/m3,,
18,25.3680,µg/m3,2020-01-01 18:00:00 +01:00,2020-01-01 19:00:00 +01:00,19.6860,µg/m3,,,30.6551,µg/m3
31,26.9920,µg/m3,2020-01-02 07:00:00 +01:00,2020-01-02 08:00:00 +01:00,26.9240,µg/m3,0.531280,mg/m3,,
44,44.5940,µg/m3,2020-01-02 20:00:00 +01:00,2020-01-02 21:00:00 +01:00,40.1950,µg/m3,,,31.2918,µg/m3
56,37.3590,µg/m3,2020-01-03 08:00:00 +01:00,2020-01-03 09:00:00 +01:00,37.1200,µg/m3,0.562252,mg/m3,,
...,...,...,...,...,...,...,...,...,...,...
8409,7.1994,µg/m3,2020-12-30 05:00:00 +01:00,2020-12-30 06:00:00 +01:00,5.7216,µg/m3,0.346840,mg/m3,,
8410,8.0941,µg/m3,2020-12-30 06:00:00 +01:00,2020-12-30 07:00:00 +01:00,5.9164,µg/m3,0.387556,mg/m3,,
8411,13.5610,µg/m3,2020-12-30 07:00:00 +01:00,2020-12-30 08:00:00 +01:00,8.3316,µg/m3,0.464232,mg/m3,,
8417,22.6390,µg/m3,2020-12-30 13:00:00 +01:00,2020-12-30 14:00:00 +01:00,18.0230,µg/m3,0.376304,mg/m3,,


In [78]:
# Write the merged frame to disk as comma-separated UTF-8 text.
# index=True is the to_csv default and is only made explicit here: the row
# index is written as the first column of the file.
df.to_csv(path_or_buf='dataset.csv', sep=',', encoding='utf-8', index=True)