# Script to extract aggregate pollution figures for monitoring stations, based on the discomap page
by Andrew Rowley

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Load the list of time-series download links. The file is read without a
# header row, so pandas assigns default integer column labels.
ts_links = pd.read_csv(
    r'C:/DissertationCode/NO2_Prediction/timeseries_data/timeseries_NO2.txt',
    header=None,
)
# Destination of the per-station aggregate CSV written at the end of the notebook.
output_path = "C:/DissertationCode/NO2_Prediction/timeseries_data/pollution_NO2.csv"

In [3]:
# Give the single header-less column a label so later cells can address it by name.
ts_links.columns = pd.Index(["Links"])
# Preview the first five rows to confirm the links were read as expected.
print(ts_links.head(n=5))

                                               Links
0  https://ereporting.blob.core.windows.net/downl...
1  https://ereporting.blob.core.windows.net/downl...
2  https://ereporting.blob.core.windows.net/downl...
3  https://ereporting.blob.core.windows.net/downl...
4  https://ereporting.blob.core.windows.net/downl...


In [4]:
# Disabled debugging aid: load the time series behind a single link (row 617),
# show its first rows, and display the link itself.
# example = pd.read_csv(ts_links["Links"][617])
# example.head()
# ts_links["Links"][617]

In [5]:
def _station_mean(timeseries):
    """Summarise one downloaded time series as a ``{"station", "poll"}`` record.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Frame read from one link; must contain the columns
        "AirQualityStation", "Validity", "Verification" and "Concentration".

    Returns
    -------
    dict
        ``"station"`` is the "AirQualityStation" value of the first row and
        ``"poll"`` is the mean "Concentration" over the rows kept by the
        filter below (NaN when no row is kept).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If the frame has no rows.
    """
    # Positional access: take the first row regardless of the index labels.
    aq_station = timeseries["AirQualityStation"].iloc[0]
    # Keep only rows flagged Validity == 1 and Verification == 1
    # (presumably "valid" and "verified" in the source schema -- TODO confirm).
    is_usable = (timeseries["Validity"] == 1) & (timeseries["Verification"] == 1)
    average_pollutant = timeseries.loc[is_usable, "Concentration"].mean()
    return {"station": aq_station, "poll": average_pollutant}


pollution_dicts = []
# (link, error) pairs for every link that could not be downloaded or parsed,
# so dropped stations are visible instead of silently disappearing.
failed_links = []
for link in tqdm(ts_links["Links"]):
    try:
        # read in the time series and aggregate it into one record
        timeseries = pd.read_csv(link, encoding = "ISO-8859-1")
        pollution_dicts.append(_station_mean(timeseries))
    except Exception as exc:
        # Best effort: one bad link must not abort the whole (long) run,
        # but the failure is recorded for later inspection.
        failed_links.append((link, repr(exc)))

if failed_links:
    print(f"{len(failed_links)} of {len(ts_links)} links could not be processed; "
          "see `failed_links` for details.")

100%|██████████| 11018/11018 [1:18:05<00:00,  2.35it/s]


In [6]:
# Build the per-station table from the collected records. The column list is
# given explicitly so the frame (and the CSV written from it) keeps the
# "station"/"poll" columns even when no link could be processed.
df = pd.DataFrame(pollution_dicts, columns=["station", "poll"])
print(df.head())
print(df.tail())

       station       poll
0  STA-AD0942A  26.551033
1  STA-AL0203A  21.542964
2  STA-AL0205A  13.488444
3  STA-AL0206A  18.822223
4  STA-AL0207A  23.004833
           station       poll
11011  STA-XK0008A  18.437177
11012  STA-XK0009A  19.102971
11013  STA-XK0010A   8.478350
11014  STA-XK0011A  18.064364
11015  STA-XK0012A  10.460738


In [7]:
# Persist the per-station averages; the integer row index is not written.
df.to_csv(path_or_buf=output_path, index=False)