In [1]:
import requests
import xml.etree.ElementTree as ET

from io import StringIO

from LoadingData import LoadFromAPI
import os
import pandas as pd
import numpy as np

## Air Quality API: 空气质量接口：

help网页： [https://api.erg.ic.ac.uk/AirQuality/help](https://api.erg.ic.ac.uk/AirQuality/help)

### Air Quality - MonitoringLocalAuthority

In [48]:
LoadFromAPI.load_data_from_api(url='https://api.erg.ic.ac.uk/AirQuality/Information/MonitoringLocalAuthority/GroupName=London', data_format="xml", filename="Data/AirQuality/MonitoringLocalAuthority.xml")

Data saved to Data/AirQuality/MonitoringLocalAuthority.xml.


### Air Quality - MonitoringSites Metadata

In [51]:
LoadFromAPI.load_data_from_api(url='http://api.erg.ic.ac.uk/AirQuality/Information/MonitoringSites/GroupName=London', data_format="xml", filename="Data/AirQuality/MonitoringSites.xml")

Data saved to Data/AirQuality/MonitoringSites.xml.


### Air Quality - Species Meta Data

In [54]:
LoadFromAPI.load_data_from_api(url='https://api.erg.ic.ac.uk/AirQuality/Information/MonitoringSiteSpecies/GroupName=London', data_format="xml", filename="Data/AirQuality/MonitoringSiteSpecies.xml")

Data saved to Data/AirQuality/MonitoringSiteSpecies.xml.


### Air Quality - Download Data for everySite/Species

get the specific species information of specific site during some period fromdate to enddate from the API

This returns raw data based on 'SiteCode', 'Species', 'StartDate', 'EndDate'. Default time period is 'hourly'. Data returned in CSV format

"https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={SITECODE}/SpeciesCode={SpeciesCode}/StartDate={STARTDATE}/EndDate={ENDDATE}/csv"

In [4]:
# Let's check the site/species data for determin which site remained open in the specific time period(from 2017-12-16 to 2018-03-17 matching the congestion data)

# Load the data

SiteSpecies = pd.read_csv("Data/AirQuality/MonitoringSiteSpecies.csv")

#### Download the NO2 data of the site in the time period from 2017-12-16 to 2018-03-17

In [43]:
Sites_NO2 = SiteSpecies[SiteSpecies['SpeciesCode'] == 'NO2']

Sites_NO2['DateMeasurementFinished'] = pd.to_datetime(Sites_NO2['DateMeasurementFinished'], errors='coerce')
Sites_NO2['DateMeasurementStarted'] = pd.to_datetime(Sites_NO2['DateMeasurementStarted'], errors='coerce')

# set some conditions for filtering
# condition1: 'DateClosed' is NAN or later than'2017-12-16 00:00'
# condition2: 'DateOpened' is earlier than '2018-03-17 00:00'
condition = (Sites_NO2['DateMeasurementFinished'].isna() | (Sites_NO2['DateMeasurementFinished'] > pd.Timestamp('2018-03-17 00:00'))) & (Sites_NO2['DateMeasurementStarted'] < pd.Timestamp('2017-12-16 00:00'))

# apply the filter condition to the dataframe
filtered_Sites_NO2 = Sites_NO2[condition]

# get the site codes for further downloading the data
Sites_Codes_NO2 = filtered_Sites_NO2['SiteCode'].values
print(Sites_Codes_NO2)

['BG1' 'BG2' 'BX2' 'BQ7' 'BX1' 'BT8' 'BT4' 'BT6' 'BT5' 'BY7' 'BL0' 'CD9'
 'IM1' 'CD1' 'CT4' 'CT3' 'CT6' 'CR5' 'CR9' 'CR7' 'EA6' 'EA8' 'EI1' 'EN5'
 'EN1' 'EN4' 'EN7' 'GN0' 'GR7' 'GR4' 'GB6' 'GN4' 'GN3' 'GN5' 'GR9' 'GR8'
 'HK6' 'HG4' 'HG1' 'HG1' 'HR2' 'HR1' 'HV1' 'HV3' 'LH0' 'HI0' 'IS6' 'IS2'
 'KC1' 'KT4' 'LB5' 'LB4' 'LB6' 'LW1' 'LW4' 'LW2' 'ME9' 'NM2' 'NM3' 'RB4'
 'RB7' 'RI2' 'RI1' 'RHG' 'SK5' 'SK6' 'ST8' 'ST5' 'ST4' 'ST6' 'TH4' 'TH2'
 'WAA' 'WAC' 'WA9' 'WA7' 'WA8' 'WAB' 'WA2' 'WM5' 'WM0' 'MY1' 'WM6' 'NB1']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_NO2['DateMeasurementFinished'] = pd.to_datetime(Sites_NO2['DateMeasurementFinished'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_NO2['DateMeasurementStarted'] = pd.to_datetime(Sites_NO2['DateMeasurementStarted'], errors='coerce')


In [44]:

#dont print anything for this code chunk


for site_code in Sites_Codes_NO2:


    url = f"https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={site_code}/SpeciesCode=NO2/StartDate=17 Dec 2017/EndDate=17 Mar 2018/csv"
    filename = f"Data/AirQuality/NO2/{site_code}.csv"

    LoadFromAPI.load_data_from_api(url=url,data_format="csv", filename=filename)

Data saved to Data/AirQuality/NO2/BG1.csv.
Data saved to Data/AirQuality/NO2/BG2.csv.
Data saved to Data/AirQuality/NO2/BX2.csv.
Data saved to Data/AirQuality/NO2/BQ7.csv.
Data saved to Data/AirQuality/NO2/BX1.csv.
Data saved to Data/AirQuality/NO2/BT8.csv.
Data saved to Data/AirQuality/NO2/BT4.csv.
Data saved to Data/AirQuality/NO2/BT6.csv.
Data saved to Data/AirQuality/NO2/BT5.csv.
Data saved to Data/AirQuality/NO2/BY7.csv.
Data saved to Data/AirQuality/NO2/BL0.csv.
Data saved to Data/AirQuality/NO2/CD9.csv.
Data saved to Data/AirQuality/NO2/IM1.csv.
Data saved to Data/AirQuality/NO2/CD1.csv.
Data saved to Data/AirQuality/NO2/CT4.csv.
Data saved to Data/AirQuality/NO2/CT3.csv.
Data saved to Data/AirQuality/NO2/CT6.csv.
Data saved to Data/AirQuality/NO2/CR5.csv.
Data saved to Data/AirQuality/NO2/CR9.csv.
Data saved to Data/AirQuality/NO2/CR7.csv.
Data saved to Data/AirQuality/NO2/EA6.csv.
Data saved to Data/AirQuality/NO2/EA8.csv.
Data saved to Data/AirQuality/NO2/EI1.csv.
Data saved 

#### Download the PM2.5 data of the site in the time period from 2017-12-16 to 2018-03-17

In [45]:
Sites_PM25 = SiteSpecies[SiteSpecies['SpeciesCode'] == 'PM25']

Sites_PM25['DateMeasurementFinished'] = pd.to_datetime(Sites_PM25['DateMeasurementFinished'], errors='coerce')
Sites_PM25['DateMeasurementStarted'] = pd.to_datetime(Sites_PM25['DateMeasurementStarted'], errors='coerce')

# set some conditions for filtering
# condition1: 'DateClosed' is NAN or later than'2017-12-16 00:00'
# condition2: 'DateOpened' is earlier than '2018-03-17 00:00'
condition = (Sites_PM25['DateMeasurementFinished'].isna() | (Sites_PM25['DateMeasurementFinished'] > pd.Timestamp('2018-03-17 00:00'))) & (Sites_PM25['DateMeasurementStarted'] < pd.Timestamp('2017-12-16 00:00'))

# apply the filter condition to the dataframe
filtered_Sites_PM25 = Sites_PM25[condition]

# get the site codes for further downloading the data
Sites_Codes_PM25 = filtered_Sites_PM25['SiteCode'].values
print(Sites_Codes_PM25)

['BX9' 'GB0' 'CD9' 'BL0' 'CT2' 'CT3' 'CD1' 'CR8' 'GN0' 'GR4' 'GN3' 'GR9'
 'HV1' 'LH0' 'KF1' 'HP1' 'LW2' 'RB7' 'TD5' 'ST5' 'TH4' 'WM0' 'MY7']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_PM25['DateMeasurementFinished'] = pd.to_datetime(Sites_PM25['DateMeasurementFinished'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_PM25['DateMeasurementStarted'] = pd.to_datetime(Sites_PM25['DateMeasurementStarted'], errors='coerce')


In [46]:

#dont print anything for this code chunk


for site_code in Sites_Codes_PM25:


    url = f"https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={site_code}/SpeciesCode=PM25/StartDate=17 Dec 2017/EndDate=17 Mar 2018/csv"
    filename = f"Data/AirQuality/PM25/{site_code}.csv"

    LoadFromAPI.load_data_from_api(url=url,data_format="csv", filename=filename)

Data saved to Data/AirQuality/PM25/BX9.csv.
Data saved to Data/AirQuality/PM25/GB0.csv.
Data saved to Data/AirQuality/PM25/CD9.csv.
Data saved to Data/AirQuality/PM25/BL0.csv.
Data saved to Data/AirQuality/PM25/CT2.csv.
Data saved to Data/AirQuality/PM25/CT3.csv.
Data saved to Data/AirQuality/PM25/CD1.csv.
Data saved to Data/AirQuality/PM25/CR8.csv.
Data saved to Data/AirQuality/PM25/GN0.csv.
Data saved to Data/AirQuality/PM25/GR4.csv.
Data saved to Data/AirQuality/PM25/GN3.csv.
Data saved to Data/AirQuality/PM25/GR9.csv.
Data saved to Data/AirQuality/PM25/HV1.csv.
Data saved to Data/AirQuality/PM25/LH0.csv.
Data saved to Data/AirQuality/PM25/KF1.csv.
Data saved to Data/AirQuality/PM25/HP1.csv.
Data saved to Data/AirQuality/PM25/LW2.csv.
Data saved to Data/AirQuality/PM25/RB7.csv.
Data saved to Data/AirQuality/PM25/TD5.csv.
Data saved to Data/AirQuality/PM25/ST5.csv.
Data saved to Data/AirQuality/PM25/TH4.csv.
Data saved to Data/AirQuality/PM25/WM0.csv.
Data saved to Data/AirQuality/PM

#### Download the PM10 data of the site in the time period from 2017-12-16 to 2018-03-17

In [47]:
Sites_PM10 = SiteSpecies[SiteSpecies['SpeciesCode'] == 'PM10']

Sites_PM10['DateMeasurementFinished'] = pd.to_datetime(Sites_PM10['DateMeasurementFinished'], errors='coerce')
Sites_PM10['DateMeasurementStarted'] = pd.to_datetime(Sites_PM10['DateMeasurementStarted'], errors='coerce')

# set some conditions for filtering
# condition1: 'DateClosed' is NAN or later than'2017-12-16 00:00'
# condition2: 'DateOpened' is earlier than '2018-03-17 00:00'
condition = (Sites_PM10['DateMeasurementFinished'].isna() | (Sites_PM10['DateMeasurementFinished'] > pd.Timestamp('2018-03-17 00:00'))) & (Sites_PM10['DateMeasurementStarted'] < pd.Timestamp('2017-12-16 00:00'))

# apply the filter condition to the dataframe
filtered_Sites_PM10 = Sites_PM10[condition]

# get the site codes for further downloading the data
Sites_Codes_PM10 = filtered_Sites_PM10['SiteCode'].values
print(Sites_Codes_PM10)

['BG2' 'BX2' 'BX0' 'BQ8' 'BX1' 'BL0' 'KX4' 'CD9' 'CD1' 'CT3' 'CR9' 'EI3'
 'EA6' 'EA8' 'EI8' 'EI1' 'EN5' 'GN0' 'GR7' 'GR4' 'GB6' 'GN4' 'GN3' 'GN5'
 'GR9' 'GR8' 'GR8' 'HK6' 'HR2' 'HR1' 'HV1' 'HV3' 'LH0' 'IS6' 'IS2' 'KF1'
 'KT4' 'LB5' 'LB4' 'LB6' 'HP1' 'LW4' 'LW2' 'TD5' 'ME2' 'NM2' 'NM3' 'RB4'
 'RB7' 'RI2' 'RI1' 'RHG' 'SK5' 'ST8' 'ST8' 'ST5' 'ST4' 'ST6' 'TH4' 'WAA'
 'WAC' 'WA9' 'WA7' 'WAB' 'WM0' 'MY1' 'MY7' 'WM6']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_PM10['DateMeasurementFinished'] = pd.to_datetime(Sites_PM10['DateMeasurementFinished'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_PM10['DateMeasurementStarted'] = pd.to_datetime(Sites_PM10['DateMeasurementStarted'], errors='coerce')


In [48]:

#dont print anything for this code chunk


for site_code in Sites_Codes_PM10:


    url = f"https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={site_code}/SpeciesCode=PM10/StartDate=17 Dec 2017/EndDate=17 Mar 2018/csv"
    filename = f"Data/AirQuality/PM10/{site_code}.csv"

    LoadFromAPI.load_data_from_api(url=url,data_format="csv", filename=filename)

Data saved to Data/AirQuality/PM10/BG2.csv.
Data saved to Data/AirQuality/PM10/BX2.csv.
Data saved to Data/AirQuality/PM10/BX0.csv.
Data saved to Data/AirQuality/PM10/BQ8.csv.
Data saved to Data/AirQuality/PM10/BX1.csv.
Data saved to Data/AirQuality/PM10/BL0.csv.
Data saved to Data/AirQuality/PM10/KX4.csv.
Data saved to Data/AirQuality/PM10/CD9.csv.
Data saved to Data/AirQuality/PM10/CD1.csv.
Data saved to Data/AirQuality/PM10/CT3.csv.
Data saved to Data/AirQuality/PM10/CR9.csv.
Data saved to Data/AirQuality/PM10/EI3.csv.
Data saved to Data/AirQuality/PM10/EA6.csv.
Data saved to Data/AirQuality/PM10/EA8.csv.
Data saved to Data/AirQuality/PM10/EI8.csv.
Data saved to Data/AirQuality/PM10/EI1.csv.
Data saved to Data/AirQuality/PM10/EN5.csv.
Data saved to Data/AirQuality/PM10/GN0.csv.
Data saved to Data/AirQuality/PM10/GR7.csv.
Data saved to Data/AirQuality/PM10/GR4.csv.
Data saved to Data/AirQuality/PM10/GB6.csv.
Data saved to Data/AirQuality/PM10/GN4.csv.
Data saved to Data/AirQuality/PM

#### Download the SO2 data of the site in the time period from 2017-12-16 to 2018-03-17

In [49]:
Sites_SO2 = SiteSpecies[SiteSpecies['SpeciesCode'] == 'SO2']

Sites_SO2['DateMeasurementFinished'] = pd.to_datetime(Sites_SO2['DateMeasurementFinished'], errors='coerce')
Sites_SO2['DateMeasurementStarted'] = pd.to_datetime(Sites_SO2['DateMeasurementStarted'], errors='coerce')

# set some conditions for filtering
# condition1: 'DateClosed' is NAN or later than'2017-12-16 00:00'
# condition2: 'DateOpened' is earlier than '2018-03-17 00:00'
condition = (Sites_SO2['DateMeasurementFinished'].isna() | (Sites_SO2['DateMeasurementFinished'] > pd.Timestamp('2018-03-17 00:00'))) & (Sites_SO2['DateMeasurementStarted'] < pd.Timestamp('2017-12-16 00:00'))

# apply the filter condition to the dataframe
filtered_Sites_SO2 = Sites_SO2[condition]

# get the site codes for further downloading the data
Sites_Codes_SO2 = filtered_Sites_SO2['SiteCode'].values
print(Sites_Codes_SO2)

['BG1' 'BX1' 'BL0' 'EN4' 'GR4' 'KC1' 'LB5' 'MY1']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_SO2['DateMeasurementFinished'] = pd.to_datetime(Sites_SO2['DateMeasurementFinished'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sites_SO2['DateMeasurementStarted'] = pd.to_datetime(Sites_SO2['DateMeasurementStarted'], errors='coerce')


In [50]:

#dont print anything for this code chunk


for site_code in Sites_Codes_SO2:


    url = f"https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={site_code}/SpeciesCode=SO2/StartDate=17 Dec 2017/EndDate=17 Mar 2018/csv"
    filename = f"Data/AirQuality/SO2/{site_code}.csv"

    LoadFromAPI.load_data_from_api(url=url,data_format="csv", filename=filename)

Data saved to Data/AirQuality/SO2/BG1.csv.
Data saved to Data/AirQuality/SO2/BX1.csv.
Data saved to Data/AirQuality/SO2/BL0.csv.
Data saved to Data/AirQuality/SO2/EN4.csv.
Data saved to Data/AirQuality/SO2/GR4.csv.
Data saved to Data/AirQuality/SO2/KC1.csv.
Data saved to Data/AirQuality/SO2/LB5.csv.
Data saved to Data/AirQuality/SO2/MY1.csv.


**pivot the long format data to wide format data**

In [None]:
# read data from csv file
df = pd.read_csv('Data/AirQuality/BQ7_20180101.csv')
print(df.columns)

MeasurementDateGMT = df.columns[0]
print(MeasurementDateGMT)
field = df.columns[1]
print(field)

Index(['MeasurementDateGMT', 'Bexley - Belvedere West: PM2.5 Particulate (ug/m3)'], dtype='object')
MeasurementDateGMT
Bexley - Belvedere West: PM2.5 Particulate (ug/m3)


In [None]:

# Convert the 'MeasurementDateGMT' to datetime to extract date and hour
df['MeasurementDateGMT'] = pd.to_datetime(df['MeasurementDateGMT'])

# Extract hour as a string with format 'HH:MM' to use as column names
df['hour'] = df['MeasurementDateGMT'].dt.strftime('%H:%M')

# Create a date column from the 'MeasurementDateGMT' datetime
df['date'] = df['MeasurementDateGMT'].dt.date

# Pivot the DataFrame to get the wide format with date as index
df_wide = df.pivot(index='date', columns='hour', values='Bexley - Belvedere West: PM2.5 Particulate (ug/m3)')

# 查看随机几行数据
print(df_wide.sample(15))


hour        00:00  01:00  02:00  03:00  04:00  05:00  06:00  07:00  08:00  \
date                                                                        
2022-05-04   13.3   13.9   15.4   20.2   21.3   20.5   22.7   28.3   31.9   
2022-03-14    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2022-10-29    6.8    5.3    5.3    6.0    6.3    6.0    6.3    6.7    7.9   
2022-03-23    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2022-04-22    8.3    9.7   11.6   12.7   13.4   14.9   16.8   15.4   14.8   
2022-11-18    3.0    2.8    2.9    2.7    2.5    2.4    2.5    2.7    2.9   
2023-02-09   23.8   23.8   23.0   22.4   21.5   20.4   18.1   18.4   18.9   
2022-12-17   36.3   34.0   30.4   27.4   26.6   25.2   17.1    4.4    4.6   
2023-01-10    8.9    8.3    7.8    7.6    7.1    7.4    7.9    7.8    5.4   
2023-01-13    6.7    5.3    6.1    6.2    6.8    6.4    6.1    6.5    6.5   
2022-11-23    6.4    5.9    5.5    4.7    3.6    4.0    4.7    4.4    2.0   

In [None]:
LoadFromAPI.load_data_from_api(url='https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode=BT8/SpeciesCode=NO2/StartDate=01 Mar 2022/EndDate=01 Apr 2023/csv', 
                               data_format="csv", 
                               filename="Data/AirQuality/BQ7_20180101.csv")

Data saved to Data/AirQuality/BQ7_20180101.csv.


In [None]:
# This returns raw data based on 'SiteCode', 'StartDate', 'EndDate'. Data returned in JSON format from traffic counters.
LoadFromAPI.load_data_from_api(url='https://api.erg.ic.ac.uk/AirQuality/Data/Traffic/Site/SiteCode=BT8/StartDate=01 Mar 2022/EndDate=01 Apr 2023/Json', 
                               data_format="json", 
                               filename="Data/AirQuality/BT8_2022_2023.json")

### Potential thing for air quality data

https://www.data.gov.uk/dataset/ef87da6c-0b01-4717-aab4-a076e8b8ff7e/london-atmospheric-emissions-inventory-laei-2013

## Interactive GUI for London Roads Data