Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
188 lines (140 sloc) 5.29 KB
Scrapes data from the ELEXON API
Requires using an ELEXON API key -
make_dates_list() creates a list of dates from a start & num_days
scrape_all_dates() takes a list of dates and scrapes data for each
ReportGrabber scrapes data from Elexon
import argparse
from collections import defaultdict
from datetime import datetime as dt
from datetime import timedelta
import pytz
import requests
import xml.etree.ElementTree as ET
import pandas as pd
def make_dates_list(start_date, num_days):
Creates a list of dates
start_date (str)
days (int) number of days after start date
dates (list) list of strings
start_date = dt.strptime(start_date, '%Y-%m-%d')
return [start_date + timedelta(days=d) for d in range(num_days)]
def scrape_all_dates(name, cols, key, dates):
Takes a list of dates and scrapes Elexon for each
name (str) name of the Elexon report
cols (list) list of columns to get for the report
key (str) API key
dates (list)
report = ReportGrabber(name, cols, key)
# dataframes is a list of reports for each date
dataframes = []
for date in dates:
output_dict = report.scrape_report(date)
return pd.concat(dataframes, axis=0)
class ReportGrabber(object):
Grabs data from Elexon
name (str) name of the Elexon report
data_cols (list) list of columns to get for the report
key (str) API key
local_tz = pytz.timezone('Europe/London')
index_tz = pytz.timezone('GMT')
fmt = '%Y-%m-%d %H:%M:%S'
def __init__(self, name, data_cols, key, price_category='Excess balance'): = name
# relevant only to the imbalance price
self.price_category = price_category
assert isinstance(data_cols, list)
self.data_cols = data_cols
self.columns = ['settlementDate', 'settlementPeriod']
self.key = key
def scrape_report(self, settlement_date):
Gets data for one settlement date
settlement_date (str)
output (dict) {column name : data}
url = self.get_url(settlement_date)
print('scraping {} {}'.format(, settlement_date))
# use the requests library to get the response from this url
req = requests.get(url)
if 'An invalid API key has been passed' in req.text:
raise ValueError('Invalid API key')
self.root = ET.fromstring(req.content)
# iterate over the XML
# save each of the columns into a dict
output = defaultdict(list)
# we can narrow down where we need to look in this XML
for parent in self.root.findall("./responseBody/responseList/item"):
for child in parent:
# condition that only gets the data we want
# if we wanted all raw data we wouldn't do this
if child.tag in self.columns:
return output
def create_dataframe(self, output_dict):
Creates a dataframe from the output dictionary
Will create a dataframe for one settlement_date, as the output_dict
will be data for one settlement_date
output_dict (dict) {column name : data}
output (DataFrame)
# create a dataframe
output = pd.DataFrame().from_dict(output_dict)
# create the time stamp by iterating over each row
# there must be a better way!
for row_idx in range(output.shape[0]):
date = dt.strptime(output.loc[row_idx, 'settlementDate'], '%Y-%m-%d')
stamp = date + timedelta(minutes=30*int(output.loc[row_idx, 'settlementPeriod']))
aware = self.local_tz.localize(stamp)
index = aware.astimezone(self.index_tz)
output.loc[row_idx, 'time_stamp'] = index.strftime(self.fmt)
return output
def get_url(self, settlement_date):
Forms the URL to query the Elexon API
settlement_date (str)
url (str)
url = '{}/'.format(
url += 'v1?APIKey={}&'.format(self.key)
url += 'ServiceType=xml&'
url += 'Period=*&SettlementDate={}'.format(settlement_date)
return url
if __name__ == '__main__':
# send in the ELEXON API key from the command line
parser = argparse.ArgumentParser()
args = parser.parse_args()
key = args.key
# the reports we want data for
# format of {name: columns}
reports = {'B1770': ['imbalancePriceAmountGBP', 'priceCategory'],
'B1780': ['imbalanceQuantityMAW']}
# the dates we want data for
settlementdates = make_dates_list('2010-01-01', 8*365)
# loop over the reports and save each as a csv
for name, cols in reports.items():
data = scrape_all_dates(name, cols, key, settlementdates)