# Find sets of building ids with continuously available data

In [1]:
import os
import re

In [2]:
# Locate the per-building data directories and extract their integer building ids.
data_dir_path = os.path.join('processed_data')
building_dirs = [f for f in os.listdir(data_dir_path) if os.path.isdir(os.path.join(data_dir_path, f))]
regex = re.compile(r'\d+')  # generic "run of digits" pattern, also used to parse years from file names
# Building directories are named 'UCam_Building_b<id>' (the same name is rebuilt from
# the id when the yearly files are listed). Matching the full name avoids two failure
# modes of int(*regex.findall(name)): a name without digits silently becomes id 0
# (int() with no arguments returns 0), and a name with several digit groups raises
# TypeError.
building_dir_regex = re.compile(r'UCam_Building_b(\d+)')
building_ids = sorted(
    int(match.group(1))
    for match in (building_dir_regex.fullmatch(name) for name in building_dirs)
    if match
)

In [3]:
# find years of available data for each building
def _available_years(meter_dir):
    """Return the years for which `meter_dir` holds a CSV file.

    Returns an empty list when `meter_dir` does not exist or is not a directory.
    The year is the single run of digits found in each CSV file name.
    """
    if not os.path.isdir(meter_dir):
        return []
    found = []
    for fname in os.listdir(meter_dir):
        if not (os.path.isfile(os.path.join(meter_dir, fname)) and fname.endswith('.csv')):
            continue
        numbers = regex.findall(fname)
        if not numbers:
            # int(*[]) is int() == 0, which would register a bogus "year 0";
            # skip CSV files whose name carries no number at all.
            continue
        # A name with several digit groups still raises TypeError here, so an
        # unexpected naming scheme fails loudly instead of being misread.
        found.append(int(*numbers))
    return found


# building_years[meter][building id] -> list of years with a CSV file for that meter
building_years = {
    'elec': {},
    'gas': {}
}

for bid in building_ids:

    building_dir = os.path.join(data_dir_path, f'UCam_Building_b{bid}')
    elec_dir = os.path.join(building_dir, 'electricity')
    gas_dir = os.path.join(building_dir, 'gas')

    building_years['elec'][bid] = _available_years(elec_dir)
    building_years['gas'][bid] = _available_years(gas_dir)

In [4]:
# Overall span of years covered by any meter of any building. Every year is pooled
# before taking min/max so that a building with no CSV files at all (two empty
# lists) is tolerated; calling min()/max() per building raises ValueError on it.
all_years = [
    y
    for bid in building_ids
    for y in building_years['elec'][bid] + building_years['gas'][bid]
]
if not all_years:
    raise ValueError('no yearly data files found for any building')
first_year = min(all_years)
last_year = max(all_years)
years = list(range(first_year, last_year+1))

# For each window length d (in years), list every inclusive (start, end) window
# that fits entirely between first_year and last_year.
durations = [5,6,8,10,12,15]
windows = {
    d: [(y, y + d - 1) for y in years if y + d - 1 <= last_year]
    for d in durations
}

In [5]:
# Show every candidate inclusive (start, end) year window, keyed by duration in years.
print(windows)

{5: [(2000, 2004), (2001, 2005), (2002, 2006), (2003, 2007), (2004, 2008), (2005, 2009), (2006, 2010), (2007, 2011), (2008, 2012), (2009, 2013), (2010, 2014), (2011, 2015), (2012, 2016), (2013, 2017), (2014, 2018), (2015, 2019), (2016, 2020), (2017, 2021), (2018, 2022), (2019, 2023)], 6: [(2000, 2005), (2001, 2006), (2002, 2007), (2003, 2008), (2004, 2009), (2005, 2010), (2006, 2011), (2007, 2012), (2008, 2013), (2009, 2014), (2010, 2015), (2011, 2016), (2012, 2017), (2013, 2018), (2014, 2019), (2015, 2020), (2016, 2021), (2017, 2022), (2018, 2023)], 8: [(2000, 2007), (2001, 2008), (2002, 2009), (2003, 2010), (2004, 2011), (2005, 2012), (2006, 2013), (2007, 2014), (2008, 2015), (2009, 2016), (2010, 2017), (2011, 2018), (2012, 2019), (2013, 2020), (2014, 2021), (2015, 2022), (2016, 2023)], 10: [(2000, 2009), (2001, 2010), (2002, 2011), (2003, 2012), (2004, 2013), (2005, 2014), (2006, 2015), (2007, 2016), (2008, 2017), (2009, 2018), (2010, 2019), (2011, 2020), (2012, 2021), (2013, 2022),

In [6]:
# find building ids with data available for each time window
#
# Both results share the layout
#     {duration: {'start:end': [number of buildings, [building ids]]}}
# where a building counts for a window only if it has data for every year in it
# (electricity only for *_elec; electricity and gas for *_both).

# Build the per-building year sets once so each window check is a cheap subset test
# instead of repeated list scans.
elec_year_sets = {bid: set(building_years['elec'][bid]) for bid in building_ids}
both_year_sets = {
    bid: elec_year_sets[bid] & set(building_years['gas'][bid])
    for bid in building_ids
}

periods_availability_elec = {}
periods_availability_both = {}
for d in durations:
    periods_availability_elec[d] = {}
    periods_availability_both[d] = {}
    for start, end in windows[d]:
        label = f'{start}:{end}'
        window_years = set(range(start, end + 1))
        # building_ids order is preserved in the id lists
        elec_ids = [bid for bid in building_ids if window_years <= elec_year_sets[bid]]
        both_ids = [bid for bid in building_ids if window_years <= both_year_sets[bid]]
        periods_availability_elec[d][label] = [len(elec_ids), elec_ids]
        periods_availability_both[d][label] = [len(both_ids), both_ids]

In [7]:
# rank the windows of each duration by how many buildings have complete electricity data
for d in durations:
    print("Duration: %s years" % d)
    ranked = sorted(
        periods_availability_elec[d].items(),
        key=lambda entry: entry[1][0],
        reverse=True,
    )
    # report (window, building count) pairs, omitting windows that no building covers
    print([(label, stats[0]) for label, stats in ranked if stats[0] > 0])

Duration: 5 years
[('2019:2023', 66), ('2006:2010', 59), ('2008:2012', 55), ('2012:2016', 54), ('2015:2019', 54), ('2010:2014', 53), ('2011:2015', 52), ('2013:2017', 52), ('2009:2013', 50), ('2014:2018', 50), ('2005:2009', 45), ('2017:2021', 45), ('2016:2020', 43), ('2018:2022', 42), ('2007:2011', 38), ('2001:2005', 22), ('2003:2007', 22), ('2004:2008', 22), ('2000:2004', 21), ('2002:2006', 21)]
Duration: 6 years
[('2011:2016', 51), ('2012:2017', 51), ('2013:2018', 50), ('2010:2015', 48), ('2008:2013', 47), ('2014:2019', 46), ('2005:2010', 44), ('2009:2014', 43), ('2015:2020', 42), ('2017:2022', 40), ('2006:2011', 38), ('2007:2012', 38), ('2016:2021', 38), ('2018:2023', 38), ('2003:2008', 22), ('2001:2006', 21), ('2002:2007', 21), ('2004:2009', 21), ('2000:2005', 20)]
Duration: 8 years
[('2011:2018', 47), ('2010:2017', 45), ('2012:2019', 45), ('2013:2020', 41), ('2006:2013', 38), ('2009:2016', 37), ('2014:2021', 37), ('2008:2015', 35), ('2016:2023', 33), ('2005:2012', 32), ('2007:2014'

In [8]:
# rank the windows of each duration by how many buildings have complete electricity and gas data
for d in durations:
    print("Duration: %s years" % d)
    ranked = sorted(
        periods_availability_both[d].items(),
        key=lambda entry: entry[1][0],
        reverse=True,
    )
    # report (window, building count) pairs, omitting windows that no building covers
    print([(label, stats[0]) for label, stats in ranked if stats[0] > 0])

Duration: 5 years
[('2018:2022', 24), ('2011:2015', 17), ('2012:2016', 17), ('2013:2017', 16), ('2014:2018', 14), ('2017:2021', 13), ('2015:2019', 12), ('2016:2020', 12), ('2019:2023', 6)]
Duration: 6 years
[('2011:2016', 15), ('2013:2018', 14), ('2012:2017', 12), ('2014:2019', 12), ('2016:2021', 12), ('2015:2020', 11), ('2017:2022', 11)]
Duration: 8 years
[('2013:2020', 11), ('2014:2021', 11), ('2011:2018', 10), ('2012:2019', 10), ('2015:2022', 9)]
Duration: 10 years
[('2012:2021', 10), ('2011:2020', 9), ('2013:2022', 9)]
Duration: 12 years
[('2011:2022', 7)]
Duration: 15 years
[]


In [9]:
# Inspect one 8-year window: [number of buildings, building ids] that have both
# electricity and gas data for every year from 2012 to 2019.
print(periods_availability_both[8]['2012:2019'])

[10, [0, 4, 8, 19, 24, 25, 40, 58, 101, 118]]


In [10]:
# Inspect one 6-year window: [number of buildings, building ids] that have both
# electricity and gas data for every year from 2012 to 2017.
# NOTE(review): the trailing "VoI for LPs" tag is not explained anywhere in this
# notebook - spell out the acronyms and what this building set is used for.
print(periods_availability_both[6]['2012:2017']) # VoI for LPs

[12, [0, 4, 8, 19, 24, 25, 40, 58, 101, 102, 104, 118]]
