In [52]:
import pandas as pd
import numpy as np
import os
import datetime as dt

In [53]:
computer_path = os.getcwd()
path_input_data = os.getcwd()+os.path.sep+"Data" # path to input data such as lookup tables for OA, MSOA, LA, etc.
path_EPCs = r"D:\OneDrive - Cardiff University\04 - Projects\03 - PhD\03 - Analysis\03 - LSOAs\00 - Data\EPC and energy efficiency\all-domestic-certificates" # path to the folder with the EPCs certificates of all local authorities in England and Wales
saving_path = computer_path

## Input parameters

In [54]:
# specify the name of the local authorit(y/ies) you would like to calculate heat demand data for
# if no name specified, this is done for all local authorities in England and Wales.

local_authority = []
potential = True #potential variable set to True to calculate heat demand after considering energy efficiency measures

if len(local_authority)>0:
    output_file_name = "LSOAs_in_selected_LAs"
else:
    output_file_name ="LSOAs_in_England_Wales"
    
if potential:
    output_file_name = output_file_name + "_after_EE_heat_demand"
else:
    output_file_name = output_file_name + "_before_EE_heat_demand"


### Import lookup table linking posctode, LSOA, MSOA and Region

In [55]:
def log_time_shape(func):
    def wrapper(dataf, *args, **kwargs):
        time_start = dt.datetime.now()
        result = func(dataf, *args, **kwargs)
        time_end = dt.datetime.now()
        print(f"{func.__name__} took {time_end-time_start} shape={result.shape}")
        return result
    return wrapper

In [56]:
# 
def get_location_lookup_df(local_authorities=[]):
    
    OA_lookup_file = r"PCD11_OA11_LSOA11_MSOA11_LAD11_EW_LU_feb_2018.csv"
    OA_lookup_df = pd.read_csv(path_input_data+os.path.sep+OA_lookup_file, low_memory=False)
    OA_lookup_df.drop(["pcd8", "pcds", "dointr", "doterm", "usertype", "lsoa11nm", "msoa11nm","ladnmw", "FID"], axis=1, inplace=True) 
    OA_lookup_df.drop_duplicates(inplace=True)

    region_lookup_file = r"laregionlookup376las.xls"
    region_lookup_df = pd.read_excel(path_input_data+os.path.sep+region_lookup_file, sheet_name=1, header=6, usecols=[0, 1, 3])

    OA_lookup_df = pd.merge(OA_lookup_df, region_lookup_df, left_on='ladcd', right_on='la_code', how='left')

    OA_lookup_df = pd.merge(OA_lookup_df, region_lookup_df, left_on='ladnm', right_on='la_name', how='left')
    OA_lookup_df["la_code_x"].fillna(OA_lookup_df["la_code_y"], inplace=True)
    OA_lookup_df["la_name_x"].fillna(OA_lookup_df["la_name_y"], inplace=True)
    OA_lookup_df["region_name_x"].fillna(OA_lookup_df["region_name_y"], inplace=True)
    OA_lookup_df.drop(["ladcd", "ladnm", "la_code_y","la_name_y", "region_name_y"], axis=1, inplace=True)
    OA_lookup_df.dropna(subset=["la_name_x"], inplace=True)
    OA_lookup_df["pcd7"]=OA_lookup_df["pcd7"].str.replace(' ','')
    OA_lookup_df.columns = ['PCD7', 'OA11CD', 'LSOA11CD', 'MSOA11CD', 'LAD11CD', 'Local Authority', 'Region']

    if len(local_authority)>0:
        OA_lookup_df=OA_lookup_df.loc[OA_lookup_df['Local Authority'].isin(local_authority),:]

    final_df = OA_lookup_df.loc[:, ['PCD7', 'OA11CD', "LSOA11CD", "MSOA11CD", "LAD11CD", "Local Authority"]].drop_duplicates()
    
    # uniformisation of the string values
    final_df["Local Authority nospace"] = final_df["Local Authority"].str.replace('-','', regex=False)
    final_df["Local Authority nospace"] = final_df["Local Authority nospace"].str.replace('.','', regex=False)
    final_df["Local Authority nospace"] = final_df["Local Authority nospace"].str.replace("'",'', regex=False)
    final_df["Local Authority nospace"] = final_df["Local Authority nospace"].str.replace(' ','', regex=False)
    return final_df


def get_OA_LSOA_dict(dataf): 
    temp_df = dataf.loc[:, ["OA11CD", "LSOA11CD"]].drop_duplicates()
    dict_map = dict(zip(dataf['OA11CD'].values, dataf['LSOA11CD'].values))
    return dict_map

@log_time_shape
def groupby_LSOA(dataf, location_df):
    dict_map = get_OA_LSOA_dict(location_df)
    dataf['LSOA11CD'] = dataf['OA_Code'].map(dict_map)
    dataf = dataf.groupby('LSOA11CD').sum()
    rurality_columns= [x for x in dataf.columns if 'Rurality_' in x]
    dataf["Rurality"] = dataf[rurality_columns].idxmax(axis=1)
    dataf.drop(rurality_columns, axis=1, inplace=True)
    
    return dataf

In [57]:
location_lookup_df = get_location_lookup_df()

### Add rurality information

In [58]:
def get_rurality_df():
    # Create dataframe with information about Number of households and rurality type of OAs
    rurality_file = "census_OA_data_England_and_Wales.xlsx"
    dataf = pd.read_excel(path_input_data+os.path.sep+rurality_file)
    dataf = dataf[["N_Households", "Rurality", 'OA_Code']]
    dataf["Rurality"].replace({'Village': 'Village, Town and Fringe',
                                  'Town and Fringe': 'Village, Town and Fringe'},
                                    inplace=True)
    dataf = pd.get_dummies(dataf, columns=["Rurality"])
    return dataf

@log_time_shape
def add_MSOA11CD_LAD11CD_LA(dataf, location_lookup_df):
    lookup_df = location_lookup_df.loc[:,'LSOA11CD': ].drop_duplicates().copy()
    dataf = dataf.merge(lookup_df, left_index=True, right_on='LSOA11CD', how='left')
    
    dataf.set_index("LSOA11CD", inplace=True, drop=True)
    return dataf

### Add census data dwelling type by heating type

In [59]:
@log_time_shape
def add_dwelling_categories(dataf):
    dwelling_heating_file = "CT0213.xls"

    frame = []

    dwelling_type = ["Detached", "Semi-detached", "Terraced", "Flat"]
    heating_type = ["No central heating", "Gas boiler", 'Resistance heating', "Oil boiler", "Solid fuel boiler"]
    count = 2

    for dwelling in dwelling_type:
        df = pd.read_excel(path_input_data+os.path.sep+dwelling_heating_file, sheet_name=count, header=11, index_col=0)
        df.drop(df.columns[0], axis=1, inplace=True)
        df.drop(df.columns[5], axis=1, inplace=True)
        df.columns = [dwelling + " " +heating+ " 2011" for heating in heating_type] 
        df["Code"] = df.index.str.strip().str.split(' ').str.get(0)
        df.set_index("Code", inplace=True, drop=True)
        df.dropna(inplace=True)
#         print(df.shape)
        count = count + 1
        frame.append(df)
    dwelling_heating_df = pd.concat(frame, axis=1, sort=False)
    
    dataf = pd.merge(dataf, dwelling_heating_df,left_index=True, right_index=True, how='left')
    
    return dataf

### Add area to LSOA

In [60]:
@log_time_shape
def add_area_LSOA(dataf):
    file = 'UK_2011_LSOA_with_area.csv'
    area_df = pd.read_csv(path_input_data+os.path.sep+file, index_col=1, dtype={"Area":float})
    area_df["Area"] = area_df["Area"]/1000000
    area_df.rename(columns = {'Area':'Area (km2)'}, inplace=True)
    area_df.head()
    
    dataf = pd.merge(dataf, area_df['Area (km2)'],left_index=True, right_index=True, how='left')
    dataf['Area (km2)'].fillna(-1, inplace=True)

    return dataf

### Add road length to LSOA

In [61]:
@log_time_shape
def add_road_length(dataf, location_df):
    file = 'UK_2011_LSOA_with_road.csv'
    road_df = pd.read_csv(path_input_data+os.path.sep+file, dtype={"Sum_Shape_":float})
    road_df.rename(columns={'Sum_Shape_':'Road length (m)'}, inplace=True)
    road_df = pd.merge(location_df[["OA11CD","LSOA11CD"]].drop_duplicates(), road_df[["geo_code", "Road length (m)"]], left_on="OA11CD", right_on="geo_code", how="left")
    road_df = road_df.groupby(["LSOA11CD"]).sum().reset_index()
    road_df['Road length (m)'].fillna(-1, inplace=True)

    dataf = pd.merge(dataf, road_df,left_index=True, right_on="LSOA11CD", how='left')
    return dataf

### Add number of households in 2018

In [62]:
@log_time_shape
def add_number_households_2018(dataf):
    ## Add data for 2018
    frames = []

    year_list=['2018']

    for yr in year_list:
        temp_df = pd.read_csv(path_input_data+os.path.sep+"LSOA_domestic_"+str(yr)+".csv",na_values='-')
        cols_to_keep = ["Lower Layer Super Output Area (LSOA) Code" , "Total number of domestic electricity meters"]
        temp_df = temp_df.loc[:, cols_to_keep].copy()
        temp_df.columns = ["LSOA code "+yr, 
                            "Total number of domestic electricity meters "+yr, 
                           ]
        temp_df["LSOA code "+yr]=temp_df["LSOA code "+yr].str.strip()
        temp_df.set_index("LSOA code "+yr, inplace=True)
        dataf = pd.merge(dataf, temp_df,left_on="LSOA11CD", right_on="LSOA code "+yr, how='left')
        for col in temp_df.iloc[:,:].columns:
            dataf[col].fillna(0, inplace=True)
            
            
    dataf.rename(columns={'N_Households':'N_Households 2011'}, inplace=True) #
    print("Number of households in 2011: ", '{:,}'.format(dataf['N_Households 2011'].sum()))

    # the number of electricity meters is a good proxy to estimate the number of households per LSOA in 2018
    dataf['N_Households 2018'] = dataf['Total number of domestic electricity meters 2018'] 
    print("Number of households in 2018: ", '{:,}'.format(dataf['N_Households 2018'].sum()))
    
    heating_cols = [
                'Detached Gas boiler 2011' ,
                'Detached Resistance heating 2011' ,
                'Detached Oil boiler 2011' ,
                'Detached Solid fuel boiler 2011' ,
                'Semi-detached Gas boiler 2011' ,
                'Semi-detached Resistance heating 2011' ,
                'Semi-detached Oil boiler 2011' ,
                'Semi-detached Solid fuel boiler 2011' ,
                'Terraced Gas boiler 2011' ,
                'Terraced Resistance heating 2011' ,
                'Terraced Oil boiler 2011' ,
                'Terraced Solid fuel boiler 2011' ,
                'Flat Gas boiler 2011' ,
                'Flat Resistance heating 2011' ,
                'Flat Oil boiler 2011' ,
                'Flat Solid fuel boiler 2011' ]
    
    for nb_install in heating_cols:
        print(nb_install)
        dataf[nb_install[:-5]+" 2018"] = round(dataf[nb_install]/dataf["N_Households 2011"]*dataf['N_Households 2018'], 0)
    
    dataf.set_index("LSOA11CD", inplace=True)
    return dataf

In [63]:
oa_df = get_rurality_df()
oa_df.head()

Unnamed: 0,N_Households,OA_Code,Rurality_Hamlet & Isolated Dwellings,Rurality_Urban >10K,"Rurality_Village, Town and Fringe"
0,119,E00078837,0,0,1
1,122,E00078838,0,0,1
2,97,E00078839,0,0,1
3,78,E00078840,1,0,0
4,117,E00078841,0,0,1


In [64]:
lsoa_df = (oa_df
    .pipe(groupby_LSOA, location_lookup_df)
    .pipe(add_MSOA11CD_LAD11CD_LA, location_lookup_df)
    .pipe(add_dwelling_categories)
    .pipe(add_area_LSOA)
    .pipe(add_road_length, location_lookup_df)
    .pipe(add_number_households_2018))

groupby_LSOA took 0:00:01.031256 shape=(34753, 2)
add_MSOA11CD_LAD11CD_LA took 0:00:00.807841 shape=(34753, 6)
add_dwelling_categories took 0:00:19.362308 shape=(34753, 26)
add_area_LSOA took 0:00:00.180489 shape=(34753, 27)
add_road_length took 0:00:01.207758 shape=(34753, 29)
Number of households in 2011:  23,361,222
Number of households in 2018:  25,553,711
Detached Gas boiler 2011
Detached Resistance heating 2011
Detached Oil boiler 2011
Detached Solid fuel boiler 2011
Semi-detached Gas boiler 2011
Semi-detached Resistance heating 2011
Semi-detached Oil boiler 2011
Semi-detached Solid fuel boiler 2011
Terraced Gas boiler 2011
Terraced Resistance heating 2011
Terraced Oil boiler 2011
Terraced Solid fuel boiler 2011
Flat Gas boiler 2011
Flat Resistance heating 2011
Flat Oil boiler 2011
Flat Solid fuel boiler 2011
add_number_households_2018 took 0:00:00.187238 shape=(34753, 46)


In [65]:
lsoa_df

Unnamed: 0_level_0,N_Households 2011,Rurality,MSOA11CD,LAD11CD,Local Authority,Local Authority nospace,Detached No central heating 2011,Detached Gas boiler 2011,Detached Resistance heating 2011,Detached Oil boiler 2011,...,Semi-detached Oil boiler 2018,Semi-detached Solid fuel boiler 2018,Terraced Gas boiler 2018,Terraced Resistance heating 2018,Terraced Oil boiler 2018,Terraced Solid fuel boiler 2018,Flat Gas boiler 2018,Flat Resistance heating 2018,Flat Oil boiler 2018,Flat Solid fuel boiler 2018
LSOA11CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E01000001,876,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,10.0,0.0,0.0,95.0,680.0,12.0,0.0
E01000002,830,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,32.0,0.0,0.0,50.0,717.0,8.0,0.0
E01000003,817,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,718.0,302.0,1.0,0.0
E01000005,467,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,270.0,118.0,30.0,0.0
E01000006,543,Rurality_Urban >10K,E02000017,E09000002,Barking and Dagenham,BarkingandDagenham,2.0,26.0,1.0,0.0,...,0.0,0.0,256.0,7.0,0.0,1.0,25.0,86.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W01001954,637,Rurality_Urban >10K,W02000372,W06000015,Cardiff,Cardiff,0.0,274.0,0.0,1.0,...,0.0,0.0,37.0,1.0,0.0,0.0,77.0,70.0,0.0,0.0
W01001955,1153,Rurality_Urban >10K,W02000192,W06000011,Swansea,Swansea,0.0,10.0,6.0,0.0,...,1.0,0.0,38.0,4.0,0.0,0.0,604.0,602.0,2.0,0.0
W01001956,803,Rurality_Urban >10K,W02000190,W06000011,Swansea,Swansea,1.0,259.0,3.0,3.0,...,1.0,1.0,92.0,19.0,0.0,0.0,86.0,11.0,0.0,1.0
W01001957,411,Rurality_Urban >10K,W02000192,W06000011,Swansea,Swansea,0.0,3.0,1.0,0.0,...,0.0,0.0,222.0,7.0,0.0,1.0,139.0,85.0,0.0,0.0


In [66]:
copy_lsoa = lsoa_df.copy()

## EPC data
information regarding each field is described in : https://epc.opendatacommunities.org/docs/guidance#technical_notes

In [67]:
def mergeColumn(df, target, suffixes=["_x", "_y"]):
    list_col = df.columns
    org_col = target+suffixes[0]
    repl_col = target+suffixes[1]
    boolean=False
    if org_col in list_col:
        df[target] = df[org_col].fillna(df[repl_col])
        df.drop([org_col, repl_col], axis=1, inplace=True)
        boolean=True
    return boolean  

def getEPC(path, location_df):
    file = 'certificates.csv'
    df = pd.read_csv(path+os.path.sep+file)
    df["POSTCODE"] = df["POSTCODE"].str.replace(' ','')
    df=pd.merge(df, location_df.loc[:,["PCD7", "LSOA11CD", "MSOA11CD", "LAD11CD"]],left_on="POSTCODE", right_on="PCD7", how='left')

    df.drop(["PCD7"], axis=1, inplace=True)
    df.replace('Bungalow', 'House', regex=True, inplace=True)
    df.replace('Maisonette', 'Flat', regex=True, inplace=True)
    df["Heat consumption (kWh)"]=0
    df["Space constrained ratio"]=0
    df["Heat_rating_changes [%]"]=0
    df["Heat_efficiency_changes [%]"]=0
    print(df.shape)
    return df

    
def setFlag(df):
    #Remove rows without a valid LSOA, MSOA or LA
    df["FLAG LSOA"] = df["LSOA11CD"].astype(str).str.len()==9
    df.drop(df[df["FLAG LSOA"]==False].index, inplace=True)

    df["FLAG MSOA"] = df["MSOA11CD"].astype(str).str.len()==9
    df.drop(df[df["FLAG MSOA"]==False].index, inplace=True)

    df["FLAG LA"] = df["LAD11CD"].astype(str).str.len()==9
    df.drop(df[df["FLAG LA"]==False].index, inplace=True)
    
    return df

def costTokWh(df):
    print("list of fuels: ", df["MAIN_FUEL"].unique()) 
    if potential:
        cost_column = "HEATING_COST_POTENTIAL"
    else:
        cost_column = "HEATING_COST_CURRENT" 
    #USING SAP price from january 2016: https://www.bre.co.uk/filelibrary/SAP/2012/RdSAP-fuel-prices-from-January-2018.xlsx
    price_of_elec = 15.32/100 #df_fuels.loc["Electricity", "Cost [GBP/kWh]"]
    price_of_gas = 4.32/100 #df_fuels.loc["Ngas", "Cost [GBP/kWh]"] 
    price_of_oil = 5.06/100 #df_fuels.loc["Oil", "Cost [GBP/kWh]"] 
    price_of_solid_fuel = 5.67/100 #df_fuels.loc["Biomass", "Cost [GBP/kWh]"] 

    #Efficiency from "combined"  individual_tech database
    eff_elec = 1
    eff_gas = 0.84
    eff_oil = 0.84
    eff_solid_fuel = 0.82
    eff_hp = 3.4
    
    df["Heat system"] = None
    df["MAINHEAT_DESCRIPTION"].fillna("No value", inplace=True)
    df["Heat consumption (kWh)"].dropna(inplace=True)
    
    gas_list = ["mains gas (not community)",
                #'mains gas - this is for backwards compatibility only and should not be used'
                ]
    
    df.loc[df["MAIN_FUEL"].isin(gas_list), "Heat consumption (kWh)"] = df.loc[df["MAIN_FUEL"].isin(gas_list), cost_column]/price_of_gas*eff_gas
    df.loc[df["MAIN_FUEL"].isin(gas_list), "Heat system"] = "Gas boiler"

    electricity_list = ["electricity (not community)", 'electricity - this is for backwards compatibility only and should not be used',
                       'Electricity: electricity, unspecified tariff']
    
    df.loc[df["MAIN_FUEL"].isin(electricity_list), "Heat system"]='Resistance heating'
    df.loc[df["MAINHEAT_DESCRIPTION"].str.contains("heat pump"),"Heat system"]="Heat pump"
    
    df.loc[df["Heat system"]=='Resistance heating', "Heat consumption (kWh)"] = df.loc[df["Heat system"]=='Resistance heating', cost_column]/price_of_elec*eff_elec
    df.loc[df["Heat system"]=="Heat pump", "Heat consumption (kWh)"] = df.loc[df["Heat system"]=="Heat pump", cost_column]/price_of_elec*eff_hp

    oil_list = ['appliances able to use mineral oil or liquid biofuel','oil (not community)', 'oil - this is for backwards compatibility only and should not be used']
    
    df.loc[df["MAIN_FUEL"].isin(oil_list), "Heat system"] = "Oil boiler"
    df.loc[df["Heat system"]=="Oil boiler", "Heat consumption (kWh)"] = df.loc[df["Heat system"]=="Oil boiler", cost_column]/price_of_oil*eff_oil

    solid_fuels = ['anthracite', 'house coal (not community)','wood logs', 'smokeless coal', 'house coal - this is for backwards compatibility only and should not be used',
                  'bulk wood pellets', 'wood chips']

    df.loc[df["MAIN_FUEL"].isin(solid_fuels), "Heat system"]="Solid fuel boiler"
    df.loc[df["Heat system"]=="Solid fuel boiler", "Heat consumption (kWh)"] = df.loc[df["Heat system"]=="Solid fuel boiler", cost_column]/price_of_solid_fuel*eff_solid_fuel
    return df

def threshold(df):
    # Remove inaccurate value from the dataset. e.g. everything under 25 kWh/m2
    min_kWh_m2 = 15 # minimum intensity Passive House (Passivhaus) standard is at 15 kWh/m2
    max_kWh_m2 = 400 
    max_m2 = 1000
    min_number_room = 1
    
    df["Heat per floor area (kWh/m2)"]=df["Heat consumption (kWh)"]/df["TOTAL_FLOOR_AREA"]
    print(df.loc[(df["Heat per floor area (kWh/m2)"]<min_kWh_m2),:].shape)
    df.fillna(0, inplace=True)
    to_drop = df.loc[(df["Heat per floor area (kWh/m2)"]<min_kWh_m2),:].index
    df.drop(to_drop, inplace=True)
    print(df.loc[(df["Heat per floor area (kWh/m2)"]>max_kWh_m2),:].shape)
    to_drop = df.loc[(df["Heat per floor area (kWh/m2)"]>max_kWh_m2),:].index
    df.drop(to_drop, inplace=True)
       
    
    # Used to assess the sapce availability to change heating system 
    # Source: Element Energy, “Analysis on abating direct emissions from ‘hard-to-decarbonise’ homes, with a view to informing the UK’s long term targets,” 2019.
    df["Space constrained ratio"] = df["TOTAL_FLOOR_AREA"]/df["NUMBER_HEATED_ROOMS"]
    df.loc[df["Space constrained ratio"]<=16, "Space constrained ratio"]=1
    df.loc[df["Space constrained ratio"]>16, "Space constrained ratio"]=0
    
    # Change in EPC and impact on heat
    to_drop = df.loc[df["POTENTIAL_ENERGY_EFFICIENCY"]<df["CURRENT_ENERGY_EFFICIENCY"], :].index
    df.drop(to_drop, inplace=True)
    
    to_drop = df.loc[df["HEATING_COST_CURRENT"]<df["HEATING_COST_POTENTIAL"]*(1+0.1), :].index
    df.drop(to_drop, inplace=True)
    
    df["Heat_rating_changes [%]"] = (df["HEATING_COST_POTENTIAL"] - df["HEATING_COST_CURRENT"])/df["HEATING_COST_CURRENT"]
    df["Heat_efficiency_changes [%]"] = (df["HEATING_COST_POTENTIAL"] - df["HEATING_COST_CURRENT"])/df["HEATING_COST_CURRENT"]/(df["POTENTIAL_ENERGY_EFFICIENCY"] - df["CURRENT_ENERGY_EFFICIENCY"])

    df["Heat_rating_changes [%]"].replace([np.inf, -np.inf], 0, inplace=True)
    df["Heat_efficiency_changes [%]"].replace([np.inf, -np.inf], 0, inplace=True)
    #Remove houses with less than 1 heated room
    to_drop = df.loc[(df["NUMBER_HEATED_ROOMS"]<min_number_room),:].index
    df.drop(to_drop, inplace=True)
    
    return df


def removeDuplicates(df):
    # Keep only the most uptodate EPC
    
    df = df.sort_values(["BUILDING_REFERENCE_NUMBER", "INSPECTION_DATE"])
    df.drop_duplicates(["BUILDING_REFERENCE_NUMBER"], keep="last", inplace=True)
    
    return df

def dwellingType(df):
    
    df["Dwelling type"]=None
    df.loc[df["PROPERTY_TYPE"]=="Flat", "Dwelling type"]="Flat"
    df.loc[df["PROPERTY_TYPE"]=="House", "Dwelling type"]=df.loc[df["PROPERTY_TYPE"]=="House", "BUILT_FORM"]
    list_dwelling_type = list(df["Dwelling type"].unique())
    list_dwelling_type = list(filter(None.__ne__, list_dwelling_type)) # Remove None from the list
    # temp_df.replace({dwel:"Terrace" for dwel in list_dwelling_type if "Terrace" in dwel}, regex=True, inplace=True)
   
    to_replace = {dwel:"Terraced" for dwel in list_dwelling_type if "Terrace" in dwel}
    df.replace(to_replace, inplace=True)
    df.replace("Semi-Detached", "Semi-detached", inplace=True)
    list_dwelling_type = list(df["Dwelling type"].unique())
    print(list_dwelling_type)
    
    to_drop = df.loc[(df["Dwelling type"]=='NO DATA!'),:].index
    df.drop(to_drop, inplace=True)
    
    df["Key"] = df["Dwelling type"].str.lower()+" "+df["Heat system"].str.lower()
    df["Key_EPC"] = df["CURRENT_ENERGY_RATING"]+" to "+df["POTENTIAL_ENERGY_RATING"]
    df.dropna(subset=['Key'], inplace=True)
    print("Shape:", df.shape)
    
    
    return df

def skeleton(LSOA_lookup):
    LSOA_lookup = LSOA_lookup.loc[:,['LSOA11CD', 'MSOA11CD', 'LAD11CD', 'Local Authority']].drop_duplicates()
    index = list(map(list, zip(*LSOA_lookup.values)))
    index = pd.MultiIndex.from_arrays(index, names=LSOA_lookup.columns)
    dwelling_type = ["Detached", "Semi-detached", "Terraced", "Flat"]
    heating_type = ["Gas boiler", 'Resistance heating', "Oil boiler", "Solid fuel boiler", "Heat pump"]
    list_col = ["Mean "+x.lower()+" "+y.lower() for x in dwelling_type for y in heating_type]
    list_col.extend(["Average floor area of "+x.lower()+" "+y.lower() for x in dwelling_type for y in heating_type])
    list_col.extend(["Average number of rooms of "+x.lower()+" "+y.lower() for x in dwelling_type for y in heating_type])
    list_col.extend(["Average space constrained "+x.lower()+" "+y.lower() for x in dwelling_type for y in heating_type])
    EPC_ratings = ["A", "B", "C", "D", "E", "F", "G"]
    EPC_rating_cols = []
    EPC_eff_cols = []
    for ii, val in enumerate(EPC_ratings):
        for val2 in EPC_ratings[ii:]:
            EPC_rating_cols.append("EPC rating "+val2+" to "+val)
            EPC_eff_cols.append("EPC efficiency "+val2+" to "+val)
    list_col.extend(EPC_rating_cols)
    list_col.extend(EPC_eff_cols)
    LSOA_LA_df = pd.DataFrame(index=index, columns=list_col)
    LSOA_LA_df.reset_index(3,drop=True, inplace=True)
    LSOA_LA_df = LSOA_LA_df.reorder_levels(["LAD11CD",  'MSOA11CD','LSOA11CD'])
    return LSOA_LA_df


def populatingSkeleton(df, skeleton, target):
    list_col = list(skeleton.columns)
    key = "Key"
    if target=="Heat consumption (kWh)":
        list_col = [x for x in list_col if "Mean" in x]
        prefix = "Mean"
    elif target=="NUMBER_HEATED_ROOMS":
        list_col = [x for x in list_col if "rooms" in x]
        prefix = "Average number of rooms of"
    elif target=="TOTAL_FLOOR_AREA":
        list_col = [x for x in list_col if "floor" in x]
        prefix = "Average floor area of"
    elif target=="Space constrained ratio":
        list_col = [x for x in list_col if "space constrained" in x]
        prefix = "Average space constrained"
    elif target=="Heat_rating_changes [%]":
        
        list_col = [x for x in list_col if "EPC rating" in x]
        prefix = "EPC rating"
        key= "Key_EPC"
    elif target=="Heat_efficiency_changes [%]":
        list_col = [x for x in list_col if "EPC efficiency" in x]
        prefix = "EPC efficiency"
        key= "Key_EPC"
        
    print(list_col, prefix)
    df[key] = [prefix+" "+x for x in df[key].values]
    print(df[key].unique())
    
    df = df.loc[df[key].isin(list_col), :].copy()
    # Add the value available at LSOA level
    LSOA_df_count = df.groupby(["LAD11CD", "MSOA11CD", "LSOA11CD", key]).count()[target]
    LSOA_df_count[LSOA_df_count<4]=np.nan #Remove the archetypes with too few properties
    LSOA_df = df.groupby(["LAD11CD", "MSOA11CD", "LSOA11CD", key]).sum()[target]
    LSOA_df = LSOA_df/LSOA_df_count
    LSOA_df = LSOA_df.to_frame()
    LSOA_df = LSOA_df.unstack(3)
    LSOA_df.columns = LSOA_df.columns.droplevel(0)
    
    populated_skeleton = pd.merge(skeleton, LSOA_df, left_index=True, right_index=True, how='left')
    for col in list_col:
        mergeColumn(populated_skeleton, col)
    populated_skeleton.reset_index(level=2, inplace=True)
    
    # Add the value available at MSOA level
    MSOA_df_count = df.groupby(["LAD11CD", "MSOA11CD", key]).count()[target]
    MSOA_df_count[MSOA_df_count<4]=np.nan #Remove the archetypes with too few properties
    MSOA_df = df.groupby(["LAD11CD", "MSOA11CD", key]).sum()[target]
    MSOA_df = MSOA_df/MSOA_df_count
    MSOA_df = MSOA_df.to_frame()
    MSOA_df = MSOA_df.unstack(2)
    MSOA_df.columns = MSOA_df.columns.droplevel(0)

    populated_skeleton = pd.merge(populated_skeleton, MSOA_df, left_index=True, right_index=True, how='left')
    for col in list_col:
        mergeColumn(populated_skeleton, col)
        
    populated_skeleton.reset_index(level=1, inplace=True)
    
    # Add the value available at LA level
    LA_df_count = df.groupby(["LAD11CD", key]).count()[target]
    LA_df_count[LA_df_count<4]=np.nan #Remove the archetypes with too few properties
    LA_df = df.groupby(["LAD11CD", key]).sum()[target]
    LA_df = LA_df/LA_df_count
    LA_df = LA_df.to_frame()
    LA_df = LA_df.unstack(1)
    LA_df.columns = LA_df.columns.droplevel(0)
    LA_df.fillna(LA_df.mean() ,inplace=True)
    
    populated_skeleton = pd.merge(populated_skeleton, LA_df, left_index=True, right_index=True, how='left')
    for col in list_col:
        mergeColumn(populated_skeleton, col)
        
    populated_skeleton.reset_index(inplace=True)
    populated_skeleton.set_index(["LAD11CD", "MSOA11CD", "LSOA11CD"], inplace=True, drop=True)
    
    #populated_skeleton["Mean flat oil boiler"].fillna(populated_skeleton["Mean flat solid fuel boiler"], inplace=True)
    
    return populated_skeleton

def epcToLSOA(path, LSOA_lookup):
    epc_df = getEPC(path, LSOA_lookup)
    org_EPC_number = len(epc_df)
    epc_df = setFlag(epc_df)
    epc_df = costTokWh(epc_df)
    epc_df = threshold(epc_df)
    epc_df = dwellingType(epc_df)
    epc_df = removeDuplicates(epc_df)
    final_EPC_number = len(epc_df)
    skeleton_df = skeleton(LSOA_lookup)
    df = populatingSkeleton(epc_df.copy(), skeleton_df, "Heat consumption (kWh)")
    df = populatingSkeleton(epc_df.copy(), df, "NUMBER_HEATED_ROOMS")
    df = populatingSkeleton(epc_df.copy(), df, "TOTAL_FLOOR_AREA")
    df = populatingSkeleton(epc_df.copy(), df, "Space constrained ratio")
    df = populatingSkeleton(epc_df.copy(), df, "Heat_rating_changes [%]")
    df = populatingSkeleton(epc_df.copy(), df, "Heat_efficiency_changes [%]")
    df.reset_index(inplace=True)
    df.set_index("LSOA11CD", inplace=True)
    df.drop(["MSOA11CD", "LAD11CD"], axis=1, inplace=True)

    return df, org_EPC_number, final_EPC_number

In [68]:
def add_annual_heat_demand(dataf, location_df, number_epc_df):
#     location_df = location_df.loc[:, 'LSOA11CD':].drop_duplicates().copy()
    
    list_LA = location_df["Local Authority nospace"].unique()
    # output_df.set_index("LSOA11CD", inplace=True)
    for fn in os.listdir(path_EPCs):
        LA_name = fn.split('-')[2:]
        LA_name = ''.join(LA_name)
        print(LA_name)
        if LA_name in list_LA:
            LSOA_lookup = location_df.loc[location_df["Local Authority nospace"]==LA_name, :]
            if LSOA_lookup.shape[0]>0:
                temp_df, org_number, final_number = epcToLSOA(path_EPCs+os.path.sep+fn+os.path.sep, LSOA_lookup)
                number_epc_df.loc[LA_name] = [org_number, final_number]
                dataf = pd.merge(dataf, temp_df, left_index=True, right_index=True, how='left')
                for col in temp_df.columns:
                    mergeColumn(dataf, col)
            else: print("ERROR WITH NAME !!!!!!!!!!!!!!!!!!!", LA_name)
        else:
            print("Out", LA_name, fn)
            
            
    return dataf

In [69]:
def fill_missing_data(dataf):
    #fill missing data with average values
    if potential: #average value in England and Wales after considering EE measures (obtained from a previous run)
        avg_heat_demand ={'Mean flat heat pump': 5839.768497366066,
                          'Mean detached gas boiler': 14589.644077982724,
                          'Mean detached resistance heating': 5550.939151840999,
                          'Mean detached oil boiler': 20442.887090034532,
                          'Mean detached solid fuel boiler': 13221.200322729735,
                          'Mean detached heat pump': 27478.618425106422,
                          'Mean semi-detached gas boiler': 11176.994056933589,
                          'Mean semi-detached resistance heating': 3812.4039727079357,
                          'Mean semi-detached oil boiler': 13178.966362117755, 
                          'Mean semi-detached solid fuel boiler': 8384.13646738717,
                          'Mean semi-detached heat pump': 13397.401257373627,
                          'Mean terraced gas boiler': 9808.588429358784,
                          'Mean terraced resistance heating': 3251.5967760104672,
                          'Mean terraced oil boiler': 11664.662458327663,
                          'Mean terraced solid fuel boiler': 7475.750706061542, 
                          'Mean terraced heat pump': 10588.61787728848, 
                          'Mean flat gas boiler': 6683.579422146057, 
                          'Mean flat resistance heating': 1988.1744399227346,
                          'Mean flat oil boiler': 8115.289346063978, 
                          'Mean flat solid fuel boiler': 5002.95209113117}

    else: #average value in England and Wales before considering EE measures (obtained from a previous run)
        avg_heat_demand = {'Mean flat heat pump': 8317.654679323365,
                           'Mean detached gas boiler': 22255.992657350296, 
                           'Mean detached resistance heating': 10228.077961518446, 
                           'Mean detached oil boiler': 33141.41672297774, 
                           'Mean detached solid fuel boiler': 20875.19512086178, 
                           'Mean detached heat pump': 37840.82307995661, 
                           'Mean semi-detached gas boiler': 16912.665791856532, 
                           'Mean semi-detached resistance heating': 7003.306852855303,
                           'Mean semi-detached oil boiler': 21490.58787143911, 
                           'Mean semi-detached solid fuel boiler': 12979.552672499918,
                           'Mean semi-detached heat pump': 18264.7813841389, 
                           'Mean terraced gas boiler': 14512.460427801987, 
                           'Mean terraced resistance heating': 5780.188713745561, 
                           'Mean terraced oil boiler': 18130.40810398572, 
                           'Mean terraced solid fuel boiler': 11399.774271931581, 
                           'Mean terraced heat pump': 14145.681348359638, 
                           'Mean flat gas boiler': 9647.65715063135, 
                           'Mean flat resistance heating': 3273.160923480452, 
                           'Mean flat oil boiler': 12937.408383942113, 
                           'Mean flat solid fuel boiler': 7186.23811026301}


    for key, value in avg_heat_demand.items():
        dataf[key].fillna(dataf[key].mean(), inplace=True)
    return dataf

In [70]:
def print_average_values(dataf):
    cols = ['Flat Heat pump', 'Detached Gas boiler', 'Detached Resistance heating', 'Detached Oil boiler', 'Detached Solid fuel boiler',
     'Detached Heat pump', 'Semi-detached Gas boiler', 'Semi-detached Resistance heating', 'Semi-detached Oil boiler',
     'Semi-detached Solid fuel boiler', 'Semi-detached Heat pump', 'Terraced Gas boiler', 'Terraced Resistance heating',
     'Terraced Oil boiler', 'Terraced Solid fuel boiler', 'Terraced Heat pump', 'Flat Gas boiler', 'Flat Resistance heating', 'Flat Oil boiler',
     'Flat Solid fuel boiler']

    for p in ["Mean", "Average number of rooms of", "Average floor area of", "Average space constrained"]:
        dict_avg_demand = {}
        temp_cols = [p+" "+x.lower() for x in cols]
        for col in temp_cols:
            dict_avg_demand[col]=dataf[col].mean()
        print(dict_avg_demand)
        
    return True

In [71]:
def remove_heat_pump_cols(dataf):
    # Remove dwellings with heat pumps
    heatpump_cols = [c for c in dataf.columns if "Heat pump" in c]
    dataf.drop(heatpump_cols, axis=1, inplace=True)
    return dataf

In [72]:
location_lookup_df

Unnamed: 0,PCD7,OA11CD,LSOA11CD,MSOA11CD,LAD11CD,Local Authority,Local Authority nospace
34000,AL19FY,E00120196,E01023667,E02004936,E07000100,St Albans,StAlbans
34001,AL19FZ,E00120196,E01023667,E02004936,E07000100,St Albans,StAlbans
34002,AL19GA,E00120196,E01023667,E02004936,E07000100,St Albans,StAlbans
34003,AL19GB,E00120196,E01023667,E02004936,E07000100,St Albans,StAlbans
34004,AL19GD,E00120196,E01023667,E02004936,E07000100,St Albans,StAlbans
...,...,...,...,...,...,...,...
2599363,RG134QQ,E00082524,E01016337,E02003383,E06000037,West Berkshire,WestBerkshire
2599364,RG134QR,E00082525,E01016337,E02003383,E06000037,West Berkshire,WestBerkshire
2599365,RG134QS,E00172671,E01016337,E02003383,E06000037,West Berkshire,WestBerkshire
2599366,RG134QT,E00172669,E01016337,E02003383,E06000037,West Berkshire,WestBerkshire


In [73]:
number_epcs = pd.DataFrame(columns = ["Original number", "Final number"])
lsoa_df = add_annual_heat_demand(lsoa_df,location_lookup_df, number_epcs)
lsoa_df = fill_missing_data(lsoa_df)
lsoa_df = remove_heat_pump_cols(lsoa_df)

Hartlepool
(30681, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'mains gas (not community)' 'NO DATA!'
 'electricity - this is for backwards compatibility only and should not be used'
 'electricity (not community)'
 'To be used only when there is no heating/hot-water system'
 'Electricity: electricity, unspecified tariff' 'mains gas (community)'
 'INVALID!' 'oil (not community)' 'dual fuel - mineral + wood'
 'LPG - this is for backwards compatibility only and should not be used'
 'oil - this is for backwards compatibility only and should not be used'
 'smokeless coal' 'LPG (not community)' 'B30D (community)'
 'house coal (not community)' 'bulk wood pellets'
 'house coal - this is for backwards compatibility only and should not be used'
 'biomass (community)' 'wood logs' 'bottled LPG']
(10620, 97)
(5, 97)
['Terraced', 'Semi-detached', 'Detached', 'Flat', None]
Shape: (14859, 100)
['Mean detached gas boiler', 'Mean detached resistanc

  


(37322, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'oil (not community)' 'mains gas (not community)'
 'electricity - this is for backwards compatibility only and should not be used'
 'oil - this is for backwards compatibility only and should not be used'
 'mains gas (community)' 'INVALID!' 'dual fuel - mineral + wood'
 'Electricity: electricity, unspecified tariff'
 'electricity (not community)' 'NO DATA!' 'oil (community)'
 'LPG - this is for backwards compatibility only and should not be used'
 'smokeless coal' 'biomass (community)'
 'To be used only when there is no heating/hot-water system'
 'house coal (not community)' 'anthracite' 'LPG (not community)'
 'bottled LPG' 'bulk wood pellets' 'LPG (community)' 'wood logs'
 'wood chips' 'B30K (not community)' 'LPG special condition'
 'house coal - this is for backwards compatibility only and should not be used'
 'B30D (community)' 'electricity (community)']
(11871, 97)
(91, 97)
['

  


(33754, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'mains gas (not community)' 'electricity (not community)'
 'electricity - this is for backwards compatibility only and should not be used'
 'To be used only when there is no heating/hot-water system' 'INVALID!'
 'Electricity: electricity, unspecified tariff' 'smokeless coal'
 'mains gas (community)' 'NO DATA!'
 'LPG - this is for backwards compatibility only and should not be used'
 'dual fuel - mineral + wood' 'electricity (community)'
 'LPG (not community)' 'bottled LPG' 'oil (not community)'
 'oil - this is for backwards compatibility only and should not be used'
 'wood logs' 'house coal (not community)'
 'house coal - this is for backwards compatibility only and should not be used'
 'anthracite' 'LPG special condition']
(10892, 97)
(79, 97)
['Terraced', 'Semi-detached', 'Flat', 'Detached']
Shape: (16362, 100)
['Mean detached gas boiler', 'Mean detached resistance heating', 'M

  


(32869, 92)
list of fuels:  ['electricity - this is for backwards compatibility only and should not be used'
 'mains gas - this is for backwards compatibility only and should not be used'
 'mains gas (not community)' 'electricity (not community)' 'INVALID!'
 'Electricity: electricity, unspecified tariff' 'smokeless coal'
 'LPG (not community)'
 'LPG - this is for backwards compatibility only and should not be used'
 'NO DATA!' 'mains gas (community)' 'bottled LPG'
 'To be used only when there is no heating/hot-water system'
 'house coal (not community)' 'oil (not community)'
 'dual fuel - mineral + wood'
 'oil - this is for backwards compatibility only and should not be used'
 'house coal - this is for backwards compatibility only and should not be used'
 'oil (community)' 'wood logs' 'B30D (community)']
(11691, 97)
(32, 97)
['Terraced', 'Flat', 'Semi-detached', 'Detached', None]
Shape: (15757, 100)
['Mean detached gas boiler', 'Mean detached resistance heating', 'Mean detached oil boi

  


(25326, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'INVALID!'
 'electricity - this is for backwards compatibility only and should not be used'
 'mains gas (not community)' 'electricity (not community)'
 'smokeless coal'
 'To be used only when there is no heating/hot-water system'
 'LPG (not community)' 'mains gas (community)'
 'dual fuel - mineral + wood' 'NO DATA!'
 'house coal - this is for backwards compatibility only and should not be used'
 'bottled LPG' 'oil (not community)' 'wood logs'
 'oil - this is for backwards compatibility only and should not be used'
 'house coal (not community)' 'anthracite'
 'Electricity: electricity, unspecified tariff' 'bulk wood pellets'
 'LPG (community)'
 'LPG - this is for backwards compatibility only and should not be used'
 'wood chips' 'electricity (community)']
(6982, 97)
(8, 97)
['Terraced', 'Semi-detached', 'Flat', 'Detached']
Shape: (14932, 100)
['Mean detached gas boiler', 'Mean deta

  


(143600, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'mains gas (not community)'
 'electricity - this is for backwards compatibility only and should not be used'
 'NO DATA!' 'electricity (not community)'
 'Electricity: electricity, unspecified tariff' 'smokeless coal'
 'oil - this is for backwards compatibility only and should not be used'
 'mains gas (community)'
 'To be used only when there is no heating/hot-water system' 'INVALID!'
 'biomass - this is for backwards compatibility only and should not be used'
 'waste combustion - this is for backwards compatibility only and should not be used'
 'wood logs' 'LPG (not community)' 'anthracite' 'oil (not community)'
 'waste combustion (community)'
 'LPG - this is for backwards compatibility only and should not be used'
 'dual fuel - mineral + wood' 'bulk wood pellets'
 'house coal (not community)' 'LPG (community)' 'wood chips' 'bottled LPG'
 'biomass (community)'
 'house coal - this

  


(50933, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'INVALID!' 'mains gas (not community)'
 'electricity - this is for backwards compatibility only and should not be used'
 'electricity (not community)'
 'Electricity: electricity, unspecified tariff' 'NO DATA!'
 'oil - this is for backwards compatibility only and should not be used'
 'To be used only when there is no heating/hot-water system'
 'LPG (not community)' 'oil (not community)' 'smokeless coal'
 'biomass (community)'
 'LPG - this is for backwards compatibility only and should not be used'
 'B30D (community)' 'mains gas (community)' 'house coal (not community)'
 'anthracite' 'dual fuel - mineral + wood' 'wood logs' 'bulk wood pellets'
 'bottled LPG' 'B30K (not community)'
 'house coal - this is for backwards compatibility only and should not be used'
 'oil (community)' 'electricity (community)']
(18086, 97)
(16, 97)
['Semi-detached', 'Terraced', 'Detached', 'Flat']
Shape: 

  


(81990, 92)
list of fuels:  ['mains gas - this is for backwards compatibility only and should not be used'
 'mains gas (community)' 'mains gas (not community)'
 'electricity - this is for backwards compatibility only and should not be used'
 'INVALID!' 'Electricity: electricity, unspecified tariff'
 'smokeless coal' 'electricity (not community)' 'NO DATA!'
 'LPG - this is for backwards compatibility only and should not be used'
 'oil - this is for backwards compatibility only and should not be used'
 'LPG (not community)'
 'To be used only when there is no heating/hot-water system'
 'oil (not community)' 'electricity (community)'
 'biomass - this is for backwards compatibility only and should not be used'
 'oil (community)' 'bottled LPG' 'wood logs' 'dual fuel - mineral + wood'
 'B30D (community)'
 'house coal - this is for backwards compatibility only and should not be used'
 'LPG (community)' 'bulk wood pellets'
 'waste combustion - this is for backwards compatibility only and should

In [74]:
lsoa_df

Unnamed: 0_level_0,N_Households 2011,Rurality,MSOA11CD,LAD11CD,Local Authority,Local Authority nospace,Detached No central heating 2011,Detached Gas boiler 2011,Detached Resistance heating 2011,Detached Oil boiler 2011,...,EPC efficiency D to D,EPC efficiency E to D,EPC efficiency F to D,EPC efficiency G to D,EPC efficiency E to E,EPC efficiency F to E,EPC efficiency G to E,EPC efficiency F to F,EPC efficiency G to F,EPC efficiency G to G
LSOA11CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E01000001,876,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,-0.021664,-0.020512,-0.015422,-0.012960,-0.020716,-0.016850,-0.014638,-0.021551,-0.017657,
E01000002,830,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,-0.021664,-0.021067,-0.015422,-0.012960,-0.020716,-0.016850,-0.014638,-0.021551,-0.017657,
E01000003,817,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,0.0,0.0,0.0,...,-0.020728,-0.020100,-0.015422,-0.012960,-0.020716,-0.016850,-0.014638,-0.021551,-0.017657,
E01000005,467,Rurality_Urban >10K,E02000001,E09000001,City of London,CityofLondon,0.0,1.0,0.0,0.0,...,-0.022384,-0.020857,-0.015422,-0.012960,-0.020716,-0.016850,-0.014638,-0.021551,-0.019133,
E01000006,543,Rurality_Urban >10K,E02000017,E09000002,Barking and Dagenham,BarkingandDagenham,2.0,26.0,1.0,0.0,...,-0.024497,-0.015810,-0.015147,-0.012231,-0.015369,-0.016717,-0.014027,-0.018572,-0.015192,-0.021505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W01001954,637,Rurality_Urban >10K,W02000372,W06000015,Cardiff,Cardiff,0.0,274.0,0.0,1.0,...,-0.022677,-0.017897,-0.013106,-0.011725,-0.018574,-0.016846,-0.013921,-0.020006,-0.016908,-0.028952
W01001955,1153,Rurality_Urban >10K,W02000192,W06000011,Swansea,Swansea,0.0,10.0,6.0,0.0,...,-0.023869,-0.019866,-0.014466,-0.012128,-0.018523,-0.017958,-0.013484,-0.017603,-0.018611,-0.026056
W01001956,803,Rurality_Urban >10K,W02000190,W06000011,Swansea,Swansea,1.0,259.0,3.0,3.0,...,-0.020943,-0.015862,-0.013142,-0.012128,-0.017631,-0.016354,-0.013484,-0.017603,-0.016356,-0.026056
W01001957,411,Rurality_Urban >10K,W02000192,W06000011,Swansea,Swansea,0.0,3.0,1.0,0.0,...,-0.025595,-0.015968,-0.014439,-0.012128,-0.018523,-0.017958,-0.013484,-0.017603,-0.018611,-0.026056


### Formatting of the name of the columns

In [75]:
def format_column_names(dataf, potential):
    current_categories = ['Detached Gas boiler' ,
                        'Detached Resistance heating' ,
                        'Detached Oil boiler' ,
                        'Detached Biomass boiler' ,
                        'Semi-detached Gas boiler' ,
                        'Semi-detached Resistance heating' ,
                        'Semi-detached Oil boiler' ,
                        'Semi-detached Biomass boiler' ,
                        'Terraced Gas boiler' ,
                        'Terraced Resistance heating' ,
                        'Terraced Oil boiler' ,
                        'Terraced Biomass boiler' ,
                        'Flat Gas boiler' ,
                        'Flat Resistance heating' ,
                        'Flat Oil boiler' ,
                        'Flat Biomass boiler' ]
    
    dataf.columns = [c.replace("Solid fuel", "biomass") for c in dataf.columns]
    dataf.columns = [c.replace("solid fuel", "biomass") for c in dataf.columns]
    rename_dict = {}
    for c in current_categories:
        if potential:
            rename_dict["Mean "+c.lower()] = "Average heat demand after energy efficiency measures for "+c.lower()+" (kWh)"
        else:
            rename_dict["Mean "+c.lower()] = "Average heat demand before energy efficiency measures for "+c.lower()+" (kWh)"

    number_2011 = [c for c in dataf.columns if "2011" in c]
    for c in number_2011:
        for existing_c in current_categories:
    #         print(existing_c)
            if existing_c.lower() in c.lower():
                rename_dict[c] = "Number of " + existing_c.lower() + " in 2011"

    number_2018 = [c for c in dataf.columns if "2018" in c]
    for c in number_2018:
        for existing_c in current_categories:
    #         print(existing_c)
            if existing_c.lower() in c.lower():
                rename_dict[c] = "Number of " + existing_c.lower()+ " in 2018"

    dataf.rename(columns=rename_dict, inplace=True)
    return dataf

In [76]:
final_df = format_column_names(lsoa_df, potential)

In [79]:
saving_path

'D:\\OneDrive - Cardiff University\\05 - Python\\12 - UKERC'

In [77]:
final_df.to_csv(saving_path+os.path.sep+output_file_name+".csv")
number_epcs.to_csv(saving_path+os.path.sep+"Number_EPCs_"+output_file_name+".csv")