In [1]:
import pandas as pd
# Load the raw certificate records, reading the whole file before inferring
# column dtypes so each column gets one consistent type.
# NOTE(review): the stray echo of this statement in the cell output looks like
# a pandas warning about it (presumably DtypeWarning for mixed-type columns) --
# confirm, and prefer an explicit dtype= mapping for the affected columns.
certificates_df = pd.read_csv('Bolton.csv', low_memory=False)

  certificates_df = pd.read_csv('Bolton.csv')


# Annual records of buildings, aggregated at the regional level

In [3]:
# Inclusive range of inspection years kept for the annual aggregation.
FIRST_YEAR, LAST_YEAR = 2013, 2023

# A record is discarded when any of these fields is missing.
REQUIRED_COLUMNS = [
    'ENERGY_CONSUMPTION_CURRENT',
    'TOTAL_FLOOR_AREA',
    'MAIN_FUEL',
    'PROPERTY_TYPE',
    'BUILT_FORM',
    'BUILDING_REFERENCE_NUMBER',
]

# The whole clean-up is one chain built on .assign(), so no column is ever
# written onto a filtered frame (which raises SettingWithCopyWarning on pandas
# versions without copy-on-write) and the year is derived exactly once.
certificates_df = (
    certificates_df
    # Unparseable dates become NaT and are removed by the following dropna.
    .assign(INSPECTION_DATE=lambda df: pd.to_datetime(df['INSPECTION_DATE'], errors='coerce'))
    .dropna(subset=['INSPECTION_DATE'])
    .assign(year=lambda df: df['INSPECTION_DATE'].dt.year)
    # Series.between is inclusive on both ends by default.
    .loc[lambda df: df['year'].between(FIRST_YEAR, LAST_YEAR)]
    .dropna(subset=REQUIRED_COLUMNS)
    # Keep the latest inspection of each building *within each year*: after
    # sorting by date, tail(1) picks the last row of every (building, year).
    .sort_values(by=['BUILDING_REFERENCE_NUMBER', 'INSPECTION_DATE'])
    .groupby(['BUILDING_REFERENCE_NUMBER', 'year'])
    .tail(1)
    .reset_index(drop=True)
)

In [4]:
# Per-year roll-up: output column -> (source column, aggregation).
yearly_aggregations = {
    'building_count': ('BUILDING_REFERENCE_NUMBER', 'nunique'),
    'total_floor_area': ('TOTAL_FLOOR_AREA', 'sum'),
    # Plain (unweighted) mean over the rows of each year.
    'avg_unit_energy': ('ENERGY_CONSUMPTION_CURRENT', 'mean'),
}

summary = (
    certificates_df
    .groupby('year')
    .agg(**yearly_aggregations)
    .reset_index()
)

summary

Unnamed: 0,year,building_count,total_floor_area,avg_unit_energy
0,2013,10480,897752.0,254.191412
1,2014,11490,1009393.0,257.292602
2,2015,8200,722008.0,306.131829
3,2016,7428,672398.0,296.517905
4,2017,4372,369280.0,275.368481
5,2018,4860,399597.0,269.733951
6,2019,5638,441303.0,278.682157
7,2020,5563,445605.0,269.210318
8,2021,7379,572823.0,261.780323
9,2022,6990,555975.0,244.53133


# sparse field strategy

In [6]:
import numpy as np
# Cut-offs used when tagging a field: a field is flagged when its missing or
# zero share is above the ratio thresholds, or when it has at most
# `low_unique_threshold` distinct values.
missing_threshold = 0.9
zero_threshold = 0.9
low_unique_threshold = 1

# Share of null entries in every column.
missing_ratio = certificates_df.isna().mean()

# Share of exact zeros, computed for numeric columns only.
numeric_fields = certificates_df.select_dtypes(include=[np.number])
zero_ratio = numeric_fields.eq(0).mean()

# Number of distinct values per column, counting NaN as a value.
unique_counts = certificates_df.nunique(dropna=False)

# One row per source column; non-numeric columns get NaN for zero_ratio
# because of the reindex over all columns.
field_analysis = pd.DataFrame(
    {
        'missing_ratio': missing_ratio,
        'zero_ratio': zero_ratio.reindex(certificates_df.columns),
        'unique_count': unique_counts,
    }
)

# 5.tags
def classify_field(row):
    """Return the retention tag for one field's statistics.

    Args:
        row: Mapping-like object with the keys ``missing_ratio``,
            ``zero_ratio`` and ``unique_count``.

    Returns:
        ``'drop_lstm'`` when any of the three checks fires, otherwise
        ``'keep_core'``. The checks read the module-level thresholds
        ``missing_threshold``, ``zero_threshold`` and ``low_unique_threshold``.
    """
    if row['missing_ratio'] > missing_threshold:
        return 'drop_lstm'
    # A NaN zero_ratio compares False here, so it never triggers a drop.
    if row['zero_ratio'] > zero_threshold:
        return 'drop_lstm'
    if row['unique_count'] <= low_unique_threshold:
        return 'drop_lstm'
    return 'keep_core'

# Tag every field (one row of field_analysis each) as keep or drop.
keep_tags = field_analysis.apply(classify_field, axis=1)
field_analysis['keep_tag'] = keep_tags


# results
def display_dataframe_to_user(name, dataframe):
    """Print ``name`` as an underlined heading, then render ``dataframe``.

    Args:
        name: Heading text; it is underlined with one ``=`` per character.
        dataframe: Object handed to ``display`` for rendering.

    Note:
        ``display`` is not imported in this block; it is expected to be the
        helper that IPython/Jupyter makes available inside notebook cells.
    """
    # Build the underline outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    underline = '=' * len(name)
    print(f'\n{name}\n{underline}')
    display(dataframe)
    
# Show the per-field sparsity table under a descriptive heading.
display_dataframe_to_user(
    name='Field sparsity and retention suggestions',
    dataframe=field_analysis,
)


Field sparsity and retention suggestions


Unnamed: 0,missing_ratio,zero_ratio,unique_count,keep_tag
LMK_KEY,0.000000,,78251,keep_core
ADDRESS1,0.000000,,69088,keep_core
ADDRESS2,0.502434,,1532,keep_core
ADDRESS3,0.934301,,202,drop_lstm
POSTCODE,0.000000,,6135,keep_core
...,...,...,...,...
LOW_ENERGY_FIXED_LIGHT_COUNT,0.654126,0.056268,63,keep_core
UPRN,0.007591,0.000000,67720,keep_core
UPRN_SOURCE,0.007591,,3,keep_core
REPORT_TYPE,0.000000,0.000000,2,keep_core


In [8]:
# Collect the names of all fields tagged for removal.
is_tagged_for_drop = field_analysis['keep_tag'].eq('drop_lstm')
drop_lstm_fields = field_analysis.loc[is_tagged_for_drop].index.tolist()

# Remove the tagged fields, then make sure LOCAL_AUTHORITY is present in the
# result even if it was among the dropped columns.
certificates_lstm_df = (
    certificates_df
    .drop(columns=drop_lstm_fields)
    .assign(LOCAL_AUTHORITY=certificates_df['LOCAL_AUTHORITY'])
)

In [11]:
# Write the reduced certificate table to disk without the positional index.
cleaned_output_path = 'cleaned_epc_certificates.csv'
certificates_lstm_df.to_csv(cleaned_output_path, index=False)

In [13]:
# Export the keep/drop tag of every field as a two-column table
# ('field', 'keep_tag').
field_tags = (
    field_analysis[['keep_tag']]
    .rename_axis('field')   # name the index so it becomes the 'field' column
    .reset_index()
)

field_tags.to_csv('epc_field_tags.csv', index=False)