In [None]:
from DataRetriever import DataRetriever
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

In [12]:
# Load the hour- and minute-resolution "All-Subsystems" files for Year1 and Year2.
# NOTE(review): get_data presumably returns a pandas DataFrame (later cells use
# .shape / .columns / .describe on the results) - confirm against DataRetriever.
open_data = DataRetriever()

# Hour-resolution files
year1hour, year2hour = (
    open_data.get_data("All-Subsystems-hour-Year1.pkl"),
    open_data.get_data("All-Subsystems-hour-Year2.pkl"),
)

# Minute-resolution files
year1min, year2min = (
    open_data.get_data("All-Subsystems-minute-Year1.pkl"),
    open_data.get_data("All-Subsystems-minute-Year2.pkl"),
)

In [13]:
# Report the (rows, columns) shape of each loaded frame, year by year.
for label, frame in (
    ("year 1 hour:", year1hour),
    ("year 1 min:", year1min),
    ("year 2 hour:", year2hour),
    ("year 2 min:", year2min),
):
    print(label, frame.shape)

year 1 hour: (8761, 356)
year 1 min: (518792, 358)
year 2 hour: (8760, 380)
year 2 min: (519604, 382)


# Attribute Investigation

In [14]:
# Columns present in the Year2 minute data but absent from the Year2 hour data
# (original column order is kept).
minute_only = (name for name in year2min.columns if name not in year2hour.columns)
for name in minute_only:
    print(name)

# Columns present in the Year2 hour data but absent from the Year1 hour data,
# i.e. the attributes that were added in the second year.
print('\n The new columns are:')
added_in_year2 = (name for name in year2hour.columns if name not in year1hour.columns)
for name in added_in_year2:
    print(name)

TimeStamp_Count
DayOfWeek

 The new columns are:
PV_PVSystem1ACEnergyOSEACPV1OS
PV_PVSystem2ACEnergyOSEACPV2OS
PV_PVSystem1ACPowerOSPACPV1OS
PV_PVSystem2ACPowerOSPACPV2OS
Vent_HRVDewpointTemperatureSupply
Vent_HRVDewpointTemperatureReturn
Vent_HRVDewpointTemperatureExhaust
Vent_HRVDewpointTemperatureOutdoor
Vent_HRVreturnflowrate
Vent_HRVexhaustflowrate
Vent_HRVoutdoorflowrate
DHW_WaterTempMBASinkRHot
DHW_WaterTempMBASinkRCold
DHW_WaterTempBASinkHot
DHW_WaterTempBASinkCold
DHW_WaterTempBAShwHot
DHW_WaterTempBAShwCold
DHW_WaterTempBASinkMixed
DHW_WaterTempBAShowerMixed
DHW_WaterTempMBASinkRMixed
SHW_GlycolFlowHXCoriolisSHW
SHW_WaterFlowHXCoriolisSHW
SHW_GlycolFlowRateHXCoriolisSHW
SHW_WaterFlowRateHXCoriolisSHW


In [15]:
# Descriptive statistics (DataFrame.describe) for the hourly data of each year.
y1hsummary, y2hsummary = (frame.describe() for frame in (year1hour, year2hour))

In [20]:
# Investigating the missing values in the Year2 minute data, both per attribute
# and per subsystem.

# NaN count for every column; only columns with at least one NaN are kept.
all_nan_counts = {name: year2min[name].isnull().sum() for name in year2min.columns}
countnanvalues_dict = {name: count for name, count in all_nan_counts.items() if count > 0}
for name, count in countnanvalues_dict.items():
    print(name, "has", count, "number of NaN values" )

# One row per attribute, largest NaN count first.
countnanvalues_df = (
    pd.DataFrame
    .from_dict(countnanvalues_dict, orient='index', columns=['NaN'])
    .sort_values(by=['NaN'], ascending=False)
)
# Optional: to leave the four SHW Coriolis flow columns out of the charts, drop
# 'SHW_GlycolFlowHXCoriolisSHW', 'SHW_WaterFlowHXCoriolisSHW',
# 'SHW_GlycolFlowRateHXCoriolisSHW' and 'SHW_WaterFlowRateHXCoriolisSHW'
# from countnanvalues_df at this point.

# The text before the first underscore of a column name is used as its subsystem
# label; NaN counts are summed per label and sorted largest first.
subsystem_labels = [attribute.split('_')[0] for attribute in countnanvalues_df.index.values]
subsystemnanvalues_df = (
    countnanvalues_df
    .groupby(subsystem_labels)
    .sum()
    .sort_values(by=['NaN'], ascending=False)
)

# Side-by-side bar charts: per attribute (left) and per subsystem (right).
fig2 = make_subplots(rows=1, cols=2)
panels = (
    ('Missing values per attribute', countnanvalues_df, 1),
    ('Missing values per subsystem', subsystemnanvalues_df, 2),
)
for trace_name, counts, position in panels:
    fig2.add_trace(go.Bar(name=trace_name, x=counts.index, y=counts['NaN']), row=1, col=position)
fig2.update_yaxes(title='Number of Missing Values')
fig2.show()

SHW_GlycolFlowHXCoriolisSHW has 214208 number of NaN values
SHW_WaterFlowHXCoriolisSHW has 214208 number of NaN values
SHW_GlycolFlowRateHXCoriolisSHW has 214208 number of NaN values
SHW_WaterFlowRateHXCoriolisSHW has 214208 number of NaN values
HVAC_HeatPumpIndoorUnitPower has 777 number of NaN values
HVAC_HeatPumpOutdoorUnitPower has 777 number of NaN values
HVAC_DehumidifierPower has 777 number of NaN values
HVAC_DehumidifierInletAirTemp has 777 number of NaN values
HVAC_DehumidifierExitAirTemp has 777 number of NaN values
HVAC_DehumidifierAirflow has 777 number of NaN values


# Understanding the residents' schedule

In [16]:
# Grouped bar chart of the two upstairs child power columns over the Year2 hourly timestamps.
child_series = (
    ('ChildA Upstairs', 'Elec_PowerChildAUpstairs'),
    ('ChildB Upstairs', 'Elec_PowerChildBUpstairs'),
)
child_traces = [
    go.Bar(name=label, x=year2hour['Timestamp'], y=year2hour[power_column])
    for label, power_column in child_series
]

fig = go.Figure(data=child_traces)
# 'group' places the two series next to each other at each timestamp.
fig.update_layout(barmode='group', height=800)
fig.update_yaxes(title='Watts')
fig.show()