# Missing Values

In [1]:
import DataRetriever as dr

retriever = dr.DataRetriever()

year_2 = retriever.get_data("All-Subsystems-minute-Year2.pkl")
year_2

### Overall missing values

In [2]:
count_missing_cells = year_2.isnull().sum().sum()
prop_missing_cells = count_missing_cells / year_2.size
print(f"Proportion of missing values in cells: \n{prop_missing_cells}")

count_missing_attr = year_2.isnull().sum()
prop_missing_attr = count_missing_attr[count_missing_attr>0] / year_2.size
print(f"Amount of missing cells per attribute: \n{prop_missing_attr}")

Proportion of missing values in cells: 
0.0043402666170544455
Amount of missing cells per attribute: 
SHW_GlycolFlowHXCoriolisSHW        0.001079
SHW_WaterFlowHXCoriolisSHW         0.001079
SHW_GlycolFlowRateHXCoriolisSHW    0.001079
SHW_WaterFlowRateHXCoriolisSHW     0.001079
HVAC_HeatPumpIndoorUnitPower       0.000004
HVAC_HeatPumpOutdoorUnitPower      0.000004
HVAC_DehumidifierPower             0.000004
HVAC_DehumidifierInletAirTemp      0.000004
HVAC_DehumidifierExitAirTemp       0.000004
HVAC_DehumidifierAirflow           0.000004
dtype: float64


### Missing values in subsystem

In [30]:
# Get the three unique characters identifying each subsystem and save them in a set
subsystems = set()
for column in year_2.columns:
    subsystem_id = column.split("_")[0]
    subsystems.add(subsystem_id)

# Get attributes pertaining to each subsystem and save as a dictionary
subsystem_attributes = dict()
for subsystem in subsystems:
    subsystem_columns = [column for column in year_2 if column.startswith(subsystem)]
    subsystem_attributes[subsystem] = subsystem_columns

# We saw from the overall analysis that only subsystems SHW and HVAC contained missing values. Now, we check these in detail
shw_data = year_2[subsystem_attributes["SHW"]]
hvac_data = year_2[subsystem_attributes["HVAC"]]

prop_missing_shw = shw_data.isnull().sum().sum() / shw_data.size
prop_missing_hvac = hvac_data.isnull().sum().sum() / hvac_data.size
print("Proportion per subsystem:")
print(f"SHW: {prop_missing_shw}")
print(f"HVAC: {prop_missing_hvac}\n")

prop_attr_shw = shw_data.isnull().sum() / shw_data.size
prop_attr_hvac = hvac_data.isnull().sum() / hvac_data.size
print("Proportion per attribute:")
print(prop_attr_shw, prop_attr_hvac)

Proportion per subsystem:
SHW: 0.23557280434440733
HVAC: 0.0008972217303946852

Proportion per attribute:
SHW_SHWHX80galWaterFlow            0.000000
SHW_GlycolTempSHWPanel80galOut     0.000000
SHW_PSPSHW                         0.000000
SHW_GlycolFlowHXCoriolisSHW        0.058893
SHW_WaterFlowHXCoriolisSHW         0.058893
SHW_GlycolFlowRateHXCoriolisSHW    0.058893
SHW_WaterFlowRateHXCoriolisSHW     0.058893
dtype: float64 HVAC_HVACTempReturnAir           0.00000
HVAC_HVACDewpointReturnAir       0.00000
HVAC_HVACTempSupplyAir           0.00000
HVAC_HVACDewpointSupplyAir       0.00000
HVAC_HeatPumpIndoorUnitPower     0.00015
HVAC_HeatPumpOutdoorUnitPower    0.00015
HVAC_DehumidifierPower           0.00015
HVAC_DehumidifierInletAirTemp    0.00015
HVAC_DehumidifierExitAirTemp     0.00015
HVAC_DehumidifierAirflow         0.00015
dtype: float64
