In [32]:
import pandas as pd
import numpy as np

import DataRetriever as dr

# Project-local helper used to fetch the notebook's datasets by file name.
# NOTE(review): DataRetriever's search path and the type returned by get_data
# are not visible here — presumably a pandas DataFrame; confirm.
retriever = dr.DataRetriever()

df = retriever.get_data("All-Subsystems-minute-Year2.pkl")

# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [33]:
# Parse the Timestamp column into pandas datetimes so it supports datetime arithmetic.
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

In [34]:
# Timestamp of the previous row, aligned on df's own index. The first row has
# no predecessor, so it is paired with its own timestamp (delta 0).
timestamp_plus_one = df["Timestamp"].shift(1, fill_value=df["Timestamp"].iloc[0])

# Time elapsed since the previous row, as float seconds.
# .dt.total_seconds() is used instead of .astype('timedelta64[s]'): since
# pandas 2.0 that cast returns a timedelta column rather than numbers, which
# breaks the numeric comparison against this column.
df["Timestamp_Delta"] = (df["Timestamp"] - timestamp_plus_one).dt.total_seconds()

In [35]:
# Index labels of the rows that come more than 300 seconds after their
# predecessor; these rows mark where the recording is split into segments.
df_index_split = df.loc[df["Timestamp_Delta"].gt(300)].index

In [36]:
# Truncate every timestamp to whole seconds (drop any sub-second part).
# One vectorized floor replaces the per-row Python loop, which rebuilt each
# pd.Timestamp individually and is very slow on hundreds of thousands of rows.
df["Timestamp"] = pd.to_datetime(df["Timestamp"]).dt.floor("s")

# Interpolate measurements

In [37]:
# Show the rows where the heat pump indoor unit power reading is missing.
# isnull() already returns a boolean mask, so no comparison with True is needed.
df[df["HVAC_HeatPumpIndoorUnitPower"].isnull()]

Unnamed: 0,Timestamp,TimeStamp_Count,Load_LatentHeatWaterVolume,Load_RefrigeratorTemp,Load_StatusBA1Lights,Load_StatusKitchenLightsA,Load_StatusKitchenLightsB,Load_StatusKitchenLightsC,Load_StatusDRLights,Load_StatusLRLights3,...,SHW_WaterFlowHXCoriolisSHW,SHW_GlycolFlowRateHXCoriolisSHW,SHW_WaterFlowRateHXCoriolisSHW,HVAC_HeatPumpIndoorUnitPower,HVAC_HeatPumpOutdoorUnitPower,HVAC_DehumidifierPower,HVAC_DehumidifierInletAirTemp,HVAC_DehumidifierExitAirTemp,HVAC_DehumidifierAirflow,Timestamp_Delta
27384,2015-02-20 01:03:37,27385,0.055482,4.712000,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,60.0
27385,2015-02-20 01:04:37,27386,0.058124,4.776172,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,60.0
27386,2015-02-20 01:05:37,27387,0.058124,4.860834,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,60.0
27387,2015-02-20 01:06:37,27388,0.058124,4.862597,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,60.0
27388,2015-02-20 01:07:37,27389,0.058124,4.887749,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516793,2016-01-30 01:06:29,516794,0.070674,5.211040,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.018551,0.001090,,,,,,,60.0
516794,2016-01-30 01:07:29,516795,0.071334,5.313943,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.017023,-0.000391,,,,,,,60.0
516795,2016-01-30 01:08:29,516796,0.074636,5.391418,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.017164,-0.000776,,,,,,,60.0
516796,2016-01-30 01:09:29,516797,0.074636,5.436720,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.015977,-0.001013,,,,,,,60.0


In [38]:
# Number of missing values in each column.
df.isna().sum()

Timestamp                          0
TimeStamp_Count                    0
Load_LatentHeatWaterVolume         0
Load_RefrigeratorTemp              0
Load_StatusBA1Lights               0
                                ... 
HVAC_DehumidifierPower           777
HVAC_DehumidifierInletAirTemp    777
HVAC_DehumidifierExitAirTemp     777
HVAC_DehumidifierAirflow         777
Timestamp_Delta                    0
Length: 383, dtype: int64

# Function to create interpolated records based on a DataFrame

In [61]:
def interpolate_df(dataframe):
    """Put a frame of timestamped measurements onto a regular one-minute grid.

    The rows are indexed by their ``Timestamp`` value, an empty row is added
    for every whole minute spanned by the data, and those rows are filled by
    time-weighted linear interpolation between the surrounding measurements.
    Only the rows on the one-minute grid are returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a datetime ``Timestamp`` column. The bookkeeping columns
        ``TimeStamp_Count``, ``DayOfWeek`` and ``Timestamp_Delta`` are removed
        when present. The remaining columns are assumed to be numeric so that
        they can be interpolated. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The one-minute grid rows, indexed by timestamp, without the
        bookkeeping columns. Grid rows that still contain a missing value
        after interpolation (for example minutes before the first
        measurement) are dropped.
    """
    # Work on new objects only: set_index/drop return fresh frames, so the
    # caller's data (typically a slice of a larger frame) is left untouched.
    measured = dataframe.set_index("Timestamp")
    measured = measured.drop(
        columns=["TimeStamp_Count", "DayOfWeek", "Timestamp_Delta"],
        errors="ignore",  # a bookkeeping column that is absent needs no removal
    )
    # "DF" marks where a row comes from: 1 = measured, 0 = added grid row.
    measured["DF"] = 1

    # Rows spaced exactly one minute apart. They are left unfilled here; the
    # time-weighted pass below fills them from all measurements, whereas an
    # index-agnostic linear pre-fill could disagree with it on irregular data.
    grid = measured.resample("1min").asfreq()
    grid["DF"] = 0

    combined = pd.concat([measured, grid], axis=0).sort_index()

    # Fill the grid rows by weighting neighbours with their distance in time.
    combined = combined.interpolate(method="time")

    # Rows that could not be completed (e.g. before the first measurement).
    combined = combined.dropna()

    # Keep only the grid rows and remove the marker column again.
    on_grid = combined[combined["DF"] == 0]
    return on_grid.drop(columns="DF")

# Handle boolean attributes

In [62]:
# Load the attribute metadata table.
# NOTE(review): presumably one row per measured attribute, including a "Units"
# column — confirm against the file.
metadata = retriever.get_data("metadata-year2.pkl")
# Give the "Unnamed: 0" column a descriptive name.
metadata.rename(columns={"Unnamed: 0": "Attribute"}, inplace=True)

In [63]:
# Names of all attributes whose unit is listed as "Binary Status" in the metadata.
boolean_attributes = metadata.loc[metadata["Units"] == "Binary Status", "Attribute"].tolist()

In [64]:
# Keep only the boolean attributes that are present as columns in df
# (some attributes listed in the metadata do not exist in df).
list_boolean_attributes = list(set(boolean_attributes).intersection(set(df.columns)))

In [65]:
def resolve_boolean(dataframe, columns=list_boolean_attributes):
    """Round the given columns of ``dataframe`` to whole numbers.

    The rounded values are written back into ``dataframe`` itself, and the
    same object is returned so the call can be nested inside other calls.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : list of str
        Columns to round. Defaults to the boolean attribute columns found in
        the data set (bound once, when the function is defined).

    Returns
    -------
    pandas.DataFrame
        ``dataframe``, with the selected columns rounded.
    """
    rounded = dataframe[columns].round(decimals=0)
    dataframe[columns] = rounded
    return dataframe

# Sub-DataFrames interpolated

In [66]:
# Cut df at the first three gap rows and process each of the four segments on
# its own, so that nothing is interpolated across those gaps. Every segment is
# first put on the one-minute grid and then has its boolean columns rounded.
df_1_interpolated = resolve_boolean(
    interpolate_df(df[:df_index_split[0]])
)
df_2_interpolated = resolve_boolean(
    interpolate_df(df[df_index_split[0]:df_index_split[1]])
)
df_3_interpolated = resolve_boolean(
    interpolate_df(df[df_index_split[1]:df_index_split[2]])
)
df_4_interpolated = resolve_boolean(
    interpolate_df(df[df_index_split[2]:])
)

In [69]:
# Inspect the first interpolated segment.
df_1_interpolated

Unnamed: 0,Timestamp,Load_LatentHeatWaterVolume,Load_RefrigeratorTemp,Load_StatusBA1Lights,Load_StatusKitchenLightsA,Load_StatusKitchenLightsB,Load_StatusKitchenLightsC,Load_StatusDRLights,Load_StatusLRLights3,Load_StatusEntryHallLights,...,SHW_WaterFlowHXCoriolisSHW,SHW_GlycolFlowRateHXCoriolisSHW,SHW_WaterFlowRateHXCoriolisSHW,HVAC_HeatPumpIndoorUnitPower,HVAC_HeatPumpOutdoorUnitPower,HVAC_DehumidifierPower,HVAC_DehumidifierInletAirTemp,HVAC_DehumidifierExitAirTemp,HVAC_DehumidifierAirflow,Timestamp_Delta
0,2015-06-30 00:02:00,0.126377,5.115046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35508.000000,6.073449e-07,-0.000005,98.904745,670.825000,4.621383,71.666469,87.538439,9.361570,0.0
1,2015-06-30 00:03:00,0.126363,5.115063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35508.000000,5.867736e-06,0.000007,98.907478,670.845606,4.621388,71.666595,87.539179,9.361875,60.0
2,2015-06-30 00:04:00,0.126348,5.115081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35508.000000,-3.938519e-06,0.000004,98.910212,670.866211,4.621393,71.666720,87.539918,9.362180,60.0
3,2015-06-30 00:05:00,0.126334,5.115098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35508.000000,-5.850107e-06,-0.000005,98.912945,670.886816,4.621398,71.666846,87.540658,9.362486,60.0
4,2015-06-30 00:06:00,0.126319,5.115116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35508.000000,-8.385785e-06,-0.000007,98.915679,670.907422,4.621403,71.666971,87.541397,9.362791,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161272,2015-10-19 23:54:00,0.799205,5.112026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,677.418936,1.896195e-03,0.001210,9.940000,22.300000,4.539000,68.618000,69.998000,0.000000,60.0
161273,2015-10-19 23:55:00,0.799205,5.112026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,677.418936,1.896195e-03,0.001210,9.940000,22.300000,4.539000,68.618000,69.998000,0.000000,60.0
161274,2015-10-19 23:56:00,0.799205,5.112026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,677.418936,1.896195e-03,0.001210,9.940000,22.300000,4.539000,68.618000,69.998000,0.000000,60.0
161275,2015-10-19 23:57:00,0.799205,5.112026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,677.418936,1.896195e-03,0.001210,9.940000,22.300000,4.539000,68.618000,69.998000,0.000000,60.0


# Interpolating larger gaps

In [None]:
# NOTE(review): statsmodels deprecated the `statsmodels.tsa.arima_model` module in 0.12
# and retired its ARIMA class in 0.13 in favour of `statsmodels.tsa.arima.model.ARIMA` —
# confirm this import against the installed statsmodels version before building on it.
from statsmodels.tsa.arima_model import ARIMA
#TODO: https://stackoverflow.com/questions/31690134/python-statsmodels-help-using-arima-model-for-time-series

In [None]:
import numpy as np
from scipy.optimize import Bounds, minimize

def func(x):
    """Return the negated linear objective ``2*x[0] + 5*x[1]``.

    The sign is flipped so that a minimiser applied to this function
    maximises ``2*x[0] + 5*x[1]``.
    """
    objective = 2 * x[0] + 5 * x[1]
    return -objective

# Variable bounds: x[0] >= 0 and x[1] >= 0, with no upper limit.
bounds = Bounds([0, 0], [np.inf, np.inf])

from scipy.optimize import LinearConstraint
# Linear constraints (no lower limit): x[0] + 2*x[1] <= 16 and 5*x[0] + 3*x[1] <= 45.
linear_constraint = LinearConstraint([[1, 2], [5, 3]], [-np.inf, -np.inf], [16, 45])

# Minimising func maximises 2*x[0] + 5*x[1]; the search starts from the origin.
res = minimize(func, np.array([0, 0]), constraints=linear_constraint, bounds=bounds)

In [None]:
# Solution vector found by the optimiser, rounded to 5 decimals.
res.x.round(5)