In [1]:
#!rm -r /home/jovyan/.local/share/jupyter/kernels/nilmtk
# %pip is used for every step (instead of a mix of !pip and !python3 -m pip) so that
# the uninstall and all installs target the environment of the running kernel.
%pip uninstall -y -q nilmtk nilm_metadata
# h5py is listed explicitly because the notebook imports it right after this cell
%pip install pandas numpy networkx scipy tables h5py scikit-learn hmmlearn pyyaml matplotlib xgboost pyts
# Trick to install NILM regardless of its dependencies
# NOTE(review): @master is a moving target - pin a commit or tag if reproducible installs are needed
%pip install --no-deps git+https://github.com/nilmtk/nilmtk@master
%pip install --no-deps git+https://github.com/nilmtk/nilm_metadata@master

Collecting git+https://github.com/nilmtk/nilmtk@master
  Cloning https://github.com/nilmtk/nilmtk (to revision master) to /tmp/pip-req-build-n33khpe3
  Running command git clone --filter=blob:none -q https://github.com/nilmtk/nilmtk /tmp/pip-req-build-n33khpe3
  Resolved https://github.com/nilmtk/nilmtk to commit dde510b7b2299958597a6b3885230844c1b7e9da
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: nilmtk
  Building wheel for nilmtk (setup.py) ... [?25ldone
[?25h  Created wheel for nilmtk: filename=nilmtk-0.4.0.dev1+git.dde510b-py3-none-any.whl size=279071 sha256=563beb09bc9828fc29456f6f913b50165f54a959d586726199abf4165057283d
  Stored in directory: /tmp/pip-ephem-wheel-cache-sk3qzvei/wheels/05/f2/54/7dc6179b2887926915ea9f94c7d02d7c097f9e22a6fdaad502
Successfully built nilmtk
Installing collected packages: nilmtk
Successfully installed nilmtk-0.4.0.dev1+git.dde510b
Collecting git+https://github.com/nilmtk/nilm_metadata@master
  Cloning h

In [2]:
import h5py
import numpy as np
import nilmtk
import os
import pickle
import pandas as pd

In [3]:
# ---- initial parameters: everything a user is expected to tune ----

# file name of the nilmtk dataset
originname = "SynD.h5"

# folder that contains the dataset file
originpath = ""

# folder the processed data is written to
destinationpath = ""

# length of one sample in minutes (assumes a sample rate of 1/6s)
n_mins = 15

# samples are cut with a sliding window; this is the window step
# expressed as a proportion of the sample length
step_ratio = 0.5

# how nan values are handled:
#   "stitch"      -> delete them
#   "zero fill"   -> replace them with 0
#   "interpolate" -> lerp
method = "zero fill"

# samples whose average power in watts is lower than this are removed
min_watts = 1

In [4]:
#for measuring execution time (process_time counts CPU time of this process, not wall-clock time)
from time import process_time
t = process_time()

#number of data points per minute at the 6s resampling rate used below (60s/6s=10)
samples_per_minute=10

#fail early on an unknown nan handling method, otherwise nan values would silently stay in the series
valid_methods=("stitch", "zero fill", "interpolate")
if method not in valid_methods:
    raise ValueError(f"unknown method {method!r}, expected one of {valid_methods}")

#this is the number of data points in a sample
sample_length=int(n_mins*samples_per_minute)

#step calculation based on resolution and step ratio, this is the number of data points
#the sliding window advances between two consecutive samples
step_mins=n_mins*step_ratio
step=int(step_mins*samples_per_minute)
if step<1:
    #a step of 0 would cut the same window over and over again
    raise ValueError(f"step_ratio={step_ratio} gives a window step of {step} data points, it must be at least 1")


def _iter_submeters(dataset):
    """Yield (label, meter) for every submeter of every building in the dataset.

    label is the "type" entry of the metadata of the meter's first appliance
    converted to str, for example "fridge".
    """
    for building in dataset.buildings:
        for meter in dataset.buildings[building].elec.submeters().meters:
            yield str(meter.appliances[0].metadata.get("type")), meter


def _power_values(df):
    """Return the power values of df as a new numpy array.

    Uses the active power column if it can be accessed, else apparent power.

    Raises:
        ValueError: if neither column can be accessed.
    """
    for kind in ("active", "apparent"):
        try:
            return np.array(getattr(df.power, kind).values)
        except (AttributeError, KeyError):
            #column not present, try the next kind
            continue
    raise ValueError("meter data contains neither active nor apparent power")


def _handle_nans(ts, method):
    """Return ts with nan values treated according to method.

    "stitch" deletes nan data points, "zero fill" replaces them with 0 (in place),
    "interpolate" interpolates them (linear by default).

    Raises:
        ValueError: if method is none of the three names above.
    """
    if method=="stitch":
        return ts[~np.isnan(ts)]
    if method=="zero fill":
        ts[np.isnan(ts)]=0
        return ts
    if method=="interpolate":
        return np.array(pd.Series(ts).interpolate())
    raise ValueError(f"unknown method {method!r}")


def _window_starts(length, sample_length, step):
    """Return the start indices of all windows of sample_length points that fit completely into a series of length points."""
    if length<sample_length:
        return range(0)
    #the last valid start is length-sample_length, so no window runs past the end of the series
    return range(0, length-sample_length+1, step)


#name of input file, will be used to name output files
name=os.path.splitext(originname)[0]

#load dataset
dataset=nilmtk.DataSet(os.path.join(originpath, originname))

#iterates through all meters and gets unique devices in the dataset
devices=[label for label, _ in _iter_submeters(dataset)]
devices=np.unique(np.array(devices)) #removes duplicates, np.unique returns them sorted

#dictionary of unique devices in the dataset (device name -> index)
devicedict={device: i for i, device in enumerate(devices)}

#save the device list to a pickle file next to the other output files
#NOTE(review): the sorted devices array is pickled, not devicedict, although the file name
#says "devicedict". kept as is because files written so far contain the array - confirm what readers expect
with open(os.path.join(destinationpath, name+"_devicedict.pkl"), "wb") as f:
    pickle.dump(devices, f)

#arrays where processed data will be temporarily stored.
#data and labels store samples and labels
#the other three store information about total data points, not nan data points and retained data points after sample creation respectively
data=[]
labels=[]
stattotal=[]
statnotnan=[]
statretained=[]

#iterate through all meters
for label, meter in _iter_submeters(dataset):
    #use electrical power data and store in a pandas dataframe
    #only the first item yielded by load() is used
    #NOTE(review): if load() yields several chunks for long recordings the rest is ignored - confirm
    df = next(meter.load(physical_quantity='power'))
    #resamples to 6s, as thats the most commonly shared sample rate
    df=df.resample("6s").ffill(limit=10)
    #gets time series of power values, uses active power if possible, else apparent power
    ts=_power_values(df)

    #collects info about the length of raw ts
    print(f"starting: {label}")
    length=len(ts)
    print(f"raw series length: {length}")
    stattotal.append(length)

    #transforms the raw time series based on the selected type
    ts=_handle_nans(ts, method)

    #collects info about the number of not nan data points
    length=len(ts)
    print(f"cleaned series length: {length}")
    statnotnan.append(length)

    #sample creation from time series, sliding window with calculated step
    #only full-length windows are used, so every sample has exactly sample_length data points
    starts=_window_starts(length, sample_length, step)
    n_samples=len(starts)

    #split time series into segments based on the selected length and step
    count=0
    for start in starts:
        sample=ts[start:start+sample_length]
        #only use samples where average power is above the threshold else discard them
        #(a sample that still contains nan has a nan mean and is discarded as well)
        if np.mean(sample)>=min_watts:
            data.append(sample)
            labels.append(label)
            count+=1

    #info about retained data
    statretained.append(count*sample_length)
    print(f"finished: {label}, created {count} samples\n")

#output collected info (how many data points)
print(f"retained {np.sum(statnotnan)} out of {np.sum(stattotal)} data points")
print(f"retained {np.sum(statretained)} data points after sample creation ({np.sum(statretained)//sample_length} samples)\n")

#save data in shape (number of samples, minutes*10) and labels in shape (number of samples)
#reshape keeps the documented 2d shape even when no sample was retained
data=np.array(data).reshape(-1, sample_length)
labels=np.array(labels, dtype=str)
np.save(os.path.join(destinationpath, name+"_data_"+str(n_mins)+".npy"), data, allow_pickle=False)
np.save(os.path.join(destinationpath, name+"_labels_"+str(n_mins)+".npy"), labels, allow_pickle=False)

#print execution time
elapsed_time=process_time()-t
print(f"done in {elapsed_time} seconds\n")


OSError: No such file as SynD.h5