In [1]:
import lightgbm as lgb
import sys
from os import listdir
import os.path
from os.path import isdir, isfile, join
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
def gatherCSVFilePathsfromDirectory(dir):
    print(dir)
    filepaths = []
    for f in listdir(dir):
        path = join(dir, f)
        if isfile(path) and f.endswith(".csv"):
            filepaths.append(path)

    if len(filepaths) == 0:
        exit("Directory does not contain any csv files: %s" % dir)

    filepaths = np.sort(filepaths)
    print("Found %s files." % len(filepaths))

    return filepaths


In [3]:
#############################################
#MARK: - Read Files

SOURCE_DIR = "./data/"


SOURCE_DIR = os.path.abspath(SOURCE_DIR)
if not isdir(SOURCE_DIR):
    exit("That's not a directory!")
else:
    print(SOURCE_DIR)

elems = listdir(SOURCE_DIR)
print(elems)
directories = {}
for e in elems:
    path = join(SOURCE_DIR, e)
    
    if isdir(path):
        filepaths = gatherCSVFilePathsfromDirectory(path)

        directories[e] = []
        for filepath in filepaths:
            dataframe = pd.read_csv(filepath, header=1)
            dataframe.columns = dataframe.columns.str.strip().str.replace('(', '').str.replace(')', '') 
            
            # convert ms timestamp to ids
            dataframe = dataframe.assign(id= lambda x: dataframe.index )
            dataframe = dataframe.drop(columns=['ms'])
            
            if 'mA' in dataframe:
                dataframe = dataframe.assign(A= lambda x: x.mA / 1000.0).drop(columns=['mA'])
            directories[e].append(dataframe)
            

/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data
['ohp_voll-single.psdata', 'laptop-single', '.DS_Store', 'ohp_voll-laptop', 'lampe-ohp_voll.psdata', 'ohp_halb-single', 'ohp_halb-single.psdata', 'ohp_voll-laptop-bosch-lampe.psdata', 'lampe-ohp_voll', 'ohp_voll-single', 'ohp_halb-laptop-lampe.psdata', 'ohp_halb-laptop.psdata', 'bosch-single', 'lampe-single', 'ohp_voll-laptop-bosch-lampe', 'ohp_halb-laptop', 'ohp_halb-laptop-lampe']
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/laptop-single
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/ohp_voll-laptop
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/ohp_halb-single
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/lampe-ohp_voll
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/ohp_voll-single
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataScienceProjekt/data/bosch-single
Found 256 files.
/Users/Felix/Documents/FH/NILM/DataSc

In [4]:
for (id, dfs) in directories.items():
    print(id, len(dfs))
    print(dfs[0].head(4))
    print(dfs[0].describe())
    print('\n\n')

laptop-single 256
          V         A  id
0 -8.224738  0.018067   0
1 -8.224738  0.036012   1
2 -3.753777  0.018067   2
3  0.732444  0.018067   3
                 V            A           id
count  2445.000000  2445.000000  2445.000000
mean     38.888648     0.021008  1222.000000
std     222.962687     0.338291   705.955027
min    -321.848200    -1.293191     0.000000
25%    -178.472200    -0.017823   611.000000
50%      81.362350     0.000122  1222.000000
75%     251.609800     0.018067  1833.000000
max     314.340600     1.419172  2444.000000



ohp_voll-laptop 256
           V         A  id
0 -12.710960 -1.078524   0
1  -8.224738 -1.078524   1
2  -3.753777 -1.078524   2
3  -8.224738 -1.078524   3
                 V            A           id
count  2445.000000  2445.000000  2445.000000
mean     38.879487     0.443960  1222.000000
std     221.681810     2.560982   705.955027
min    -317.362000    -5.035249     0.000000
25%    -174.001300    -1.887875   611.000000
50%      81.362350 

#### TODO

- Zeit diskretisieren 0ms = 0 und 20ms = 2445 (neue spalte -> id des datensatzes) done
- mA zu A umrechnen (falls vorhanden) done
- geht multilabel mit gradient boosting? jaein, vielleicht
- in X und Y aufteilen 
- attacke!


In [5]:
features = []

for label, data in directories.items():
    # pro gerät alle daten in einen dataframe
    res = pd.concat(data, sort=False)
    #zu der tabelle eine spalte mit label hinzufügen
    res = res.assign(labels= lambda x: label)
    
    features.append(res)

# dataframes zu einem kombinieren
features = pd.concat(features, sort=False)

X = features.drop(columns=['labels'])
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)
Y = features['labels']

Y = pd.get_dummies(Y)
Y = Y.assign(labels= lambda x: 0 + x['lampe-ohp_voll'] + x['lampe-single'] * 2 + x['laptop-single'] * 3 + x['ohp_halb-laptop']* 4 + x['ohp_halb-laptop-lampe'] * 5 + x['ohp_halb-single'] * 6 + x['ohp_voll-laptop'] * 7 + x['ohp_voll-laptop-bosch-lampe'] * 8 + x['ohp_voll-single'] *9)
Y = Y['labels']

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, shuffle=True)

In [7]:
#train_data = lgb.Dataset(X_train, label=Y_train)
#test_data = lgb.Dataset(X_test, label=Y_test)

#param = {'num_leaves':200, 'num_trees':100, 'objective':'multiclass', 'num_class': 10}
#num_round = 100
#param['metric'] = ['multi_logloss', 'multi_error']

#bst = lgb.cv(param, train_data, num_round, nfold=5)
#bst = lgb.train(param, train_data, num_round, valid_sets=[train_data, test_data], valid_names = ['train','eval'])

In [8]:
from sklearn.ensemble import RandomForestClassifier
from time import time

clf = RandomForestClassifier(n_estimators=50, random_state=42, min_samples_split=2, verbose=2, n_jobs=-1)
startTime = time()

clf.fit(X_train, Y_train)

endTime = time()
diff = (endTime - startTime)
print("Time taken:", diff)

building tree 1 of 50building tree 2 of 50building tree 3 of 50
building tree 4 of 50


building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.1min


building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50
Time taken: 517.7442049980164


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.6min finished


In [9]:
print("Train:", round(10000*clf.score(X_train, Y_train))/100.0)
print("Test:", round(10000*clf.score(X_test, Y_test))/100.0)

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   36.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   55.1s finished


Train: 90.66


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    8.8s


Test: 90.38


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   13.0s finished


# Get a Sample to predict the class

In [10]:
num_test_samples = len(X_test)
sample = np.random.randint(len(X_test))

print("evaluating sample", sample, "of", num_test_samples)
#print(Y_test.head(), type(Y_test))
sample_X = X_test[sample]
sample_Y = Y_test[sample:sample+1]

print(sample_X)
print(sample_Y)

clf.predict(sample_X.reshape(1, -1))

evaluating sample 925448 of 1251840
[ 0.97181717  0.71547829  0.07610475]
186    9
Name: labels, dtype: uint8


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


array([9], dtype=uint8)

# Evaluate model with new data

- Record data
- export as csv
- Read csv files
- for each wave (represented by one file)
    - predict device
    - result is mode of the prediction (most often used classification prediction)
    - print to screen which device is active