In [1]:
import os
# Change the working directory to the parent of the directory the kernel started in
# (the later cells read their inputs relative to it, e.g. './data/buildings').
# The starting directory is remembered on the first run so that re-running this
# cell does not climb one more level up each time.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [34]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
from tsfeatures import tsfeatures

# Seed Python's `random` module so that the `random.sample` call which picks the
# held-out buildings returns the same selection for the same input list.
random.seed(123)

In [35]:
# Collect every building CSV and hold out 10% of them (rounded down) as the test set;
# the remaining files form the training set.
# NOTE(review): glob does not guarantee a sorted listing, so the seeded sample is only
# repeatable while the directory listing order stays the same. Sorting here would change
# which files are held out, so confirm against any artefacts built from this split first.
build_list = glob.glob('./data/buildings/*.csv')
test_list = random.sample(build_list, int(len(build_list) * 0.1))
# Set membership keeps the partition linear in the number of files.
held_out = set(test_list)
train_list = [path for path in build_list if path not in held_out]

In [36]:
# Display the held-out file paths selected by random.sample.
test_list

['./data/buildings/226.csv',
 './data/buildings/170.csv',
 './data/buildings/119.csv',
 './data/buildings/1104.csv',
 './data/buildings/1453.csv',
 './data/buildings/1488.csv',
 './data/buildings/760.csv',
 './data/buildings/866.csv',
 './data/buildings/1713.csv',
 './data/buildings/1250.csv',
 './data/buildings/1299.csv',
 './data/buildings/1061.csv',
 './data/buildings/914.csv',
 './data/buildings/529.csv',
 './data/buildings/297.csv',
 './data/buildings/232.csv',
 './data/buildings/920.csv',
 './data/buildings/221.csv',
 './data/buildings/1311.csv',
 './data/buildings/250.csv',
 './data/buildings/1276.csv',
 './data/buildings/1353.csv',
 './data/buildings/1715.csv',
 './data/buildings/1116.csv',
 './data/buildings/917.csv',
 './data/buildings/762.csv',
 './data/buildings/163.csv',
 './data/buildings/1631.csv',
 './data/buildings/956.csv',
 './data/buildings/43.csv',
 './data/buildings/59.csv',
 './data/buildings/1513.csv',
 './data/buildings/331.csv',
 './data/buildings/1186.csv',
 

In [40]:
# Feature table, indexed by the first column of the CSV (the index values are used
# further down as building file names).
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# Metadata table (semicolon-separated), indexed by its first column.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# The 'mean' and 'std' columns of ts_metrics.csv, keyed by 'ID'.
metrics = pd.read_csv('data/ts_metrics.csv', usecols=['ID', 'mean', 'std'], index_col='ID')

# Attach the function type (aligned on the index) and the metrics, then drop every
# row that has a missing value in any column. Built as one chain so nothing is
# mutated in place.
features = (
    features
    .assign(function=metadata['Patrimonium Functietype'])
    .join(metrics)
    .dropna()
)
# Keep the index available as a regular column as well.
features['ID'] = features.index
# Drop rows whose function type is 'Kast'.
features = features[features['function'] != 'Kast']
features

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,w2,w3,w4,w5,w6,yearly,function,mean,std,ID
119,0.020468,0.020456,0.020752,0.022444,0.026098,0.033939,0.043871,0.056856,0.070734,0.072694,...,0.165600,0.183370,0.176833,0.056413,0.056406,142869.45,Lagere school,3.655874,3.498496,119
1104,0.043612,0.043444,0.043540,0.043546,0.043543,0.041459,0.040577,0.040982,0.041782,0.041748,...,0.137432,0.138564,0.148916,0.137136,0.137556,136887.70,Werkplaats,3.890467,1.560140,1104
1488,0.026629,0.026035,0.026054,0.026167,0.025995,0.026190,0.030007,0.035284,0.037805,0.047133,...,0.147712,0.160097,0.145658,0.132150,0.112098,549785.32,Cultureel centrum,19.905225,14.669210,1488
1713,0.035669,0.035579,0.035756,0.036011,0.036187,0.036516,0.037440,0.039859,0.046001,0.051040,...,0.147649,0.152212,0.153118,0.121504,0.122105,360576.35,Brandweerkazerne,10.305312,5.439933,1713
1299,0.018835,0.018331,0.018182,0.018152,0.018310,0.021053,0.030045,0.041678,0.052797,0.067279,...,0.141955,0.141830,0.145330,0.166741,0.134618,1954162.00,Museum,72.726192,55.687306,1299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0.021506,0.021451,0.021539,0.021910,0.023544,0.026610,0.036858,0.058945,0.074942,0.077487,...,0.161060,0.174752,0.165496,0.070982,0.071713,470418.71,Werkplaats,13.808177,12.372584,326
1710,0.027653,0.027225,0.027107,0.026679,0.027035,0.030729,0.038195,0.048256,0.056940,0.060483,...,0.150715,0.158483,0.155441,0.132841,0.113720,258533.81,Cultureel centrum,8.940384,7.234940,1710
1126,0.026913,0.027014,0.027244,0.027394,0.027722,0.029261,0.039204,0.050774,0.057621,0.059831,...,0.161678,0.162963,0.162928,0.106443,0.090295,129858.00,Containerpark,3.922833,2.667552,1126
570,0.028417,0.028364,0.028324,0.028316,0.028304,0.029397,0.039103,0.050298,0.055833,0.058872,...,0.139110,0.140870,0.129529,0.132618,0.148795,261676.20,Ontmoetingscentrum,7.293096,2.980066,570


In [26]:
# Number of clusters; also selects which pickled model file is loaded.
clust_num = 4
# NOTE: unpickling executes arbitrary code -- only load model files from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open("kmeans{}.pkl".format(clust_num), "rb") as model_file:
    kmeans = pickle.load(model_file)
# Remove the columns that are not passed to the model. 'cluster' only exists when this
# cell has already been run once, so it is dropped leniently to keep the cell re-runnable.
subset = (
    features
    .drop(columns=['function', 'ID', 'mean', 'std', 'yearly'])
    .drop(columns='cluster', errors='ignore')
    .copy()
)
clusters = kmeans.predict(subset)
# Store the predicted cluster label next to each row's features.
features['cluster'] = clusters

In [27]:
def _load_normalized_power(building_id):
    """Read one building's 'Power' series and divide it by its summed 2019 values.

    Returns a Series indexed by the parsed 'ds' timestamps. If the 2019 sum is
    zero the division yields inf/NaN values, which the caller filters.
    """
    power = pd.read_csv(
        './data/buildings/' + str(building_id) + '.csv',
        index_col='ds',
        usecols=['ds', 'Power'],
        parse_dates=['ds'],
    )['Power']
    # .loc is required for partial-string row selection (plain [] was removed for
    # DataFrames in pandas 2.0).
    return power / power.loc['2019'].sum()


profiles = pd.DataFrame()
# Create standard profiles for each cluster
for clust in range(clust_num):
    member_ids = features[features.cluster == clust].index
    if len(member_ids) == 0:
        # No rows were assigned to this cluster: skip it instead of silently
        # reusing the previous cluster's data.
        continue
    # The first member defines the time index; later members are aligned to it
    # on column assignment.
    agg = _load_normalized_power(member_ids[0]).to_frame()
    for ID in member_ids[1:]:
        agg[ID] = _load_normalized_power(ID)
    # drop the rows with any infinite values
    agg = agg[~np.isinf(agg).any(axis=1)]
    # Create an average profile over columns
    agg = agg.mean(axis=1)
    profiles[clust] = agg

In [28]:
# Display the averaged profiles: one column per cluster, one row per timestamp.
profiles

Unnamed: 0_level_0,0,1,2,3
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01 00:15:00,0.000015,0.000034,0.000002,0.000017
2019-01-01 00:30:00,0.000014,0.000036,0.000002,0.000017
2019-01-01 00:45:00,0.000010,0.000033,0.000002,0.000017
2019-01-01 01:00:00,0.000014,0.000033,0.000002,0.000016
2019-01-01 01:15:00,0.000014,0.000032,0.000002,0.000015
...,...,...,...,...
2021-12-31 23:00:00,0.000011,0.000021,0.000002,0.000017
2021-12-31 23:15:00,0.000009,0.000021,0.000017,0.000016
2021-12-31 23:30:00,0.000009,0.000020,0.000019,0.000016
2021-12-31 23:45:00,0.000009,0.000020,0.000010,0.000015


In [43]:
# Display the held-out file paths again (same list as shown earlier in the notebook).
test_list

['./data/buildings/226.csv',
 './data/buildings/170.csv',
 './data/buildings/119.csv',
 './data/buildings/1104.csv',
 './data/buildings/1453.csv',
 './data/buildings/1488.csv',
 './data/buildings/760.csv',
 './data/buildings/866.csv',
 './data/buildings/1713.csv',
 './data/buildings/1250.csv',
 './data/buildings/1299.csv',
 './data/buildings/1061.csv',
 './data/buildings/914.csv',
 './data/buildings/529.csv',
 './data/buildings/297.csv',
 './data/buildings/232.csv',
 './data/buildings/920.csv',
 './data/buildings/221.csv',
 './data/buildings/1311.csv',
 './data/buildings/250.csv',
 './data/buildings/1276.csv',
 './data/buildings/1353.csv',
 './data/buildings/1715.csv',
 './data/buildings/1116.csv',
 './data/buildings/917.csv',
 './data/buildings/762.csv',
 './data/buildings/163.csv',
 './data/buildings/1631.csv',
 './data/buildings/956.csv',
 './data/buildings/43.csv',
 './data/buildings/59.csv',
 './data/buildings/1513.csv',
 './data/buildings/331.csv',
 './data/buildings/1186.csv',
 

In [44]:
# Work through one example: the first row of the feature table.
ID = features.index[0]
# Parse 'ds' as timestamps so the index is comparable with the index of `profiles`.
ts = pd.read_csv(
    './data/buildings/{}.csv'.format(ID),
    usecols=['Power', 'ds'],
    index_col='ds',
    parse_dates=['ds'],
)
# predict() is given a one-row frame holding the same columns as the cell that
# assigns clusters: the non-model columns are removed, and 'cluster' is dropped
# leniently because it only exists once that cell has run.
model_input = (
    features.loc[[ID]]
    .drop(columns=['function', 'ID', 'mean', 'std', 'yearly'])
    .drop(columns='cluster', errors='ignore')
)
clust_ts = int(kmeans.predict(model_input)[0])
# Clusters are the columns of `profiles`, so select by column, not by row label.
ts_syn = profiles[clust_ts].copy()
# TODO: measure the error between `ts` and `ts_syn`


119
                     Power
ds                        
2019-01-01 00:15:00   1.40
2019-01-01 00:30:00   1.60
2019-01-01 00:45:00   1.40
2019-01-01 01:00:00   1.55
2019-01-01 01:15:00   1.85
...                    ...
2021-12-31 23:00:00   1.90
2021-12-31 23:15:00   1.90
2021-12-31 23:30:00   1.90
2021-12-31 23:45:00   1.90
2022-01-01 00:00:00   1.90

[105216 rows x 1 columns]
h0               0.020468
h1               0.020456
h2               0.020752
h3               0.022444
h4               0.026098
h5               0.033939
h6               0.043871
h7               0.056856
h8               0.070734
h9               0.072694
h10              0.070448
h11              0.066232
h12              0.061108
h13              0.059334
h14               0.06026
h15              0.059266
h16              0.053615
h17              0.045269
h18                0.0349
h19              0.028217
h20              0.020149
h21              0.017273
h22              0.016422
h23              0.0

In [30]:
# Read the 'Power' series of every held-out building, indexed by 'ds'.
# NOTE(review): `ts` is overwritten on each iteration and nothing is computed from
# it yet -- this loop looks unfinished. 'ds' is also read without date parsing here.
for filename in test_list:
    ts = pd.read_csv(filename, usecols=['Power', 'ds'], index_col='ds')
    

NameError: name 'buildings' is not defined