In [2]:
import os
# Move from the notebook's folder up to the project root so that the relative
# paths used below ('data/', 'models/', './results/') and the `src` package
# resolve. The guard keeps the cell idempotent: without it every re-run would
# climb one directory higher.
# NOTE(review): assumes the project root is the folder that contains `src/`.
if not os.path.isdir('src'):
    os.chdir(os.path.dirname(os.getcwd()))

In [3]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
from tsfeatures import tsfeatures
from src.utils.functions import validation

# Folder (relative to the working directory) holding the pickled k-means models.
model_dir = 'models/'
# Seed Python's global random generator so the sampling done later is repeatable.
random.seed(123)

In [4]:
# Collect every building CSV. glob returns paths in arbitrary, OS-dependent
# order, so sort them: otherwise the seeded sample below would select
# different files on different machines.
build_list = sorted(glob.glob('./data/buildings/*.csv'))
# Hold out 10% of the buildings (rounded down) as the test set.
test_list = random.sample(build_list, int(len(build_list) * 0.1))
# Everything that was not sampled is training data; the set makes each
# membership check O(1) instead of scanning the list.
held_out = set(test_list)
train_list = [path for path in build_list if path not in held_out]

In [5]:
# Show the building files that were sampled into the held-out test set.
test_list

['./data/buildings/226.csv',
 './data/buildings/170.csv',
 './data/buildings/119.csv',
 './data/buildings/1104.csv',
 './data/buildings/1453.csv',
 './data/buildings/1488.csv',
 './data/buildings/760.csv',
 './data/buildings/866.csv',
 './data/buildings/1713.csv',
 './data/buildings/1250.csv',
 './data/buildings/1299.csv',
 './data/buildings/1061.csv',
 './data/buildings/914.csv',
 './data/buildings/529.csv',
 './data/buildings/297.csv',
 './data/buildings/232.csv',
 './data/buildings/920.csv',
 './data/buildings/221.csv',
 './data/buildings/1311.csv',
 './data/buildings/250.csv',
 './data/buildings/1276.csv',
 './data/buildings/1353.csv',
 './data/buildings/1715.csv',
 './data/buildings/1116.csv',
 './data/buildings/917.csv',
 './data/buildings/762.csv',
 './data/buildings/163.csv',
 './data/buildings/1631.csv',
 './data/buildings/956.csv',
 './data/buildings/43.csv',
 './data/buildings/59.csv',
 './data/buildings/1513.csv',
 './data/buildings/331.csv',
 './data/buildings/1186.csv',
 

In [10]:
# Per-building feature table, indexed by the first column of the CSV.
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# Building metadata (semicolon-separated), indexed by its first column.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Mean and standard deviation per building, indexed by the 'ID' column.
metrics = pd.read_csv('data/ts_metrics.csv', usecols=['ID', 'mean', 'std'], index_col='ID')

# Attach the function type (aligned on the index) and the extra metrics.
features['function'] = metadata['Patrimonium Functietype']
features = features.join(metrics)
# Keep only complete rows. Run `features.isnull().sum()` in its own cell
# before this step if the per-column null counts need to be inspected.
features = features.dropna()
features['ID'] = features.index
# Exclude buildings whose function is 'Kast'. The explicit copy makes the
# result an independent frame, so columns added to it later are not written
# to a filtered slice of the intermediate table.
features = features[features['function'] != 'Kast'].copy()
features

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,w2,w3,w4,w5,w6,yearly,function,mean,std,ID
666,0.039596,0.037584,0.036305,0.035503,0.034760,0.034023,0.033480,0.033304,0.033433,0.034265,...,0.136372,0.148047,0.160725,0.150751,0.145114,308362.90,Andere gebouwen,5.563986,17.810517,666
399,0.023548,0.023489,0.023494,0.023669,0.024771,0.029233,0.033275,0.043361,0.073395,0.087735,...,0.134992,0.188926,0.173594,0.065251,0.065634,100270.95,School,2.888369,3.724561,399
1544,0.020197,0.019855,0.019903,0.020176,0.020237,0.026546,0.043857,0.055716,0.064573,0.069158,...,0.166665,0.171826,0.141554,0.095879,0.094351,496847.76,Stadhuis/Gemeentehuis,13.306562,8.142373,1544
1655,0.035502,0.035733,0.035597,0.035926,0.035941,0.036661,0.034927,0.042369,0.086040,0.094630,...,0.143411,0.196228,0.162992,0.073898,0.072501,43067.00,Lagere school,1.054809,1.796610,1655
844,0.034837,0.031965,0.029652,0.028188,0.027135,0.024033,0.022991,0.022195,0.033942,0.046249,...,0.164637,0.176120,0.119342,0.144373,0.068970,411595.23,School,10.690519,9.587791,844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,0.022122,0.021285,0.020963,0.020881,0.022279,0.026788,0.046466,0.065346,0.068977,0.069152,...,0.155299,0.156233,0.153407,0.111514,0.110111,2473361.00,RVT/WZC/revalidatiecentrum,70.525623,35.880629,320
944,0.013873,0.013889,0.013957,0.014145,0.016034,0.023628,0.035757,0.052698,0.072488,0.082926,...,0.169847,0.170205,0.161782,0.085471,0.085612,905537.58,Bibliotheek,22.370093,18.574790,944
444,0.028038,0.028319,0.028868,0.029173,0.029472,0.030041,0.039209,0.055161,0.069871,0.064558,...,0.183859,0.120475,0.158235,0.177234,0.092165,25718.20,Werkplaats,0.896637,1.061082,444
778,0.027741,0.026469,0.026011,0.025903,0.025678,0.026909,0.028480,0.029036,0.031723,0.035783,...,0.150864,0.158515,0.142460,0.128643,0.128154,995995.20,Stadion,22.055094,21.576259,778


In [62]:
clust_num = 4
# Load the fitted k-means model for this number of clusters. The context
# manager closes the file handle once the model is unpickled.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(model_dir + "kmeans{}.pkl".format(clust_num), "rb") as model_file:
    kmeans = pickle.load(model_file)
# Model input: the feature columns only. A 'cluster' column left over from an
# earlier run of this cell is removed as well, so the cell can be re-run.
subset = (
    features
    .drop(['function', 'ID', 'mean', 'std', 'yearly'], axis=1)
    .drop(columns='cluster', errors='ignore')
    .copy()
)
clusters = kmeans.predict(subset)
features['cluster'] = clusters
profiles = pd.DataFrame()
# Create a standard profile for each cluster
for clust in range(clust_num):
    member_ids = features[features['cluster'] == clust].index
    if len(member_ids) == 0:
        # No building falls in this cluster: skip it rather than reuse a
        # stale `agg` from the previous cluster.
        continue
    agg = None
    for ID in member_ids:
        # One series per building: 'ds' becomes the datetime index and the
        # remaining column holds the values.
        ts = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0,
                         usecols=['ds', 'Power'], parse_dates=['ds']).iloc[:, 0]
        # Normalise by the total of the first annual ('A') bin of the series.
        ts = ts / ts.resample('A').sum().values[0]
        if agg is None:
            # The first building of the cluster defines the row index.
            agg = ts.to_frame(name=ID)
        else:
            # Later buildings are aligned to that index on assignment.
            agg[ID] = ts
    # Drop the rows with any infinite values (e.g. after dividing by a zero total)
    agg = agg[~np.isinf(agg).any(axis=1)]
    # Average across buildings to get the cluster's profile
    profiles[clust] = agg.mean(axis=1)

In [65]:
subset = features.drop(['function', 'ID', 'mean', 'std', 'yearly', 'cluster'], axis=1).copy()
# Assign every building to its cluster with one batched call instead of one
# predict() call per row on a bare array.
cluster_of = pd.Series(kmeans.predict(subset), index=subset.index)
mae = {}
rmse = {}
smape = {}
for ID in features.index:
    # Measured series of this building.
    ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    # Synthetic series: the cluster's profile scaled by the building's 'yearly' value.
    ts_syn = profiles[cluster_of.loc[ID]] * features.loc[ID, 'yearly']
    # Measure the error.
    # NOTE(review): `ts.values` is a 2-D array from a one-column frame, while
    # `ts_syn` is a Series on the profile's index; presumably `validation`
    # compares them position by position - confirm both have the same
    # length and ordering.
    mae[ID] = validation(ts.values, ts_syn, 'MAE')
    rmse[ID] = validation(ts.values, ts_syn, 'RMSE')
    smape[ID] = validation(ts.values, ts_syn, 'SMAPE')

In [66]:
# Collapse each per-building error dictionary into a single average score.
mean_smape, mean_mae, mean_rmse = (
    np.mean(list(errors.values())) for errors in (smape, mae, rmse)
)
# Report the averages in the order SMAPE, MAE, RMSE.
for template, score in (
    ("Mean of SMAPE: {}", mean_smape),
    ("Mean of MAE: {}", mean_mae),
    ("Mean of RMSE: {}", mean_rmse),
):
    print(template.format(score))

Mean of SMAPE: 26.349291883785757
Mean of MAE: 6.559657505727089
Mean of RMSE: 9.036385269764164


In [21]:
# Evaluate the standard-profile approach for several cluster counts and store
# the per-building errors of each run in './results/kmeans<k>.csv'.
for clust_num in [10,11,12]:
    # Load the fitted model; the context manager closes the file handle.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(model_dir + "kmeans{}.pkl".format(clust_num), "rb") as model_file:
        kmeans = pickle.load(model_file)
    # Model input: feature columns only, without a 'cluster' column that a
    # previous iteration or cell may have added.
    subset = (
        features
        .drop(['function', 'ID', 'mean', 'std', 'yearly'], axis=1)
        .drop(columns='cluster', errors='ignore')
        .copy()
    )
    clusters = kmeans.predict(subset)
    features['cluster'] = clusters
    profiles = pd.DataFrame()
    # Create a standard profile for each cluster
    for clust in range(clust_num):
        member_ids = features[features['cluster'] == clust].index
        if len(member_ids) == 0:
            # No building falls in this cluster: skip it rather than reuse a
            # stale `agg` from the previous cluster. Such a profile is never
            # looked up below, because the labels come from the same prediction.
            continue
        agg = None
        for ID in member_ids:
            # One series per building: 'ds' becomes the datetime index and the
            # remaining column holds the values.
            ts = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0,
                             usecols=['ds', 'Power'], parse_dates=['ds']).iloc[:, 0]
            # Normalise by the total of the first annual ('A') bin of the series.
            ts = ts / ts.resample('A').sum().values[0]
            if agg is None:
                # The first building of the cluster defines the row index.
                agg = ts.to_frame(name=ID)
            else:
                # Later buildings are aligned to that index on assignment.
                agg[ID] = ts
        # Drop the rows with any infinite values (e.g. after dividing by a zero total)
        agg = agg[~np.isinf(agg).any(axis=1)]
        # Average across buildings to get the cluster's profile
        profiles[clust] = agg.mean(axis=1)
    mae = {}
    rmse = {}
    smape = {}
    # `clusters` is in the same row order as `features`, so the labels are
    # reused here instead of calling predict() again for every building.
    for ID, clust in zip(features.index, clusters):
        ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
        # Synthetic series: the cluster's profile scaled by the building's 'yearly' value.
        ts_syn = profiles[clust] * features.loc[ID, 'yearly']
        # Measure the error.
        # NOTE(review): presumably `validation` compares the two inputs position
        # by position - confirm `ts` and `ts_syn` have the same length and order.
        mae[ID] = validation(ts.values, ts_syn, 'MAE')
        rmse[ID] = validation(ts.values, ts_syn, 'RMSE')
        smape[ID] = validation(ts.values, ts_syn, 'SMAPE')
    # One row per building, one column per metric.
    temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
    # Make sure the output folder exists before writing.
    os.makedirs('./results', exist_ok=True)
    temp_df.to_csv('./results/kmeans{}.csv'.format(clust_num))

In [12]:
# Inspect the feature matrix currently held in `subset`.
subset

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,h22,h23,w0,w1,w2,w3,w4,w5,w6,cluster
666,0.039596,0.037584,0.036305,0.035503,0.034760,0.034023,0.033480,0.033304,0.033433,0.034265,...,0.045225,0.042126,0.124429,0.134562,0.136372,0.148047,0.160725,0.150751,0.145114,2
399,0.023548,0.023489,0.023494,0.023669,0.024771,0.029233,0.033275,0.043361,0.073395,0.087735,...,0.031405,0.026247,0.176705,0.194899,0.134992,0.188926,0.173594,0.065251,0.065634,1
1544,0.020197,0.019855,0.019903,0.020176,0.020237,0.026546,0.043857,0.055716,0.064573,0.069158,...,0.020998,0.020609,0.162877,0.166848,0.166665,0.171826,0.141554,0.095879,0.094351,1
1655,0.035502,0.035733,0.035597,0.035926,0.035941,0.036661,0.034927,0.042369,0.086040,0.094630,...,0.036068,0.035720,0.173718,0.177252,0.143411,0.196228,0.162992,0.073898,0.072501,1
844,0.034837,0.031965,0.029652,0.028188,0.027135,0.024033,0.022991,0.022195,0.033942,0.046249,...,0.040984,0.038080,0.165019,0.161539,0.164637,0.176120,0.119342,0.144373,0.068970,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,0.022122,0.021285,0.020963,0.020881,0.022279,0.026788,0.046466,0.065346,0.068977,0.069152,...,0.026553,0.023821,0.155959,0.157476,0.155299,0.156233,0.153407,0.111514,0.110111,1
944,0.013873,0.013889,0.013957,0.014145,0.016034,0.023628,0.035757,0.052698,0.072488,0.082926,...,0.014086,0.013823,0.152333,0.174751,0.169847,0.170205,0.161782,0.085471,0.085612,1
444,0.028038,0.028319,0.028868,0.029173,0.029472,0.030041,0.039209,0.055161,0.069871,0.064558,...,0.027429,0.027425,0.109299,0.158733,0.183859,0.120475,0.158235,0.177234,0.092165,2
778,0.027741,0.026469,0.026011,0.025903,0.025678,0.026909,0.028480,0.029036,0.031723,0.035783,...,0.045904,0.032447,0.134813,0.156552,0.150864,0.158515,0.142460,0.128643,0.128154,2


In [42]:
def _result_sort_key(file_name):
    """Return (prefix, number) so 'kmeans10.csv' sorts after 'kmeans9.csv'.

    The trailing digits of the file stem are compared numerically; a stem
    without trailing digits gets the number 0.
    """
    stem = os.path.splitext(file_name)[0]
    prefix = stem.rstrip('0123456789')
    digits = stem[len(prefix):]
    return (prefix, int(digits) if digits else 0)


# Average every metric column of each per-model result file in './results/'.
# The sort key is computed locally so this cell does not depend on helpers
# defined further down the notebook.
summary_name = 'mean_results.csv'
column_means = {}
for file in os.listdir('./results/'):
    # Skip non-CSV files and the summary written by a previous run of this cell.
    if not file.endswith(".csv") or file == summary_name:
        continue
    # The first column holds the building IDs; using it as the index keeps it
    # out of the averages.
    temp_df = pd.read_csv('./results/' + file, index_col=0)
    column_means[file] = temp_df.mean(axis=0)
# One row per result file, one column per metric, ordered by cluster count.
mean_df = pd.DataFrame(column_means).T
mean_df = mean_df.loc[sorted(mean_df.index, key=_result_sort_key)]
mean_df.to_csv('./results/mean_results.csv')

In [43]:
# Show the mean error metrics per result file.
mean_df

Unnamed: 0,MAE,RMSE,SMAPE
kmeans3.csv,6.613694,9.174545,27.498288
kmeans4.csv,6.613694,9.174545,27.498288
kmeans5.csv,6.613694,9.174545,27.498288
kmeans6.csv,6.594035,9.114526,27.544606
kmeans7.csv,6.543726,9.110357,26.231239
kmeans8.csv,6.447444,8.880074,24.879646
kmeans9.csv,6.290287,8.826643,24.019333
kmeans10.csv,6.225108,8.673581,23.778875
kmeans11.csv,6.111726,8.709419,23.494507
kmeans12.csv,5.981924,8.398807,23.098314


In [35]:
import re
# Leading letters followed by an optional run of digits, matched
# case-insensitively at the start of the string (e.g. 'kmeans10').
# Written as a raw string: '\d' in a plain literal is an invalid escape sequence.
pattern = re.compile(r'([a-z]+)(\d*)', re.I)


def split_index(idx):
    """Split a label such as 'kmeans10.csv' into a natural-sort key.

    Parameters
    ----------
    idx : str
        Label to split; only its leading letters and the digits that
        directly follow them are used.

    Returns
    -------
    tuple or None
        ``(letters, number)``, where ``number`` is 0 when no digits follow
        the letters. ``None`` when ``idx`` does not start with a letter.
    """
    m = pattern.match(idx)
    if m is None:
        return None
    letters, numbers = m.groups()
    return (letters, int(numbers) if numbers else 0)