In [1]:
import os
# Change native directory to root
os.chdir(os.path.dirname(os.getcwd()))

In [2]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from src.utils.functions import validation

random.seed(123)
model_dir = 'models/'

In [3]:
build_list = glob.glob('./data/buildings/*.csv')
test_list = random.sample(build_list, int(len(build_list) * 0.1))
train_list = [x for x in build_list if x not in test_list]

In [4]:
# Read the features from the csv file
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# read metadata csv
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep   = ';')
# ADD the functietype column to the features
features['function'] = metadata['Patrimonium Functietype']
# read more metrics from csv
features.isnull().sum()
features.dropna(inplace=True)
features['ID'] = features.index
# drop rows with kast as function
features = features[features['function'] != 'Kast']
features['morning'] = features.iloc[:,4:16].sum(axis=1)
features['evening'] = features.iloc[:,np.r_[:4,16:24]].sum(axis=1)
features['weekday'] = features.iloc[:,24:29].sum(axis=1)
features['weekend'] = features.iloc[:,29:31].sum(axis=1)
# scale yearly column to 0-1 with minmax scaler
subset = features[['yearly', 'weekend',  'evening']].copy()
subset['yearly'] = MinMaxScaler().fit_transform(subset['yearly'].values.reshape(-1,1))
subset

Unnamed: 0,yearly,weekend,evening
666,0.071996,0.295865,0.541792
399,0.022563,0.130884,0.326180
1544,0.116772,0.190230,0.348661
1655,0.008973,0.146399,0.396827
844,0.096520,0.213343,0.530799
...,...,...,...
320,0.586307,0.221625,0.362090
944,0.213859,0.171083,0.270924
444,0.004852,0.269399,0.386193
778,0.235348,0.256797,0.615815


In [30]:
clust_num = 10
kmeans = pickle.load(open(model_dir+"kmeans{}.pkl".format(clust_num),  "rb"))
clusters = kmeans.predict(subset)
features['cluster'] = clusters

In [35]:
profiles = pd.read_csv('./data/st_p_kmeans{}.csv'.format(clust_num), index_col=0)
profiles

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-01 00:15:00,0.000019,0.000017,0.000024,0.000021,0.000012,0.000055,0.000020,0.000019,0.000028,0.000007
2019-01-01 00:30:00,0.000019,0.000018,0.000024,0.000023,0.000012,0.000053,0.000020,0.000020,0.000029,0.000007
2019-01-01 00:45:00,0.000019,0.000017,0.000024,0.000022,0.000012,0.000047,0.000020,0.000019,0.000029,0.000008
2019-01-01 01:00:00,0.000019,0.000018,0.000024,0.000021,0.000012,0.000046,0.000020,0.000019,0.000029,0.000007
2019-01-01 01:15:00,0.000019,0.000018,0.000023,0.000021,0.000012,0.000046,0.000020,0.000021,0.000029,0.000007
...,...,...,...,...,...,...,...,...,...,...
2021-12-31 23:00:00,0.000019,0.000017,0.000019,0.000022,0.000013,0.000028,0.000019,0.000015,inf,0.000011
2021-12-31 23:15:00,0.000019,0.000016,0.000018,0.000022,0.000013,0.000028,0.000019,0.000013,0.000020,0.000009
2021-12-31 23:30:00,0.000018,0.000017,0.000018,0.000022,0.000013,0.000027,0.000019,0.000013,inf,0.000008
2021-12-31 23:45:00,0.000018,0.000016,0.000018,0.000022,0.000013,0.000027,0.000019,0.000013,inf,0.000010


In [40]:
subset = features[['yearly', 'weekend',  'evening']].copy()
mae = {}
rmse = {}
smape = {}
for ID in features.index:
    ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    #print(ts)
    clust_ts = kmeans.predict(np.array(subset.loc[ID]).reshape(1, -1))
    ts_syn = profiles[str(clust_ts[0])].copy() * features.loc[ID, 'yearly']
    # measure the error
    mae[ID] = validation(ts.values, ts_syn, 'MAE')
    rmse[ID] = validation(ts.values, ts_syn, 'RMSE')
    smape[ID] = validation(ts.values, ts_syn, 'SMAPE')

In [41]:
# mean of smape dict values
mean_smape = np.mean(list(smape.values()))
# mean of mae dict values
mean_mae = np.mean(list(mae.values()))
# mean of rmse dict values
mean_rmse = np.mean(list(rmse.values()))
print("Mean of SMAPE: {}".format(mean_smape))
print("Mean of MAE: {}".format(mean_mae))
print("Mean of RMSE: {}".format(mean_rmse))

Mean of SMAPE: 29.68118999770497
Mean of MAE: 7.094519138069932
Mean of RMSE: 9.6445259395783


In [None]:
features

Unnamed: 0_level_0,0,1,2
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:15:00,,,0.000031
2019-01-01 00:30:00,,,0.000031
2019-01-01 00:45:00,,,0.000030
2019-01-01 01:00:00,,,0.000030
2019-01-01 01:15:00,,,0.000030
...,...,...,...
2021-12-31 23:00:00,,,0.000023
2021-12-31 23:15:00,,,0.000024
2021-12-31 23:30:00,,,0.000023
2021-12-31 23:45:00,,,0.000023


In [13]:
for clust_num in range(3, 21):
    kmeans = pickle.load(open(model_dir+"kmeans{}.pkl".format(clust_num),  "rb"))
    subset = features[['yearly', 'weekend',  'evening']].copy()
    clusters = kmeans.predict(subset)
    features['cluster'] = clusters
    profiles = pd.DataFrame()
    # Create stanard profiles for each cluster
    for k, clust in enumerate(range(clust_num)):
        agg = pd.DataFrame()
        for i, ID in enumerate(features[features['cluster'] == clust].ID):
            #print(ID)
            agg[ID] = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0, usecols=['ds','Power'], parse_dates=['ds'])
            agg[ID] = agg[ID] / agg.loc['2019'][ID].sum()
        # Create an average profile over columns
        agg = agg.mean(axis=1)
        profiles[str(clust)] = agg
    mae = {}
    rmse = {}
    smape = {}
    for ID in features.index:
        ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
        #print(ts)
        clust_ts = kmeans.predict(np.array(subset.loc[ID]).reshape(1, -1))
        ts_syn = profiles[str(clust_ts[0])].copy() * features.loc[ID, 'yearly']
        ts_syn = ts_syn.to_frame()
        ts.index = pd.to_datetime(ts.index)
        ts_syn.index = pd.to_datetime(ts_syn.index)
        # drop inf values from ts_syn
        ts_syn = ts_syn[ts_syn < np.inf]
        # drop the nan values
        ts_syn = ts_syn.dropna()
        # keep the same index in ts as ts_syn
        ts = ts.loc[ts_syn.index]
        # measure the error
        mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
        rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
        smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
    # make a dataframe with 3 dictionaries as columns
    temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
    temp_df.to_csv('./results/kmeans{}.csv'.format(clust_num))



In [None]:
for clust_num in range(3, 21):
    kproto = pickle.load(open(model_dir+"kproto{}.pkl".format(clust_num),  "rb"))
    subset = features[['yearly', 'weekend',  'evening', 'function']].copy()
    clusters = kproto.predict(subset, categorical=[3])
    features['cluster'] = clusters
    profiles = pd.DataFrame()
    # Create stanard profiles for each cluster
    for k, clust in enumerate(range(clust_num)):
        agg = pd.DataFrame()
        for i, ID in enumerate(features[features['cluster'] == clust].ID):
            #print(ID)
            agg[ID] = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0, usecols=['ds','Power'], parse_dates=['ds'])
            agg[ID] = agg[ID] / agg.loc['2019'][ID].sum()
        # Create an average profile over columns
        agg = agg.mean(axis=1)
        profiles[str(clust)] = agg
    mae = {}
    rmse = {}
    smape = {}
    for ID in features.index:
        ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
        #print(ts)
        clust_ts = kmeans.predict(np.array(subset.loc[ID]).reshape(1, -1))
        ts_syn = profiles[str(clust_ts[0])].copy() * features.loc[ID, 'yearly']
        ts_syn = ts_syn.to_frame()
        ts.index = pd.to_datetime(ts.index)
        ts_syn.index = pd.to_datetime(ts_syn.index)
        # drop inf values from ts_syn
        ts_syn = ts_syn[ts_syn < np.inf]
        # drop the nan values
        ts_syn = ts_syn.dropna()
        # keep the same index in ts as ts_syn
        ts = ts.loc[ts_syn.index]
        # measure the error
        mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
        rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
        smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
    # make a dataframe with 3 dictionaries as columns
    temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
    temp_df.to_csv('./results/kproto{}.csv'.format(clust_num))

In [42]:
# loop through csv files in the results folder and calculate the mean of each column
mean_df = pd.DataFrame()
for file in os.listdir('./results/'):
    if file.endswith(".csv"):
        temp_df = pd.read_csv('./results/' + file)
        mean_df[file] = temp_df.mean(axis=0)
mean_df.drop('Unnamed: 0', axis=0, inplace=True)
mean_df.drop('mean_results.csv', axis=1, inplace=True)
mean_df = mean_df.T
mean_df['order'] = mean_df.index.map(split_index)
mean_df.sort_values('order', inplace=True)
mean_df.drop('order', axis=1, inplace=True)
mean_df.to_csv('./results/mean_results.csv')

In [35]:
import re
pattern = re.compile('([a-z]+)(\d*)', re.I)
def split_index(idx):
    m = pattern.match(idx)
    if m:
        letters = m.group(1)
        numbers = m.group(2)
        if numbers:
            return (letters, int(numbers))
        else:
            return (letters, 0)