In [1]:
import os
# Step the working directory up one level so that the relative paths used
# below ('data/', 'models/', 'scalers/', 'src') resolve.
# NOTE: this cell is not idempotent - every re-run moves up one more level.
_parent_dir = os.path.dirname(os.getcwd())
os.chdir(_parent_dir)

In [39]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from src.utils.functions import validation

# Relative folders holding the scaler, the input data and the fitted models.
scaler_dir = 'scalers/'
input_dir = 'data/'
model_dir = 'models/'

# Seed Python's built-in RNG so the random.sample() split below is repeatable.
random.seed(123)

In [3]:
# Collect every building CSV and hold out 10% of them as a test set.
# glob.glob() returns files in a file-system dependent order, so sort first:
# otherwise the seeded random.sample() would pick different files on
# different machines even though the seed is fixed.
build_list = sorted(glob.glob('./data/buildings/*.csv'))
test_list = random.sample(build_list, int(len(build_list) * 0.1))
# Set membership keeps the filter linear in the number of files.
_test_set = set(test_list)
train_list = [path for path in build_list if path not in _test_set]

In [45]:
# Read the per-building features; the first CSV column becomes the index.
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# Read the building metadata (semicolon-separated).
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the function type; pandas aligns the assigned Series on the index.
features['function'] = metadata['Patrimonium Functietype']
# Drop rows with any missing value (including buildings without metadata).
features = features.dropna()
features['ID'] = features.index
# Drop rows with 'Kast' as function. The explicit .copy() makes the result an
# independent frame, so the column assignments below never write to a slice
# of the unfiltered frame (avoids SettingWithCopyWarning).
features = features[features['function'] != 'Kast'].copy()
# Aggregate shares by column position.
# Presumably columns 0-23 are the hourly shares h0..h23 and columns 24-30 the
# daily shares w0..w6 (matches the displayed header) - TODO confirm.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[:4, 16:24]].sum(axis=1)
features['weekday'] = features.iloc[:, 24:29].sum(axis=1)
features['weekend'] = features.iloc[:, 29:31].sum(axis=1)
# Load the previously saved scaler; it is applied to the 'yearly' column
# later, in the evaluation cells.
scaler = joblib.load(scaler_dir + 'scaler.gz')
features

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,w4,w5,w6,yearly,function,ID,morning,evening,weekday,weekend
666,0.039596,0.037584,0.036305,0.035503,0.034760,0.034023,0.033480,0.033304,0.033433,0.034265,...,0.160725,0.150751,0.145114,308362.90,Andere gebouwen,666,0.458208,0.541792,0.704135,0.295865
399,0.023548,0.023489,0.023494,0.023669,0.024771,0.029233,0.033275,0.043361,0.073395,0.087735,...,0.173594,0.065251,0.065634,100270.95,School,399,0.673820,0.326180,0.869116,0.130884
1544,0.020197,0.019855,0.019903,0.020176,0.020237,0.026546,0.043857,0.055716,0.064573,0.069158,...,0.141554,0.095879,0.094351,496847.76,Stadhuis/Gemeentehuis,1544,0.651339,0.348661,0.809770,0.190230
1655,0.035502,0.035733,0.035597,0.035926,0.035941,0.036661,0.034927,0.042369,0.086040,0.094630,...,0.162992,0.073898,0.072501,43067.00,Lagere school,1655,0.603173,0.396827,0.853601,0.146399
844,0.034837,0.031965,0.029652,0.028188,0.027135,0.024033,0.022991,0.022195,0.033942,0.046249,...,0.119342,0.144373,0.068970,411595.23,School,844,0.469201,0.530799,0.786657,0.213343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,0.022122,0.021285,0.020963,0.020881,0.022279,0.026788,0.046466,0.065346,0.068977,0.069152,...,0.153407,0.111514,0.110111,2473361.00,RVT/WZC/revalidatiecentrum,320,0.637910,0.362090,0.778375,0.221625
944,0.013873,0.013889,0.013957,0.014145,0.016034,0.023628,0.035757,0.052698,0.072488,0.082926,...,0.161782,0.085471,0.085612,905537.58,Bibliotheek,944,0.729076,0.270924,0.828917,0.171083
444,0.028038,0.028319,0.028868,0.029173,0.029472,0.030041,0.039209,0.055161,0.069871,0.064558,...,0.158235,0.177234,0.092165,25718.20,Werkplaats,444,0.613807,0.386193,0.730601,0.269399
778,0.027741,0.026469,0.026011,0.025903,0.025678,0.026909,0.028480,0.029036,0.031723,0.035783,...,0.142460,0.128643,0.128154,995995.20,Stadion,778,0.384185,0.615815,0.743203,0.256797


In [41]:
# Evaluate the synthetic profiles of every saved k-means model against the
# measured series of each building and store the per-building errors.

# The clustering inputs do not depend on the model, so build them once.
subset = features[['yearly', 'weekend', 'evening']].copy()
subset['yearly'] = scaler.transform(subset['yearly'].values.reshape(-1, 1))
# Make sure the output folder exists before the first to_csv call.
os.makedirs('./results', exist_ok=True)

for clust_num in [3, 5, 10, 15, 20]:
    # NOTE: pickle.load can execute arbitrary code - only load model files
    # that were produced by this project.
    with open(model_dir + "kmeans{}.pkl".format(clust_num), "rb") as model_file:
        kmeans = pickle.load(model_file)
    profiles = pd.read_csv('./data/st_p_kmeans{}.csv'.format(clust_num), index_col=0)
    # Parse the timestamps once per model instead of once per building.
    profiles.index = pd.to_datetime(profiles.index)
    mae = {}
    rmse = {}
    smape = {}
    for ID in features.index:
        ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
        ts.index = pd.to_datetime(ts.index)
        # Assign the building to a cluster from its (scaled) features.
        clust_ts = kmeans.predict(np.array(subset.loc[ID]).reshape(1, -1))
        # Scale the cluster profile by the building's unscaled yearly value.
        ts_syn = profiles[str(clust_ts[0])].copy() * features.loc[ID, 'yearly']
        ts_syn = ts_syn.to_frame()
        # Mask inf values as NaN, then drop every NaN row.
        ts_syn = ts_syn[ts_syn < np.inf]
        ts_syn = ts_syn.dropna()
        # Keep the same index in ts as in ts_syn.
        ts = ts.loc[ts_syn.index]
        # Measure the error.
        mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
        rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
        smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
    # One row per building, one column per metric.
    temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
    temp_df.to_csv('./results/kmeans{}.csv'.format(clust_num))



In [47]:
# Inspect the scaled 'yearly' feature as an (n, 1) column array.
subset[['yearly']].to_numpy()

array([[0.03938117],
       [0.01280565],
       [0.06345265],
       [0.00550011],
       [0.05256501],
       [0.23834761],
       [0.06046128],
       [0.0130511 ],
       [0.01928938],
       [0.00306308],
       [0.03545234],
       [0.03633033],
       [0.04571935],
       [0.50017327],
       [0.05397752],
       [0.04953466],
       [0.1231577 ],
       [0.24636416],
       [0.01239872],
       [0.08217924],
       [0.01806075],
       [0.05869903],
       [0.27981856],
       [0.03176009],
       [0.07550734],
       [0.0167502 ],
       [0.06150467],
       [0.02103377],
       [0.035703  ],
       [0.04120995],
       [0.02619853],
       [0.21177087],
       [0.53827466],
       [0.17458057],
       [0.00545716],
       [0.13027535],
       [0.02785196],
       [0.01929947],
       [0.15009899],
       [0.05469877],
       [0.00877139],
       [0.03338769],
       [0.02269051],
       [0.02065794],
       [0.01766105],
       [0.04325118],
       [0.01547399],
       [0.144

In [48]:
# Evaluate the synthetic profiles of every saved k-prototypes model against
# the measured series of each building and store the per-building errors.

# The clustering inputs do not depend on the model, so build them once.
# 'function' is the categorical feature (column position 3).
subset = features[['yearly', 'weekend', 'evening', 'function']].copy()
subset['yearly'] = scaler.transform(subset['yearly'].values.reshape(-1, 1))
# Make sure the output folder exists before the first to_csv call.
os.makedirs('./results', exist_ok=True)

for clust_num in [3, 5, 10, 15, 20]:
    # NOTE: pickle.load can execute arbitrary code - only load model files
    # that were produced by this project.
    with open(model_dir + "kproto{}.pkl".format(clust_num), "rb") as model_file:
        kproto = pickle.load(model_file)
    profiles = pd.read_csv('./data/st_p_kproto{}.csv'.format(clust_num), index_col=0)
    # Parse the timestamps once per model instead of once per building.
    profiles.index = pd.to_datetime(profiles.index)
    mae = {}
    rmse = {}
    smape = {}
    for ID in features.index:
        ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
        ts.index = pd.to_datetime(ts.index)
        # Assign the building to a cluster from its features.
        clust_ts = kproto.predict(np.array(subset.loc[ID]).reshape(1, -1), categorical=[3])
        # Scale the cluster profile by the building's unscaled yearly value.
        ts_syn = profiles[str(clust_ts[0])].copy() * features.loc[ID, 'yearly']
        ts_syn = ts_syn.to_frame()
        # Mask inf values as NaN, then drop every NaN row.
        ts_syn = ts_syn[ts_syn < np.inf]
        ts_syn = ts_syn.dropna()
        # Keep the same index in ts as in ts_syn.
        ts = ts.loc[ts_syn.index]
        # Measure the error.
        mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
        rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
        smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
    # One row per building, one column per metric.
    temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
    temp_df.to_csv('./results/kproto{}.csv'.format(clust_num))

In [51]:
# Average every per-building metric file in ./results into one summary table
# (one row per result file, one column per metric) and save it.

def _result_sort_key(filename):
    """Sort key: name prefix first, then trailing number (kmeans5 < kmeans10)."""
    stem = os.path.splitext(filename)[0]
    prefix = stem.rstrip('0123456789')
    digits = stem[len(prefix):]
    return (prefix, int(digits) if digits else 0)


means = {}
for file in os.listdir('./results/'):
    # Skip non-CSV files and the summary this cell writes itself; reading the
    # summary back in would mix file names into the numeric averages.
    if not file.endswith(".csv") or file == 'mean_results.csv':
        continue
    temp_df = pd.read_csv('./results/' + file)
    # 'Unnamed: 0' is the saved row index, not a metric.
    temp_df = temp_df.drop(columns='Unnamed: 0', errors='ignore')
    means[file] = temp_df.mean(axis=0, numeric_only=True)

# Build the frame in one go, then put the files in natural order.
mean_df = pd.DataFrame(means).T
mean_df = mean_df.loc[sorted(mean_df.index, key=_result_sort_key)]
mean_df.to_csv('./results/mean_results.csv')

  mean_df[file] = temp_df.mean(axis=0)


In [52]:
# Display the summary table of mean errors (one row per results file).
mean_df

Unnamed: 0,MAE,RMSE,SMAPE
Kris.csv,6.655499,20.860885,26.659543
kmeans3.csv,7.285993,25.092969,27.633334
kmeans5.csv,7.003531,22.280303,27.158048
kmeans10.csv,6.532484,13.342659,26.804741
kmeans15.csv,6.90622,20.439738,26.917123
kmeans20.csv,6.189132,9.397156,26.847038
kproto3.csv,6.97647,18.896398,27.40228
kproto5.csv,7.912623,23.042007,28.289587
kproto10.csv,6.627435,17.386498,26.856423
kproto15.csv,6.813406,18.123698,26.944723


In [16]:
import re
# Leading letters followed by optional digits, case-insensitive.
# Raw string so that "\d" is not treated as an (invalid) string escape.
pattern = re.compile(r'([a-z]+)(\d*)', re.I)


def split_index(idx):
    """Split a name such as 'kmeans10.csv' into a sortable (letters, number) key.

    Parameters
    ----------
    idx : str
        Name to split; matching starts at the beginning of the string.

    Returns
    -------
    tuple
        ``(letters, number)`` where ``number`` is the integer that directly
        follows the leading letters, or 0 when there is none. Names that do
        not start with a letter yield ``(idx, 0)`` so that every key is a
        comparable ``(str, int)`` tuple (previously ``None`` was returned,
        which cannot be sorted together with tuples).
    """
    m = pattern.match(idx)
    if m is None:
        return (idx, 0)
    letters, numbers = m.groups()
    return (letters, int(numbers) if numbers else 0)