In [1]:
import os
# Move the working directory from the notebook's folder up to the project root,
# so that the relative 'data/...' paths and the `src` package used further down
# resolve.
# The guard makes this cell idempotent: re-running it without restarting the
# kernel no longer climbs one directory further up on every execution.
if not (os.path.isdir('data') and os.path.isdir('src')):
    os.chdir(os.path.dirname(os.getcwd()))

In [17]:
import glob
import pandas as pd
import numpy as np
import random
import joblib
import matplotlib.pyplot as plt
import plotly.express as px
import pickle

# K-mean clustering libraries
from kmodes.kprototypes import KPrototypes
# import minmax scaler
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from src.utils.functions import validation

# Seed Python's and NumPy's global generators. Seeding only `random` leaves
# NumPy unseeded; the clustering estimator below is built without an explicit
# random_state and presumably falls back on NumPy's global generator, so seed
# it as well to keep runs repeatable.
random.seed(123)
np.random.seed(123)
# Output folders (relative to the working directory) for fitted models/scalers.
model_dir = 'models/'
scaler_dir = 'scalers/'

In [3]:
# Read the features from the csv file
features = pd.read_csv('data/social_features.csv', index_col=0)
# Read the metadata csv
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the function type column to the features (pandas aligns it on the index)
features['function'] = metadata['Patrimonium Functietype']
# Drop incomplete rows and rows with 'Kast' as function, then keep the index as
# an explicit ID column. Building a fresh frame (instead of mutating in place
# and then adding columns to a filtered slice) avoids SettingWithCopyWarning.
features = (
    features
    .dropna()
    .loc[lambda df: df['function'] != 'Kast']
    .assign(ID=lambda df: df.index)
)
# Aggregate shares by column position.
# NOTE(review): assumes columns 0-23 are per-hour values and columns 24-30 are
# per-weekday values (Mon-Fri, then Sat-Sun) -- confirm against the csv header.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[:4, 16:24]].sum(axis=1)
features['weekday'] = features.iloc[:, 24:29].sum(axis=1)
features['weekend'] = features.iloc[:, 29:31].sum(axis=1)
# Clustering input: three numeric columns followed by the categorical one
ds = features[['yearly', 'weekend', 'evening', 'function']].copy()
# Scale the yearly column to 0-1 with a min-max scaler
scaler = MinMaxScaler()
ds['yearly'] = scaler.fit_transform(ds['yearly'].values.reshape(-1, 1))
# Save the fitted scaler (create the target folder first if it is missing)
os.makedirs(scaler_dir, exist_ok=True)
joblib.dump(scaler, scaler_dir+'scaler.gz')
ds

Unnamed: 0,yearly,weekend,evening,function
1208,0.043184,0.237764,0.426774,Stadhuis/Gemeentehuis
588,0.023923,0.263603,0.548167,Academie
1116,0.004459,0.446731,0.566519,Cultureel centrum
144,0.021442,0.105142,0.272579,Lagere school
510,0.004152,0.238654,0.457893,Andere gebouwen
...,...,...,...,...
56,0.017418,0.276985,0.438050,Ontmoetingscentrum
13,0.047746,0.203067,0.417403,Administratief centrum
1660,0.039817,0.270733,0.470136,Cultureel centrum
235,0.024982,0.161183,0.355166,Werkplaats


In [31]:
clust_num = 10
# Pass an explicit random_state so the cluster assignment is reproducible
# across kernel restarts.
kproto = KPrototypes(n_clusters=clust_num, init='Cao', verbose=2, random_state=123)
# Column 3 of `ds` ('function') is the categorical feature
clusters = kproto.fit_predict(ds, categorical=[3])
features['cluster'] = clusters
# Persist the fitted model; the context manager closes the file handle, which
# the previous bare open() call never did.
os.makedirs(model_dir, exist_ok=True)
with open(model_dir+"kmeans{}.pkl".format(clust_num), "wb") as model_file:
    pickle.dump(kproto, model_file)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 218, ncost: 34.35625474718132
Run: 1, iteration: 2/100, moves: 113, ncost: 33.70998115852395
Run: 1, iteration: 3/100, moves: 50, ncost: 33.539463388134045
Run: 1, iteration: 4/100, moves: 10, ncost: 33.52655091797853
Run: 1, iteration: 5/100, moves: 11, ncost: 33.52239871256826
Run: 1, iteration: 6/100, moves: 0, ncost: 33.52239871256826
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 201, ncost: 37.4562310241183
Run: 2, iteration: 2/100, moves: 83, ncost: 37.17187491501762
Run: 2, iteration: 3/100, moves: 20, ncost: 37.15072979079196
Run: 2, iteration: 4/100, moves: 3, ncost: 37.14962852545937
Run: 2, iteration: 5/100, moves: 2, ncost: 37.148997213135054
Run: 2, iteration: 6/100, moves: 0, ncost: 37.148997213135054
Init: initializing 

In [8]:
# Count how often every building function occurs inside each cluster
# (rows: function, columns: cluster)
per_cluster = features.groupby('cluster')['function'].value_counts()
counts = per_cluster.sort_values(ascending=False).unstack().T
# Display the table with the most frequent functions on top
row_totals = counts.sum(axis=1).sort_values(ascending=False)
counts.loc[row_totals.index]

cluster,0,1,2,3,4,5,6,7,8
function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Sporthal,104.0,,,,,,1.0,,
Andere gebouwen,,90.0,,,,3.0,,,
Administratief centrum,,,80.0,,,,2.0,,
Cultureel centrum,,,,77.0,,1.0,1.0,,
Ontmoetingscentrum,,,,,63.0,,2.0,,
Lagere school,,,,,,55.0,,,
RVT/WZC/revalidatiecentrum,,1.0,,,1.0,,37.0,,
Stadhuis/Gemeentehuis,,,,,,,,38.0,
Werkplaats,,,,,,,,,30.0
Sportcomplex,4.0,11.0,3.0,6.0,1.0,,4.0,,


In [26]:
# Centroids of the fitted model
print(kproto.cluster_centroids_)
# Training statistics: final cost, then number of iterations
print(kproto.cost_, kproto.n_iter_, sep='\n')

# One line per row of `ds` with the cluster it was assigned to
for building_id, label in zip(ds.index, clusters):
    print(f"Symbol: {building_id}, cluster:{label}")

[['0.056447210822485105' '0.2331879902114118' '0.7668120097885884'
  '0.5266959926758608' '0.47330400732413797' 'Sporthal']
 ['0.027244963875149956' '0.3062941501812108' '0.6937058498187894'
  '0.5009442884676937' '0.4990557115323057' 'Andere gebouwen']
 ['0.09048956953103615' '0.2089870962279445' '0.7910129037720554'
  '0.6013664464628723' '0.3986335535371266' 'Administratief centrum']
 ['0.07620579116065099' '0.2620072859981515' '0.7379927140018481'
  '0.5451064853812762' '0.4548935146187229' 'Cultureel centrum']
 ['0.031042364320994203' '0.28611683107625224' '0.7138831689237478'
  '0.5420318914581161' '0.4579681085418827' 'Ontmoetingscentrum']
 ['0.037731920971382366' '0.14954805026729512' '0.8504519497327044'
  '0.6885357522279575' '0.3114642477720416' 'Lagere school']
 ['0.33629437590389166' '0.26148527928770116' '0.7385147207122984'
  '0.5962789817054501' '0.4037210182945488' 'RVT/WZC/revalidatiecentrum']
 ['0.06791566153917591' '0.22796610584675903' '0.77203389415324'
  '0.62302

In [46]:
st_p = pd.DataFrame()
# Create standard profiles for each cluster: every member building's power
# series is divided by its own 2019 total, and the normalised series are then
# averaged per timestamp.
for clust in np.unique(clusters):
    member_ids = features.loc[features['cluster'] == clust, 'ID']
    first_index = None
    columns = {}
    for ID in member_ids:
        power = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col='ds',
                            usecols=['ds', 'Power'], parse_dates=['ds'])['Power']
        # As before, every building is aligned on the timestamps of the first
        # building read for this cluster.
        if first_index is None:
            first_index = power.index
        columns[ID] = power.reindex(first_index)
    # Build the frame in one go instead of inserting one column per iteration
    agg = pd.DataFrame(columns)
    # Select the 2019 rows with .loc: string row-selection through
    # DataFrame.__getitem__ (agg['2019']) is deprecated and removed in pandas 2.
    # A building whose 2019 total is 0 yields inf/NaN values here.
    agg = agg / agg.loc['2019'].sum()
    # Create an average profile over columns
    agg = agg.mean(axis=1)
    st_p[str(clust)] = agg

In [47]:
# Read the test-set features from the csv file.
# NOTE: this rebinds `features`; the evaluation loop further down reads
# features.loc[ID, 'yearly'] and therefore expects the test set here.
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# Read the metadata csv
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the function type column to the features (pandas aligns it on the index)
features['function'] = metadata['Patrimonium Functietype']
# Drop incomplete rows and rows with 'Kast' as function, then keep the index as
# an explicit ID column. Building a fresh frame (instead of mutating in place
# and then adding columns to a filtered slice) avoids SettingWithCopyWarning.
features = (
    features
    .dropna()
    .loc[lambda df: df['function'] != 'Kast']
    .assign(ID=lambda df: df.index)
)
# Aggregate shares by column position.
# NOTE(review): assumes columns 0-23 are per-hour values and columns 24-30 are
# per-weekday values (Mon-Fri, then Sat-Sun) -- confirm against the csv header.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[:4, 16:24]].sum(axis=1)
features['weekday'] = features.iloc[:, 24:29].sum(axis=1)
features['weekend'] = features.iloc[:, 29:31].sum(axis=1)
subset = features[['yearly', 'weekend', 'evening', 'function']].copy()
# Reuse the already-fitted scaler (transform only, no refit on test data)
subset['yearly'] = scaler.transform(subset['yearly'].values.reshape(-1, 1))
# Assign every test building to a cluster; column 3 ('function') is categorical
subset['cluster'] = kproto.predict(subset, categorical=[3])
subset

Unnamed: 0,yearly,weekend,evening,function,cluster
666,0.039381,0.295865,0.541792,Andere gebouwen,1
399,0.012806,0.130884,0.326180,School,5
1544,0.063453,0.190230,0.348661,Stadhuis/Gemeentehuis,7
1655,0.005500,0.146399,0.396827,Lagere school,5
844,0.052565,0.213343,0.530799,School,0
...,...,...,...,...,...
320,0.315874,0.221625,0.362090,RVT/WZC/revalidatiecentrum,6
944,0.115647,0.171083,0.270924,Bibliotheek,5
444,0.003284,0.269399,0.386193,Werkplaats,8
778,0.127199,0.256797,0.615815,Stadion,9


In [48]:
# Per-building error metrics, keyed by building ID
mae, rmse, smape = {}, {}, {}
metric_stores = (('MAE', mae), ('RMSE', rmse), ('SMAPE', smape))
for ID in subset.index:
    clust = subset.loc[ID, 'cluster']
    # Measured series of this building, indexed by timestamp
    ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    ts.index = pd.to_datetime(ts.index)
    # Synthetic series: the cluster's standard profile times the building's yearly value
    ts_syn = (st_p[str(clust)].copy() * features.loc[ID, 'yearly']).to_frame()
    ts_syn.index = pd.to_datetime(ts_syn.index)
    # Mask inf entries, then drop them together with the nan values
    ts_syn = ts_syn.where(ts_syn < np.inf).dropna()
    # Keep the same index in ts as in ts_syn
    ts = ts.loc[ts_syn.index]
    for metric, store in metric_stores:
        store[ID] = validation(ts.values, ts_syn.values, metric)
# Make a dataframe with the 3 dictionaries as columns
temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
temp_df.to_csv('./results/kproto10.csv')

In [49]:
# Mean of every column of temp_df (MAE, RMSE, SMAPE), taken over its rows
temp_df.mean()

MAE       6.653858
RMSE     17.418321
SMAPE    26.884031
dtype: float64

In [50]:
# Median of every column of temp_df (MAE, RMSE, SMAPE), taken over its rows
temp_df.median()

MAE       4.112054
RMSE      6.061998
SMAPE    20.185257
dtype: float64