In [2]:
import os
# Move the working directory one level up from the notebook's start directory
# (the project root, where the relative 'data/' paths and the 'src' package
# used below are resolved from).
# The start directory is remembered on first execution so that re-running this
# cell does not climb one level higher each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [44]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
from tsfeatures import tsfeatures
import matplotlib.pyplot as plt
import plotly.express as px

# K-Prototypes clustering (k-means-style clustering for mixed numerical/categorical data)
from kmodes.kprototypes import KPrototypes
# import minmax scaler
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from src.utils.functions import validation

# Seed both the stdlib and the NumPy global RNG: random.seed() alone does not
# affect NumPy-based libraries, which draw from NumPy's generator.
random.seed(123)
np.random.seed(123)
# Directory for model artefacts.
model_dir = 'models/'

In [45]:
# Read the per-building features from the csv file (the index is later reused
# as the building ID).
features = pd.read_csv('data/social_features.csv', index_col=0)
# Read the building metadata; it is aligned with the features on the index.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the functietype (building function) column to the features.
features['function'] = metadata['Patrimonium Functietype']
# Drop incomplete rows (including buildings without a metadata match).
features = features.dropna()
features['ID'] = features.index
# Drop rows with 'Kast' as function. The explicit copy makes the column
# assignments below (and the later 'cluster' column) operate on an independent
# frame instead of a slice, avoiding SettingWithCopyWarning.
features = features[features['function'] != 'Kast'].copy()
# Positional sums over column blocks of the csv.
# NOTE(review): presumably hourly load-share columns; confirm the positions
# 4-15 (morning) and 1-3 + 16-23 (evening) against the csv schema.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[1:4, 16:24]].sum(axis=1)
# Clustering input: three numerical columns plus the categorical function.
ds = features[['yearly', 'morning', 'evening', 'function']].copy()
# Scale the yearly column to 0-1. The fitted scaler is kept so the same
# min/max can be applied to other data instead of being thrown away.
yearly_scaler = MinMaxScaler()
ds['yearly'] = yearly_scaler.fit_transform(ds[['yearly']].values).ravel()
ds

Unnamed: 0,yearly,morning,evening,function
1208,0.043184,0.573226,0.397711,Stadhuis/Gemeentehuis
588,0.023923,0.451833,0.510319,Academie
1116,0.004459,0.433481,0.537542,Cultureel centrum
144,0.021442,0.727421,0.255215,Lagere school
510,0.004152,0.542107,0.443149,Andere gebouwen
...,...,...,...,...
56,0.017418,0.561950,0.413922,Ontmoetingscentrum
13,0.047746,0.582597,0.392399,Administratief centrum
1660,0.039817,0.529864,0.438617,Cultureel centrum
235,0.024982,0.644834,0.326298,Werkplaats


In [46]:
# Fit K-Prototypes on the mixed feature table: columns 0-2 of `ds` are
# numerical, column 3 ('function') is categorical.
# random_state pins the estimator's own RNG so that re-running this cell yields
# the same clustering; random.seed() does not influence it.
kproto = KPrototypes(n_clusters=10, init='Cao', verbose=2, random_state=123)
clusters = kproto.fit_predict(ds, categorical=[3])
# Store the assigned cluster label next to each building's features.
features['cluster'] = clusters

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 143, ncost: 43.36456166661007
Run: 1, iteration: 2/100, moves: 41, ncost: 43.33064466917181
Run: 1, iteration: 3/100, moves: 23, ncost: 42.73232928574015
Run: 1, iteration: 4/100, moves: 16, ncost: 42.50866778944642
Run: 1, iteration: 5/100, moves: 3, ncost: 42.50057724316256
Run: 1, iteration: 6/100, moves: 5, ncost: 42.47342512937381
Run: 1, iteration: 7/100, moves: 7, ncost: 42.450141884424276
Run: 1, iteration: 8/100, moves: 26, ncost: 41.63792817280326
Run: 1, iteration: 9/100, moves: 23, ncost: 41.56793063503648
Run: 1, iteration: 10/100, moves: 7, ncost: 41.56415881570909
Run: 1, iteration: 11/100, moves: 1, ncost: 41.56311627996012
Run: 1, iteration: 12/100, moves: 0, ncost: 41.56311627996012
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/

In [47]:
# Cross-tabulate building functions against clusters: one row per function,
# one column per cluster, cells holding the number of buildings.
per_cluster_counts = features.groupby('cluster')['function'].value_counts()
counts = per_cluster_counts.sort_values(ascending=False).unstack().T
# Display the rows ordered from the most to the least frequent function.
row_order = counts.sum(axis=1).sort_values(ascending=False).index
counts.loc[row_order]

cluster,0,1,2,3,4,5,6,7,8,9
function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sporthal,102.0,,,,,,,,,3.0
Andere gebouwen,,87.0,,,,3.0,,,,3.0
Administratief centrum,,,80.0,,,,2.0,,,
Cultureel centrum,,,,77.0,,1.0,1.0,,,
Ontmoetingscentrum,,,,,64.0,,1.0,,,
Lagere school,,,,,,55.0,,,,
RVT/WZC/revalidatiecentrum,,,,,,,38.0,,,1.0
Stadhuis/Gemeentehuis,,,,,,,,38.0,,
Werkplaats,,,,,,,,,30.0,
Sportcomplex,,,,,,,2.0,,,27.0


In [48]:
# Report the fitted model: cluster centroids first, then the training
# statistics (final cost and number of iterations).
for model_stat in (kproto.cluster_centroids_, kproto.cost_, kproto.n_iter_):
    print(model_stat)

# List the cluster assigned to every row of the clustering input.
for row_label, cluster_label in zip(ds.index, clusters):
    print(f"Symbol: {row_label}, cluster:{cluster_label}")

[['0.07493941240048463' '0.526831060978793' '0.4496329984093153'
  'Sporthal']
 ['0.027172243692286806' '0.49976627305280674' '0.4730684528486072'
  'Andere gebouwen']
 ['0.08005821431535479' '0.6030587814096116' '0.36871641675273664'
  'Administratief centrum']
 ['0.056188278989156695' '0.5504505710304635' '0.421335783459811'
  'Cultureel centrum']
 ['0.03056305338375914' '0.5379600464853261' '0.4317826514824361'
  'Ontmoetingscentrum']
 ['0.0369950896482135' '0.6924862249680213' '0.2871561573222484'
  'Lagere school']
 ['0.33821416216863254' '0.5959219296723982' '0.3761947220212047'
  'RVT/WZC/revalidatiecentrum']
 ['0.09390578631636555' '0.6056932103154798' '0.36610168937140775'
  'Stadhuis/Gemeentehuis']
 ['0.032217009602785084' '0.6393144703132625' '0.33550051953734616'
  'Werkplaats']
 ['0.0669649776968031' '0.36161846439699086' '0.6114251409444504'
  'Sportcomplex']]
41.56011885064679
4
Symbol: 1208, cluster:7
Symbol: 588, cluster:1
Symbol: 1116, cluster:3
Symbol: 144, cluster:5

In [49]:
st_p = pd.DataFrame()
# Create standard profiles for each cluster: every member building's power
# series is divided by its own 2019 total, and the normalised series are then
# averaged per timestamp.
for clust in np.unique(clusters):
    member_ids = features.loc[features['cluster'] == clust, 'ID']
    profiles = []
    base_index = None
    for ID in member_ids:
        power = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0,
                            usecols=['ds', 'Power'], parse_dates=['ds'])['Power']
        if base_index is None:
            # The first building of the cluster defines the time axis.
            base_index = power.index
        else:
            # Align the other buildings to that axis before normalising.
            power = power.reindex(base_index)
        # Select the 2019 rows with .loc: plain ['2019'] on a DataFrame is a
        # column lookup in pandas >= 2.0 and would raise a KeyError.
        profiles.append(power / power.loc['2019'].sum())
    # Build the frame once instead of inserting one column per building, then
    # average over the buildings (NaNs are skipped by mean()).
    agg = pd.concat(profiles, axis=1).mean(axis=1)
    st_p[str(clust)] = agg

In [40]:
def _load_social_features(features_path, metadata):
    """Load a social-features csv and derive the columns used for clustering.

    Applies the same preparation as the training features: attach the building
    function from ``metadata`` (aligned on the index), drop rows with missing
    values, drop 'Kast' buildings and add the positional 'morning' / 'evening'
    column sums.

    Parameters
    ----------
    features_path : str
        Path of the features csv; its first column becomes the index.
    metadata : pandas.DataFrame
        Metadata frame providing the 'Patrimonium Functietype' column.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the cleaned features with the added 'function',
        'ID', 'morning' and 'evening' columns.
    """
    frame = pd.read_csv(features_path, index_col=0)
    frame['function'] = metadata['Patrimonium Functietype']
    frame = frame.dropna()
    frame['ID'] = frame.index
    frame = frame[frame['function'] != 'Kast'].copy()
    frame['morning'] = frame.iloc[:, 4:16].sum(axis=1)
    frame['evening'] = frame.iloc[:, np.r_[1:4, 16:24]].sum(axis=1)
    return frame


# Read the metadata and the held-out (test) features.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
features = _load_social_features('data/social_features_test.csv', metadata)
# `subset` keeps the raw yearly value: the validation step multiplies the
# normalised standard profile by it.
subset = features[['yearly', 'morning', 'evening', 'function']].copy()

# A freshly constructed MinMaxScaler cannot transform (it is not fitted), and
# the test data must be scaled with the training min/max. The scaler is
# therefore fitted on the training 'yearly' column, prepared exactly like the
# data the model was fitted on.
train_yearly = _load_social_features('data/social_features.csv', metadata)[['yearly']]
yearly_scaler = MinMaxScaler().fit(train_yearly.values)

# The model was fitted on four columns (yearly, morning, evening, function)
# with the categorical column at position 3, so prediction has to receive the
# same layout; only the model input gets the scaled yearly value.
model_input = subset.copy()
model_input['yearly'] = yearly_scaler.transform(subset[['yearly']].values).ravel()
subset['cluster'] = kproto.predict(model_input, categorical=[3])
subset

Unnamed: 0,yearly,morning,evening,function,cluster
666,308362.90,0.458208,0.502196,Andere gebouwen,1
399,100270.95,0.673820,0.302632,School,8
1544,496847.76,0.651339,0.328465,Stadhuis/Gemeentehuis,7
1655,43067.00,0.603173,0.361325,Lagere school,5
844,411595.23,0.469201,0.495961,School,1
...,...,...,...,...,...
320,2473361.00,0.637910,0.339968,RVT/WZC/revalidatiecentrum,6
944,905537.58,0.729076,0.257051,Bibliotheek,5
444,25718.20,0.613807,0.358155,Werkplaats,8
778,995995.20,0.384185,0.588073,Stadion,9


In [41]:
# Per-building error metrics, keyed by building ID.
mae, rmse, smape = {}, {}, {}
for ID in subset.index:
    clust = subset.loc[ID, 'cluster']

    # Measured series of the building, indexed by timestamp.
    measured = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    measured.index = pd.to_datetime(measured.index)

    # Synthetic series: the cluster's standard profile scaled by the building's
    # 'yearly' value.
    synthetic = (st_p[str(clust)] * subset.loc[ID, 'yearly']).to_frame()
    synthetic.index = pd.to_datetime(synthetic.index)
    # Mask values that are not below +inf (they become NaN), then drop the NaNs.
    synthetic = synthetic.where(synthetic < np.inf).dropna()

    # Restrict the measured series to the timestamps of the synthetic one.
    measured = measured.loc[synthetic.index]

    actual_values, synthetic_values = measured.values, synthetic.values
    mae[ID] = validation(actual_values, synthetic_values, 'MAE')
    rmse[ID] = validation(actual_values, synthetic_values, 'RMSE')
    smape[ID] = validation(actual_values, synthetic_values, 'SMAPE')

# Collect the three dictionaries as columns of one frame and persist it.
temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
temp_df.to_csv('./results/kproto10.csv')

In [42]:
# Mean of each error metric column (MAE, RMSE, SMAPE) over all evaluated buildings.
temp_df.mean()

MAE       6.578974
RMSE     18.061315
SMAPE    26.900832
dtype: float64

In [43]:
# Median of each error metric column (MAE, RMSE, SMAPE) over all evaluated buildings.
temp_df.median()

MAE       4.151040
RMSE      5.917619
SMAPE    20.488536
dtype: float64