In [1]:
import os
# Change native directory to root
os.chdir(os.path.dirname(os.getcwd()))

In [2]:
import glob
import pandas as pd
import numpy as np
import random
import joblib
import matplotlib.pyplot as plt
import plotly.express as px
import pickle

# K-mean clustering libraries
from kmodes.kprototypes import KPrototypes
# import minmax scaler
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from src.utils.functions import validation

random.seed(123)
model_dir = 'models/'
scaler_dir = 'scalers/'

In [3]:
# Read the features from the csv file
features = pd.read_csv('data/social_features.csv', index_col=0)
# read metadata csv
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep   = ';')
# ADD the functietype column to the features
features['function'] = metadata['Patrimonium Functietype']
# read more metrics from csv
features.isnull().sum()
features.dropna(inplace=True)
features['ID'] = features.index
# drop rows with kast as function
features = features[features['function'] != 'Kast']
features['morning'] = features.iloc[:,4:16].sum(axis=1)
features['evening'] = features.iloc[:,np.r_[:4,16:24]].sum(axis=1)
features['weekday'] = features.iloc[:,24:29].sum(axis=1)
features['weekend'] = features.iloc[:,29:31].sum(axis=1)
# scale yearly column to 0-1 with minmax scaler
ds = features[['yearly', 'weekend',  'evening', 'function']].copy()
scaler = MinMaxScaler()
ds['yearly'] = scaler.fit_transform(ds['yearly'].values.reshape(-1,1))
# save scaler
joblib.dump(scaler, scaler_dir+'scaler.gz')
ds

Unnamed: 0,yearly,weekend,evening,function
1208,0.043184,0.237764,0.426774,Stadhuis/Gemeentehuis
588,0.023923,0.263603,0.548167,Academie
1116,0.004459,0.446731,0.566519,Cultureel centrum
144,0.021442,0.105142,0.272579,Lagere school
510,0.004152,0.238654,0.457893,Andere gebouwen
...,...,...,...,...
56,0.017418,0.276985,0.438050,Ontmoetingscentrum
13,0.047746,0.203067,0.417403,Administratief centrum
1660,0.039817,0.270733,0.470136,Cultureel centrum
235,0.024982,0.161183,0.355166,Werkplaats


In [4]:
clust_num = 10
kproto = KPrototypes(n_clusters=clust_num, init='Cao', verbose=2)
clusters = kproto.fit_predict(ds, categorical=[3])
features['cluster'] = clusters
pickle.dump(kproto, open(model_dir+"kmeans{}.pkl".format(clust_num), "wb"))

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 179, ncost: 33.689971340672095
Run: 1, iteration: 2/100, moves: 67, ncost: 33.60021856696128
Run: 1, iteration: 3/100, moves: 41, ncost: 33.53212498901514
Run: 1, iteration: 4/100, moves: 13, ncost: 33.52410282217328
Run: 1, iteration: 5/100, moves: 6, ncost: 33.5222430353301
Run: 1, iteration: 6/100, moves: 0, ncost: 33.5222430353301
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 224, ncost: 34.197861364592285
Run: 2, iteration: 2/100, moves: 58, ncost: 34.05816304686836
Run: 2, iteration: 3/100, moves: 21, ncost: 34.02610221572731
Run: 2, iteration: 4/100, moves: 17, ncost: 33.990907193908676
Run: 2, iteration: 5/100, moves: 8, ncost: 33.979656191949815
Run: 2, iteration: 6/100, moves: 2, ncost: 33.97868070167899
Run: 2, iteration: 7

In [5]:
# Show  value counts in the function column for each cluster
counts = features.groupby('cluster').function.value_counts().sort_values(ascending=False).unstack().T
counts.loc[counts.sum(axis=1).sort_values(ascending=False).index]

cluster,0,1,2,3,4,5,6,7,8,9
function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sporthal,101.0,,,,,,1.0,,,3.0
Andere gebouwen,,92.0,,,,1.0,,,,
Administratief centrum,,,80.0,,,,2.0,,,
Cultureel centrum,,,,77.0,,1.0,1.0,,,
Ontmoetingscentrum,,,,,63.0,,2.0,,,
Lagere school,,,,,,55.0,,,,
RVT/WZC/revalidatiecentrum,1.0,,,,1.0,,36.0,,,1.0
Stadhuis/Gemeentehuis,,,,,,,,38.0,,
Werkplaats,,,,,,,,,30.0,
Sportcomplex,,,,,,,2.0,,,27.0


In [6]:
# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(ds.index, clusters):
    print(f"Symbol: {s}, cluster:{c}")

[['0.0540365083469643' '0.23262493232490122' '0.4729621352203733'
  'Sporthal']
 ['0.02621368097414759' '0.29972983306971823' '0.5029655189814201'
  'Andere gebouwen']
 ['0.0852993950152578' '0.20976575386259635' '0.3998234996681631'
  'Administratief centrum']
 ['0.07537592787443634' '0.262824758605392' '0.4553944836929692'
  'Cultureel centrum']
 ['0.026163372574721858' '0.2852623825098367' '0.454196749380608'
  'Ontmoetingscentrum']
 ['0.03681292746926863' '0.14959899470813157' '0.3123139754721098'
  'Lagere school']
 ['0.3464804032001502' '0.2601033086337322' '0.40602789255684774'
  'RVT/WZC/revalidatiecentrum']
 ['0.08429910781541396' '0.21480287620423305' '0.3840053349690675'
  'Stadhuis/Gemeentehuis']
 ['0.0313466607614463' '0.19099976227002924' '0.35812070236869975'
  'Werkplaats']
 ['0.07196702752556441' '0.2326058410242503' '0.6312116344953934'
  'Sportcomplex']]
33.51800261912163
8
Symbol: 1208, cluster:7
Symbol: 588, cluster:1
Symbol: 1116, cluster:3
Symbol: 144, cluster:5


In [7]:
st_p = pd.DataFrame()
# Create stanard profiles for each cluster
for k, clust in enumerate(np.unique(clusters)):
    agg = pd.DataFrame()
    for i, ID in enumerate(features[features['cluster'] == clust].ID):
        #print(ID)
        agg[ID] = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col=0, usecols=['ds','Power'], parse_dates=['ds'])
        agg[ID] = agg[ID] / agg['2019'][ID].sum()
    # Create an average profile over columns
    agg = agg.mean(axis=1, skipna=True)
    st_p[str(clust)] = agg
st_p.to_csv('./data/st_proto'+str(clust_num)+'.csv')

  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'][ID].sum()
  agg[ID] = agg[ID] / agg['2019'

In [47]:
# Read the features from the csv file
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# read metadata csv
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep   = ';')
# ADD the functietype column to the features
features['function'] = metadata['Patrimonium Functietype']
# read more metrics from csv
features.isnull().sum()
features.dropna(inplace=True)
features['ID'] = features.index
# drop rows with kast as function
features = features[features['function'] != 'Kast']
features['morning'] = features.iloc[:,4:16].sum(axis=1)
features['evening'] = features.iloc[:,np.r_[:4,16:24]].sum(axis=1)
features['weekday'] = features.iloc[:,24:29].sum(axis=1)
features['weekend'] = features.iloc[:,29:31].sum(axis=1)
subset = features[['yearly','weekend', 'evening', 'function']].copy()
subset['yearly'] = scaler.transform(subset['yearly'].values.reshape(-1,1))
subset['cluster'] = kproto.predict(subset, categorical=[3])
subset

Unnamed: 0,yearly,weekend,evening,function,cluster
666,0.039381,0.295865,0.541792,Andere gebouwen,1
399,0.012806,0.130884,0.326180,School,5
1544,0.063453,0.190230,0.348661,Stadhuis/Gemeentehuis,7
1655,0.005500,0.146399,0.396827,Lagere school,5
844,0.052565,0.213343,0.530799,School,0
...,...,...,...,...,...
320,0.315874,0.221625,0.362090,RVT/WZC/revalidatiecentrum,6
944,0.115647,0.171083,0.270924,Bibliotheek,5
444,0.003284,0.269399,0.386193,Werkplaats,8
778,0.127199,0.256797,0.615815,Stadion,9


In [48]:
mae = {}
rmse = {}
smape = {}
for ID in subset.index:
    clust = subset.loc[ID, 'cluster']
    ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    ts_syn =  st_p[str(clust)].copy() * features.loc[ID, 'yearly']
    ts_syn = ts_syn.to_frame()
    ts.index = pd.to_datetime(ts.index)
    ts_syn.index = pd.to_datetime(ts_syn.index)
    # drop inf values from ts_syn
    ts_syn = ts_syn[ts_syn < np.inf]
    # drop the nan values
    ts_syn = ts_syn.dropna()
    # keep the same index in ts as ts_syn
    ts = ts.loc[ts_syn.index]
    mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
    rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
    smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
# make a dataframe with 3 dictionaries as columns
temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
temp_df.to_csv('./results/kproto10.csv')

In [49]:
temp_df.mean()

MAE       6.653858
RMSE     17.418321
SMAPE    26.884031
dtype: float64

In [50]:
temp_df.median()

MAE       4.112054
RMSE      6.061998
SMAPE    20.185257
dtype: float64