In [2]:
import os
# Step one level up from the current working directory (presumably the
# project root) so that relative paths used below resolve from there.
# NOTE: not idempotent - every re-run of this cell moves up one more level.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

In [3]:
import glob
import pandas as pd
import numpy as np
import random
import pickle
from tsfeatures import tsfeatures
import matplotlib.pyplot as plt
import plotly.express as px

# K-prototypes clustering (mixed numeric/categorical data) and progress-bar libraries
from kmodes.kprototypes import KPrototypes
from tqdm import tqdm

# Seed both Python's and NumPy's global random generators. Seeding only the
# `random` module leaves NumPy-based code (anything that falls back to the
# global NumPy RandomState) non-reproducible between runs.
random.seed(123)
np.random.seed(123)
# Directory (relative to the working directory) used for model artefacts.
model_dir = 'models/'

In [5]:
# Read the training features; the first CSV column becomes the index.
features = pd.read_csv('data/social_features.csv', index_col=0)
# Read the metadata (semicolon-separated); the first column becomes the index.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the function type to the features; pandas aligns the values on the index,
# so rows without a metadata entry get NaN here.
features['function'] = metadata['Patrimonium Functietype']
# Drop every row that contains a missing value (this includes rows that got
# no function type above).
features = features.dropna()
features['ID'] = features.index
# Drop rows with 'Kast' as function. The explicit copy makes `features` an
# independent frame, so the column assignments below do not write into a
# filtered slice (which raises SettingWithCopyWarning).
features = features[features['function'] != 'Kast'].copy()
# Positional column sums: columns 4..15 -> 'morning', columns 1..3 and
# 16..23 -> 'evening'.
# NOTE(review): the positions depend on the CSV column layout - confirm.
# In the displayed output morning + evening is ~0.96-0.97 rather than 1; if
# the two are meant to be complementary shares, a column may be missing from
# the slices - TODO confirm the bounds.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[1:4, 16:24]].sum(axis=1)
# Clustering input: two numeric columns followed by one categorical column.
ds = features[['morning', 'evening', 'function']].copy()
ds

Unnamed: 0,morning,evening,function
1208,0.573226,0.397711,Stadhuis/Gemeentehuis
588,0.451833,0.510319,Academie
1116,0.433481,0.537542,Cultureel centrum
144,0.727421,0.255215,Lagere school
510,0.542107,0.443149,Andere gebouwen
...,...,...,...
56,0.561950,0.413922,Ontmoetingscentrum
13,0.582597,0.392399,Administratief centrum
1660,0.529864,0.438617,Cultureel centrum
235,0.644834,0.326298,Werkplaats


In [11]:
# Fit a 6-cluster k-prototypes model on `ds`. `categorical=[2]` marks the
# column at position 2 of `ds` ('function') as categorical; the other two
# columns ('morning', 'evening') are treated as numeric.
# `random_state` is fixed on the estimator itself so repeated runs give the
# same clusters - seeding Python's `random` module does not affect it.
kproto = KPrototypes(n_clusters=6, init='Cao', verbose=2, random_state=123)
clusters = kproto.fit_predict(ds, categorical=[2])
# Store the cluster label of each row next to its features (row order of
# `ds` and `features` is the same, since `ds` was sliced from `features`).
features['cluster'] = clusters

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 323, ncost: 34.49219909248915
Run: 1, iteration: 2/100, moves: 164, ncost: 34.29658444747944
Run: 1, iteration: 3/100, moves: 24, ncost: 34.28927027496563
Run: 1, iteration: 4/100, moves: 5, ncost: 34.28820909981603
Run: 1, iteration: 5/100, moves: 0, ncost: 34.28820909981603
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 354, ncost: 35.448704589677305
Run: 2, iteration: 2/100, moves: 91, ncost: 35.35132610345045
Run: 2, iteration: 3/100, moves: 35, ncost: 35.31336982338694
Run: 2, iteration: 4/100, moves: 98, ncost: 35.13835288348172
Run: 2, iteration: 5/100, moves: 34, ncost: 35.13150384209437
Run: 2, iteration: 6/100, moves: 6, ncost: 35.13097521859355
Run: 2, iteration: 7/100, moves: 1, ncost: 35.13096958364051
Run: 2, iteration: 8

In [10]:
# Report the fitted model: the cluster centroids first, followed by the
# training statistics (final cost and number of iterations).
print(kproto.cluster_centroids_)
for training_stat in (kproto.cost_, kproto.n_iter_):
    print(training_stat)

# One line per sample: its index label in `ds` and the cluster assigned to it.
labelled_samples = zip(ds.index, clusters)
for s, c in labelled_samples:
    print(f"Symbol: {s}, cluster:{c}")

[['0.51627399148021' '0.4581646218897727' 'Sporthal']
 ['0.406635727532301' '0.5655222325442574' 'Andere gebouwen']
 ['0.611905644194714' '0.3613648305579128' 'Administratief centrum']
 ['0.5543253781242361' '0.4172006013200627' 'Cultureel centrum']
 ['0.536307204916398' '0.43242760002448655' 'Ontmoetingscentrum']
 ['0.6914791466739091' '0.2881228292467457' 'Lagere school']]
32.35306032387537
5
Symbol: 1208, cluster:3
Symbol: 588, cluster:1
Symbol: 1116, cluster:3
Symbol: 144, cluster:5
Symbol: 510, cluster:1
Symbol: 220, cluster:4
Symbol: 621, cluster:5
Symbol: 786, cluster:3
Symbol: 428, cluster:2
Symbol: 59, cluster:0
Symbol: 999, cluster:3
Symbol: 1185, cluster:0
Symbol: 828, cluster:5
Symbol: 1477, cluster:2
Symbol: 1000, cluster:2
Symbol: 1248, cluster:4
Symbol: 1011, cluster:3
Symbol: 336, cluster:5
Symbol: 697, cluster:2
Symbol: 556, cluster:1
Symbol: 526, cluster:0
Symbol: 989, cluster:0
Symbol: 1419, cluster:2
Symbol: 1408, cluster:4
Symbol: 520, cluster:1
Symbol: 178, cluste

In [13]:
st_p = pd.DataFrame()
# Create standard profiles for each cluster: every member building's 'Power'
# series is divided by its 2019 total, and the normalised series are averaged
# per timestamp. The result is stored in `st_p` under the cluster label (as str).
for clust in np.unique(clusters):
    member_ids = features.loc[features['cluster'] == clust, 'ID']
    agg = None
    for ID in member_ids:
        power = pd.read_csv('./data/buildings/' + str(ID) + '.csv', index_col='ds',
                            usecols=['ds', 'Power'], parse_dates=['ds'])['Power']
        if agg is None:
            # The first building of the cluster defines the time index that
            # all later buildings are aligned to.
            agg = power.to_frame()
            # Rows of 2019 must be selected with .loc: selecting rows with
            # agg['2019'] is no longer supported by pandas (it is looked up
            # as a column name and raises KeyError).
            agg = agg / agg.loc['2019'].sum()
        else:
            # Column assignment aligns `power` on agg's index.
            agg[ID] = power
            agg[ID] = agg[ID] / agg.loc['2019', ID].sum()
    # Create an average profile over columns (i.e. over the cluster's buildings).
    # NOTE: a building whose 2019 total is 0 yields inf/NaN values here.
    agg = agg.mean(axis=1)
    st_p[str(clust)] = agg

In [15]:
# Read the test features; the first CSV column becomes the index.
# NOTE: this rebinds `features`, replacing the training features loaded earlier.
features = pd.read_csv('data/social_features_test.csv', index_col=0)
# Read the metadata (semicolon-separated); the first column becomes the index.
metadata = pd.read_csv('data/EANLIJST_METADATA.csv', index_col=0, sep=';')
# Add the function type to the features; pandas aligns the values on the index,
# so rows without a metadata entry get NaN here.
features['function'] = metadata['Patrimonium Functietype']
# Drop every row that contains a missing value (this includes rows that got
# no function type above).
features = features.dropna()
features['ID'] = features.index
# Drop rows with 'Kast' as function. The explicit copy makes `features` an
# independent frame, so the column assignments below do not write into a
# filtered slice (which raises SettingWithCopyWarning).
features = features[features['function'] != 'Kast'].copy()
# Positional column sums: columns 4..15 -> 'morning', columns 1..3 and
# 16..23 -> 'evening'.
# NOTE(review): the positions depend on the CSV column layout - confirm.
features['morning'] = features.iloc[:, 4:16].sum(axis=1)
features['evening'] = features.iloc[:, np.r_[1:4, 16:24]].sum(axis=1)
# Evaluation input: 'yearly' plus the two derived columns and the function type.
subset = features[['yearly', 'morning', 'evening', 'function']].copy()
subset

Unnamed: 0,yearly,morning,evening,function
666,308362.90,0.458208,0.502196,Andere gebouwen
399,100270.95,0.673820,0.302632,School
1544,496847.76,0.651339,0.328465,Stadhuis/Gemeentehuis
1655,43067.00,0.603173,0.361325,Lagere school
844,411595.23,0.469201,0.495961,School
...,...,...,...,...
320,2473361.00,0.637910,0.339968,RVT/WZC/revalidatiecentrum
944,905537.58,0.729076,0.257051,Bibliotheek
444,25718.20,0.613807,0.358155,Werkplaats
778,995995.20,0.384185,0.588073,Stadion


In [None]:
# Score the synthetic (standard-profile based) series against the measured
# series of every building in `subset`, then write one row of MAE / RMSE /
# SMAPE per building to ./results/Kris.csv.
# NOTE(review): `names`, `get_index` and `validation` are not defined in the
# visible cells - they must already exist in the kernel; confirm where they
# come from.
mae = {}
rmse = {}
smape = {}
for ID in subset.index:
    func = subset.loc[ID, 'function']
    ts = pd.read_csv('./data/buildings/{}.csv'.format(ID), usecols=['Power', 'ds'], index_col='ds')
    ts.index = pd.to_datetime(ts.index)
    # Standard profile selected for this function type, scaled by the
    # building's 'yearly' value.
    ts_syn = st_p[names[get_index(func, clusters)]] * subset.loc[ID, 'yearly']
    ts_syn = ts_syn.to_frame()
    ts_syn.index = pd.to_datetime(ts_syn.index)
    # drop inf values from ts_syn (masked to NaN here, removed by dropna)
    ts_syn = ts_syn[ts_syn < np.inf]
    # drop the nan values
    ts_syn = ts_syn.dropna()
    # Compare only timestamps present in both series. Indexing `ts` with the
    # full synthetic index raises KeyError as soon as one timestamp is
    # missing from the measured data.
    common = ts_syn.index.intersection(ts.index)
    if common.empty:
        # Nothing to compare: keep the building in the results, marked as NaN.
        mae[ID] = rmse[ID] = smape[ID] = np.nan
        continue
    ts = ts.loc[common]
    ts_syn = ts_syn.loc[common]
    mae[ID] = validation(ts.values, ts_syn.values, 'MAE')
    rmse[ID] = validation(ts.values, ts_syn.values, 'RMSE')
    smape[ID] = validation(ts.values, ts_syn.values, 'SMAPE')
# make a dataframe with 3 dictionaries as columns
temp_df = pd.DataFrame({'MAE': mae, 'RMSE': rmse, 'SMAPE': smape})
# Make sure the output directory exists so the results are not lost after
# the whole loop has run.
os.makedirs('./results', exist_ok=True)
temp_df.to_csv('./results/Kris.csv')