In [None]:
import numpy as np
import pandas as pd
%matplotlib inline

## Leitura e Processamento de Dados

In [None]:
# Folder with one CSV per device (the trailing slash is part of the value).
DATA_DIR = "assets/household_demand/"
# CSV file with per-device metadata (a file path, despite the _DIR suffix).
DEVICE_INFO_DIR = "assets/devices_info.csv"

In [None]:
import os

# os.listdir returns entries in arbitrary order and lists everything in the
# folder. Keep only the per-device CSV files and sort them so that every
# downstream dictionary/DataFrame is built in a reproducible order.
devices = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))
devices

In [None]:
# Store the hourly power series of every device in a dictionary keyed by
# device name (the file name without its .csv extension).
import re

devices_data = pd.read_csv(DEVICE_INFO_DIR)

series_d = {}
for device in devices:
    print(device)

    # os.path.join works whether or not DATA_DIR ends with a path separator.
    df = pd.read_csv(os.path.join(DATA_DIR, device), parse_dates=['timestamp'])
    df = df.set_index('timestamp')

    # Escape the dot and anchor at the end of the name: the unescaped pattern
    # '.csv' matches ANY character followed by "csv" anywhere in the name
    # (e.g. "_csv"), which would corrupt the device key.
    device_name = re.sub(r'\.csv$', '', device)

    df_power = df['power']

    # Aggregate to hourly granularity (sum of the readings within each hour).
    df_p_hour = df_power.resample('H').sum()

    series_d[device_name] = df_p_hour

In [None]:
# Names of all loaded devices (dictionary keys, in insertion order).
print(list(series_d))

In [None]:
# Inspect one hourly series as a sanity check of the loading step.
series_d['boiler_226']

In [None]:
# Inspect a second device for comparison with the one shown above.
series_d['washing_machine_343']

In [None]:
# How many series there are of each length (number of hourly samples).
pd.Series([len(series) for series in series_d.values()]).value_counts()

In [None]:
# The series cover different periods and have different lengths.
# Align every series to start on a Monday at 00:00 and keep exactly the first
# N_DAYS days; series that cannot provide that window are left out.
N_DAYS = 21
N_HOURS = 24 * N_DAYS

series_trunc = {}
for k, device in series_d.items():
    print(k)
    ts = device.index

    # Explicit mask instead of summing weekday + hour + minute and testing == 0.
    is_monday_midnight = (ts.weekday == 0) & (ts.hour == 0) & (ts.minute == 0)
    monday_positions = np.where(is_monday_midnight)[0]

    # A series with no Monday 00:00 sample (e.g. shorter than one week) cannot
    # be aligned; skip it instead of failing with an IndexError.
    if monday_positions.size == 0:
        continue
    first_monday0000 = monday_positions[0]

    # First N_DAYS days counted from that Monday (positional slice).
    series_from_mon = device.iloc[first_monday0000:].head(N_HOURS)

    # Keep only series that actually span the whole window.
    if len(series_from_mon) == N_HOURS:
        series_trunc[k] = series_from_mon

In [None]:
# Inspect one truncated series (raises KeyError if this device was left out
# of series_trunc by the truncation step).
series_trunc['washing_machine_343']

In [None]:
# Length of each truncated series, summarized as counts per distinct length.
pd.Series([len(series) for series in series_trunc.values()]).value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization of every series: (x - min(x)) / (max(x) - min(x)).
# Each entry of series_trunc is replaced by its normalized version, which
# carries a fresh 0..n-1 index instead of the original timestamps.
for k, raw_series in list(series_trunc.items()):
    column = raw_series.to_numpy().reshape(-1, 1)  # scaler expects a 2-D array
    series_norm = MinMaxScaler().fit_transform(column).ravel()
    series_trunc[k] = pd.Series(series_norm)

series_trunc['boiler_226']

In [None]:
# Wide table built from the dictionary: one column per device (dict key),
# one row per position in the series.
series_df = pd.DataFrame(data=series_trunc)
series_df.head(5)

### Clustering

#### Com base em variáveis explicativas (feature-based)

In [None]:
# Long-format table with one row per (position, value, device id), which is
# the layout passed to the feature extractor in the next cell.
series_df_list = [
    pd.DataFrame({'time': x.index, 'value': x.to_numpy(), 'id': k})
    for k, x in series_trunc.items()
]

series_df_l = pd.concat(series_df_list, axis=0)
series_df_l

In [None]:
from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters
from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import impute

# One row of summary features per device id, computed with the minimal
# tsfresh feature set; impute() then replaces non-finite values.
fc_settings = MinimalFCParameters()
feats = impute(
    extract_features(
        series_df_l,
        column_id='id',
        column_sort='time',
        column_value='value',
        default_fc_parameters=fc_settings,
    )
)
feats

In [None]:
# Rescale every feature column to [0, 1], keeping the row and column labels.
feats_scl = pd.DataFrame(
    MinMaxScaler().fit_transform(feats),
    index=feats.index,
    columns=feats.columns,
)
print(feats_scl.shape)
feats_scl

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans_kwargs = {
    'init': 'k-means++',
    'n_init': 10,
    'max_iter': 300,
    # Fixed seed: k-means initialization is random, so without it the curves
    # below change on every run.
    'random_state': 42,
}

# Candidate numbers of clusters.
N_CLUSTERS = range(1, 16)

# Sum of squared errors (inertia) and silhouette coefficient for each k.
sse, silhouette_coefficients = [], []
for k in N_CLUSTERS:
    print(k)
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(feats_scl)
    sse.append(kmeans.inertia_)
    # The silhouette needs at least two clusters, so k = 1 is skipped.
    if k > 1:
        silhouette_coefficients.append(silhouette_score(feats_scl, kmeans.labels_))

# Elbow plot: inertia as a function of k.
pd.Series(sse, index=N_CLUSTERS).plot(figsize=(12,6), 
                                      title='Soma dos Erros Quadrados para cada k',
                                      xlabel='Numero de grupos', ylabel='Soma dos Erros Q.',
                                      xticks=N_CLUSTERS)

In [None]:
# Silhouette per k; k = 1 has no silhouette, hence the slice starting at the
# second candidate.
candidate_ks = N_CLUSTERS[1:]
sil_scores = pd.Series(silhouette_coefficients, index=candidate_ks)
sil_plot_options = dict(
    figsize=(12,6),
    title='Valor da métrica da silhouette',
    xlabel='Numero de grupos',
    ylabel='Silhouette',
    xticks=candidate_ks,
)
sil_scores.plot(**sil_plot_options)

In [None]:
# Preview of the device metadata table loaded from DEVICE_INFO_DIR.
devices_data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# File names corresponding to the rows of the feature matrix, in row order.
device_names = [f'{x}.csv' for x in feats_scl.index] 

devices_data = devices_data.loc[devices_data['files_names'].isin(device_names)]

# Order the categories like the rows of feats_scl and give them a 0..n-1
# index. Taking the column straight from devices_data keeps the metadata
# file's row order and index, so categories/labels would not correspond
# position-by-position (nor index-by-index) to the clustered devices.
category = (
    devices_data
    .drop_duplicates('files_names')
    .set_index('files_names')['appliance_category']
    .reindex(device_names)
    .fillna('unknown')  # devices with no metadata row
    .reset_index(drop=True)
)

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(category)

label_encoder.classes_

In [None]:
# Number of devices in each appliance category.
category_series = pd.Series(category)
category_series.value_counts()

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and cluster the
# projected points.
pca = PCA(n_components=2)

model = KMeans(
    n_clusters=4,
    init="k-means++",
    n_init=50,
    max_iter=500,
    random_state=42,  # fixed seed so the cluster assignment is reproducible
)

feats_pca = pca.fit_transform(feats_scl)
model.fit(feats_pca)

pca_df = pd.DataFrame(
    feats_pca,
    columns=['PC1', 'PC2'],
)

pca_df['Predicted'] = model.labels_

# Look the category up by file name, in the row order of feats_scl, and assign
# it positionally. Assigning the `category` Series directly aligns on index,
# and its index comes from the metadata table, not from the rows of pca_df.
category_by_file = (
    devices_data
    .drop_duplicates('files_names')
    .set_index('files_names')['appliance_category']
)
pca_df['Actual'] = (
    category_by_file
    .reindex([f'{x}.csv' for x in feats_scl.index])
    .fillna('unknown')  # devices with no metadata row
    .to_numpy()
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of the two principal components: color = predicted cluster,
# marker = actual appliance category.
plt.style.use("fivethirtyeight")
plt.figure(figsize=(8, 8))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
scat = sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    s=150,
    hue="Predicted",
    style="Actual",
    palette="Set2"
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
import scipy.cluster.hierarchy as hc
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 12))
plt.title("Hierarchical Clustering Dendrogram")

# Ward linkage on the scaled feature matrix (one row per device).
clusters = hc.linkage(feats_scl.to_numpy(),
            method='ward', 
            metric="euclidean")
# Pass the leaf labels as a plain list rather than a pandas Index so the call
# does not depend on how a given SciPy version handles Index objects.
hc.dendrogram(Z=clusters, labels=feats_scl.index.tolist(), orientation='right')
plt.show()

#### Com dados brutos

In [None]:
# k-means for time series; used below with the DTW metric.
# (The interactive `?TimeSeriesKMeans` help lookup was removed: it is leftover
# exploration that only opens the help pager when the notebook is re-run.)
from tslearn.clustering import TimeSeriesKMeans

In [None]:
# Sum of squared errors (inertia) and silhouette for each candidate k, using
# DTW k-means on the raw (normalized) series.

# series_df holds one COLUMN per device, while the estimator treats each ROW
# of a 2-D input as one time series. Transpose so that devices, not time
# steps, are the objects being clustered.
series_by_device = series_df.T

sse, silhouette_coefficients = [], []
for k in N_CLUSTERS:
    print(k)
    # Fixed seed so the curves are reproducible between runs.
    kmeans = TimeSeriesKMeans(n_clusters=k, metric='dtw', max_iter=30, random_state=42)
    kmeans.fit(series_by_device.to_numpy())
    sse.append(kmeans.inertia_)
    # The silhouette needs at least two clusters, so k = 1 is skipped.
    # NOTE: scikit-learn's silhouette_score uses its default (Euclidean)
    # distance here, not DTW.
    if k > 1:
        silhouette_coefficients.append(silhouette_score(series_by_device, kmeans.labels_))

In [None]:
# Elbow plot: inertia as a function of the number of clusters.
sse_by_k = pd.Series(sse, index=N_CLUSTERS)
sse_plot_options = dict(
    figsize=(12,6),
    title='Soma dos Erros Quadrados para cada k',
    xlabel='Numero de grupos',
    ylabel='Soma dos Erros Q.',
    xticks=N_CLUSTERS,
)
sse_by_k.plot(**sse_plot_options)

In [None]:
# Silhouette per k; k = 1 has no silhouette, hence the slice starting at the
# second candidate.
candidate_ks = N_CLUSTERS[1:]
sil_scores = pd.Series(silhouette_coefficients, index=candidate_ks)
sil_plot_options = dict(
    figsize=(12,6),
    title='Valor da métrica da silhouette',
    xlabel='Numero de grupos',
    ylabel='Silhouette',
    xticks=candidate_ks,
)
sil_scores.plot(**sil_plot_options)

In [None]:
# Cluster the raw (normalized) series with DTW k-means and display the result
# on the 2-D PCA projection of the feature matrix.
kmeans = TimeSeriesKMeans(n_clusters=2, metric='dtw', max_iter=100, random_state=42)

# Fit on the raw series, not on the PCA coordinates: this section clusters the
# raw data, and a 2-component PCA point is not a time series.
# series_df has one column per device, so transpose it; rows are then ordered
# like feats_scl so that the labels line up with the rows of feats_pca.
raw_series = series_df.T.loc[feats_scl.index]
kmeans.fit(raw_series.to_numpy())

feats_pca = pca.fit_transform(feats_scl)

pca_df = pd.DataFrame(
    feats_pca,
    columns=['PC1', 'PC2'],
)

pca_df['Predicted'] = kmeans.labels_

# Look the category up by file name, in the row order of feats_scl, and assign
# it positionally. Assigning the `category` Series directly aligns on index,
# and its index comes from the metadata table, not from the rows of pca_df.
category_by_file = (
    devices_data
    .drop_duplicates('files_names')
    .set_index('files_names')['appliance_category']
)
pca_df['Actual'] = (
    category_by_file
    .reindex([f'{x}.csv' for x in feats_scl.index])
    .fillna('unknown')  # devices with no metadata row
    .to_numpy()
)

plt.style.use("fivethirtyeight")
plt.figure(figsize=(8, 8))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
scat = sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    s=150,
    hue="Predicted",
    style="Actual",
    palette="Set2"
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Row positions of the points whose first principal component exceeds 1.3.
pc1_above_threshold = feats_pca[:, 0] > 1.3
idx = np.where(pc1_above_threshold)[0]

In [None]:
# Scaled features of the devices selected by position in `idx`.
feats_scl.iloc[idx]

## 