In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sb

from starterkits.visualization import vis_tools

# from starterkits import DATA_PATH
# Root folder of the local datasets. When the DATA_PATH environment variable
# is set it overrides the original hard-coded location, so the notebook can
# run on another machine without editing this cell; the default is unchanged.
import os
DATA_PATH = Path(os.environ.get('DATA_PATH', '/Users/hcab/Documents/projects/data/'))
import support as sp
import visualizations as vis

%load_ext autoreload
%autoreload 2

In [None]:
# Folder holding the London smart-meter files.
data_dir = DATA_PATH / 'smart_meters_london'

# ACORN details: keep the rows whose CATEGORIES value is 'Age', index them by
# REFERENCE and drop the first two remaining columns (presumably the category
# descriptor columns -- confirm against the CSV header).
acorns = pd.read_csv(data_dir / 'acorn_details.csv', encoding="ISO-8859-1")
acorns_age = (acorns
              .loc[acorns.CATEGORIES == 'Age']
              .set_index('REFERENCE')
              .iloc[:, 2:])

# Scale every row so that its values sum to 1.
n_acorns_age = acorns_age.div(acorns_age.sum(axis=1), axis=0)

# Standard deviation of each column of the normalised table, largest first.
std_n_acorns_age = n_acorns_age.std(axis=0).sort_values(ascending=False)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Cluster the columns of the normalised table (one row each after the
# transpose) into 4 groups.
X = n_acorns_age.T
clusters = AgglomerativeClustering(n_clusters=4).fit_predict(X)

# Keep one representative column per cluster: the only member of a singleton
# cluster, otherwise the member whose mean correlation with the members of
# its cluster is highest.
keep_acorns = []
for c in np.unique(clusters):
    id_cluster = clusters == c
    members = n_acorns_age.loc[:, id_cluster]
    if members.shape[1] == 1:
        representative = members.columns.values[0]
    else:
        representative = members.corr().mean().idxmax()
    keep_acorns.append(representative)

# Long-format copy of the representatives, keyed by REFERENCE.
selected_acorns = n_acorns_age.loc[:, keep_acorns].reset_index().melt(id_vars='REFERENCE')

# One bar chart per representative ACORN, laid out side by side.
n_panels = len(keep_acorns)
fig, axes = plt.subplots(1, n_panels, figsize=(15, 5))
for ax, acorn in zip(axes, keep_acorns):
    prevalence = n_acorns_age[acorn]
    ax.bar(prevalence.index, prevalence)
    vis_tools.format_axes(is_date=False, axes=ax)
vis_tools.add_facet_axes_labels(axes, 1, n_panels, 'Age group', 'Prevalence', keep_acorns)
fig.suptitle('Distribution across age groups of selected ACORNS', fontsize=20)
fig.tight_layout();


In [None]:
# Household metadata, loaded under the name the sampling loop reads so this
# cell runs on a fresh kernel. (The table used to be assigned to `acorns`,
# which overwrote the ACORN details, while the loop read `info_households`,
# a name that only a later cell defined.)
info_households = pd.read_csv(data_dir / 'informations_households.csv')

# Draw two distinct households for every representative ACORN. pandas'
# sample() falls back to NumPy's global generator, so the seed fixes the draw.
np.random.seed(77)
lcl_ids = []
for a in keep_acorns:
    in_acorn = info_households.Acorn == a
    lcl_ids.append(info_households[in_acorn].LCLid.sample(2, replace=False).values)

lcl_ids = np.concatenate(lcl_ids)

In [None]:
# Household metadata table; the household-sampling loop reads its `Acorn` and
# `LCLid` columns through the name `info_households`.
info_households = pd.read_csv(data_dir / 'informations_households.csv')

In [None]:
def read_smartmeter(f, lcl_ids=None):
    """Read one half-hourly smart-meter CSV file.

    Parameters
    ----------
    f : path-like
        CSV file to read.
    lcl_ids : collection, optional
        Values of the ``LCLid`` column to keep. With the default ``None``
        every row is returned, exactly as before (the filter used to exist
        only as commented-out code).

    Returns
    -------
    pandas.DataFrame
        The rows of the file, restricted to `lcl_ids` when it is given.
    """
    ds = pd.read_csv(f)
    if lcl_ids is None:
        return ds
    return ds[ds.LCLid.isin(lcl_ids)]
# Load every half-hourly block. The files are read in sorted order because
# glob() does not guarantee one, and the seeded sampling done later depends
# on the row order of `ds`.
block_files = sorted((data_dir / 'halfhourly_dataset').glob('*csv'))
ds = pd.concat([read_smartmeter(f) for f in block_files])

ds['tstp'] = pd.to_datetime(ds.tstp)
ds = (ds
      .rename(columns={'tstp': 'datetime', 'energy(kWh/hh)': 'energy'})
      .set_index(['LCLid', 'datetime']))

# Readings equal to the string 'Null' become NaN. Only the energy column is
# touched (the previous version assigned None to the whole matching row),
# then the column is cast to float.
ds['energy'] = ds['energy'].mask(ds['energy'] == 'Null').astype(float)

In [None]:
cols = ['energy']
# Drop every (household, calendar day) that has at least one missing value in
# `cols`. The day is derived from the `datetime` index level instead of a
# `date` column: that column is only created by a later cell, so grouping on
# it failed when the notebook was run top to bottom on a fresh kernel.
null_flags = pd.DataFrame({
    'LCLid': ds.index.get_level_values('LCLid'),
    'date': ds.index.get_level_values('datetime').normalize(),
    'has_nulls': ds[cols].isna().any(axis=1).to_numpy(),
})
flags_by_day = null_flags.groupby(['LCLid', 'date'])['has_nulls']

# One row per household-day with its `has_nulls` flag (kept for inspection).
no_nulls = flags_by_day.any().reset_index()

# Broadcast the per-day flag back onto the rows and keep the clean days only.
# `null_flags` has one row per row of `ds`, in the same order, so the mask is
# applied positionally and the (LCLid, datetime) index of `ds` is untouched.
ds = ds[~flags_by_day.transform('any').to_numpy()]

In [None]:
# Project helper; presumably adds holiday information to `ds` -- confirm in
# the `support` module.
ds = sp.get_holidays(ds, data_dir)

# Calendar features read from the second index level (position 1): each new
# column is filled from the attribute of the same meaning on that level.
for column, attribute in (('wday', 'weekday'), ('date', 'date'), ('quarter', 'quarter')):
    ds[column] = getattr(ds.index.get_level_values(1), attribute)

In [None]:
# Tune the DTW radius on the `energy` column with the project helper. The
# result is consumed by the convergence plot and by the radius selection.
# NOTE(review): the structure of the returned object is defined in `support`.
results_radius_tuning = sp.finetune_dtw_radius(ds, 'energy')

In [None]:
# Visual check of the radius-tuning results (plot produced by the project's
# `visualizations` module).
vis.plot_dtw_convergence_radius_tuning(results_radius_tuning)

In [None]:
# Pick the radius from the tuning results; the value is cast to int when the
# distance matrix is computed.
# NOTE(review): the meaning of `thresh_0=0.5` is defined by the helper -- confirm.
radius = vis.get_radius_at_convergion(results_radius_tuning, thresh_0=0.5)

In [None]:
# Keep a random subset of days for every household.
#
# Dates are drawn from the household's *distinct* dates. Sampling the `date`
# column directly (one entry per reading) drew 20 readings, which could hit
# the same day several times and yield fewer than 20 days. Households with
# fewer than 20 days keep all of them instead of raising.
n_days = 20
ds_sub = []
np.random.seed(77)
# groupby(level=0) walks the households in sorted order, like np.unique did,
# without rebuilding a full-length boolean mask for each of them.
for l, ds_household in ds.groupby(level=0):
    days = ds_household['date'].drop_duplicates()
    rnd_dates = days.sample(min(n_days, len(days)), replace=False).values
    ds_sub.append(ds_household[ds_household['date'].isin(rnd_dates)])
ds_sub = pd.concat(ds_sub)

In [None]:
# Pairwise distances on the `energy` column of the sampled days, using the
# tuned radius cast to int. `ds_units` is later indexed by unit label and
# read for its `LCLid` and `date` fields.
# NOTE(review): the exact contents of the three outputs are defined in `support`.
dist_matrix, ds_dist_matrix, ds_units = sp.get_distance_matrix(ds_sub, int(radius), ['energy'])

In [None]:
# Diagnostic from the `visualizations` module, presumably to help choose the
# number of clusters -- confirm. The trailing `;` hides the returned value.
vis.get_best_number_clusters(dist_matrix);

In [None]:
# Cluster the units from the distance matrix into 5 clusters.
# NOTE(review): 5 is hard-coded; presumably read off the diagnostic above.
# `cluster_day` is later iterated row by row (fields `LCLid` and `cluster`)
# and `cluster_colors` is used as the plotting palette.
cluster_day, cluster_lclid, cluster_colors = sp.make_kmedoid_clustering(
    dist_matrix, ds_units, 5)

# Attach the cluster assignment to the distance results. `cluster_centers`
# is indexed by cluster id and its `index` field is the label of the centre
# unit in `ds_units`.
ds_dist_matrix, cluster_ranks, cluster_centers = sp.add_cluster_to_results(
    ds_dist_matrix, cluster_day)



In [None]:
# Number of rows of `cluster_day` per cluster, with the bars ordered and
# coloured by `cluster_colors`.
fig, ax = plt.subplots(figsize=(15, 5))
sb.countplot(data=cluster_day,
             x='cluster',
             palette=cluster_colors,
             order=cluster_colors.keys(),
             ax=ax)
vis_tools.axes_font_size('Cluster ID', 'Count', 'Trial distribution per cluster');

In [None]:
cols = ['energy']


def _reshape_for_dtw(ds, r):
    """Select the `cols` columns of one household on one date.

    `ds` is looked up with the pair (``r.LCLid``, ``str(r.date)``), so it is
    expected to be indexed by household first and timestamp second; the date
    string selects every timestamp of that day.
    """
    household_day = pd.IndexSlice[r.LCLid, str(r.date)]
    return ds.loc[household_day, cols]

def _inverse_reshape(x, LCLid, cluster):
    """Re-key one aligned day by household and hours since its first reading.

    Parameters
    ----------
    x : pandas.DataFrame
        Readings of one day, indexed by timestamp. It is left untouched
        (the previous version rewrote its index and columns in place).
    LCLid : scalar
        Household identifier stored in the ``LCLid`` index level.
    cluster : scalar
        Cluster identifier stored in the ``cluster`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by (``LCLid``, ``datetime``), where ``datetime``
        holds the elapsed hours since the earliest timestamp of `x`, with
        the columns of `x` followed by ``cluster``.
    """
    elapsed_hours = (x.index - x.index.min()) / np.timedelta64(1, 'h')
    out = x.copy()
    # Name the level explicitly so the final set_index does not depend on
    # the incoming index being called 'datetime'.
    out.index = pd.Index(elapsed_hours, name='datetime')
    out = out.reset_index()
    out['LCLid'] = LCLid
    out['cluster'] = cluster

    return out.set_index(['LCLid', 'datetime'])
    


def align_to_ref(ref, target):
    """Warp `target` onto `ref` with DTW.

    `dtw` is called with the values of `target` first and those of `ref`
    second; the rows of `target` are then re-selected by position along the
    warping path reported by the alignment.
    """
    alignment = dtw(target.values, ref.values)
    warped_positions = alignment.get_warping_path()
    return target.iloc[warped_positions, :]

# Profile of the centre unit of every cluster, keyed by cluster id.
ds_cluster_centers = {
    cluster_id: _reshape_for_dtw(ds_sub, ds_units.loc[center['index']])
    for cluster_id, center in cluster_centers.iterrows()
}

# Align every clustered unit to the centre of its cluster; the centre unit
# itself is taken as is. Each result is re-keyed by household and elapsed
# hours before everything is stacked into one frame.
# NOTE(review): `tqdm` (and `dtw`, used by align_to_ref) are not imported in
# any cell visible here -- confirm they are in scope before running.
cluster_day_aligned = []
for unit_id, unit in tqdm(cluster_day.iterrows(), total=len(cluster_day)):
    center_profile = ds_cluster_centers[unit.cluster]
    if cluster_centers.loc[unit.cluster]['index'] == unit_id:
        x_aligned = center_profile
    else:
        unit_profile = _reshape_for_dtw(ds_sub, ds_units.loc[unit.name])
        x_aligned = align_to_ref(center_profile, unit_profile)
    cluster_day_aligned.append(_inverse_reshape(x_aligned.copy(), unit.LCLid, unit.cluster))
cluster_day_aligned = pd.concat(cluster_day_aligned)
        
        

In [None]:
# One line plot per cluster: energy against elapsed hours over all aligned
# units of the cluster.
n_clusters = len(cluster_centers)
fig, axes = plt.subplots(n_clusters, 1, figsize=(15, 5 * n_clusters))
# Panels are matched to clusters by position rather than by cluster label, so
# the loop does not depend on the labels being exactly 0..n_clusters-1.
# np.atleast_1d also covers the single-cluster case, where plt.subplots
# returns a bare Axes instead of an array.
for ax, k in zip(np.atleast_1d(axes), cluster_centers.index):
    in_cluster = cluster_day_aligned.cluster == k
    sb.lineplot(data=cluster_day_aligned[in_cluster].reset_index(),
                x='datetime',
                y='energy',
                ax=ax)
vis_tools.add_facet_axes_labels(axes, n_clusters, 1, 'Hour', 'energy', [f'Cluster{k}' for k in cluster_centers.index]);
    