# Representation Methods

Comparison of different cluster representation methods: medoid, maxoid, mean, minmax (`minmax_mean`), and distribution (called "duration" in the variable names below).

Author: Maximilian Hoffmann

Import pandas and the relevant time series aggregation class

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig

pio.renderers.default = "notebook"

### Input data 

Read in time series from testdata.csv with pandas

In [None]:
# Parse the first column as timestamps so that the index is a DatetimeIndex.
# The date-string slice in the time slice comparison (.loc["20100210":"20100218"])
# needs a DatetimeIndex to be interpreted as a date range rather than as a
# plain string comparison.
raw = pd.read_csv("testdata.csv", index_col=0, parse_dates=True)

Heatmaps are used further below for a visual comparison of the time series; they are built with `tsam.unstack_to_periods()` and `plotly.express.imshow`.

In [None]:
# Use tsam.unstack_to_periods() with plotly for heatmap visualization
# px.imshow(unstacked["column"].values.T) creates interactive heatmaps

### Hierarchical aggregation with medoid representation at hourly resolution

Run the aggregation with hierarchical clustering and medoid representation for eight typical days

In [None]:
# Cluster settings: hierarchical method, clusters represented via "medoid"
medoid_cluster_config = ClusterConfig(method="hierarchical", representation="medoid")

# Aggregate the raw data into 8 clusters with period_duration=24
result_medoid = tsam.aggregate(
    raw, n_clusters=8, period_duration=24, cluster=medoid_cluster_config
)

Create the typical periods

In [None]:
cluster_representatives_medoid = result_medoid.cluster_representatives

Predict original data

In [None]:
reconstructed_medoid = result_medoid.reconstructed

Get accuracy indicators

In [None]:
result_medoid.accuracy

### Hierarchical aggregation with maxoid representation at hourly resolution

In [None]:
# Cluster settings: hierarchical method, clusters represented via "maxoid"
maxoid_cluster_config = ClusterConfig(method="hierarchical", representation="maxoid")

# Aggregate into 8 clusters with period_duration=24; column-mean preservation
# is switched off for this run
result_maxoid = tsam.aggregate(
    raw,
    n_clusters=8,
    period_duration=24,
    preserve_column_means=False,
    cluster=maxoid_cluster_config,
)

Create the typical periods

In [None]:
cluster_representatives_maxoid = result_maxoid.cluster_representatives

Predict original data

In [None]:
reconstructed_maxoid = result_maxoid.reconstructed

Get accuracy indicators

In [None]:
result_maxoid.accuracy

### Hierarchical aggregation with mean representation and 20 typical days at hourly resolution

In [None]:
# Cluster settings: hierarchical method, clusters represented via "mean"
mean_cluster_config = ClusterConfig(method="hierarchical", representation="mean")

# Aggregate the raw data into 20 clusters with period_duration=24
result_mean = tsam.aggregate(
    raw, n_clusters=20, period_duration=24, cluster=mean_cluster_config
)

Create the typical periods

In [None]:
cluster_representatives_mean = result_mean.cluster_representatives

Predict original data

In [None]:
reconstructed_mean = result_mean.reconstructed

Get accuracy indicators

In [None]:
result_mean.accuracy

### Hierarchical aggregation with minmax representation and 20 typical days at hourly resolution

In [None]:
# Cluster settings: hierarchical method, clusters represented via "minmax_mean"
minmax_cluster_config = ClusterConfig(
    method="hierarchical", representation="minmax_mean"
)

# Aggregate into 20 clusters with period_duration=24; column-mean preservation
# is switched off for this run
result_minmax = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    preserve_column_means=False,
    cluster=minmax_cluster_config,
)

Create the typical periods

In [None]:
cluster_representatives_minmax = result_minmax.cluster_representatives

Predict original data

In [None]:
reconstructed_minmax = result_minmax.reconstructed

Get accuracy indicators

In [None]:
result_minmax.accuracy

### Hierarchical aggregation with distribution representation and 20 typical days at hourly resolution

In [None]:
# Cluster settings: hierarchical method, clusters represented via "distribution"
distribution_cluster_config = ClusterConfig(
    method="hierarchical", representation="distribution"
)

# Aggregate into 20 clusters with period_duration=24; column-mean preservation
# is switched off for this run
result_duration = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    preserve_column_means=False,
    cluster=distribution_cluster_config,
)

Create the typical periods

In [None]:
cluster_representatives_duration = result_duration.cluster_representatives

Predict original data

In [None]:
reconstructed_duration = result_duration.reconstructed

Get accuracy indicators

In [None]:
result_duration.accuracy

### Comparison of the aggregations 
The heatmaps in this notebook show a single column, but every aggregation was applied to all four time series. Therefore, we also compare here the duration curves of the electrical load for the original time series and for the time series reconstructed with the different representation methods.

In [None]:
# Compare duration curves using plotly express

# Original data plus the reconstruction of every representation computed above,
# keyed by the legend label used in the plots. The distribution run is included
# so that all five representations appear in the comparison.
comparison_data = {
    "Original": raw,
    "Medoid": reconstructed_medoid,
    "Maxoid": reconstructed_maxoid,
    "Mean": reconstructed_mean,
    "Minmax": reconstructed_minmax,
    "Distribution": reconstructed_duration,
}

# Build one long-format frame (Hour, Load, Method) so that plotly express can
# draw one line per method.
frames = []
for name, df in comparison_data.items():
    # Duration curve: the "Load" values sorted in descending order; the index is
    # reset so the position in the sorted series can serve as the x axis.
    sorted_vals = df["Load"].sort_values(ascending=False).reset_index(drop=True)
    frames.append(
        pd.DataFrame(
            {"Hour": range(len(sorted_vals)), "Load": sorted_vals, "Method": name}
        )
    )
long_df = pd.concat(frames, ignore_index=True)

# Last expression of the cell, so the figure is displayed as the cell output
px.line(
    long_df,
    x="Hour",
    y="Load",
    color="Method",
    title="Duration Curve Comparison - Load",
)

In [None]:
param = "GHI"

In [None]:
# Heatmap of the original data for the selected column (x axis: Day, y axis: Hour)
unstacked_orig = tsam.unstack_to_periods(raw, period_duration=24)
orig_grid = unstacked_orig[param].values.T  # transposed before plotting
px.imshow(
    orig_grid,
    title=f"Original {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Heatmap of the medoid reconstruction for the selected column (x axis: Day, y axis: Hour)
unstacked_medoid = tsam.unstack_to_periods(reconstructed_medoid, period_duration=24)
medoid_grid = unstacked_medoid[param].values.T  # transposed before plotting
px.imshow(
    medoid_grid,
    title=f"Medoid {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Heatmap of the maxoid reconstruction for the selected column (x axis: Day, y axis: Hour)
unstacked_maxoid = tsam.unstack_to_periods(reconstructed_maxoid, period_duration=24)
maxoid_grid = unstacked_maxoid[param].values.T  # transposed before plotting
px.imshow(
    maxoid_grid,
    title=f"Maxoid {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Heatmap of the mean reconstruction for the selected column (x axis: Day, y axis: Hour)
unstacked_mean = tsam.unstack_to_periods(reconstructed_mean, period_duration=24)
mean_grid = unstacked_mean[param].values.T  # transposed before plotting
px.imshow(
    mean_grid,
    title=f"Mean {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Heatmap of the minmax reconstruction for the selected column (x axis: Day, y axis: Hour)
unstacked_minmax = tsam.unstack_to_periods(reconstructed_minmax, period_duration=24)
minmax_grid = unstacked_minmax[param].values.T  # transposed before plotting
px.imshow(
    minmax_grid,
    title=f"Minmax {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Heatmap of the distribution reconstruction for the selected column (x axis: Day, y axis: Hour)
unstacked_dist = tsam.unstack_to_periods(reconstructed_duration, period_duration=24)
dist_grid = unstacked_dist[param].values.T  # transposed before plotting
px.imshow(
    dist_grid,
    title=f"Distribution {param}",
    labels={"x": "Day", "y": "Hour", "color": param},
    aspect="auto",
)

In [None]:
# Time slice comparison using plotly express

# For every entry of comparison_data, cut the "Load" column down to the
# 20100210-20100218 label range and tag the rows with the method name.
frames = [
    df.loc["20100210":"20100218", ["Load"]].assign(Method=name)
    for name, df in comparison_data.items()
]

# Stack the slices and turn the index into a regular "Time" column for plotting
long_df = pd.concat(frames).reset_index(names="Time")

px.line(
    long_df,
    x="Time",
    y="Load",
    color="Method",
    title="Time Slice Comparison - Load (Feb 10-18)",
)

### Validation

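Compare the column means of the original time series with those of the reconstructed ones. Note that the maxoid, minmax and distribution aggregations above were run with `preserve_column_means=False`, so their means are not necessarily identical to the original ones.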

In [None]:
raw.mean()

In [None]:
reconstructed_medoid.mean()

In [None]:
reconstructed_maxoid.mean()

In [None]:
reconstructed_mean.mean()

In [None]:
reconstructed_minmax.mean()

In [None]:
reconstructed_duration.mean()