# Representation Methods

Comparison of different cluster representation methods: medoid, maxoid, mean, minmax mean, and distribution (duration curve).

Author: Maximilian Hoffmann

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig

# Make Plotly's "notebook" renderer the default for every figure displayed below.
pio.renderers.default = "notebook"

### Input data

In [None]:
# Load the example time series; the first CSV column becomes the index.
# The index is parsed as timestamps up front so that the date-string slice used
# in the time-slice comparison is resolved against a DatetimeIndex instead of
# depending on how the raw string labels happen to compare.
raw = pd.read_csv("testdata.csv", index_col=0, parse_dates=True)
raw

### Medoid representation

Picks the actual observed period closest to each cluster centroid.

In [None]:
# Hierarchical clustering into 8 clusters of 24-long periods, with each
# cluster represented by its medoid.
medoid_cluster = ClusterConfig(method="hierarchical", representation="medoid")
result_medoid = tsam.aggregate(
    raw, n_clusters=8, period_duration=24, cluster=medoid_cluster
)
result_medoid.accuracy

In [None]:
# Plot the cluster members of the medoid aggregation.
result_medoid.plot.cluster_members()

### Maxoid representation

Picks the period that maximizes the sum of all column values in each cluster.

In [None]:
# Hierarchical clustering into 8 clusters of 24-long periods, represented by
# the maxoid of each cluster. Column-mean preservation is switched off here.
maxoid_cluster = ClusterConfig(method="hierarchical", representation="maxoid")
result_maxoid = tsam.aggregate(
    raw,
    n_clusters=8,
    period_duration=24,
    cluster=maxoid_cluster,
    preserve_column_means=False,
)
result_maxoid.accuracy

In [None]:
# Plot the cluster members of the maxoid aggregation.
result_maxoid.plot.cluster_members()

### Mean representation

Averages all member periods of each cluster.

In [None]:
# Hierarchical clustering into 20 clusters of 24-long periods, with each
# cluster represented by the mean of its members.
mean_cluster = ClusterConfig(method="hierarchical", representation="mean")
result_mean = tsam.aggregate(
    raw, n_clusters=20, period_duration=24, cluster=mean_cluster
)
result_mean.accuracy

In [None]:
# Plot the cluster members of the mean aggregation.
result_mean.plot.cluster_members()

### MinMax Mean representation

Like mean, but preserves the minimum and maximum values per column.

In [None]:
# Hierarchical clustering into 20 clusters of 24-long periods using the
# "minmax_mean" representation. Column-mean preservation is switched off here.
minmax_cluster = ClusterConfig(
    method="hierarchical", representation="minmax_mean"
)
result_minmax = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=minmax_cluster,
    preserve_column_means=False,
)
result_minmax.accuracy

In [None]:
# Plot the cluster members of the minmax-mean aggregation.
result_minmax.plot.cluster_members()

### Distribution representation

Preserves the full value distribution (duration curve) within each cluster.

In [None]:
# Hierarchical clustering into 20 clusters of 24-long periods using the
# "distribution" representation. Column-mean preservation is switched off here.
distribution_cluster = ClusterConfig(
    method="hierarchical", representation="distribution"
)
result_duration = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=distribution_cluster,
    preserve_column_means=False,
)
result_duration.accuracy

In [None]:
# Plot the cluster members of the distribution aggregation.
result_duration.plot.cluster_members()

### Comparison

Compare all representation methods via duration curves, heatmaps, and time slices.

In [None]:
# Display label -> aggregation result. The number in parentheses is the
# n_clusters value that was used for that run. Insertion order determines the
# order of traces and subplots in the comparison plots below.
results = {
    "Medoid (8)": result_medoid,
    "Maxoid (8)": result_maxoid,
    "Mean (20)": result_mean,
    "Minmax (20)": result_minmax,
    "Distribution (20)": result_duration,
}

# Duration curves: sort every "Load" series from its highest to its lowest
# value and draw the original data together with each reconstruction.
def _duration_curve(load, label):
    """Return a tidy frame with the descending-sorted values of *load*."""
    ordered = load.sort_values(ascending=False).reset_index(drop=True)
    return pd.DataFrame(
        {"Hour": range(len(ordered)), "Load": ordered, "Method": label}
    )


# The original series comes first, followed by the reconstructions in the
# order they appear in ``results``.
load_by_method = {"Original": raw["Load"]}
load_by_method.update(
    (label, res.reconstructed["Load"]) for label, res in results.items()
)
frames = [_duration_curve(load, label) for label, load in load_by_method.items()]

px.line(
    pd.concat(frames, ignore_index=True),
    x="Hour",
    y="Load",
    color="Method",
    title="Duration Curve Comparison - Load",
)

In [None]:
# Heatmap comparison: one heatmap per data set (the original first, then each
# reconstruction), stacked vertically and sharing a single colour axis so the
# panels are directly comparable.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

param = "GHI"  # column shown in every heatmap
unstacked_orig = tsam.unstack_to_periods(raw, period_duration=24)

# Iterating a dict yields its keys, so no list()/keys() wrapper is needed.
labels = ["Original", *results]
data = [unstacked_orig] + [
    tsam.unstack_to_periods(r.reconstructed, period_duration=24)
    for r in results.values()
]

fig = make_subplots(
    rows=len(data), cols=1, subplot_titles=labels, vertical_spacing=0.03
)
for row, unstacked in enumerate(data, start=1):
    # NOTE(review): the transpose presumably puts periods on the x-axis and the
    # time step within a period on the y-axis - confirm against the layout
    # returned by tsam.unstack_to_periods.
    fig.add_trace(
        go.Heatmap(z=unstacked[param].values.T, coloraxis="coloraxis"),
        row=row,
        col=1,
    )
fig.update_layout(
    height=250 * len(data),
    coloraxis={"colorscale": "Viridis"},
    title_text=f"Heatmap Comparison - {param}",
)
fig.show()

In [None]:
# Time slice comparison: overlay the "Load" column of the original data and of
# every reconstruction for the index window "20100210" to "20100218".
window = slice("20100210", "20100218")
frames_by_method = {"Original": raw} | {
    label: res.reconstructed for label, res in results.items()
}
frames = [
    # .assign returns a new frame, so the source data is never modified.
    source.loc[window, ["Load"]].assign(Method=label)
    for label, source in frames_by_method.items()
]

px.line(
    pd.concat(frames).reset_index(names="Time"),
    x="Time",
    y="Load",
    color="Method",
    title="Time Slice Comparison - Load (Feb 10-18)",
)

### Validation

Column means should be preserved for medoid and mean. The maxoid, minmax mean, and distribution runs pass `preserve_column_means=False`, so their column means may deviate from the original.

In [None]:
# Column means of the original data next to those of every reconstruction:
# one column per method, one row per data column.
column_means = {"Original": raw.mean()}
for label, res in results.items():
    column_means[label] = res.reconstructed.mean()

means = pd.DataFrame(column_means)
means