# Segmentation

How to use timestep segmentation to reduce the number of timesteps per period.

Author: Maximilian Hoffmann

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and local edits to them are picked up.
%load_ext autoreload
%autoreload 2

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

import tsam
from tsam import ClusterConfig, SegmentConfig

# Make Plotly's "notebook" renderer the default for every figure shown below.
pio.renderers.default = "notebook"

### Input data

In [None]:
# Read the input table; the first CSV column becomes the index.
# NOTE(review): a later cell slices this index with date strings
# ("20100210":"20100218") — confirm the index supports that, e.g. by parsing
# it as dates (parse_dates=True).
raw = pd.read_csv("testdata.csv", index_col=0)

### 10 typical days at hourly resolution (no segmentation)

Baseline: hierarchical clustering with medoid representation and 24 hourly timesteps.

In [None]:
# Baseline run without segmentation: 10 clusters, period_duration=24,
# hierarchical clustering.
baseline_clustering = ClusterConfig(method="hierarchical")
result = tsam.aggregate(
    raw,
    n_clusters=10,
    period_duration=24,
    cluster=baseline_clustering,
)

# Last expression of the cell: show the accuracy of the baseline aggregation.
result.accuracy

In [None]:
# Show the cluster-members plot of the baseline (non-segmented) result.
result.plot.cluster_members()

In [None]:
# Display the baseline result's cluster representatives as the cell output.
result.cluster_representatives

### 20 typical days with 12 irregular segments

Segmentation reduces the number of timesteps per period while preserving key transitions.

In [None]:
# Segmented run: 20 clusters, period_duration=24, hierarchical clustering,
# with each period reduced to 12 segments.
segmented_clustering = ClusterConfig(method="hierarchical")
twelve_segments = SegmentConfig(n_segments=12)
result_segmented = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=segmented_clustering,
    segments=twelve_segments,
)

# Last expression of the cell: show the accuracy of the segmented aggregation.
result_segmented.accuracy

In [None]:
# Show the cluster-members plot of the segmented result.
result_segmented.plot.cluster_members()

In [None]:
# Show the segment-durations plot of the segmented result.
result_segmented.plot.segment_durations()

In [None]:
# Display the segmented result's cluster representatives as the cell output.
result_segmented.cluster_representatives

### Comparison

In [None]:
# Aggregation results to compare, keyed by the label used in every plot legend.
results = {
    "10 x 24h": result,
    "20 x 12seg": result_segmented,
}

# Duration curves
# Collect the "Load" series of the original data and of each reconstruction.
load_by_method = {
    "Original": raw["Load"],
    **{label: res.reconstructed["Load"] for label, res in results.items()},
}

# One long-format frame per method: values sorted in descending order and
# re-indexed 0..n-1 so that "Hour" is the rank position along the curve.
duration_curves = [
    pd.DataFrame(
        {
            "Hour": range(len(load)),
            "Load": load.sort_values(ascending=False).reset_index(drop=True),
            "Method": label,
        }
    )
    for label, load in load_by_method.items()
]

px.line(
    pd.concat(duration_curves, ignore_index=True),
    x="Hour",
    y="Load",
    color="Method",
    title="Duration Curve Comparison - Load",
)

In [None]:
# Heatmap comparison
param = "GHI"

# Build the panels from `results` so the heatmaps always show the same methods,
# under the same labels, as the other comparison cells.
heatmap_sources = {
    "Original": raw,
    **{label: res.reconstructed for label, res in results.items()},
}
labels = list(heatmap_sources)
data = [
    tsam.unstack_to_periods(frame, period_duration=24)
    for frame in heatmap_sources.values()
]

# One stacked subplot per method; the row count follows the number of methods.
fig = make_subplots(
    rows=len(labels), cols=1, subplot_titles=labels, vertical_spacing=0.05
)
for row, periods in enumerate(data, start=1):
    # Every trace points at the same coloraxis, so all panels share one colour scale.
    fig.add_trace(
        go.Heatmap(z=periods[param].values.T, coloraxis="coloraxis"),
        row=row,
        col=1,
    )
fig.update_layout(
    height=250 * len(labels),  # 250 px per panel (750 px for the three panels here)
    coloraxis={"colorscale": "Viridis"},
    title_text=f"Heatmap Comparison - {param}",
)
fig.show()

In [None]:
# Time slice comparison
# Original data plus the reconstruction of each aggregation, keyed by legend label.
frames_by_method = {
    "Original": raw,
    **{label: res.reconstructed for label, res in results.items()},
}

# NOTE(review): label-slicing with date strings assumes the index supports it
# (e.g. a DatetimeIndex) — confirm against how the data was loaded.
window = slice("20100210", "20100218")

# Cut the "Load" column to the window and tag each slice with its method name.
load_slices = [
    frame.loc[window, ["Load"]].assign(Method=label)
    for label, frame in frames_by_method.items()
]

px.line(
    pd.concat(load_slices).reset_index(names="Time"),
    x="Time",
    y="Load",
    color="Method",
    title="Time Slice Comparison - Load (Feb 10-18)",
)

### Validation

The column means of the reconstructed time series should match those of the original data for both approaches.

In [None]:
# Per-column means of the original data next to those of each reconstruction,
# one table column per method.
reconstructions = {
    "10 x 24h": result.reconstructed,
    "20 x 12seg": result_segmented.reconstructed,
}
means = pd.DataFrame(
    {
        "Original": raw.mean(),
        **{label: frame.mean() for label, frame in reconstructions.items()},
    }
)
means