# Segmentation

How to use time step segmentation to reduce the number of timesteps per period.

Author: Maximilian Hoffmann

Import pandas, the Plotly renderer settings, and tsam together with its configuration classes

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd

# Configure Plotly for sphinx/nbsphinx output
import plotly.io as pio

import tsam
from tsam import ClusterConfig, SegmentConfig

pio.renderers.default = "notebook"

### Input data 

Read in time series from testdata.csv with pandas

In [None]:
# Use the first CSV column as the index and parse it as timestamps, so that the
# date-based selection later in the notebook (start="20100210", end="20100218")
# works on a DatetimeIndex rather than on raw strings.
raw = pd.read_csv("testdata.csv", index_col=0, parse_dates=True)

Plotting uses tsam's built-in Plotly heatmaps for a visual comparison of the time series, so no custom plot function is defined here

In [None]:
# Use tsam's built-in plotting with plotly
# tsam.plot.heatmap(data, column, period_duration) creates interactive heatmaps

### Hierarchical aggregation with medoid representation and 10 typical days with 24 hourly segments

Run the aggregation with hierarchical clustering as the method for ten typical days

In [None]:
# Baseline run without segmentation: 10 clusters, period_duration=24 and
# hierarchical clustering.
# NOTE(review): the section heading describes this as typical days with 24
# hourly steps, so period_duration is presumably given in hours — confirm.
result = tsam.aggregate(
    raw,
    n_clusters=10,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical"),
)

Create the typical periods

In [None]:
# Keep the representatives of the baseline run in their own variable; the bare
# name on the last line makes the notebook display them.
cluster_representatives = result.cluster_representatives
cluster_representatives

Predict original data

In [None]:
# Reconstruction from the baseline (non-segmented) aggregation; reused by the
# comparison plots and the mean check further down.
reconstructed = result.reconstruct()
reconstructed

Get accuracy indicators

In [None]:
# Accuracy indicators of the baseline (non-segmented) 10-cluster run.
result.accuracy

### Hierarchical aggregation with medoid representation and 20 typical days with 12 irregular segments

In [None]:
# Same clustering method as the baseline run, but with 20 clusters and an
# additional SegmentConfig requesting 12 segments.
# NOTE(review): per the section heading these are 12 irregular segments per
# typical day — confirm against the tsam documentation.
result_segmented = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical"),
    segments=SegmentConfig(n_segments=12),
)

Create the typical periods

In [None]:
# Representatives of the segmented 20-cluster run; the bare name on the last
# line makes the notebook display them.
cluster_representatives_segmented = result_segmented.cluster_representatives
cluster_representatives_segmented

Predict original data

In [None]:
# Reconstruction from the segmented aggregation; reused by the comparison
# plots and the mean check further down.
reconstructed_segmented = result_segmented.reconstruct()
reconstructed_segmented

Get accuracy indicators

In [None]:
# Accuracy indicators of the segmented 20-cluster run.
result_segmented.accuracy

### Comparison of the aggregations 
Both aggregations were applied to all four time series at once. We therefore first compare the duration curves of the electrical load for the original time series, the aggregation with 10 typical days of 24 hourly time steps, and the aggregation with 20 typical days of 12 segments. Heatmaps of a single column and a time-slice comparison of the load follow.

In [None]:
# Compare duration curves using built-in function
# Compare the duration curves of the "Load" column for the original data and
# both reconstructions.
# NOTE(review): this cell calls tsam.plot.compare_results, while the time-slice
# cell further down calls tsam.plot.compare with the same argument layout —
# confirm which name the installed tsam version actually provides.
tsam.plot.compare_results(
    {
        "Original": raw,
        "10 with 24 hours": reconstructed,
        "20 with 12 Seg": reconstructed_segmented,
    },
    column="Load",
    plot_type="duration_curve",
)

In [None]:
# Column shown in the three heatmaps below; change it here to inspect another
# column of the data set.
param = "GHI"

In [None]:
# Original data heatmap
# Heatmap of the original (non-aggregated) data for the selected column
tsam.plot.heatmap(raw, column=param, period_duration=24, title=f"Original {param}")

In [None]:
# 10 periods with 24 hours heatmap
# Heatmap of the reconstruction from the baseline run (10 clusters, no
# segmentation) for the selected column
tsam.plot.heatmap(
    reconstructed, column=param, period_duration=24, title=f"10 with 24 hours - {param}"
)

In [None]:
# 20 periods with 12 segments heatmap
# Heatmap of the reconstruction from the segmented run (20 clusters, 12
# segments) for the selected column
tsam.plot.heatmap(
    reconstructed_segmented,
    column=param,
    period_duration=24,
    title=f"20 with 12 Seg - {param}",
)

In [None]:
# Time slice comparison using built-in function
# Time slice comparison of the "Load" column for the original data and both
# reconstructions, limited by the start/end arguments.
# NOTE(review): start/end look like dates (10 to 18 February 2010) and
# presumably require a datetime index on the inputs — confirm.
tsam.plot.compare(
    {
        "Original": raw,
        "10 with 24 hours": reconstructed,
        "20 with 12 seg": reconstructed_segmented,
    },
    column="Load",
    plot_type="time_slice",
    start="20100210",
    end="20100218",
)

### Validation

Check that the means of the original time series and the predicted ones are the same.

In [None]:
# Column-wise means of the original data; reference for the two cells below.
raw.mean()

In [None]:
# Means of the baseline reconstruction; compare with the original means above.
reconstructed.mean()

In [None]:
# Means of the segmented reconstruction; compare with the original means above.
reconstructed_segmented.mean()

Check that a segmented period has the same column-wise means as a non-segmented period if the periods are the same.

In [None]:
# Mean of first period with non-segmented aggregation
# Mean of the first period of the non-segmented aggregation; reference value
# for the duration-weighted mean computed from the segmented run below.
result.cluster_representatives.loc[0, :].mean()

In [None]:
# Segmented aggregation with same number of periods for comparison
# Segmented aggregation with the same number of clusters (10) and the same
# clustering method as the baseline run, so its periods can be compared
# one-to-one with the non-segmented ones.
result_segmented_test = tsam.aggregate(
    raw,
    n_clusters=10,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical"),
    segments=SegmentConfig(n_segments=12),
)

In [None]:
# Get segment durations
# Segment durations of the 10-cluster segmented run; they serve as weights for
# the weighted mean computed in the next cell.
segment_durations = result_segmented_test.segment_durations
print("Segment durations:", segment_durations)

In [None]:
# Weighted mean of first period (should match non-segmented period mean)
# Duration-weighted mean of the first segmented period; it should match the
# plain mean of the first non-segmented period.
period_0 = result_segmented_test.cluster_representatives.loc[0, :]
period_0 = period_0.reset_index(0, drop=True)

# Turn the segment_durations mapping into a DataFrame, select the entries of
# period 0 and flatten them into a plain list of weights.
segment_durations_df = pd.DataFrame.from_dict(segment_durations)
segment_durations_values = (
    segment_durations_df.loc[0, :].values.flatten().tolist()
)

# Weight every segment row by its duration, then normalise by the total.
total_duration = sum(segment_durations_values)
weighted_sum = period_0.mul(segment_durations_values, axis=0).sum()
weighted_mean = weighted_sum / total_duration
weighted_mean

Print out the (segmented) typical periods.

In [None]:
# Display segmented typical periods
# Display the typical periods of the segmented run (20 clusters, 12 segments)
result_segmented.cluster_representatives

In [None]:
# Display non-segmented typical periods
# Display the typical periods of the non-segmented baseline run (10 clusters)
result.cluster_representatives