# Dataset Explorer
This notebook inspects dataset YAML configurations, lists available trials, and visualizes time series via custom loaders.

## 1. Load libraries and utilities
Import required libraries, resolve project paths, and configure plotting defaults.

In [1]:
from pathlib import Path
import os
import json
import random
from typing import Any
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from tsseg_exp.datasets.loaders import load_dataset, _get_all_params

# Resolve project root from the notebook location.
PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / "configs").exists():
    if (PROJECT_ROOT.parent / "configs").exists():
        PROJECT_ROOT = PROJECT_ROOT.parent.resolve()
    else:
        raise RuntimeError("Unable to locate the 'configs' directory.")

CONFIG_ROOT = PROJECT_ROOT / "configs" / "dataset"
DATA_ROOT = PROJECT_ROOT / "data"
NOTEBOOK_CACHE = PROJECT_ROOT / "notebooks" / ".cache"

plt.rcParams.update({
    "figure.figsize": (14, 4),
    "axes.grid": True
})

RNG = random.Random()
EXPLORATION_LOG: list[dict[str, Any]] = []
print(f"Project root: {PROJECT_ROOT}")
print(f"Config directory: {CONFIG_ROOT}")
print(f"Data directory: {DATA_ROOT}")

E0000 00:00:1767701153.709358   12751 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767701153.724331   12751 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Project root: /home/fchavell/tsseg-project/tsseg-exp
Config directory: /home/fchavell/tsseg-project/tsseg-exp/configs/dataset
Data directory: /home/fchavell/tsseg-project/tsseg-exp/data


## 2. Read YAML configurations
Scan configuration files, load metadata, and build a quick overview table.

In [2]:
config_records: list[dict[str, Any]] = []
for cfg_path in sorted(CONFIG_ROOT.glob("*.yaml")):
    with open(cfg_path, "r", encoding="utf-8") as handle:
        raw_cfg = yaml.safe_load(handle) or {}
    loader_cfg = raw_cfg.get("loader", {})
    record = {
        "config_file": cfg_path.name,
        "dataset_name": loader_cfg.get("dataset_name", raw_cfg.get("name")),
        "modality": raw_cfg.get("modality"),
        "loader_target": loader_cfg.get("_target_"),
        "loader_defaults": {k: v for k, v in loader_cfg.items() if k not in {"_target_", "dataset_name"}},
    }
    config_records.append(record)

dataset_configs_df = pd.DataFrame(config_records).sort_values("dataset_name").reset_index(drop=True)
display(dataset_configs_df)

Unnamed: 0,config_file,dataset_name,modality,loader_target,loader_defaults
0,actrectut.yaml,actrectut,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
1,has.yaml,has,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
2,knot-tying.yaml,knot-tying,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
3,mocap.yaml,mocap,multivariate,tsseg_exp.datasets.loaders.load_dataset,{}
4,needle-passing.yaml,needle-passing,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
5,pamap2.yaml,pamap2,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
6,skab.yaml,skab,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
7,suturing.yaml,suturing,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
8,tssb.yaml,tssb,univariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}
9,usc-had.yaml,usc-had,multivariate,tsseg_exp.datasets.loaders.load_dataset,{'return_X_y': True}


## 3. Build the trial index
Leverage helper utilities to enumerate valid parameter combinations and enrich the index with YAML metadata.

In [7]:
index_rows: list[dict[str, Any]] = []
for cfg in config_records:
    dataset_name = cfg["dataset_name"]
    modality = cfg["modality"]
    config_file = cfg["config_file"]
    print(f"Processing {dataset_name}...")
    try:
        param_grid = _get_all_params(dataset_name, DATA_ROOT)
    except Exception as exc:
        print(f"Warning: _get_all_params failed for {dataset_name}: {exc}")
        param_grid = []

    if param_grid:
        for params in param_grid:
            index_rows.append({
                "dataset": dataset_name,
                "params": params,
                "param_summary": json.dumps(params, sort_keys=True),
                "modality": modality,
                "config_file": config_file,
                "indexed": True
            })
    else:
        index_rows.append({
            "dataset": dataset_name,
            "params": {},
            "param_summary": "{}",
            "modality": modality,
            "config_file": config_file,
            "indexed": False
        })

trial_index_df = pd.DataFrame(index_rows)
print(f"Total trials indexed: {len(trial_index_df)}")
display(trial_index_df.head())
display(trial_index_df.groupby("dataset").size().to_frame("trial_count"))

Processing actrectut...
Processing has...
Processing knot-tying...
Processing mocap...
Processing needle-passing...
Processing pamap2...
Processing skab...
Processing suturing...
Processing tssb...
Processing usc-had...
Processing utsa...
Total trials indexed: 438


Unnamed: 0,dataset,params,param_summary,modality,config_file,indexed
0,actrectut,{'subject_number': 1},"{""subject_number"": 1}",multivariate,actrectut.yaml,True
1,actrectut,{'subject_number': 2},"{""subject_number"": 2}",multivariate,actrectut.yaml,True
2,has,{'ts_id': 0},"{""ts_id"": 0}",multivariate,has.yaml,True
3,has,{'ts_id': 1},"{""ts_id"": 1}",multivariate,has.yaml,True
4,has,{'ts_id': 2},"{""ts_id"": 2}",multivariate,has.yaml,True


Unnamed: 0_level_0,trial_count
dataset,Unnamed: 1_level_1
actrectut,2
has,250
knot-tying,36
mocap,1
needle-passing,28
pamap2,9
skab,1
suturing,39
tssb,1
usc-had,70


## 4. Define exploration helpers
Provide helpers to load a specific trial or sample a random series from the consolidated index.

In [8]:
def plot_multichannel(X: np.ndarray, y: np.ndarray | None = None, *, max_channels: int = 8, title: str | None = None) -> None:
    """Plot multiple channels and optionally the ground-truth labels."""
    array = np.asarray(X)
    if array.ndim == 1:
        array = array.reshape(-1, 1)
    time_axis = np.arange(array.shape[0])
    display_channels = min(array.shape[1], max_channels)
    fig, ax = plt.subplots()
    for chan in range(display_channels):
        ax.plot(time_axis, array[:, chan], label=f"channel {chan}")
    if array.shape[1] > display_channels:
        ax.set_title("Only a subset of channels is shown")
    if title:
        ax.set_title(title)
    ax.set_xlabel("time (index)")
    ax.set_ylabel("amplitude")
    if y is not None and len(y):
        ax2 = ax.twinx()
        ax2.step(time_axis, y, where="post", color="black", alpha=0.35, label="ground truth")
        ax2.set_ylabel("class")
        ax2.grid(False)
    ax.legend(loc="upper right")
    plt.show()

def explore_trial(dataset_name: str, **params: Any) -> tuple[np.ndarray, np.ndarray, dict[str, Any]]:
    """Load a specific series and return data, labels, and metadata."""
    X, y = load_dataset(dataset_name=dataset_name, data_root=str(DATA_ROOT), return_X_y=True, **params)
    if isinstance(X, list) or isinstance(y, list):
        raise ValueError("explore_trial expects a single series. Provide more specific parameters.")
    array = np.asarray(X)
    labels = np.asarray(y) if y is not None else None
    context = {
        "dataset": dataset_name,
        "params": params,
        "length": int(array.shape[0]),
        "channels": int(array.shape[1]) if array.ndim > 1 else 1,
        "labels_present": bool(labels is not None and labels.size > 0),
        "unique_labels": int(np.unique(labels).size) if labels is not None and labels.size else 0
    }
    return array, labels, context

def explore_random(dataset_name: str) -> tuple[np.ndarray, np.ndarray, dict[str, Any]]:
    """Sample a random series from the index or fallback loader for the dataset."""
    candidates = trial_index_df[(trial_index_df["dataset"] == dataset_name) & (trial_index_df["indexed"])]
    if not candidates.empty:
        sample_idx = RNG.randrange(len(candidates))
        sample_row = candidates.iloc[sample_idx]
        params = sample_row["params"] or {}
        X, y, context = explore_trial(dataset_name, **params)
        context["selection_mode"] = "index"
        context["config_file"] = sample_row["config_file"]
        return X, y, context

    X_all, y_all = load_dataset(dataset_name=dataset_name, data_root=str(DATA_ROOT), return_X_y=True)
    if isinstance(X_all, list) and X_all:
        sample_idx = RNG.randrange(len(X_all))
        X = np.asarray(X_all[sample_idx])
        y = np.asarray(y_all[sample_idx]) if y_all else np.array([])
        context = {
            "dataset": dataset_name,
            "params": {"index": sample_idx},
            "length": int(X.shape[0]),
            "channels": int(X.shape[1]) if X.ndim > 1 else 1,
            "labels_present": bool(y.size > 0),
            "unique_labels": int(np.unique(y).size) if y.size else 0,
            "selection_mode": "bulk",
            "config_file": trial_index_df.loc[trial_index_df["dataset"] == dataset_name, "config_file"].head(1).item() if not trial_index_df.empty and (trial_index_df["dataset"] == dataset_name).any() else None
        }
        return X, y, context

    if isinstance(X_all, np.ndarray):
        X = X_all
        y = np.asarray(y_all) if y_all is not None else np.array([])
        context = {
            "dataset": dataset_name,
            "params": {},
            "length": int(X.shape[0]),
            "channels": int(X.shape[1]) if X.ndim > 1 else 1,
            "labels_present": bool(y.size > 0),
            "unique_labels": int(np.unique(y).size) if y.size else 0,
            "selection_mode": "single",
            "config_file": trial_index_df.loc[trial_index_df["dataset"] == dataset_name, "config_file"].head(1).item() if not trial_index_df.empty and (trial_index_df["dataset"] == dataset_name).any() else None
        }
        return X, y, context

    raise ValueError(f"Unable to explore a trial for {dataset_name} with the current loader.")

## 5. Explore a trial by identifiers
Pick a dataset and parameters from the index to inspect the corresponding series.

In [5]:
if dataset_index_df.empty:
    raise

example_row = dataset_index_df[dataset_index_df["indexed"]].head(1)
if example_row.empty:
    print("No available detailed parameters: use explore_random for this dataset.")
else:
    dataset_name = example_row.iloc[0]["dataset"]
    example_params = example_row.iloc[0]["params"]
    print(f"Dataset selected: {dataset_name}")
    print(f"Parameters: {example_params}")
    X_example, y_example, ctx_example = explore_trial(dataset_name, **example_params)
    ctx_example["section"] = "identifiers"
    EXPLORATION_LOG.append(ctx_example)
    display(pd.DataFrame([ctx_example]))
    stats_df = pd.DataFrame(X_example).describe().T
    display(stats_df)
    plot_multichannel(X_example, y_example, title=f"{dataset_name} - preview control")

NameError: name 'dataset_index_df' is not defined

## 6. Explore a random series
Sample a random parameter combination for a chosen dataset, visualize it, and log the exploration.

In [6]:
available_datasets = dataset_index_df["dataset"].unique().tolist()
if not available_datasets:
    raise RuntimeError("No datasets registered for random exploration.")

random_dataset = RNG.choice(available_datasets)
print(f"Dataset selected for random exploration: {random_dataset}")
X_random, y_random, ctx_random = explore_random(random_dataset)
ctx_random["section"] = "random"
EXPLORATION_LOG.append(ctx_random)
display(pd.DataFrame([ctx_random]))
plot_multichannel(X_random, y_random, title=f"{random_dataset} - random draw")

NameError: name 'dataset_index_df' is not defined

## 7. Summarize explored metadata
Aggregate exploration episodes and persist them for future reuse.

## 8. Change-point statistics
Compute aggregate change-point and state-count statistics for every dataset referenced in the configuration index.

In [8]:
def _collect_trial_labels(dataset_name: str):
    """Yield label arrays for every trial associated with a dataset."""
    subset = dataset_index_df[(dataset_index_df["dataset"] == dataset_name) & (dataset_index_df["indexed"])]
    if not subset.empty:
        for _, row in subset.iterrows():
            params = row["params"] or {}
            try:
                X, y = load_dataset(dataset_name=dataset_name, data_root=str(DATA_ROOT), return_X_y=True, **params)
            except Exception as exc:
                print(f"Warning: failed to load {dataset_name} with params {params}: {exc}")
                continue
            if isinstance(y, list):
                for labels in y:
                    if labels is not None and len(labels):
                        yield np.asarray(labels)
            elif y is not None and len(np.asarray(y)):
                yield np.asarray(y)
            else:
                continue
        return
    # Fallback: load everything at once when the dataset is not indexed
    try:
        X_all, y_all = load_dataset(dataset_name=dataset_name, data_root=str(DATA_ROOT), return_X_y=True)
    except Exception as exc:
        print(f"Warning: failed bulk load for {dataset_name}: {exc}")
        return
    if isinstance(y_all, list):
        for labels in y_all:
            if labels is not None and len(labels):
                yield np.asarray(labels)
    elif y_all is not None and len(np.asarray(y_all)):
        yield np.asarray(y_all)

def _count_changes(labels: np.ndarray) -> tuple[int, int]:
    labels = np.asarray(labels)
    if labels.ndim != 1:
        labels = labels.ravel()
    if labels.size == 0:
        return 0, 0
    change_points = int(np.sum(np.diff(labels) != 0))
    states = int(np.unique(labels).size)
    return change_points, states

stats_rows: list[dict[str, Any]] = []
for dataset_name in sorted(dataset_index_df["dataset"].unique()):
    cp_counts: list[int] = []
    state_counts: list[int] = []
    for labels in _collect_trial_labels(dataset_name):
        cp, states = _count_changes(labels)
        cp_counts.append(cp)
        state_counts.append(states)
    if not cp_counts and not state_counts:
        stats_rows.append({
            "dataset": dataset_name,
            "min_change_points": np.nan,
            "avg_change_points": np.nan,
            "max_change_points": np.nan,
            "min_states": np.nan,
            "avg_states": np.nan,
            "max_states": np.nan,
            "n_trials_with_labels": 0
        })
        continue
    stats_rows.append({
        "dataset": dataset_name,
        "min_change_points": int(np.nanmin(cp_counts)) if cp_counts else np.nan,
        "avg_change_points": float(np.nanmean(cp_counts)) if cp_counts else np.nan,
        "max_change_points": int(np.nanmax(cp_counts)) if cp_counts else np.nan,
        "min_states": int(np.nanmin(state_counts)) if state_counts else np.nan,
        "avg_states": float(np.nanmean(state_counts)) if state_counts else np.nan,
        "max_states": int(np.nanmax(state_counts)) if state_counts else np.nan,
        "n_trials_with_labels": len(cp_counts)
    })
change_point_stats_df = pd.DataFrame(stats_rows)
display(change_point_stats_df.sort_values("dataset").reset_index(drop=True))

Unnamed: 0,dataset,min_change_points,avg_change_points,max_change_points,min_states,avg_states,max_states,n_trials_with_labels
0,actrectut,42,42.0,42,6,6.0,6,2
1,has,0,3.14,14,1,3.288,12,250
2,knot-tying,7,9.305556,15,4,5.527778,6,36
3,mocap,5,7.222222,10,3,5.555556,9,9
4,needle-passing,8,18.214286,27,6,7.5,9,28
5,pamap2,2,21.111111,27,2,11.111111,13,9
6,skab,1,1.941176,2,2,2.0,2,34
7,suturing,16,19.333333,36,6,7.820513,9,39
8,tssb,0,2.533333,8,1,3.186667,7,75
9,usc-had,11,11.0,11,12,12.0,12,70


In [18]:
weights = change_point_stats_df["n_trials_with_labels"]

cp_mask = weights.gt(0) & change_point_stats_df["avg_change_points"].notna()
state_mask = weights.gt(0) & change_point_stats_df["avg_states"].notna()
min_cp_mask = weights.gt(0) & change_point_stats_df["min_change_points"].notna()
max_cp_mask = weights.gt(0) & change_point_stats_df["max_change_points"].notna()
min_state_mask = weights.gt(0) & change_point_stats_df["min_states"].notna()
max_state_mask = weights.gt(0) & change_point_stats_df["max_states"].notna()

weighted_avg_cp = float(np.average(change_point_stats_df.loc[cp_mask, "avg_change_points"],
                                   weights=weights[cp_mask])) if cp_mask.any() else np.nan
weighted_min_cp = float(np.average(change_point_stats_df.loc[min_cp_mask, "min_change_points"],
                                   weights=weights[min_cp_mask])) if min_cp_mask.any() else np.nan
weighted_max_cp = float(np.average(change_point_stats_df.loc[max_cp_mask, "max_change_points"],
                                   weights=weights[max_cp_mask])) if max_cp_mask.any() else np.nan

weighted_avg_states = float(np.average(change_point_stats_df.loc[state_mask, "avg_states"],
                                       weights=weights[state_mask])) if state_mask.any() else np.nan
weighted_min_states = float(np.average(change_point_stats_df.loc[min_state_mask, "min_states"],
                                       weights=weights[min_state_mask])) if min_state_mask.any() else np.nan
weighted_max_states = float(np.average(change_point_stats_df.loc[max_state_mask, "max_states"],
                                       weights=weights[max_state_mask])) if max_state_mask.any() else np.nan

total_trials = int(weights.sum())

weighted_summary = pd.DataFrame(
    {
        "weighted_min_change_points": [weighted_min_cp],
        "weighted_avg_change_points": [weighted_avg_cp],
        "weighted_max_change_points": [weighted_max_cp],
        "weighted_min_states": [weighted_min_states],
        "weighted_avg_states": [weighted_avg_states],
        "weighted_max_states": [weighted_max_states],
        "total_trials": [total_trials],
    }
)

display(weighted_summary)

Unnamed: 0,weighted_min_change_points,weighted_avg_change_points,weighted_max_change_points,weighted_min_states,weighted_avg_states,weighted_max_states,total_trials
0,3.566781,6.486301,13.902397,3.253425,4.991438,9.517123,584


In [None]:
change_point_stats_df["avg_change_points"].describe()

count    11.000000
mean     12.456343
std      12.205477
min       1.218750
25%       2.836667
50%       9.305556
75%      18.773810
max      42.000000
Name: avg_change_points, dtype: float64

In [15]:
change_point_stats_df["avg_states"].describe()

count    11.000000
mean      6.016102
std       3.375061
min       2.000000
25%       3.237333
50%       5.555556
75%       7.660256
max      12.000000
Name: avg_states, dtype: float64

In [None]:
summary_df = pd.DataFrame(EXPLORATION_LOG)
if summary_df.empty:
    print("No explorations recorded yet. Run the previous sections.")
else:
    summary_df = summary_df.copy()
    summary_df["timestamp"] = pd.Timestamp.utcnow().isoformat()
    ordered_cols = [
        "section","dataset","selection_mode","params","length","channels","unique_labels","labels_present","config_file","timestamp"
    ]
    ordered_cols = [col for col in ordered_cols if col in summary_df.columns]
    summary_df = summary_df[ordered_cols]
    display(summary_df)
    NOTEBOOK_CACHE.mkdir(parents=True, exist_ok=True)
    summary_path = NOTEBOOK_CACHE / "dataset_explorer_summary.csv"
    summary_df.to_csv(summary_path, index=False)
    print(f"Summary saved to: {summary_path}")

Unnamed: 0,section,dataset,selection_mode,params,length,channels,unique_labels,labels_present,config_file,timestamp
0,identifiants,actrectut,,{'subject_number': 1},31392,10,6,True,,2025-11-10T12:42:11.601275+00:00
1,aleatoire,utsa,bulk,{'index': 24},8001,1,2,True,utsa.yaml,2025-11-10T12:42:11.601275+00:00


Resume sauvegarde: /home/fchavell/tsseg-project/tsseg-exp/notebooks/.cache/dataset_explorer_summary.csv
