# Interactive Benchmark Segmentation Analysis

use this notebook to interactively explore segmentation results from the benchmark. 
It connects to the local MLflow database to query run metadata and loads artifacts from the specified remote mount point.

In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output
import mlflow
import yaml

# Add current directory and src to path
sys.path.append(os.getcwd())
sys.path.append(os.path.abspath("../src"))

from benchmark_db import BenchmarkDataManager
# Try importing the loader function from source
try:
    from tsseg_exp.datasets.loaders import load_dataset
except ImportError:
    print("Warning: Could not import load_dataset from tsseg_exp.datasets.loaders. Ensure ../src is in path.")
    # Define a dummy loader to prevent crash if not found
    def load_dataset(*args, **kwargs):
        return None, None

%matplotlib inline

E0000 00:00:1769448303.522392  138750 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769448303.526149  138750 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- Configuration ---
PROJECT_ROOT = Path("..").resolve()
MLFLOW_DB_PATH = PROJECT_ROOT / "results/mlflow_snapshot.db"
TRACKING_URI = f"sqlite:///{MLFLOW_DB_PATH}"
CONFIG_PATH = PROJECT_ROOT / "configs/benchmark_config.yaml"

# Auto-detect Artifact Path
potential_roots = []
uid = os.getuid()

# 1. GVFS Mounts (Priority)
gvfs_root = Path(f"/run/user/{uid}/gvfs")
if gvfs_root.exists():
    candidate_mounts = list(gvfs_root.glob("sftp*cleps*"))
    for mount in candidate_mounts:
        # Variant A: Mount is Root (path includes home)
        potential_roots.append(mount / "home/fchavell/scratch/tsseg-exp/mlartifacts")
        # Variant B: Mount is Home (path starts at scratch)
        potential_roots.append(mount / "scratch/tsseg-exp/mlartifacts")

# 2. Local Fallback
potential_roots.append(PROJECT_ROOT / "mlartifacts")

ARTIFACT_ROOT = None
print("Searching for Artifact Root...")
for p in potential_roots:
    try:
        if p.exists():
            print(f"  Checking: {p} -> Found")
            if ARTIFACT_ROOT is None:
                ARTIFACT_ROOT = p
        else:
            print(f"  Checking: {p} -> Missing")
    except PermissionError:
        print(f"  Checking: {p} -> Permission Denied")

if ARTIFACT_ROOT is None:
    print("Warning: No accessible artifact root found. Defaulting to local path.")
    ARTIFACT_ROOT = PROJECT_ROOT / "mlartifacts"

print(f"\nTracking URI: {TRACKING_URI}")
print(f"Selected Artifact Root: {ARTIFACT_ROOT}")

# --- Initialize Manager & Analyzer ---
# We now use the robust classes from the paper reproduction
from mlflow_manager import MLflowBenchmarkManager
from benchmark_analysis import BenchmarkAnalyzer

# Initialize Manager with explicit Tracking URI & Config
manager = MLflowBenchmarkManager(CONFIG_PATH, tracking_uri=TRACKING_URI)
analyzer = BenchmarkAnalyzer(manager)

Searching for Artifact Root...
  Checking: /home/fchavell/tsseg-project/tsseg-exp/mlartifacts -> Found

Tracking URI: sqlite:////home/fchavell/tsseg-project/tsseg-exp/results/mlflow_snapshot.db
Selected Artifact Root: /home/fchavell/tsseg-project/tsseg-exp/mlartifacts


2026/01/26 18:25:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/26 18:25:06 INFO mlflow.store.db.utils: Updating database tables
2026/01/26 18:25:06 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/26 18:25:06 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/26 18:25:06 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/26 18:25:06 INFO alembic.runtime.migration: Will assume non-transactional DDL.


In [3]:
from scipy.optimize import linear_sum_assignment
import matplotlib.lines as mlines

class SegmentationDashboard:
    def __init__(self, analyzer, artifact_root):
        self.analyzer = analyzer
        self.manager = analyzer.manager
        self.artifact_root = Path(artifact_root)
        
        # State Flags
        self._updating = False
        
        # Data Cache
        self.df_parents = pd.DataFrame()
        self.df_children = pd.DataFrame()
        
        # --- UI Components ---
        self.modes = ['default', 'guided', 'grid_default', 'grid_guided']
        self.dropdown_mode = widgets.Dropdown(
            options=self.modes, 
            description="Mode:",
            value=self.modes[0]
        )
        
        self.dropdown_algo = widgets.Dropdown(description="Algorithm:")
        self.dropdown_dataset = widgets.Dropdown(description="Dataset:")
        self.dropdown_series = widgets.Dropdown(description="Series:", layout={'width': 'max-content'})
        
        self.out = widgets.Output()
        
        # Bindings
        self.dropdown_mode.observe(self.on_mode_change, names='value')
        self.dropdown_algo.observe(self.update_series_list, names='value')
        self.dropdown_dataset.observe(self.update_series_list, names='value')
        self.dropdown_series.observe(self.on_series_change, names='value')
        
        display(
            widgets.VBox([
                self.dropdown_mode,
                widgets.HBox([self.dropdown_algo, self.dropdown_dataset]),
                self.dropdown_series,
                self.out
            ])
        )
        
        # Trigger Initial Load
        self.on_mode_change({'new': self.dropdown_mode.value})

    def on_mode_change(self, change):
        if self._updating: return
        
        mode = change['new']
        if not mode: return
        
        self._updating = True
        self.dropdown_algo.unobserve(self.update_series_list, names='value')
        self.dropdown_dataset.unobserve(self.update_series_list, names='value')
        
        try:
            with self.out:
                clear_output(wait=True)
                print(f"Fetching Parent Runs for Mode: {mode}...")
                
                key_map = {
                    'default': ['unsupervised'],
                    'guided': ['semi_supervised'],
                    'grid_default': ['grid_unsupervised'],
                    'grid_guided': ['grid_supervised']
                }
                keys = key_map.get(mode, [])
                
                raw_parents = self.analyzer.fetch_parent_runs_stats(keys)
                self.df_parents = self.analyzer.validate_completeness(raw_parents, strategy='merge')
                
                if self.df_parents.empty:
                    print("No valid runs found for this mode.")
                    self.dropdown_algo.options = []
                    self.dropdown_dataset.options = []
                    self.dropdown_series.options = []
                    self.dropdown_algo.value = None
                    self.dropdown_dataset.value = None
                else:
                    print(f"Found {len(self.df_parents)} valid parent runs.")
                    
                    algos = sorted(self.df_parents['algorithm'].dropna().unique())
                    datasets = sorted(self.df_parents['dataset'].dropna().unique())
                    
                    self.dropdown_algo.options = algos
                    self.dropdown_dataset.options = datasets
                    
                    if algos: self.dropdown_algo.value = algos[0]
                    else: self.dropdown_algo.value = None
                        
                    if datasets: self.dropdown_dataset.value = datasets[0]
                    else: self.dropdown_dataset.value = None
                    
        finally:
            self.dropdown_algo.observe(self.update_series_list, names='value')
            self.dropdown_dataset.observe(self.update_series_list, names='value')
            self._updating = False
        
        self.update_series_list(None)

    def update_series_list(self, change):
        if self._updating: return
        
        self._updating = True
        self.dropdown_series.unobserve(self.on_series_change, names='value')
        
        try:
            algo = self.dropdown_algo.value
            dataset = self.dropdown_dataset.value
            
            if self.df_parents.empty or not algo or not dataset:
                self.dropdown_series.options = []
                self.dropdown_series.value = None
                return
                
            mask = (self.df_parents['algorithm'] == algo) & (self.df_parents['dataset'] == dataset)
            relevant_parents = self.df_parents[mask]
            
            if relevant_parents.empty:
                self.dropdown_series.options = []
                self.dropdown_series.value = None
                return
                
            with self.out:
                print(f"Fetching children for {len(relevant_parents)} parent(s)...", end='\r')
                
                children_metrics = self.analyzer.fetch_metrics_for_parents(
                    relevant_parents, 
                    deduplicate_series=True
                )
                
                if children_metrics.empty:
                    print(f"No children found for {algo} on {dataset}.")
                    self.dropdown_series.options = []
                    self.dropdown_series.value = None
                    return

                self.df_children = children_metrics
                
                if 'trial_index' in self.df_children.columns:
                    self.df_children['trial_index'] = pd.to_numeric(self.df_children['trial_index'], errors='coerce').fillna(-1).astype(int)
                    self.df_children = self.df_children.sort_values('trial_index')
                
                options = []
                for _, row in self.df_children.iterrows():
                    t_idx = row.get('trial_index', '?')
                    run_id = row['run_id']
                    
                    label = f"Series {t_idx}"
                    if 'metrics.f1_score' in row:
                        label += f" | F1: {row['metrics.f1_score']:.2f}"
                    
                    label += f" ({run_id[:6]})"
                    options.append((label, run_id))
                
                self.dropdown_series.options = options
                print(f"Found {len(options)} series.        ")
                
                if options:
                    self.dropdown_series.value = options[0][1]
                else:
                    self.dropdown_series.value = None
                    
        finally:
            self.dropdown_series.observe(self.on_series_change, names='value')
            self._updating = False
        
        if self.dropdown_series.value:
            self.on_series_change({'new': self.dropdown_series.value})

    def _get_run_artifact_path(self, run_info):
        uri = run_info.artifact_uri
        run_dir = None
        
        if uri:
            if uri.startswith("file://"):
                p = Path(uri.replace("file://", ""))
                if p.exists():
                    run_dir = p
                elif not p.is_absolute():
                     p_rel = PROJECT_ROOT / p
                     if p_rel.exists():
                         run_dir = p_rel
        
        if run_dir is None:
            run_dir = self.artifact_root / run_info.experiment_id / run_info.run_id / "artifacts"
            
        return run_dir

    def _load_ground_truth_via_loader(self, dataset_name, run_params):
        try:
            loader_params = {}
            valid_keys = {
                'subject_number', 'target_number', 'ts_id', 'trial', 'dimension',
                'ts_name', 'version', 'filename', 'variables', 'subject', 'target',
                'trial_index'
            }
            int_keys = ['subject_number', 'target_number', 'ts_id', 'trial', 'dimension', 'trial_index']
            
            for k, v in run_params.items():
                clean_k = k.split('.')[-1]
                if clean_k in valid_keys:
                    if clean_k in int_keys:
                        try: loader_params[clean_k] = int(v)
                        except: loader_params[clean_k] = v
                    else:
                        loader_params[clean_k] = v
            
            if 'subject' in loader_params and 'subject_number' not in loader_params:
                try: loader_params['subject_number'] = int(loader_params['subject'])
                except: pass

            data_root = PROJECT_ROOT / "data"
            X, y = load_dataset(dataset_name=dataset_name, data_root=data_root, return_X_y=True, **loader_params)
            
            if isinstance(X, (list, tuple)):
                if len(X) > 0:
                    idx = 0
                    if 'trial_index' in loader_params:
                        idx = int(loader_params['trial_index'])
                    elif 'trial' in loader_params and isinstance(loader_params['trial'], int):
                        # Some loaders use 'trial' as index
                        idx = loader_params['trial']
                    
                    if 0 <= idx < len(X):
                        print(f"Selecting series index {idx} from {len(X)} returned.")
                        X = X[idx]
                        if isinstance(y, (list, tuple)) and len(y) > idx:
                            y = y[idx]
                        elif isinstance(y, (list, tuple)) and len(y) > 0:
                            y = y[0]
                    else:
                        print(f"Warning: Index {idx} out of bounds (0-{len(X)-1}). Using 0.")
                        X = X[0]
                        if isinstance(y, (list, tuple)) and len(y) > 0:
                            y = y[0]
                else:
                    return None, None
            
            if X is not None: X = np.array(X)
            if y is not None: y = np.array(y)

            return X, y
        except Exception as e:
            print(f"Loader failed for {dataset_name}: {e}")
            return None, None

    def on_series_change(self, change):
        if self._updating: return
        
        run_id = change['new']
        if not run_id: return
        
        with self.out:
            clear_output(wait=True)
            try:
                run = mlflow.get_run(run_id)
            except Exception as e:
                print(f"Error fetching run {run_id}: {e}")
                return

            params = run.data.params
            dataset_name = params.get('dataset') or params.get('dataset_name')
            if not dataset_name and self.dropdown_dataset.value:
                dataset_name = self.dropdown_dataset.value
            
            algorithm_name = params.get('algorithm') or params.get('algorithm_name') or self.dropdown_algo.value
            
            print(f"Run: {run_id} | Algo: {algorithm_name} | Data: {dataset_name}")
            
            run_dir = self._get_run_artifact_path(run.info)
            if run_dir.exists():
                print(f"Artifacts: {run_dir} -> Found")
            else:
                 print(f"Artifacts: {run_dir} -> Missing")
            
            data = {}
            if dataset_name:
                X, y = self._load_ground_truth_via_loader(dataset_name, params)
                if X is not None: data['X'] = X
                if y is not None: data['y'] = y
            
            if run_dir.exists():
                files = [
                    "segmentation.npy", "predicted_labels.npy", 
                    "predicted_change_points.npy", "ground_truth_change_points.npy"
                ]
                for f in files:
                    matches = list(run_dir.glob(f"**/{f}"))
                    if matches:
                        try:
                            data[f] = np.load(matches[0], allow_pickle=True)
                        except: pass
            
            if data:
                meta = {'algorithm': algorithm_name, 'dataset': dataset_name}
                self.plot_segmentation(data, meta)
            else:
                print("No data to display. (Loader failed and no artifacts found)")

    def _get_optimal_mapping(self, y_true, y_pred):
        """Aligns y_pred colors to y_true using Hungarian algorithm."""
        # Normalize inputs
        y_true = np.array(y_true).flatten()
        y_pred = np.array(y_pred).flatten()
        
        # Filter out NaNs if any, though usually int
        # Limit to min length
        l = min(len(y_true), len(y_pred))
        y_true = y_true[:l]
        y_pred = y_pred[:l]
        
        labels_true = np.unique(y_true)
        labels_pred = np.unique(y_pred)
        
        # Build Confusion Matrix
        t_map = {l: i for i, l in enumerate(labels_true)}
        p_map = {l: i for i, l in enumerate(labels_pred)}
        
        C = np.zeros((len(labels_true), len(labels_pred)), dtype=int)
        for t, p in zip(y_true, y_pred):
            C[t_map[t], p_map[p]] += 1
            
        # Hungarian Algorithm
        row_ind, col_ind = linear_sum_assignment(C, maximize=True)
        
        # Build Color Mappings
        # GT Map: Just index 0..N
        color_map_true = {l: i for i, l in enumerate(labels_true)}
        
        # Pred Map: Align matched, append new
        color_map_pred = {}
        matched_preds = set()
        
        for r, c in zip(row_ind, col_ind):
            gt_label = labels_true[r]
            pred_label = labels_pred[c]
            color_map_pred[pred_label] = color_map_true[gt_label]
            matched_preds.add(pred_label)
            
        next_color = len(labels_true)
        for l in labels_pred:
            if l not in matched_preds:
                color_map_pred[l] = next_color
                next_color += 1
                
        return color_map_true, color_map_pred

    def plot_segmentation(self, data, meta):
        has_X = 'X' in data
        
        if has_X:
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8), sharex=True, gridspec_kw={'height_ratios': [2, 1]})
        else:
            fig, ax2 = plt.subplots(figsize=(15, 4))
            ax1 = None

        # --- Top Panel: Time Series ---
        if has_X:
            X = data['X']
            if not isinstance(X, np.ndarray):
                try: X = np.array(X)
                except: pass

            if isinstance(X, np.ndarray) and X.ndim > 1:
                # Plot max 10 dims
                for i in range(min(X.shape[1], 10)):
                    ax1.plot(X[:, i], alpha=0.5, linewidth=1)
            elif isinstance(X, np.ndarray):
                ax1.plot(X, color='black', alpha=0.6)
            
            ax1.set_title(f"Time Series: {meta['dataset']}")
            ax1.set_ylabel("Value")
            ax1.grid(True, alpha=0.3)
            
            # optional: GT Change Points overlays on TS
            gt_cps = None
            if "ground_truth_change_points.npy" in data:
                gt_cps = data["ground_truth_change_points.npy"]
            elif 'y' in data:
                 y = data['y']
                 try: gt_cps = np.where(y[:-1] != y[1:])[0]
                 except: pass 
            
            if gt_cps is not None:
                gt_cps = np.array(gt_cps).flatten()
                for cp in gt_cps:
                    ax1.axvline(x=cp, color='green', linestyle='--', alpha=0.5)

            # optional: Predicted Change Points overlays on TS
            pred_cps = None
            if "predicted_change_points.npy" in data:
                pred_cps = data["predicted_change_points.npy"]
            
            if pred_cps is not None:
                pred_cps = np.array(pred_cps).flatten()
                for cp in pred_cps:
                    ax1.axvline(x=cp, color='red', linestyle='--', alpha=0.5)

            # Legend
            if gt_cps is not None or pred_cps is not None:
                handles = []
                if gt_cps is not None:
                    handles.append(mlines.Line2D([], [], color='green', linestyle='--', label='GT CP'))
                if pred_cps is not None:
                    handles.append(mlines.Line2D([], [], color='red', linestyle='--', label='Pred CP'))
                ax1.legend(handles=handles, loc='upper right')

        # --- Bottom Panel: Colored Segmentation Bars ---
        # Prepare Data
        y_true = np.array([])
        y_pred = np.array([])
        
        if 'y' in data: 
            y_true = np.array(data['y']).flatten()
            
        if "predicted_labels.npy" in data:
            y_pred = np.array(data["predicted_labels.npy"]).flatten()
            
        # Determine Length for visualization
        T = 0
        if len(y_true) > 0: T = max(T, len(y_true))
        if len(y_pred) > 0: T = max(T, len(y_pred))
        
        if T == 0 and has_X: T = len(data['X'])
        
        if T > 0:
            # Color Matching
            if len(y_true) > 0 and len(y_pred) > 0:
                cmap_t, cmap_p = self._get_optimal_mapping(y_true, y_pred)
            else:
                # Fallback simple maps
                cmap_t = {l: i for i, l in enumerate(np.unique(y_true))} if len(y_true) > 0 else {}
                cmap_p = {l: i for i, l in enumerate(np.unique(y_pred))} if len(y_pred) > 0 else {}

            # Construct Image Grid (2 Rows: GT, Pred)
            # Use float/nan for empty spaces
            grid = np.full((2, T), np.nan)
            
            # Fill GT (Row 0)
            if len(y_true) > 0:
                indices = [cmap_t.get(val, 0) for val in y_true]
                grid[0, :len(indices)] = indices
                
            # Fill Pred (Row 1)
            if len(y_pred) > 0:
                indices = [cmap_p.get(val, 0) for val in y_pred]
                grid[1, :len(indices)] = indices
            
            # Plot
            # Use interpolation='nearest' and aspect='auto' for distinct blocks
            ax2.imshow(
                grid, 
                aspect='auto', 
                interpolation='nearest', 
                cmap='tab20',
                extent=[0, T, -0.5, 1.5] # y-range -0.5 to 1.5 allows ticks at 0 and 1
            )
            
            # Formatting
            ax2.set_yticks([1, 0])
            ax2.set_yticklabels(["GT states", "Predicted states"])
            
            # Separator line
            ax2.axhline(y=0.5, color='black', linewidth=1.5)
            
            ax2.set_title(f"Segmentation: {meta['algorithm']}")
            ax2.set_xlabel("Time Step")
            ax2.grid(False) # Grid interferes with bars
            
        plt.tight_layout()
        plt.show()

# Start Dashboard
dashboard = SegmentationDashboard(analyzer, ARTIFACT_ROOT)

VBox(children=(Dropdown(description='Mode:', options=('default', 'guided', 'grid_default', 'grid_guided'), val…

In [4]:
from tsseg.metrics.change_point_detection import Covering

# Target Run Parameters
run_id = "ed38d4935f4346f19449319228d2fc05"
print(f"--- Analyzing Run: {run_id} (GGS | Mocap | Series 1) ---")

# 1. Fetch Logged Metric from DB
run = mlflow.get_run(run_id)
# Note: Metric name might vary slightly depending on logging prefix (e.g. 'covering', 'metrics.covering')
logged_score = run.data.metrics.get('covering_score')
if logged_score is None:
    # Try alternate keys often found in MLflow
    logged_score = run.data.metrics.get('metrics.covering_score')

print(f"DB Logged Covering Score: {logged_score}")

# 2. Load Artifacts (True & Pred CPs)
# We use ARTIFACT_ROOT defined in the configuration cell above
run_artifact_dir = ARTIFACT_ROOT / run.info.experiment_id / run_id / "artifacts"
print(f"Loading artifacts from: {run_artifact_dir}")

try:
    # Load Change Points directly
    y_true = np.load(run_artifact_dir / "ground_truth_change_points.npy", allow_pickle=True).flatten()
    y_pred = np.load(run_artifact_dir / "predicted_change_points.npy", allow_pickle=True).flatten()

    print(f"\nLoaded Ground Truth CPs: {y_true}")
    print(f"Loaded Predicted CPs:    {y_pred}")

    # 3. Recompute Metric explicitly using current codebase
    # We set convert_labels_to_segments=False because we are providing CPs directly, not labels.
    covering_metric = Covering(convert_labels_to_segments=False)
    
    # Compute
    result = covering_metric.compute(y_true, y_pred)
    recomputed_score = result['score']
    
    print(f"\nRecomputed Covering Score: {recomputed_score}")
    
    if logged_score is not None:
        diff = abs(logged_score - recomputed_score)
        print(f"Difference: {diff:.6f}")
        if diff < 1e-6:
            print(">> MATCH: Scores are identical.")
        else:
            print(">> DIFF: Scores differ (check metric version or boundary handling).")

except FileNotFoundError as e:
    print(f"Error: Could not find artifact file. {e}")
except Exception as e:
    print(f"An error occurred during calculation: {e}")

--- Analyzing Run: ed38d4935f4346f19449319228d2fc05 (GGS | Mocap | Series 1) ---
DB Logged Covering Score: 0.6302778974579922
Loading artifacts from: /home/fchavell/tsseg-project/tsseg-exp/mlartifacts/10/ed38d4935f4346f19449319228d2fc05/artifacts

Loaded Ground Truth CPs: [   0  572 1012 1232 1408]
Loaded Predicted CPs:    [   0  564 1408]
Covering legacy 0.6302778974579922
y_true: [0, 572, 1012, 1232, 1408]
y_pred: [0, 564, 1408]

Recomputed Covering Score: 0.6302778974579922
Difference: 0.000000
>> MATCH: Scores are identical.


In [6]:
import random

print("--- Auditing Predicted Change Point Boundaries (Random Sample 1000 Runs) ---")

# 1. Fetch all runs
search_results = mlflow.search_runs(search_all_experiments=True, output_format="list")
print(f"Total runs found: {len(search_results)}")

# Filter finished runs
finished_runs = [r for r in search_results if r.info.status == "FINISHED"]
print(f"Finished runs: {len(finished_runs)}")

# 2. Sample 1000 runs
sample_size = 1000
if len(finished_runs) > sample_size:
    sampled_runs = random.sample(finished_runs, sample_size)
else:
    sampled_runs = finished_runs
print(f"Auditing {len(sampled_runs)} runs...")

# Counters
count_has_0 = 0
count_has_T = 0
count_no_pred = 0
count_missing_artifacts = 0
boundary_issues = []

for idx, run in enumerate(sampled_runs):
    run_id = run.info.run_id
    exp_id = run.info.experiment_id
    
    # Construct artifact path
    artifact_path = ARTIFACT_ROOT / exp_id / run_id / "artifacts"
    
    # We need both True and Pred to know what T is
    pred_path = artifact_path / "predicted_change_points.npy"
    true_path = artifact_path / "ground_truth_change_points.npy"
    
    if not pred_path.exists() or not true_path.exists():
        count_missing_artifacts += 1
        continue
        
    try:
        y_pred = np.load(pred_path, allow_pickle=True).flatten()
        y_true = np.load(true_path, allow_pickle=True).flatten()
        
        # Determine T from Ground Truth (last element of y_true usually)
        # Assuming y_true is a list of change points, the last one is the length T
        if len(y_true) == 0:
            continue
            
        T = y_true[-1] 
        
        if len(y_pred) == 0:
            count_no_pred += 1
            continue
            
        # Check 0
        has_0 = (0 in y_pred)
        if has_0: count_has_0 += 1
        
        # Check T
        has_T = (T in y_pred)
        if has_T: count_has_T += 1
        
        if not has_0 or not has_T:
            boundary_issues.append({
                'run_id': run_id,
                'algo': run.data.params.get('algorithm'),
                'dataset': run.data.params.get('dataset'),
                'has_0': has_0,
                'has_T': has_T,
                'pred': y_pred,
                'T': T
            })
            
    except Exception as e:
        print(f"Error processing run {run_id}: {e}")

print("\n--- Audit Results ---")
print(f"Runs checked: {len(sampled_runs) - count_missing_artifacts} (skipped {count_missing_artifacts} missing artifacts)")
print(f"Runs with '0' included: {count_has_0} ({count_has_0/(len(sampled_runs)-count_missing_artifacts):.1%})")
print(f"Runs with 'T' included: {count_has_T} ({count_has_T/(len(sampled_runs)-count_missing_artifacts):.1%})")
print(f"\nRuns missing boundaries: {len(boundary_issues)}")

if boundary_issues:
    print("\nSample of runs with missing boundaries:")
    for issue in boundary_issues[:10]:
        print(f"Run {issue['run_id']} ({issue['algo']}): Includes 0? {issue['has_0']}, Includes T? {issue['has_T']} (T={issue['T']}) -> Pred: {issue['pred']}")

--- Auditing Predicted Change Point Boundaries (Random Sample 1000 Runs) ---
Total runs found: 100000
Finished runs: 98750
Auditing 1000 runs...

--- Audit Results ---
Runs checked: 983 (skipped 17 missing artifacts)
Runs with '0' included: 983 (100.0%)
Runs with 'T' included: 983 (100.0%)

Runs missing boundaries: 0


In [11]:
print(f"--- Recomputing Metrics for All Runs on TSSB ---")

# 0. Debug: Check available values in finished_runs
seen_algos = set()
seen_datasets = set()
for r in finished_runs:
    p = r.data.params
    a = p.get('algorithm') or p.get('algorithm_name')
    d = p.get('dataset') or p.get('dataset_name')
    if a: seen_algos.add(a)
    if d: seen_datasets.add(d)

print(f"Available Algorithms: {sorted(list(seen_algos))}")
print(f"Available Datasets:   {sorted(list(seen_datasets))}")

# 1. Filter runs for ANY Algo on TSSB (Robust Filtering)
tssb_runs = []
target_dataset = 'tssb'

for r in finished_runs:
    p = r.data.params
    # Handle key variations
    dataset = p.get('dataset') or p.get('dataset_name')
    
    if dataset:
        if dataset.lower() == target_dataset.lower():
            tssb_runs.append(r)

print(f"Found {len(tssb_runs)} matching runs for dataset {target_dataset}.")

# Metric setup
# Ensure import is available if running this cell standalone (assuming tsseg installed)
try:
    from tsseg.metrics.change_point_detection import Covering
except ImportError:
    pass # Assume loaded from previous cells

covering_metric = Covering(convert_labels_to_segments=False)
comparison_results = []

print("Starting audit loop...")
for idx, run in enumerate(tssb_runs):
    run_id = run.info.run_id
    algo_name = run.data.params.get('algorithm') or run.data.params.get('algorithm_name') or 'unknown'

    # Get Logged Score
    logged_score = run.data.metrics.get('covering_score') or run.data.metrics.get('metrics.covering_score')
    
    # --- Smart Artifact Resolution ---
    run_artifact_dir = None
    uri = run.info.artifact_uri
    
    # 1. Try URI if local file
    if uri and uri.startswith("file://"):
        p = Path(uri.replace("file://", ""))
        if p.exists():
            run_artifact_dir = p
        elif not p.is_absolute():
             # Try relative to project root
             p_rel = PROJECT_ROOT / p
             if p_rel.exists():
                 run_artifact_dir = p_rel
    
    # 2. Fallback to constructed path
    if run_artifact_dir is None:
        run_artifact_dir = ARTIFACT_ROOT / run.info.experiment_id / run_id / "artifacts"
        
    pred_path = run_artifact_dir / "predicted_change_points.npy"
    true_path = run_artifact_dir / "ground_truth_change_points.npy"
    
    if not pred_path.exists() or not true_path.exists():
        if idx < 5: # Debug first 5 failures
            print(f"SKIP {run_id[:8]}: Files missing at {run_artifact_dir}")
            print(f"      (URI: {uri})")
        continue

    try:
        y_pred = np.load(pred_path, allow_pickle=True).flatten()
        y_true = np.load(true_path, allow_pickle=True).flatten()
        
        # Recompute
        result = covering_metric.compute(y_true, y_pred)
        recomputed_score = result['score']
        
        # Handle logged_score being None
        l_score_val = logged_score if logged_score is not None else -1.0
        
        diff = abs(l_score_val - recomputed_score)
        is_match = (diff < 1e-6) if logged_score is not None else False
        
        comparison_results.append({
            'run_id': run_id,
            'algo': algo_name,
            'logged': logged_score,
            'recomputed': recomputed_score,
            'diff': diff,
            'match': is_match
        })
        
    except Exception as e:
        print(f"Error on run {run_id}: {e}")

# Summary
if comparison_results:
    df_comp = pd.DataFrame(comparison_results)
    print(f"\nStats for {len(df_comp)} successful recalculations:")
    match_rate = df_comp['match'].mean()
    print(f"Match Rate: {match_rate:.1%}")
    metric_diffs = df_comp[~df_comp['match']]
    if not metric_diffs.empty:
        print(f"Average Diff (mismatches): {metric_diffs['diff'].mean():.6f}")
    
    print("\nTop differences:")
    print(df_comp.sort_values('diff', ascending=False).head(10)[['run_id', 'algo', 'logged', 'recomputed', 'diff']])
    
    # Algo breakdown
    print("\nMismatch Rate by Algorithm:")
    print(df_comp.groupby('algo')['match'].mean().sort_values())
else:
    print("No comparison results generated (all artifacts missing or errored).")

--- Recomputing Metrics for All Runs on TSSB ---
Available Algorithms: ['amoc', 'binseg', 'clap', 'clasp', 'eagglo', 'espresso', 'fluss', 'hdp-hsmm-legacy', 'hidalgo', 'icid', 'kcpd', 'pelt', 'prophet', 'random', 'tglad', 'ticc', 'time2state', 'tire', 'tscp2', 'vsax', 'window']
Available Datasets:   ['actrectut', 'has', 'mocap', 'pamap2', 'skab', 'tssb', 'usc-had', 'utsa']
Found 330 matching runs for dataset tssb.
Starting audit loop...
SKIP 1b202645: Files missing at /run/user/675409/gvfs/sftp:host=cleps,user=fchavell/home/fchavell/scratch/tsseg-exp/mlartifacts/15/1b20264576a748a8864f490b0836d55e/artifacts
      (URI: /scratch/fchavell/tsseg-exp/mlartifacts/15/1b20264576a748a8864f490b0836d55e/artifacts)
SKIP 3c1400cd: Files missing at /run/user/675409/gvfs/sftp:host=cleps,user=fchavell/home/fchavell/scratch/tsseg-exp/mlartifacts/15/3c1400cdea0547d38be4da2063deb8b5/artifacts
      (URI: /scratch/fchavell/tsseg-exp/mlartifacts/15/3c1400cdea0547d38be4da2063deb8b5/artifacts)
SKIP 145d2147

In [11]:
import sqlite3

target_algo_search = "clasp"    # e.g., "clasp", "pelt"
target_dataset_search = "tssb"  # e.g., "tssb", "has"

print(f"--- DIAGNOSTIC SQL Search: Algo ~= '{target_algo_search}', Dataset ~= '{target_dataset_search}' (Exp ID: 10) ---")

# Access variables defined in previous cells
db_path = str(MLFLOW_DB_PATH)

# SQL Query: Broad search to see what runs exist and what tags/params they have
query = f"""
SELECT 
    r.run_uuid as run_id, 
    r.end_time,
    r.status,
    p_algo.value as algorithm,
    p_data.value as dataset,
    -- Check if 'trial_index' param exists
    (SELECT value FROM params WHERE run_uuid = r.run_uuid AND key = 'trial_index') as trial_index,
    -- Check if 'mlflow.parentRunId' tag exists
    (SELECT value FROM tags WHERE run_uuid = r.run_uuid AND key = 'mlflow.parentRunId') as parent_run_id,
     -- Check covering score
    (SELECT value FROM metrics WHERE run_uuid = r.run_uuid AND key IN ('covering_score', 'metrics.covering_score') ORDER BY value DESC LIMIT 1) as score
FROM runs r
JOIN params p_algo ON r.run_uuid = p_algo.run_uuid 
JOIN params p_data ON r.run_uuid = p_data.run_uuid 
WHERE r.experiment_id = '10'
  AND p_algo.key IN ('algorithm', 'algorithm_name') 
  AND p_algo.value LIKE '%{target_algo_search}%'
  AND p_data.key IN ('dataset', 'dataset_name') 
  AND p_data.value LIKE '%{target_dataset_search}%'
ORDER BY r.end_time DESC
LIMIT 20
"""

try:
    with sqlite3.connect(db_path) as conn:
        df_found = pd.read_sql_query(query, conn)
    
    if not df_found.empty:
        # Convert ms timestamp to date
        if 'end_time' in df_found.columns:
            df_found['date'] = pd.to_datetime(df_found['end_time'], unit='ms')
            
        print(f"Found {len(df_found)} raw runs matching algo/dataset in Exp 10.")
        display(df_found[['run_id', 'status', 'trial_index', 'parent_run_id', 'score']].head(20))
        
        # Check distribution
        n_parents = df_found['parent_run_id'].isna().sum()
        n_children = df_found['parent_run_id'].notna().sum()
        print(f"\nStructure Analysis:")
        print(f"- Runs WITHOUT parent_run_id (potential parents): {n_parents}")
        print(f"- Runs WITH parent_run_id (children): {n_children}")
    else:
        print("No runs found AT ALL for this Algo/Dataset in Exp 10.")
        print("Check if Experiment ID is correct (Unsupervised usually Exp 1?) or if Algo/Dataset names match.")

except Exception as e:
    print(f"SQL Error: {e}")
    print(f"Query was:\n{query}")

--- DIAGNOSTIC SQL Search: Algo ~= 'clasp', Dataset ~= 'tssb' (Exp ID: 10) ---
Found 1 raw runs matching algo/dataset in Exp 10.


Unnamed: 0,run_id,status,trial_index,parent_run_id,score
0,def3aeb1cfc84943be94df6efb4490e2,FINISHED,,,0.629993



Structure Analysis:
- Runs WITHOUT parent_run_id (potential parents): 1
- Runs WITH parent_run_id (children): 0


In [12]:
import sqlite3
import pandas as pd

# Utilise le chemin défini plus haut dans votre notebook
print(f"Inspection de la base : {MLFLOW_DB_PATH}")

try:
    with sqlite3.connect(MLFLOW_DB_PATH) as conn:
        # 1. Lister toutes les tables
        tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
        print("\n--- Tables disponibles ---")
        display(tables)
        
        # 2. Voir les colonnes de la table 'runs' (principale)
        if 'runs' in tables['name'].values:
            schema = pd.read_sql_query("PRAGMA table_info(runs);", conn)
            print("\n--- Schéma de la table 'runs' ---")
            display(schema[['name', 'type', 'notnull']]) 

        # 3. Voir un échantillon des paramètres (params) pour comprendre les clés
        if 'params' in tables['name'].values:
            print("\n--- Exemple de paramètres (clés utilisées) ---")
            display(pd.read_sql_query("SELECT DISTINCT key FROM params LIMIT 10", conn))

except Exception as e:
    print(f"Erreur d'accès à la DB: {e}")

Inspection de la base : /home/fchavell/tsseg-project/tsseg-exp/results/mlflow_snapshot.db

--- Tables disponibles ---


Unnamed: 0,name
0,experiments
1,alembic_version
2,experiment_tags
3,registered_models
4,runs
5,registered_model_tags
6,model_versions
7,latest_metrics
8,metrics
9,registered_model_aliases



--- Schéma de la table 'runs' ---


Unnamed: 0,name,type,notnull
0,run_uuid,VARCHAR(32),1
1,name,VARCHAR(250),0
2,source_type,VARCHAR(20),0
3,source_name,VARCHAR(500),0
4,entry_point_name,VARCHAR(50),0
5,user_id,VARCHAR(256),0
6,status,VARCHAR(9),0
7,start_time,BIGINT,0
8,end_time,BIGINT,0
9,source_version,VARCHAR(50),0



--- Exemple de paramètres (clés utilisées) ---


Unnamed: 0,key
0,algo_K
1,algo_K_states
2,algo__target_
3,algo_a
4,algo_alpha
5,algo_alphabet_size
6,algo_axis
7,algo_b
8,algo_batch_size
9,algo_beta
