In [None]:
import os
from pathlib import Path
import random
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd

from gift_eval.data import Dataset, Term

# -----------------------
# Configuration
# -----------------------
min_len = 512 + 256  # length of each sampled window
stride = 128         # ONLY used to compute how many samples to draw
max_per_dataset = 1024  # upper bound on windows sampled from one sub-dataset
seed = 42            # for reproducibility

# Synthetic time index per sampled window: every window is stamped with the
# same artificial timeline starting at `base_start`.
base_start = pd.Timestamp("2000-01-01")
# Lowercase "h" is the hourly alias. The uppercase "H" spelling is deprecated
# since pandas 2.2 and emits a FutureWarning; "h" is accepted by older
# releases as well.
synthetic_freq = "h"  # hourly

# Seed both global generators so a fresh "Restart & Run All" is reproducible.
random.seed(seed)
np.random.seed(seed)

# -----------------------
# Helpers
# -----------------------
def find_subdatasets(root: Path) -> List[str]:
    """
    Discover sub-dataset directories below `root`.

    A directory counts as a sub-dataset when it contains both a
    "dataset_info.json" and a "state.json" file. The search is recursive.

    Returns the directories as strings relative to `root`, de-duplicated
    and sorted.
    """
    discovered = {
        str(marker.parent.relative_to(root))
        for marker in root.rglob("dataset_info.json")
        if (marker.parent / "state.json").exists()
    }
    return sorted(discovered)

def iter_univariate_series(ds_obj: Dataset):
    """
    Walk every row of `ds_obj.hf_dataset` and yield `(row_idx, length)`.

    Each row's "target" is reduced to a single series first: when the target
    reports more than one dimension (multivariate, laid out as (D, T)), only
    its last entry `target[-1]` is kept; otherwise the target is used as is.
    `length` is the size of the last axis of that series.
    """
    rows = ds_obj.hf_dataset
    for row_idx in range(rows.num_rows):
        # Per the original author, targets arrive as numpy arrays because the
        # dataset is loaded with_format("numpy"); objects without `ndim` are
        # treated as univariate.
        values = rows[row_idx]["target"]
        if getattr(values, "ndim", 1) > 1:
            values = values[-1]  # keep only the last feature
        yield (row_idx, np.asarray(values).shape[-1])

def get_series_array(ds_obj: Dataset, row_idx: int):
    """
    Fetch the series stored at `row_idx` of `ds_obj.hf_dataset` as a numpy array.

    A target that reports more than one dimension is treated as multivariate
    and reduced to its last entry (`target[-1]`); any other target is
    returned whole.
    """
    values = ds_obj.hf_dataset[row_idx]["target"]
    is_multivariate = getattr(values, "ndim", 1) > 1
    return np.asarray(values[-1] if is_multivariate else values)

def candidate_count_for_length(T: int, min_len: int, stride: int) -> int:
    """
    How many windows to draw from a series of length `T`.

    Series shorter than `min_len` get 0. Otherwise the result is the number
    of stride-spaced windows of length `min_len` that would fit. The stride
    only sets this count; the actual window starts are not aligned to it.
    """
    spare = T - min_len  # room left after placing one window at position 0
    if spare < 0:
        return 0
    return spare // stride + 1

def proportional_allocation(total_picks: int, counts: np.ndarray) -> np.ndarray:
    """
    Split `total_picks` into per-series integer quotas proportional to `counts`.

    Method: take the floor of each ideal (fractional) share, then hand the
    leftover picks out one at a time, visiting series in order of decreasing
    fractional part and skipping any series already at its `counts[i]` cap.
    The result is finally clipped element-wise to `counts`, so no quota
    exceeds its cap; consequently the quotas add up to `total_picks` only
    when `total_picks` does not exceed `counts.sum()`.

    Returns an all-zero array when `total_picks` is not positive or when
    `counts` sums to zero.
    """
    grand_total = counts.sum()
    if total_picks <= 0 or grand_total == 0:
        return np.zeros_like(counts, dtype=int)

    ideal = total_picks * (counts / grand_total)
    alloc = np.floor(ideal).astype(int)
    leftover = total_picks - alloc.sum()

    if leftover > 0:
        # Visiting order is fixed up front: largest fractional share first.
        by_fraction_desc = np.argsort(-(ideal - alloc))
        for pos in by_fraction_desc:
            if leftover == 0:
                break
            if alloc[pos] < counts[pos]:
                alloc[pos] += 1
                leftover -= 1

    return np.minimum(alloc, counts)

In [None]:
# -----------------------
# Sample windows from every sub-dataset
# -----------------------
# Root directory containing the sub-dataset folders. Names discovered below
# are relative to this directory and are passed to `Dataset(name=...)`.
# NOTE(review): the original cell held an unfilled placeholder here (a syntax
# error). This assumes the data root is the directory named by the GIFT_EVAL
# environment variable -- confirm, or assign an explicit Path instead.
_root_env = os.environ.get("GIFT_EVAL")
if not _root_env:
    raise RuntimeError(
        "Dataset root is not configured: set the GIFT_EVAL environment variable "
        "(or assign `root` explicitly) before running this cell."
    )
root = Path(_root_env)
names = find_subdatasets(root)
print(f"Found {len(names)} subdatasets")

# Dedicated generator seeded from the config. On a fresh kernel it yields the
# same draws as the globally seeded `random` module, and re-running this cell
# reproduces the same windows instead of continuing the global random stream.
rng = random.Random(seed)

all_windows = []      # list of 1D numpy arrays (length = min_len)
all_datasets = []     # dataset name for each window (same length as all_windows)

for n in names:
    try:
        ds_obj = Dataset(name=n, term=Term.SHORT, to_univariate=False)
    except Exception as e:
        # Best effort: report the failure and move on to the next sub-dataset.
        print(f"Skipping {n}: failed to load -> {e}")
        continue

    # Collect metadata for series long enough to hold at least one window
    # (multivariate targets are collapsed to their last feature).
    meta = []  # (row_idx, length, num_candidates)
    for row_idx, T in iter_univariate_series(ds_obj):
        num_candidates = candidate_count_for_length(T, min_len, stride)
        if num_candidates > 0:
            meta.append((row_idx, T, num_candidates))

    if not meta:
        print(f"{n}: no series with length >= {min_len}, skipping")
        continue

    # Total candidates across this dataset, capped by the per-dataset budget
    candidate_counts = np.array([m[2] for m in meta], dtype=int)
    tot_candidates = int(candidate_counts.sum())
    k = min(max_per_dataset, tot_candidates)
    if k <= 0:
        print(f"{n}: no candidate windows, skipping")
        continue

    # Allocate how many windows to sample from each series proportionally
    alloc = proportional_allocation(k, candidate_counts)
    if alloc.sum() == 0:
        print(f"{n}: allocation produced zero samples, skipping")
        continue

    # For each series, draw window starts at random (seeded, hence reproducible)
    # from all eligible positions; starts are NOT aligned to the stride.
    sampled_this_dataset = 0
    for (row_idx, T, _n_cand), n_i in zip(meta, alloc):
        if n_i <= 0:
            continue

        # Eligible starting positions are EVERY integer position where a window fits:
        # [0, 1, 2, ..., T - min_len]
        eligible_size = T - min_len + 1
        if eligible_size <= 0:
            continue

        # stride ONLY determines how many samples to draw; never ask for more
        # starts than exist. Cast to a plain int because `alloc` holds numpy ints.
        n_i = min(int(n_i), eligible_size)

        # Sample without replacement from the full eligible range
        chosen_starts = rng.sample(range(eligible_size), k=n_i)

        # Retrieve the full series and slice windows
        y_full = get_series_array(ds_obj, row_idx)
        for s in chosen_starts:
            window = y_full[s : s + min_len]
            if len(window) == min_len:
                all_windows.append(window.astype(float, copy=False))
                all_datasets.append(n)
                sampled_this_dataset += 1

    print(f"{n}: sampled {sampled_this_dataset} windows (k={k}, tot_candidates={tot_candidates})")

print(f"Total sampled windows across all datasets: {len(all_windows)}")

# -----------------------
# Build long DataFrame
# -----------------------
if len(all_windows) == 0:
    print("No windows sampled. Check min_len/stride/max_per_dataset settings.")
    # Keep `long_df` defined (and empty) so later cells neither hit a NameError
    # nor silently reuse a stale frame left over from a previous run.
    long_df = pd.DataFrame(columns=["ds", "y", "unique_id", "dataset"])
else:
    num_windows = len(all_windows)
    Y = np.vstack(all_windows)  # shape: (num_windows, min_len)

    # Every window shares the same synthetic timeline
    base_index = pd.date_range(base_start, periods=min_len, freq=synthetic_freq)

    # One block of `min_len` consecutive rows per window
    unique_ids = np.arange(num_windows, dtype=int)
    unique_id_col = np.repeat(unique_ids, min_len)
    dataset_col = np.repeat(np.array(all_datasets, dtype=object), min_len)
    ds_col = np.tile(base_index.values, num_windows)
    y_col = Y.reshape(-1)

    long_df = pd.DataFrame(
        {
            "ds": ds_col,
            "y": y_col,
            "unique_id": unique_id_col,
            "dataset": dataset_col,   # keep source dataset name
        }
    )

    print(long_df.head())
    print(f"Long DF shape: {long_df.shape} (expected ~ {min_len} * total_windows)")

In [None]:
# Save to csv
# NOTE(review): the original cell held an unfilled placeholder for the output
# directory (a syntax error). The current working directory is used as a
# neutral default -- TODO: point this at the intended output location.
out_dir = Path(".")
out_dir.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
# index=True keeps the running row index as the first CSV column.
long_df.to_csv(out_dir / "y_gift-eval-subsample.csv", index=True)