## UT Watch Data Preprocessing notebook
### Step 0. Setup the paths and env variables

In [1]:
# ======================================================================
# STEP 0: Environment & paths
# ======================================================================
from pathlib import Path
import json, sys, os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

ROOT = Path("/home/aidan/IMU_LM_Data")
sys.path.insert(0, str(ROOT))  # so UTILS is importable

from UTILS.helpers import (
    resample_df,      # not strictly needed here but useful to have
    convert_unit,
    zscore_normalize,
    normalize_str,
    keyize,
    _keyize,
)

BASE    = ROOT / "data"
RAW = BASE / "raw_data" / "UT_Watch" / "ut_watch_dataset"
CLEANED = BASE / "cleaned_premerge"
MERGED  = BASE / "merged_dataset"

SCHEMA_PATH       = ROOT / "Unification" / "schemas" / "continuous_stream_schema.json"
ACTIVITY_MAP_PATH = ROOT / "Unification" / "schemas" / "activity_mapping.json"

SCHEMA       = json.loads(SCHEMA_PATH.read_text())
ACT_MAP_FULL = json.loads(ACTIVITY_MAP_PATH.read_text())

UNKNOWN_ID = int(ACT_MAP_FULL.get("unknown_activity_id", -1))
ID2NAME    = {int(x["id"]): x["name"] for x in ACT_MAP_FULL["label_set"]}
RAW2ID     = {_keyize(k): int(v) for k, v in ACT_MAP_FULL.get("mapping", {}).items()}

print("Paths & contracts ready.")
print(f"Schema keys: {list(SCHEMA.keys())}")
print("RAW UT_Watch path:", RAW)
print("CLEANED:", CLEANED)
print("MERGED:", MERGED)
print("Unknown activity ID:", UNKNOWN_ID)


Paths & contracts ready.
Schema keys: ['name', 'version', 'primary_index', 'description', 'columns', 'rate_hz', 'axis_frame', 'unit_contract', 'unknown_activity_id', 'expectations']
RAW UT_Watch path: /home/aidan/IMU_LM_Data/data/raw_data/UT_Watch/ut_watch_dataset
CLEANED: /home/aidan/IMU_LM_Data/data/cleaned_premerge
MERGED: /home/aidan/IMU_LM_Data/data/merged_dataset
Unknown activity ID: 9000


### Step 1. Ingest, preproccess and map the data 

In [2]:
# ======================================================================
# STEP 1: Ingest UT_Watch raw IMU + annotations
# ======================================================================

def parse_ut_watch_ids(session_code: str) -> tuple[str, str]:
    session_code = str(session_code).strip()

    # Numeric 3-digit code: xxy where xx is participant, y is session (semi-naturalistic)
    if re.fullmatch(r"\d{3}", session_code):
        subj_num = int(session_code[:2])
        sess_num = int(session_code[2])
        subject_id = f"{subj_num:02d}"
        session_id = f"S{sess_num:d}"
        return subject_id, session_id

    # Two-letter + digit (e.g., HH2, AB1) for in-the-wild data
    m = re.fullmatch(r"([A-Za-z]{2})(\d)", session_code)
    if m:
        subj_code, sess_num = m.groups()
        subject_id = subj_code.upper()
        # mark explicitly as wild:
        session_id = f"wild_S{sess_num}"
        return subject_id, session_id

    # Fallback
    subject_id = f"U{abs(hash(session_code)) % 1000:03d}"
    session_id = session_code
    return subject_id, session_id


def load_ut_watch_raw(raw_root: Path) -> pd.DataFrame:
    """
    Load UT_Watch IMU + annotation files and return a 'raw' dataframe:

        subject_id, session_id, timestamp_s,
        acc_x, acc_y, acc_z, gyro_x, gyro_y, gyro_z,
        activity_label_raw

    - Assumes IMU is sampled at 50 Hz.
    - Builds synthetic timestamps aligned to annotation Start (ms) when available.
    """
    dataset_name = "ut_watch"
    rows = []

    imu_longer_map = {}
    imu_shorter_map = {}
    trimmed_tail_map = {}

    session_dirs = [p for p in sorted(raw_root.iterdir()) if p.is_dir()]
    print(f"Found {len(session_dirs)} session folders under {raw_root}")

    for session_dir in tqdm(session_dirs, desc="Sessions"):
        session_code = session_dir.name

        # ------------------------------------------------------------------
        # 1) Locate IMU file (011.csv or 011.tab, or fallback *.csv at root)
        # ------------------------------------------------------------------
        imu_candidates = [
            session_dir / f"{session_code}.csv",
            session_dir / f"{session_code}.tab",
        ]
        motion_file = next((p for p in imu_candidates if p.exists()), None)

        if motion_file is None:
            # Fallback: any CSV at TOP LEVEL that is not an annotation file
            fallback = [
                p for p in sorted(session_dir.glob("*.csv"))
                if not (p.name.endswith("ant.csv") or p.name.endswith("_ann.csv"))
            ]
            if fallback:
                motion_file = fallback[0]
                print(f"[note] Using IMU file '{motion_file.name}' for session '{session_code}'")

        if motion_file is None:
            print(f"[skip] No IMU file found for session '{session_code}'")
            continue

        # ------------------------------------------------------------------
        # 2) Locate annotation file at ROOT ONLY
        #    - 011ant.csv / 011ant.tab (semi-naturalistic)
        #    - HH1_ann.csv             (in-the-wild)
        # ------------------------------------------------------------------
        ann_candidates = [
            session_dir / f"{session_code}ant.csv",
            session_dir / f"{session_code}ant.tab",
            session_dir / f"{session_code}_ann.csv",
        ]
        ann_file = next((p for p in ann_candidates if p.exists()), None)

        # ------------------------------------------------------------------
        # 3) Read IMU (handle one-line epoch header + 6-column body)
        # ------------------------------------------------------------------
        try:
            # check first line: if it's a lone epoch number, skip it
            with motion_file.open("r") as f:
                first_line = f.readline().strip()

            if first_line and ("," not in first_line) and ("\t" not in first_line):
                skiprows = 1
            else:
                skiprows = 0
            

            if motion_file.suffix == ".csv":
                imu_df = pd.read_csv(motion_file, header=None, skiprows=skiprows)
            else:
                imu_df = pd.read_csv(motion_file, sep="\t", header=None, skiprows=skiprows)

        except Exception as e:
            print(f"[error] Reading IMU '{motion_file}': {e}")
            continue

        # extra guard – fine to keep or drop
        if imu_df.shape[1] >= 1:
            fr = imu_df.iloc[0]
            if fr.iloc[0] == fr.iloc[0] and fr.iloc[1:].isna().all():
                imu_df = imu_df.iloc[1:].reset_index(drop=True)

        if imu_df.shape[1] < 6:
            print(f"[skip] {session_code}: IMU data has <6 columns ({imu_df.shape[1]})")
            continue

        imu_df = imu_df.iloc[:, :6].copy()
        imu_df.columns = ["acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z"]

        n = len(imu_df)
        if n == 0:
            print(f"[skip] {session_code}: empty IMU file")
            continue

        # Parse subject/session IDs
        subject_id, session_id = parse_ut_watch_ids(session_code)

        # Default raw label
        imu_df["activity_label_raw"] = "none"

        # ------------------------------------------------------------------
        # 4) Read annotations & build timestamps at 50 Hz
        # ------------------------------------------------------------------
        ann_df = None
        if ann_file is not None:
            try:
                if ann_file.suffix == ".csv":
                    # semi-naturalistic 011ant.csv is actually TAB-separated
                    if ann_file.name.endswith("ant.csv"):
                        ann_df = pd.read_csv(ann_file, sep="\t")
                    else:
                        # in-the-wild HH1_ann.csv etc. are real CSV
                        ann_df = pd.read_csv(ann_file)
                else:
                    ann_df = pd.read_csv(ann_file, sep="\t")

                ann_df.columns = [c.strip() for c in ann_df.columns]
            except Exception as e:
                print(f"[warn] Cannot read annotation '{ann_file}': {e}")
                ann_df = None

        # timestamps in milliseconds
        if ann_df is not None and "Start (ms)" in ann_df.columns and "End (ms)" in ann_df.columns:
            # semi-naturalistic branch: align to video-derived start time
            try:
                start0_ms = float(ann_df["Start (ms)"].min())
            except Exception as e:
                print(f"[warn] Cannot determine Start (ms) for {session_code}: {e}")
                start0_ms = 0.0

            dt_ms = 20.0  # 50 Hz
            ts_ms = start0_ms + np.arange(n, dtype="float64") * dt_ms
            imu_df["timestamp_ms"] = ts_ms

            # Only do this for numeric sessions 011, 012, ... (session_id "S1", "S2", ...)
            if session_id.startswith("S"):
                last_end_ms = float(ann_df["End (ms)"].max())
                keep_mask = imu_df["timestamp_ms"] <= (last_end_ms + 1e-6)
                dropped = int(len(imu_df) - keep_mask.sum())
                if dropped > 0:
                    print(f"[trim] {session_code}: dropping {dropped} tail samples beyond last label")
                imu_df = imu_df.loc[keep_mask].reset_index(drop=True)
                # refresh timestamp array and n
                ts_ms = imu_df["timestamp_ms"].to_numpy()
                n = len(imu_df)

            # Duration comparison IMU vs annotations (diagnostic only)
            imu_range = ts_ms[-1] - ts_ms[0]
            ann_range = float(ann_df["End (ms)"].max() - ann_df["Start (ms)"].min())
            if imu_range > ann_range + 5000:
                imu_longer_map[session_code] = (imu_range - ann_range) / 1000.0
            elif imu_range < ann_range - 5000:
                imu_shorter_map[session_code] = (ann_range - imu_range) / 1000.0

            # Apply labels in [Start, End]
            for _, r in ann_df.iterrows():
                try:
                    s = float(r["Start (ms)"])
                    e = float(r["End (ms)"])
                    lbl = str(r["Label"])
                except Exception:
                    continue
                mask = (ts_ms >= s) & (ts_ms <= e)
                imu_df.loc[mask, "activity_label_raw"] = lbl

        else:
            # No usable Start/End → synthetic timeline, unlabeled ("none")
            dt_ms = 20.0
            ts_ms = np.arange(n, dtype="float64") * dt_ms
            imu_df["timestamp_ms"] = ts_ms

        # Convert to seconds relative to session start
        imu_df["timestamp_s"] = (imu_df["timestamp_ms"] - imu_df["timestamp_ms"].iloc[0]) / 1000.0

        # ------------------------------------------------------------------
        # 5) Accumulate rows
        # ------------------------------------------------------------------
        rows.extend(
            {
                "subject_id": subject_id,
                "session_id": session_id,
                "timestamp_s": float(imu_df.at[k, "timestamp_s"]),
                "acc_x": float(imu_df.at[k, "acc_x"]),
                "acc_y": float(imu_df.at[k, "acc_y"]),
                "acc_z": float(imu_df.at[k, "acc_z"]),
                "gyro_x": float(imu_df.at[k, "gyro_x"]),
                "gyro_y": float(imu_df.at[k, "gyro_y"]),
                "gyro_z": float(imu_df.at[k, "gyro_z"]),
                "activity_label_raw": str(imu_df.at[k, "activity_label_raw"]),
            }
            for k in range(n)
        )

    # ----------------------------------------------------------------------
    # Combine and summarize
    # ----------------------------------------------------------------------
    df = pd.DataFrame(rows)

    print("\n=== UT_Watch RAW SUMMARY ===")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} cols")

    if len(df):
        def est_hz(ts: pd.Series) -> float:
            arr = ts.to_numpy()
            if arr.size < 3:
                return np.nan
            dt = np.diff(arr)
            dt = dt[(dt > 0) & np.isfinite(dt)]
            return float(np.median(1.0 / dt)) if dt.size else np.nan

        def sess_dur(ts: pd.Series) -> float:
            arr = ts.to_numpy()
            return float(arr[-1] - arr[0]) if arr.size > 1 else 0.0

        hz = df.groupby(["subject_id", "session_id"])["timestamp_s"].apply(est_hz)
        dur = df.groupby(["subject_id", "session_id"])["timestamp_s"].apply(sess_dur)

        n_subjects = df["subject_id"].nunique()
        n_sessions = df["session_id"].nunique()
        n_classes = df["activity_label_raw"].astype(str).str.lower().nunique()

        print(f"Subjects: {n_subjects} | Sessions: {n_sessions} | Raw classes: {n_classes}")
        print(f"Median Hz across sessions: {np.nanmedian(hz.values):.2f}")
        print(f"Median session duration (s): {np.nanmedian(dur.values):.2f}")

        top = df["activity_label_raw"].value_counts().head(15)
        print("\nTop-15 raw labels:")
        for lbl, cnt in top.items():
            print(f"  {lbl:45s} {cnt:,}")

    if imu_longer_map:
        print("\nIMU longer than annotations:")
        for sess, diff in imu_longer_map.items():
            print(f"  {sess}: {diff:.2f} s longer")
    if imu_shorter_map:
        print("\nIMU shorter than annotations:")
        for sess, diff in imu_shorter_map.items():
            print(f"  {sess}: {diff:.2f} s shorter")

    return df



# Run raw ingest
ut_raw_df = load_ut_watch_raw(RAW)


Found 40 session folders under /home/aidan/IMU_LM_Data/data/raw_data/UT_Watch/ut_watch_dataset


Sessions:   0%|          | 0/40 [00:00<?, ?it/s]

[trim] 011: dropping 52906 tail samples beyond last label


Sessions:   2%|▎         | 1/40 [00:01<00:42,  1.09s/it]

[trim] 012: dropping 6053 tail samples beyond last label


Sessions:   5%|▌         | 2/40 [00:02<00:48,  1.27s/it]

[trim] 021: dropping 3791 tail samples beyond last label


Sessions:   8%|▊         | 3/40 [00:03<00:45,  1.24s/it]

[trim] 022: dropping 2223 tail samples beyond last label


Sessions:  10%|█         | 4/40 [00:04<00:40,  1.13s/it]

[trim] 031: dropping 3715 tail samples beyond last label


Sessions:  12%|█▎        | 5/40 [00:06<00:44,  1.26s/it]

[trim] 032: dropping 3909 tail samples beyond last label


Sessions:  15%|█▌        | 6/40 [00:07<00:41,  1.21s/it]

[trim] 041: dropping 6074 tail samples beyond last label


Sessions:  18%|█▊        | 7/40 [00:08<00:45,  1.37s/it]

[trim] 042: dropping 3571 tail samples beyond last label


Sessions:  20%|██        | 8/40 [00:10<00:43,  1.35s/it]

[trim] 051: dropping 7290 tail samples beyond last label


Sessions:  22%|██▎       | 9/40 [00:11<00:44,  1.43s/it]

[trim] 052: dropping 2180 tail samples beyond last label


Sessions:  25%|██▌       | 10/40 [00:13<00:40,  1.35s/it]

[trim] 061: dropping 8580 tail samples beyond last label


Sessions:  28%|██▊       | 11/40 [00:14<00:39,  1.38s/it]

[trim] 062: dropping 2969 tail samples beyond last label


Sessions:  30%|███       | 12/40 [00:15<00:36,  1.29s/it]

[trim] 071: dropping 3368 tail samples beyond last label


Sessions:  32%|███▎      | 13/40 [00:17<00:39,  1.46s/it]

[trim] 072: dropping 4584 tail samples beyond last label


Sessions:  35%|███▌      | 14/40 [00:18<00:38,  1.48s/it]

[trim] 081: dropping 3895 tail samples beyond last label


Sessions:  38%|███▊      | 15/40 [00:20<00:36,  1.47s/it]

[trim] 082: dropping 3599 tail samples beyond last label


Sessions:  40%|████      | 16/40 [00:21<00:34,  1.43s/it]

[trim] 091: dropping 3891 tail samples beyond last label


Sessions:  42%|████▎     | 17/40 [00:23<00:31,  1.38s/it]

[trim] 092: dropping 2238 tail samples beyond last label


Sessions:  45%|████▌     | 18/40 [00:24<00:29,  1.33s/it]

[trim] 101: dropping 3544 tail samples beyond last label


Sessions:  48%|████▊     | 19/40 [00:25<00:29,  1.39s/it]

[trim] 102: dropping 3452 tail samples beyond last label


Sessions:  50%|█████     | 20/40 [00:27<00:27,  1.37s/it]

[trim] 111: dropping 4670 tail samples beyond last label


Sessions:  52%|█████▎    | 21/40 [00:28<00:28,  1.49s/it]

[trim] 112: dropping 3236 tail samples beyond last label


Sessions:  55%|█████▌    | 22/40 [00:29<00:24,  1.36s/it]

[trim] 121: dropping 4448 tail samples beyond last label


Sessions:  57%|█████▊    | 23/40 [00:31<00:23,  1.38s/it]

[trim] 122: dropping 2807 tail samples beyond last label


Sessions:  60%|██████    | 24/40 [00:32<00:21,  1.33s/it]

[trim] 131: dropping 6019 tail samples beyond last label


Sessions:  62%|██████▎   | 25/40 [00:34<00:20,  1.40s/it]

[trim] 132: dropping 3285 tail samples beyond last label


Sessions:  65%|██████▌   | 26/40 [00:35<00:19,  1.37s/it]

[trim] 141: dropping 6327 tail samples beyond last label


Sessions:  68%|██████▊   | 27/40 [00:36<00:18,  1.42s/it]

[trim] 142: dropping 2465 tail samples beyond last label


Sessions:  70%|███████   | 28/40 [00:38<00:16,  1.34s/it]

[trim] 151: dropping 9931 tail samples beyond last label


Sessions:  72%|███████▎  | 29/40 [00:40<00:17,  1.62s/it]

[trim] 152: dropping 2455 tail samples beyond last label


Sessions:  92%|█████████▎| 37/40 [02:03<00:32, 10.69s/it]

[skip] No IMU file found for session 'LV2'
[note] Using IMU file 'SK2.csv' for session 'SK1'


Sessions:  98%|█████████▊| 39/40 [02:15<00:08,  8.72s/it]

[note] Using IMU file 'SK3.csv' for session 'SK2'


Sessions: 100%|██████████| 40/40 [02:28<00:00,  3.71s/it]



=== UT_Watch RAW SUMMARY ===
Shape: 8,146,709 rows × 10 cols
Subjects: 20 | Sessions: 4 | Raw classes: 24
Median Hz across sessions: 50.00
Median session duration (s): 1666.30

Top-15 raw labels:
  none                                          7,004,993
  Microwave                                     56,282
  Sweeping                                      52,144
  Frying                                        50,984
  Drinking                                      50,873
  Writing                                       50,500
  Dishes                                        49,881
  Eating                                        49,865
  Drawing                                       49,828
  Keyboard                                      49,748
  Mobile                                        49,443
  Vacuuming                                     49,195
  Browsing                                      49,176
  Scratching                                    49,115
  Teeth                       

### Step 2. Map the data and audit the mapping

In [None]:
# ======================================================================
# STEP 2: Activity mapping audit (raw → global ontology) bigger for Uwatch
# ======================================================================

# 1) Quick sanity on the raw labels *before* normalization
n_null  = ut_raw_df["activity_label_raw"].isna().sum()
n_empty = (ut_raw_df["activity_label_raw"].astype(str).str.strip() == "").sum()
print(f"Label sanity: null={n_null:,} | empty={n_empty:,}")

# 2) Normalize labels for mapping
norm_raw = (
    ut_raw_df["activity_label_raw"]
      .astype(str)
      .fillna("none")
      .map(_keyize)   # lower, strip, etc.
)

raw_counts = (
    norm_raw
      .value_counts()
      .rename_axis("raw_label")
      .reset_index(name="count")
)

# 3) Map to global IDs/names
raw_counts["mapped_id"] = (
    raw_counts["raw_label"]
      .map(RAW2ID)
      .fillna(UNKNOWN_ID)
      .astype(int)
)

raw_counts["mapped_nm"] = raw_counts["mapped_id"].map(
    lambda x: ID2NAME.get(int(x), "other")
)

# 4) Inspect unmapped labels
unmapped = raw_counts.loc[raw_counts["mapped_id"] == UNKNOWN_ID]

print(f"Raw label unique: {len(raw_counts)} | Unmapped: {len(unmapped)}")
print("Unmapped (top-10 by frequency):")
print(
    unmapped.nlargest(10, "count")[["raw_label", "count"]]
      .to_string(index=False)
)

raw_counts.head(25)

Label sanity: null=0 | empty=0
Raw label unique: 24 | Unmapped: 1
Unmapped (top-10 by frequency):
raw_label  count
 clapping  49019


Unnamed: 0,raw_label,count,mapped_id,mapped_nm
0,none,7004993,0,rest_inactive
1,microwave,56282,15,adl_food
2,sweeping,52144,13,adl_household_general
3,frying,50984,15,adl_food
4,drinking,50873,15,adl_food
5,writing,50500,14,adl_desk_device
6,dishes,49881,13,adl_household_general
7,eating,49865,15,adl_food
8,drawing,49828,14,adl_desk_device
9,keyboard,49748,14,adl_desk_device


### Step 3. Build and clean dataset in stream json fromat

In [9]:
# ======================================================================
# STEP 3: Convert UT_Watch → continuous_stream schema
# ======================================================================

def to_continuous_stream_ut(df_raw: pd.DataFrame, dataset_name: str = "ut_watch") -> pd.DataFrame:
    """
    Convert UT_Watch raw frame into the unified continuous_stream schema.
    """
    if df_raw.empty:
        return pd.DataFrame(columns=[c["name"] for c in SCHEMA["columns"]])

    # ---------- GLOBAL (ontology) ----------
    raw_key = df_raw["activity_label_raw"].astype(str).map(_keyize)
    gid = raw_key.map(RAW2ID).fillna(UNKNOWN_ID).astype("int16")
    glabel = gid.map(lambda x: ID2NAME.get(int(x), "other")).astype("string")

    # ---------- NATIVE (dataset-specific) ----------
    # One stable ID per distinct raw label
    unique_labels = sorted(df_raw["activity_label_raw"].astype(str).unique())
    label2id = {lbl: idx for idx, lbl in enumerate(unique_labels)}
    native_id = df_raw["activity_label_raw"].astype(str).map(label2id).astype("Int16")
    native_lbl = df_raw["activity_label_raw"].astype("string")

    out = pd.DataFrame({
        "dataset":        dataset_name,
        "subject_id":     df_raw["subject_id"].astype("string"),
        "session_id":     df_raw["session_id"].astype("string"),
        "timestamp_ns":   (df_raw["timestamp_s"].astype(np.float64) * 1e9).round().astype("int64"),

        "acc_x": df_raw["acc_x"].astype("float32"),
        "acc_y": df_raw["acc_y"].astype("float32"),
        "acc_z": df_raw["acc_z"].astype("float32"),
        "gyro_x": df_raw["gyro_x"].astype("float32"),
        "gyro_y": df_raw["gyro_y"].astype("float32"),
        "gyro_z": df_raw["gyro_z"].astype("float32"),

        "global_activity_id":    gid,
        "global_activity_label": glabel,

        "dataset_activity_id":   native_id,
        "dataset_activity_label": native_lbl,
    })

    order = [c["name"] for c in SCHEMA["columns"]]
    return out[order]


ut_watch_df = to_continuous_stream_ut(ut_raw_df, dataset_name="ut_watch")
ut_watch_df.head()


Unnamed: 0,dataset,subject_id,session_id,timestamp_ns,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,global_activity_id,global_activity_label,dataset_activity_id,dataset_activity_label
0,ut_watch,1,S1,0,-1.558,-0.14,9.715,0.022,0.008,0.006,14,adl_desk_device,22,Writing
1,ut_watch,1,S1,20000000,-1.558,-0.14,9.715,0.013,0.012,0.009,14,adl_desk_device,22,Writing
2,ut_watch,1,S1,40000000,-1.642,0.042,9.705,-0.035,0.001,0.004,14,adl_desk_device,22,Writing
3,ut_watch,1,S1,60000000,-1.508,-0.137,9.775,-0.001,0.004,0.004,14,adl_desk_device,22,Writing
4,ut_watch,1,S1,80000000,-1.544,-0.089,9.736,0.033,0.002,0.002,14,adl_desk_device,22,Writing


### Step 4. Audit check the unified frame

In [10]:
# ======================================================================
# STEP 4: Audit unified UT_Watch frame
# ======================================================================

print("UNIFIED UT_Watch rows:", len(ut_watch_df))
print("Subjects:", ut_watch_df["subject_id"].nunique(),
      "| Sessions:", ut_watch_df["session_id"].nunique())

# Monotonic timestamps per (subject, session)
viol = 0
for (_sid, _sess), g in ut_watch_df.groupby(["subject_id", "session_id"], sort=False):
    ts = g["timestamp_ns"].to_numpy()
    if ts.size and not np.all(np.diff(ts) >= 0):
        viol += 1
print("Monotonic violations (groups):", viol)


def est_hz_ns(ts_ns: pd.Series) -> float:
    arr = ts_ns.to_numpy()
    if arr.size < 3:
        return np.nan
    dt = np.diff(arr) / 1e9  # ns → s
    dt = dt[(dt > 0) & np.isfinite(dt)]
    return float(np.median(1.0 / dt)) if dt.size else np.nan


hz = ut_watch_df.groupby(["subject_id", "session_id"])["timestamp_ns"].apply(est_hz_ns)
print(f"Median Hz: {np.nanmedian(hz.values):.2f} (target={SCHEMA['rate_hz']})")

# required-not-null coverage
req = SCHEMA["expectations"]["required_not_null"]
pct = ut_watch_df[req].notnull().all(axis=1).mean() * 100
print(f"Rows meeting required-not-null: {pct:.2f}%")

print("\nTop-10 canonical labels:")
print(ut_watch_df["global_activity_label"].value_counts().head(10))

cov = (ut_watch_df["global_activity_id"] != UNKNOWN_ID).mean() * 100
print(f"Global mapping coverage: {cov:.1f}% (unknown={UNKNOWN_ID})")

ut_watch_df["global_activity_label"].value_counts().head(15)


UNIFIED UT_Watch rows: 8146709
Subjects: 20 | Sessions: 4
Monotonic violations (groups): 0
Median Hz: 50.00 (target=50)
Rows meeting required-not-null: 100.00%

Top-10 canonical labels:
global_activity_label
rest_inactive            7004993
adl_food                  402387
adl_desk_device           248695
adl_household_general     248443
adl_personal_care         147091
other                      49019
walk                       46081
Name: count, dtype: Int64
Global mapping coverage: 99.4% (unknown=9000)


global_activity_label
rest_inactive            7004993
adl_food                  402387
adl_desk_device           248695
adl_household_general     248443
adl_personal_care         147091
other                      49019
walk                       46081
Name: count, dtype: Int64

In [None]:
print(ut_watch_df["session_id"].unique)

4


### Step 5. Save outputs

In [None]:
# ======================================================================
# STEP 5: Save unified UT_Watch frame
# ======================================================================

CLEANED.mkdir(parents=True, exist_ok=True)
out_path = CLEANED / "ut_watch_clean_data.parquet"
ut_watch_df.to_parquet(out_path, index=False)
print("Saved UT_Watch continuous_stream frame to:", out_path)
