## Opportunity++ Data Preprocessing notebook
### Step 0. Set up the paths and environment variables

In [1]:
# =========================
# STEP 0 — Setup & contracts
# =========================
from pathlib import Path
import json, sys, re
import numpy as np
import pandas as pd
from tqdm import tqdm

# Project root. Set the IMU_LM_ROOT environment variable to run the notebook on
# another machine; when it is unset the original location is used unchanged.
import os  # stdlib; needed here because ROOT must exist before the UTILS import below

ROOT = Path(os.environ.get("IMU_LM_ROOT", "/home/aidan/IMU_LM_Data")).expanduser()
# Make the project root importable so `UTILS.helpers` resolves.
sys.path.insert(0, str(ROOT))

from UTILS.helpers import (
    read_opp_column_names,
    canonicalize_opp_columns,
    parse_opp_subject_session,
    nearest_label_join_1d,
    upsample_df_rate,
    _canon,
    
)

# ---- Directory layout (everything hangs off ROOT) ----
BASE      = ROOT / "data"
RAW_OPP   = BASE / "raw_data" / "Opportunity++" / "dataset"   # *.dat + column_names.txt
CLEANED   = BASE / "cleaned_premerge"

# ---- Shared contracts: output schema + activity mapping ----
SCHEMA_PATH       = ROOT / "Unification" / "schemas" / "continuous_stream_schema.json"
ACTIVITY_MAP_PATH = ROOT / "Unification" / "schemas" / "activity_mapping.json"

# Decode explicitly as UTF-8 so parsing does not depend on the machine's locale.
SCHEMA       = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
ACT_MAP_FULL = json.loads(ACTIVITY_MAP_PATH.read_text(encoding="utf-8"))

# Fallbacks apply only when the JSON files omit these keys.
UNKNOWN_ID = int(ACT_MAP_FULL.get("unknown_activity_id", 9000))
TARGET_HZ  = int(SCHEMA.get("rate_hz", 50))

# The label lookup tables (RAW2ID / ID2NAME) are derived from ACT_MAP_FULL in STEP 2.

print("Paths & contracts ready.")
print("RAW dir :", RAW_OPP)
print("CLEANED :", CLEANED)

# ---- Locomotion track (native codes kept verbatim) ----
# Each native locomotion code maps to one short canonical label string.
OPP_LOCO = {
    1: "stand",
    2: "walk",
    4: "sit",
    5: "lie",
}

# ---- ML_Both_Arms track (native code → collapsed label) ----
# Every native ML code is preserved as the dataset_activity_id; only the
# human-readable label is collapsed, so several codes may share one
# verb+object string (e.g. 406516 and 406517 are both "open_door").
OPP_ML = {
    # doors
    406516: "open_door",
    406517: "open_door",
    404516: "close_door",
    404517: "close_door",
    # fridge
    406520: "open_fridge",
    404520: "close_fridge",
    # dishwasher
    406505: "open_dishwasher",
    404505: "close_dishwasher",
    # drawers (three native open/close code pairs collapse to one label pair)
    406519: "open_drawer",
    404519: "close_drawer",
    406511: "open_drawer",
    404511: "close_drawer",
    406508: "open_drawer",
    404508: "close_drawer",
    # remaining single-code gestures
    408512: "clean_table",
    407521: "drink_from_cup",
    405506: "toggle_switch",
}


Paths & contracts ready.
RAW dir : /home/aidan/IMU_LM_Data/data/raw_data/Opportunity++/dataset
CLEANED : /home/aidan/IMU_LM_Data/data/cleaned_premerge


### Step 1. Ingest, preprocess and map the data

In [2]:
# ============================
# STEP 1 — Load & normalize (RWR accel + Loco & ML only)
# ============================
def _pick_label_ml_then_loco(lo, ml):
    """
    Resolve the native label for one sample; the ML track wins over Locomotion.

    Parameters
    ----------
    lo : locomotion code of the sample (may be missing / NA).
    ml : ml_both_arms code of the sample (may be missing / NA).

    Returns
    -------
    (dataset_activity_id:int, dataset_activity_label:str)
      - (ml code, OPP_ML label) when the ML code is present and listed in OPP_ML;
      - else (loco code, OPP_LOCO label) when the locomotion code is present,
        nonzero and listed in OPP_LOCO;
      - else (UNKNOWN_ID, 'unknown_activity').
    """
    # ML gesture track takes precedence.
    ml_code = int(ml) if pd.notna(ml) else None
    if ml_code is not None and ml_code in OPP_ML:
        return ml_code, OPP_ML[ml_code]

    # Fall back to locomotion; 0 is explicitly never accepted as a label.
    loco_code = int(lo) if pd.notna(lo) else None
    if loco_code is not None and loco_code != 0 and loco_code in OPP_LOCO:
        return loco_code, OPP_LOCO[loco_code]

    return UNKNOWN_ID, "unknown_activity"

def load_opportunity_loco_ml(
    raw_dir: Path,
    drop_all_zero_unknown: bool = True,
    drop_allnan_sessions: bool = True,   # only drop sessions 100% NaN on acc
) -> pd.DataFrame:
    """
    Load Opportunity++ right-wrist (RWR) accelerometer data and pick one native
    label per sample from the (ml_both_arms, locomotion) tracks.

    Parameters
    ----------
    raw_dir : directory holding the ``*.dat`` recordings and ``column_names.txt``.
    drop_all_zero_unknown : drop rows whose locomotion and ml_both_arms codes are
        both 0/missing and whose picked label is 'unknown_activity'.
    drop_allnan_sessions : drop (subject, session) groups whose acc_x/y/z are NaN
        on every row. Rows with only some NaNs are kept.

    Returns
    -------
    One concatenated frame (RAW STAGE) with columns
      dataset, subject_id, session_id, timestamp_s (millisec / 1000),
      acc_x/acc_y/acc_z (raw value * 9.80665/1000, i.e. raw treated as milli-g),
      gyro_x/gyro_y/gyro_z (NaN placeholders),
      locomotion (Int64), ml_both_arms (Int64),
      dataset_activity_id (integer native code; values such as 406516 do not fit
      in int16, so no narrow dtype is applied here),
      dataset_activity_label (collapsed label string).
    An empty DataFrame is returned when column_names.txt or the .dat files are missing.

    Raises
    ------
    KeyError : a file lacks millisec, locomotion, ml_both_arms or the RWR accel columns.
    """
    col_names_path = raw_dir / "column_names.txt"
    if not col_names_path.exists():
        print("Missing column_names.txt at", col_names_path)
        return pd.DataFrame()

    names = read_opp_column_names(col_names_path)
    files = sorted(p for p in raw_dir.glob("*.dat") if p.is_file())
    if not files:
        print("No .dat files in", raw_dir)
        return pd.DataFrame()

    frames = []
    mg_to_ms2 = 9.80665 / 1000.0  # mg → m/s^2

    for f in tqdm(files, desc="[OPP] files"):
        df = pd.read_csv(f, sep=r"\s+", header=None, names=names, na_values=["NaN"], engine="c")
        df = canonicalize_opp_columns(df)
        sid, sess = parse_opp_subject_session(f.stem)

        # Right-wrist accel — support canonical & legacy names
        ax = "rwr_acc_x" if "rwr_acc_x" in df.columns else ("rwr_accx" if "rwr_accx" in df.columns else None)
        ay = "rwr_acc_y" if "rwr_acc_y" in df.columns else ("rwr_accy" if "rwr_accy" in df.columns else None)
        az = "rwr_acc_z" if "rwr_acc_z" in df.columns else ("rwr_accz" if "rwr_accz" in df.columns else None)

        needed = ["millisec", "locomotion", "ml_both_arms"]
        missing = [c for c in needed if c not in df.columns]
        if ax is None: missing.append("rwr_acc_x|rwr_accx")
        if ay is None: missing.append("rwr_acc_y|rwr_accy")
        if az is None: missing.append("rwr_acc_z|rwr_accz")
        if missing:
            raise KeyError(f"{f.name}: missing required columns: {missing}")

        out = pd.DataFrame({
            "dataset": "opportunity++",
            "subject_id": sid,
            "session_id": sess,
            "timestamp_s": df["millisec"].astype("float64") / 1000.0,

            "acc_x": df[ax].astype("float64") * mg_to_ms2,
            "acc_y": df[ay].astype("float64") * mg_to_ms2,
            "acc_z": df[az].astype("float64") * mg_to_ms2,

            # No right-wrist gyro in OPP
            "gyro_x": np.float32(np.nan),
            "gyro_y": np.float32(np.nan),
            "gyro_z": np.float32(np.nan),

            # Native tracks we consider
            "locomotion":   df["locomotion"].astype("Int64"),
            "ml_both_arms": df["ml_both_arms"].astype("Int64"),
        })

        # Choose native label (ML > Locomotion), keep native code; collapse only
        # the string label. Vectorised equivalent of calling
        # _pick_label_ml_then_loco on every row, without the per-row Python overhead.
        ml_codes = out["ml_both_arms"].fillna(0).astype("int64")
        lo_codes = out["locomotion"].fillna(0).astype("int64")
        # Force object dtype so a file with no known codes (all-NaN map result)
        # can still receive string labels below.
        ml_label = ml_codes.map(OPP_ML).astype(object)
        lo_label = lo_codes.map(OPP_LOCO).astype(object)
        # A track "hits" only when its code is present AND known; locomotion 0 never hits.
        ml_hit = out["ml_both_arms"].notna().to_numpy() & ml_label.notna().to_numpy()
        lo_hit = (
            out["locomotion"].notna().to_numpy()
            & (lo_codes != 0).to_numpy()
            & lo_label.notna().to_numpy()
        )
        out["dataset_activity_id"] = ml_codes.where(ml_hit, lo_codes.where(lo_hit, UNKNOWN_ID))
        out["dataset_activity_label"] = ml_label.where(
            ml_hit, lo_label.where(lo_hit, "unknown_activity")
        )

        if drop_all_zero_unknown:
            both_zero = (lo_codes == 0) & (ml_codes == 0)
            out = out.loc[~(both_zero & (out["dataset_activity_label"] == "unknown_activity"))]

        frames.append(out)

    raw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # === drop only sessions that are COMPLETELY NaN on acc_x/y/z ===
    if drop_allnan_sessions and not raw.empty:
        sess_all_nan = (
            raw.groupby(["subject_id","session_id"])[["acc_x","acc_y","acc_z"]]
               .apply(lambda grp: grp.isna().all().all())
               .reset_index(name="all_acc_nan")
        )
        bad = sess_all_nan.loc[sess_all_nan["all_acc_nan"], ["subject_id","session_id"]]
        if not bad.empty:
            bad_idx = pd.MultiIndex.from_frame(bad)
            n_before = len(raw)
            keep = ~raw.set_index(["subject_id","session_id"]).index.isin(bad_idx)
            raw = raw.loc[keep].reset_index(drop=True)
            print(f"Dropped accel-empty sessions: {list(bad_idx)} | removed_rows={n_before - len(raw):,}")
        else:
            print("No accel-empty sessions detected.")

    # NOTE: we intentionally keep partial-NaN rows; STEP 4 will handle interpolation.
    any_nan = raw[["acc_x","acc_y","acc_z"]].isna().any().to_dict()
    print(f"Accel NaNs present (kept for later interpolation): {any_nan}")

    # Summary
    print("\n=== RAW SUMMARY (Opportunity++ wrist: loco + ML only) ===")
    print(f"Rows: {len(raw):,}")
    def _est_hz(ts: pd.Series):
        """Median of 1/dt over strictly positive, finite timestamp steps (NaN if < 3 samples)."""
        arr = ts.to_numpy()
        if arr.size < 3: return np.nan
        dt = np.diff(arr); dt = dt[(dt > 0) & np.isfinite(dt)]
        return float(np.median(1.0/dt)) if dt.size else np.nan
    med_hz = raw.groupby(["subject_id","session_id"])["timestamp_s"].apply(_est_hz)
    print(f"Median native Hz: {np.nanmedian(med_hz.values):.2f}")

    print("\nTop native labels (collapsed names, native IDs retained):")
    print(raw["dataset_activity_label"].value_counts().head(20))

    return raw
   
# Run STEP 1: load the *.dat recordings found under RAW_OPP into one raw frame,
# then preview the first three rows as the cell output.
raw_opp = load_opportunity_loco_ml(RAW_OPP)
raw_opp.head(3)


[OPP] files: 100%|██████████| 24/24 [00:49<00:00,  2.05s/it]


Dropped accel-empty sessions: [('S1', 'DRILL')] | removed_rows=52,130
Accel NaNs present (kept for later interpolation): {'acc_x': True, 'acc_y': True, 'acc_z': True}

=== RAW SUMMARY (Opportunity++ wrist: loco + ML only) ===
Rows: 656,031
Median native Hz: 30.30

Top native labels (collapsed names, native IDs retained):
dataset_activity_label
stand               174367
walk                144018
sit                 105692
drink_from_cup       45901
lie                  25395
open_door            25026
close_door           23065
open_drawer          20906
close_drawer         18866
open_fridge          16288
close_fridge         14595
clean_table          13833
open_dishwasher       9698
toggle_switch         9465
close_dishwasher      8916
Name: count, dtype: int64


Unnamed: 0,dataset,subject_id,session_id,timestamp_s,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,locomotion,ml_both_arms,dataset_activity_id,dataset_activity_label
0,opportunity++,S1,ADL1,98.466,-4.06976,8.276813,1.470997,,,,1,0,1,stand
1,opportunity++,S1,ADL1,98.499,-4.628739,8.963278,1.451384,,,,1,0,1,stand
2,opportunity++,S1,ADL1,98.532,-4.148213,8.521979,1.735777,,,,1,0,1,stand


### Step 2. Map the data and audit the mapping

In [None]:
# ============================
# STEP 2 — Mapping audit (raw → global)
# ============================
# We keep native OPP codes verbatim in dataset_*.
# For global_* we map using ACTIVITY_MAP_PATH (RAW2ID via collapsed strings).
# Anything unmapped → UNKNOWN_ID / "other".

# Build (raw_label → mapped global ID/label) summary
if raw_opp.empty:
    raise SystemExit("No Opportunity++ rows after STEP 1. Check RAW_OPP path.")

# One row per canonicalised native label with its sample count.
raw_counts = (
    raw_opp["dataset_activity_label"]
          .astype("string")
          .map(_canon)
          .value_counts(dropna=False)
          .rename_axis("raw_label")
          .reset_index(name="count")
)

# activity_mapping.json provides RAW2ID keyed by canonical label strings
RAW2ID = { _canon(k): int(v) for k, v in ACT_MAP_FULL.get("mapping", {}).items() }
ID2NAME = { int(x["id"]): x["name"] for x in ACT_MAP_FULL["label_set"] }

# Labels absent from RAW2ID fall back to UNKNOWN_ID; ids absent from ID2NAME read "other".
raw_counts["mapped_id"] = raw_counts["raw_label"].map(RAW2ID).fillna(UNKNOWN_ID).astype(int)
raw_counts["mapped_nm"] = raw_counts["mapped_id"].map(lambda x: ID2NAME.get(int(x), "other"))

# --- Unmapped labels: reported once, and only when there is something to show ---
unmapped = raw_counts.loc[raw_counts["mapped_id"] == UNKNOWN_ID]
print(f"[STEP 2] Unique raw labels: {len(raw_counts)} | Unmapped: {len(unmapped)}")
if not unmapped.empty:
    print("Unmapped (top up to 25):")
    print(unmapped.sort_values("count", ascending=False).head(25)[["raw_label","count"]].to_string(index=False))

# --- Coverage: share of samples whose label maps to a known global id ---
total_ct = int(raw_counts["count"].sum())
mapped_ct = int(raw_counts.loc[raw_counts["mapped_id"] != UNKNOWN_ID, "count"].sum())
cov_pct = 100.0 * mapped_ct / max(total_ct, 1)   # max(..., 1) guards a zero denominator
print(f"[STEP 2] coverage={cov_pct:.2f}%  (mapped={mapped_ct:,} / total={total_ct:,})")

# Counts per mapped global name (short head)
mapped_counts = (
    raw_counts.loc[raw_counts["mapped_id"] != UNKNOWN_ID, ["mapped_nm","count"]]
              .groupby("mapped_nm", as_index=False)["count"].sum()
              .sort_values("count", ascending=False)
)
print("\nTop mapped globals (up to 10):")
print(mapped_counts.head(10).to_string(index=False))

# NaN checks on accel
acc_nan_any = raw_opp[["acc_x","acc_y","acc_z"]].isna().any().to_dict()
all_acc_nan_rows = int(raw_opp[["acc_x","acc_y","acc_z"]].isna().all(axis=1).sum())
print(f"\nAccel NaNs any={acc_nan_any}  rows_all_acc_nan={all_acc_nan_rows}")

# --- Dataset-specific vs mapped global label sets ---
print("\nDataset-specific (native) labels:")
print(sorted(raw_counts['raw_label'].unique()))

print("\nMapped global labels:")
print(sorted(mapped_counts['mapped_nm'].unique()))

# Cell output: the ten most frequent labels with their mapping.
raw_counts.head(10)


[STEP 2] Unique raw labels: 15 | Unmapped: 0
[STEP 2] coverage=100.00%  (mapped=656,031 / total=656,031)

Top mapped globals (up to 10):
            mapped_nm  count
   posture_stationary 305454
    adl_appliance_ops 146825
                 walk 144018
             adl_food  45901
adl_household_general  13833

Accel NaNs any={'acc_x': True, 'acc_y': True, 'acc_z': True}  rows_all_acc_nan=15956

Dataset-specific (native) labels:
['clean_table', 'close_dishwasher', 'close_door', 'close_drawer', 'close_fridge', 'drink_from_cup', 'lie', 'open_dishwasher', 'open_door', 'open_drawer', 'open_fridge', 'sit', 'stand', 'toggle_switch', 'walk']

Mapped global labels:
['adl_appliance_ops', 'adl_food', 'adl_household_general', 'posture_stationary', 'walk']
Raw label unique: 15 | Unmapped: 0
Unmapped (top-10):
Empty DataFrame
Columns: [raw_label, count]
Index: []


Unnamed: 0,raw_label,count,mapped_id,mapped_nm
0,stand,174367,1,posture_stationary
1,walk,144018,2,walk
2,sit,105692,1,posture_stationary
3,drink_from_cup,45901,15,adl_food
4,lie,25395,1,posture_stationary
5,open_door,25026,20,adl_appliance_ops
6,close_door,23065,20,adl_appliance_ops
7,open_drawer,20906,20,adl_appliance_ops
8,close_drawer,18866,20,adl_appliance_ops
9,open_fridge,16288,20,adl_appliance_ops


### Step 3. Build and clean the dataset in the stream JSON schema format

In [4]:
# ==========================================================
# STEP 3 — Build schema-ordered continuous_stream (v3)
# ==========================================================
def to_continuous_stream_opp(
    df_raw: pd.DataFrame,
    dataset_name: str = "opportunity++",
) -> pd.DataFrame:
    """
    Convert the STEP 1 raw frame into the schema-ordered continuous-stream layout.

    Parameters
    ----------
    df_raw : frame with subject_id, session_id, timestamp_s, acc_*/gyro_*,
        dataset_activity_id and dataset_activity_label columns.
    dataset_name : value written to the ``dataset`` column of every row.

    Returns
    -------
    Frame whose columns follow SCHEMA["columns"] order: timestamps as integer
    nanoseconds, sensors as float32, native labels kept as-is, plus
    global_activity_id / global_activity_label looked up from
    activity_mapping.json via the canonicalised native label. Labels missing
    from the mapping get UNKNOWN_ID; ids missing from the label set read "other".
    An empty frame with the schema columns is returned for empty input.
    """
    schema_cols = [c["name"] for c in SCHEMA["columns"]]
    if df_raw.empty:
        return pd.DataFrame(columns=schema_cols)

    # native → global (map via collapsed dataset_activity_label).
    # Canonicalise with the shared `_canon` helper imported in STEP 0 — the same
    # one the STEP 2 audit uses — so the audited mapping and the mapping applied
    # here cannot drift apart.
    raw2id = { _canon(k): int(v) for k, v in ACT_MAP_FULL.get("mapping", {}).items() }
    id2name = { int(x["id"]): x["name"] for x in ACT_MAP_FULL["label_set"] }

    raw_key = df_raw["dataset_activity_label"].astype("string").map(_canon)
    gid     = raw_key.map(raw2id).fillna(UNKNOWN_ID).astype("int16")
    glabel  = gid.map(lambda x: id2name.get(int(x), "other")).astype("string")

    out = pd.DataFrame({
        "dataset":                  dataset_name,
        "subject_id":               df_raw["subject_id"].astype("string"),
        "session_id":               df_raw["session_id"].astype("string"),
        # seconds → integer nanoseconds (rounded to the nearest ns)
        "timestamp_ns":             (df_raw["timestamp_s"].astype(np.float64) * 1e9).round().astype("int64"),

        "acc_x": df_raw["acc_x"].astype("float32"),
        "acc_y": df_raw["acc_y"].astype("float32"),
        "acc_z": df_raw["acc_z"].astype("float32"),
        "gyro_x": df_raw["gyro_x"].astype("float32"),
        "gyro_y": df_raw["gyro_y"].astype("float32"),
        "gyro_z": df_raw["gyro_z"].astype("float32"),

        "global_activity_id":       gid,
        "global_activity_label":    glabel,

        "dataset_activity_id":      df_raw["dataset_activity_id"].astype("Int32"),
        "dataset_activity_label":   df_raw["dataset_activity_label"].astype("string"),
    })

    return out[schema_cols]

# Run STEP 3: convert the raw frame into the schema-ordered stream (before the
# 50 Hz resampling of STEP 4) and print a three-row preview.
opp_df_native = to_continuous_stream_opp(raw_opp)
print("UNIFIED (pre-50Hz) rows:", len(opp_df_native))
print(opp_df_native.head(3))


UNIFIED (pre-50Hz) rows: 656031
         dataset subject_id session_id  timestamp_ns     acc_x     acc_y  \
0  opportunity++         S1       ADL1   98466000000 -4.069760  8.276813   
1  opportunity++         S1       ADL1   98499000000 -4.628739  8.963278   
2  opportunity++         S1       ADL1   98532000000 -4.148213  8.521978   

      acc_z  gyro_x  gyro_y  gyro_z  global_activity_id global_activity_label  \
0  1.470997     NaN     NaN     NaN                   1    posture_stationary   
1  1.451384     NaN     NaN     NaN                   1    posture_stationary   
2  1.735777     NaN     NaN     NaN                   1    posture_stationary   

   dataset_activity_id dataset_activity_label  
0                    1                  stand  
1                    1                  stand  
2                    1                  stand  


### Step 4. Resample to 50 Hz and audit the unified frame

In [5]:
# ==================================================
# STEP 4 — Resample → 50 Hz, QA contracts, save
# ==================================================
def _resample_group_to_50hz(g: pd.DataFrame) -> pd.DataFrame:
    """
    Bring one (subject, session) stream onto the TARGET_HZ time base.

    Pipeline:
      1. Stable-sort by timestamp_ns and pass acc_x/y/z, with a seconds
         timestamp, to ``upsample_df_rate`` (only the accel columns are resampled).
      2. Convert the resampled seconds back to rounded integer nanoseconds.
      3. Fetch the four label columns for the new timestamps through
         ``nearest_label_join_1d`` with a tolerance of half a target frame,
         then forward/backward fill whatever is still missing.
      4. Add all-NaN float32 gyro columns (no RWR gyro in OPP), cast dtypes,
         and reorder the columns to SCHEMA["columns"].

    Returns an empty frame carrying the schema columns when ``g`` is empty.
    NOTE(review): the exact interpolation / matching rules live in the two
    helpers, which are not visible from this notebook.
    """
    schema_cols = [col["name"] for col in SCHEMA["columns"]]
    if g.empty:
        return pd.DataFrame(columns=schema_cols)

    acc_cols = ["acc_x", "acc_y", "acc_z"]
    gyro_cols = ["gyro_x", "gyro_y", "gyro_z"]
    label_cols = [
        "global_activity_id", "global_activity_label",
        "dataset_activity_id", "dataset_activity_label",
    ]

    # mergesort is stable, so rows sharing a timestamp keep their input order.
    ordered = g.sort_values("timestamp_ns", kind="mergesort").reset_index(drop=True)

    # --- accel: resample on a seconds axis ---
    acc_in = ordered[["timestamp_ns"] + acc_cols].copy()
    acc_in["timestamp_s"] = acc_in["timestamp_ns"].to_numpy(dtype=np.float64) / 1e9
    resampled = upsample_df_rate(
        df=acc_in[["timestamp_s"] + acc_cols],
        tcol="timestamp_s",
        num_cols=acc_cols,
        src_hz=30.0,           # helper ignores this if it infers rate; safe placeholder
        dst_hz=TARGET_HZ,      # 50
    )
    resampled["timestamp_ns"] = (resampled["timestamp_s"] * 1e9).round().astype("int64")

    # --- labels: look up on the resampled ns timestamps ---
    half_frame_ns = int(1e9 / (2 * TARGET_HZ))  # 10 ms at 50 Hz
    label_src = ordered[["timestamp_ns"] + label_cols].copy()
    labels = nearest_label_join_1d(
        src_ts_ns=label_src["timestamp_ns"].to_numpy(),
        src_label_df=label_src[label_cols].copy(),
        target_ts_ns=resampled["timestamp_ns"].to_numpy(),
        half_frame_ns=half_frame_ns,
    )
    # Targets left without a label by the join inherit a neighbour's label.
    labels = labels.ffill().bfill()

    # --- assemble: identity columns are constant within a group ---
    identity = {
        col: ordered[col].iloc[0]
        for col in ("dataset", "subject_id", "session_id")
    }
    out = pd.DataFrame(identity, index=resampled.index)

    out["timestamp_ns"] = resampled["timestamp_ns"].astype("int64")
    out[acc_cols] = resampled[acc_cols].astype("float32")

    for col in gyro_cols:
        out[col] = np.nan
    out[gyro_cols] = out[gyro_cols].astype("float32")

    label_dtypes = {
        "global_activity_id":    "int16",
        "global_activity_label": "string",
        "dataset_activity_id":   "Int32",
        "dataset_activity_label":"string",
    }
    out[label_cols] = labels.astype(label_dtypes)

    # --- schema order, then make the identity columns non-null strings ---
    out = out[schema_cols]
    fallbacks = (("dataset", "opportunity++"), ("subject_id", "S?"), ("session_id", "UNK"))
    for col, fallback in fallbacks:
        out[col] = out[col].astype("string").fillna(fallback)
    return out

# Run resampling per (subject, session)
groups = ["subject_id","session_id"]
out_chunks = []
for (_sid, _sess), g in tqdm(opp_df_native.groupby(groups, sort=False), desc="Resampling → 50Hz"):
    out_chunks.append(_resample_group_to_50hz(g))

opp_50 = pd.concat(out_chunks, ignore_index=True)
print("Rows @50Hz:", len(opp_50))

# ---- QA contracts ----
print("\n=== QA ===")
# "Sessions" counts distinct session names, which repeat across subjects; the
# number of (subject, session) streams actually resampled is reported as well.
n_streams = opp_50.groupby(groups, sort=False).ngroups
print("Subjects:", opp_50["subject_id"].nunique(), "| Sessions:", opp_50["session_id"].nunique(),
      "| (subject, session) groups:", n_streams)

# Monotonic per group (non-decreasing timestamps; equal neighbours are allowed)
viol = 0
for (_sid, _sess), g in opp_50.groupby(groups, sort=False):
    ts = g["timestamp_ns"].to_numpy()
    if ts.size and not np.all(np.diff(ts) >= 0):
        viol += 1
print("Monotonic violations (groups):", viol)

def est_hz_ns(ts_ns: pd.Series):
    """Median of 1/dt (Hz) over strictly positive, finite steps of an ns timestamp series.

    Returns NaN for fewer than three samples or when no usable step remains.
    """
    arr = ts_ns.to_numpy()
    if arr.size < 3: return np.nan
    dt = np.diff(arr) / 1e9
    dt = dt[(dt > 0) & np.isfinite(dt)]
    return float(np.median(1.0/dt)) if dt.size else np.nan

hz = opp_50.groupby(groups)["timestamp_ns"].apply(est_hz_ns)
# Compare against TARGET_HZ (the rate used for resampling, defaulted in STEP 0)
# rather than indexing SCHEMA['rate_hz'], which raises KeyError if the key is absent.
print(f"Median Hz: {np.nanmedian(hz.values):.2f} (target={TARGET_HZ})")

# Share of rows where every schema-required column is non-null.
req = SCHEMA["expectations"]["required_not_null"]
pct = opp_50[req].notnull().all(axis=1).mean() * 100
print(f"Rows meeting required-not-null: {pct:.2f}%")

cov = (opp_50["global_activity_id"] != UNKNOWN_ID).mean() * 100
print(f"Global mapping coverage: {cov:.1f}% (unknown={UNKNOWN_ID})")

print("\nTop-15 canonical labels:")
print(opp_50["global_activity_label"].value_counts().head(15))


Resampling → 50Hz: 100%|██████████| 23/23 [00:00<00:00, 26.74it/s]


Rows @50Hz: 1215455

=== QA ===
Subjects: 4 | Sessions: 6
Monotonic violations (groups): 0
Median Hz: 50.00 (target=50)
Rows meeting required-not-null: 100.00%
Global mapping coverage: 100.0% (unknown=9000)

Top-15 canonical labels:
global_activity_label
posture_stationary       519055
walk                     350256
adl_appliance_ops        245368
adl_food                  77720
adl_household_general     23056
Name: count, dtype: Int64


### Step 5. Save outputs

In [6]:
# Optional save: write the 50 Hz frame to a Parquet file under CLEANED.
out_path = CLEANED / "opportunity_clean_data.parquet"
# Create the output directory (and any missing parents) if it does not exist yet.
out_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
opp_50.to_parquet(out_path, index=False)
print("Saved →", out_path)


Saved → /home/aidan/IMU_LM_Data/data/cleaned_premerge/opportunity_clean_data.parquet
