## PAMAP2 Data Preprocessing notebook
### Step 0. Set up the paths and environment variables

In [1]:
# =========================
# STEP 0 — Setup & contracts
# =========================
import json, sys
import os
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# Project root. Set the IMU_LM_DATA_ROOT environment variable to run this notebook on
# another machine; without it the original location is used.
ROOT = Path(os.environ.get("IMU_LM_DATA_ROOT", "/home/aidan/IMU_LM_Data"))
if str(ROOT) not in sys.path:   # re-running the cell must not stack duplicate entries
    sys.path.insert(0, str(ROOT))
from UTILS.helpers import (
    resample_df,            # decimate FIR-based: (df, target_cols, factor)
    convert_unit,           # (arr, kind="acc"|"gyro")
    zscore_normalize,
    normalize_str, keyize, _keyize
)

BASE    = ROOT / "data"
RAW     = BASE / "raw_data" / "PAMAP2" / "PAMAP2_Dataset"      # <— assumes .../data/raw_data/PAMAP2/PAMAP2_Dataset/{Protocol,Optional}/subject10X.dat
CLEANED = BASE / "cleaned_premerge"
MERGED  = BASE / "merged_dataset"

SCHEMA_PATH       = ROOT / "Unification" / "schemas" / "continuous_stream_schema.json"
ACTIVITY_MAP_PATH = ROOT / "Unification" / "schemas" / "activity_mapping.json"

# Read the JSON contracts with an explicit encoding so parsing does not depend on the locale.
SCHEMA       = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
ACT_MAP_FULL = json.loads(ACTIVITY_MAP_PATH.read_text(encoding="utf-8"))

# Lookup tables derived from activity_mapping.json:
#   UNKNOWN_ID : global id assigned to labels with no mapping (9000 if the file omits it)
#   ID2NAME    : global id -> global label name
#   RAW2ID     : keyized raw label -> global id
UNKNOWN_ID = int(ACT_MAP_FULL.get("unknown_activity_id", 9000))
ID2NAME    = {int(x["id"]): x["name"] for x in ACT_MAP_FULL["label_set"]}
RAW2ID     = {_keyize(k): int(v) for k, v in ACT_MAP_FULL.get("mapping", {}).items()}

print("Paths & contracts ready.")
print(f"Schema keys: {list(SCHEMA.keys())}")
print("RAW dir :", RAW)
print("CLEANED :", CLEANED)
print("MERGED  :", MERGED)

# Native PAMAP2 activityID → verbatim label (per the PAMAP2 specification)
PAMAP2_ACTIVITY_LABELS = {
    1: "lying",
    2: "sitting",
    3: "standing",
    4: "walking",
    5: "running",
    6: "cycling",
    7: "nordic_walking",
    9: "watching_tv",
    10: "computer_work",
    11: "car_driving",
    12: "ascending_stairs",
    13: "descending_stairs",
    16: "vacuum_cleaning",
    17: "ironing",
    18: "folding_laundry",
    19: "house_cleaning",
    20: "playing_soccer",
    24: "rope_jumping",
    0: "other",  # rows with activityID 0 are dropped during loading, as mentioned in the PAMAP2 documentation
}


Paths & contracts ready.
Schema keys: ['name', 'version', 'primary_index', 'description', 'columns', 'rate_hz', 'axis_frame', 'unit_contract', 'unknown_activity_id', 'expectations']
RAW dir : /home/aidan/IMU_LM_Data/data/raw_data/PAMAP2/PAMAP2_Dataset
CLEANED : /home/aidan/IMU_LM_Data/data/cleaned_premerge
MERGED  : /home/aidan/IMU_LM_Data/data/merged_dataset


### Step 1. Ingest, preprocess and map the data

In [2]:
# ==========================================
# STEP 1 — Load PAMAP2 (hand IMU), normalize
# ==========================================
from typing import List

def _collect_pamap2_files(raw_dir: Path, include_optional: bool) -> List[Path]:
    prot = sorted((raw_dir / "Protocol").glob("subject10*.dat"))
    opt  = sorted((raw_dir / "Optional").glob("subject10*.dat")) if include_optional and (raw_dir / "Optional").exists() else []
    files = prot + opt
    print(f"[PAMAP2] Found {len(prot)} Protocol files, {len(opt)} Optional files, total={len(files)}")
    return files

def load_pamap2_raw(
    raw_dir: Path,
    include_optional: bool = True,          # <- default to True
    downsample_to_50hz: bool = True
) -> pd.DataFrame:
    """
    Load the hand (wrist) IMU channels from every PAMAP2 subject file under ``raw_dir``.

    Per file: rows are sorted by timestamp, rows whose activityID is 0 or missing
    are dropped, the hand accelerometer/gyroscope columns are renamed to the
    unified channel names, and the six channels are optionally decimated by 2
    via ``resample_df``. Files left with no rows after filtering are skipped.
    A short summary (shape, median sampling rate, label counts) is printed.

    Parameters
    ----------
    raw_dir : folder containing ``Protocol`` (and optionally ``Optional``) with ``subject10X.dat`` files.
    include_optional : also read the ``Optional`` folder when it exists.
    downsample_to_50hz : when True, call ``resample_df(df, target_cols=..., factor=2)``.

    Returns
    -------
    A raw, wrist-only (hand IMU) frame with:
      subject_id (str), session_id (str), timestamp_s (float),
      acc_x/y/z (m/s^2), gyro_x/y/z (rad/s),
      dataset_activity_id (int), activity_label_raw (str)
    or an empty DataFrame when no files or no labelled rows are found.
    """
    files = _collect_pamap2_files(raw_dir, include_optional)
    if not files:
        print("No PAMAP2 .dat files found.")
        return pd.DataFrame()

    # Offsets below are relative to HAND_BASE, the first hand-IMU column (0-based).
    HAND_BASE = 3
    ACC_IDX   = [1, 2, 3]        # ±16g accel
    GYRO_IDX  = [7, 8, 9]

    sens = ["acc_x","acc_y","acc_z","gyro_x","gyro_y","gyro_z"]

    # Loop-invariant column bookkeeping, built once instead of once per file.
    all_names      = ["timestamp","activity_id","heart_rate"] + [f"col_{i}" for i in range(3,54)]
    hand_acc_cols  = [f"col_{HAND_BASE + i}" for i in ACC_IDX]
    hand_gyro_cols = [f"col_{HAND_BASE + i}" for i in GYRO_IDX]
    keep_cols      = ["timestamp","activity_id"] + hand_acc_cols + hand_gyro_cols
    # Rename to unified wrist channel names
    rename_map     = dict(zip(hand_acc_cols + hand_gyro_cols, sens))

    all_rows = []

    for f in tqdm(files, desc="PAMAP2 files"):
        # subject10X.dat -> subject ID; session by parent folder name
        subject_id_raw = f.stem.replace("subject", "")  # '101'...'109'
        session_id     = f.parent.name                  # 'Protocol' or 'Optional' (keeps provenance)

        df = pd.read_csv(f, sep=" ", header=None, names=all_names, na_values=["NaN"])

        # Sort, then drop unlabeled/transient rows (activity_id == 0 or missing).
        # A missing id cannot be converted by int() further down, so it is removed here.
        df = df.sort_values("timestamp")
        labelled = df["activity_id"].notna() & (df["activity_id"] != 0)
        df = df.loc[labelled, keep_cols].rename(columns=rename_map)
        if df.empty:
            # Nothing labelled in this file; do not hand an empty frame to the decimator.
            continue

        # Native fields
        df["dataset_activity_id"] = df["activity_id"].astype("Int64")
        df["activity_label_raw"]  = df["dataset_activity_id"].map(PAMAP2_ACTIVITY_LABELS).fillna("other")

        # Subject/session ids
        if subject_id_raw.isdigit():
            df["subject_id"] = f"S{int(subject_id_raw) - 100:02d}"
        else:
            df["subject_id"] = f"S{subject_id_raw}"
        df["session_id"] = session_id

        # 100 Hz → 50 Hz
        if downsample_to_50hz:
            df = resample_df(df, target_cols=list(sens), factor=2)

        df = df[[
            "subject_id","session_id","timestamp",
            "acc_x","acc_y","acc_z","gyro_x","gyro_y","gyro_z",
            "dataset_activity_id","activity_label_raw"
        ]].rename(columns={"timestamp":"timestamp_s"})

        all_rows.append(df)

    if not all_rows:
        return pd.DataFrame()

    raw = pd.concat(all_rows, ignore_index=True)

    # Drop rows where all sensors are NaN (rare after decimation)
    raw = raw.dropna(subset=sens, how="all").reset_index(drop=True)

    # Quick RAW SUMMARY
    print("\n=== RAW SUMMARY (PAMAP2 wrist) ===")
    print(f"Shape: {raw.shape[0]:,} rows × {raw.shape[1]} cols")
    def est_hz(ts: pd.Series):
        """Median of 1/dt over positive, finite timestamp steps (seconds); NaN for < 3 samples."""
        arr = ts.to_numpy()
        if arr.size < 3: return np.nan
        dt = np.diff(arr)
        dt = dt[(dt > 0) & np.isfinite(dt)]
        return float(np.median(1.0/dt)) if dt.size else np.nan
    hz = raw.groupby(["subject_id","session_id"])["timestamp_s"].apply(est_hz)
    print(f"Median Hz across sessions: {np.nanmedian(hz.values):.2f}")

    # Top activities (native)
    top = raw["activity_label_raw"].value_counts().head(20)
    print("\nTop activities (raw):")
    for lbl, cnt in top.items():
        print(f"  {lbl:20s} {cnt:,}")

    return raw
# Load Protocol + Optional recordings with the 50 Hz downsampling enabled, then preview.
raw_pamap2 = load_pamap2_raw(
    raw_dir=RAW,
    include_optional=True,
    downsample_to_50hz=True,
)
raw_pamap2.head(3)

[PAMAP2] Found 9 Protocol files, 5 Optional files, total=14


PAMAP2 files: 100%|██████████| 14/14 [00:46<00:00,  3.32s/it]



=== RAW SUMMARY (PAMAP2 wrist) ===
Shape: 1,271,148 rows × 11 cols
Median Hz across sessions: 50.00

Top activities (raw):
  computer_work        154,553
  ironing              117,281
  lying                95,990
  standing             93,553
  house_cleaning       92,438
  sitting              91,863
  vacuum_cleaning      86,522
  cycling              81,137
  nordic_walking       78,276
  walking              75,940
  ascending_stairs     58,207
  descending_stairs    52,145
  folding_laundry      49,237
  watching_tv          41,780
  running              37,161
  car_driving          27,091
  playing_soccer       19,614
  rope_jumping         18,360


Unnamed: 0,subject_id,session_id,timestamp_s,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,dataset_activity_id,activity_label_raw
0,S01,Protocol,37.66,1.669773,6.089435,4.234356,-0.037392,0.024959,-0.010789,1,lying
1,S01,Protocol,37.68,2.455081,7.799323,6.185947,-0.235491,0.019542,0.0028,1,lying
2,S01,Protocol,37.7,2.108338,6.903461,5.867963,-0.070572,-0.004246,0.008684,1,lying


### Step 2. Map the data and audit the mapping

In [4]:
# ============================================
# STEP 2 — Quick audit: raw_label → mapped_id
# ============================================
# Stop here when the loader produced nothing to audit.
if raw_pamap2.empty:
    raise SystemExit("No PAMAP2 rows after loading. Check RAW path/layout.")

# Row count per keyized raw label, plus the global id/name each label maps to.
# Labels absent from RAW2ID receive UNKNOWN_ID.
raw_counts = (
    raw_pamap2["activity_label_raw"]
      .astype(str)
      .map(_keyize)
      .value_counts()
      .rename_axis("raw_label")
      .reset_index(name="count")
      .assign(
          mapped_id=lambda t: t["raw_label"].map(RAW2ID).fillna(UNKNOWN_ID).astype(int),
          mapped_nm=lambda t: t["mapped_id"].map(lambda x: ID2NAME.get(int(x), "other")),
      )
)

unmapped = raw_counts[raw_counts["mapped_id"].eq(UNKNOWN_ID)]
print(f"Raw label unique: {len(raw_counts)} | Unmapped: {len(unmapped)}")
print("Unmapped (top-10):")
print(unmapped.nlargest(10, "count").loc[:, ["raw_label","count"]].to_string(index=False))
raw_counts.head(10)


Raw label unique: 18 | Unmapped: 2
Unmapped (top-10):
     raw_label  count
   car_driving  27091
playing_soccer  19614


Unnamed: 0,raw_label,count,mapped_id,mapped_nm
0,computer_work,154553,14,adl_desk_device
1,ironing,117281,13,adl_household_general
2,lying,95990,1,posture_stationary
3,standing,93553,1,posture_stationary
4,house_cleaning,92438,13,adl_household_general
5,sitting,91863,1,posture_stationary
6,vacuum_cleaning,86522,13,adl_household_general
7,cycling,81137,5,cycle
8,nordic_walking,78276,2,walk
9,walking,75940,2,walk


### Step 3. Build and clean the dataset in the stream JSON format

In [5]:
# =========================================================
# STEP 3 — Build schema-ordered continuous_stream (v3) df
# =========================================================
def to_continuous_stream_pamap2(df_raw: pd.DataFrame, dataset_name: str = "pamap2") -> pd.DataFrame:
    """
    Convert the raw PAMAP2 frame into the continuous-stream layout.

    Raw labels are keyized and looked up in RAW2ID (from activity_mapping.json);
    labels without a mapping get UNKNOWN_ID. Timestamps in seconds become integer
    nanoseconds, sensor channels are cast to float32, and the columns are returned
    in the order listed by SCHEMA["columns"]. An empty input yields an empty frame
    with the schema's column names.
    """
    if df_raw.empty:
        return pd.DataFrame(columns=[col["name"] for col in SCHEMA["columns"]])

    # GLOBAL mapping via activity_mapping.json
    label_keys    = df_raw["activity_label_raw"].astype(str).map(_keyize)
    global_ids    = label_keys.map(RAW2ID).fillna(UNKNOWN_ID).astype("int16")
    global_labels = global_ids.map(lambda gid: ID2NAME.get(int(gid), "other")).astype("string")

    # Native (dataset-specific) id and label, kept alongside the global ones
    native_ids    = df_raw["dataset_activity_id"].astype("Int16")
    native_labels = df_raw["activity_label_raw"].astype("string")

    fields = {
        "dataset":      dataset_name,
        "subject_id":   df_raw["subject_id"].astype("string"),
        "session_id":   df_raw["session_id"].astype("string"),
        # seconds -> nanoseconds, rounded to the nearest integer
        "timestamp_ns": (df_raw["timestamp_s"].astype(np.float64) * 1e9).round().astype("int64"),
    }
    for channel in ("acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z"):
        fields[channel] = df_raw[channel].astype("float32")
    fields["global_activity_id"]     = global_ids
    fields["global_activity_label"]  = global_labels
    fields["dataset_activity_id"]    = native_ids
    fields["dataset_activity_label"] = native_labels

    unified = pd.DataFrame(fields)
    schema_order = [col["name"] for col in SCHEMA["columns"]]
    return unified[schema_order]

# Build unified frame (NOTE: use raw_pamap2)
# Convert the loaded raw frame into the schema-ordered stream and preview it.
pamap2_df = to_continuous_stream_pamap2(df_raw=raw_pamap2, dataset_name="pamap2")
print("UNIFIED rows:", pamap2_df.shape[0])
pamap2_df.head(3)


UNIFIED rows: 1271148


Unnamed: 0,dataset,subject_id,session_id,timestamp_ns,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,global_activity_id,global_activity_label,dataset_activity_id,dataset_activity_label
0,pamap2,S01,Protocol,37660000000,1.669773,6.089435,4.234356,-0.037392,0.024959,-0.010789,1,posture_stationary,1,lying
1,pamap2,S01,Protocol,37680000000,2.455081,7.799323,6.185947,-0.235491,0.019542,0.0028,1,posture_stationary,1,lying
2,pamap2,S01,Protocol,37700000000,2.108338,6.903461,5.867963,-0.070572,-0.004246,0.008684,1,posture_stationary,1,lying


### Step 4. Audit check the unified frame

In [6]:
# ==========================================
# STEP 4 — Contract checks & quick QA
# ==========================================
print(
    "Subjects:", pamap2_df["subject_id"].nunique(),
    "| Sessions:", pamap2_df["session_id"].nunique(),
)

# Count the (subject, session) groups whose timestamps ever step backwards
# (equal consecutive timestamps are tolerated).
viol = sum(
    1
    for _key, g in pamap2_df.groupby(["subject_id","session_id"], sort=False)
    if g["timestamp_ns"].size and not np.all(np.diff(g["timestamp_ns"].to_numpy()) >= 0)
)
print("Monotonic violations (groups):", viol)

# Approx Hz from ns timestamps
def est_hz_ns(ts_ns: pd.Series):
    """
    Estimate a sampling rate in Hz from nanosecond timestamps.

    Returns the median of 1/dt over consecutive steps that are positive and
    finite, or NaN when there are fewer than three samples or no such step.
    """
    stamps = ts_ns.to_numpy()
    if stamps.size < 3:
        return np.nan

    steps_s = np.diff(stamps) / 1e9  # ns → s
    usable = steps_s[np.isfinite(steps_s) & (steps_s > 0)]
    if not usable.size:
        return np.nan
    return float(np.median(1.0 / usable))

# Per-(subject, session) rate estimate, summarised by its median.
hz = pamap2_df.groupby(["subject_id","session_id"])["timestamp_ns"].apply(est_hz_ns)
print(f"Median Hz: {np.nanmedian(hz.to_numpy()):.2f} (target={SCHEMA['rate_hz']})")

# Share of rows in which every schema-required column is populated
req = SCHEMA["expectations"]["required_not_null"]
pct = pamap2_df[req].notna().all(axis=1).mean() * 100
print(f"Rows meeting required-not-null: {pct:.2f}%")

# Share of rows whose label resolved to something other than UNKNOWN_ID
cov = pamap2_df["global_activity_id"].ne(UNKNOWN_ID).mean() * 100
print(f"Global mapping coverage: {cov:.1f}% (unknown={UNKNOWN_ID})")

print("\nTop-15 canonical labels:")
print(pamap2_df["global_activity_label"].value_counts().head(15))

Subjects: 9 | Sessions: 2
Monotonic violations (groups): 0
Median Hz: 50.00 (target=50)
Rows meeting required-not-null: 100.00%
Global mapping coverage: 96.3% (unknown=9000)

Top-15 canonical labels:
global_activity_label
adl_household_general    345478
posture_stationary       323186
adl_desk_device          154553
walk                     154216
stairs                   110352
cycle                     81137
other                     46705
run_jog                   37161
exercise_jump_rope        18360
Name: count, dtype: Int64


### Step 5. Save outputs

In [None]:
# Optional: write cleaned shard
# Write the unified frame as a parquet shard under CLEANED, creating the folder if needed.
out_path = CLEANED / "pamap2_clean_data.parquet"
out_path.parent.mkdir(parents=True, exist_ok=True)
pamap2_df.to_parquet(out_path, index=False)
print("Saved:", out_path)

Saved: /home/aidan/IMU_LM_Data/data/cleaned_premerge/pamap2_clean_data.parquet
