## RecoFit Data Preprocessing notebook
### Step 0. Setup the paths and env variables

In [1]:
from pathlib import Path
import json, sys, os
ROOT = Path("/home/aidan/IMU_LM_Data")
sys.path.insert(0, str(ROOT)) 
print(f"Project root: {ROOT}")
from tqdm import tqdm
from UTILS.helpers import resample_df, convert_unit, zscore_normalize, normalize_str, keyize, _keyize
import numpy as np
import pandas as pd
from scipy.io import loadmat

BASE    = ROOT / "data"
RAW     = BASE / "raw_data" / "RecoFit" / "Exercise-Recognition-from-Wearable-Sensors"
CLEANED = BASE / "cleaned_premerge"
MERGED  = BASE / "merged_dataset"
SCHEMA_PATH       = ROOT / "Unification" / "schemas" / "continuous_stream_schema.json"
ACTIVITY_MAP_PATH = ROOT / "Unification" / "schemas" / "activity_mapping.json"
SCHEMA       = json.loads(SCHEMA_PATH.read_text())
ACT_MAP_FULL = json.loads(ACTIVITY_MAP_PATH.read_text())
UNKNOWN_ID = int(ACT_MAP_FULL.get("unknown_activity_id", -1))
ID2NAME    = {int(x["id"]): x["name"] for x in ACT_MAP_FULL["label_set"]}
RAW2ID     = {_keyize(k): int(v) for k, v in ACT_MAP_FULL.get("mapping", {}).items()}

print("Paths & contracts ready.")
print(f"Schema keys: {list(SCHEMA.keys())}")
print(RAW)
print(CLEANED)
print(MERGED)

Project root: /home/aidan/IMU_LM_Data
Paths & contracts ready.
Schema keys: ['name', 'version', 'primary_index', 'description', 'columns', 'rate_hz', 'axis_frame', 'unit_contract', 'unknown_activity_id', 'expectations']
/home/aidan/IMU_LM_Data/data/raw_data/RecoFit/Exercise-Recognition-from-Wearable-Sensors
/home/aidan/IMU_LM_Data/data/cleaned_premerge
/home/aidan/IMU_LM_Data/data/merged_dataset


### Step 1. Ingest, preporccess and map the data 

In [None]:
def preprocess_recofit_imu(imu: np.ndarray, shift_time: bool=False,
                           kind: str|None="acc", normalize_imu: bool=False) -> np.ndarray:
    imu = imu[np.argsort(imu[:, 0])]
    if shift_time:
        imu[:, 0] -= imu[0, 0]
    if kind is not None:
        imu[:, 1:4] = convert_unit(imu[:, 1:4], kind=kind)  # g→m/s², dps→rad/s
    if normalize_imu:
        imu[:, 1:4] = zscore_normalize(imu[:, 1:4])
    return imu

def load_recofit_raw(dataset_path: Path) -> pd.DataFrame:
    mat = loadmat(dataset_path / "exercise_data.50.0000_singleonly.mat")
    subject_data = mat["subject_data"]

    rows = []
    for i in tqdm(range(subject_data.shape[0]), desc="subjects"):
        for j in range(subject_data.shape[1]):
            cell = subject_data[i, j]
            if not isinstance(cell, np.ndarray) or cell.size == 0:
                continue
            entry = cell[0][0]

            activity_index = int(entry[1][0][0])   # native per-dataset ID
            raw_label      = str(entry[5][0])      # native label (verbatim)

            sensor   = entry[14][0][0]
            acc_raw  = sensor[0]
            gyro_raw = sensor[1]

            acc  = preprocess_recofit_imu(acc_raw,  shift_time=False, kind="acc",  normalize_imu=False)
            gyro = preprocess_recofit_imu(gyro_raw, shift_time=False, kind="gyro", normalize_imu=False)
            n = min(acc.shape[0], gyro.shape[0])
            sid, sess = f"S{i:02d}", f"Trial_{j:02d}"

            rows.extend({
                "subject_id": sid,
                "session_id": sess,
                "timestamp_s": float(acc[k, 0]),
                "acc_x": float(acc[k, 1]), "acc_y": float(acc[k, 2]), "acc_z": float(acc[k, 3]),
                "gyro_x": float(gyro[k, 1]), "gyro_y": float(gyro[k, 2]), "gyro_z": float(gyro[k, 3]),
                "activity_label_raw": raw_label,
                "dataset_activity_id": activity_index,   # NEW (native, stable)
            } for k in range(n))

    df = pd.DataFrame(rows)
    print("\n=== RAW SUMMARY ===")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} cols")  # dimensions

    if len(df):
        # Avoid FutureWarning by selecting the series before apply
        def est_hz(ts: pd.Series):
            arr = ts.to_numpy()
            if arr.size < 3: return np.nan
            dt = np.diff(arr)
            dt = dt[(dt > 0) & np.isfinite(dt)]
            return float(np.median(1.0 / dt)) if dt.size else np.nan

        def sess_dur(ts: pd.Series):
            arr = ts.to_numpy()
            return float(arr[-1] - arr[0]) if arr.size > 1 else 0.0

        hz = df.groupby(["subject_id", "session_id"])["timestamp_s"].apply(est_hz)
        dur = df.groupby(["subject_id", "session_id"])["timestamp_s"].apply(sess_dur)

        n_subjects = df["subject_id"].nunique()
        n_sessions = df["session_id"].nunique()

        # total *raw* classes (case-insensitive)
        n_classes = df["activity_label_raw"].str.lower().nunique()

        print(f"Subjects: {n_subjects} | Sessions: {n_sessions} | Raw classes: {n_classes}")
        print(f"Median Hz across sessions: {np.nanmedian(hz.values):.2f}")
        print(f"Median session duration (s): {np.nanmedian(dur.values):.2f}")

        # top raw labels
        top = df["activity_label_raw"].value_counts().head(15)
        print("\nTop-15 raw labels:")
        for lbl, cnt in top.items():
            print(f"  {lbl:45s} {cnt:,}")

    return df

raw_df = load_recofit_raw(RAW)


subjects: 100%|██████████| 94/94 [00:07<00:00, 12.62it/s]



=== RAW SUMMARY ===
Shape: 7,751,906 rows × 11 cols
Subjects: 94 | Sessions: 73 | Raw classes: 73
Median Hz across sessions: 50.00
Median session duration (s): 46.47

Top-15 raw labels:
  Device on Table                               1,301,735
  Walk                                          608,204
  Static stretch                                451,196
  Static Stretch (at your own pace)             313,323
  Running (treadmill)                           291,569
  Dynamic Stretch (at your own pace)            284,271
  Elliptical machine                            275,296
  Rowing machine                                267,743
  Plank                                         139,039
  Lunge (alternating both legs, weight optional) 131,904
  Butterfly Sit-up                              131,310
  Squat (arms in front of body, parallel to ground) 116,623
  Non-Exercise                                  116,307
  Burpee                                        114,438
  Triceps Kickback (kn

### Step 2. Map the data and audit the mapping

In [3]:
# Build an audit table: raw_label -> mapped_id -> mapped_name, with counts
# ---- quick audit (no files) ----
raw_counts = (
    raw_df["activity_label_raw"]
      .astype(str).map(_keyize)          # <— normalize BEFORE counting
      .value_counts()
      .rename_axis("raw_label")
      .reset_index(name="count")
)
raw_counts["mapped_id"] = raw_counts["raw_label"].map(RAW2ID).fillna(UNKNOWN_ID).astype(int)
raw_counts["mapped_nm"] = raw_counts["mapped_id"].map(lambda x: ID2NAME.get(int(x), "other"))

unmapped = raw_counts.loc[raw_counts["mapped_id"] == UNKNOWN_ID]
print(f"Raw label unique: {len(raw_counts)} | Unmapped: {len(unmapped)}")
print("Unmapped (top-10):")
print(unmapped.nlargest(10, "count")[["raw_label","count"]].to_string(index=False))



Raw label unique: 73 | Unmapped: 1
Unmapped (top-10):
          raw_label  count
arm band adjustment  30194


### Step 3. Build and clean dataset in stream json fromat

In [None]:
def to_continuous_stream(df_raw: pd.DataFrame, dataset_name: str="recofit") -> pd.DataFrame:
    if df_raw.empty:
        return pd.DataFrame(columns=[c["name"] for c in SCHEMA["columns"]])

    # ---- GLOBALS: map using activity_mapping.json + normalizer
    raw_key = df_raw["activity_label_raw"].astype(str).map(_keyize)
    gid = raw_key.map(RAW2ID).fillna(UNKNOWN_ID).astype("int16")
    glabel = gid.map(lambda x: ID2NAME.get(int(x), "other")).astype("string")

    # ---- NATIVE: keep 1:1 with dataset
    native_id  = df_raw["dataset_activity_id"].astype("Int16")
    native_lbl = df_raw["activity_label_raw"].astype("string")

    out = pd.DataFrame({
        "dataset":        dataset_name,
        "subject_id":     df_raw["subject_id"].astype("string"),
        "session_id":     df_raw["session_id"].astype("string"),
        "timestamp_ns":   (df_raw["timestamp_s"].astype(np.float64) * 1e9).round().astype("int64"),

        "acc_x": df_raw["acc_x"].astype("float32"),
        "acc_y": df_raw["acc_y"].astype("float32"),
        "acc_z": df_raw["acc_z"].astype("float32"),
        "gyro_x": df_raw["gyro_x"].astype("float32"),
        "gyro_y": df_raw["gyro_y"].astype("float32"),
        "gyro_z": df_raw["gyro_z"].astype("float32"),

        "global_activity_id":    gid,
        "global_activity_label": glabel,

        "dataset_activity_id":    native_id,
        "dataset_activity_label": native_lbl,
    })

    order = [c["name"] for c in SCHEMA["columns"]]
    return out[order]

# build the unified frame
recofit_df = to_continuous_stream(raw_df, dataset_name="recofit")


### Step 4. Audit check the unified frame

In [2]:
print("UNIFIED rows:", len(recofit_df))
print("Subjects:", recofit_df["subject_id"].nunique(), "| Sessions:", recofit_df["session_id"].nunique())

# monotonic per (subject,session)
viol = 0
for (_sid, _sess), g in recofit_df.groupby(["subject_id","session_id"], sort=False):
    ts = g["timestamp_ns"].to_numpy()
    if ts.size and not np.all(np.diff(ts) >= 0):
        viol += 1
print("Monotonic violations (groups):", viol)

# approximate Hz (ns series)
def est_hz_ns(ts_ns: pd.Series):
    arr = ts_ns.to_numpy()
    if arr.size < 3: return np.nan
    dt = np.diff(arr) / 1e9  # ns -> s
    dt = dt[(dt > 0) & np.isfinite(dt)]
    return float(np.median(1.0 / dt)) if dt.size else np.nan

hz = recofit_df.groupby(["subject_id","session_id"])["timestamp_ns"].apply(est_hz_ns)
print(f"Median Hz: {np.nanmedian(hz.values):.2f} (target={SCHEMA['rate_hz']})")

# required-not-null check
req = SCHEMA["expectations"]["required_not_null"]
pct = recofit_df[req].notnull().all(axis=1).mean() * 100
print(f"Rows meeting required-not-null: {pct:.2f}%")

print("Top-10 canonical labels:")
print(recofit_df["global_activity_label"].value_counts().head(10))
cov = (recofit_df["global_activity_id"] != UNKNOWN_ID).mean()*100
print(f"Global mapping coverage: {cov:.1f}% (unknown={UNKNOWN_ID})")
print(recofit_df["global_activity_label"].value_counts().head(15))
recofit_df


NameError: name 'recofit_df' is not defined

### Step 5. Save outputs

In [11]:
# Save unified RecoFit frame for later merge
CLEANED.mkdir(parents=True, exist_ok=True)
recofit_df.to_parquet(CLEANED / "recofit_clean_data.parquet", index=False)