# Human Activity Recognition with Hidden Markov Models

This notebook builds a compact end-to-end pipeline to recognize four activities — **standing, walking, jumping, still** — from smartphone inertial data.  



In [1]:
# Imports & setup (single cell)
import os
import re
import sys
from pathlib import Path

import pandas as pd
from IPython.display import display

# project root so we can import our simple modules
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if ROOT not in sys.path:
    sys.path.append(ROOT)

# simple modules (functions only)
from src.config import TARGET_HZ, EDGE_TRIM_SEC, MERGE_TOL_SEC
from src.data_loader import unpack_and_clean_dir



## 1 — Data Acquisition & Cleaning

### 1.1 — Inventory & naming convention
We first list the zipped recordings and verify the expected pattern:  
`(setA|setB)_<Activity>_<Index>-YYYY-MM-DD_HH-MM-SS.zip`  
Activities expected: **standing, walking, jumping, still**.


In [3]:
train_raw = os.path.join(ROOT, "data", "raw", "train")
test_raw  = os.path.join(ROOT, "data", "raw", "test")

train_zips = sorted([os.path.join(train_raw, f) for f in os.listdir(train_raw) if f.endswith(".zip")])
test_zips  = sorted([os.path.join(test_raw,  f) for f in os.listdir(test_raw)  if f.endswith(".zip")])

print(f"Found {len(train_zips)} train zips and {len(test_zips)} test zips.")
print("Train sample:", [os.path.basename(p) for p in train_zips[:3]])
print("Test sample:",  [os.path.basename(p) for p in test_zips[:3]])


Found 52 train zips and 12 test zips.
Train sample: ['setA_Jumping_1-2025-10-25_13-52-04.zip', 'setA_Jumping_10-2025-10-25_14-00-15.zip', 'setA_Jumping_11-2025-10-25_14-06-37.zip']
Test sample: ['setB_Jumping_1-2025-10-24_15-15-26.zip', 'setB_Jumping_2-2025-10-24_15-16-46.zip', 'setB_Jumping_3-2025-10-24_15-18-46.zip']


### 1.2 — Unpack & standardize sensor files
For each zip:
1) Read accelerometer and gyroscope CSVs  
2) Normalize the time axis to seconds-from-start and axis names to x/y/z  
3) Drop duplicate timestamps, sort by time


In [4]:
train_out = os.path.join(ROOT, "data", "processed", "train")
test_out  = os.path.join(ROOT, "data", "processed", "test")

print("Cleaning TRAIN…")
unpack_and_clean_dir(train_raw, train_out)

print("Cleaning TEST…")
unpack_and_clean_dir(test_raw, test_out)


Cleaning TRAIN…
Cleaning TEST…


In [5]:
clean_train = sorted([os.path.join(train_out, f) for f in os.listdir(train_out) if f.endswith("_cleaned.csv")])
clean_test  = sorted([os.path.join(test_out,  f) for f in os.listdir(test_out)  if f.endswith("_cleaned.csv")])

print(f"Cleaned CSVs → train: {len(clean_train)}, test: {len(clean_test)}")


Cleaned CSVs → train: 52, test: 12


### 1.3 — Harmonize sampling rate
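Phones may log at slightly different rates, so the cleaning step resamples every stream to a **uniform 100 Hz** grid; this keeps windows and features comparable across recordings and devices. The cell below spot-checks the empirical rate of a few cleaned files.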


In [7]:
def estimate_hz(time_s_series):
    t = time_s_series.to_numpy()
    if len(t) < 2: 
        return float("nan")
    dt = t[1:] - t[:-1]
    dt = dt[dt > 0]
    return float(1.0 / pd.Series(dt).median()) if len(dt) else float("nan")

samples = (clean_train[:2] + clean_test[:2])[:4]
for p in samples:
    df = pd.read_csv(p, usecols=["time_s"])
    hz = estimate_hz(df["time_s"])
    print(f"{os.path.basename(p):50s}  ~{hz:.1f} Hz (target={TARGET_HZ})")


setA_Jumping_1-2025-10-25_13-52-04_cleaned.csv      ~100.0 Hz (target=100)
setA_Jumping_10-2025-10-25_14-00-15_cleaned.csv     ~100.0 Hz (target=100)
setB_Jumping_1-2025-10-24_15-15-26_cleaned.csv      ~100.0 Hz (target=100)
setB_Jumping_2-2025-10-24_15-16-46_cleaned.csv      ~100.0 Hz (target=100)


### 1.4 — Merge sensors & trim edges
We align accelerometer and gyroscope samples by nearest timestamp (≈10 ms tolerance), then trim **1.0 s** from both ends to remove the “start/stop” noise introduced while starting and stopping the recording. The cell below checks the result on one cleaned file.


In [8]:
if clean_train:
    df = pd.read_csv(clean_train[0])
    first_t = float(df["time_s"].iloc[0])
    last_t  = float(df["time_s"].iloc[-1])
    have_cols = all(c in df.columns for c in ["ax","ay","az","gx","gy","gz"])
    print(f"First time_s: {first_t:.3f} s  (expected ≈ {EDGE_TRIM_SEC} s)")
    print(f"Last  time_s: {last_t:.3f} s")
    print("Accel+Gyro columns present:", have_cols)


First time_s: 1.000 s  (expected ≈ 1.0 s)
Last  time_s: 13.570 s
Accel+Gyro columns present: True


### 1.5 — Save cleaned recordings
We verify the number of cleaned files per split and preview the first few rows of one cleaned CSV to confirm schema and values.


In [9]:
print(f"Cleaned CSVs → train: {len(clean_train)}, test: {len(clean_test)}")
if clean_train:
    ex = clean_train[0]
    print("Train example file:", os.path.basename(ex))
    df_preview = pd.read_csv(ex)
    display(df_preview.head(8))
    print("Columns:", list(df_preview.columns))


Cleaned CSVs → train: 52, test: 12
Train example file: setA_Jumping_1-2025-10-25_13-52-04_cleaned.csv


Unnamed: 0,time_s,ax,ay,az,gx,gy,gz,activity,split,recording_id
0,1.0,3.564852,6.340639,-7.314131,0.511791,-0.33063,1.921244,jumping,train,1
1,1.01,1.68629,6.790849,-5.589526,0.364654,0.261083,1.233521,jumping,train,1
2,1.02,1.019344,8.09629,-5.028453,0.346601,1.151401,0.545515,jumping,train,1
3,1.03,0.901354,9.388726,-4.746779,0.034873,1.412567,0.230657,jumping,train,1
4,1.04,3.42471,10.618591,-5.809456,-0.562286,1.321471,0.069513,jumping,train,1
5,1.05,5.676328,11.574267,-6.654969,-1.084602,1.027379,-0.239417,jumping,train,1
6,1.06,6.465978,11.054787,-6.332126,-1.266396,0.718832,-0.615428,jumping,train,1
7,1.07,7.096062,10.488675,-5.967803,-1.324868,0.37,-0.920295,jumping,train,1


Columns: ['time_s', 'ax', 'ay', 'az', 'gx', 'gy', 'gz', 'activity', 'split', 'recording_id']


## 2 — Device & Sampling Metadata

### 2.1 — Device table (fill in and display)
We record the devices used and their intended sampling rates.  
Fill the rows below (add/remove as needed), then we’ll verify the **empirical** rates from the cleaned files.


In [10]:
device_rows = [
    {"member": "Christian B.", "phone": "Android Pixel", "target_hz": TARGET_HZ, "notes": "Front pocket (screen out, upside-down)"},
    {"member": "Reponse I.", "phone": "Iphone 15", "target_hz": TARGET_HZ, "notes": "Back pocket (screen in, upright)"},
]
device_df = pd.DataFrame(device_rows, columns=["member","phone","target_hz","notes"])
display(device_df)


Unnamed: 0,member,phone,target_hz,notes
0,Christian B.,Android Pixel,100,"Front pocket (screen out, upside-down)"
1,Reponse I.,Iphone 15,100,"Back pocket (screen in, upright)"


### 2.2 — Harmonization plan
Phones can log slightly above/below the target rate.  
We resample each stream to a **uniform 100 Hz** grid during cleaning so windows/features are comparable across recordings and devices.  
Next, we verify this by **estimating the empirical Hz per cleaned file**.

### 2.3 — Empirical sampling rate per file
We compute an empirical sampling rate for each cleaned file (the reciprocal of the median Δt) and summarize by split/activity.


In [11]:
def estimate_hz_from_csv(path_csv):
    s = pd.read_csv(path_csv, usecols=["time_s"])
    t = s["time_s"].to_numpy()
    if len(t) < 2:
        return float("nan")
    dt = t[1:] - t[:-1]
    dt = dt[dt > 0]
    return float(1.0 / pd.Series(dt).median()) if len(dt) else float("nan")

def read_activity_quick(path_csv):
    s = pd.read_csv(path_csv, usecols=["activity","split"], nrows=1)
    return s["split"].iloc[0], s["activity"].iloc[0]


In [12]:
rows = []
for p in clean_train + clean_test:
    split, activity = read_activity_quick(p)
    hz = estimate_hz_from_csv(p)
    rows.append({"file": os.path.basename(p), "split": split, "activity": activity, "empirical_hz": round(hz, 2)})

hz_df = pd.DataFrame(rows).sort_values(["split","activity","file"]).reset_index(drop=True)
display(hz_df.head(10))

Unnamed: 0,file,split,activity,empirical_hz
0,setB_Jumping_1-2025-10-24_15-15-26_cleaned.csv,test,jumping,100.0
1,setB_Jumping_2-2025-10-24_15-16-46_cleaned.csv,test,jumping,100.0
2,setB_Jumping_3-2025-10-24_15-18-46_cleaned.csv,test,jumping,100.0
3,setB_Standing_1-2025-10-26_13-29-15_cleaned.csv,test,standing,100.0
4,setB_Standing_2-2025-10-26_13-28-55_cleaned.csv,test,standing,100.0
5,setB_Standing_3-2025-10-26_13-28-35_cleaned.csv,test,standing,100.0
6,setB_Still_1-2025-10-26_13-27-04_cleaned.csv,test,still,100.0
7,setB_Still_2-2025-10-26_13-26-43_cleaned.csv,test,still,100.0
8,setB_Still_3-2025-10-26_13-26-24_cleaned.csv,test,still,100.0
9,setB_Walking_1-2025-10-24_15-11-38_cleaned.csv,test,walking,100.0


### 2.4 — Summary by split and activity
We expect empirical rates to cluster tightly around the target (≈100 Hz) for **both** train and test, across all activities.


In [13]:
summary = (hz_df
           .groupby(["split","activity"], as_index=False)
           .agg(n=("empirical_hz","size"),
                hz_median=("empirical_hz","median"),
                hz_min=("empirical_hz","min"),
                hz_max=("empirical_hz","max")))
display(summary)

Unnamed: 0,split,activity,n,hz_median,hz_min,hz_max
0,test,jumping,3,100.0,100.0,100.0
1,test,standing,3,100.0,100.0,100.0
2,test,still,3,100.0,100.0,100.0
3,test,walking,3,100.0,100.0,100.0
4,train,jumping,13,100.0,100.0,100.0
5,train,standing,13,100.0,100.0,100.0
6,train,still,13,100.0,100.0,100.0
7,train,walking,13,100.0,100.0,100.0
