In [2]:
# MAKE THE DIRECTORIES FOR SAVING THE PKL DATASET IF THEY DO NOT EXIST
from pathlib import Path

# Resolve the output tree relative to the notebook's working directory.
WORKSPACE = Path.cwd()
PKL_DATASET_DIR = WORKSPACE.joinpath("pkl_dataset")
SUBJECTS = ["subj{:02d}".format(n) for n in range(1, 11)]

# Create the dataset root first, then one folder per subject.
# exist_ok=True makes re-running this cell harmless.
PKL_DATASET_DIR.mkdir(parents=True, exist_ok=True)
for subj in SUBJECTS:
    PKL_DATASET_DIR.joinpath(subj).mkdir(parents=True, exist_ok=True)

print(f"PKL_DATASET_DIR: {PKL_DATASET_DIR}")
print("Created subject subdirs: " + ", ".join(SUBJECTS))

PKL_DATASET_DIR: /home/connor/Documents/Development/Gesture-Recognition/Dataset Work/pkl_dataset
Created subject subdirs: subj01, subj02, subj03, subj04, subj05, subj06, subj07, subj08, subj09, subj10


In [1]:
from __future__ import annotations

import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path

import numpy as np
from scipy.io import loadmat

# Source (.mat) and destination (.pkl) trees, both relative to the working directory.
WORKSPACE = Path.cwd()
SRC_ROOT = WORKSPACE.joinpath('osfstorage-archive')
DST_ROOT = WORKSPACE.joinpath('pkl_dataset')

# Subject folder names: subj01 ... subj10.
SUBJECTS = ['subj{:02d}'.format(n) for n in range(1, 11)]

# Thread-pool size: capped at 12; falls back to 4 when the CPU count is unknown.
# Tune if you want (the work is an I/O + CPU mix).
MAX_WORKERS = min(12, os.cpu_count() or 4)

# When enabled, every pickle is loaded back once and compared structurally
# (keys/types/shape/dtype) instead of comparing full array contents.
VALIDATE = True

print(f'SRC_ROOT: {SRC_ROOT}')
print(f'DST_ROOT: {DST_ROOT}')
print(f'MAX_WORKERS: {MAX_WORKERS}')
print(f'VALIDATE: {VALIDATE}')

SRC_ROOT: /home/connor/Documents/Development/Gesture-Recognition/Dataset Work/osfstorage-archive
DST_ROOT: /home/connor/Documents/Development/Gesture-Recognition/Dataset Work/pkl_dataset
MAX_WORKERS: 12
VALIDATE: True


In [4]:
def discover_mat_files() -> list[Path]:
    """Collect every ``.mat`` file found beneath the subject folders of ``SRC_ROOT``.

    Subjects are visited in the order given by ``SUBJECTS``; within each
    subject the recursive matches are sorted before being appended.

    Returns:
        The list of discovered ``.mat`` paths (possibly empty).

    Raises:
        FileNotFoundError: If a subject folder does not exist under ``SRC_ROOT``.
    """
    found: list[Path] = []
    for name in SUBJECTS:
        folder = SRC_ROOT / name
        if folder.exists():
            found += sorted(folder.rglob('*.mat'))
            continue
        raise FileNotFoundError(f'Missing subject directory: {folder}')
    return found

# Discover the inputs once; the conversion cell iterates over `mat_files`.
mat_files = discover_mat_files()
print('Discovered .mat files:', len(mat_files))
print('Example:', mat_files[0] if mat_files else 'None')
if not mat_files:
    # An empty discovery makes every later cell a silent no-op, so say so loudly
    # instead of letting the run look successful.
    print(f'WARNING: no .mat files found under {SRC_ROOT}; nothing will be converted.')

Discovered .mat files: 0
Example: None


In [5]:
@dataclass
class ConvertResult:
    """Outcome record for the conversion of a single ``.mat`` file.

    One instance is produced per input file by ``convert_one`` and later
    aggregated for the progress and summary reports.
    """

    # Source .mat file that was converted.
    mat_path: Path
    # Destination pickle path associated with this conversion.
    pkl_path: Path
    # True when the conversion finished without an exception.
    ok: bool
    # Elapsed time of the conversion attempt in seconds (time.perf_counter delta).
    seconds: float
    # Description of the failure; None when the conversion succeeded.
    error: str | None = None

def dst_path_for(mat_path: Path) -> Path:
    """Map a source ``.mat`` path to its mirrored ``.pkl`` destination path.

    The input is expected to look like
    ``.../osfstorage-archive/subjXX/<optional subdirs>/file.mat``. Any nested
    subfolders below the subject folder are mirrored under
    ``DST_ROOT/subjXX/...`` and the file keeps its stem with a ``.pkl`` suffix.

    Args:
        mat_path: Path of the source ``.mat`` file.

    Returns:
        The destination pickle path below ``DST_ROOT``.

    Raises:
        ValueError: If the path has no ``osfstorage-archive`` component, has
            nothing after that component, or the component following it is
            not one of ``SUBJECTS``.
    """
    parts = mat_path.parts
    try:
        i = parts.index('osfstorage-archive')
    except ValueError:
        # `from None` hides the uninformative tuple.index() error in tracebacks.
        raise ValueError(f'Path is not under osfstorage-archive: {mat_path}') from None

    # Without this guard, parts[i + 1] would raise IndexError (not the
    # documented ValueError) when the archive folder is the last component.
    if i + 1 >= len(parts):
        raise ValueError(f'Path has no subject folder under osfstorage-archive: {mat_path}')

    subj = parts[i + 1]
    if subj not in SUBJECTS:
        raise ValueError(f'Not a supported subject folder: {subj} (path: {mat_path})')

    # Components between the subject folder and the file name; empty -> Path('.').
    rel_under_subj = Path(*parts[i + 2 : -1])
    out_dir = DST_ROOT / subj / rel_under_subj
    return out_dir / (mat_path.stem + '.pkl')

def lightweight_validate(mat_dict: dict, pkl_dict: dict, mat_path: Path) -> None:
    """Structurally compare the loaded MAT dict with its reloaded pickle.

    Only cheap properties are checked: both dicts must have the same keys,
    each value pair must have the exact same type, and NumPy arrays must
    agree in shape and dtype. Array contents are not compared.

    Args:
        mat_dict: Dict that was pickled.
        pkl_dict: Dict obtained by loading the pickle back.
        mat_path: Source path, used only in error messages.

    Raises:
        ValueError: On a key-set mismatch or an array shape/dtype mismatch.
        TypeError: When the two values under a key have different types.
    """
    expected_keys = sorted(mat_dict.keys())
    actual_keys = sorted(pkl_dict.keys())
    if expected_keys != actual_keys:
        raise ValueError(f'Key mismatch for {mat_path}: MAT has {len(expected_keys)} keys, PKL has {len(actual_keys)} keys')

    for key in expected_keys:
        original, restored = mat_dict[key], pkl_dict[key]
        if type(original) is not type(restored):
            raise TypeError(f'Type mismatch for {mat_path} key={key}: MAT={type(original)} PKL={type(restored)}')
        # Non-array values need no further structural check.
        if not isinstance(original, np.ndarray):
            continue
        if original.shape == restored.shape and original.dtype == restored.dtype:
            continue
        raise ValueError(
            f'Array mismatch for {mat_path} key={key}: '
            f'MAT shape/dtype={original.shape}/{original.dtype} vs PKL={restored.shape}/{restored.dtype}'
        )

def convert_one(mat_path: Path) -> ConvertResult:
    """Convert one ``.mat`` file into a pickle and report the outcome.

    The entire dict returned by ``loadmat`` is pickled. The pickle is first
    written to a sibling temporary file and is renamed onto the final ``.pkl``
    path only after writing (and, when ``VALIDATE`` is true, the reload check)
    succeeded. A failed conversion therefore never leaves a truncated or
    unvalidated ``.pkl`` in the destination tree.

    Args:
        mat_path: Source ``.mat`` file.

    Returns:
        A ``ConvertResult``. Failures, including an unmappable source path,
        are reported through ``ok=False`` and ``error`` rather than raised,
        so one bad file cannot abort a whole batch.
    """
    t0 = time.perf_counter()
    pkl_path = None
    tmp_path = None
    try:
        # Inside the try so a path-mapping error becomes a failed result.
        pkl_path = dst_path_for(mat_path)
        pkl_path.parent.mkdir(parents=True, exist_ok=True)
        # Temp suffix deliberately avoids the text '.pkl' so a leftover file is
        # never mistaken for a finished pickle.
        tmp_path = pkl_path.with_suffix('.tmp')

        # Load ENTIRE .mat file dict
        mat_dict = loadmat(mat_path)

        # Write pickle to the temporary location
        with open(tmp_path, 'wb') as f:
            pickle.dump(mat_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Lightweight validation (reload pickle, compare keys/types/shape/dtype).
        # The file was written by this function just above, so loading it is safe.
        if VALIDATE:
            with open(tmp_path, 'rb') as f:
                pkl_dict = pickle.load(f)
            lightweight_validate(mat_dict, pkl_dict, mat_path)

        # Publish the finished pickle under its final name.
        os.replace(tmp_path, pkl_path)

        t1 = time.perf_counter()
        return ConvertResult(mat_path=mat_path, pkl_path=pkl_path, ok=True, seconds=t1 - t0)
    except Exception as e:
        # Best-effort cleanup of the partial/unvalidated temp file.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass  # nothing to remove, or removal failed; the original error matters more
        if pkl_path is None:
            # Destination could not be derived; report a nominal name next to the
            # source (nothing is written there) so the result stays well-formed.
            pkl_path = mat_path.parent / (mat_path.stem + '.pkl')
        t1 = time.perf_counter()
        # Include the exception type: str(e) alone can be empty or ambiguous.
        return ConvertResult(
            mat_path=mat_path,
            pkl_path=pkl_path,
            ok=False,
            seconds=t1 - t0,
            error=f'{type(e).__name__}: {e}',
        )

# Fan the conversions out over a thread pool and collect results in completion order.
results: list[ConvertResult] = []
start = time.perf_counter()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(convert_one, mp): mp for mp in mat_files}
    n_jobs = len(futures)
    for idx, fut in enumerate(as_completed(futures), start=1):
        results.append(fut.result())
        # Report every 25th completion and once more after the final one.
        at_checkpoint = (idx == n_jobs) or (idx % 25 == 0)
        if not at_checkpoint:
            continue
        ok = sum(1 for r in results if r.ok)
        print(f'Progress: {idx}/{n_jobs} | ok={ok} | failed={idx - ok}')

end = time.perf_counter()
print(f'Done in {end - start:.2f} seconds')

Done in 0.00 seconds


In [6]:
# Overall counts for the conversion run.
failures = [r for r in results if not r.ok]
total = len(results)
failed = len(failures)
ok = total - failed

print(f'Total: {total}')
print(f'Succeeded: {ok}')
print(f'Failed: {failed}')

# Show at most the first 25 failed conversions with their error text.
if failed:
    print('--- Failures (up to 25) ---')
    for r in failures[:25]:
        print(f'MAT: {r.mat_path}')
        print(f'PKL: {r.pkl_path}')
        print(f'ERR: {r.error}')
        print('---')

# Basic timing summary (conversion time per file, includes optional validation).
# An empty result list simply yields an empty float array and prints nothing.
times = np.array([r.seconds for r in results], dtype=float)
if len(times) > 0:
    print('Mean seconds/file:', float(np.mean(times)))
    print('Median seconds/file:', float(np.median(times)))
    print('p95 seconds/file:', float(np.quantile(times, 0.95)))

Total: 0
Succeeded: 0
Failed: 0


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import resample, decimate
import os
from tqdm.auto import tqdm  # <-- progress bar

## Downsample the EMG data and upsample the glove data so both end up at 512 Hz.
## The EMG data is 2048Hz nxt where n=number of channels (134) and t is number of time points
## The glove data is 256Hz mxt where m=number of joint angles (18) and t is number of time points

ROOT = Path.cwd()
PKL_ROOT = ROOT / 'pkl_dataset'
OUT_ROOT = ROOT / 'pkl_dataset_resampled'  # new output folder (non-destructive)

# Sampling rates are loop-invariant, so they are defined once up front.
glove_fs = 256
resampled_glove_fs = 512
emg_fs = 2048
downsampled_emg_fs = 512
glove_up_factor = resampled_glove_fs // glove_fs   # integer upsampling ratio (2)
emg_down_factor = emg_fs // downsampled_emg_fs     # integer decimation ratio (4)

# Subject folders directly under PKL_ROOT (sorted for a reproducible processing order).
dirs = sorted(entry.name for entry in os.scandir(PKL_ROOT) if entry.is_dir())

# Build a flat list of work items so tqdm can show an accurate total.
# Names are matched by suffix: a substring test would also accept e.g. 'a.pkl.bak'.
# Calibration recordings are skipped.
work = []
for subj_name in dirs:
    for file_name in sorted(os.listdir(PKL_ROOT / subj_name)):
        if not file_name.endswith('.pkl') or 'calibration' in file_name:
            continue
        work.append((subj_name, file_name))

for subj_name, file_name in tqdm(work, desc="Resampling PKLs", unit="file"):
    # NOTE(review): `df` is indexed by 'emg' and 'glove'; presumably it is the
    # dict written by the .mat -> .pkl conversion step, not a DataFrame.
    df = pd.read_pickle(PKL_ROOT / subj_name / file_name)

    # Upsample glove data along the time axis (FFT-based resampling).
    # No time vector is passed: the previous code built one only to discard the
    # returned time axis, and without `t` resample returns the array directly.
    upsampled_glove_data = resample(
        df['glove'],
        num=glove_up_factor * df['glove'].shape[-1],
        axis=1
    )

    # Downsample EMG data along the time axis; n=2 is the anti-aliasing filter order.
    downsampled_emg_data = decimate(df['emg'], q=emg_down_factor, n=2, axis=1)

    df['emg'] = downsampled_emg_data
    df['glove'] = upsampled_glove_data

    # Mirror the subject folder under OUT_ROOT and keep the file name.
    out_dir = OUT_ROOT / subj_name
    out_dir.mkdir(parents=True, exist_ok=True)

    out_file = out_dir / file_name

    with out_file.open("wb") as f:
        pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)

  from .autonotebook import tqdm as notebook_tqdm
Resampling PKLs: 100%|██████████| 2340/2340 [01:13<00:00, 31.80file/s]
