In [1]:
from __future__ import annotations

import pickle
import time
from pathlib import Path

import numpy as np
from scipy.io import loadmat
from scipy import stats

# All paths are resolved against the directory the kernel was started in.
WORKSPACE = Path.cwd()
DATA_ROOT = WORKSPACE.joinpath('osfstorage-archive')

# The benchmark targets exactly ONE recording; point MAT_PATH at another file to test it instead.
MAT_PATH = DATA_ROOT.joinpath('subj01', 'detop_exp01_subj01_Sess1_01_01.mat')
# Stop immediately if the recording is absent, before any later cell tries to read it.
if not MAT_PATH.exists():
    raise FileNotFoundError(f'MAT file not found: {MAT_PATH}')

# The pickle is written into the workspace and holds the entire loadmat() dictionary.
PKL_PATH = WORKSPACE.joinpath(MAT_PATH.stem + '.FULLFILE.pkl')

print('MAT_PATH:', MAT_PATH)
print('PKL_PATH:', PKL_PATH)

MAT_PATH: /home/connor/Documents/Development/Gesture-Recognition/Dataset Work/osfstorage-archive/subj01/detop_exp01_subj01_Sess1_01_01.mat
PKL_PATH: /home/connor/Documents/Development/Gesture-Recognition/Dataset Work/detop_exp01_subj01_Sess1_01_01.FULLFILE.pkl


In [2]:
# Parse the .mat file a single time, then persist everything loadmat() returned
# (data variables and the __header__/__version__/__globals__ metadata alike).
mat_dict_full = loadmat(MAT_PATH)

print('Loaded .mat keys (including metadata keys like __header__):')
print(sorted(mat_dict_full))

# Serialize with the newest pickle protocol this interpreter supports.
with PKL_PATH.open('wb') as pkl_file:
    pickle.dump(mat_dict_full, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

print('Wrote pickle successfully.')

Loaded .mat keys (including metadata keys like __header__):
['__globals__', '__header__', '__version__', 'channels_emg', 'channels_glove', 'date', 'emg', 'fs_emg', 'fs_glove', 'glove', 'movement', 'session', 'speed', 'subject']
Wrote pickle successfully.


In [3]:
def load_entire_mat(mat_path: Path) -> dict:
    """Read a MATLAB ``.mat`` file and return the full ``loadmat`` result.

    Args:
        mat_path: Location of the ``.mat`` file to read.

    Returns:
        The dictionary produced by ``scipy.io.loadmat`` with its default
        options, unmodified.
    """
    contents = loadmat(mat_path)
    return contents

def load_entire_pkl(pkl_path: Path) -> dict:
    """Deserialize and return the object stored in a pickle file.

    Args:
        pkl_path: Location of the pickle file to read.

    Returns:
        Whatever object the pickle contains (a dictionary when the file was
        written from a ``loadmat`` result).

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing, so only
        point this at pickle files you produced yourself.
    """
    with open(pkl_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

def time_load(fn, repeats: int = 10) -> np.ndarray:
    """Measure how long repeated calls to ``fn`` take.

    ``fn`` is called once up front without being timed, then ``repeats`` more
    times with each call bracketed by ``time.perf_counter()`` readings.

    Args:
        fn: Zero-argument callable to benchmark; its return value is discarded.
        repeats: Number of timed calls.

    Returns:
        1-D float array with one elapsed time (in seconds) per timed call.
    """
    # Untimed warm-up call so one-off start-up costs are not counted.
    fn()

    durations = []
    for _ in range(repeats):
        started = time.perf_counter()
        fn()
        finished = time.perf_counter()
        durations.append(finished - started)
    return np.asarray(durations, dtype=float)

# Benchmark both loaders with identical settings: the .mat parse first, then the pickle read.
mat_times, pkl_times = (
    time_load(loader, repeats=10)
    for loader in (
        lambda: load_entire_mat(MAT_PATH),
        lambda: load_entire_pkl(PKL_PATH),
    )
)

print('MAT times (s):', mat_times)
print('PKL times (s):', pkl_times)

MAT times (s): [0.0447392  0.04049057 0.040901   0.03898914 0.03672016 0.0369969
 0.03697607 0.03743287 0.03721555 0.03712672]
PKL times (s): [0.00151045 0.00115954 0.00113949 0.00134831 0.0013354  0.00131431
 0.00137618 0.0013005  0.00121097 0.00124821]


In [4]:
def mean_ci_95(samples: np.ndarray, confidence: float = 0.95) -> tuple[float, float, float]:
    """Confidence interval for the mean using Student-t: mean ± t * s/sqrt(n).

    Args:
        samples: Observations; converted to a float array. All elements are
            used (``n`` is the total element count).
        confidence: Two-sided confidence level, strictly between 0 and 1.
            Defaults to 0.95, which reproduces the original 95% interval.

    Returns:
        ``(mean, lower, upper)`` where ``lower``/``upper`` bound the interval.

    Raises:
        ValueError: If ``confidence`` is not in the open interval (0, 1), or if
            fewer than 2 samples are supplied.
    """
    # Also rejects NaN, since every comparison with NaN is False.
    if not 0.0 < confidence < 1.0:
        raise ValueError('confidence must be strictly between 0 and 1.')
    samples = np.asarray(samples, dtype=float)
    n = samples.size
    if n < 2:
        raise ValueError('Need at least 2 samples for CI.')
    mean = float(samples.mean())
    # ddof=1 gives the sample (n - 1) standard deviation.
    s = float(samples.std(ddof=1))
    # Two-sided interval: the upper-tail quantile is 0.5 + confidence / 2
    # (0.975 for the default 95% level).
    tcrit = float(stats.t.ppf(0.5 + confidence / 2.0, df=n - 1))
    half_width = tcrit * (s / (n ** 0.5))
    return mean, mean - half_width, mean + half_width

# Mean load time and 95% confidence bounds for each format (values are in seconds).
mat_mean, mat_lo, mat_hi = mean_ci_95(mat_times)
pkl_mean, pkl_lo, pkl_hi = mean_ci_95(pkl_times)

# Report in milliseconds.
print(f'.mat mean: {mat_mean*1000:.3f} ms (95% CI: [{mat_lo*1000:.3f}, {mat_hi*1000:.3f}])')
print(f'.pkl mean: {pkl_mean*1000:.3f} ms (95% CI: [{pkl_lo*1000:.3f}, {pkl_hi*1000:.3f}])')

# Sign convention: savings > 0 means the pickle loaded faster, savings < 0 means the .mat did.
savings = mat_mean - pkl_mean
if savings < 0:
    print(f'MAT is faster by {(-savings)*1000:.3f} ms on average ({(pkl_mean/mat_mean):.2f}× speedup).')
elif savings > 0:
    print(f'PKL is faster by {savings*1000:.3f} ms on average ({(mat_mean/pkl_mean):.2f}× speedup).')
else:
    print('No average difference observed (means equal).')

.mat mean: 38.759 ms (95% CI: [36.902, 40.616])
.pkl mean: 1.294 ms (95% CI: [1.215, 1.373])
PKL is faster by 37.464 ms on average (29.94× speedup).


## Notes
- This pickles the **entire** `loadmat()` dictionary
- To benchmark a different single file, edit `MAT_PATH` in Cell 1, then re-run the cells below it so the pickle is regenerated from the new file


## ADDITIONAL TESTING

In [6]:
# Re-read only the MATLAB file so the checks below work on a fresh loadmat() result.
mat_dict_full = loadmat(MAT_PATH)

print('Reloaded .mat keys (including metadata keys like __header__):')
print(sorted(mat_dict_full))

Reloaded .mat keys (including metadata keys like __header__):
['__globals__', '__header__', '__version__', 'channels_emg', 'channels_glove', 'date', 'emg', 'fs_emg', 'fs_glove', 'glove', 'movement', 'session', 'speed', 'subject']


In [8]:
# Check that no top-level value is a MATLAB-specific object: an entry is flagged
# when the string form of its type contains "matlab" (case-insensitive).
flagged = [
    (key, value)
    for key, value in mat_dict_full.items()
    if 'matlab' in str(type(value)).lower()
]

for key, value in flagged:
    print(f'WARNING: Key "{key}" has MATLAB opaque type: {type(value)}')
if not flagged:
    print('No MATLAB opaque types found in the loaded .mat dictionary.')

No MATLAB opaque types found in the loaded .mat dictionary.


In [9]:
# Compare the structure of the PKL and MAT contents, starting with their key sets.
pkl_dict_full = load_entire_pkl(PKL_PATH)
print('Reloaded .pkl keys:')
print(sorted(pkl_dict_full))

# Sorted key lists make the equality check independent of dictionary ordering.
mat_keys = sorted(mat_dict_full)
pkl_keys = sorted(pkl_dict_full)

print('Reloaded .mat keys:')
print(mat_keys)
print('Keys match:', mat_keys == pkl_keys)

# Keys present on one side only; each list is reported only when non-empty.
missing_in_pkl = sorted(set(mat_keys).difference(pkl_keys))
missing_in_mat = sorted(set(pkl_keys).difference(mat_keys))
for label, absent in (('Missing in PKL:', missing_in_pkl), ('Extra in PKL:', missing_in_mat)):
    if absent:
        print(label, absent)

def _describe(v):
    if isinstance(v, np.ndarray):
        return f'ndarray shape={v.shape} dtype={v.dtype}'
    return type(v).__name__

# Walk the keys both dictionaries share and record every structural difference.
# Only type, and for arrays shape/dtype, are compared; element values are not checked.
common_keys = sorted(set(mat_keys).intersection(pkl_keys))
mismatches = []

for k in common_keys:
    a, b = mat_dict_full[k], pkl_dict_full[k]
    same_type = type(a) is type(b)

    # Same-typed non-array values count as structurally equal.
    if same_type and not isinstance(a, np.ndarray):
        continue
    # Same-typed arrays match when both shape and dtype agree.
    if same_type and a.shape == b.shape and a.dtype == b.dtype:
        continue

    mismatches.append((k, _describe(a), _describe(b)))

print(f'Structure mismatches (type/shape/dtype): {len(mismatches)}')
# Show at most the first 25 mismatches.
for k, da, db in mismatches[:25]:
    print(f'- {k}: MAT={da} | PKL={db}')

Reloaded .pkl keys:
['__globals__', '__header__', '__version__', 'channels_emg', 'channels_glove', 'date', 'emg', 'fs_emg', 'fs_glove', 'glove', 'movement', 'session', 'speed', 'subject']
Reloaded .mat keys:
['__globals__', '__header__', '__version__', 'channels_emg', 'channels_glove', 'date', 'emg', 'fs_emg', 'fs_glove', 'glove', 'movement', 'session', 'speed', 'subject']
Keys match: True
Structure mismatches (type/shape/dtype): 0
