# .mat to .pkl file conversion
This project converts the SEEDS dataset from MATLAB `.mat` files to pickle (`.pkl`) files for faster processing in Python.

In [None]:
# Imports
from pathlib import Path
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import resample, decimate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    ConfusionMatrixDisplay,
)

In [None]:
# Make the directories for saving the pkl dataset if they do not exist
from pathlib import Path

WORKSPACE = Path.cwd()
PKL_DATASET_DIR = WORKSPACE / "pkl_dataset"
SUBJECTS = ["subj" + str(n).zfill(2) for n in range(1, 11)]

# Root folder first, then one folder per subject; existing folders are left alone.
PKL_DATASET_DIR.mkdir(parents=True, exist_ok=True)
for subj in SUBJECTS:
    PKL_DATASET_DIR.joinpath(subj).mkdir(parents=True, exist_ok=True)

print(f"PKL_DATASET_DIR: {PKL_DATASET_DIR}")
print(f"Created subject subdirs: {', '.join(SUBJECTS)}")

In [None]:
from __future__ import annotations

import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path

import numpy as np
from scipy.io import loadmat

# Both roots are resolved relative to the directory the notebook is run from.
WORKSPACE = Path.cwd()
SRC_ROOT = WORKSPACE.joinpath('osfstorage-archive')
DST_ROOT = WORKSPACE.joinpath('pkl_dataset')

SUBJECTS = ['subj' + str(n).zfill(2) for n in range(1, 11)]
# Cap the pool at 12 threads; assume 4 when the CPU count cannot be determined.
MAX_WORKERS = min(os.cpu_count() or 4, 12)

VALIDATE = True  # disable to speed up processing if taking way too long

# Echo the effective configuration, one setting per line.
print(
    f'SRC_ROOT: {SRC_ROOT}',
    f'DST_ROOT: {DST_ROOT}',
    f'MAX_WORKERS: {MAX_WORKERS}',
    f'VALIDATE: {VALIDATE}',
    sep='\n',
)

In [None]:
def discover_mat_files() -> list[Path]:
    """Collect every ``.mat`` file found under the subject folders of ``SRC_ROOT``.

    Subjects are visited in ``SUBJECTS`` order; within each subject the files
    are found recursively and sorted by path.

    Returns:
        The ``.mat`` paths of all subjects, concatenated.

    Raises:
        FileNotFoundError: If a subject folder does not exist under ``SRC_ROOT``.
    """
    found: list[Path] = []
    for subject_dir in (SRC_ROOT / name for name in SUBJECTS):
        if not subject_dir.exists():
            raise FileNotFoundError(f'Missing subject directory: {subject_dir}')
        found += sorted(subject_dir.rglob('*.mat'))
    return found

# Build the work list once and print a quick sanity check of what was found.
mat_files = discover_mat_files()
print(f'Discovered .mat files: {len(mat_files)}')
print('Example:', next(iter(mat_files), 'None'))

In [None]:
@dataclass
class ConvertResult:
    """Outcome of one attempt to convert a ``.mat`` file into a ``.pkl`` file."""
    # Source .mat file the attempt was made on.
    mat_path: Path
    # Destination .pkl path computed for that source file.
    pkl_path: Path
    # True when the conversion finished without raising, False otherwise.
    ok: bool
    # Elapsed time of the attempt, in seconds (a perf_counter difference).
    seconds: float
    # Text of the exception on failure; left as None on success.
    error: str | None = None

def dst_path_for(mat_path: Path) -> Path:
    """Map a source ``.mat`` path to its mirrored ``.pkl`` path under ``DST_ROOT``.

    The sub-path below the subject folder is preserved, so
    ``<archive>/subj01/a/b/x.mat`` becomes ``DST_ROOT/subj01/a/b/x.pkl``.

    Args:
        mat_path: Path of a ``.mat`` file inside the ``osfstorage-archive`` tree.

    Returns:
        The destination path, with the file's stem and a ``.pkl`` suffix.

    Raises:
        ValueError: If ``mat_path`` is not inside an ``osfstorage-archive``
            folder, has no subject folder below it, or that folder is not one
            of ``SUBJECTS``.
    """
    try:
        # Anchor on the real source root first, so that an ancestor directory
        # that merely shares the name 'osfstorage-archive' cannot be mistaken
        # for the archive root.
        rel_parts = mat_path.relative_to(SRC_ROOT).parts
    except ValueError:
        # Not literally below SRC_ROOT (e.g. a relative path): fall back to
        # locating the archive folder by name, as before.
        parts = mat_path.parts
        if 'osfstorage-archive' not in parts:
            raise ValueError(f'Path is not under osfstorage-archive: {mat_path}') from None
        rel_parts = parts[parts.index('osfstorage-archive') + 1:]

    # A path that stops at the archive folder has no subject component; report
    # it as a ValueError like every other rejected path instead of IndexError.
    if not rel_parts:
        raise ValueError(f'No subject folder after osfstorage-archive: {mat_path}')

    subj = rel_parts[0]
    if subj not in SUBJECTS:
        raise ValueError(f'Not a supported subject folder: {subj} (path: {mat_path})')

    # Components strictly between the subject folder and the file name.
    rel_under_subj = Path(*rel_parts[1:-1])  # may be '.'
    out_dir = DST_ROOT / subj / rel_under_subj
    return out_dir / (mat_path.stem + '.pkl')

def lightweight_validate(mat_dict: dict, pkl_dict: dict, mat_path: Path) -> None:
    """Cheaply check that a reloaded pickle mirrors the dict it was written from.

    Compares the key sets, the exact type of every value and, for NumPy
    arrays, their shape and dtype. Array contents are not compared.

    Args:
        mat_dict: Dict that was pickled.
        pkl_dict: Dict read back from the pickle.
        mat_path: Source file, used only to label error messages.

    Raises:
        ValueError: If the keys differ, or an array's shape or dtype differs.
        TypeError: If a value's type differs between the two dicts.
    """
    mat_keys = sorted(mat_dict)
    pkl_keys = sorted(pkl_dict)
    if mat_keys != pkl_keys:
        # Counts alone are useless when both sides have the same number of
        # keys but different names, so list the keys unique to each side too.
        only_mat = sorted(set(mat_keys) - set(pkl_keys))
        only_pkl = sorted(set(pkl_keys) - set(mat_keys))
        raise ValueError(
            f'Key mismatch for {mat_path}: MAT has {len(mat_keys)} keys, PKL has {len(pkl_keys)} keys'
            f' (only in MAT: {only_mat}; only in PKL: {only_pkl})'
        )

    for k in mat_keys:
        a = mat_dict[k]
        b = pkl_dict[k]
        # Exact type identity is intended here: a round-trip must not turn a
        # value into a subclass or a look-alike type.
        if type(a) is not type(b):
            raise TypeError(f'Type mismatch for {mat_path} key={k}: MAT={type(a)} PKL={type(b)}')
        if isinstance(a, np.ndarray):
            if a.shape != b.shape or a.dtype != b.dtype:
                raise ValueError(
                    f'Array mismatch for {mat_path} key={k}: '
                    f'MAT shape/dtype={a.shape}/{a.dtype} vs PKL={b.shape}/{b.dtype}'
                )

def convert_one(mat_path: Path) -> ConvertResult:
    """Convert a single ``.mat`` file into a pickle at its mirrored destination.

    The whole dict returned by ``loadmat`` is pickled. The pickle is first
    written to a sibling ``<name>.pkl.tmp`` file and only renamed onto the
    final path after it has been written (and, when ``VALIDATE`` is set,
    reloaded and checked). A failed conversion therefore never leaves a
    truncated or unvalidated ``.pkl`` at the destination; a ``.pkl`` from an
    earlier run is left untouched.

    Args:
        mat_path: Source ``.mat`` file; must be mappable by ``dst_path_for``.

    Returns:
        A ``ConvertResult`` with ``ok=True`` on success, or ``ok=False`` and
        ``error`` set to ``"<ExceptionType>: <message>"`` on failure.
        ``seconds`` is the time spent in this call.

    Raises:
        ValueError: Propagated from ``dst_path_for`` when ``mat_path`` cannot
            be mapped to a destination (this runs before the guarded section).
    """
    t0 = time.perf_counter()
    pkl_path = dst_path_for(mat_path)
    tmp_path = pkl_path.with_name(pkl_path.name + '.tmp')
    try:
        pkl_path.parent.mkdir(parents=True, exist_ok=True)

        # Load ENTIRE .mat file dict
        mat_dict = loadmat(mat_path)

        # Write the pickle to the temporary file first
        with open(tmp_path, 'wb') as f:
            pickle.dump(mat_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Lightweight validation (reload pickle, compare keys/types/shape/dtype).
        # The file being unpickled was written a few lines above by this call.
        if VALIDATE:
            with open(tmp_path, 'rb') as f:
                pkl_dict = pickle.load(f)
            lightweight_validate(mat_dict, pkl_dict, mat_path)

        # Publish only once everything above succeeded.
        tmp_path.replace(pkl_path)
    except Exception as e:
        # Best-effort cleanup: a leftover temp file must not mask the real error.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        t1 = time.perf_counter()
        # Include the exception type: str(e) alone can be empty or ambiguous.
        return ConvertResult(
            mat_path=mat_path,
            pkl_path=pkl_path,
            ok=False,
            seconds=t1 - t0,
            error=f'{type(e).__name__}: {e}',
        )

    t1 = time.perf_counter()
    return ConvertResult(mat_path=mat_path, pkl_path=pkl_path, ok=True, seconds=t1 - t0)

# Fan the conversions out over a thread pool and collect each result as it finishes
results: list[ConvertResult] = []
start = time.perf_counter()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    futures = {pool.submit(convert_one, path): path for path in mat_files}
    n_total = len(futures)
    for n_done, future in enumerate(as_completed(futures), start=1):
        results.append(future.result())
        # Report every 25th completion and always the final one.
        if n_done % 25 != 0 and n_done != n_total:
            continue
        n_ok = sum(1 for r in results if r.ok)
        print(f'Progress: {n_done}/{n_total} | ok={n_ok} | failed={n_done - n_ok}')

end = time.perf_counter()
print(f'Done in {end - start:.2f} seconds')

In [None]:
# Split off the failed conversions once; the counts below derive from that list.
failures = [r for r in results if not r.ok]
total = len(results)
failed = len(failures)
ok = total - failed

print(f'Total: {total}')
print(f'Succeeded: {ok}')
print(f'Failed: {failed}')

if failures:
    print('--- Failures (up to 25) ---')
    for r in failures[:25]:
        print(f'MAT: {r.mat_path}', f'PKL: {r.pkl_path}', f'ERR: {r.error}', '---', sep='\n')

# timing summary
# NOTE(review): the original author marked this as "not working at the moment";
# that has not been verified here.
times = np.array([r.seconds for r in results], dtype=float)
if times.size > 0:
    print(f'Mean seconds/file: {float(np.mean(times))}')
    print(f'Median seconds/file: {float(np.median(times))}')
    print(f'p95 seconds/file: {float(np.quantile(times, 0.95))}')