In [49]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.signal import welch
from scipy.ndimage import uniform_filter1d
from tqdm import tqdm
import joblib
import re

In [50]:
SOURCE_DIR = r"data\filtered"   # change per shot
OUT_DIR    = r"data\features"  # outputs go here
FS         = 2148.1481
WELCH_NPERSEG = 512
MIN_NUM_SAMPLES = 50

# Fixed channel canonical order - set names exactly matching CSV column names (case sensitive)
# Edit this list to match the exact column names in your preprocessed CSVs.
CANONICAL_CHANNEL_ORDER = [
    'Rectus Femoris right', 'Rectus Femoris left', 
    'Hamstrings right', 'Hamstrings left', 
    'TibilaisÂ Anterior right', 'TibilaisÂ Anterior left', 
    'Gastrocnemius right', 'Gastrocnemius left'
]

# RMS window (seconds) for moving-RMS
RMS_WINDOW_MS = 50
RMS_WINDOW_SAMPLES = max(1, int((RMS_WINDOW_MS/1000.0) * FS))

# Save settings
os.makedirs(OUT_DIR, exist_ok=True)
print("SOURCE_DIR:", SOURCE_DIR)
print("OUT_DIR:", OUT_DIR)
print("FS:", FS, "RMS window samples:", RMS_WINDOW_SAMPLES)



SOURCE_DIR: data\filtered
OUT_DIR: data\features
FS: 2148.1481 RMS window samples: 107


In [51]:
# Cell 2 - helpers: safe PSD, move-RMS, canonicalize columns
def safe_welch(x, fs=FS, nperseg=WELCH_NPERSEG):
    nperseg_eff = min(len(x), max(16, nperseg))
    try:
        f, Pxx = welch(x, fs=fs, nperseg=nperseg_eff)
    except Exception:
        f = np.array([0.0])
        Pxx = np.array([0.0])
    return f, Pxx

def moving_rms(x, window_samples=RMS_WINDOW_SAMPLES):
    if len(x) < window_samples or window_samples <= 1:
        # fallback to global RMS
        return np.sqrt(np.mean(x**2)) * np.ones_like(x)
    sq = x.astype(float)**2
    # uniform_filter1d behaves as a sliding window mean; take sqrt for RMS
    mean_sq = uniform_filter1d(sq, size=window_samples, mode='nearest')
    return np.sqrt(mean_sq)

def canonicalize_emg_df(df, canonical_order=CANONICAL_CHANNEL_ORDER):
    """
    Return DataFrame with columns in canonical order.
    If a canonical column is missing, create it filled with NaNs.
    If there are extra columns, append them at the end.
    """
    cols_present = df.columns.tolist()
    ordered = []
    for c in canonical_order:
        if c in cols_present:
            ordered.append(c)
        else:
            # create missing column as NaN
            df[c] = np.nan
            ordered.append(c)
    # append any remaining columns not in canonical list (to avoid data loss)
    remaining = [c for c in cols_present if c not in canonical_order]
    ordered += remaining
    return df[ordered]


In [52]:
# Cell 3 - per-channel feature extractors
def extract_time_features(x):
    x = np.asarray(x).astype(float)
    if x.size == 0:
        return {
            "mean": np.nan, "std": np.nan, "rms": np.nan,
            "mav": np.nan, "wl": np.nan, "peak": np.nan, "iEMG": np.nan
        }
    return {
        "mean": float(np.mean(x)),
        "std":  float(np.std(x)),
        "rms":  float(np.sqrt(np.mean(x**2))),
        "mav":  float(np.mean(np.abs(x))),
        "wl":   float(np.sum(np.abs(np.diff(x)))),
        "peak": float(np.max(x)),
        "iEMG": float(np.trapz(np.abs(x)))  # numeric integral
    }

def extract_freq_features(x, fs=FS):
    x = np.asarray(x).astype(float)
    if len(x) < 4:
        return {"mnf": np.nan, "mdf": np.nan, "bp_20_60": np.nan, "bp_60_100": np.nan, "bp_100_200": np.nan}
    f, Pxx = safe_welch(x, fs=fs)
    total = np.sum(Pxx) + 1e-12
    mnf = float(np.sum(f * Pxx) / total)
    csum = np.cumsum(Pxx)
    half = total / 2.0
    idx = np.searchsorted(csum, half)
    mdf = float(f[idx]) if idx < len(f) else float(f[-1])
    def bandpow(a,b):
        mask = (f >= a) & (f <= b)
        return float(np.trapz(Pxx[mask], f[mask])) if np.any(mask) else 0.0
    return {"mnf": mnf, "mdf": mdf, "bp_20_60": bandpow(20,60), "bp_60_100": bandpow(60,100), "bp_100_200": bandpow(100,200)}


In [53]:
# Updated: Cell 4 - single-file processing & feature assembly
def process_file_to_features(path):
    """
    Read one preprocessed CSV and return a dict of features + metadata.
    - extracts a robust player name from filename (falls back to parent folder)
    - uses more tolerant left/right channel matching
    - fixes small typos in expected muscle names
    """
    df = pd.read_csv(path)
    df = canonicalize_emg_df(df)  # ensure canonical column order
    # detect time column
    time_cols = [c for c in df.columns if 'time' in c.lower() or 'timestamp' in c.lower()]
    time_col = time_cols[0] if time_cols else None
    emg_cols = [c for c in df.columns if c != time_col]
    n_samples = df.shape[0]

    row = {}
    row['file'] = path

    # ====== FIX: robust player extraction ======
    # Prefer filename-based player id: e.g. "Ahesan_1.csv" -> "Ahesan"
    # Fallback to parent folder name if parsing fails
    try:
        p = Path(path)
        stem = p.stem  # e.g. "Ahesan_1"
        if "_" in stem:
            player_name = stem.split("_")[0]
        else:
            # try splitting on hyphen or dot-separated tokens
            tokens = re.split(r"[-\.]", stem)
            player_name = tokens[0] if tokens and tokens[0] else p.parent.name
        # final cleanup: strip spaces and force consistent case if desired
        player_name = str(player_name).strip()
        if player_name == "" or player_name.lower() == "filtered":
            player_name = p.parent.name
    except Exception:
        player_name = Path(path).parent.name

    row['player'] = player_name
    # ===========================================

    row['n_samples'] = n_samples
    row['fs_used'] = FS

    # gather per-channel features for both envelope (assumed) and filtered-like values
    # we will compute both absolute and relative (per-trial normalized) metrics
    channel_means = {}
    channel_peaks = {}

    # first pass compute envelope/time-domain/freq features and moving RMS
    per_channel_data = {}
    for ch in emg_cols:
        x = pd.to_numeric(df[ch], errors='coerce').fillna(0).values.astype(float)
        per_channel_data[ch] = {}
        td = extract_time_features(x)
        per_channel_data[ch].update(td)
        ff = extract_freq_features(x)
        per_channel_data[ch].update(ff)
        # timing features using envelope
        peak_idx = int(np.nanargmax(x)) if np.any(~np.isnan(x)) else 0
        per_channel_data[ch]['time_to_peak_s'] = peak_idx / FS if n_samples>0 else np.nan
        # duration above halfmax
        halfmax = 0.5 * per_channel_data[ch].get('peak', np.nan)
        if np.isnan(halfmax):
            per_channel_data[ch]['dur_halfmax_s'] = np.nan
        else:
            per_channel_data[ch]['dur_halfmax_s'] = float(np.sum(x > halfmax) / FS) if n_samples>0 else np.nan
        # moving RMS
        mr = moving_rms(x, RMS_WINDOW_SAMPLES)
        per_channel_data[ch]['mrms_mean'] = float(np.mean(mr))
        per_channel_data[ch]['mrms_peak'] = float(np.max(mr))
        channel_means[ch] = per_channel_data[ch].get('mean', np.nan)
        channel_peaks[ch] = per_channel_data[ch].get('peak', np.nan)

    # compute per-trial normalization constants
    all_means = np.array([v for v in channel_means.values() if not np.isnan(v)])
    all_peaks = np.array([v for v in channel_peaks.values() if not np.isnan(v)])
    trial_mean = np.nanmean(all_means) if all_means.size>0 else 1.0
    trial_peak = np.nanmax(all_peaks) if all_peaks.size>0 else 1.0
    if trial_peak == 0:
        trial_peak = 1.0

    # fill row with structured names and also relative versions (divided by trial peak)
    for ch in emg_cols:
        chdata = per_channel_data[ch]
        for k,v in chdata.items():
            # convert keys like 'rms' to 'RF_L__rms'
            colname = f"{ch}__{k}"
            row[colname] = v
            # relative versions for amplitude-like features (rms, peak, iEMG, mrms_mean, mrms_peak, mav)
            if k in ('rms','peak','iEMG','mrms_mean','mrms_peak','mav'):
                rel_name = f"{ch}__{k}_rel"
                row[rel_name] = v / trial_peak if (not np.isnan(v) and trial_peak!=0) else np.nan

    # symmetry & co-activation features using canonical pairs
    # tolerant matching by looking for muscle name and left/right tokens
    def find_col(ch_list, muscle_name, side):
        """
        side: 'L' or 'R'
        match if muscle_name in col (case-insensitive) AND column contains left/right indicator
        left indicators: [' l', '_l', ' left', 'left', ' L', '(l)']
        right indicators: analogous
        """
        side_tokens = {
            'L': [' l', '_l', ' left', 'left', ' L', '(l)', '_L', ' L.'],
            'R': [' r', '_r', ' right', 'right', ' R', '(r)', '_R', ' R.']
        }
        muscle_lower = muscle_name.lower()
        for c in ch_list:
            cl = c.lower()
            if muscle_lower in cl:
                for t in side_tokens[side]:
                    if t in cl:
                        return c
        # fallback: try endings like 'l' or 'r' after underscore
        for c in ch_list:
            cl = c.lower()
            if muscle_lower in cl and re.search(r'[_\-\s]{}(?![a-z])'.format(side.lower()), cl):
                return c
        return None

    # define expected muscle groups and left/right patterns (fixed typo)
    muscles = ["Rectus Femoris", "TibilaisÂ Anterior", "Gastrocnemius", "Hamstrings"]
    for m in muscles:
        left = find_col(emg_cols, m, 'L')
        right = find_col(emg_cols, m, 'R')
        if left and right:
            meanL = per_channel_data[left].get('mean', np.nan)
            meanR = per_channel_data[right].get('mean', np.nan)
            row[f"{m}__LR_mean_ratio"] = meanL / (meanR + 1e-12) if not np.isnan(meanL) else np.nan
            row[f"{m}__LR_mean_absdev1"] = abs((meanL/ (meanR + 1e-12)) - 1.0)
            # co-activation index simple: overlap of normalized envelopes
            try:
                xL = pd.to_numeric(df[left], errors='coerce').fillna(0).values.astype(float)
                xR = pd.to_numeric(df[right], errors='coerce').fillna(0).values.astype(float)
                # normalize by peak to get 0..1
                pL = np.max(xL) if np.max(xL)!=0 else 1.0
                pR = np.max(xR) if np.max(xR)!=0 else 1.0
                nL = xL / pL
                nR = xR / pR
                coact = np.mean(np.minimum(nL, nR))
                row[f"{m}__LR_coact_simple"] = float(coact)
            except Exception:
                row[f"{m}__LR_coact_simple"] = np.nan
        else:
            # ensure columns exist but set NaN if pair not found
            row[f"{m}__LR_mean_ratio"] = np.nan
            row[f"{m}__LR_mean_absdev1"] = np.nan
            row[f"{m}__LR_coact_simple"] = np.nan

    # Add trial-level summary stats
    row['trial_mean_of_channel_means'] = float(trial_mean)
    row['trial_peak_of_channel_peaks'] = float(trial_peak)

    return row


In [54]:
# Cell 5 - iterate all files, collect features, save master CSV & QC
def find_csv_files(root_dir):
    fl = []
    for root, _, files in os.walk(root_dir):
        for f in files:
            if f.lower().endswith(".csv"):
                fl.append(os.path.join(root, f))
    return sorted(fl)

files = find_csv_files(SOURCE_DIR)
print("Found", len(files), "files.")
rows = []
qc_rows = []
for p in tqdm(files):
    try:
        feats = process_file_to_features(p)
        if feats is None:
            qc_rows.append({"file": p, "status": "no_emg"})
            continue
        rows.append(feats)
        qc_rows.append({"file": p, "status": "ok", "n_samples": feats.get('n_samples', np.nan)})
    except Exception as e:
        qc_rows.append({"file": p, "status": "error", "error": str(e)})
        print("Error processing", p, e)

feat_df = pd.DataFrame(rows).fillna(np.nan)
qc_df = pd.DataFrame(qc_rows)
feat_csv = os.path.join(OUT_DIR, "features_master.csv")
qc_csv = os.path.join(OUT_DIR, "qc_master.csv")
feat_df.to_csv(feat_csv, index=False)
qc_df.to_csv(qc_csv, index=False)
print("Saved feature table:", feat_csv)
print("Saved QC table:", qc_csv)


Found 35 files.


100%|██████████| 35/35 [00:00<00:00, 35.69it/s]

Saved feature table: data\features\features_master.csv
Saved QC table: data\features\qc_master.csv





In [55]:
# Cell 6 - save numpy arrays and feature column names for ML convenience
meta_cols = ['file','player','n_samples','fs_used','trial_mean_of_channel_means','trial_peak_of_channel_peaks']
feature_cols = [c for c in feat_df.columns if c not in meta_cols]
X = feat_df[feature_cols].values.astype(float)
players = feat_df['player'].values.astype(str)
np.save(os.path.join(OUT_DIR, "X.npy"), X)
np.save(os.path.join(OUT_DIR, "players.npy"), players)
pd.Series(feature_cols).to_csv(os.path.join(OUT_DIR, "feature_columns.csv"), index=False)
print("Saved X.npy, players.npy, feature_columns.csv")
print("Feature count:", len(feature_cols))


Saved X.npy, players.npy, feature_columns.csv
Feature count: 188


In [56]:
# Cell 7 - quick sanity outputs: first rows and summary stats and per-feature NaN counts
print("Preview features (first 3 rows):")
display(feat_df)
print("\nPer-feature missing counts (top 20):")
print(feat_df.isna().sum().sort_values(ascending=False))
print("\nDescriptive stats for some composite feature candidates:")
cands = [c for c in feature_cols if "__rms" in c or "__peak" in c or "__mnf" in c or "coact" in c]
display(feat_df[cands].describe().T)


Preview features (first 3 rows):


Unnamed: 0,file,player,n_samples,fs_used,Rectus Femoris right__mean,Rectus Femoris right__std,Rectus Femoris right__rms,Rectus Femoris right__rms_rel,Rectus Femoris right__mav,Rectus Femoris right__mav_rel,...,TibilaisÂ Anterior__LR_mean_absdev1,TibilaisÂ Anterior__LR_coact_simple,Gastrocnemius__LR_mean_ratio,Gastrocnemius__LR_mean_absdev1,Gastrocnemius__LR_coact_simple,Hamstrings__LR_mean_ratio,Hamstrings__LR_mean_absdev1,Hamstrings__LR_coact_simple,trial_mean_of_channel_means,trial_peak_of_channel_peaks
0,data\filtered\Ahesan_1.csv,Ahesan,6438,2148.1481,0.025718,0.050058,0.056278,0.236439,0.025722,0.108064,...,0.21603,0.112065,1.677526,0.6775256,0.056561,1.299145,0.299145,0.082939,0.022392,0.238023
1,data\filtered\Ahesan_2.csv,Ahesan,6438,2148.1481,0.026553,0.045076,0.052315,0.187049,0.026579,0.095033,...,0.158305,0.101972,19253360000.0,19253360000.0,-2.8e-05,0.880541,0.119459,0.05841,0.020265,0.279686
2,data\filtered\Ahesan_3.csv,Ahesan,6438,2148.1481,0.023641,0.042643,0.048757,0.198308,0.023671,0.096277,...,0.357628,0.130393,24408910000.0,24408910000.0,-3.3e-05,0.798764,0.201236,0.100744,0.028258,0.245867
3,data\filtered\Ahesan_4.csv,Ahesan,6438,2148.1481,0.031851,0.046411,0.056289,0.171899,0.032009,0.097751,...,0.339249,0.119284,21059220000.0,21059220000.0,-0.000602,0.775838,0.224162,0.090343,0.023999,0.327453
4,data\filtered\Ahesan_5.csv,Ahesan,6438,2148.1481,0.023008,0.032752,0.040026,0.16569,0.023008,0.095243,...,0.393387,0.150827,20128350000.0,20128350000.0,-0.00011,0.930527,0.069473,0.084311,0.024713,0.241569
5,data\filtered\Devansh_1.csv,Devansh,6438,2148.1481,0.021569,0.02824,0.035535,0.136128,0.02166,0.082978,...,0.368502,0.122958,22235990000.0,22235990000.0,-0.000488,0.450743,0.549257,0.072757,0.024781,0.261039
6,data\filtered\Devansh_2.csv,Devansh,6438,2148.1481,0.013506,0.021758,0.025609,0.087972,0.013506,0.046397,...,0.530198,0.123566,12942550000.0,12942550000.0,0.0,0.415626,0.584374,0.045921,0.01832,0.291102
7,data\filtered\Devansh_3.csv,Devansh,6438,2148.1481,0.016067,0.031667,0.03551,0.143458,0.016076,0.064945,...,0.43103,0.087112,12365710000.0,12365710000.0,-0.000842,0.399947,0.600053,0.035608,0.0164,0.247526
8,data\filtered\Devansh_4.csv,Devansh,6438,2148.1481,0.015194,0.025918,0.030044,0.126423,0.015194,0.063937,...,0.126513,0.127828,17421660000.0,17421660000.0,-0.000582,0.676013,0.323987,0.042045,0.017211,0.237643
9,data\filtered\Devansh_5.csv,Devansh,6438,2148.1481,0.018095,0.026237,0.031871,0.123083,0.018124,0.069993,...,0.023664,0.119191,20941920000.0,20941920000.0,-0.00089,0.6583,0.3417,0.041208,0.020636,0.258942



Per-feature missing counts (top 20):
file                                 0
Gastrocnemius right__iEMG            0
TibilaisÂ Anterior left__iEMG        0
TibilaisÂ Anterior left__iEMG_rel    0
TibilaisÂ Anterior left__mnf         0
                                    ..
Hamstrings right__mrms_mean_rel      0
Hamstrings right__mrms_peak          0
Hamstrings right__mrms_peak_rel      0
Hamstrings left__mean                0
trial_peak_of_channel_peaks          0
Length: 194, dtype: int64

Descriptive stats for some composite feature candidates:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rectus Femoris right__rms,35.0,0.034313,0.013938,0.009872,0.023251,0.035269,0.044748,0.059399
Rectus Femoris right__rms_rel,35.0,0.1288,0.063204,0.017765,0.079747,0.136128,0.178773,0.236439
Rectus Femoris right__peak,35.0,0.138624,0.068079,0.027191,0.091353,0.118323,0.191795,0.323805
Rectus Femoris right__peak_rel,35.0,0.511659,0.27312,0.068636,0.313299,0.459625,0.692616,1.0
Rectus Femoris right__mnf,35.0,4.229184,0.22124,3.837674,4.022994,4.199259,4.358276,4.653164
Rectus Femoris left__rms,35.0,0.028863,0.014222,0.006144,0.017241,0.027642,0.040766,0.055926
Rectus Femoris left__rms_rel,35.0,0.10785,0.058543,0.013782,0.067817,0.104363,0.150625,0.227463
Rectus Femoris left__peak,35.0,0.124481,0.073039,0.019598,0.069797,0.131257,0.180935,0.2663
Rectus Femoris left__peak_rel,35.0,0.4533,0.288834,0.055836,0.198791,0.404635,0.621761,1.0
Rectus Femoris left__mnf,35.0,4.206724,0.296666,3.718682,3.92442,4.237317,4.368342,4.873847
