In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.signal import welch
from scipy.ndimage import uniform_filter1d
from tqdm import tqdm
import re

In [2]:
SOURCE_DIR = r"data\filtered"   
OUT_DIR    = r"data\features"  
FS         = 2148.1481
WELCH_NPERSEG = 512
MIN_NUM_SAMPLES = 50


CANONICAL_CHANNEL_ORDER = [
    'Rectus Femoris right', 'Rectus Femoris left', 
    'Hamstrings right', 'Hamstrings left', 
    'TibilaisÂ Anterior right', 'TibilaisÂ Anterior left', 
    'Gastrocnemius right', 'Gastrocnemius left'
]

# RMS window (seconds) for moving-RMS
RMS_WINDOW_MS = 50
RMS_WINDOW_SAMPLES = max(1, int((RMS_WINDOW_MS/1000.0) * FS))

os.makedirs(OUT_DIR, exist_ok=True)
print("SOURCE_DIR:", SOURCE_DIR)
print("OUT_DIR:", OUT_DIR)
print("FS:", FS, "RMS window samples:", RMS_WINDOW_SAMPLES)



SOURCE_DIR: data\filtered
OUT_DIR: data\features
FS: 2148.1481 RMS window samples: 107


In [3]:
def safe_welch(x, fs=FS, nperseg=WELCH_NPERSEG):
    nperseg_eff = min(len(x), max(16, nperseg))
    try:
        f, Pxx = welch(x, fs=fs, nperseg=nperseg_eff)
    except Exception:
        f = np.array([0.0])
        Pxx = np.array([0.0])
    return f, Pxx

def moving_rms(x, window_samples=RMS_WINDOW_SAMPLES):
    if len(x) < window_samples or window_samples <= 1:
        return np.sqrt(np.mean(x**2)) * np.ones_like(x)
    sq = x.astype(float)**2
    mean_sq = uniform_filter1d(sq, size=window_samples, mode='nearest')
    return np.sqrt(mean_sq)

def canonicalize_emg_df(df, canonical_order=CANONICAL_CHANNEL_ORDER):
    """
    Return DataFrame with columns in canonical order.
    If a canonical column is missing, create it filled with NaNs.
    If there are extra columns, append them at the end.
    """
    cols_present = df.columns.tolist()
    ordered = []
    for c in canonical_order:
        if c in cols_present:
            ordered.append(c)
        else:
            df[c] = np.nan
            ordered.append(c)
    remaining = [c for c in cols_present if c not in canonical_order]
    ordered += remaining
    return df[ordered]


In [4]:
def extract_time_features(x):
    x = np.asarray(x).astype(float)
    if x.size == 0:
        return {
            "mean": np.nan, "std": np.nan, "rms": np.nan,
            "mav": np.nan, "wl": np.nan, "peak": np.nan, "iEMG": np.nan
        }
    return {
        "mean": float(np.mean(x)),
        "std":  float(np.std(x)),
        "rms":  float(np.sqrt(np.mean(x**2))),
        "mav":  float(np.mean(np.abs(x))),
        "wl":   float(np.sum(np.abs(np.diff(x)))),
        "peak": float(np.max(x)),
        "iEMG": float(np.trapz(np.abs(x)))  # numeric integral
    }

def extract_freq_features(x, fs=FS):
    x = np.asarray(x).astype(float)
    if len(x) < 4:
        return {"mnf": np.nan, "mdf": np.nan, "bp_20_60": np.nan, "bp_60_100": np.nan, "bp_100_200": np.nan}
    f, Pxx = safe_welch(x, fs=fs)
    total = np.sum(Pxx) + 1e-12
    mnf = float(np.sum(f * Pxx) / total)
    csum = np.cumsum(Pxx)
    half = total / 2.0
    idx = np.searchsorted(csum, half)
    mdf = float(f[idx]) if idx < len(f) else float(f[-1])
    def bandpow(a,b):
        mask = (f >= a) & (f <= b)
        return float(np.trapz(Pxx[mask], f[mask])) if np.any(mask) else 0.0
    return {"mnf": mnf, "mdf": mdf, "bp_20_60": bandpow(20,60), "bp_60_100": bandpow(60,100), "bp_100_200": bandpow(100,200)}


In [5]:
def process_file_to_features(path):
   
    df = pd.read_csv(path)
    df = canonicalize_emg_df(df) 
    time_cols = [c for c in df.columns if 'time' in c.lower() or 'timestamp' in c.lower()]
    time_col = time_cols[0] if time_cols else None
    emg_cols = [c for c in df.columns if c != time_col]
    n_samples = df.shape[0]

    row = {}
    row['file'] = path

    try:
        p = Path(path)
        stem = p.stem  
        if "_" in stem:
            player_name = stem.split("_")[0]
        else:
            tokens = re.split(r"[-\.]", stem)
            player_name = tokens[0] if tokens and tokens[0] else p.parent.name
        player_name = str(player_name).strip()
        if player_name == "" or player_name.lower() == "filtered":
            player_name = p.parent.name
    except Exception:
        player_name = Path(path).parent.name

    row['player'] = player_name
    row['n_samples'] = n_samples
    row['fs_used'] = FS

    channel_means = {}
    channel_peaks = {}

    per_channel_data = {}
    for ch in emg_cols:
        x = pd.to_numeric(df[ch], errors='coerce').fillna(0).values.astype(float)
        per_channel_data[ch] = {}
        td = extract_time_features(x)
        per_channel_data[ch].update(td)
        ff = extract_freq_features(x)
        per_channel_data[ch].update(ff)
        peak_idx = int(np.nanargmax(x)) if np.any(~np.isnan(x)) else 0
        per_channel_data[ch]['time_to_peak_s'] = peak_idx / FS if n_samples>0 else np.nan
        halfmax = 0.5 * per_channel_data[ch].get('peak', np.nan)
        if np.isnan(halfmax):
            per_channel_data[ch]['dur_halfmax_s'] = np.nan
        else:
            per_channel_data[ch]['dur_halfmax_s'] = float(np.sum(x > halfmax) / FS) if n_samples>0 else np.nan
        # moving RMS
        mr = moving_rms(x, RMS_WINDOW_SAMPLES)
        per_channel_data[ch]['mrms_mean'] = float(np.mean(mr))
        per_channel_data[ch]['mrms_peak'] = float(np.max(mr))
        channel_means[ch] = per_channel_data[ch].get('mean', np.nan)
        channel_peaks[ch] = per_channel_data[ch].get('peak', np.nan)

    all_means = np.array([v for v in channel_means.values() if not np.isnan(v)])
    all_peaks = np.array([v for v in channel_peaks.values() if not np.isnan(v)])
    trial_mean = np.nanmean(all_means) if all_means.size>0 else 1.0
    trial_peak = np.nanmax(all_peaks) if all_peaks.size>0 else 1.0
    if trial_peak == 0:
        trial_peak = 1.0

    for ch in emg_cols:
        chdata = per_channel_data[ch]
        for k,v in chdata.items():
            colname = f"{ch}__{k}"
            row[colname] = v
            if k in ('rms','peak','iEMG','mrms_mean','mrms_peak','mav'):
                rel_name = f"{ch}__{k}_rel"
                row[rel_name] = v / trial_peak if (not np.isnan(v) and trial_peak!=0) else np.nan

    def find_col(ch_list, muscle_name, side):
        """
        side: 'L' or 'R'
        match if muscle_name in col (case-insensitive) AND column contains left/right indicator
        left indicators: [' l', '_l', ' left', 'left', ' L', '(l)']
        right indicators: analogous
        """
        side_tokens = {
            'L': [' l', '_l', ' left', 'left', ' L', '(l)', '_L', ' L.'],
            'R': [' r', '_r', ' right', 'right', ' R', '(r)', '_R', ' R.']
        }
        muscle_lower = muscle_name.lower()
        for c in ch_list:
            cl = c.lower()
            if muscle_lower in cl:
                for t in side_tokens[side]:
                    if t in cl:
                        return c
        for c in ch_list:
            cl = c.lower()
            if muscle_lower in cl and re.search(r'[_\-\s]{}(?![a-z])'.format(side.lower()), cl):
                return c
        return None

    muscles = ["Rectus Femoris", "TibilaisÂ Anterior", "Gastrocnemius", "Hamstrings"]
    for m in muscles:
        left = find_col(emg_cols, m, 'L')
        right = find_col(emg_cols, m, 'R')
        if left and right:
            meanL = per_channel_data[left].get('mean', np.nan)
            meanR = per_channel_data[right].get('mean', np.nan)
            row[f"{m}__LR_mean_ratio"] = meanL / (meanR + 1e-12) if not np.isnan(meanL) else np.nan
            row[f"{m}__LR_mean_absdev1"] = abs((meanL/ (meanR + 1e-12)) - 1.0)
            try:
                xL = pd.to_numeric(df[left], errors='coerce').fillna(0).values.astype(float)
                xR = pd.to_numeric(df[right], errors='coerce').fillna(0).values.astype(float)
                pL = np.max(xL) if np.max(xL)!=0 else 1.0
                pR = np.max(xR) if np.max(xR)!=0 else 1.0
                nL = xL / pL
                nR = xR / pR
                coact = np.mean(np.minimum(nL, nR))
                row[f"{m}__LR_coact_simple"] = float(coact)
            except Exception:
                row[f"{m}__LR_coact_simple"] = np.nan
        else:
            row[f"{m}__LR_mean_ratio"] = np.nan
            row[f"{m}__LR_mean_absdev1"] = np.nan
            row[f"{m}__LR_coact_simple"] = np.nan

    row['trial_mean_of_channel_means'] = float(trial_mean)
    row['trial_peak_of_channel_peaks'] = float(trial_peak)

    return row


In [6]:
def find_csv_files(root_dir):
    fl = []
    for root, _, files in os.walk(root_dir):
        for f in files:
            if f.lower().endswith(".csv"):
                fl.append(os.path.join(root, f))
    return sorted(fl)

files = find_csv_files(SOURCE_DIR)
print("Found", len(files), "files.")
rows = []
qc_rows = []
for p in tqdm(files):
    try:
        feats = process_file_to_features(p)
        if feats is None:
            qc_rows.append({"file": p, "status": "no_emg"})
            continue
        rows.append(feats)
        qc_rows.append({"file": p, "status": "ok", "n_samples": feats.get('n_samples', np.nan)})
    except Exception as e:
        qc_rows.append({"file": p, "status": "error", "error": str(e)})
        print("Error processing", p, e)

feat_df = pd.DataFrame(rows).fillna(np.nan)
qc_df = pd.DataFrame(qc_rows)
feat_csv = os.path.join(OUT_DIR, "features_master.csv")
qc_csv = os.path.join(OUT_DIR, "qc_master.csv")
feat_df.to_csv(feat_csv, index=False)
qc_df.to_csv(qc_csv, index=False)
print("Saved feature table:", feat_csv)
print("Saved QC table:", qc_csv)


Found 210 files.


100%|██████████| 210/210 [00:14<00:00, 14.42it/s]

Saved feature table: data\features\features_master.csv
Saved QC table: data\features\qc_master.csv





In [7]:
meta_cols = ['file','player','n_samples','fs_used','trial_mean_of_channel_means','trial_peak_of_channel_peaks']
feature_cols = [c for c in feat_df.columns if c not in meta_cols]
X = feat_df[feature_cols].values.astype(float)
players = feat_df['player'].values.astype(str)
np.save(os.path.join(OUT_DIR, "X.npy"), X)
np.save(os.path.join(OUT_DIR, "players.npy"), players)
pd.Series(feature_cols).to_csv(os.path.join(OUT_DIR, "feature_columns.csv"), index=False)
print("Saved X.npy, players.npy, feature_columns.csv")
print("Feature count:", len(feature_cols))


Saved X.npy, players.npy, feature_columns.csv
Feature count: 188


In [8]:
print("Preview features (first 3 rows):")
display(feat_df)
print("\nPer-feature missing counts (top 20):")
print(feat_df.isna().sum().sort_values(ascending=False))
print("\nDescriptive stats for some composite feature candidates:")
cands = [c for c in feature_cols if "__rms" in c or "__peak" in c or "__mnf" in c or "coact" in c]
display(feat_df[cands].describe().T)


Preview features (first 3 rows):


Unnamed: 0,file,player,n_samples,fs_used,Rectus Femoris right__mean,Rectus Femoris right__std,Rectus Femoris right__rms,Rectus Femoris right__rms_rel,Rectus Femoris right__mav,Rectus Femoris right__mav_rel,...,TibilaisÂ Anterior__LR_mean_absdev1,TibilaisÂ Anterior__LR_coact_simple,Gastrocnemius__LR_mean_ratio,Gastrocnemius__LR_mean_absdev1,Gastrocnemius__LR_coact_simple,Hamstrings__LR_mean_ratio,Hamstrings__LR_mean_absdev1,Hamstrings__LR_coact_simple,trial_mean_of_channel_means,trial_peak_of_channel_peaks
0,data\filtered\Ahesan_1.csv,Ahesan,6438,2148.1481,0.025718,0.050058,0.056278,0.236439,0.025722,0.108064,...,0.216030,0.112065,1.677526,0.677526,0.056561,1.299145,0.299145,0.082939,0.022392,0.238023
1,data\filtered\Ahesan_2.csv,Ahesan,6438,2148.1481,0.026553,0.045076,0.052315,0.187049,0.026579,0.095033,...,0.158305,0.101972,1.535675,0.535675,0.058191,0.880541,0.119459,0.058410,0.021833,0.279686
2,data\filtered\Ahesan_3.csv,Ahesan,6438,2148.1481,0.023641,0.042643,0.048757,0.198308,0.023671,0.096277,...,0.357628,0.130393,1.205874,0.205874,0.050691,0.798764,0.201236,0.100744,0.030788,0.245867
3,data\filtered\Ahesan_4.csv,Ahesan,6438,2148.1481,0.031851,0.046411,0.056289,0.171899,0.032009,0.097751,...,0.339249,0.119284,1.148692,0.148692,0.049839,0.775838,0.224162,0.090343,0.026291,0.327453
4,data\filtered\Ahesan_5.csv,Ahesan,6438,2148.1481,0.023008,0.032752,0.040026,0.165690,0.023008,0.095243,...,0.393387,0.150827,1.260666,0.260666,0.037672,0.930527,0.069473,0.084311,0.026709,0.241569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,data\filtered\dummy35_1.csv,dummy35,6438,2148.1481,0.022713,0.032542,0.039685,0.164559,0.022713,0.094183,...,0.396604,0.149580,1.260315,0.260315,0.037017,0.927988,0.072012,0.083055,0.026410,0.241159
206,data\filtered\dummy35_2.csv,dummy35,6438,2148.1481,0.022746,0.032585,0.039739,0.164519,0.022746,0.094170,...,0.397288,0.149524,1.262998,0.262998,0.037102,0.927765,0.072235,0.083314,0.026426,0.241545
207,data\filtered\dummy35_3.csv,dummy35,6438,2148.1481,0.022726,0.032550,0.039698,0.164401,0.022726,0.094114,...,0.397359,0.149514,1.258353,0.258353,0.036983,0.926279,0.073721,0.083285,0.026421,0.241471
208,data\filtered\dummy35_4.csv,dummy35,6438,2148.1481,0.022732,0.032610,0.039751,0.164806,0.022732,0.094245,...,0.396533,0.149638,1.259084,0.259084,0.036923,0.927381,0.072619,0.083345,0.026423,0.241196



Per-feature missing counts (top 20):
file                                 0
Gastrocnemius right__iEMG            0
TibilaisÂ Anterior left__iEMG        0
TibilaisÂ Anterior left__iEMG_rel    0
TibilaisÂ Anterior left__mnf         0
                                    ..
Hamstrings right__mrms_mean_rel      0
Hamstrings right__mrms_peak          0
Hamstrings right__mrms_peak_rel      0
Hamstrings left__mean                0
trial_peak_of_channel_peaks          0
Length: 194, dtype: int64

Descriptive stats for some composite feature candidates:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rectus Femoris right__rms,210.0,0.034112,0.013714,0.009815,0.020863,0.035067,0.045594,0.059399
Rectus Femoris right__rms_rel,210.0,0.114259,0.070345,0.015456,0.058138,0.099237,0.181068,0.236439
Rectus Femoris right__peak,210.0,0.13793,0.067044,0.027046,0.087923,0.11745,0.19375,0.323805
Rectus Femoris right__peak_rel,210.0,0.450961,0.294168,0.067623,0.219435,0.362519,0.660982,1.0
Rectus Femoris right__mnf,210.0,4.229139,0.218771,3.835164,4.012235,4.195663,4.358855,4.659317
Rectus Femoris left__rms,210.0,0.028651,0.013977,0.006097,0.016073,0.027447,0.040788,0.055926
Rectus Femoris left__rms_rel,210.0,0.094768,0.06335,0.010443,0.033451,0.078212,0.156543,0.227483
Rectus Femoris left__peak,210.0,0.123655,0.071811,0.019497,0.066274,0.129738,0.181974,0.2663
Rectus Femoris left__peak_rel,210.0,0.394675,0.303757,0.053409,0.134813,0.305103,0.667432,1.0
Rectus Femoris left__mnf,210.0,4.207162,0.293842,3.717173,3.907486,4.237173,4.371397,4.875216
