In [8]:
# Cell 1 - Imports & configuration
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import signal
from scipy.signal import resample
from tqdm import tqdm   # nice progress bars in Jupyter

In [9]:
# ---------- CONFIG ----------
# Tunable constants shared by every cell below. The two directories are
# relative paths, so they resolve against the current working directory.
SOURCE_DIR = "data/processed"          # root folder searched recursively for input CSVs
DEST_DIR   = "data/preprocessed_ml"  # where filtered (non-resampled) CSVs will be saved
FS = 2148.1481            # sampling rate (Hz)
NOTCH_FREQ = 50.0         # notch filter centre frequency (Hz)
NOTCH_Q = 30              # quality factor passed to the notch filter design
LOWCUT = 20.0             # band-pass lower edge (Hz)
HIGHCUT = 450.0           # band-pass upper edge (Hz)
ENV_CUTOFF = 6.0          # low-pass cutoff for envelope (Hz)
FILTER_ORDER = 4          # default Butterworth order for the band-pass and low-pass helpers

# Create destination root (including missing parents); harmless if it already exists
Path(DEST_DIR).mkdir(parents=True, exist_ok=True)

In [10]:
# Cell 2 - Filter helper functions (zero-phase filtering with filtfilt)
def notch_filter(x, freq=NOTCH_FREQ, fs=FS, Q=NOTCH_Q):
    """Suppress a narrow band around ``freq`` with a zero-phase IIR notch.

    Parameters
    ----------
    x : array-like
        1-D signal samples.
    freq : float
        Centre frequency of the notch, in the same units as ``fs``.
    fs : float
        Sampling rate of ``x``.
    Q : float
        Quality factor handed to ``scipy.signal.iirnotch``.

    Returns
    -------
    The filtered signal, or ``x`` itself (unchanged) when it is too short
    for ``filtfilt``'s default edge padding.

    Raises
    ------
    ValueError
        Propagated from ``scipy.signal.iirnotch`` when the normalised notch
        frequency is not a valid digital frequency.
    """
    # iirnotch (called without fs) expects the frequency normalised so that
    # 1.0 corresponds to the Nyquist frequency.
    w0 = freq / (0.5 * fs)
    b, a = signal.iirnotch(w0, Q)
    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError unless the input is strictly longer than that.
    # The previous fixed `len(x) < 4` guard let 4..9-sample inputs crash.
    padlen = 3 * max(len(a), len(b))
    if len(x) <= padlen:
        return x
    return signal.filtfilt(b, a, x)

def bandpass_filter(x, lowcut=LOWCUT, highcut=HIGHCUT, fs=FS, order=FILTER_ORDER):
    """Zero-phase Butterworth band-pass between ``lowcut`` and ``highcut``.

    Parameters
    ----------
    x : array-like
        1-D signal samples.
    lowcut, highcut : float
        Pass-band edges, in the same units as ``fs``.
    fs : float
        Sampling rate of ``x``.
    order : int
        Order passed to ``scipy.signal.butter``.

    Returns
    -------
    The filtered signal, or ``x`` itself (unchanged) when it is too short
    for ``filtfilt``'s default edge padding.

    Raises
    ------
    ValueError
        If the pass band is empty or not strictly positive once the upper
        edge has been limited to just below Nyquist.
    """
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    # If the upper edge reaches Nyquist (e.g. fs lower than expected), pull it
    # just below 1.0 so the filter design stays valid.
    if high >= 1.0:
        high = 0.99
    # Fail with a clear message rather than handing butter an invalid band.
    if not (0 < low < high):
        raise ValueError(
            f"Invalid band-pass edges: lowcut={lowcut}, highcut={highcut}, fs={fs}"
        )
    b, a = signal.butter(order, [low, high], btype='band')
    # filtfilt needs strictly more samples than its default padding of
    # 3 * max(len(a), len(b)); a band-pass design doubles the filter order, so
    # the previous `order * 3` guard was far too permissive.
    padlen = 3 * max(len(a), len(b))
    if len(x) <= padlen:
        return x
    return signal.filtfilt(b, a, x)

def lowpass_filter(x, cutoff=ENV_CUTOFF, fs=FS, order=FILTER_ORDER):
    """Zero-phase Butterworth low-pass with the given cutoff.

    Parameters
    ----------
    x : array-like
        1-D signal samples.
    cutoff : float
        Cutoff frequency, in the same units as ``fs``.
    fs : float
        Sampling rate of ``x``.
    order : int
        Order passed to ``scipy.signal.butter``.

    Returns
    -------
    The filtered signal, or ``x`` itself (unchanged) when it is too short
    for ``filtfilt``'s default edge padding.

    Raises
    ------
    ValueError
        Propagated from ``scipy.signal.butter`` when the normalised cutoff is
        not a valid digital frequency (e.g. ``cutoff <= 0``).
    """
    nyq = 0.5 * fs
    # Normalise to Nyquist and keep the cutoff just below 1.0.
    c = min(0.99, cutoff / nyq)
    b, a = signal.butter(order, c, btype='low')
    # filtfilt needs strictly more samples than its default padding of
    # 3 * max(len(a), len(b)); the previous `order * 3` guard let inputs
    # between those two lengths raise ValueError.
    padlen = 3 * max(len(a), len(b))
    if len(x) <= padlen:
        return x
    return signal.filtfilt(b, a, x)


In [11]:
# Cell 3 - File discovery helper & EMG column detection
def find_csv_files(root_dir):
    """Return a sorted list of every ``.csv`` path found beneath ``root_dir``.

    The directory tree is walked recursively and the extension check is
    case-insensitive. Each returned path is the walked folder joined with the
    file name.
    """
    matches = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith(".csv")
    ]
    matches.sort()
    return matches

def detect_time_and_emg_columns(df):
    """Pick the time column and the numeric signal columns of ``df``.

    Returns
    -------
    (time_col, numeric) : tuple
        ``time_col`` is the first column whose name contains "time"
        (case-insensitive; this also covers "timestamp"), or ``None`` when no
        column matches. ``numeric`` lists, in frame order, every other column
        with more than 5 values that survive numeric coercion.
    """
    # First column whose lower-cased name mentions "time", if any.
    time_col = next(
        (col for col in df.columns if 'time' in str(col).lower()),
        None,
    )

    def _mostly_numeric(col):
        # A column qualifies when coercion leaves more than 5 non-null values.
        coerced = pd.to_numeric(df[col], errors='coerce')
        return coerced.notnull().sum() > 5

    numeric = [
        col for col in df.columns
        if not (col == time_col) and _mostly_numeric(col)
    ]
    return time_col, numeric


In [12]:
# Cell 4 - Per-file preprocessing (no resampling) and save
def _filter_or_keep(filter_fn, x, stage, channel, src_path, **kwargs):
    """Run one filter stage; on failure warn and return the stage's input unchanged."""
    try:
        return filter_fn(x, **kwargs)
    except Exception as exc:
        # Best-effort pipeline: keep going, but make the fallback visible
        # instead of silently emitting partially filtered data.
        warnings.warn(
            f"{stage} filter failed for column {channel!r} in {src_path}: "
            f"{type(exc).__name__}: {exc}; keeping this stage's input unchanged."
        )
        return x


def preprocess_and_save(src_path, dst_path,
                        notch_freq=NOTCH_FREQ, lowcut=LOWCUT, highcut=HIGHCUT,
                        env_cutoff=ENV_CUTOFF, fs=FS):
    """Filter every signal column of one CSV and write the envelopes to ``dst_path``.

    Each column selected by ``detect_time_and_emg_columns`` goes through
    notch -> band-pass -> rectification (absolute value) -> low-pass. A stage
    that raises is skipped for that column with a warning, and the data it
    received is passed on unchanged. No resampling is performed.

    Parameters
    ----------
    src_path : str
        CSV file to read.
    dst_path : str
        CSV file to write; its parent directory is created when missing.
    notch_freq, lowcut, highcut, env_cutoff : float
        Frequencies forwarded to the notch, band-pass and low-pass helpers.
    fs : float
        Sampling rate forwarded to the filters; also used to synthesise a
        "time" column when the input has none.

    Returns
    -------
    pandas.DataFrame or None
        The frame that was written (time column first, then one envelope
        column per signal column), or ``None`` when the file had no usable
        signal columns and was skipped.
    """
    df = pd.read_csv(src_path)
    time_col, emg_cols = detect_time_and_emg_columns(df)
    if len(emg_cols) == 0:
        # nothing to process; the file is skipped and nothing is written
        print("Skipping (no EMG columns found):", src_path)
        return None

    # Coerce the signal columns to float; unparseable or missing samples
    # become 0 (interpolation would be an alternative policy).
    emg = (
        df[emg_cols]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .to_numpy(dtype=float)
    )

    # Keep the original time vector when one was detected; otherwise build a
    # time axis in seconds from the sample index and fs.
    if time_col is not None:
        time_name = time_col
        time_vec = pd.to_numeric(df[time_col], errors='coerce').to_numpy()
    else:
        time_name = "time"
        time_vec = np.arange(emg.shape[0]) / fs

    # Channel-wise envelope: notch -> band-pass -> rectify -> low-pass.
    env_mat = np.zeros_like(emg)
    for i, col in enumerate(emg_cols):
        x = _filter_or_keep(notch_filter, emg[:, i], "notch", col, src_path,
                            freq=notch_freq, fs=fs)
        x = _filter_or_keep(bandpass_filter, x, "band-pass", col, src_path,
                            lowcut=lowcut, highcut=highcut, fs=fs)
        env_mat[:, i] = _filter_or_keep(lowpass_filter, np.abs(x),
                                        "envelope low-pass", col, src_path,
                                        cutoff=env_cutoff, fs=fs)

    out_df = pd.DataFrame(env_mat, columns=emg_cols)
    out_df.insert(0, time_name, time_vec)

    # A bare file name has an empty dirname, and os.makedirs('') raises.
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    out_df.to_csv(dst_path, index=False)
    return out_df


In [13]:
# Cell 5 - Batch processing loop (walks tree, preserves folder structure)
src_files = find_csv_files(SOURCE_DIR)
print(f"Found {len(src_files)} CSV files to preprocess.")

# Files that raised during preprocessing, kept for the summary below.
failed_files = []
for src in tqdm(src_files):
    # Mirror the source tree: same relative path under DEST_DIR.
    rel = os.path.relpath(src, SOURCE_DIR)
    dst = os.path.join(DEST_DIR, rel)
    try:
        preprocess_and_save(src, dst)
    except Exception as e:
        # Best-effort batch: record the failure and continue with the next file.
        failed_files.append(src)
        # tqdm.write prints without breaking the progress bar rendering; the
        # exception type is included because some messages (e.g. KeyError)
        # are unreadable without it.
        tqdm.write(f"Error processing {src}: {type(e).__name__}: {e}")

if failed_files:
    print(f"{len(failed_files)} of {len(src_files)} files failed to preprocess.")


Found 5 CSV files to preprocess.


100%|██████████| 5/5 [00:00<00:00,  5.27it/s]


In [14]:
# Cell 6 - Quick integrity check (list a few saved files and preview)
from IPython.display import display

# List up to five of the files now present under DEST_DIR and preview the
# first one, as a quick sanity check on the batch run.
sample_saved = find_csv_files(DEST_DIR)[:5]

print("Saved files (first 5):")
for s in sample_saved:
    print(f"  {s}")

if len(sample_saved) > 0:
    preview = pd.read_csv(sample_saved[0]).head()
    display(preview)


Saved files (first 5):
  data/preprocessed_ml\ChipShot_Jordan_1.csv
  data/preprocessed_ml\ChipShot_Jordan_2.csv
  data/preprocessed_ml\ChipShot_Jordan_3.csv
  data/preprocessed_ml\ChipShot_Jordan_4.csv
  data/preprocessed_ml\ChipShot_Jordan_5.csv


Unnamed: 0,Time,Rectus Femoris right,Rectus Femoris left,Hamstrings right,Hamstrings left,TibilaisÂ Anterior right,TibilaisÂ Anterior left,Gastrocnemius right,Gastrocnemius left
0,0.0,0.031887,-0.034612,-0.041638,0.014238,0.012791,-0.162861,0.007584,0.056808
1,0.000465,0.032508,-0.033341,-0.040439,0.014347,0.013244,-0.158401,0.007806,0.057333
2,0.000931,0.033127,-0.032072,-0.039242,0.014454,0.013696,-0.153958,0.00803,0.057856
3,0.001397,0.033746,-0.030805,-0.038047,0.014561,0.014147,-0.149534,0.008256,0.058377
4,0.001862,0.034363,-0.029539,-0.036855,0.014667,0.014597,-0.145128,0.008484,0.058896
