### Loop to clean the CSV file

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.signal import medfilt, welch
from scipy.stats import skew, kurtosis, iqr, entropy
import glob
import re

In [None]:
def clean_sensor_csv(input_path, subject, motion, sensor, output_dir="../cleaned", save_csv=True,
                     sample_period_ms=25):
    """
    Clean one raw sensor CSV export and optionally save the result.

    Steps:
    - Skip the 12 header rows that precede the column names
    - Drop 'SampleTimeFine' if the column is present
    - Replace 'PacketCounter' with 'PacketTime_ms': the counter re-based so the
      first row is 0, multiplied by ``sample_period_ms`` (25 ms per packet = 40 Hz)
    - Save as e.g. T1_M5_wrist.csv inside ``output_dir`` when ``save_csv`` is True

    Parameters
    ----------
    input_path : str
        Path of the raw CSV file.
    subject, motion : str
        Used only to build the output file name (``{subject}_{motion}_{placement}.csv``).
    sensor : str
        Sensor ID. '00B4381A' is labelled 'wrist'; any other ID is labelled 'ankle'.
    output_dir : str
        Directory for the cleaned file; created if it does not exist.
    save_csv : bool
        When False, nothing is written to disk.
    sample_period_ms : int
        Milliseconds represented by one packet-counter step (default 25).

    Returns
    -------
    pandas.DataFrame
        The cleaned data, with 'PacketTime_ms' as the last column.

    Raises
    ------
    KeyError
        If the file has no 'PacketCounter' column.
    """
    raw = pd.read_csv(input_path, skiprows=12)
    if "PacketCounter" not in raw.columns:
        raise KeyError(f"'PacketCounter' column not found in {input_path}")

    cleaned = raw
    if "SampleTimeFine" in cleaned.columns:
        cleaned = cleaned.drop(columns=["SampleTimeFine"])

    # Build new frames instead of mutating in place so the raw frame stays untouched.
    packet_counter = cleaned["PacketCounter"]
    elapsed_ms = (packet_counter - packet_counter.iloc[0]) * sample_period_ms
    cleaned = cleaned.drop(columns=["PacketCounter"]).assign(PacketTime_ms=elapsed_ms)

    if save_csv:
        os.makedirs(output_dir, exist_ok=True)
        # Only the wrist ID is checked explicitly; every other ID falls back to 'ankle'.
        placement = 'wrist' if sensor == '00B4381A' else 'ankle'
        filename = f"{subject}_{motion}_{placement}.csv"
        cleaned.to_csv(os.path.join(output_dir, filename), index=False)

    return cleaned


In [3]:
# Clean every regular recording; M10 is excluded from this motion list
subjects = [f"T{i}" for i in (1, 2, 3, 4, 5, 11, 12, 13, 14, 17, 18, 19, 20,
                              21, 22, 23, 24, 25, 26, 28, 29)]
motions = [f"M{i}" for i in (5, 6, 7, 8, 9, 11)]
sensors = {"00B4381A": "wrist", "00B43876": "ankle"}

for subj in subjects:
    for motion in motions:
        for sensor_id, sensor_name in sensors.items():
            file_path = f"../{subj}/{motion}/M_{sensor_id}.csv"

            # Report missing recordings and move on to the next one
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue

            # A single bad file is reported but does not stop the batch
            try:
                clean_sensor_csv(file_path, subj, motion, sensor_id)
                print(f"Cleaned {subj} {motion} {sensor_name}")
            except Exception as e:
                print(f"Failed to clean {subj} {motion} {sensor_name}: {e}")

Cleaned T1 M5 wrist
Cleaned T1 M5 ankle
Cleaned T1 M6 wrist
Cleaned T1 M6 ankle
Cleaned T1 M7 wrist
Cleaned T1 M7 ankle
Cleaned T1 M8 wrist
Cleaned T1 M8 ankle
Cleaned T1 M9 wrist
Cleaned T1 M9 ankle
Cleaned T1 M11 wrist
Cleaned T1 M11 ankle
Cleaned T2 M5 wrist
Cleaned T2 M5 ankle
Cleaned T2 M6 wrist
Cleaned T2 M6 ankle
Cleaned T2 M7 wrist
Cleaned T2 M7 ankle
Cleaned T2 M8 wrist
Cleaned T2 M8 ankle
Cleaned T2 M9 wrist
Cleaned T2 M9 ankle
Cleaned T2 M11 wrist
Cleaned T2 M11 ankle
Cleaned T3 M5 wrist
Cleaned T3 M5 ankle
Cleaned T3 M6 wrist
Cleaned T3 M6 ankle
Cleaned T3 M7 wrist
Cleaned T3 M7 ankle
Cleaned T3 M8 wrist
Cleaned T3 M8 ankle
Cleaned T3 M9 wrist
Cleaned T3 M9 ankle
Cleaned T3 M11 wrist
Cleaned T3 M11 ankle
Cleaned T4 M5 wrist
Cleaned T4 M5 ankle
Cleaned T4 M6 wrist
Cleaned T4 M6 ankle
Cleaned T4 M7 wrist
Cleaned T4 M7 ankle
Cleaned T4 M8 wrist
Cleaned T4 M8 ankle
Cleaned T4 M9 wrist
Cleaned T4 M9 ankle
Cleaned T4 M11 wrist
Cleaned T4 M11 ankle
Cleaned T5 M5 wrist
Cleaned T5 M

In [4]:
# Loop for M10
def clean_m10_walk_segments(subject, m10_dir, output_dir="../cleaned", n_segments=6):
    """
    Clean the walk-segment files (WT0, WT1, ...) of one subject's M10 folder.

    For each segment index and each sensor, the file ``WT{i}_{sensor_id}.csv`` in
    ``m10_dir`` is cleaned with ``clean_sensor_csv`` (same header skip, column
    drops and 'PacketTime_ms' conversion as the other motions) and written to
    ``{subject}_M10_WT{i}_{wrist|ankle}.csv`` in ``output_dir``.

    Parameters
    ----------
    subject : str
        Subject label used in the output file names and log messages.
    m10_dir : str
        Folder containing the raw WT*.csv files.
    output_dir : str
        Directory for the cleaned files; created when the first file is written.
    n_segments : int
        Number of walk segments to look for, starting at WT0 (default 6: WT0–WT5).

    Missing files and per-file failures are printed, not raised.
    """
    sensors = {"00B4381A": "wrist", "00B43876": "ankle"}

    for i in range(n_segments):  # WT0–WT5 by default
        for sensor_id, sensor_name in sensors.items():
            input_path = os.path.join(m10_dir, f"WT{i}_{sensor_id}.csv")
            if not os.path.exists(input_path):
                print(f"File not found: {input_path}")
                continue

            try:
                # Reuse the shared cleaning routine; saving is done here because
                # the M10 output name carries the extra WT{i} segment label.
                df = clean_sensor_csv(input_path, subject, "M10", sensor_id, save_csv=False)

                os.makedirs(output_dir, exist_ok=True)
                cleaned_name = f"{subject}_M10_WT{i}_{sensor_name}.csv"
                df.to_csv(os.path.join(output_dir, cleaned_name), index=False)
                print(f"Cleaned {subject} M10 WT{i} {sensor_name}")
            except Exception as e:
                print(f"Failed to clean {subject} M10 WT{i} {sensor_name}: {e}")

# Clean the M10 walk segments of every participant that has an M10 folder
subjects = [f"T{i}" for i in (1, 2, 3, 4, 5, 11, 12, 13, 14, 17, 18, 19, 20,
                              21, 22, 23, 24, 25, 26, 28, 29)]
for subj in subjects:
    m10_dir = f"../{subj}/M10"
    if not os.path.exists(m10_dir):
        continue  # nothing to clean for this participant
    clean_m10_walk_segments(subj, m10_dir)


Cleaned T1 M10 WT0 wrist
Cleaned T1 M10 WT0 ankle
Cleaned T1 M10 WT1 wrist
Cleaned T1 M10 WT1 ankle
Cleaned T1 M10 WT2 wrist
Cleaned T1 M10 WT2 ankle
Cleaned T1 M10 WT3 wrist
Cleaned T1 M10 WT3 ankle
Cleaned T1 M10 WT4 wrist
Cleaned T1 M10 WT4 ankle
Cleaned T1 M10 WT5 wrist
Cleaned T1 M10 WT5 ankle
Cleaned T2 M10 WT0 wrist
Cleaned T2 M10 WT0 ankle
Cleaned T2 M10 WT1 wrist
Cleaned T2 M10 WT1 ankle
Cleaned T2 M10 WT2 wrist
Cleaned T2 M10 WT2 ankle
Cleaned T2 M10 WT3 wrist
Cleaned T2 M10 WT3 ankle
Cleaned T2 M10 WT4 wrist
Cleaned T2 M10 WT4 ankle
Cleaned T2 M10 WT5 wrist
Cleaned T2 M10 WT5 ankle
Cleaned T3 M10 WT0 wrist
Cleaned T3 M10 WT0 ankle
Cleaned T3 M10 WT1 wrist
Cleaned T3 M10 WT1 ankle
Cleaned T3 M10 WT2 wrist
Cleaned T3 M10 WT2 ankle
Cleaned T3 M10 WT3 wrist
Cleaned T3 M10 WT3 ankle
Cleaned T3 M10 WT4 wrist
Cleaned T3 M10 WT4 ankle
Cleaned T3 M10 WT5 wrist
Cleaned T3 M10 WT5 ankle
Cleaned T4 M10 WT0 wrist
Cleaned T4 M10 WT0 ankle
Cleaned T4 M10 WT1 wrist
Cleaned T4 M10 WT1 ankle


In [5]:
# All cleaned data into one dataframe
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to make the row order of the combined frame reproducible across machines.
all_csv_files = sorted(glob.glob('../cleaned/*.csv'))
if not all_csv_files:
    # pd.concat([]) would fail with a much less helpful message
    raise FileNotFoundError("No cleaned CSV files found in ../cleaned")

# Read all files, tagging every row with the file it came from
dfs = [pd.read_csv(f).assign(source_file=os.path.basename(f))
       for f in all_csv_files]

# Combined into one dataframe
combined_df = pd.concat(dfs, ignore_index=True)

In [7]:
# Show the combined frame (bare last expression -> notebook rich display)
combined_df

Unnamed: 0,Acc_X,Acc_Y,Acc_Z,FreeAcc_E,FreeAcc_N,FreeAcc_U,Gyr_X,Gyr_Y,Gyr_Z,Quat_q0,...,Pitch,Yaw,PacketTime_ms,source_file,Latitude,Longitude,Altitude,Vel_E,Vel_N,Vel_U
0,1.557896,9.676229,-1.280055,-0.016881,0.006758,0.071374,-0.002425,0.002683,-0.002856,0.338488,...,-9.039794,-128.519394,0,T11_M10_WT0_ankle.csv,,,,,,
1,1.547948,9.671388,-1.301751,0.006167,0.000614,0.067920,-0.001086,0.001074,-0.004578,0.338464,...,-9.039372,-128.522711,25,T11_M10_WT0_ankle.csv,,,,,,
2,1.562017,9.652690,-1.276506,-0.022484,0.001207,0.048513,-0.001400,0.004174,-0.004783,0.338454,...,-9.039158,-128.523871,50,T11_M10_WT0_ankle.csv,,,,,,
3,1.551941,9.647881,-1.298117,0.000716,-0.004934,0.045062,-0.003201,0.003915,-0.005942,0.338439,...,-9.038769,-128.526502,75,T11_M10_WT0_ankle.csv,,,,,,
4,1.532116,9.690491,-1.326086,0.034524,0.002288,0.087328,-0.001557,0.005724,-0.004886,0.338435,...,-9.038616,-128.526763,100,T11_M10_WT0_ankle.csv,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1399982,-2.200050,2.485923,-5.966106,-0.650635,-0.790088,-3.062366,0.139885,0.199611,0.377971,-0.121401,...,27.311063,-125.585254,48775,T5_M9_wrist.csv,,,,,,
1399983,-2.843399,3.183019,-6.508719,-0.872836,-0.320605,-2.085130,0.300801,0.408073,0.372705,-0.119979,...,27.018531,-125.702725,48800,T5_M9_wrist.csv,,,,,,
1399984,-2.849498,3.137758,-6.878788,-0.800420,-0.489865,-1.787605,0.370989,0.589428,0.311443,-0.118023,...,26.639203,-125.741380,48825,T5_M9_wrist.csv,,,,,,
1399985,-3.259757,3.203016,-7.497140,-0.595948,-0.445194,-1.064048,0.308158,0.678210,0.208452,-0.115714,...,26.237966,-125.698922,48850,T5_M9_wrist.csv,,,,,,


In [None]:
# Write the combined frame to combined_data.csv (relative path), without the row index
combined_df.to_csv(path_or_buf='combined_data.csv', index=False)

## Median Filter

In [None]:
# Apply median filter to all cleaned files
def median_filter(cleaned_folder="../cleaned"):
    """
    Apply median filter to all cleaned CSVs
    """
    filtered_data = {}
    files = sorted(glob.glob(f"{cleaned_folder}/*.csv"))
    signals_to_filter = ["Acc_X", "Acc_Y", "Acc_Z", "FreeAcc_E", "FreeAcc_N", "FreeAcc_U",
                         "Gyr_X", "Gyr_Y", "Gyr_Z", "Quat_q0", "Quat_q1", "Quat_q2", "Quat_q3",
                         "Roll", "Pitch", "Yaw"]

    for file_path in files:
        df = pd.read_csv(file_path)

        # Apply median filter
        for col in signals_to_filter:
            if col in df.columns:
                df[col] = medfilt(df[col], kernel_size=3)

        # Store in a dictionary using filename as key
        key = os.path.basename(file_path).replace(".csv", "")

        filtered_data[key] = df

    return filtered_data


filtered_data_dict = median_filter()


In [6]:
# Show the filtered recordings (dict: file name without extension -> DataFrame)
filtered_data_dict

{'T11_M10_WT0_ankle':          Acc_X     Acc_Y     Acc_Z  FreeAcc_E  FreeAcc_N  FreeAcc_U     Gyr_X  \
 0     1.547948  9.671388 -1.280055   0.000000   0.000614   0.067920 -0.001086   
 1     1.557896  9.671388 -1.280055  -0.016881   0.001207   0.067920 -0.001400   
 2     1.551941  9.652690 -1.298117   0.000716   0.000614   0.048513 -0.001400   
 3     1.551941  9.652690 -1.298117   0.000716   0.001207   0.048513 -0.001557   
 4     1.534013  9.673602 -1.298117   0.000716   0.002288   0.063518 -0.003201   
 ...        ...       ...       ...        ...        ...        ...       ...   
 2211  0.955272  9.818328 -0.978958   0.056813   0.187227   0.097461  0.120099   
 2212  0.955272  9.761436 -1.039333   0.008652   0.210952   0.053384  0.116985   
 2213  1.069702  9.743563 -1.115152  -0.003864   0.312548   0.048032  0.091267   
 2214  1.069702  9.709308 -1.160449  -0.028392   0.312548   0.013846  0.073818   
 2215  1.034714  9.676686 -1.115152  -0.003864   0.294255   0.000000  0.04587

## Feature Extraction

40 Hz - 40 rows per second  
5 seconds = 5 × 40 = 200 samples  
     
Pairwise correlation of X, Y, Z for each sensor.  
- Acc_X vs Acc_Y
- Acc_X vs Acc_Z
- Acc_Y vs Acc_Z
- Gyr_X vs Gyr_Y
- Gyr_X vs Gyr_Z
- Gyr_Y vs Gyr_Z
- FreeAcc_E vs FreeAcc_N 
- FreeAcc_E vs FreeAcc_U
- FreeAcc_N vs FreeAcc_U

### 7 Time Domain Features:

| Feature Name                    | What It Does                      | Function                        |
| ------------------------------- | --------------------------------- | -------------------------------------- |
| **mean**                        | average value                     | `np.mean()`                            |
| **standard deviation**          | spread of values                  | `np.std()`                             |
| **skewness**                    | how lopsided the data is          | `skew()`                               |
| **kurtosis**                    | how peaky the data is             | `kurtosis()`                           |
| **inter-quartile range** (IQR)  | middle 50% range                  | `iqr()`                                |
| **signal magnitude area** (SMA) | average movement                  | `np.sum(np.abs(values)) / len(values)` |
| **pairwise correlation**        | relationship between X, Y, Z axes | `np.corrcoef()`                        |


### 2 Frequency Domain Features:
| Feature                          | Meaning                               | Function                      |
| -------------------------------- | ------------------------------------- | ----------------------------- |
| **SPE (Spectral Power Entropy)** | randomness in frequency domain        | `entropy(Pxx)`, `scipy.stats` |
| **PPF (Peak Power Frequency)**   | the strongest frequency in the signal | `f[np.argmax(Pxx)]`, `scipy.signal.welch`|


Total features = (6 time-domain features × 16 signals + 2 frequency-domain features × 16 signals + 3 pairwise correlations × 3 sensors) × 2 placements (wrist + ankle) = (96 + 32 + 9) × 2 = 274 features

In [None]:
# Feature extraction pipeline
def extract_features_from_window(window_df, prefix="", fs=40):
    """
    Compute the feature dictionary for one window of sensor data.

    Per signal (16 signals): mean, population standard deviation, skewness,
    kurtosis, inter-quartile range, signal magnitude area (mean absolute value),
    spectral power entropy and peak power frequency (both from a Welch PSD).
    Per 3-axis sensor (Acc, Gyr, FreeAcc): the 3 pairwise Pearson correlations.

    Parameters
    ----------
    window_df : pandas.DataFrame
        One window of samples; must contain all 16 signal columns.
    prefix : str
        Prepended to every feature name (e.g. "ankle_" or "wrist_").
    fs : float
        Sampling frequency in Hz passed to ``welch`` (default 40).

    Returns
    -------
    dict
        Feature name -> value, in insertion order.

    Raises
    ------
    KeyError
        If one of the 16 signal columns is missing from ``window_df``.
    """
    features = {}
    signals = ["Acc_X", "Acc_Y", "Acc_Z", "FreeAcc_E", "FreeAcc_N", "FreeAcc_U",
               "Gyr_X", "Gyr_Y", "Gyr_Z", "Quat_q0", "Quat_q1", "Quat_q2", "Quat_q3",
               "Roll", "Pitch", "Yaw"]

    # Calculate the 6 time domain features and 2 frequency domain features
    for col in signals:
        values = window_df[col].values
        features[f"{prefix}{col}_mean"] = np.mean(values)
        features[f"{prefix}{col}_std"] = np.std(values)
        features[f"{prefix}{col}_skew"] = skew(values)
        features[f"{prefix}{col}_kurt"] = kurtosis(values)
        features[f"{prefix}{col}_iqr"] = iqr(values)
        features[f"{prefix}{col}_sma"] = np.sum(np.abs(values)) / len(values)

        # welch defaults to nperseg=256, which is longer than a 200-sample window
        # and triggers a UserWarning on every call. Capping it at the window
        # length gives the same PSD without the warning.
        f, Pxx = welch(values, fs=fs, nperseg=min(256, len(values)))
        features[f"{prefix}{col}_spe"] = entropy(Pxx)
        features[f"{prefix}{col}_ppf"] = f[np.argmax(Pxx)]

    # Calculate pairwise correlations for Acc, Gyr and FreeAcc sensors
    for sensor in ["Acc", "Gyr", "FreeAcc"]:
        if sensor == "FreeAcc":
            # FreeAcc uses E/N/U
            cols = [f"{sensor}_E", f"{sensor}_N", f"{sensor}_U"]
        else:
            cols = [f"{sensor}_X", f"{sensor}_Y", f"{sensor}_Z"]

        # Each unordered axis pair once: (0,1), (0,2), (1,2)
        for i in range(3):
            for j in range(i + 1, 3):
                if cols[i] in window_df.columns and cols[j] in window_df.columns:
                    corr = np.corrcoef(window_df[cols[i]], window_df[cols[j]])[0, 1]
                    features[f"{prefix}{cols[i]}_{cols[j]}_corr"] = corr

    return features

In [None]:
# Segment signal and extract features
from collections import defaultdict

window_size = 200  # 5 seconds @ 40Hz
step_size = 100    # 50% overlap
all_feature_rows = []
grouped = defaultdict(dict)

# Pair up the ankle and wrist recordings of each (subject, activity)
for name, df in filtered_data_dict.items():
    filename = os.path.basename(name).replace(".csv", "")

    # Subject number and activity label (M10 keeps its WT segment) come from the name
    match = re.match(r"T(\d+)_((M10_WT\d+)|M\d+)_.*", filename)
    if not match:
        print(f"Skip: {filename}")
        continue

    subject, activity = int(match.group(1)), match.group(2)
    sensor = "ankle" if "ankle" in filename else "wrist"
    grouped[(subject, activity)][sensor] = df

# One feature row per overlapping window of every complete ankle/wrist pair
for (subject, activity), sensors in grouped.items():
    if not ("ankle" in sensors and "wrist" in sensors):
        print(f"Missing sensor for Subject {subject} Activity {activity}")
        continue

    df_ankle, df_wrist = sensors["ankle"], sensors["wrist"]
    # Only the span covered by both recordings is windowed
    min_len = min(len(df_ankle), len(df_wrist))
    last_start = min_len - window_size

    for window_id, start in enumerate(range(0, last_start + 1, step_size), 1):
        stop = start + window_size
        feats = {
            **extract_features_from_window(df_ankle.iloc[start:stop], prefix="ankle_"),
            **extract_features_from_window(df_wrist.iloc[start:stop], prefix="wrist_"),
            "window": window_id,
            "Subject": subject,
            "Activity": activity,
        }
        all_feature_rows.append(feats)

features_df = pd.DataFrame(all_feature_rows)
os.makedirs("../features", exist_ok=True)
features_df.to_csv("../features/all_features_merged.csv", index=False)


  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,


In [None]:
# Preview the first rows of the feature table
features_df.head()

Unnamed: 0,ankle_Acc_X_mean,ankle_Acc_X_std,ankle_Acc_X_skew,ankle_Acc_X_kurt,ankle_Acc_X_iqr,ankle_Acc_X_sma,ankle_Acc_X_spe,ankle_Acc_X_ppf,ankle_Acc_Y_mean,ankle_Acc_Y_std,...,wrist_Acc_Y_Acc_Z_corr,wrist_Gyr_X_Gyr_Y_corr,wrist_Gyr_X_Gyr_Z_corr,wrist_Gyr_Y_Gyr_Z_corr,wrist_FreeAcc_E_FreeAcc_N_corr,wrist_FreeAcc_E_FreeAcc_U_corr,wrist_FreeAcc_N_FreeAcc_U_corr,window,Subject,Activity
0,1.574001,1.229563,-3.942524,26.178878,0.018981,1.814221,2.706374,0.8,9.682674,0.74852,...,-0.744998,0.150898,-0.615163,-0.050829,-0.240072,0.269405,0.339014,1,11,M10_WT0
1,1.076608,2.317051,-2.65029,7.476643,1.093011,2.057104,3.576606,0.8,9.691851,1.885551,...,-0.927674,0.155275,-0.699899,-0.00389,-0.145753,-0.15881,0.341447,2,11,M10_WT0
2,0.695348,3.603414,-1.472929,9.654722,1.94303,2.274548,3.418213,1.0,10.084883,1.963305,...,-0.55146,0.162346,-0.310301,0.135889,0.335629,-0.209852,0.135999,3,11,M10_WT0
3,1.368532,6.003468,0.615921,5.839792,2.282835,3.419414,3.037735,0.8,10.670244,2.682854,...,0.205224,0.012755,-0.567457,0.504836,0.507268,-0.037528,0.124332,4,11,M10_WT0
4,1.79298,6.448243,0.482381,3.610016,2.744634,4.223076,3.496999,0.6,11.018744,3.578314,...,0.09273,-0.042941,-0.675152,0.415525,0.401421,-0.167218,0.181889,5,11,M10_WT0
