In [None]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
import tensorflow as tf
# Ask TensorFlow for the GPU device name. The recorded output of this cell is an
# empty string; presumably that means no GPU was attached to the runtime — confirm
# against the TensorFlow documentation.
device_list = tf.test.gpu_device_name()
device_list

Mounted at /content/drive


''

# Data preparation

Data:

* ├─1
* │  …………
* └─25

  * ├─back
  * ├─forward
  * ├─halfsquat
  * ├─still

Here, 1–25 are the indices of the participants.


We attempted to automate the data processing workflow. We uploaded everything to the drive and created functions for extracting, segmenting, and concatenating the data. Further details about these functions will be provided in the following sections.

In [None]:
def ensure_dataframe(file_path):
    """
    Load a CSV file as a transposed DataFrame, or fall back to an empty DataFrame.

    Args:
    - file_path : Path of the CSV file to read.

    Returns:
    - The transposed content of the CSV (its first column is used as the index
      before transposing) when the file exists; an empty DataFrame otherwise.
    """
    # Guard clause: report the missing file and hand back an empty frame.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}. Returning empty DataFrame.")
        return pd.DataFrame()

    frame = pd.read_csv(file_path, index_col=0)
    return frame.T


def categorise_data(back_data, forward_data, halfsquat_data, still_data):
    """
    Stack the four per-activity DataFrames and label every row with its activity.

    Args:
    - back_data, forward_data, halfsquat_data, still_data : DataFrames holding the
      samples recorded for the 'back', 'forward', 'halfsquat' and 'still' activities.

    Returns:
    - The row-wise concatenation of the four frames (in that order, original
      indices kept) with an extra 'category' column naming the source activity.
    """
    labelled_frames = (
        ('back', back_data),
        ('forward', forward_data),
        ('halfsquat', halfsquat_data),
        ('still', still_data),
    )

    combined = pd.concat([frame for _, frame in labelled_frames], axis=0)

    # One label per row, repeated as many times as the source frame has rows.
    labels = []
    for label, frame in labelled_frames:
        labels.extend([label] * frame.shape[0])

    combined['category'] = labels
    return combined



## Sliding window

### EMG

#### Calculate RMS, WL, SSC, and ZC for each sliding window

In this section, we calculate the features **RMS** (Root Mean Square), **WL** (Waveform Length), **SSC** (Slope Sign Changes), and **ZC** (Zero Crossings) as described in the research paper. The functions for these calculations are presented below:


In [None]:
import pandas as pd
import numpy as np


# EMG feature computation
def compute_rms(signal):
    """Return the root mean square of `signal`, i.e. sqrt(mean(signal ** 2))."""
    mean_power = np.mean(np.square(signal))
    return np.sqrt(mean_power)

def compute_waveform_length(signal):
    """Return the waveform length: the summed absolute difference between consecutive samples."""
    steps = np.diff(signal)
    return np.abs(steps).sum()

def compute_slope_sign_changes(signal):
    """
    Count the slope sign changes (SSC) of a signal.

    A slope sign change is a switch between a rising and a falling slope.
    No amplitude threshold is applied.

    Args:
    - signal : 1-D sequence of samples.

    Returns:
    - Number of times the direction of the slope flips (0 for signals with
      fewer than three samples or with a constant direction).
    """
    slope_signs = np.sign(np.diff(signal))

    # Flat segments (zero slope) have no direction. Dropping them makes a plateau
    # between a rise and a fall count once, and a plateau inside a monotonic
    # stretch count zero times, instead of being counted as two changes.
    slope_signs = slope_signs[slope_signs != 0]

    return np.sum(slope_signs[1:] != slope_signs[:-1])

def compute_zero_crossings(signal):
    """
    Count the zero crossings (ZC) of a signal.

    A zero crossing is a switch between a positive and a negative sample.
    No amplitude threshold is applied.

    Args:
    - signal : 1-D sequence of samples.

    Returns:
    - Number of times the signal changes sign.
    """
    signs = np.sign(signal)

    # Samples that are exactly zero lie on the axis and have no sign. Dropping them
    # makes "+, 0, -" count as one crossing and "+, 0, +" as none, instead of
    # two crossings in both cases.
    signs = signs[signs != 0]

    return np.sum(signs[1:] != signs[:-1])

def extract_emg_features(window_data):
    """
    Compute RMS, WL, SSC and ZC for each of the 16 EMG channels of one window.

    Args:
    - window_data : DataFrame holding one window; must contain the 16 EMG columns.

    Returns:
    - Flat list of 64 values ordered channel by channel:
      [ch1_rms, ch1_wl, ch1_ssc, ch1_zc, ch2_rms, ch2_wl, ...].
    """
    emg_channels = (
        'R_Vlat', 'R_RF', 'R_ST', 'R_TA', 'L_Vlat', 'L_RF', 'L_ST', 'L_TA',
        'R_MG', 'R_LG', 'R_SOL', 'R_IL', 'L_MG', 'L_LG', 'L_SOL', 'L_IL',
    )
    # Applied in this order to every channel.
    extractors = (
        compute_rms,
        compute_waveform_length,
        compute_slope_sign_changes,
        compute_zero_crossings,
    )

    features = []
    for channel in emg_channels:
        samples = window_data[channel].values
        features.extend(extract(samples) for extract in extractors)

    return features

def sliding_window_segmentation_emg(df, window_samples, step_size):
    """
    Cut the data into sliding windows and extract the EMG features of each window.

    Windows are built separately for every category so that no window mixes
    samples from two activities.

    Args:
    - df : DataFrame containing the EMG signals and the labels ('category' column).
    - window_samples : Window size (in number of samples).
    - step_size : Sliding step (in number of samples).

    Returns:
    - segmented_df : DataFrame with one row per window. Columns are named
      '<channel>_<feature>' (feature in rms, wl, ssc, zc), channel by channel,
      followed by 'category'.
    """
    emg_channels = ['R_Vlat', 'R_RF', 'R_ST', 'R_TA', 'L_Vlat', 'L_RF', 'L_ST', 'L_TA',
                    'R_MG', 'R_LG', 'R_SOL', 'R_IL', 'L_MG', 'L_LG', 'L_SOL', 'L_IL']
    feature_suffixes = ['rms', 'wl', 'ssc', 'zc']

    segmented_data = []

    # Process each category on its own.
    for category in df['category'].unique():
        category_df = df[df['category'] == category]

        # Only full windows are kept; a trailing partial window is dropped.
        for start in range(0, len(category_df) - window_samples + 1, step_size):
            window_data = category_df.iloc[start:start + window_samples]

            emg_features = extract_emg_features(window_data)
            emg_features.append(category)
            segmented_data.append(emg_features)

    # extract_emg_features returns the values channel by channel
    # (rms, wl, ssc, zc of the first channel, then of the second, ...).
    # The column names must follow exactly that interleaved order, otherwise
    # the values end up under the wrong channel/feature label.
    column_names = [
        f"{channel}_{suffix}"
        for channel in emg_channels
        for suffix in feature_suffixes
    ] + ['category']

    segmented_df = pd.DataFrame(segmented_data, columns=column_names)

    return segmented_df

## IMU and IPS

In [None]:
import pandas as pd
import numpy as np

# Function to compute mean for IMU and IPS
def compute_mean(signal):
    """Return the arithmetic mean of `signal` (thin wrapper around `np.mean`)."""
    average = np.mean(signal)
    return average

# Function to extract features for IMU and IPS
def extract_imu_ips_features(window_data, sensors):
    """
    Compute the mean of every requested sensor column over one window.

    Args:
    - window_data : DataFrame holding the samples of one window.
    - sensors : Names of the columns to summarise.

    Returns:
    - List with one mean value per entry of `sensors`, in the same order.
    """
    return [compute_mean(window_data[name].values) for name in sensors]

In [None]:
# Function for sliding window segmentation for IMU and IPS data
def sliding_window_segmentation_imu_ips(df, window_samples, step_size):
    """
    Cut IMU or IPS data into sliding windows and extract the mean of every
    sensor column for each window.

    Args:
    - df : DataFrame containing the sensor columns and the labels ('category' column).
    - window_samples : Window size (in number of samples).
    - step_size : Sliding step (in number of samples).

    Returns:
    - segmented_df : DataFrame with one row per window, one '<sensor>_mean'
      column per sensor column of `df`, and the window's 'category'.
    """
    # Every column except the label is treated as a sensor channel.
    sensors = list(df.columns)
    sensors.remove('category')

    window_rows = []
    for label in df['category'].unique():
        # Windows never span two categories.
        label_rows = df[df['category'] == label]
        last_start = len(label_rows) - window_samples

        window_rows.extend(
            extract_imu_ips_features(label_rows.iloc[start:start + window_samples], sensors) + [label]
            for start in range(0, last_start + 1, step_size)
        )

    feature_columns = [f'{sensor}_mean' for sensor in sensors]
    feature_columns.append('category')

    return pd.DataFrame(window_rows, columns=feature_columns)



### MoCap

From the doc : the mean value of re-referenced coordinates for each sensor within each sliding window was extracted as a feature.

In [None]:
def extract_mocap_features(window_data, sensors_referenced, sensors_velocity):
    """
    Build the MoCap feature vector of one window.

    Parameters:
        window_data (pd.DataFrame): Samples of a single window, with one column
            per sensor axis named '<sensor>_x', '<sensor>_y', '<sensor>_z'.
        sensors_referenced (list of str): Sensors whose mean coordinates are extracted.
        sensors_velocity (list of str): Sensors whose mean velocity is extracted.

    Returns:
        list: First the mean coordinate of every axis of every sensor in
              `sensors_referenced`, then the mean sample-to-sample difference
              of every axis of every sensor in `sensors_velocity`.
    """
    axes = ('x', 'y', 'z')

    def axis_signals(sensor_names):
        # Yield the raw samples sensor by sensor, axis by axis (x, y, z).
        for name in sensor_names:
            for axis in axes:
                yield window_data[f'{name}_{axis}'].values

    # Position features: plain mean of each coordinate.
    position_means = [np.mean(samples) for samples in axis_signals(sensors_referenced)]

    # Velocity features: mean of the first difference of each coordinate.
    velocity_means = [np.mean(np.diff(samples)) for samples in axis_signals(sensors_velocity)]

    return position_means + velocity_means


In [None]:

def sliding_window_segmentation_mocap(df, window_samples, step_size, reference_sensor='IJ'):

    """
    Performs sliding window segmentation on motion capture (MoCap) data and extracts features for each window.

    Parameters:
        df (pd.DataFrame): MoCap data with '<sensor>_x', '<sensor>_y', '<sensor>_z'
            columns and a 'category' column.
        window_samples (int): The number of samples (rows) in each sliding window.
        step_size (int): The step size (number of rows) to move the window for the next segment.
        reference_sensor (str): Sensor excluded from the mean-coordinate features
            (it is still used for the velocity features). Defaults to 'IJ';
            pass e.g. 'C7' to use another reference.

    Returns:
        pd.DataFrame: One row per window with the columns
                      - '<sensor>_mean_<axis>' for every sensor except `reference_sensor`,
                      - '<sensor>_velocity_<axis>' for every sensor,
                      - 'category', the label of the window.

    Note:
        This function does not subtract the reference sensor from the other
        coordinates; it only leaves the reference out of the mean features.
        Presumably the coordinates were re-referenced upstream — TODO confirm.
    """
    axes = ['x', 'y', 'z']
    segmented_data = []

    # A sensor is recognised by its '<sensor>_x' column.
    available_sensors = [column[:-2] for column in df.columns if column.endswith('_x')]

    # Mean coordinates skip the reference sensor; velocities use every sensor.
    sensors_referenced = [sensor for sensor in available_sensors if sensor != reference_sensor]
    sensors_velocity = available_sensors

    for category in df['category'].unique():
        # Windows never span two categories.
        category_df = df[df['category'] == category]

        # Only full windows are kept; a trailing partial window is dropped.
        for start in range(0, len(category_df) - window_samples + 1, step_size):
            window_data = category_df.iloc[start:start + window_samples]

            features = extract_mocap_features(window_data, sensors_referenced, sensors_velocity)
            features.append(category)
            segmented_data.append(features)

    # Same order as the values returned by extract_mocap_features.
    feature_columns = [
        f'{sensor}_mean_{axis}' for sensor in sensors_referenced for axis in axes
    ] + [
        f'{sensor}_velocity_{axis}' for sensor in sensors_velocity for axis in axes
    ] + ['category']

    return pd.DataFrame(segmented_data, columns=feature_columns)


# Run data preparation

In this code, we chose to segment the data and create sliding windows for each person before concatenation. The only concatenation performed at this stage was, for instance, between emg_1 and emg_2 for each person. Throughout the process, we ensured that the category column names were preserved. Finally, we saved the processed data into separate files for each individual.

In [None]:
import os
import re
import pandas as pd


def segment_and_save(data, segmentation_func, window_samples, step_size, save_path):
    """
    Segment the data using the specified function and save the result as CSV.

    Args:
    - data : Input passed unchanged to `segmentation_func`.
    - segmentation_func : Callable `(data, window_samples, step_size) -> DataFrame`.
    - window_samples : Window size (in number of samples).
    - step_size : Sliding step (in number of samples).
    - save_path : Destination CSV path; missing parent folders are created.
    """
    segmented_data = segmentation_func(data, window_samples, step_size)

    # A bare file name has no parent folder (dirname is ''), and os.makedirs('')
    # would raise; exist_ok avoids failing when the folder is already there.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    segmented_data.to_csv(save_path)
    print(f"Saved segmented data to: {save_path}")

def process_additional_files(dataset_path, signal, categorise_data, segmentation_funcs, segmentation_params):
    """
    Detect and process every recording of a given signal in a participant folder.

    Recordings are files named '<signal>_<n>.csv' inside the activity folders
    ('back', 'forward', 'halfsquat', 'still'). For each recording, the four
    activity files are loaded, labelled, segmented and saved to
    '<dataset_path>/segmented_df/segmented_df_<signal>_<n>.csv'.

    Args:
    - dataset_path : Folder of one participant.
    - signal : Signal type, e.g. 'emg'.
    - categorise_data : Callable `(back, forward, halfsquat, still) -> DataFrame`
      that stacks and labels the four activity frames.
    - segmentation_funcs : Mapping signal -> segmentation function.
    - segmentation_params : Mapping signal -> dict with 'window_size' (seconds),
      'sampling_rate' (Hz) and 'step_size' (samples).

    Returns:
    - Set with the names ('<signal>_<n>') of the recordings segmented by this call.

    Raises:
    - KeyError : If a recording has to be segmented but `segmentation_params`
      has no complete entry for `signal`.
    """
    activities = ["back", "forward", "halfsquat", "still"]
    processed_files = set()
    # The same recording name shows up in every activity folder; remember the
    # ones already handled so each is processed (or reported) only once.
    seen_signals = set()

    filename_pattern = re.compile(rf"({re.escape(signal)}_\d+)\.csv")
    segmented_dir = os.path.join(dataset_path, "segmented_df")

    for activity in activities:
        activity_path = os.path.join(dataset_path, activity)
        if not os.path.isdir(activity_path):
            continue

        # Sorted for a reproducible processing order.
        for filename in sorted(os.listdir(activity_path)):
            # fullmatch: ignore names such as 'emg_1.csv.bak'.
            match = filename_pattern.fullmatch(filename)
            if not match:
                continue

            signal_name = match.group(1)
            if signal_name in seen_signals:
                continue
            seen_signals.add(signal_name)

            save_path = os.path.join(segmented_dir, f"segmented_df_{signal_name}.csv")
            if os.path.exists(save_path):
                print(f"Segmented file already exists for {signal_name}. Skipping processing.")
                continue

            # Without a segmentation function nothing can be saved, so do not
            # bother loading the data.
            segmentation_func = segmentation_funcs.get(signal)
            if segmentation_func is None:
                print(f"No segmentation function registered for {signal}. Skipping {signal_name}.")
                continue

            # Collect the data of this recording across activities.
            signal_data = {
                act: ensure_dataframe(os.path.join(dataset_path, act, f"{signal_name}.csv"))
                for act in activities
            }

            all_data = categorise_data(
                signal_data["back"],
                signal_data["forward"],
                signal_data["halfsquat"],
                signal_data["still"]
            )

            params = segmentation_params[signal]
            # seconds * sampling frequency; round() protects against float
            # products such as 28.999999999999996 being truncated to 28.
            window_samples = int(round(params["window_size"] * params["sampling_rate"]))
            step_size = params["step_size"]

            segment_and_save(all_data, segmentation_func, window_samples, step_size, save_path)
            processed_files.add(signal_name)

    return processed_files

def load_dataset(num, data_root=".../Project/data"):
    """
    Segment every recording of every signal type for one participant.

    Args:
    - num : Participant number (name of the participant folder).
    - data_root : Folder containing the participant folders. The default is a
      placeholder; pass the full path when running the notebook.
    """
    dataset_path = f"{data_root}/{num}/"
    signals = ["emg", "imu", "ips", "mocap"]

    # Segmentation parameters for each signal type:
    # sampling_rate in Hz, window_size in seconds, step_size in samples.
    segmentation_params = {
        "emg": {"sampling_rate": 2000, "window_size": 0.05, "step_size": 100}, # for emg as mentioned in the document
        "imu": {"sampling_rate": 100, "window_size": 0.05, "step_size": 5}, # for imu //
        "ips": {"sampling_rate": 60, "window_size": 0.05, "step_size": 3}, # for ips //
        "mocap": {"sampling_rate": 100, "window_size": 0.05, "step_size": 5}, # for //
    }

    # Segmentation functions for each signal type
    segmentation_funcs = {
        "emg": sliding_window_segmentation_emg,
        "imu": sliding_window_segmentation_imu_ips,
        "ips": sliding_window_segmentation_imu_ips,
        "mocap": sliding_window_segmentation_mocap,
    }

    # Process all recordings (e.g., emg_1, emg_2, imu_1, ...) of each signal type.
    for signal in signals:
        process_additional_files(dataset_path, signal, categorise_data, segmentation_funcs, segmentation_params)

# Run the segmentation for participant folders 4 to 25.
# NOTE(review): the later cells iterate over range(1, 26); folders 1-3 are not
# processed here — confirm they were segmented in an earlier run.
for i in range(4, 26):
    print(f"Processing dataset {i}...")
    load_dataset(i)


Processing dataset 4...
Segmented file already exists for emg_1. Skipping processing.
Segmented file already exists for emg_1. Skipping processing.
Segmented file already exists for emg_1. Skipping processing.
Segmented file already exists for emg_1. Skipping processing.
Segmented file already exists for imu_1. Skipping processing.
Segmented file already exists for imu_1. Skipping processing.
Segmented file already exists for imu_1. Skipping processing.
Segmented file already exists for imu_1. Skipping processing.
Segmented file already exists for ips_1. Skipping processing.
Segmented file already exists for ips_1. Skipping processing.
Segmented file already exists for ips_1. Skipping processing.
Segmented file already exists for ips_1. Skipping processing.
Segmented file already exists for mocap_1. Skipping processing.
Segmented file already exists for mocap_1. Skipping processing.
Segmented file already exists for mocap_1. Skipping processing.
Segmented file already exists for mocap_

#### Concatenate the files of participants with several recordings per signal (e.g. concatenate emg_1 and emg_2)

In [None]:
import os
import pandas as pd

def concatenate_files_in_directory(directory_path, prefixes):
    """
    Concatenate files that match specific patterns (like segmented_df_emg_1, segmented_df_imu_1, etc.)

    Files of one prefix are concatenated in ascending order of their numeric
    suffix (..._1, ..._2, ..., ..._10), so that every signal type stacks its
    recordings in the same order regardless of how the file system lists them.

    :param directory_path: The directory to scan for files
    :param prefixes: List of prefixes to look for (e.g., ['segmented_df_emg', 'segmented_df_imu', 'segmented_df_ips', 'segmented_df_mocap'])
    :return: A dictionary with concatenated data for each prefix (an empty
             DataFrame when no file matches). Original row indices are kept.
    """
    def recording_order(filename, prefix):
        # 'segmented_df_emg_10.csv' -> '10'; numeric suffixes sort as numbers,
        # anything else goes last in alphabetical order.
        suffix = filename[len(prefix):-len(".csv")].lstrip("_")
        if suffix.isdecimal():
            return (0, int(suffix), filename)
        return (1, 0, filename)

    filenames = os.listdir(directory_path)
    concatenated_data = {}

    for prefix in prefixes:
        matching = sorted(
            (name for name in filenames if name.startswith(prefix) and name.endswith(".csv")),
            key=lambda name: recording_order(name, prefix),
        )

        # Collect the frames and concatenate once instead of growing a
        # DataFrame inside the loop.
        frames = []
        for filename in matching:
            frames.append(pd.read_csv(os.path.join(directory_path, filename), index_col=0))
            print(f"Loaded and concatenated {filename}")

        concatenated_data[prefix] = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    return concatenated_data


# Example usage
directory_path = ".../Project/data/25/segmented_df/"
prefixes = ['segmented_df_emg', 'segmented_df_imu', 'segmented_df_ips', 'segmented_df_mocap']

# Call function to concatenate files
# NOTE(review): this result is overwritten by the loop below, which also covers
# participant 25, so this call looks redundant — confirm before removing.
concatenated_data = concatenate_files_in_directory(directory_path, prefixes)

# For every participant, merge the segmented files of each signal type and
# save one CSV per signal type next to them.
for i in range(1, 26):
    # Call function to concatenate files
    directory_path = f".../Project/data/{i}/segmented_df/"
    concatenated_data = concatenate_files_in_directory(directory_path, prefixes)
    for prefix, data in concatenated_data.items():
        # Written with the default index=True, so the row index becomes the first CSV column.
        data.to_csv(f".../Project/data/{i}/segmented_df/concatenated_data_{prefix}.csv")


Loaded and concatenated segmented_df_emg_1.csv
Loaded and concatenated segmented_df_imu_1.csv
Loaded and concatenated segmented_df_ips_1.csv
Loaded and concatenated segmented_df_mocap_1.csv
Loaded and concatenated segmented_df_emg_2.csv
Loaded and concatenated segmented_df_emg_3.csv
Loaded and concatenated segmented_df_imu_2.csv
Loaded and concatenated segmented_df_imu_3.csv
Loaded and concatenated segmented_df_ips_2.csv
Loaded and concatenated segmented_df_ips_3.csv
Loaded and concatenated segmented_df_mocap_3.csv
Loaded and concatenated segmented_df_mocap_2.csv
Loaded and concatenated segmented_df_emg_1.csv
Loaded and concatenated segmented_df_imu_1.csv
Loaded and concatenated segmented_df_ips_1.csv
Loaded and concatenated segmented_df_mocap_1.csv
Loaded and concatenated segmented_df_emg_1.csv
Loaded and concatenated segmented_df_imu_1.csv
Loaded and concatenated segmented_df_ips_1.csv
Loaded and concatenated segmented_df_mocap_1.csv
Loaded and concatenated segmented_df_emg_1.csv
Loa

#### Concatenate EMG, IMU, IPS, and MoCap data for each person

In [None]:
import os
import pandas as pd

# Base dataset path
dataset_base_path = ".../Project/data"

# File names to concatenate
file_names = [
    "concatenated_data_segmented_df_emg.csv",
    "concatenated_data_segmented_df_imu.csv",
    "concatenated_data_segmented_df_ips.csv",
    "concatenated_data_segmented_df_mocap.csv"
]

# One list of per-participant DataFrames for each file type.
# NOTE(review): this reuses the name `concatenated_data`, which the previous
# cell bound to a dict of DataFrames (not lists).
concatenated_data = {file_name: [] for file_name in file_names}

# Iterate through folders 1 to 25
for folder_num in range(1, 26):
    folder_path = os.path.join(dataset_base_path, str(folder_num), "segmented_df")

    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if os.path.exists(file_path):
            # Read CSV and add a column for the folder number.
            # No index_col is given, so the first CSV column is read as a regular
            # column (it appears as 'Unnamed: 0' in the outputs further down).
            df = pd.read_csv(file_path)
            df["person_id"] = folder_num  # Add a column for person ID
            concatenated_data[file_name].append(df)
        else:
            print(f"File not found: {file_path}")

# Concatenate all files of the same type and save
output_path = ".../Project/concatenated_data"
os.makedirs(output_path, exist_ok=True)

for file_name, dfs in concatenated_data.items():
    if dfs:  # Only process if we have data
        combined_df = pd.concat(dfs, ignore_index=True)
        combined_file_path = os.path.join(output_path, file_name)
        # index=False: the fresh 0..n-1 index is not written to the file.
        combined_df.to_csv(combined_file_path, index=False)
        print(f"Saved concatenated file: {combined_file_path}")


**PS:** In the folder ../Project/concatenated_data, there are four CSV files: one for EMG, one for IMU, one for IPS, and one for MoCap. Each file contains information for all participants. As shown below, each CSV file includes two additional columns: one for **categories** and another for **patient IDs**.

In [None]:
# NOTE: `dataset_base_path` is rebound here to the folder holding the combined
# files (it previously pointed at the per-participant data folder).
dataset_base_path = ".../Project/concatenated_data"


# Load the four combined feature tables; index_col=0 turns the first CSV column into the row index.
concatenated_data_segmented_df_emg = pd.read_csv(os.path.join(dataset_base_path, "concatenated_data_segmented_df_emg.csv"), index_col=0)
concatenated_data_segmented_df_imu = pd.read_csv(os.path.join(dataset_base_path, "concatenated_data_segmented_df_imu.csv"), index_col=0)
concatenated_data_segmented_df_ips = pd.read_csv(os.path.join(dataset_base_path, "concatenated_data_segmented_df_ips.csv"), index_col=0)
concatenated_data_segmented_df_mocap = pd.read_csv(os.path.join(dataset_base_path, "concatenated_data_segmented_df_mocap.csv"), index_col=0)

In [None]:
# Shape, then the first five rows, of the combined EMG feature table.
print(concatenated_data_segmented_df_emg.shape)
concatenated_data_segmented_df_emg.head(5)

(113446, 66)


Unnamed: 0,R_Vlat_rms,R_RF_rms,R_ST_rms,R_TA_rms,L_Vlat_rms,L_RF_rms,L_ST_rms,L_TA_rms,R_MG_rms,R_LG_rms,...,R_MG_zc,R_LG_zc,R_SOL_zc,R_IL_zc,L_MG_zc,L_LG_zc,L_SOL_zc,L_IL_zc,category,person_id
0,3.154265,292.75818,65,38,3.696738,282.788085,56,34,3.684881,195.675659,...,3.497438,229.110718,51,27,3.413615,145.623779,29,21,back,1
1,3.677968,326.696778,69,37,4.208268,287.722779,62,32,4.657688,368.994141,...,9.854272,394.674683,51,23,5.228666,243.411255,48,24,back,1
2,3.800089,267.279053,50,26,3.798007,264.962768,44,20,3.792937,284.197999,...,5.891582,253.17993,35,12,4.752426,123.065186,34,12,back,1
3,3.374545,292.657471,61,39,4.385206,314.007569,66,32,5.642971,368.893433,...,7.399809,394.473266,40,24,2.422399,159.622193,38,24,back,1
4,4.138228,382.388306,64,34,3.381075,289.837644,61,32,5.003856,253.582763,...,6.837258,340.7959,52,32,4.945372,221.759034,34,22,back,1


In [None]:
# Shape, then the first five rows, of the combined IMU feature table.
print(concatenated_data_segmented_df_imu.shape)
concatenated_data_segmented_df_imu.head(5)

(113448, 56)


Unnamed: 0,Head_Acc_X_mean,Head_Acc_Y_mean,Head_Acc_Z_mean,Head_Gyr_X_mean,Head_Gyr_Y_mean,Head_Gyr_Z_mean,Head_Roll_mean,Head_Pitch_mean,Head_Yaw_mean,Waist_Acc_X_mean,...,R_F_Acc_Y_mean,R_F_Acc_Z_mean,R_F_Gyr_X_mean,R_F_Gyr_Y_mean,R_F_Gyr_Z_mean,R_F_Roll_mean,R_F_Pitch_mean,R_F_Yaw_mean,category,person_id
0,8.904977,0.384233,4.042627,-0.011431,0.014266,-0.003312,-12.650186,-10.219978,30.065427,9.556038,...,-1.945828,9.028294,-0.011208,0.003525,-0.006971,10.771145,35.53506,-17.801271,back,1
1,8.932194,0.351909,4.096076,0.00309,0.03256,0.018453,-12.710571,-10.173904,30.079566,9.562018,...,-1.917709,9.022139,-0.00248,0.000588,0.005154,10.758786,35.533102,-17.822135,back,1
2,8.815747,0.335551,4.130934,0.012052,0.044186,0.031625,-12.83083,-10.102711,30.164629,9.567548,...,-1.950754,9.035559,0.00066,0.002772,0.008308,10.74195,35.54825,-17.8248,back,1
3,8.806464,0.340593,4.196511,0.011351,0.028329,0.0288,-12.93534,-10.051806,30.253489,9.566574,...,-1.91891,9.035628,0.011773,-0.010301,0.003964,10.756064,35.528137,-17.802386,back,1
4,8.825958,0.329816,4.230514,0.006858,0.029847,0.023784,-13.026545,-10.009585,30.324697,9.551094,...,-1.935038,9.027972,0.000204,0.00288,0.009668,10.757534,35.522094,-17.784689,back,1


In [None]:
# Shape, then the first five rows, of the combined IPS feature table.
print(concatenated_data_segmented_df_ips.shape)
concatenated_data_segmented_df_ips.head(5)

(117600, 684)


Unnamed: 0,0_mean,1_mean,2_mean,3_mean,4_mean,5_mean,6_mean,7_mean,8_mean,9_mean,...,674_mean,675_mean,676_mean,677_mean,678_mean,679_mean,680_mean,681_mean,category,person_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.606667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.12,0.0,0.0,back,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.46,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.12,0.0,0.0,back,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.46,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.12,0.0,0.0,back,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.68,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.12,0.0,0.0,back,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.606667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.12,0.0,0.0,back,1


In [None]:
# Shape, then the first five rows, of the combined MoCap feature table.
print(concatenated_data_segmented_df_mocap.shape)
concatenated_data_segmented_df_mocap.head(5)

(113448, 155)


Unnamed: 0,C7_mean_x,C7_mean_y,C7_mean_z,RA_mean_x,RA_mean_y,RA_mean_z,LA_mean_x,LA_mean_y,LA_mean_z,T8_mean_x,...,L_LM_velocity_y,L_LM_velocity_z,L_CAL_velocity_x,L_CAL_velocity_y,L_CAL_velocity_z,L_MH1_velocity_x,L_MH1_velocity_y,L_MH1_velocity_z,category,person_id
0,0.182445,0.023007,1.402115,0.295875,-0.130377,1.394484,0.270495,0.187968,1.399205,0.149561,...,-0.000554,-0.00043,-1.1e-05,1.492492e-06,-3e-06,-1.646624e-05,-5.342223e-06,-9e-06,back,1
1,0.182231,0.023087,1.402198,0.296074,-0.130325,1.394413,0.270739,0.188166,1.399469,0.149692,...,-0.002312,-0.001826,-1.2e-05,-8.534617e-07,-2e-06,-8.245567e-06,2.5736e-06,-1e-06,back,1
2,0.182379,0.022964,1.40223,0.29621,-0.130333,1.39438,0.270884,0.188148,1.399508,0.149777,...,-0.000166,-0.000142,-3e-06,1.329413e-05,-4e-06,-1.418989e-06,-2.591971e-06,1e-06,back,1
3,0.182557,0.022945,1.40229,0.296361,-0.130337,1.394363,0.271069,0.188148,1.399518,0.14993,...,1e-06,1.5e-05,2e-06,-9.381637e-06,-1e-05,2.83787e-07,-4.829473e-07,-6e-06,back,1
4,0.182692,0.022905,1.402271,0.296523,-0.13031,1.39432,0.27117,0.188165,1.399493,0.150038,...,1.8e-05,2.4e-05,-3e-06,7.311105e-06,1.1e-05,-5.972218e-06,-7.379168e-06,-9e-06,back,1


**As we can see, the data is not homogeneous; we have different shapes across each dataset, which could be problematic for the future training of our unified or multimodal model.**

#### Method 1

Therefore, we will select data across emg, imu, ips, and mocap datasets that has the same shape and the same sequence of categories.

In [None]:
def verify_and_filter_datasets(emg_df, imu_df, ips_df, mocap_df):
    """
    Keep only the persons whose data is consistent across the four datasets.

    A person is consistent when their rows in the emg, imu, ips and mocap
    datasets have the same count and exactly the same sequence of categories.

    Args:
    - emg_df, imu_df, ips_df, mocap_df: DataFrames for emg, imu, ips, and mocap
      datasets; each needs a 'person_id' and a 'category' column.

    Returns:
    - filtered_datasets: Tuple (emg, imu, ips, mocap) of DataFrames restricted
      to the consistent persons.
    - valid_person_ids: List of the consistent person IDs, in ascending order.
    """
    datasets = [emg_df, imu_df, ips_df, mocap_df]

    # Only persons present in all four datasets can be compared.
    person_ids = set(emg_df['person_id']).intersection(
        imu_df['person_id'], ips_df['person_id'], mocap_df['person_id']
    )

    valid_person_ids = []

    # Sorted so the result does not depend on set iteration order.
    for person_id in sorted(person_ids):
        # Category sequence of this person in each dataset, re-indexed from 0
        # so that Series.equals compares positions rather than original labels.
        sequences = [
            df.loc[df['person_id'] == person_id, 'category'].reset_index(drop=True)
            for df in datasets
        ]
        reference = sequences[0]

        is_consistent = all(
            len(sequence) == len(reference) and sequence.equals(reference)
            for sequence in sequences[1:]
        )
        if is_consistent:
            valid_person_ids.append(person_id)

    filtered_datasets = tuple(df[df['person_id'].isin(valid_person_ids)] for df in datasets)

    return filtered_datasets, valid_person_ids


# Method 1: keep only the persons whose four datasets are consistent.
filtered_datasets, valid_person_ids = verify_and_filter_datasets(
    concatenated_data_segmented_df_emg,
    concatenated_data_segmented_df_imu,
    concatenated_data_segmented_df_ips,
    concatenated_data_segmented_df_mocap
)

# Display the results (filtered_datasets is ordered EMG, IMU, IPS, MOCAP)
print("People kept :", valid_person_ids)
print("Filtered EMG shape:", filtered_datasets[0].shape)
print("Filtered IMU shape:", filtered_datasets[1].shape)
print("Filtered IPS shape:", filtered_datasets[2].shape)
print("Filtered MOCAP shape:", filtered_datasets[3].shape)

People kept : [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24]
Filtered EMG shape: (99364, 66)
Filtered IMU shape: (99364, 56)
Filtered IPS shape: (99364, 684)
Filtered MOCAP shape: (99364, 155)


In [None]:
dataset_base_path = ".../Project/dataset/method_1/"
# Create the output folder first: DataFrame.to_csv does not create missing directories.
os.makedirs(dataset_base_path, exist_ok=True)
filtered_datasets[0].to_csv(os.path.join(dataset_base_path, "filtered_datasets_emg.csv"))
filtered_datasets[1].to_csv(os.path.join(dataset_base_path, "filtered_datasets_imu.csv"))
filtered_datasets[2].to_csv(os.path.join(dataset_base_path, "filtered_datasets_ips.csv"))
filtered_datasets[3].to_csv(os.path.join(dataset_base_path, "filtered_datasets_mocap.csv"))

In [None]:
# NOTE(review): both counts are hand-copied from the outputs above (113446 is the
# smallest row count among the four combined tables, 99364 the row count after
# Method 1 filtering). They go stale if the data changes — consider deriving them
# from the DataFrames instead.
minimum_data_to_be_used = 113446
new_used_data = 99364

# Share of rows removed by the filtering, in percent.
percentage_change = ((minimum_data_to_be_used-new_used_data) / minimum_data_to_be_used) * 100

print(f"The percentage of deleted data is: {percentage_change:.2f}%")

The percentage of deleted data is: 12.41%


#### Method 2

**Truncating Excess Data**

If the IPS contains more data over a given period (e.g., due to differing recording durations or uneven sampling rates):

  For example, if the EMG and IMU cover 10 seconds of recording, retain only the first 10 seconds of IPS data.

In [None]:
def align_person_data(emg_df, imu_df, ips_df, mocap_df):
    """
    Align the four datasets so that, for every person, they have the same
    number of rows and the same sequence of categories.

    For each person and each category, every dataset is truncated to the
    smallest number of rows available for that (person, category) pair among
    the four datasets. Truncating per category (rather than cutting the tail
    of the person's data) guarantees that row i carries the same category in
    all four outputs.

    Args:
    - emg_df, imu_df, ips_df, mocap_df: DataFrames for EMG, IMU, IPS, and MOCAP
      datasets; each needs a 'person_id' and a 'category' column.

    Returns:
    - Aligned DataFrames (EMG, IMU, IPS, MOCAP). Rows are ordered by person ID
      and, within a person, grouped by category in the order the categories
      first appear in the EMG data. Only persons present in all four datasets
      are kept; the original row indices are preserved.
    """
    dataset_names = ['EMG', 'IMU', 'IPS', 'MOCAP']
    datasets = dict(zip(dataset_names, [emg_df, imu_df, ips_df, mocap_df]))

    # Only persons present in all four datasets can be aligned.
    person_ids = set(emg_df['person_id']).intersection(
        imu_df['person_id'], ips_df['person_id'], mocap_df['person_id']
    )

    aligned_parts = {name: [] for name in dataset_names}

    # Sorted so the output order does not depend on set iteration order.
    for person_id in sorted(person_ids):
        person_data = {name: df[df['person_id'] == person_id] for name, df in datasets.items()}

        # EMG defines the category order; a category missing from any dataset
        # ends up with a minimum of 0 rows and is therefore dropped everywhere.
        for category in person_data['EMG']['category'].unique():
            category_rows = {
                name: data[data['category'] == category]
                for name, data in person_data.items()
            }
            rows_to_keep = min(len(rows) for rows in category_rows.values())

            for name in dataset_names:
                aligned_parts[name].append(category_rows[name].head(rows_to_keep))

    # pd.concat raises on an empty list; fall back to an empty frame that keeps
    # the columns of the input.
    final_datasets = {
        name: pd.concat(aligned_parts[name]) if aligned_parts[name] else datasets[name].iloc[0:0]
        for name in dataset_names
    }

    return final_datasets['EMG'], final_datasets['IMU'], final_datasets['IPS'], final_datasets['MOCAP']


# Method 2: truncate the four datasets so they line up for every person.
aligned_emg, aligned_imu, aligned_ips, aligned_mocap = align_person_data(
    concatenated_data_segmented_df_emg,
    concatenated_data_segmented_df_imu,
    concatenated_data_segmented_df_ips,
    concatenated_data_segmented_df_mocap
)



# Display the results
print("Aligned EMG shape:", aligned_emg.shape)
print("Aligned IMU shape:", aligned_imu.shape)
print("Aligned IPS shape:", aligned_ips.shape)
print("Aligned MOCAP shape:", aligned_mocap.shape)

Aligned EMG shape: (113446, 66)
Aligned IMU shape: (113446, 56)
Aligned IPS shape: (113446, 684)
Aligned MOCAP shape: (113446, 155)


In [None]:
dataset_base_path = ".../Project/dataset/method_2/" # use the full path if you want to run it
# Create the output folder first: DataFrame.to_csv does not create missing directories.
os.makedirs(dataset_base_path, exist_ok=True)
aligned_emg.to_csv(os.path.join(dataset_base_path, "aligned_datasets_emg.csv"))
aligned_imu.to_csv(os.path.join(dataset_base_path, "aligned_datasets_imu.csv"))
aligned_ips.to_csv(os.path.join(dataset_base_path, "aligned_datasets_ips.csv"))
aligned_mocap.to_csv(os.path.join(dataset_base_path, "aligned_datasets_mocap.csv"))



**PS:** If you want to run the notebook, you will need to replace `dataset_base_path = ".../Project/dataset/method_2/"` with the full path.