In [None]:
"""
This script will fetch all csv files from the specified directory and preprocess them.   
The preprocessing stage will include:   
    - Loop through all the csv files for all subjects and trials.
    - Remove unlabeled data (start and end of the experiment) 
    - Reformatting the labels to be sin and cos of the phase variable.   
    - Prefilter the data to remove noise (low-pass Butterworth filter, causal or zero-phase)
    - Splitting the data into windows (by window size and overlap).   
    - Normalizing the input data    
    - Remove unnecessary columns (e.g. timestamp, foot and trunk imu data)   
    - Saving the preprocessed data into a new data npy file based on the inputs provided for the preprocessing stage    
"""

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.signal import butter, lfilter, filtfilt
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Scenario parameter settings
is_filter = True            # Whether to low-pass filter the IMU data before splitting it into windows
filter_type = "causal"      # "causal" or "non-causal"; choose "causal" for real-time (RT) applications
cutoff = 25                 # Cutoff frequency for the filter (Hz)
is_normalize = True         # Whether to normalize the input data after filtering (before windowing)
window_size = 400           # Number of samples per window
overlap = 200               # Number of samples shared by two consecutive windows


In [70]:
def sliding_window_with_label(imu_data: pd.DataFrame, gc_data: pd.DataFrame, window_size=400, overlap=200):
    """
    Cut IMU data into fixed-length windows and label each window with the
    gait-cycle row that lines up with the window's last sample.

    Rows are matched by position, so row i of ``gc_data`` must describe the
    same instant as row i of ``imu_data``.

    Parameters:
    - imu_data: DataFrame of shape (num_samples, num_channels)
    - gc_data: DataFrame of shape (num_samples, num_labels)
    - window_size: Number of samples per window (must be positive)
    - overlap: Number of samples shared by consecutive windows
      (must be smaller than window_size)

    Returns:
    - X_windows: NumPy array of shape (num_windows, num_channels, window_size)
    - y_labels: NumPy array of shape (num_windows, num_labels)
      If the recording is shorter than one window, both arrays are returned
      with num_windows == 0 but with their trailing dimensions intact, so
      they can still be concatenated with results from other recordings.

    Raises:
    - ValueError: if window_size is not positive or overlap >= window_size
    - IndexError: if gc_data has too few rows to label the last window
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    step_size = window_size - overlap
    if step_size <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window_size ({window_size})"
        )

    # Convert once; slicing plain arrays is much cheaper than repeated .iloc calls.
    imu_values = imu_data.to_numpy()
    gc_values = gc_data.to_numpy()

    starts = range(0, imu_values.shape[0] - window_size + 1, step_size)
    if len(starts) == 0:
        # np.array([]) would have shape (0,), which cannot be concatenated
        # with 3-D window arrays; keep the trailing dimensions instead.
        return (
            np.empty((0, imu_values.shape[1], window_size), dtype=imu_values.dtype),
            np.empty((0, gc_values.shape[1]), dtype=gc_values.dtype),
        )

    # Each window is transposed to (num_channels, window_size).
    X_windows = np.stack([imu_values[start:start + window_size].T for start in starts])
    # The label is the gait-cycle row at the last sample of the window.
    y_labels = np.stack([gc_values[start + window_size - 1] for start in starts])

    return X_windows, y_labels

def apply_filter(data, filter_type, cutoff=25, fs=200, order=4):
    """
    Low-pass filter ``data`` along its first axis with a digital Butterworth filter.

    Parameters:
    - data: array-like of samples; filtering runs along axis 0
    - filter_type: "causal" selects the single-pass ``lfilter``; any other
      value selects the forward-backward ``filtfilt``
    - cutoff: cutoff frequency, in the same units as ``fs``
    - fs: sampling frequency
    - order: order of the Butterworth filter

    Returns:
    - The filtered samples as returned by ``lfilter`` / ``filtfilt``
    """
    nyquist = fs / 2
    # butter() expects the cutoff normalized by the Nyquist frequency here.
    numerator, denominator = butter(order, cutoff / nyquist, btype='low', analog=False)

    if filter_type == "causal":
        return lfilter(numerator, denominator, data, axis=0)
    return filtfilt(numerator, denominator, data, axis=0)

def plot_filtered_imu(original_df, filtered_df, channel_idx=0):
    """
    Overlay one IMU channel before and after filtering.

    Only rows 1 through 499 (positional) of the selected column are drawn.

    Parameters:
    - original_df: DataFrame holding the unfiltered IMU channels
    - filtered_df: DataFrame holding the filtered IMU channels
    - channel_idx: positional index of the column to plot
    """
    preview_rows = slice(1, 500)
    raw_trace = original_df.iloc[preview_rows, channel_idx]
    smooth_trace = filtered_df.iloc[preview_rows, channel_idx]

    plt.plot(raw_trace, label="Original", alpha=0.6)
    plt.plot(smooth_trace, label="Filtered", linestyle="--")
    plt.legend()
    plt.xlabel("Samples")
    plt.ylabel("Amplitude")

    channel_name = original_df.columns[channel_idx]
    plt.title(f"IMU Channel {channel_name} Before & After Filtering")
    plt.show()


In [None]:
DATA_PATH = r'C:\Users\Elad\vscode Projects\Technion\LBIS_project\dataset'

# IMU channels that are dropped before windowing (foot and trunk sensors).
UNUSED_IMU_COLUMNS = [
    'foot_Accel_X', 'foot_Accel_Y', 'foot_Accel_Z',
    'foot_Gyro_X', 'foot_Gyro_Y', 'foot_Gyro_Z',
    'trunk_Accel_X', 'trunk_Accel_Y', 'trunk_Accel_Z',
    'trunk_Gyro_X', 'trunk_Gyro_Y', 'trunk_Gyro_Z',
]

# Collect the windows of every file in lists and concatenate once at the end.
# Concatenating onto a growing array inside the loop re-copies all previously
# collected data on every iteration.
X_chunks, y_chunks = [], []

# access all subject folders (sorted, because os.listdir returns entries in arbitrary order)
for subject in sorted(os.listdir(DATA_PATH)):
    imu_dir = os.path.join(DATA_PATH, subject, 'treadmill', 'imu')
    gc_dir = os.path.join(DATA_PATH, subject, 'treadmill', 'gcRight')
    if not os.path.isdir(imu_dir):
        # stray file in DATA_PATH, or a subject without treadmill IMU recordings
        continue

    subject_windows = 0

    # access all csv files in the treadmill folder of the subject
    for file in sorted(os.listdir(imu_dir)):
        if not file.endswith('.csv'):
            continue

        # the gait-cycle file shares its name with the IMU file
        imu_df = pd.read_csv(os.path.join(imu_dir, file))
        gc_df = pd.read_csv(os.path.join(gc_dir, file))

        # remove unnecessary columns
        gc_df = gc_df.drop(columns=["ToeOff"])
        imu_df = imu_df.drop(columns=UNUSED_IMU_COLUMNS)

        # Keep only the labeled span: from the first sample with HeelStrike > 0
        # through the last sample with HeelStrike == 100.
        labeled_idx = gc_df.index[gc_df["HeelStrike"].gt(0)]
        cycle_end_idx = gc_df.index[gc_df["HeelStrike"] == 100]
        if len(labeled_idx) == 0 or len(cycle_end_idx) == 0:
            print(f"Skipping {subject}/{file}: no complete labeled gait cycle found")
            continue
        gc_df = gc_df.loc[labeled_idx.min():cycle_end_idx.max()]

        # Keep only samples present in both files so that, after the index reset
        # below, row i of the IMU data and row i of the labels share a Header.
        imu_df = imu_df[imu_df["Header"].isin(gc_df["Header"])]
        gc_df = gc_df[gc_df["Header"].isin(imu_df["Header"])]
        if len(imu_df) != len(gc_df):
            print(f"Skipping {subject}/{file}: IMU and gait-cycle rows cannot be aligned "
                  f"({len(imu_df)} vs {len(gc_df)} rows)")
            continue

        # Encode the HeelStrike value (0-100) as cosine and sine of the phase angle,
        # then remove the header and raw phase columns. Building new frames
        # (instead of assigning into a slice) avoids chained-assignment warnings.
        phase = gc_df['HeelStrike'] * 2 * np.pi / 100
        gc_df = (
            gc_df.assign(cos_gait_phase=np.cos(phase), sin_gait_phase=np.sin(phase))
            .drop(columns=["Header", "HeelStrike"])
            .reset_index(drop=True)
        )
        imu_df = imu_df.drop(columns=['Header']).reset_index(drop=True)

        # Apply a filter to the IMU data (causal = with phase lag, non-causal = zero phase lag).
        # filter_type and cutoff come from the scenario parameter settings cell.
        if is_filter:
            filtered_df = pd.DataFrame(
                apply_filter(imu_df.values, filter_type=filter_type, cutoff=cutoff, order=4),
                columns=imu_df.columns,
            )
        else:
            filtered_df = imu_df

        # Normalize each channel to zero mean / unit variance over this file.
        if is_normalize:
            # A constant channel has std == 0; divide by 1 instead so it becomes 0, not NaN.
            channel_std = filtered_df.std().replace(0, 1)
            filtered_df = (filtered_df - filtered_df.mean()) / channel_std

        # Split the data into windows (by window size and overlap)
        X_windows, y_labels = sliding_window_with_label(filtered_df, gc_df, window_size=window_size, overlap=overlap)
        if len(X_windows) == 0:
            # recording shorter than a single window: nothing to add
            continue

        X_chunks.append(X_windows)
        y_chunks.append(y_labels)
        subject_windows += len(X_windows)

    print(f"{subject}: {subject_windows} windows")

# create a multidimensional array to store the data in one single file
X_data = np.concatenate(X_chunks, axis=0) if X_chunks else np.empty((0, 12, window_size))
y_data = np.concatenate(y_chunks, axis=0) if y_chunks else np.empty((0, 2))

print(f"The shape of X_data is: {X_data.shape}")
print(f"The shape of y_data is: {y_data.shape}")

# plot_filtered_imu(imu_df, filtered_df, channel_idx=0)  # Change channel_idx to plot different channels
# print(gc_df.head())
# print(imu_df.head())





# # Get the IMU CSV file path.
#         imu_path = self.imu_files[idx]
#         # Derive the corresponding gcRight CSV file path by replacing 'imu' with 'gcRight'
#         gcRight_path = imu_path.replace(os.sep + 'imu' + os.sep, os.sep + 'gcRight' + os.sep)
        
#         # Load CSV files (skip the header row)
#         imu_data = self._load_csv_file(imu_path)
#         gcRight_data = self._load_csv_file(gcRight_path)
        
#         # Drop the timestamp column (first column)
#         imu_data = imu_data[:, 1:]
#         gcRight_data = gcRight_data[:, 1:]
        
#         # Select only shank and thigh channels from IMU data.
#         # CSV column order (after dropping timestamp) is:
#         # [foot_Accel_X, foot_Accel_Y, foot_Accel_Z,
#         #  foot_Gyro_X, foot_Gyro_Y, foot_Gyro_Z,
#         #  shank_Accel_X, shank_Accel_Y, shank_Accel_Z,
#         #  shank_Gyro_X, shank_Gyro_Y, shank_Gyro_Z,
#         #  thigh_Accel_X, thigh_Accel_Y, thigh_Accel_Z,
#         #  thigh_Gyro_X, thigh_Gyro_Y, thigh_Gyro_Z,
#         #  trunk_Accel_X, trunk_Accel_Y, trunk_Accel_Z,
#         #  trunk_Gyro_X, trunk_Gyro_Y, trunk_Gyro_Z]
#         # We keep shank (columns 6 to 11) and thigh (columns 12 to 17)
#         shank = imu_data[:, 6:12]
#         thigh = imu_data[:, 12:18]
#         imu_selected = np.concatenate([shank, thigh], axis=1)  # Shape: (N, 12)
        
#         # Synchronize lengths: truncate all signals to the minimum available length.
#         min_length = min(imu_selected.shape[0], gcRight_data.shape[0])
#         imu_selected = imu_selected[:min_length, :]
#         gcRight_data = gcRight_data[:min_length, :]
        
#         # Randomly extract a window of fixed length.
#         if min_length > self.sequence_length:
#             start_idx = random.randint(0, min_length - self.sequence_length)
#         else:
#             start_idx = 0  # Alternatively, pad shorter sequences.
#         end_idx = start_idx + self.sequence_length
#         imu_window = imu_selected[start_idx:end_idx, :]  # (sequence_length, 12)
        
#         # Use the HeelStrike value from gcRight at the center of the window.
#         center_idx = start_idx + self.sequence_length // 2
#         heel_strike = gcRight_data[center_idx, 0]  # HeelStrike value (0-100)
#         # Normalize to [0, 1]
#         heel_strike_norm = heel_strike / 100.0
#         target = np.array([heel_strike_norm], dtype=np.float32)
        
#         # Optionally apply a transform; otherwise, convert to torch tensors.
#         if self.transform:
#             imu_window = self.transform(imu_window)
#         else:
#             imu_window = torch.tensor(imu_window, dtype=torch.float32)
#         target = torch.tensor(target, dtype=torch.float32)
        
#         return imu_window, target

#     def _load_csv_file(self, file_path):
#         """Loads a CSV file using NumPy (skipping the header row)."""
#         data = np.loadtxt(file_path, delimiter=',', skiprows=1)
#         return data


In [None]:
# HARDCODED PARAMETERS
BASE_SAMPLING_RATE = 200    # Sampling rate in Hz

# INPUT PARAMETERS
# NOTE(review): none of these names are referenced by the cells shown above, which use
# is_normalize / window_size / overlap expressed in samples; 2 s and 1 s at 200 Hz
# correspond to the 400 and 200 samples configured there. Confirm which set is authoritative.
NORMALIZE_FLAG = True       # Decision variable: normalize the data
WINDOW_SIZE = 2             # Window length in seconds
WINDOW_OVERLAP = 1          # Overlap between windows in seconds