In [None]:
#import libraries etc.

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


## Moving Average on 1hz preprocessed physio files, annotations


This code preprocesses and resamples the physiological (train and test) and annotation data files for all scenarios (1 to 4) and their respective folds. The main steps are as follows:

1. A dictionary (`scenario_folds`) is created to store the number of folds for each scenario.

2. The base folder path and output folder path are specified, and the output folder is created if it doesn't exist.

3. The code loops through each scenario (1 to 4) and their respective folds.

4. It sets the paths for the training physiological data (`train_physio_folder`) and annotation data (`train_annotations_folder`). The same is done for the test physiological data.

5. For each annotation file ending with "_resampled_1hz.csv", it loads the data, calculates the rolling mean with a window size of 30 (`sampling_window`), keeps every 30th row of the result (dropping the first, which has no complete window) to obtain 30-second intervals, and saves the resampled data into the output folder.

6. Similarly, for each physiological file ending with "_processed_1hz.csv", it loads the data and keeps only the columns listed in `column_NO_RAW`, which excludes the columns with the word "raw". It then calculates the rolling mean with a window size of 30 (`sampling_window`), keeps every 30th row of the result (dropping the first) to obtain 30-second intervals, and saves the resampled data into the output folder.

In summary, this code preprocesses and resamples the physiological and annotation data for all scenarios and their respective folds. The resampled data is then saved in the specified output folder for further analysis.

In [None]:

# Root folder the scenario data is read from, and the folder the
# window-averaged files are written to by the cells below.
base_folder_path = "/work/abslab/emognition_2023_challenge/data/"
output_folder = "/work/abslab/emognition_2023_challenge/file_prep_30sec"

# Columns selected from every physiology file before averaging.
# The order here is the column order of the selected frame, so do not reorder.
column_NO_RAW = [
    # PPG_* columns
    "PPG_Clean",
    "PPG_Rate",
    "PPG_Peaks",
    # ECG_* columns
    "ECG_Rate",
    "ECG_Quality",
    # EDA_* / SCR_* columns
    "EDA_Clean",
    "EDA_Tonic",
    "EDA_Phasic",
    "SCR_Onsets",
    "SCR_Peaks",
    "SCR_Height",
    "SCR_Amplitude",
    # RSP_* columns (first group)
    "RSP_Amplitude",
    "RSP_Rate",
    # corrugator_EMG_* columns
    "corrugator_EMG_Clean",
    "corrugator_EMG_Amplitude",
    "corrugator_EMG_Activity",
    "corrugator_EMG_Onsets",
    "corrugator_EMG_Offsets",
    # trapezius_EMG_* columns
    "trapezius_EMG_Clean",
    "trapezius_EMG_Amplitude",
    "trapezius_EMG_Activity",
    "trapezius_EMG_Onsets",
    "trapezius_EMG_Offsets",
    # zygomaticus_EMG_* columns
    "zygomaticus_EMG_Clean",
    "zygomaticus_EMG_Amplitude",
    "zygomaticus_EMG_Activity",
    "zygomaticus_EMG_Onsets",
    "zygomaticus_EMG_Offsets",
    # RSP_* columns (second group)
    "RSP_RVT",
    "RSP_Phase",
    "RSP_Phase_Completion",
    "RSP_Symmetry_PeakTrough",
    "RSP_Symmetry_RiseDecay",
    # Remaining columns
    "skt",
    "sub_num",
    "vid_num",
]



In [None]:
# #start with one file (fold_0, scenario2)

# first_folder = os.path.join(base_folder_path, 'scenario_2/','fold_0/','train')

# print(first_folder)

In [None]:

# Window-average the 1 Hz *training* annotation and physiology files of every
# scenario/fold and write the results under `output_folder`, mirroring the
# input folder layout.

# Number of folds available for each scenario.
scenario_folds = {
    1: 1,  # 1 fold for scenario 1
    2: 5,  # 5 folds for scenario 2
    3: 4,  # 4 folds for scenario 3
    4: 2,  # 2 folds for scenario 4
}

base_folder_path = "/work/abslab/emognition_2023_challenge/data/"
output_folder = "/work/abslab/emognition_2023_challenge/file_prep_30sec"

# Moving-average window, in rows. The inputs are the "*_1hz" files, so this is
# presumably 30 seconds -- TODO confirm against the upstream preprocessing.
sampling_window = 30

# One entry per kind of training file:
#   (sub-folder, input suffix, output suffix, columns to keep or None for all)
# Annotation files keep their name; physiology files are renamed from
# "*_processed_1hz.csv" to "*_resampled_1hz.csv" and restricted to
# `column_NO_RAW` (defined in the configuration cell).
train_jobs = [
    ("annotations", "_resampled_1hz.csv", "_resampled_1hz.csv", None),
    ("physiology", "_processed_1hz.csv", "_resampled_1hz.csv", column_NO_RAW),
]

os.makedirs(output_folder, exist_ok=True)

for scenario_num, n_folds in scenario_folds.items():
    scenario_folder = f"scenario_{scenario_num}"

    for fold_num in range(n_folds):
        # Single-fold scenarios keep "train" directly under the scenario
        # folder; multi-fold scenarios add a "fold_<k>" level in between.
        fold_parts = [] if n_folds == 1 else [f"fold_{fold_num}"]

        # Progress output: the annotations folder about to be processed.
        print(os.path.join(base_folder_path, scenario_folder, *fold_parts, "train", "annotations"))

        for subfolder, input_suffix, output_suffix, keep_columns in train_jobs:
            input_dir = os.path.join(base_folder_path, scenario_folder, *fold_parts, "train", subfolder)
            output_dir = os.path.join(output_folder, scenario_folder, *fold_parts, "train", subfolder)

            # Sorted only to make the processing order deterministic.
            file_names = sorted(os.listdir(input_dir))

            # DataFrame.to_csv does not create missing parent directories, so
            # make sure the mirrored output folder exists before writing.
            os.makedirs(output_dir, exist_ok=True)

            for file in file_names:
                if not file.endswith(input_suffix):
                    continue

                data_1hz = pd.read_csv(os.path.join(input_dir, file), index_col=0)
                if keep_columns is not None:
                    data_1hz = data_1hz[keep_columns]

                # Rolling mean over `sampling_window` rows, then keep every
                # `sampling_window`-th row so the kept windows do not overlap.
                # The first kept row (position 0) has no complete window and
                # is NaN, so it is dropped.
                # NOTE(review): the kept rows sit at positions 30, 60, ... and
                # average rows 1-30, 31-60, ...; source row 0 is never part of
                # any window. Left unchanged to keep outputs identical.
                data_resampled = (
                    data_1hz.rolling(sampling_window).mean()
                    .iloc[::sampling_window, :]
                    .iloc[1:, :]
                )

                # Strip the suffix that was actually matched, then append the
                # output suffix.
                output_file_name = file[:-len(input_suffix)] + output_suffix
                data_resampled.to_csv(os.path.join(output_dir, output_file_name))
      

In [None]:
## do the same for z scored files

In [None]:

# Window-average the z-scored 1 Hz *training* physiology files of every
# scenario/fold and write the results under `output_folder`, mirroring the
# input folder layout.

# Number of folds available for each scenario.
scenario_folds = {
    1: 1,  # 1 fold for scenario 1
    2: 5,  # 5 folds for scenario 2
    3: 4,  # 4 folds for scenario 3
    4: 2,  # 2 folds for scenario 4
}

base_folder_path = "/work/abslab/emognition_2023_challenge/data/"
output_folder = "/work/abslab/emognition_2023_challenge/file_prep_30sec"

# Moving-average window, in rows. The inputs are the "*_1hz" files, so this is
# presumably 30 seconds -- TODO confirm against the upstream preprocessing.
sampling_window = 30

# Files to pick up and the suffix they are saved with.
input_suffix = "_processed_1hz_zscored.csv"
output_suffix = "_resampled_1hz_zscored.csv"

os.makedirs(output_folder, exist_ok=True)

for scenario_num, n_folds in scenario_folds.items():
    scenario_folder = f"scenario_{scenario_num}"

    for fold_num in range(n_folds):
        # Single-fold scenarios keep "train" directly under the scenario
        # folder; multi-fold scenarios add a "fold_<k>" level in between.
        fold_parts = [] if n_folds == 1 else [f"fold_{fold_num}"]

        input_dir = os.path.join(base_folder_path, scenario_folder, *fold_parts, "train", "physiology")
        output_dir = os.path.join(output_folder, scenario_folder, *fold_parts, "train", "physiology")

        # Sorted only to make the processing order deterministic.
        file_names = sorted(os.listdir(input_dir))

        # DataFrame.to_csv does not create missing parent directories, so make
        # sure the mirrored output folder exists before writing.
        os.makedirs(output_dir, exist_ok=True)

        for file in file_names:
            if not file.endswith(input_suffix):
                continue

            # Load the file and keep only the columns in `column_NO_RAW`
            # (defined in the configuration cell).
            physio_zscored = pd.read_csv(os.path.join(input_dir, file), index_col=0)
            physio_zscored = physio_zscored[column_NO_RAW]

            # Rolling mean over `sampling_window` rows, then keep every
            # `sampling_window`-th row so the kept windows do not overlap.
            # The first kept row (position 0) has no complete window and is
            # NaN, so it is dropped.
            # NOTE(review): the kept rows sit at positions 30, 60, ... and
            # average rows 1-30, 31-60, ...; source row 0 is never part of
            # any window. Left unchanged to keep outputs identical.
            physio_resampled = (
                physio_zscored.rolling(sampling_window).mean()
                .iloc[::sampling_window, :]
                .iloc[1:, :]
            )

            output_file_name = file[:-len(input_suffix)] + output_suffix
            physio_resampled.to_csv(os.path.join(output_dir, output_file_name))
      

In [None]:
#TEST DATA

In [None]:

# Window-average the 1 Hz *test* physiology files of every scenario/fold and
# write the results under `output_folder`, mirroring the input folder layout.

# Number of folds available for each scenario.
scenario_folds = {
    1: 1,  # 1 fold for scenario 1
    2: 5,  # 5 folds for scenario 2
    3: 4,  # 4 folds for scenario 3
    4: 2,  # 2 folds for scenario 4
}

base_folder_path = "/work/abslab/emognition_2023_challenge/data/"
output_folder = "/work/abslab/emognition_2023_challenge/file_prep_30sec"

# Moving-average window, in rows. The inputs are the "*_1hz" files, so this is
# presumably 30 seconds -- TODO confirm against the upstream preprocessing.
sampling_window = 30

# Files to pick up and the suffix they are saved with.
input_suffix = "_processed_1hz.csv"
output_suffix = "_resampled_1hz.csv"

os.makedirs(output_folder, exist_ok=True)

for scenario_num, n_folds in scenario_folds.items():
    scenario_folder = f"scenario_{scenario_num}"

    for fold_num in range(n_folds):
        # Single-fold scenarios keep "test" directly under the scenario
        # folder; multi-fold scenarios add a "fold_<k>" level in between.
        fold_parts = [] if n_folds == 1 else [f"fold_{fold_num}"]

        input_dir = os.path.join(base_folder_path, scenario_folder, *fold_parts, "test", "physiology")
        output_dir = os.path.join(output_folder, scenario_folder, *fold_parts, "test", "physiology")

        # Sorted only to make the processing order deterministic.
        file_names = sorted(os.listdir(input_dir))

        # DataFrame.to_csv does not create missing parent directories, so make
        # sure the mirrored output folder exists before writing.
        os.makedirs(output_dir, exist_ok=True)

        for file in file_names:
            if not file.endswith(input_suffix):
                continue

            # Load the file and keep only the columns in `column_NO_RAW`
            # (defined in the configuration cell).
            physio_test = pd.read_csv(os.path.join(input_dir, file), index_col=0)
            physio_test = physio_test[column_NO_RAW]

            # Rolling mean over `sampling_window` rows, then keep every
            # `sampling_window`-th row so the kept windows do not overlap.
            # The first kept row (position 0) has no complete window and is
            # NaN, so it is dropped.
            # NOTE(review): the kept rows sit at positions 30, 60, ... and
            # average rows 1-30, 31-60, ...; source row 0 is never part of
            # any window. Left unchanged to keep outputs identical.
            physio_test_resampled = (
                physio_test.rolling(sampling_window).mean()
                .iloc[::sampling_window, :]
                .iloc[1:, :]
            )

            output_file_name = file[:-len(input_suffix)] + output_suffix
            physio_test_resampled.to_csv(os.path.join(output_dir, output_file_name))