RAW (MOABB) to CSV

This code converts data sets from RAW format to CSV format using MOABB.

It was designed specifically for BCI data.

This script is for AlexMI

In [None]:
import numpy as np
import pandas as pd
from moabb import datasets

# Import decimate 
import sys
import os
sys.path.append(os.path.abspath('..'))
from ConvTools import decimate

In [None]:
# Load the database: instantiate the MOABB AlexMI dataset and fetch its recordings.
# The cells below index the result as m_data[subject][session][run].
m_dataset = datasets.AlexMI()
m_data = m_dataset.get_data()

In [None]:
# See all channel names (EEG, misc, stim...)
# The recording of subject 1 (session '0', run '0') is used as the reference
# recording for the exploratory cells that follow.
raw = m_data[1]['0']['0']
print("Canal list :", raw.ch_names)

In [None]:
# Find the position of the stim channel in the channel list.
# stim_name and stim_idx are reused by the later cells (decimation, stim
# extraction and replacement of the stim column in the exported table).
stim_name = 'Stim'
stim_idx = raw.ch_names.index(stim_name)
print(f"Canal index {stim_name} is : {stim_idx}")

In [None]:
# Downsampling with the project helper `decimate` imported from ConvTools.
# 512 / 2 = 256 Hz, which is the rate the trial-length computation relies on.
# NOTE(review): sfreq is hard-coded rather than read from the recording -
# confirm it matches the actual sampling rate of `raw`.
sfreq = 512
decimation_factor = 2 
raw_decimated = decimate(raw, sfreq, decimation_factor, stim_name)

In [None]:
# Inspect the stim channel of the decimated recording: print its shape, then
# how many samples carry each distinct value (the values depend on the database).
stim_data = raw_decimated.get_data(picks=stim_idx)
print(stim_data.shape)
value_counts = np.unique(stim_data, return_counts=True)
unique_vals, counts = value_counts
for position, val in enumerate(unique_vals):
    count = counts[position]
    print(f"Value : {val}, Occurences count : {count}")

In [None]:
# Turn the stim samples into a private, writable 1D vector, then erase the
# value 1: it only announced the beginning of a trial and is not a class label.
stim_data = np.asarray(stim_data).reshape(-1).copy()
np.putmask(stim_data, stim_data == 1, 0)

In [None]:
# Trial duration: 3 s * 256 Hz = 768 samples.
# Keep a single marker per trial: the first non-zero sample is preserved and
# every other non-zero sample among the next 767 is cleared.
trial_length = 768

# Positions of the non-zero samples, in increasing order (snapshot taken
# before any sample is cleared).
marker_positions = np.flatnonzero(stim_data)
window_end = 0  # first index that no longer belongs to the current trial
for position in marker_positions:
    if position < window_end:
        # Still inside the trial opened by an earlier marker: clear it.
        stim_data[position] = 0
    else:
        # First marker after the previous trial: keep it and open a new window.
        window_end = position + trial_length

In [None]:
# Quick check: print the unique values of the stim channel after the
# transformation, with the number of samples carrying each value.
unique_vals, counts = np.unique(stim_data, return_counts=True)
for val, count in zip(unique_vals, counts):
    # The label was mojibake ("AprÃ¨s": UTF-8 bytes decoded as Latin-1);
    # the intended accented word is restored.
    print(f"(Après) Valeur : {val}, Occurrences : {count}")

In [None]:
# Fetch all channels of the decimated recording and transpose the array so
# that rows are samples and columns are channels (the next cell unpacks
# dataT.shape as n_times, n_channels).
# The stim channel in `data` is still the unprocessed one; it is replaced by
# the cleaned stim_data when the table is built.
data = raw_decimated.get_data()
dataT = data.T

In [None]:
# Build the table to export: one row per sample, a leading sample-counter
# column with an empty header, then one column per channel named by its index.
n_times, n_channels = dataT.shape
timestamps = np.arange(n_times, dtype=int)
header = ["", *map(str, range(n_channels))]
data_with_timestamp = np.hstack((timestamps.reshape(-1, 1), dataT))

# Stacking next to float data turns the counter into floats; cast that
# column back to int so it is written without decimals.
df = pd.DataFrame(data_with_timestamp, columns=header).astype({"": int})

# Replace the stim column with the cleaned markers (+1 skips the timestamp column).
df.iloc[:, stim_idx + 1] = stim_data

In [None]:
# Test export: write the single-recording table to data.csv (relative to the
# current working directory), without the pandas index, to inspect the format.
df.to_csv("data.csv", index=False)

In [None]:
# Export every subject: one CSV file per subject, with a cleaned stim channel.
# Requires `m_data` (loaded above) and `decimate` (imported from ConvTools).
subject_list = list(m_data)

# Name of the stim channel; its index is looked up again for each subject
# instead of being reused from the exploratory cells.
stim_name = 'Stim'

# downsampling parameters
sfreq = 512
decimation_factor = 2

# Trial duration in samples after decimation (3 s * 256 Hz = 768)
trial_duration_s = 3
trial_length = trial_duration_s * sfreq // decimation_factor


def _clean_stim_channel(stim, trial_length):
    """Return a cleaned 1D copy of a stim channel.

    Samples equal to 1 (announcement of the beginning of a trial) are set to
    0. Then only the first non-zero sample of each trial is kept: the
    ``trial_length - 1`` samples that follow it are set to 0.

    Parameters
    ----------
    stim : array-like
        Stim channel samples; any shape, flattened to 1D.
    trial_length : int
        Trial duration in samples.

    Returns
    -------
    numpy.ndarray
        New 1D array; the input is not modified.
    """
    cleaned = np.array(stim, copy=True).flatten()
    cleaned[cleaned == 1] = 0

    n_samples = len(cleaned)
    i = 0
    while i < n_samples:
        if cleaned[i] == 0:
            i += 1
            continue
        # Trial onset found: keep the value at i, clear the rest of the trial
        # and resume the scan right after it.
        end = min(i + trial_length, n_samples)
        cleaned[i + 1:end] = 0
        i = end
    return cleaned


# Loop through all subjects
for subject in subject_list:
    # Load the Raw recording for the subject (session '0', run '0')
    raw = m_data[subject]['0']['0']
    stim_idx = raw.ch_names.index(stim_name)

    # downsampling
    raw_decimated = decimate(raw, sfreq, decimation_factor, stim_name)

    # Process the stimulation channel for this subject
    stim_data = _clean_stim_channel(
        raw_decimated.get_data(picks=stim_idx),  # shape: (1, n_times)
        trial_length,
    )

    # Extract all channels, one row per sample
    dataT = raw_decimated.get_data().T    # shape: (n_times, n_channels)
    n_times, n_channels = dataT.shape

    # Create the timestamps column (sample counter) and the header
    timestamps = np.arange(n_times, dtype=int)
    datacsv = np.column_stack((timestamps, dataT))
    header = [""] + [str(i) for i in range(n_channels)]
    df = pd.DataFrame(datacsv, columns=header)
    df[""] = df[""].astype(int)  # stacking made the counter float; restore int

    # Integrate the cleaned stimulation channel into the DataFrame.
    # Its column is at stim_idx + 1 (after the timestamp column).
    df.iloc[:, stim_idx + 1] = stim_data

    # Construct the filename
    subject_str = f"{int(subject):02d}"  # format the subject number to 2 digits
    session_str = f"{1:02d}"             # session fixed to 1 (single session '0' / run '0')
    filename = f"subject_{subject_str}_session_{session_str}.csv"

    # Export the DataFrame to CSV
    df.to_csv(filename, index=False)
    print(f"Saved file : {filename}")

    # display info: number of kept markers per class
    events = df.iloc[:, stim_idx + 1]
    n_rh = int((events == 2).sum())
    n_f = int((events == 3).sum())
    rest = int((events == 4).sum())
    print(f"Number of Right hand (2): {n_rh}")
    print(f"Number of feet (3): {n_f}")
    print(f"Number of rest (4): {rest}")