RAW (MOABB) to CSV

This code converts the datasets from RAW format to CSV format using MOABB.

It has been specifically conceived for BCI data.

This script is for BNCI2014002-Train

In [None]:
import numpy as np
import pandas as pd
from moabb import datasets

# Import decimate 
import sys
import os
sys.path.append(os.path.abspath('..'))
from ConvTools import decimate

In [None]:
# Load the BNCI2014002 dataset through MOABB.
m_dataset = datasets.BNCI2014002()
# get_data() is called without arguments; the later cells index the result
# as m_data[subject][session][run].
m_data = m_dataset.get_data()

In [None]:
# List every channel name (EEG, misc, stim...) of one reference recording:
# subject 1, session '0', run '0train'.
raw = m_data[1]['0']['0train']
print("Canal list :", raw.ch_names)

In [None]:
# Find the position of the stimulation (event marker) channel by its name.
# stim_name is reused by the decimation cells further down.
stim_name = 'stim'
stim_idx = raw.ch_names.index(stim_name)
print(f"Canal index {stim_name} is : {stim_idx}")

In [None]:
# Tally how many samples carry each distinct value of the stim channel
# (which values appear depends on the dataset).
stim_data = raw.get_data(picks=stim_idx)
print(stim_data.shape)
unique_vals, counts = np.unique(stim_data, return_counts=True)
for marker_value, n_samples in zip(unique_vals, counts):
    print(f"Value : {marker_value}, Occurences count : {n_samples}")

In [None]:
# Downsampling settings handed to decimate()
sfreq = 512
decimation_factor = 2

# Work on subject 1, session '0', and keep only the runs whose key
# contains 'train'.
session_runs = m_data[1]['0']
run_keys = sorted(key for key in session_runs if 'train' in key)
print("Training runs found for subject 1, session 1:", run_keys)

# Decimate every training run and collect its samples,
# one array of shape (n_channels, n_times_run) per run.
all_runs_data = [
    decimate(session_runs[run], sfreq, decimation_factor, stim_name).get_data()
    for run in run_keys
]

# Join the runs end to end along the time axis (axis=1),
# giving shape (n_channels, total_timesamples).
concatenated_data = np.concatenate(all_runs_data, axis=1)

In [None]:
# Transpose to get dataT of shape (total_timesamples, n_channels)
dataT = concatenated_data.T
print("Shape of dataT before dropping rows:", dataT.shape)

# Remove the time samples (rows) in which every channel is exactly 0.
# Each element is tested instead of the row sum: a row whose non-zero
# values cancel out also sums to 0 and would be discarded by mistake.
nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
dataT = dataT[nonzero_indices, :]

print("Shape of dataT after dropping rows:", dataT.shape)

In [None]:
# Label standardization on the stim channel (last column of dataT):
# marker 1 becomes 2 and marker 2 becomes 3.
# Both masks are taken before anything is written, so the samples that
# are recoded to 2 cannot be confused with the original 2s.
markers = dataT[:, -1]
was_one = markers == 1
was_two = markers == 2
dataT[was_two, -1] = 3
dataT[was_one, -1] = 2

In [None]:
# The stim channel is the last column of dataT
stim_col = dataT[:, -1]

# Distinct marker values together with the number of samples holding each
unique_vals, counts = np.unique(stim_col, return_counts=True)

# Report one line per marker value
for marker_value, n_samples in zip(unique_vals, counts):
    print(f"Value : {marker_value}, Occurrence count : {n_samples}")

In [None]:
# Column names: an empty name for the timestamp column, then one
# zero-based index per data column.
n_times, n_channels = dataT.shape
header = [""] + list(map(str, range(n_channels)))

# Prepend a running sample counter as the first column
timestamps = np.arange(n_times, dtype=int)
data_with_timestamp = np.column_stack((timestamps, dataT))

# Stacking turns the counter into floats; cast that column back to
# integers so it is written without decimals.
df = pd.DataFrame(data_with_timestamp, columns=header).astype({"": int})

In [None]:
# Write the single-subject table to data.csv (without the DataFrame index)
# so the resulting file can be checked.
df.to_csv("data.csv", index=False)

In [None]:
# Convert every subject: one CSV file per subject, built from the training
# runs of session '0'.
# Relies on m_data, stim_name and decimate defined in the earlier cells.
subject_list = list(m_data.keys())

# Downsampling settings handed to decimate()
sfreq = 512
decimation_factor = 2

for subject in subject_list:

    # Keep only the runs of session '0' whose key contains 'train'
    session_runs = m_data[subject]['0']
    run_keys = sorted(key for key in session_runs if 'train' in key)

    # Decimate each run and collect its data: shape (n_channels, n_times_run)
    all_runs_data = []
    for run in run_keys:
        raw_run = session_runs[run]
        raw_decimated = decimate(raw_run, sfreq, decimation_factor, stim_name)
        run_data = raw_decimated.get_data()
        all_runs_data.append(run_data)

    # Concatenate the runs along the time axis (axis=1), then transpose to
    # dataT of shape (total_timesamples, n_channels)
    concatenated_data = np.concatenate(all_runs_data, axis=1)
    dataT = concatenated_data.T

    # Drop the time samples (rows) in which every channel is exactly 0.
    # Each element is tested instead of the row sum: a row whose non-zero
    # values cancel out also sums to 0 and would be discarded by mistake.
    nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
    dataT = dataT[nonzero_indices, :]

    n_times, n_channels = dataT.shape

    # Recode the markers in the stimulation channel (last column).
    # The order matters: 2 -> 3 must run first, otherwise the samples
    # recoded from 1 to 2 would be turned into 3 as well.
    # Marker 2 becomes 3 (reported below as feet)
    dataT[:, -1] = np.where(dataT[:, -1] == 2, 3, dataT[:, -1])
    # Marker 1 becomes 2 (reported below as right hand)
    dataT[:, -1] = np.where(dataT[:, -1] == 1, 2, dataT[:, -1])

    # Prepend a running sample counter as the timestamp column; its header
    # is the empty string, the data columns are named by zero-based index.
    timestamps = np.arange(n_times, dtype=int)
    datacsv = np.column_stack((timestamps, dataT))
    header = [""] + [str(i) for i in range(n_channels)]
    df = pd.DataFrame(datacsv, columns=header)
    # Stacking turned the counter into floats; write it without decimals
    df[""] = df[""].astype(int)

    # Name the file after the subject; the session number is fixed to '01'
    subject_str = f"{int(subject):02d}"
    session_str = f"{1:02d}"
    filename = f"subject_{subject_str}_session_{session_str}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved file : {filename}")

    # Display the number of samples carrying each recoded marker
    events = df.iloc[:, -1]
    n_rh = int((events == 2).sum())
    n_f = int((events == 3).sum())
    print(f"Number of Right hand (2): {n_rh}")
    print(f"Number of feet (3): {n_f}")
