RAW (MOABB) to CSV

This code converts datasets from RAW format to CSV format using MOABB.

It has been designed specifically for BCI data.

This script is for the BNCI2014001 dataset.

In [None]:
import numpy as np
import pandas as pd
from moabb import datasets

In [None]:
# Load the database: instantiate the MOABB BNCI2014001 dataset and fetch its data.
# The cells below index the result as m_data[subject][session][run].
m_dataset = datasets.BNCI2014001()
m_data = m_dataset.get_data()

In [None]:
# Pick one recording (subject 1, session '0train', run '0') and list
# every channel it contains (EEG, misc, stim...).
first_session = m_data[1]['0train']
raw = first_session['0']
print("Canal list :", raw.ch_names)

In [None]:
# Look up the position of the stim channel among the channel names.
stim_channel_name = 'stim'
channel_names = raw.ch_names
stim_idx = channel_names.index(stim_channel_name)
print(f"Canal index {stim_channel_name} is : {stim_idx}")

In [None]:
# Count how often each distinct value appears on the stim channel
# (the set of values depends on the database).
stim_data = raw.get_data(picks=stim_idx)
print(stim_data.shape)

# np.unique flattens its input, so flattening explicitly does not change the result.
unique_vals, counts = np.unique(stim_data.ravel(), return_counts=True)

for idx, val in enumerate(unique_vals):
    count = counts[idx]
    print(f"Value : {val}, Occurences count : {count}")

In [None]:
# Example for subject 1, session '0train': gather the data of every run of
# the session and join the runs end-to-end along the time axis.

# Channels left out of the export (EOG channels, typically eye movement artifacts).
eog_channels = {'EOG1', 'EOG2', 'EOG3'}

# Retrieve the list of runs in this session
session_runs = m_data[1]['0train']
run_keys = sorted(session_runs.keys())
print("Runs found in the session:", run_keys)

# Initialize a list to store the data from each run
all_runs_data = []

# Iterate over each run and extract its data
for run in run_keys:
    raw_run = session_runs[run]
    # Select the non-EOG channels by index instead of calling drop_channels():
    # drop_channels() modifies the Raw object stored in m_data in place, so
    # re-running this cell (or any later cell that drops the same channels)
    # would fail because the channels are already gone.
    keep_idx = [i for i, ch in enumerate(raw_run.ch_names) if ch not in eog_channels]
    run_data = raw_run.get_data(picks=keep_idx)  # shape: (n_kept_channels, n_times_run)
    all_runs_data.append(run_data)

# Concatenate the data along the time axis (axis=1)
concatenated_data = np.concatenate(all_runs_data, axis=1)  # shape: (n_channels, total_timesamples)

In [None]:
# Swap the axes so that rows are time samples and columns are channels,
# i.e. an array of shape (total_timesamples, n_channels).
dataT = np.transpose(concatenated_data)
print("Shape of dataT:", dataT.shape)

In [None]:
# Take the last column of dataT, which this notebook treats as the stim channel.
stim_col = dataT[..., -1]

# Tally the distinct values found in that column.
unique_vals, counts = np.unique(stim_col, return_counts=True)

# Report each value next to its number of occurrences.
for idx, val in enumerate(unique_vals):
    count = counts[idx]
    print(f"Value : {val}, Occurrence count : {count}")

In [None]:
# Build the table to export: a sample counter in the first column, followed
# by one column per channel.
n_times, n_channels = dataT.shape
timestamps = np.arange(n_times, dtype=int)
data_with_timestamp = np.hstack((timestamps[:, np.newaxis], dataT))

# Header: an empty name for the timestamp column, then "0", "1", ... for the channels.
header = ["", *map(str, range(n_channels))]
df = pd.DataFrame(data_with_timestamp, columns=header)

# Stacking with the channel data stores the timestamps in the same array as
# the data; cast that column back to int to remove the decimals.
df = df.astype({"": int})

In [None]:
# Sanity check: write the DataFrame to "data.csv" (without the pandas index)
# so the resulting file can be inspected.
df.to_csv("data.csv", index=False)

In [None]:
# Export every subject/session of the dataset to its own CSV file
# ("subject_XX_session_YY.csv"), then print how many samples carry each
# event code in the last column.

# Channels left out of the export (EOG channels).
eog_channels = {'EOG1', 'EOG2', 'EOG3'}

# All subjects available in the loaded data
subject_list = list(m_data.keys())

# Define the sessions to process (here '0train' and '1test')
sessions = ['0train', '1test']

# Loop through all subjects
for subject in subject_list:
    # Loop through the defined sessions
    for session in sessions:
        # Retrieve the list of runs in this session
        session_runs = m_data[subject][session]
        run_keys = sorted(session_runs.keys())

        # Collect the data of each run
        all_runs_data = []
        for run in run_keys:
            raw_run = session_runs[run]
            # Select the non-EOG channels by index instead of calling
            # drop_channels(): drop_channels() modifies the Raw object stored
            # in m_data in place and raises when the channels were already
            # removed (by an earlier cell or by a previous run of this cell).
            keep_idx = [i for i, ch in enumerate(raw_run.ch_names) if ch not in eog_channels]
            run_data = raw_run.get_data(picks=keep_idx)  # shape: (n_kept_channels, n_times_run)
            all_runs_data.append(run_data)

        # Concatenate the data along the time axis (axis=1)
        concatenated_data = np.concatenate(all_runs_data, axis=1)  # (n_channels, total_timesamples)

        # Transpose to get dataT of shape (total_timesamples, n_channels)
        dataT = concatenated_data.T
        n_times, n_channels = dataT.shape

        # Replace all occurrences of 4 with 6 in the last column.
        # We assume the last column corresponds to the stimulation channel;
        # 6 is the code reported as "tongue" below.
        dataT[:, -1] = np.where(dataT[:, -1] == 4, 6, dataT[:, -1])

        # Create the timestamps column (a plain sample counter)
        timestamps = np.arange(n_times, dtype=int)
        datacsv = np.column_stack((timestamps, dataT))
        header = [""] + [str(i) for i in range(n_channels)]
        df = pd.DataFrame(datacsv, columns=header)
        # Stacking with the channel data stores the timestamps alongside it;
        # cast that column back to int to remove the decimals.
        df[""] = df[""].astype(int)

        # Name the file
        subject_str = f"{int(subject):02d}"
        # Calculate the session number from the string '0train' or '1test'
        session_str = f"{int(session[0]) + 1:02d}"
        filename = f"subject_{subject_str}_session_{session_str}.csv"
        df.to_csv(filename, index=False)
        print(f"Saved file : {filename}")

        # Display how many samples carry each event code
        events = df.iloc[:, -1]
        n_lh = int((events == 1).sum())
        n_rh = int((events == 2).sum())
        n_f = int((events == 3).sum())
        n_tongue = int((events == 6).sum())
        print(f"Number of Left hand (1): {n_lh}")
        print(f"Number of Right hand (2): {n_rh}")
        print(f"Number of feet (3): {n_f}")
        print(f"Number of tongue (6): {n_tongue}")