RAW (MOABB) to CSV

This code converts datasets from RAW format to CSV format using MOABB.

It has been designed specifically for BCI data.

This script is for BNCI2014002 (Train Runs)

In [1]:
import numpy as np
import pandas as pd
import moabb.datasets

In [2]:
# Load the BNCI2014_002 dataset through MOABB.
m_dataset = moabb.datasets.BNCI2014_002()
# get_data() is called without arguments; the cells below index the result as
# m_data[subject][session][run].
m_data = m_dataset.get_data()

In [None]:
# List every channel name (EEG, misc, stim, ...) of one recording:
# subject 1, session '0', run '0train'.
raws = m_data[1]['0']['0train']
print("Canal list :", raws.ch_names)

In [None]:
# Find the position of the stim channel in the channel list
# (it will be needed later for the CSV to NY conversion).
stim_channel_name = 'stim'
# The lookup fails if no channel carries exactly this name.
stim_idx = raws.ch_names.index(stim_channel_name)
print(f"Canal index {stim_channel_name} is : {stim_idx}")


In [None]:
# Count how many samples carry each distinct value in the stim channel.
# NOTE(review): the original note read "1 non target, 2 = target with a ratio
# needed of 5 to 1". MOABB presents BNCI2014_002 as a motor-imagery dataset, so
# the target/non-target wording may be a leftover from another notebook —
# confirm what the codes 1 and 2 stand for.
stim_data = raws.get_data(picks=stim_idx)
print(stim_data.shape)
unique_vals, counts = np.unique(stim_data, return_counts=True)

# One output line per distinct value, with its number of occurrences.
for val, count in zip(unique_vals, counts):
    print(f"Value : {val}, Occurences count : {count}")

In [None]:
# Build the (time samples x channels) matrix for one subject and one session.
# The session key is '0' here; adapt it to the keys actually present in m_data.
subject = 1
session = '0'

# Names of the training runs of that session, in sorted order.
run_keys = sorted(key for key in m_data[subject][session] if 'train' in key)
print(f"Runs d'entraînement trouvés pour sujet {subject}, session {session} :", run_keys)

# Fail with an explicit message instead of the generic np.concatenate error.
if not run_keys:
    raise ValueError(
        f"No training run found for subject {subject}, session {session!r}"
    )

# One array per training run, each of shape (n_channels, n_times_run).
all_runs_data = [m_data[subject][session][run].get_data() for run in run_keys]

# Join the runs along the time axis -> (n_channels, total_timesamples).
concatenated_data = np.concatenate(all_runs_data, axis=1)

# Transpose so that each row is one time sample -> (total_timesamples, n_channels).
dataT = concatenated_data.T

print("Forme de dataT avant suppression :", dataT.shape)

# Drop the time samples in which every channel is exactly 0 (the original note
# expects these to be the first 8 samples). Each value is tested individually:
# comparing the row sum to 0 would also discard valid samples whose non-zero
# values happen to cancel out.
nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
dataT = dataT[nonzero_indices, :]

print("Forme de dataT après suppression :", dataT.shape)


In [None]:
# Remap the codes in the last column of dataT (used as the stim channel):
# 2 -> 3 first, then 1 -> 2. The order matters: remapping 1 -> 2 first would let
# the other statement promote those samples to 3 as well.
# NOTE: this cell is not idempotent — running it a second time shifts the
# already remapped codes again.
dataT[:, -1] = np.where(dataT[:, -1] == 2, 3, dataT[:, -1])
dataT[:, -1] = np.where(dataT[:, -1] == 1, 2, dataT[:, -1])


In [None]:
# Take the last column of dataT (the stim channel).
stim_col = dataT[:, -1]

# Count the occurrences of each distinct value in that column.
unique_vals, counts = np.unique(stim_col, return_counts=True)

# One output line per distinct value, with its number of occurrences.
for val, count in zip(unique_vals, counts):
    print(f"Value : {val}, Occurrence count : {count}")

In [None]:
# Prepend a sample-index column (0 .. n_times - 1) to the data matrix.
n_times, n_channels = dataT.shape
timestamps = np.arange(n_times, dtype=int)
data_with_timestamp = np.column_stack((timestamps, dataT))
# Header: an empty name for the index column, then "0", "1", ... for the channels.
header = [""] + [str(i) for i in range(n_channels)]

# Build the DataFrame, then cast the index column back to integers so that it is
# written without decimals.
df = pd.DataFrame(data_with_timestamp, columns=header)
df[""] = df[""].astype(int)

In [None]:
# Write the DataFrame to "data.csv" (without the pandas row index) so the
# resulting file can be inspected.
df.to_csv("data.csv", index=False)

In [4]:
# Export one CSV file per subject: the selected runs of session '0' are
# concatenated in time and written with a leading integer sample-index column.

# Substring used to select the runs to export.
# NOTE(review): the notebook header announces "Train Runs" and the single-subject
# cell filters on 'train', whereas this export filters on 'test' — confirm which
# one is intended.
run_kind = 'test'
session_key = '0'
# Session number used in the file name. It is a fixed value: it is not derived
# from the run name ('0train' / '1test').
session_str = f"{1:02d}"

subject_list = list(m_data)

for subject in subject_list:
    session_runs = m_data[subject][session_key]

    # Names of the runs to export for this subject, in sorted order.
    run_keys = sorted(key for key in session_runs if run_kind in key)
    # Fail with an explicit message instead of the generic np.concatenate error.
    if not run_keys:
        raise ValueError(
            f"No '{run_kind}' run found for subject {subject}, "
            f"session {session_key!r}"
        )

    # One array per run, each of shape (n_channels, n_times_run).
    all_runs_data = [session_runs[run].get_data() for run in run_keys]

    # Join the runs along the time axis, then put the time samples on the rows:
    # (n_channels, total_timesamples) -> (total_timesamples, n_channels).
    concatenated_data = np.concatenate(all_runs_data, axis=1)
    dataT = concatenated_data.T

    # Drop the time samples in which every channel is exactly 0. Each value is
    # tested individually: comparing the row sum to 0 would also discard valid
    # samples whose non-zero values happen to cancel out.
    nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
    dataT = dataT[nonzero_indices, :]

    n_times, n_channels = dataT.shape

    # Remap the codes of the last column (used as the stim channel): 2 -> 3 and
    # 1 -> 2. The samples equal to 1 are located before any write so that the
    # two assignments cannot interfere with each other.
    stim = dataT[:, -1]  # view: the assignments below modify dataT in place
    was_one = stim == 1
    stim[stim == 2] = 3
    stim[was_one] = 2

    # Channel columns are named "0", "1", ...; the sample index goes first under
    # an empty column name and stays an integer (no decimals in the CSV).
    df = pd.DataFrame(dataT, columns=[str(i) for i in range(n_channels)])
    df.insert(0, "", np.arange(n_times, dtype=int))

    # File name: subject_<two-digit subject>_session_<two-digit session>.csv
    subject_str = f"{int(subject):02d}"
    filename = f"subject_{subject_str}_session_{session_str}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved file : {filename}")

Saved file : subject_01_session_01.csv
Saved file : subject_02_session_01.csv
Saved file : subject_03_session_01.csv
Saved file : subject_04_session_01.csv
Saved file : subject_05_session_01.csv
Saved file : subject_06_session_01.csv
Saved file : subject_07_session_01.csv
Saved file : subject_08_session_01.csv
Saved file : subject_09_session_01.csv
Saved file : subject_10_session_01.csv
Saved file : subject_11_session_01.csv
Saved file : subject_12_session_01.csv
Saved file : subject_13_session_01.csv
Saved file : subject_14_session_01.csv
