RAW (MOABB) to CSV

This code converts datasets from RAW format to CSV format using MOABB.

It has been specifically conceived for BCI data.

This script is for the BNCI2014-004 dataset.



In [1]:
import numpy as np
import pandas as pd
import moabb.datasets

In [29]:
# Load the BNCI2014-004 dataset through MOABB.
# MOABB renamed the class from BNCI2014004 to BNCI2014_004 and announces the old
# name for removal in version 1.1. Prefer the new name and fall back to the old
# one only on MOABB versions that do not provide it yet.
dataset_cls = getattr(moabb.datasets, "BNCI2014_004", None)
if dataset_cls is None:
    dataset_cls = moabb.datasets.BNCI2014004
m_dataset = dataset_cls()
# Nested mapping, indexed in the cells below as m_data[subject][session][run].
m_data = m_dataset.get_data()

BNCI2014004 has been renamed to BNCI2014_004. BNCI2014004 will be removed in version 1.1.
The dataset class name 'BNCI2014004' must be an abbreviation of its code 'BNCI2014-004'. See moabb.datasets.base.is_abbrev for more information.


In [None]:
# Show every channel name (EEG, misc, stim...) of one recording.
# The sessions of a subject are keyed '0train', '1train', '2train', '3test' and
# '4test', so the previous key '1test' does not exist and raised a KeyError.
# Use the first training session, as the extraction cells below do.
raws = m_data[1]['0train']['0']
print("Canal list :", raws.ch_names)

Canal list : dict_keys(['0train', '1train', '2train', '3test', '4test'])


In [11]:
# Find the position of the stim channel in the channel list
# (needed later for the CSV to NY conversion).
stim_channel_name = 'stim'
channel_names = raws.ch_names
stim_idx = channel_names.index(stim_channel_name)
print(f"Canal index {stim_channel_name} is : {stim_idx}")


Canal index stim is : 6


In [12]:
# Tally how many samples carry each distinct value on the stim channel.
# NOTE(review): the previous comment described code 1 as non-target and code 2 as
# target with an expected 5:1 ratio; confirm what the codes mean for this dataset.
stim_data = raws.get_data(picks=stim_idx)
print(stim_data.shape)
unique_vals, counts = np.unique(stim_data, return_counts=True)

for position, val in enumerate(unique_vals):
    count = counts[position]
    print(f"Value : {val}, Occurences count : {count}")

(1, 604803)
Value : 0.0, Occurences count : 604683
Value : 1.0, Occurences count : 60
Value : 2.0, Occurences count : 60


In [26]:
# Example selection: subject 1 and one run key of a session.
subject = 1
run = '0'

# Collect, in sorted order, the session keys of this subject that contain 'test'.
# NOTE(review): the printed label below talks about training runs although the
# filter keeps the 'test' keys; confirm which of the two is intended.
session_keys = [key for key in m_data[subject].keys() if 'test' in key]
session_keys.sort()
print("Runs d'entraînement trouvés pour sujet 1, session 0 :", session_keys)


Runs d'entraînement trouvés pour sujet 1, session 0 : ['3test', '4test']


In [None]:
# Extract the samples of one recording (session '0train', run `run`) of the
# selected subject, without the EOG channels.
# Work on a copy: drop_channels() modifies the Raw object in place, so applying it
# to the object stored in m_data makes this cell fail when it is re-run and breaks
# any later cell that drops the same channels from the same recording.
raw_session = m_data[subject]['0train'][run].copy()
# Only ask for channels that are actually present, so the cell stays re-runnable
# even if the stored recording was already stripped elsewhere.
eog_present = [ch for ch in ('EOG1', 'EOG2', 'EOG3') if ch in raw_session.ch_names]
if eog_present:
    raw_session.drop_channels(eog_present)
data = raw_session.get_data()  # shape: (n_channels, n_times)


In [22]:

# Transpose so that rows are time samples and columns are channels.
dataT = data.T
# Keep the samples where at least one channel is non-zero. Testing the row sum
# instead would also discard samples whose non-zero values happen to cancel out.
nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
dataT = dataT[nonzero_indices, :]
print("Forme de dataT :", dataT.shape)

Forme de dataT : (604718, 4)


In [23]:
# Take the last column of dataT (the stim channel, per the original note)
# and report how often each distinct value occurs in it.
stim_col = dataT[:, -1]
unique_vals, counts = np.unique(stim_col, return_counts=True)

for idx in range(len(unique_vals)):
    val, count = unique_vals[idx], counts[idx]
    print(f"Value : {val}, Occurrence count : {count}")


Value : 0.0, Occurrence count : 604598
Value : 1.0, Occurrence count : 60
Value : 2.0, Occurrence count : 60


In [24]:
# Build the table to export: a running sample counter in an unnamed first column,
# followed by one column per channel named "0", "1", ...
n_times, n_channels = dataT.shape
timestamps = np.arange(n_times, dtype=int)
header = [""]
header.extend(str(i) for i in range(n_channels))
data_with_timestamp = np.column_stack((timestamps, dataT))

# Stacking with floating-point samples turns the counter into floats,
# so cast that column back to integers to drop the decimals.
df = pd.DataFrame(data_with_timestamp, columns=header).astype({"": int})

In [25]:
# Write the table to disk, without the pandas index, to inspect the CSV layout.
df.to_csv(path_or_buf="data.csv", index=False)

In [None]:
# Export one CSV file per (subject, session) pair.
subject_list = list(m_data.keys())

# Sessions to export: only the first two training sessions are written.
# NOTE(review): the original author flagged that MOABB labels the third session
# as "train" although they consider it a test session; confirm before adding it.
sessions = ['0train', '1train']

# Channels left out of the export.
eog_channels = ('EOG1', 'EOG2', 'EOG3')

for subject in subject_list:
    for session in sessions:
        # Copy first: drop_channels() modifies the Raw object in place, so using it
        # on the object stored in m_data would make the loop fail on a second run
        # or after another cell already removed these channels.
        raw_session = m_data[subject][session]['0'].copy()
        eog_present = [ch for ch in eog_channels if ch in raw_session.ch_names]
        if eog_present:
            raw_session.drop_channels(eog_present)
        data = raw_session.get_data()

        # Transpose so that rows are time samples and columns are channels.
        dataT = data.T

        # Keep the samples where at least one channel is non-zero. Testing the row
        # sum instead would also discard samples whose values cancel out.
        nonzero_indices = np.flatnonzero(np.any(dataT != 0, axis=1))
        dataT = dataT[nonzero_indices, :]

        n_times, n_channels = dataT.shape

        # Running sample counter in an unnamed first column, then one column per
        # channel named "0", "1", ...
        timestamps = np.arange(n_times, dtype=int)
        datacsv = np.column_stack((timestamps, dataT))
        header = [""] + [str(i) for i in range(n_channels)]
        df = pd.DataFrame(datacsv, columns=header)
        # Stacking converted the counter to floats; store it as integers again.
        df[""] = df[""].astype(int)

        # File name: zero-padded subject number, and a 1-based session number
        # derived from the leading digit of the session key ('0train' -> 01).
        subject_str = f"{int(subject):02d}"
        session_str = f"{int(session[0]) + 1:02d}"
        filename = f"subject_{subject_str}_session_{session_str}.csv"
        df.to_csv(filename, index=False)
        print(f"Saved file : {filename}")

Saved file : subject_01_session_01.csv
Saved file : subject_01_session_02.csv
Saved file : subject_02_session_01.csv
Saved file : subject_02_session_02.csv
Saved file : subject_03_session_01.csv
Saved file : subject_03_session_02.csv
Saved file : subject_04_session_01.csv
Saved file : subject_04_session_02.csv
Saved file : subject_05_session_01.csv
Saved file : subject_05_session_02.csv
Saved file : subject_06_session_01.csv
Saved file : subject_06_session_02.csv
Saved file : subject_07_session_01.csv
Saved file : subject_07_session_02.csv
Saved file : subject_08_session_01.csv
Saved file : subject_08_session_02.csv
Saved file : subject_09_session_01.csv
Saved file : subject_09_session_02.csv
