MOABB to NY format

.CSV to NY

This code converts data sets from .CSV format to the NY format, which can be easily read in Julia and Python.

The .npz file holds the data and the stimulation vector while the .yml file holds the metadata. 

It has been specifically conceived for BCI data.

This script is for BNCI2014008


In [None]:
#import essential libraries

import numpy as np
import pandas as pd
import glob, os, sys, yaml
from yaml import CLoader as Loader, CDumper as Dumper
import moabb
from moabb.paradigms import P300

In [None]:
# Optional sanity check: fetch the first subject of BNCI2014008 so that the
# epochs array, the labels and the metadata can be inspected by hand.
# Epoch shape >>> trials X channels X time_samples

paradigm = P300()
dataset = moabb.datasets.BNCI2014_008()
subjects = dataset.subject_list

# Only the first subject is requested; return_epochs=False asks for arrays
# rather than an Epochs object.
first_subject = subjects[0]
epochs, labels, meta = paradigm.get_data(
    dataset=dataset,
    subjects=[first_subject],
    return_epochs=False,
)


In [None]:
# Download the dataset through MOABB and, for every subject/session, stack the
# runs into one EEG matrix (channels x samples) and one stimulation vector.
# The objects created here (dataset, epochs) are reused by the metadata cell.
#
# NOTE(review): session_eeg / session_stim are rebuilt for every session and
# are not written to disk in this cell, so only the last session's arrays
# survive the loop. The .csv export step appears to be missing -- confirm.

paradigm = P300()
dataset = moabb.datasets.BNCI2014_008()

label_dict = dataset.event_id
subjects = dataset.subject_list
data_d = dataset.get_data()

for sub_id, subject_sessions in data_d.items():
    # Loaded as Epochs only to obtain the EEG channel names for this subject.
    epochs, _, _ = paradigm.get_data(dataset=dataset, subjects=[sub_id], return_epochs=True)
    n_channels = len(epochs.ch_names)

    for session_id, session_runs in subject_sessions.items():
        # Seed with empty arrays so a session without runs still yields
        # correctly shaped (and float64) results, as before.
        stim_chunks = [np.array([]).reshape(0,)]
        eeg_chunks = [np.array([]).reshape(n_channels, 0)]

        for run_id, raw in session_runs.items():
            # Order matters: the stimulation row is read BEFORE pick_types,
            # because pick_types(eeg=True) is applied to the same raw object.
            # Presumably the last row is the stim channel -- verify.
            stim_chunks.append(raw.get_data()[-1, :])
            eeg_chunks.append(raw.pick_types(eeg=True).get_data())

        # Concatenate once per session instead of re-copying the growing
        # arrays on every run.
        session_stim = np.concatenate(stim_chunks, axis=0)
        session_eeg = np.concatenate(eeg_chunks, axis=1)

In [None]:
# Trial window length in samples: length of the dataset's epoching interval
# (presumably in seconds -- verify) multiplied by the sampling rate.
samplingrate = 256  # available in dataset description
interval_length = np.diff(dataset.interval)[0]
windowlength = interval_length * samplingrate
windowlength

In [None]:
# Point file_dir at the folder holding all the .csv files of the dataset.
import re

file_dir = "//filesrv4/home$/doumif/.windows/Bureau/bnci2014008"

# os.listdir returns entries in arbitrary order and also lists any non-.csv
# file in the folder (e.g. the .yml/.npz files this notebook writes next to
# the recordings). The list is later indexed positionally to assign subject
# and session numbers, so keep only .csv files and sort them with a
# natural-order key ("s2" before "s10").
# NOTE(review): assumes file names sort in subject/session order -- verify.
all_files = sorted(
    (
        os.path.join(file_dir, name)
        for name in os.listdir(file_dir)
        if name.lower().endswith(".csv")
    ),
    key=lambda path: [
        int(token) if token.isdigit() else token.lower()
        for token in re.split(r"(\d+)", os.path.basename(path))
    ],
)
all_files #check if you have .csv files

In [7]:
# YML creator -- metadata of the data set.
# Everything below describes the data set as a whole and has to be filled in
# once per data set. YMLcreator() reads these names as module-level globals,
# so the names themselves must not change.

# ------------------------------ Acquisition ------------------------------
# NOTE: `filter` shadows the Python builtin of the same name.
filter = 'Unknown'
ground = 'Left Mastoid'
hardware = 'g.tec EEG'
reference = 'Right ear-lobe'
samplingrate = 256
# Expects `epochs` to be the object loaded in the download cell
# (return_epochs=True), which exposes `ch_names`.
sensors = epochs.ch_names
sensortype = 'Active electrodes'
software = 'Unknown'

# ----------------------------- Documentation -----------------------------
description = 'https://lampx.tugraz.at/~bci/database/008-2014/description.pdf'
doi = 'https://doi.org/10.3389/fnhum.2013.00732'
investigators = "Riccio, Simione, Schettini, Pizzimenti, Inghilleri, Bellardinelli, Mattia, Cincotti"
place = 'Neuroelectrical Imaging and BCI Laboratory, IRCCS Fondazione Santa Lucia, Rome, Italy'
repository = 'None'

# ---------------------------------- Id -----------------------------------
condition = 'Unknown'
database = 'BNCI2014008'
# NOTE: from here on `paradigm` is the paradigm name, no longer the P300()
# object created in the earlier cells.
paradigm = 'P300'
timestamp = 2013

# --------------------------------- Stim ----------------------------------
# Stimulation labels and their codes, as reported by the dataset object.
event_labels = dataset.event_id
nclasses = len(event_labels)
offset = 0
windowlength = 256

# ------------------------------- Subjects --------------------------------
# NOTE: `subjects` becomes a count here (it was the subject list earlier).
subjects = len(dataset.subject_list)
sessions = 1
runs = 1
#function of YMLcreator
def YMLcreator():
    """Assemble the metadata dictionary for one recording.

    Takes no arguments: every value is read from module-level names. The
    dataset-wide settings (acquisition, documentation, stim, ...) come from
    the metadata cell, while ``subject``, ``session`` and ``run`` are the
    current values of the loop variables of the caller.

    Returns:
        dict: nested mapping with the sections ``acquisition``,
        ``documentation``, ``formatversion``, ``id`` and ``stim``.
    """
    acquisition_section = {
        'filter': str(filter),
        'ground': str(ground),
        'hardware': str(hardware),
        'reference': str(reference),
        'samplingrate': samplingrate,
        'sensors': sensors,
        'sensortype': str(sensortype),
        'software': str(software),
    }

    documentation_section = {
        'description': str(description),
        'doi': str(doi),
        'investigators': str(investigators),
        'place': str(place),
        'repository': str(repository),
    }

    # subject / session / run identify the recording being described.
    id_section = {
        'condition': str(condition),
        'database': str(database),
        'paradigm': str(paradigm),
        'subject': subject,
        'session': session,
        'run': run,
        'timestamp': timestamp,
    }

    stim_section = {
        'labels': event_labels,
        'nclasses': nclasses,
        'offset': offset,
        'windowlength': windowlength,
    }

    return {
        'acquisition': acquisition_section,
        'documentation': documentation_section,
        'formatversion': '0.0.1',
        'id': id_section,
        'stim': stim_section,
    }

# Explanatory trailer appended verbatim after the YAML document of every
# .yml file (all lines are YAML comments).
YML_FOOTER = """
##############################################################################
#                     GIPSA-lab standard for EEG time series (version 0.0.1) #
#                                Authors : Pedro Rodrigues and Marco Congedo #
#                                                        November 15th, 2019 #
##############################################################################

# This format has been conceived for easily sharing EEG data in Python and 
# Julia. Each file is understood as a separate recording. Data consist of two
# files. They have the same name and extensions `npz` and `yml` (this file).

# The `npz` file typically holds the EEG data matrix, a real matrix of 
# dimension num. of samples x num. of electrodes and a vector of integer with
# the tags for stimulations, with as many entries as number of samples. The 
# tags are 0 (zero) for no stimulation and then employs the natural numbers 
# (1, 2,...) for different stimulation classes.

# The `yml` file holds all meta-data info of the recording in `yml` format. 
# It holds two fields and four dictionaries:

# FIELDS:
#
# - paradigm: (string) the experimental paradigm, e.g., P300, MI, ... 
#             for Brain Computer Interfaces experiments
#
# - formatversion: (version) version of this metadata specification

# DICTIONARIES:
#
# - acquisition: (dictionary)
#
#   - filter: (string) filter setting of the EEG acquisition machine, 
#             specifying the type and specification. Ex: "Band-pass digital 
#             filter (0.01-70Hz)"
#   - ground: (string) location of the sensor used as ground. Ex: "Fpz"
#   - reference: (string) location of the sensor used as reference for the 
#                recording. Ex: "A1"
#   - hardware: (string) the commercial name and producer of the the EEG 
#               acquisition machine. Ex: "actiCHamp, Brain Products GmbH 
#               (Germany), DC amplifiers"
#   - software: (string) software used for acquiring and storing the data. 
#               Ex: OpenViBE, INRIA (France)
#   - samplingrate:(int) sampling rate. Ex: 128
#   - sensors: (array-like of strings) location of the sensors, excluding 
#              ground and reference.
#   - sensortype: (string) type, material and product name of electrodes. 
#                 Ex: Ag/AgCl, Braincap, Brain Products GmbH (Germany)
#
# - documentation: (dictionary)
#
#   - description: (string) link to a file or website describing the dataset
#   - doi: (string) digital object identifier of the dataset's documentation
#   - repository:  (string) link to the online repository where the data can 
#                  be downloaded
#
# - id: (dictionary)
#   
#   - database: (string) name of the database
#
# - stim: (dictionary)
#
#   - labels: (dictionary) dictionary with the labels and code of the 
#             stimulations
#       - nclasses: (int) number of classes for the stimulations
#       - offset: (int) offset, given in number of samples, with respect to 
#                 stimulation samples, defining the beginning of trials
#       - windowlength: (int) size of the window, given in number of sample, 
#                       defining the duration of trials
"""

# all_files is consumed in order, one file per (subject, session) pair, so it
# must hold at least subjects * sessions entries. Check up front instead of
# failing with an IndexError after some files have already been written.
n_recordings = subjects * sessions
if len(all_files) < n_recordings:
    raise ValueError(
        f"expected at least {n_recordings} files "
        f"({subjects} subjects x {sessions} sessions), found {len(all_files)}"
    )

# `subject`, `session` and `run` must keep these exact names: YMLcreator()
# reads them as globals.
for subject in range(1, subjects + 1):
    for session in range(1, sessions + 1):
        # Position of this recording in all_files (subject-major order).
        filepath = all_files[(subject - 1) * sessions + (session - 1)]
        newpath = os.path.splitext(filepath)[0] + '.yml'
        for run in range(1, runs + 1):
            # NOTE: the output path does not depend on `run`, so with
            # runs > 1 each run overwrites the file of the previous one.
            d = YMLcreator()
            with open(newpath, 'w') as file:
                yaml.dump(d, file)
                file.write(YML_FOOTER)

In [None]:

## NPZ creator

# Structure of the data sets
#
# The data should be in the form Samples x Channels, where the channels may
# include a column for the time, a column for the ground, a column for the
# trigger and a column for the target. So you need to drop the time and
# ground columns and add the trigger and target columns together (if you
# have not already done so).

# timestamp_col: not set -- there is no timestamp column in BNCI2014008
# ground_col: not set -- the ground is not included in BNCI2014008
target_col = 8  # Stim

#NPZ creator

def csv2npz(filepath, n_channels=8, stim_col=None):
    """Convert one header-less .csv recording into an .npz file.

    The .csv is read as a Samples x Columns table. The first ``n_channels``
    columns are stored as float32 under the key ``data`` and the column
    ``stim_col`` is stored as int16 under the key ``stim``. The output is
    written next to the input, with the extension replaced by ``.npz``.

    Parameters:
        filepath: path of the .csv file to convert.
        n_channels: number of leading columns holding the signal
            (default 8, the value previously hard-coded).
        stim_col: index of the stimulation column. When None (default) the
            module-level ``target_col`` is used, as before.

    Returns:
        The path of the .npz file that was written.

    Raises:
        ValueError: if the file has fewer than ``n_channels`` columns
            (this used to produce a silently truncated data matrix).
        IndexError: if ``stim_col`` is out of range for the file.
    """
    if stim_col is None:
        stim_col = target_col

    table = pd.read_csv(filepath, header=None).to_numpy()

    if table.shape[1] < n_channels:
        raise ValueError(
            f"{filepath}: expected at least {n_channels} columns, "
            f"found {table.shape[1]}"
        )

    # DATA is in the shape of Samples x Channels
    DATA = table[:, :n_channels].astype(np.float32)

    # STIM is the single stimulation column, one tag per sample
    STIM = table[:, stim_col].astype(np.int16)

    newpath = os.path.splitext(filepath)[0] + '.npz'
    np.savez(newpath, data=DATA, stim=STIM)
    return newpath

# Convert every .csv recording. Anything else in the list is skipped: the
# .yml and .npz outputs are written next to the recordings, so a fresh
# directory listing can contain them as well.
for f in all_files:
    if f.lower().endswith('.csv'):
        csv2npz(f)