MOABB to NY format

.CSV to NY

This code converts datasets from .CSV format to the NY format, which can be easily read in Julia and Python. 

The .npz file holds the data and the stimulation vector while the .yml file holds the metadata. 

It has been specifically conceived for BCI data.

This script is for Cattan2019_PC


In [None]:
#import essential libraries

import numpy as np
import pandas as pd
import glob, os, sys, yaml
from yaml import CLoader as Loader, CDumper as Dumper
import moabb
from moabb.paradigms import P300

In [None]:
# Optional sanity check: load the first subject of Cattan2019_PC and inspect
# the epochs, the labels and the metadata.
# Epochs are laid out as trials x channels x time_samples.

paradigm = P300()
dataset = moabb.datasets.Cattan2019_VR(
    virtual_reality=False,
    screen_display=True,
)
subjects = dataset.subject_list

# Only the first subject is loaded here; this is enough to read the channel
# names that are reused as metadata further down.
epochs, labels, meta = paradigm.get_data(
    dataset=dataset,
    subjects=[subjects[0]],
    return_epochs=True,
)


In [None]:
# Display the channel names of the loaded epochs (the same list is stored
# as the `sensors` metadata entry further down).
epochs.ch_names

In [None]:
# Event mapping exposed by the dataset; it is written as the `labels`
# dictionary of the YAML metadata. Displayed here for inspection.
event_labels = dataset.event_id
event_labels

In [None]:
# Compute the window length in samples: span of the dataset's epoch
# interval multiplied by the sampling rate.
# NOTE(review): assumes dataset.interval is expressed in seconds - confirm.
samplingrate = 256 # original data was downsampled to 256Hz, original = 512Hz (available in dataset description)
windowlength = np.diff(dataset.interval)[0]*samplingrate
windowlength

In [None]:
# Point `file_dir` at the folder that holds all the .csv files of the dataset.

file_dir = "C:\\Users\\doumif\\work\\OfficeWork\\BCI Databases\\CSV\\P300\\Cattan-PC"

# Keep only the .csv recordings: the converters below write .yml/.npz files
# into this same folder, so a re-run would otherwise pick those up as input.
# The list is sorted because files are later matched to subject numbers by
# position and os.listdir() gives no ordering guarantee.
# NOTE: sorting is lexicographic - file names need zero-padded subject
# numbers (e.g. 01, 02, ..., 10) for the order to follow the subject order.
all_files = sorted(
    os.path.join(file_dir, file)
    for file in os.listdir(file_dir)
    if file.lower().endswith('.csv')
)
all_files  # check that you have the expected .csv files

In [None]:
# YML creator
# Metadata of the dataset.
# These variables and parameters only need to be set once per dataset.

#######################################Acquisition#######################################
# NOTE: `filter` shadows the Python builtin of the same name for the rest of
# the session; the name is kept because YMLcreator() reads it.
filter= 'Low-Pass 83Hz (Butterworth order 4 zero phase) for downsampling'
ground='AFz'
hardware= 'EasyCap EC20'
reference= 'Right Earlobe'  # fixed typo ('Righ Earlobe') in the emitted metadata
samplingrate= 256  # original data was downsampled to 256Hz, original = 512Hz (available in dataset description)
sensors = epochs.ch_names
sensortype ='Wet electrodes'
software= 'OpenVibe'

######################################Documentation######################################
description= 'https://hal.science/hal-02078533'
doi= 'https://zenodo.org/records/2605205'
investigators= "Cattan, Andreev, Rodrigues, Congedo"
place= 'GIPSA-lab (University of Grenoble Alpes, CNRS, Grenoble-INP)'
repository= 'https://github.com/plcrodrigues/py.VR.EEG.2018-GIPSA'
##id
condition= 'Personal Computer (PC)' 
database= 'Cattan2019'  
# NOTE: this rebinds `paradigm` from the P300() object created in the
# verification cell to a plain string; re-run that cell before calling
# paradigm.get_data() again.
paradigm= 'P300'
timestamp= 2019

##########################################Stim###########################################
####Labels
event_labels = dataset.event_id
nclasses = len(event_labels.keys())
offset = 0
windowlength = 256

#########################################Subjects########################################  
# NOTE: `subjects` becomes the number of subjects (an int) here; it was the
# subject list in the verification cell.
subjects=len(dataset.subject_list)
sessions=1
runs=1

# Builder of the YAML metadata document

def YMLcreator():
    """Assemble the metadata dictionary for one recording.

    Every value is read from module-level variables: the acquisition,
    documentation and stim settings defined above, plus the current
    `subject`, `session` and `run`, which must be bound by the caller
    (the conversion loop) before this function is invoked.

    Returns:
        dict: nested dictionary with the entries `acquisition`,
        `documentation`, `formatversion`, `id` and `stim`.
    """
    acquisition_info = {
        'filter': str(filter),
        'ground': str(ground),
        'hardware': str(hardware),
        'reference': str(reference),
        'samplingrate': samplingrate,
        'sensors': sensors,
        'sensortype': str(sensortype),
        'software': str(software),
    }
    documentation_info = {
        'description': str(description),
        'doi': str(doi),
        'investigators': str(investigators),
        'place': str(place),
        'repository': str(repository),
    }
    id_info = {
        'condition': str(condition),
        'database': str(database),
        'paradigm': str(paradigm),
        'run': run,
        'session': session,
        'subject': subject,
        'timestamp': timestamp,
    }
    stim_info = {
        'labels': event_labels,
        'nclasses': nclasses,
        'offset': offset,
        'windowlength': windowlength,
    }
    return {
        'acquisition': acquisition_info,
        'documentation': documentation_info,
        'formatversion': '0.0.1',
        'id': id_info,
        'stim': stim_info,
    }

# Write one .yml metadata file next to every recording in `all_files`.
# Files are matched to (subject, session) pairs purely by position, so
# `all_files` must list the recordings in subject/session order.

i = -1  # don't change i: index into all_files, advanced once per (subject, session)

# Fail before writing anything when there are not enough input files,
# instead of stopping halfway through with a partially converted folder.
required_files = subjects * sessions
if len(all_files) < required_files:
    raise IndexError(
        f"expected at least {required_files} input files "
        f"({subjects} subjects x {sessions} sessions), found {len(all_files)}"
    )

# `subject`, `session` and `run` are deliberately module-level loop
# variables: YMLcreator() reads them as globals.
for subject in range(1, subjects + 1):
    for session in range(1, sessions + 1):
        i = i + 1
        filepath = all_files[i]
        for run in range(1, runs + 1):
            d = YMLcreator()

            newpath = os.path.splitext(filepath)[0] + '.yml'
            with open(newpath, 'w') as file:
                # The dump goes straight into the stream, so there is no
                # return value worth keeping.
                yaml.dump(d, file)

                # Append the human-readable description of the format.
                file.write("""
##############################################################################
#                     GIPSA-lab standard for EEG time series (version 0.0.1) #
#                                Authors : Pedro Rodrigues and Marco Congedo #
#                                                        November 15th, 2019 #
##############################################################################

# This format has been conceived for easily sharing EEG data in Python and 
# Julia. Each file is understood as a separate recording. Data consist of two
# files. They have the same name and extensions `npz` and `yml` (this file).

# The `npz` file typically holds the EEG data matrix, a real matrix of 
# dimension num. of samples x num. of electrodes and a vector of integer with
# the tags for stimulations, with as many entries as number of samples. The 
# tags are 0 (zero) for no stimulation and then employs the natural numbers 
# (1, 2,...) for different stimulation classes.

# The `yml` file holds all meta-data info of the recording in `yml` format. 
# It holds two fields and four dictionaries:

# FIELDS:
#
# - paradigm: (string) the experimental paradigm, e.g., P300, MI, ... 
#             for Brain Computer Interfaces experiments
#
# - formatversion: (version) version of this metadata specification

# DICTIONARIES:
#
# - acquisition: (dictionary)
#
#   - filter: (string) filter setting of the EEG acquisition machine, 
#             specifying the type and specification. Ex: "Band-pass digital 
#             filter (0.01-70Hz)"
#   - ground: (string) location of the sensor used as ground. Ex: "Fpz"
#   - reference: (string) location of the sensor used as reference for the 
#                recording. Ex: "A1"
#   - hardware: (string) the commercial name and producer of the the EEG 
#               acquisition machine. Ex: "actiCHamp, Brain Products GmbH 
#               (Germany), DC amplifiers"
#   - software: (string) software used for acquiring and storing the data. 
#               Ex: OpenViBE, INRIA (France)
#   - samplingrate:(int) sampling rate. Ex: 128
#   - sensors: (array-like of strings) location of the sensors, excluding 
#              ground and reference.
#   - sensortype: (string) type, material and product name of electrodes. 
#                 Ex: Ag/AgCl, Braincap, Brain Products GmbH (Germany)
#
# - documentation: (dictionary)
#
#   - description: (string) link to a file or website describing the dataset
#   - doi: (string) digital object identifier of the dataset's documentation
#   - repository:  (string) link to the online repository where the data can 
#                  be downloaded
#
# - id: (dictionary)
#   
#   - database: (string) name of the database
#
# - stim: (dictionary)
#
#   - labels: (dictionary) dictionary with the labels and code of the 
#             stimulations
#       - nclasses: (int) number of classes for the stimulations
#       - offset: (int) offset, given in number of samples, with respect to 
#                 stimulation samples, defining the beginning of trials
#       - windowlength: (int) size of the window, given in number of sample, 
#                       defining the duration of trials
""")

In [None]:
## NPZ creator

# Expected structure of the input data:
# The data should have the shape samples x columns, where the columns may
# include a column for the time, a column for the ground, a column for the
# trigger and a column for the target. Drop the time and ground columns and
# merge the trigger and target columns into a single one (if not already done).

##############################################################################

timestamp_col = 0
# ground_col = # ground column not included in Cattan2019_PC
stim_col = 17  # Stim col


def csv2npz(filepath, timestamp_column=None, stim_column=None):
    """Convert one CSV recording into the `.npz` half of an NY file pair.

    The CSV is read with its first row as header. The timestamp column and
    the stimulation column are removed from the table; every remaining
    column is multiplied by 1e6 (V to µV) and stored as float32 under the
    key `data`. The stimulation column is stored as int16 under the key
    `stim`. The result is written next to the input, with the extension
    replaced by `.npz`. The data shape and the count of each stimulation
    value are printed.

    Args:
        filepath: path of the CSV file to convert.
        timestamp_column: index of the time column to drop. Defaults to the
            module-level `timestamp_col`.
        stim_column: index of the stimulation column. Defaults to the
            module-level `stim_col`.

    Raises:
        IndexError: if a column index is out of range for the file.
    """
    if timestamp_column is None:
        timestamp_column = timestamp_col
    if stim_column is None:
        stim_column = stim_col

    table = pd.read_csv(filepath, header=0).to_numpy()

    # Drop exactly the configured time and stim columns, so the data matrix
    # and the stim vector always come from the same column layout.
    # from V to µV
    raw_data = np.delete(table, [timestamp_column, stim_column], axis=1) * 1e6

    DATA = np.float32(raw_data)
    print(DATA.shape)

    # STIM col
    STIM = np.int16(table[:, stim_column])

    # Report how many samples carry each stimulation value.
    unique_values, counts = np.unique(STIM, return_counts=True)
    for value, count in zip(unique_values, counts):
        print(f"Dans {filepath} : Nombre de {value} = {count}")

    newpath = os.path.splitext(filepath)[0] + '.npz'
    np.savez(newpath, data=DATA, stim=STIM)

# Convert every CSV recording. Anything else in the list (for instance the
# .yml/.npz files written into the same folder by a previous run) is
# skipped, since it cannot be parsed as a recording.
for f in all_files:
    if not f.lower().endswith('.csv'):
        continue
    csv2npz(f)