MOABB to NY format

.CSV to NY

This code converts data sets from .CSV format to the NY format, which can be easily read in Julia and Python.

The .npz file holds the data and the stimulation vector, while the .yml file holds the metadata.

It has been specifically conceived for BCI data.


In [None]:
#import essential libraries

import numpy as np, pandas as pd
import glob, os, sys, yaml
from yaml import CLoader as Loader, CDumper as Dumper

In [None]:
# Locate the CSV recordings of the data set.

path = 'dataset/bi2015a/'
# glob.glob returns its matches in arbitrary (file-system dependent) order,
# while the YML creator further down pairs files with subject/session numbers
# by their position in this list. Sorting makes that pairing reproducible.
# NOTE(review): lexicographic order only follows subject/session order when
# the numbers in the file names are zero-padded -- confirm against the files.
# os.path.join also tolerates a `path` written without a trailing separator.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
all_files

In [None]:
# Check every CSV file of the data set for missing (null / empty) values.
# Null_Data collects the number of missing values found in each checked file.
Null_Data = []

for csv_path in all_files:
    # Total number of missing values over all rows and columns of the file.
    n_missing = int(pd.read_csv(csv_path, header=None).isnull().sum().sum())

    if n_missing != 0:
        # Abort at the first file with missing values and name that file.
        sys.exit('Data of ' + csv_path + ' contains empty or Null arrays')

    Null_Data.append(n_missing)

Null_Data = np.asarray(Null_Data)

# Double check on the collected counts. An empty file list is reported on its
# own instead of being announced as "contains empty or Null arrays".
if Null_Data.size == 0:
    is_all_zero = False
    print('No CSV file was found, so there is nothing to check.')
else:
    is_all_zero = bool(np.all(Null_Data == 0))
    if is_all_zero:
        print('The Data looks Good!')
    else:
        print('The Data contains empty or Null arrays ')


In [None]:
# YML creator -- metadata of the data set.
# These values are read by YMLcreator() and have to be set once per data set.

# ------------------------------ Acquisition ------------------------------
filter = 'Unknown'
ground = 'Fz'
hardware = 'g.GAMMAcap (g.tec, Schiedlberg, Austria)'
reference = 'Hardware common average reference'
samplingrate = 512
sensors = [
    'FP1', 'FP2', 'F7', 'F3', 'F4', 'F8',
    'FC5', 'FC1', 'FC2', 'FC6',
    'T7', 'C3', 'Cz', 'C4', 'T8',
    'CP5', 'CP1', 'CP2', 'CP6',
    'P7', 'P3', 'Pz', 'P4', 'P8',
    'PO7', 'O1', 'Oz', 'O2', 'PO8', 'PO9', 'PO10',
]
sensortype = 'EEG wet Silver/Silver Chloride electrodes'
software = 'OpenVibe'

# ----------------------------- Documentation -----------------------------
# NOTE(review): the description string starts with a blank character, which
# ends up in the generated .yml files -- confirm whether that is intended.
description = ' https://hal.archives-ouvertes.fr/hal-02172347'
doi = '10.5281/zenodo.3266930'
investigators = "Louis Korczowski, Martine Cederhout, Anton Andreev,Grégoire Cattan,Pedro Luiz Coelho Rodrigues, Violette Gautheret, Marco Congedo"
place = 'GIPSA-lab, Grenoble, France'
repository = 'https://zenodo.org/record/3266930'

# ---------------------------------- Id -----------------------------------
condition = 'Online'
database = 'bi2015a(only five subjects)'
paradigm = 'P300'
timestamp = 2015

# --------------------------------- Stim ----------------------------------
# Stimulation labels (class name -> integer code).
NonTarget = 1
Target = 2
nclasses = 2
offset = 0
windowlength = 512

# -------------------------------- Subjects -------------------------------
# NOTE(review): `database` mentions five subjects while `subjects` is 43 --
# confirm that this matches the number of CSV files actually present.
subjects = 43
sessions = 3
runs = 1

#function of YMLcreator

def YMLcreator():
    """Assemble the metadata dictionary of one recording.

    The function takes no arguments: it reads the module-level metadata
    variables (acquisition, documentation, id and stim settings) together
    with the module-level ``subject``, ``session`` and ``run`` values that
    are current at call time.

    Returns
    -------
    dict
        Nested dictionary with the top-level keys ``acquisition``,
        ``documentation``, ``formatversion``, ``id`` and ``stim``.
    """
    # Textual fields are passed through str() so they are stored as strings
    # whatever type the module-level variable holds.
    acquisition_info = {
        'filter': str(filter),
        'ground': str(ground),
        'hardware': str(hardware),
        'reference': str(reference),
        'samplingrate': samplingrate,
        'sensors': sensors,
        'sensortype': str(sensortype),
        'software': str(software),
    }

    documentation_info = {
        'description': str(description),
        'doi': str(doi),
        'investigators': str(investigators),
        'place': str(place),
        'repository': str(repository),
    }

    # Identification of the recording: which subject, session and run.
    id_info = {
        'condition': str(condition),
        'database': str(database),
        'paradigm': str(paradigm),
        'run': run,
        'session': session,
        'subject': subject,
        'timestamp': timestamp,
    }

    stim_info = {
        'labels': {'NonTarget': NonTarget, 'Target': Target},
        'nclasses': nclasses,
        'offset': offset,
        'windowlength': windowlength,
    }

    return {
        'acquisition': acquisition_info,
        'documentation': documentation_info,
        'formatversion': '0.0.1',
        'id': id_info,
        'stim': stim_info,
    }

# Write one .yml metadata file next to every CSV recording.

# Explanatory notes appended verbatim (as YAML comments) to every .yml file.
ny_format_notes = """
##############################################################################
#                     GIPSA-lab standard for EEG time series (version 0.0.1) #
#                                Authors : Pedro Rodrigues and Marco Congedo #
#                                                        November 15th, 2019 #
##############################################################################

# This format has been conceived for easily sharing EEG data in Python and 
# Julia. Each file is understood as a separate recording. Data consist of two
# files. They have the same name and extensions `npz` and `yml` (this file).

# The `npz` file typically holds the EEG data matrix, a real matrix of 
# dimension num. of samples x num. of electrodes and a vector of integer with
# the tags for stimulations, with as many entries as number of samples. The 
# tags are 0 (zero) for no stimulation and then employs the natural numbers 
# (1, 2,...) for different stimulation classes.

# The `yml` file holds all meta-data info of the recording in `yml` format. 
# It holds two fields and four dictionaries:

# FIELDS:
#
# - paradigm: (string) the experimental paradigm, e.g., P300, MI, ... 
#             for Brain Computer Interfaces experiments
#
# - formatversion: (version) version of this metadata specification

# DICTIONARIES:
#
# - acquisition: (dictionary)
#
#   - filter: (string) filter setting of the EEG acquisition machine, 
#             specifying the type and specification. Ex: "Band-pass digital 
#             filter (0.01-70Hz)"
#   - ground: (string) location of the sensor used as ground. Ex: "Fpz"
#   - reference: (string) location of the sensor used as reference for the 
#                recording. Ex: "A1"
#   - hardware: (string) the commercial name and producer of the the EEG 
#               acquisition machine. Ex: "actiCHamp, Brain Products GmbH 
#               (Germany), DC amplifiers"
#   - software: (string) software used for acquiring and storing the data. 
#               Ex: OpenViBE, INRIA (France)
#   - samplingrate:(int) sampling rate. Ex: 128
#   - sensors: (array-like of strings) location of the sensors, excluding 
#              ground and reference.
#   - sensortype: (string) type, material and product name of electrodes. 
#                 Ex: Ag/AgCl, Braincap, Brain Products GmbH (Germany)
#
# - documentation: (dictionary)
#
#   - description: (string) link to a file or website describing the dataset
#   - doi: (string) digital object identifier of the dataset's documentation
#   - repository:  (string) link to the online repository where the data can 
#                  be downloaded
#
# - id: (dictionary)
#   
#   - database: (string) name of the database
#
# - stim: (dictionary)
#
#   - labels: (dictionary) dictionary with the labels and code of the 
#             stimulations
#       - nclasses: (int) number of classes for the stimulations
#       - offset: (int) offset, given in number of samples, with respect to 
#                 stimulation samples, defining the beginning of trials
#       - windowlength: (int) size of the window, given in number of sample, 
#                       defining the duration of trials
"""

# Files are paired with (subject, session) numbers by position: the k-th file
# of all_files gets the k-th pair, subjects varying slowest.
recordings = [(subj, sess)
              for subj in range(1, subjects + 1)
              for sess in range(1, sessions + 1)]

# Pairing by zip() stops at the shorter of the two lists, so a data set with
# fewer files than subjects * sessions no longer ends in an IndexError; the
# mismatch is reported instead.
if len(all_files) != len(recordings):
    print('Warning: found ' + str(len(all_files)) + ' CSV files but expected '
          + str(len(recordings)) + ' (subjects x sessions); '
          + 'only the matching pairs are processed.')

# subject, session and run are deliberately bound at module level, because
# YMLcreator() reads them from there.
for (subject, session), filepath in zip(recordings, all_files):
    newpath = os.path.splitext(filepath)[0] + '.yml'
    # NOTE(review): every run writes to the same newpath, so with runs > 1
    # only the metadata of the last run remains in the file.
    for run in range(1, runs + 1):
        d = YMLcreator()
        with open(newpath, 'w') as file:
            yaml.dump(d, file)
            file.write(ny_format_notes)











In [None]:

## NPZ creator -- layout of the CSV columns

# Structure of the data sets
"""
The data should be in the form of  Samples* Channels, where channels may include a column for the time,
a column for the ground, a colum for the trigger, and a colum for the target. So, you need to drop the
columns of time and ground and add the two columns of trigger and target together(If you have not already done so).
"""

# Zero-based indices of the special columns in each CSV file, in the order
# time stamp, ground, target, trigger. csv2npz() drops the first two from the
# data and adds the last two together to form the stimulation vector.
timestamp_col, ground_col, target_col, trigger_col = 0, 3, 33, 34



#NPZ creator

def csv2npz(filepath):
    """Convert one CSV recording into an ``.npz`` file stored next to it.

    The CSV file is read without a header row and is expected to hold one
    sample per row. All columns located before the first label column (the
    smaller of the module-level ``target_col`` and ``trigger_col``), except
    the ``timestamp_col`` and ``ground_col`` columns, are saved as a float32
    array under the key ``data``. The element-wise sum of the ``target_col``
    and ``trigger_col`` columns is saved as an int16 vector under the key
    ``stim``.

    Parameters
    ----------
    filepath : str
        Path of the CSV file. The output is written to the same path with
        the extension replaced by ``.npz``.
    """
    table = np.array(pd.read_csv(filepath, header=None))

    # Take every column up to the first label column. Deriving the bound from
    # the column layout (rather than a fixed 32) keeps the last EEG channel,
    # which sits in column 32 when target_col is 33.
    first_label_col = min(target_col, trigger_col)
    DATA = np.float32(table[:, :first_label_col])

    # Remove the time and ground columns, which are not EEG channels.
    DATA = np.delete(DATA, [timestamp_col, ground_col], axis=1)

    # STIM is the sum of the target and trigger columns.
    STIM = np.int16(table[:, target_col] + table[:, trigger_col])

    newpath = os.path.splitext(filepath)[0] + '.npz'
    np.savez(newpath, data=DATA, stim=STIM)


# Convert every CSV recording of the data set into its .npz counterpart.
for csv_file in all_files:
    csv2npz(csv_file)