CSV to NY

This code converts the databases from .CSV format to the NY format, which can be easily read in Julia and Python. 

The .npz file holds the data and the stimulation vector while the .yml file holds the metadata. 

It has been specifically conceived for BCI data.

This script is for BNCI2014004-Test

In [None]:
import numpy as np
import pandas as pd
import os, yaml
from moabb import datasets
from moabb.paradigms import MotorImagery

In [None]:
# Load a single subject (the first one in the dataset's subject list) as
# epochs, so that recording information can be read from it.
dataset = datasets.BNCI2014004()
paradigm = MotorImagery()
epochs, _, _ = paradigm.get_data(
    dataset=dataset,
    subjects=[dataset.subject_list[0]],
    return_epochs=True,
)

In [None]:
# Display the available subjects and the channel names of the loaded epochs
for caption, value in (
    ("Subject list: ", dataset.subject_list),
    ("Channel names: ", epochs.ch_names),
):
    print(caption, value)

In [None]:
# Trial window length, in samples.
# Sampling rate in Hz.
# NOTE(review): the previous comment here spoke of 512 Hz data downsampled to
# 256 Hz, which does not match the 250 used in the code - confirm against the
# dataset description.
samplingrate = 250
# np.ceil returns a float; the window length is a number of samples, so it is
# stored as a plain int.
windowlength = int(np.ceil(np.diff(dataset.interval)[0] * samplingrate))
print("Window length: ", windowlength)

# Trial offset, in samples: start of the dataset interval times the sampling
# rate, rounded to a whole sample and stored as a plain int.
offset = int(round(dataset.interval[0] * samplingrate))
print("Offset: ", offset)

In [None]:
# Folder holding every .csv file of the database (adapt to your machine)
file_dir = "C:\\Users\\doumif\\work\\OfficeWork\\BCI Databases\\CSV\\MI\\BNCI2014004-test"
# os.listdir returns the entries in arbitrary order and would also return the
# .yml/.npz files this notebook writes into the same folder. Keep only the
# .csv files and sort them so that the pairing of files with
# (subject, session) done by index further down is reproducible.
# NOTE(review): the sort is lexicographic - confirm that the file names sort
# in subject/session order.
all_files = sorted(
    os.path.join(file_dir, file)
    for file in os.listdir(file_dir)
    if file.lower().endswith(".csv")
)
print(all_files)

In [None]:
# YML creator: metadata of the dataset.
# These values have to be filled in once per dataset. They are read as
# module-level variables by YMLcreator(), so the names must not be changed.

# ------------------------------ Acquisition ------------------------------
# NOTE: `filter` shadows the Python builtin of the same name from here on.
filter = 'Bandpass [0.5-100 Hz], Notch 50 Hz'
ground = 'Right Mastoid'
hardware = ' Easycap, Germany ; biosignal amplifier (g.tec, Guger Technologies OEG, Graz, Austria)'
reference = 'Left Mastoid'
samplingrate = 250
sensors = epochs.ch_names  # channel names taken from the loaded epochs
sensortype = 'Ag/AgCl Wet electrodes'
software = 'BioSig toolbox (Octave/MATLAB/C++) + SIMULINK'

# ----------------------------- Documentation -----------------------------
description = 'https://lampx.tugraz.at/~bci/database/004-2014/description.pdf'
doi = 'https://doi.org/10.1109/TNSRE.2007.906956'
investigators = "R. Leeb, C. Brunner, G. R. Muller-Putz, A. Schlogl, and G.Pfurtscheller"
place = "Graz University of Technology, Institute for Knowledge Discovery, Laboratory of Brain-Computer Interfaces, Graz, Austria"
repository = 'https://bnci-horizon-2020.eu/database/data-sets'

# ---------------------------------- Id -----------------------------------
condition = 'Test'
database = 'BNCI2014004'
# NOTE: this rebinds `paradigm`, which previously held the MotorImagery object.
paradigm = 'MI'
timestamp = 2008

# --------------------------------- Stim ----------------------------------
# Stimulation labels and their integer codes
event_labels = {'left_hand': 1, 'right_hand': 2}
nclasses = len(event_labels)
# Hard-coded offset and window length; these replace any values assigned to
# the same names earlier in the notebook.
offset = 750
windowlength = 1125

# -------------------------------- Subjects -------------------------------
subjects = len(dataset.subject_list)
sessions = 3
runs = 1

# Builder of the metadata dictionary that is dumped to the .yml file
def YMLcreator():
    """Assemble the metadata dictionary of one recording.

    Takes no arguments: every value is read from module-level variables
    (the acquisition, documentation, id and stim settings plus the current
    ``subject``, ``session`` and ``run``), which must all be defined before
    the call.

    Returns:
        dict: nested dictionary with the keys ``acquisition``,
        ``documentation``, ``formatversion``, ``id`` and ``stim``.
    """
    acquisition = {
        'filter': str(filter),
        'ground': str(ground),
        'hardware': str(hardware),
        'reference': str(reference),
        'samplingrate': samplingrate,
        'sensors': sensors,
        'sensortype': str(sensortype),
        'software': str(software),
    }

    documentation = {
        'description': str(description),
        'doi': str(doi),
        'investigators': str(investigators),
        'place': str(place),
        'repository': str(repository),
    }

    # Identification of the recording; subject/session/run are whatever the
    # module-level variables of those names hold at call time.
    identification = {
        'condition': str(condition),
        'database': str(database),
        'paradigm': str(paradigm),
        'subject': subject,
        'session': session,
        'run': run,
        'timestamp': timestamp,
    }

    stimulation = {
        'labels': event_labels,
        'nclasses': nclasses,
        'offset': offset,
        'windowlength': windowlength,
    }

    return {
        'acquisition': acquisition,
        'documentation': documentation,
        'formatversion': str('0.0.1'),
        'id': identification,
        'stim': stimulation,
    }

# Text appended verbatim after the YAML payload of every .yml file. Every
# non-blank line starts with '#', i.e. it is a YAML comment describing the format.
YML_FORMAT_NOTES = """
##############################################################################
#                     GIPSA-lab standard for EEG time series (version 0.0.1) #
#                                Authors : Pedro Rodrigues and Marco Congedo #
#                                                        November 15th, 2019 #
##############################################################################

# This format has been conceived for easily sharing EEG data in Python and 
# Julia. Each file is understood as a separate recording. Data consist of two
# files. They have the same name and extensions `npz` and `yml` (this file).

# The `npz` file typically holds the EEG data matrix, a real matrix of 
# dimension num. of samples x num. of electrodes and a vector of integer with
# the tags for stimulations, with as many entries as number of samples. The 
# tags are 0 (zero) for no stimulation and then employs the natural numbers 
# (1, 2,...) for different stimulation classes.

# The `yml` file holds all meta-data info of the recording in `yml` format. 
# It holds two fields and four dictionaries:

# FIELDS:
#
# - paradigm: (string) the experimental paradigm, e.g., P300, MI, ... 
#             for Brain Computer Interfaces experiments
#
# - formatversion: (version) version of this metadata specification

# DICTIONARIES:
#
# - acquisition: (dictionary)
#
#   - filter: (string) filter setting of the EEG acquisition machine, 
#             specifying the type and specification. Ex: "Band-pass digital 
#             filter (0.01-70Hz)"
#   - ground: (string) location of the sensor used as ground. Ex: "Fpz"
#   - reference: (string) location of the sensor used as reference for the 
#                recording. Ex: "A1"
#   - hardware: (string) the commercial name and producer of the the EEG 
#               acquisition machine. Ex: "actiCHamp, Brain Products GmbH 
#               (Germany), DC amplifiers"
#   - software: (string) software used for acquiring and storing the data. 
#               Ex: OpenViBE, INRIA (France)
#   - samplingrate:(int) sampling rate. Ex: 128
#   - sensors: (array-like of strings) location of the sensors, excluding 
#              ground and reference.
#   - sensortype: (string) type, material and product name of electrodes. 
#                 Ex: Ag/AgCl, Braincap, Brain Products GmbH (Germany)
#
# - documentation: (dictionary)
#
#   - description: (string) link to a file or website describing the dataset
#   - doi: (string) digital object identifier of the dataset's documentation
#   - repository:  (string) link to the online repository where the data can 
#                  be downloaded
#
# - id: (dictionary)
#   
#   - database: (string) name of the database
#
# - stim: (dictionary)
#
#   - labels: (dictionary) dictionary with the labels and code of the 
#             stimulations
#       - nclasses: (int) number of classes for the stimulations
#       - offset: (int) offset, given in number of samples, with respect to 
#                 stimulation samples, defining the beginning of trials
#       - windowlength: (int) size of the window, given in number of sample, 
#                       defining the duration of trials
"""

# Session numbers written to the metadata start at 3 for this condition.
first_session = 3

# One file is consumed per (subject, session) pair, in the order of
# all_files. Stop before writing anything if the listing is too short,
# instead of failing with an IndexError after some files were written.
expected_files = subjects * sessions
if len(all_files) < expected_files:
    raise ValueError(
        f"Expected at least {expected_files} files "
        f"({subjects} subjects x {sessions} sessions), found {len(all_files)}"
    )

# NOTE: the loop variables must be called subject, session and run because
# YMLcreator() reads them as module-level variables.
for subject in range(1, subjects + 1):
    for session in range(first_session, first_session + sessions):
        # Position of this (subject, session) pair in the file listing
        file_index = (subject - 1) * sessions + (session - first_session)
        filepath = all_files[file_index]
        # The .yml file is written next to the source file, same base name
        newpath = os.path.splitext(filepath)[0] + '.yml'
        for run in range(1, runs + 1):
            d = YMLcreator()
            # The path does not depend on `run`: with runs > 1 each run
            # overwrites the file written by the previous one.
            with open(newpath, 'w') as file:
                yaml.dump(d, file)
                file.write(YML_FORMAT_NOTES)

In [None]:
## NPZ Creator

# Dataset Structure
"""
The data shape is: Samples * Channels, where channels may include a column for time,
a column for the ground, a column for the trigger, and a column for the target. You must drop the
time and ground columns (if present) and separate the stimulation column from the data.
"""

##############################################################################

# NPZ Creator

def csv2npz(filepath):
    """Convert one CSV recording into an ``.npz`` file stored beside it.

    The first column of the CSV is discarded, the last column is used as the
    stimulation vector, and every column in between is treated as signal data
    and multiplied by 1e6 (V to µV). The shape of the data matrix and the
    number of occurrences of each stimulation value are printed.

    Args:
        filepath: path of the CSV file; its first row is read as the header.

    Returns:
        None. The arrays are saved under the keys ``data`` (float32) and
        ``stim`` (int16) in a file with the same base name as ``filepath``
        and the extension ``.npz``.
    """
    table = pd.read_csv(filepath, header=0).to_numpy()

    # Signal columns: everything but the first and the last column, V -> µV
    DATA = (table[:, 1:-1] * 1e6).astype(np.float32)
    print(DATA.shape)

    # Stimulation tags are stored in the last column
    STIM = table[:, -1].astype(np.int16)

    # Report how many samples carry each stimulation value
    tags, occurrences = np.unique(STIM, return_counts=True)
    for value, count in zip(tags, occurrences):
        print(f"In {filepath}: Count of {value} = {count}")

    # Same folder and base name as the CSV, with the .npz extension
    stem, _ = os.path.splitext(filepath)
    np.savez(stem + '.npz', data=DATA, stim=STIM)

# Convert every CSV file of the listing.
# Entries that are not .csv files (for example the .yml/.npz files this
# notebook writes into the same folder) are skipped, since csv2npz can only
# read CSV input.
for f in all_files:
    if f.lower().endswith('.csv'):
        csv2npz(f)