# Explore the NeuArm data set 

The data is not stored in the present GitHub repository. 

The data is stored in a local directory on my machine.

I use a symlink to the data directory on my local machine.

The symlink was created as follows:

1. Move to the directory where you want to create the symlink, here the `data` folder.  

1. Create the symlink with: `ln -s <path to data directory> <current directory>`  
Here: `ln -s ~/ExperimentalData/NeuArm ./`  
This creates a symlink named `NeuArm` in the current directory, which is the `data` folder.

1. Rename the symlink to `NeuArm.lnk` and add `*.lnk` to the `.gitignore` file.  
This prevents the data from being added to the GitHub repository.


In [None]:
# explore  the files in all the folders that are in the data folder
# print the name of the files found in each folder

import os
import glob

# Set to True to trace the paths, folders and files visited by the cells below
verbose = False


def print_v(*args):
    """Forward the arguments to print, but only while the global `verbose` is truthy."""
    if not verbose:
        return
    print(*args)

# Start from the current working directory
cwd = os.getcwd()
print_v(cwd)

# A session started inside the notebooks folder steps up one level, so that
# `cwd` designates the root folder
if os.path.basename(cwd) == 'notebooks':
    os.chdir('..')
    cwd = os.getcwd()
    print_v(cwd)

# The data folder sits directly under the root folder
data_folder = os.path.join(cwd, 'data')
print_v(data_folder)

# The trailing slash of the pattern restricts the recursive search to folders
folders = glob.glob(data_folder + '/**/', recursive=True)
print_v(folders)

# Gather the *.xdf files of every folder, as paths relative to `cwd`
dataFiles_NeuArm = []
for folder in folders:
    files = os.listdir(folder)
    # when verbose: the folder name, then its content, one entry per line
    print_v(folder)
    for file in files:
        print_v(file)
        if not file.endswith('.xdf'):
            continue
        fullPath = os.path.join(folder, file)
        relPath = os.path.relpath(fullPath, cwd)
        dataFiles_NeuArm.append(relPath)

# List the files in sorted order, each preceded by a zero-padded 2-digit index
dataFiles_NeuArm.sort()
for i, dataFile in enumerate(dataFiles_NeuArm):
    print(f'{i:02d} {dataFile}')


In [None]:
def getConditionFromFileName(relFileName):
    """Parse the information encoded in the name and location of a data file.

    The base name, without its extension, is split on underscores and must
    provide at least five tokens, in this order: participant number,
    participant name, date, session and condition
    (for example ``001_LecGer_20210826_1_c.xdf``).

    Parameters
    ----------
    relFileName : str
        File name, optionally preceded by a path.

    Returns
    -------
    dict or None
        A dictionary of strings with the keys ``participantID`` (participant
        number and name joined by an underscore), ``date``, ``session``,
        ``condition``, ``group`` (name of the folder that directly contains
        the file, empty when no path is given), ``fname`` (base name without
        extension) and ``extension`` (including the leading dot).
        None, after printing a message, when the name holds fewer than five
        tokens.
    """
    # separate the path (if any) from the file name, then drop the extension
    path, baseName = os.path.split(relFileName)
    fname, extension = os.path.splitext(baseName)
    # the tokens are separated by underscores
    tokens = fname.split('_')
    try:
        participantNumber = tokens[0]
        participantName = tokens[1]
        date = tokens[2]
        session = tokens[3]
        condition = tokens[4]
    except IndexError as e:
        # report the name received as argument: a global variable may be
        # undefined or refer to another file
        print(relFileName, 'is incorrect (error:', e, ')')
        return None

    return {
        'participantID': participantNumber + '_' + participantName,
        'date': date,
        'session': session,
        'condition': condition,
        # the group is the folder that directly contains the file
        'group': os.path.basename(path),
        'fname': fname,
        'extension': extension,
    }

# test the function
# Prefer a real data file (relative path, so that the group can be parsed).
# When no *.xdf file was found, fall back to a bare example file name instead
# of failing with an IndexError on the empty list.
fname = dataFiles_NeuArm[0] if dataFiles_NeuArm else '001_LecGer_20210826_1_c.xdf'
tokens = getConditionFromFileName(fname)
print(tokens)



## Check that each *.xdf file in the `data` folder has a valid filename

In [None]:
# Validate the name of every *.xdf file found in the folders:
# getConditionFromFileName prints a message for each name it cannot parse
for folder in folders:
    files = os.listdir(folder)
    for file in files:
        if not file.endswith('.xdf'):
            continue
        tokens = getConditionFromFileName(file)
  

## Create a CSV file with all the information about the data files
For each file, we parse the information contained in its name and location, and store it in a CSV file.

In [None]:

def appendLineToConditionsFile(conditionsFile, fullFname):
    """Append to the conditions file one CSV line describing a data file.

    The line holds, in this order: participantID, date, session, condition,
    group, fname, extension and the path of the data file relative to the
    global `cwd`. Nothing is written when getConditionFromFileName cannot
    parse the file name.

    Parameters
    ----------
    conditionsFile : str
        Path of the CSV file, opened in append mode.
    fullFname : str
        Path of the data file to describe.
    """
    relativeFname = os.path.relpath(fullFname, cwd)
    tokens = getConditionFromFileName(relativeFname)
    if tokens is None:
        # invalid file name: nothing to write
        return

    # NOTE: the fields are written verbatim, without quoting or escaping, so a
    # comma in a file or folder name would add a column to the line.
    fields = [
        tokens['participantID'],
        tokens['date'],
        tokens['session'],
        tokens['condition'],
        tokens['group'],
        tokens['fname'],
        tokens['extension'],
        relativeFname,
    ]
    # the context manager closes the file even if the write fails
    with open(conditionsFile, 'a') as f:
        f.write(','.join(fields) + '\n')

# create the conditions file (any previous content is discarded) and write
# the header line; the context manager closes the file even if the write fails
conditionsFile = 'data/NeuArm_conditions.csv'
with open(conditionsFile, 'w') as f:
    f.write('participantID,date,session,condition,group,fname,extension,relativePath\n')
# loop among the folders to append one line per *.xdf file
for folder in folders:
    files = os.listdir(folder)
    for file in files:
        fullpath = os.path.join(folder, file)
        if file.endswith('.xdf'):
            appendLineToConditionsFile(conditionsFile, fullpath)


## Read the CSV file in a numpy array

In [None]:
import numpy as np
# load the CSV file with the conditions into a NumPy structured array:
# the field names come from the header line (names=True) and the type of each
# column is inferred from its content (dtype=None)
conditionsFile = 'data/NeuArm_conditions.csv'
conditions = np.genfromtxt(conditionsFile, delimiter=',', names=True, dtype=None, encoding=None)
# genfromtxt returns a 0-d array when the file holds a single data line;
# make it 1-d so that it can always be iterated line by line
conditions = np.atleast_1d(conditions)
print(conditions.dtype.names)
print(conditions.shape)

## Check that the data files contain the expected data streams 

### Define the expected data streams

A data stream is uniquely identified by its *name* and *type*. However, the names of some data streams are not always the same: the `NIC` prefix changed to `LSLOutletStreamName` during the experiment...   
Below, we use a list of lists, where each sublist (line) contains the possible names and types of a data stream.

    expected_data_stream = [
        [ [name1, name2], [ type ] ], 
        ...
        [ [name1, name2], [ type1, type2 ] ]
    ]

In [None]:
# Streams expected in each data file.
# Every entry has 2 elements: the list of names the stream may carry, then its type.
# The first four streams accept two names, because their `NIC` prefix became
# `LSLOutletStreamName` in the middle of the study; the name is the prefix and
# the type joined by a dash.
expectedDataStreams = [
    [[prefix + '-' + streamType for prefix in ('NIC', 'LSLOutletStreamName')], streamType]
    for streamType in ('Accelerometer', 'EEG', 'Markers', 'Quality')
] + [
    # Oxysoft streams
    [['Oxysoft Event'], 'Event'],
    [['Oxysoft'], 'NIRS'],
    # mouse streams
    [['Mouse'], 'MoCap'],
    [['Mouse'], 'Markers'],
    [['MouseToNIC'], 'Markers'],
    # Kinect streams
    [['EuroMov-Markers-Kinect'], 'Markers'],
    [['EuroMov-Mocap-Kinect'], 'MoCap'],
]

### Check the data streams in one file

In [None]:
import pyxdf

def checkStreamsInXdfFile(data, expected, verbose=False):
    """Compare the streams of a recording with the expected ones and print a report.

    A stream matches an expected entry when its name is one of the names of
    the entry and its type is one of the types of the entry. Each stream is
    attributed to the first entry it matches; a stream that matches no entry
    is reported as unexpected, and an entry matched by no stream is reported
    as missing.

    Parameters
    ----------
    data : list
        Streams of the recording. The name and the type of a stream are read
        from ``stream['info']['name'][0]`` and ``stream['info']['type'][0]``.
    expected : list
        Entries of the form ``[names, types]``, where ``names`` and ``types``
        are each either a single string or a list of accepted strings.
    verbose : bool, optional
        When True, print the found, unexpected and missing streams.
        When False (default), print only the unexpected streams (if any) and
        the missing streams (if any).

    Returns
    -------
    None
        The report is printed.
    """
    def asList(value):
        # `in` applied to a string is a substring test ('Mark' in 'Markers'
        # is True): wrap single strings so that the comparison is an equality
        return [value] if isinstance(value, str) else value

    isFound = [False] * len(expected)
    foundStream = []
    unexpectedStream = []
    for stream in data:
        streamName = stream['info']['name'][0]
        streamType = stream['info']['type'][0]
        for i, entry in enumerate(expected):
            if streamName in asList(entry[0]) and streamType in asList(entry[1]):
                isFound[i] = True
                foundStream.append([streamName, streamType])
                break
        else:
            # the loop ended without a match
            unexpectedStream.append([streamName, streamType])

    def printFoundStreams():
        print('found streams (order as in data):')
        for found in foundStream:
            print('  ', found[0], found[-1])

        print('found streams (order as in expected):')
        for entry, found in zip(expected, isFound):
            if found:
                print('  ', entry[0], entry[-1])

    def printUnexpectedStreams():
        print('unexpected streams (order as in data):')
        for unexpected in unexpectedStream:
            print('  ', unexpected[0], unexpected[-1])
        if not unexpectedStream:
            print('  none')

    def printMissingStreams():
        print('missing streams (order as in expected):')
        for entry, found in zip(expected, isFound):
            if not found:
                print('  ', entry[0], entry[-1])
        if all(isFound):
            print('  none')

    if verbose:
        printFoundStreams()
        printUnexpectedStreams()
        printMissingStreams()
    else:
        # report only the problems
        if unexpectedStream:
            printUnexpectedStreams()
        if not all(isFound):
            printMissingStreams()

# test the function on a single data file
# Several candidate files are listed below: each pair of assignments overrides
# the previous one, so only the last uncommented pair is the one loaded.
# NOTE: `fname` is not used in the rest of this cell; only `fullpath` is.

fname = '017_BatJea_20211214_1_c' + '.xdf' # 11 streams
fullpath = cwd + "/data/NeuArm.lnk/Old_Healthy/017_BatJea_20211214_1_c.xdf"

fname = '018_ChaJea_20220202_1_c' + '.xdf' # 09 streams
fullpath = cwd + "/data/NeuArm.lnk/Old_Healthy/018_ChaJea_20220202_1_c.xdf"

# fname = "010_LapBas_20201124_1_c" + ".xdf" 
# fullpath = cwd + "//data/NeuArm.lnk/Young_Healthy/010_LapBas_20201124_1_c.xdf"

# load the selected file, then report its unexpected and missing streams
# (with verbose=False, nothing is printed when all the streams are as expected)
data, header = pyxdf.load_xdf(fullpath, synchronize_clocks=True, dejitter_timestamps=False, verbose=False)
checkStreamsInXdfFile(data, expectedDataStreams, verbose=False)


## Check that all the data files contain the expected data streams

In [None]:
# Check the streams of every data file listed in the conditions.
# np.atleast_1d keeps the loop valid when the conditions describe a single
# data file (a 0-d array cannot be iterated).
for condition in np.atleast_1d(conditions):
    fullpath = os.path.join(cwd, condition['relativePath'])
    # print the name before loading, so that a file that fails to load can be identified
    print(condition['relativePath'])
    data, header = pyxdf.load_xdf(fullpath, synchronize_clocks=True, dejitter_timestamps=False, verbose=False)
    checkStreamsInXdfFile(data, expectedDataStreams)
