Script to fix line duplication error for Matthew Lu's EEG data collection for 2021 Honours. 

This script analyses the .vmrk files in the attached folder.


Algorithm to be implemented according to Matt's manual examination of the data:

For subject 1 (similarity_01.vmrk)
1. Triggers "S 29", "S 30", "S 31" should not be in the data at all and should be deleted.
2. Fixation trigger S 10 should only be followed by the colour triggers S 1 to S 9 (e.g., S 10 → S 7). Anything else should be deleted.
3. Response screen trigger S 11 should only be followed by one rating trigger (e.g., S 11 → S 15).
   * Sometimes S 11 is followed by one extra rating trigger (e.g., S 11 → S 12 → S 15), sometimes two extra rating triggers (e.g., S 11 → S 12 → S 15 → S 19).
   * In either case, it seems like the correct rating trigger is always the one that comes last. So S 15 and S 19 would be the correct triggers in the above example.
4. There are extra triggers for the colours (S 1 to S 9). These seem to always come before an actual colour trigger (e.g., S 6 → S 3). So if there are two consecutive colour triggers, the first one is the spurious one and should be deleted.

And these are for subjects 2 to 8 (similarity_02.vmrk - similarity_08.vmrk).

1. First, if any of the lines contain a trigger that isn't S 1 to S 9 or S 11 to S 19, then they should be deleted (run this step first).
2. If there are multiple consecutive lines that are within 10 samples of each other, then only the last line should be kept and the others deleted.
 * E.g., S 6, 208776 → S 7, 208778. Delete S 6.
 * E.g., S 18, 209900 → S 19, 209901 → S 15, 209902. Delete S 18 and S 19, keep S 15

Written by Ariel Zeleznikow-Johnston on 2021-Aug-13

ariel.zeleznikow-johnston@monash.edu





In [1]:
# libraries
import os
import re

In [2]:
# folder with data
folder = 'vmrk_raw/'

In [3]:
# list of acceptable stimuli
# Trigger labels kept for subject 1: 'S  1' .. 'S 19' (number right-aligned
# in a two-character field, matching the labels in the marker files).
acceptable_subj_1 = ['S {:2d}'.format(number) for number in range(1, 20)]

# Trigger labels kept for the other subjects: 'S  1' .. 'S  9' and
# 'S 11' .. 'S 19' ('S 10' is deliberately absent).
acceptable_subj_2_9 = [
    'S {:2d}'.format(number) for number in range(1, 20) if number != 10
]

# Colour trigger labels ('S  1' .. 'S  9'): the only triggers allowed to
# follow the 'S 10' trigger.
after_s10 = ['S {:2d}'.format(number) for number in range(1, 10)]

In [4]:
def load_vmrk(filename,diagnose=False):
    """Read one file from the data folder and return its lines.

    filename -- name of the file inside `folder`.
    diagnose -- when true, print up to the first 100 lines for inspection.

    Returns the list produced by readlines() (line endings are kept).
    """
    # 'with' guarantees the handle is closed even if reading fails
    with open(os.path.join(folder, filename), 'r') as file:
        lines = file.readlines()
    # testing
    if diagnose:
        # slicing copes with files that have fewer than 100 lines
        for line in lines[:100]:
            print(line)
    return lines


In [5]:
def get_files():
    """Get a list of all the vmrk files in the data folder.

    Only names ending in '.vmrk' (case-insensitive) are returned, so other
    directory entries are never handed to the correction code. The list is
    sorted so the processing order does not depend on the file system.
    """
    return sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith('.vmrk')
    )

In [110]:
def write_corrected(filename):
    """Write a corrected copy of one marker file.

    The file is read from `folder`, its marker lines are filtered, cleaned
    and renumbered, and the result is written to
    'corrected_' + folder as 'corrected_' + filename, preceded by the
    original header lines.

    filename -- name of the file inside `folder`; 'similarity_01.vmrk'
                gets the subject 1 cleaning, every other name gets the
                subject 2..n cleaning.
    """
    # get the data from the current file
    lines = load_vmrk(filename)
    data = get_data(lines)

    # Clean everything before touching the output, so a failure here does
    # not leave a half-written file behind.
    if filename == 'similarity_01.vmrk': # subject 1 specific stuff
        data = subj_1_clean(initial_clean(data, acceptable_subj_1))
    else: # everyone else
        data = subj_2_n_clean(initial_clean(data, acceptable_subj_2_9))
    data = reindex(data)

    # make a new folder (no error if it already exists)
    out_folder = 'corrected_' + folder
    os.makedirs(out_folder, exist_ok=True)

    # write a new file; 'with' closes it even if a write fails
    out_name = 'corrected_' + filename
    with open(os.path.join(out_folder, out_name), 'w') as out_file:
        # write the header material
        write_header(lines, out_file)
        # write the data
        out_file.writelines(data)

In [8]:
def find_start(lines):
    """Return the 1-based position of the first line starting with 'Mk3'.

    Returns None when no line starts with 'Mk3'.
    """
    for position, text in enumerate(lines, start=1):
        if text.startswith('Mk3'):
            return position
    return None
        

In [9]:
def write_header(lines,out_file):
    """Copy every line that precedes the first 'Mk3' line to out_file.

    lines    -- all lines of the source file.
    out_file -- object with a write() method; receives one write per line.
    """
    header_length = find_start(lines) - 1
    for header_line in lines[:header_length]:
        out_file.write(header_line)

In [10]:
def get_data(lines):
    """Return the lines from the first 'Mk3' line to the end of the file."""
    first_data_index = find_start(lines) - 1  # find_start is 1-based
    return lines[first_data_index:]

In [75]:
def get_stimulus(line):
    """Return the text between 'lus,' and the next comma in a line.

    For a marker line such as 'Mk5=Stimulus,S 11,23458,1,0' this is the
    trigger label 'S 11'. Returns None when the pattern does not occur.
    """
    found = re.search('lus,(.*?),',line)
    return found.group(1) if found else None

In [93]:
def get_sample(line):
    """Return the third comma-separated field of a line as an int.

    For a marker line such as 'Mk5=Stimulus,S 11,23458,1,0' this is 23458.
    """
    fields = line.split(',')
    return int(fields[2])

In [12]:
def initial_clean(data,acceptable):
    """Keep only the lines whose trigger label is in `acceptable`.

    data       -- marker lines.
    acceptable -- collection of permitted trigger labels.

    Returns a new list; the order of the kept lines is unchanged.
    """
    return [entry for entry in data if get_stimulus(entry) in acceptable]

In [61]:
def _keep_colour_after_fixation(data):
    """After each 'S 10' line keep only the colour trigger that follows it."""
    cleaned = []
    total = len(data)
    i = 0
    while i < total:
        line = data[i]
        cleaned.append(line)
        i += 1
        if get_stimulus(line) != 'S 10':
            continue
        # Drop everything between the 'S 10' and the first colour trigger.
        while i < total and get_stimulus(data[i]) not in after_s10:
            i += 1
        # In a run of consecutive colour triggers the earlier ones are
        # spurious, so move to the last one of the run.
        while i + 1 < total and get_stimulus(data[i + 1]) in after_s10:
            i += 1
        if i < total:  # a colour was found before the data ran out
            cleaned.append(data[i])
            i += 1
    return cleaned


def _keep_last_response(data):
    """After each 'S 11' line keep only the last line before the next 'S 10'."""
    cleaned = []
    total = len(data)
    i = 0
    while i < total:
        line = data[i]
        cleaned.append(line)
        i += 1
        if get_stimulus(line) != 'S 11':
            continue
        first_candidate = i
        # Walk to the next 'S 10' (or the end of the data).
        while i < total and get_stimulus(data[i]) != 'S 10':
            i += 1
        # Only the last line of the span is the real rating. If the span
        # is empty there is nothing to add (and 'S 11' must not be
        # duplicated).
        if i > first_candidate:
            cleaned.append(data[i - 1])
        # The 'S 10' itself, if any, is handled by the next iteration.
    return cleaned


def subj_1_clean(data):
    """Remove subject 1 specific issues and return the cleaned lines.

    Two passes are made over the marker lines:

    1. After each 'S 10', everything up to the first colour trigger
       (a label in `after_s10`) is dropped, and of several consecutive
       colour triggers only the last is kept.
    2. After each 'S 11', only the last line before the next 'S 10' (or
       before the end of the data) is kept.

    data -- marker lines, already filtered by initial_clean.

    Returns a new list; `data` is not modified. Data that ends right after
    an 'S 10' or an 'S 11' is handled without error.
    """
    with_single_colour = _keep_colour_after_fixation(data)
    return _keep_last_response(with_single_colour)

In [97]:
def subj_2_n_clean(data, max_gap=10):
    """Remove subject 2...n specific issues.

    If there are multiple consecutive lines that are within `max_gap`
    samples of each other, then only the last line is kept and the others
    are deleted.

    data    -- marker lines, already filtered by initial_clean.
    max_gap -- a line is dropped when the next line's sample position is
               at most this many samples later (default 10).

    Returns a new list. Empty input gives an empty list and a single line
    is returned unchanged.
    """
    out_data = []
    # A line survives only if the following line is far enough away.
    for current, following in zip(data, data[1:]):
        if get_sample(following) - get_sample(current) > max_gap:
            out_data.append(current)
    # The final line has no successor, so it is always kept.
    if data:
        out_data.append(data[-1])
    return out_data

In [108]:
def reindex(data, first_index=3):
    """Renumber marker lines consecutively.

    Each line's 'Mk<n>=' prefix is replaced so that the numbers run
    first_index, first_index + 1, ... in list order.

    data        -- marker lines of the form '<key>=<payload>'.
    first_index -- number given to the first line (default 3, because the
                   data starts on Mk3).

    Returns a new list of lines. Raises IndexError for a line that has no
    '='.
    """
    out_data = []
    for number, line in enumerate(data, start=first_index):
        # Split on the first '=' only, so a payload that itself contains
        # '=' is kept whole.
        payload = line.split('=', 1)[1]
        out_data.append('Mk' + str(number) + '=' + payload)
    return out_data

In [111]:
# run the script: write a corrected copy of every entry returned by get_files()
files = get_files()
for file in files:
    # each name is looked up inside `folder` by write_corrected
    write_corrected(file)

In [67]:
# Run the correction for subject 1's file only
write_corrected('similarity_01.vmrk')

In [94]:
# Scratch check: load one file and try the parsing helpers on a single line
test_lines = load_vmrk('similarity_02.vmrk')
find_start(test_lines)  # return value is not used here
test_line = test_lines[15]  # hard-coded line index chosen for inspection
print(test_line)
print(get_stimulus(test_line))  # trigger label parsed from the line
print(get_sample(test_line))  # sample position parsed from the line

Mk5=Stimulus,S 11,23458,1,0

S 11
23458


In [85]:
# Scratch check: compare the number of marker lines before and after initial_clean
test_data = get_data(test_lines)
print(len(test_data))
print()
print(len(initial_clean(test_data,acceptable_subj_1)))
# NOTE(review): in file order test_lines comes from similarity_02.vmrk, yet the
# subject 1 list is used and the result is named subj_1_test — confirm intended
subj_1_test = initial_clean(test_data,acceptable_subj_1)

5889

5584


In [86]:
find_start(test_lines)  # return value is not used here
# Show the first data line before and after initial_clean
print(test_data[0])
print(subj_1_test[0])

Mk3=Stimulus,S  5,22650,1,0

Mk3=Stimulus,S  5,22650,1,0



In [100]:
# Number of lines kept by initial_clean
len(subj_1_test)

5584

In [109]:
# Apply the subject 2..n de-duplication to the filtered lines, then renumber them
reindex(subj_2_n_clean(subj_1_test))

['Mk3=Stimulus,S  5,22650,1,0\n',
 'Mk4=Stimulus,S  7,23308,1,0\n',
 'Mk5=Stimulus,S 11,23458,1,0\n',
 'Mk6=Stimulus,S 14,24633,1,0\n',
 'Mk7=Stimulus,S  2,26967,1,0\n',
 'Mk8=Stimulus,S 11,27117,1,0\n',
 'Mk9=Stimulus,S 18,28108,1,0\n',
 'Mk10=Stimulus,S  4,29317,1,0\n',
 'Mk11=Stimulus,S 11,29467,1,0\n',
 'Mk12=Stimulus,S 18,30484,1,0\n',
 'Mk13=Stimulus,S  7,31750,1,0\n',
 'Mk14=Stimulus,S 11,31900,1,0\n',
 'Mk15=Stimulus,S 15,32867,1,0\n',
 'Mk16=Stimulus,S  7,34067,1,0\n',
 'Mk17=Stimulus,S 11,34217,1,0\n',
 'Mk18=Stimulus,S 12,34942,1,0\n',
 'Mk19=Stimulus,S  3,37309,1,0\n',
 'Mk20=Stimulus,S 11,37459,1,0\n',
 'Mk21=Stimulus,S 17,38217,1,0\n',
 'Mk22=Stimulus,S  1,39425,1,0\n',
 'Mk23=Stimulus,S 11,39575,1,0\n',
 'Mk24=Stimulus,S 18,40509,1,0\n',
 'Mk25=Stimulus,S  2,41634,1,0\n',
 'Mk26=Stimulus,S 11,41784,1,0\n',
 'Mk27=Stimulus,S 14,42350,1,0\n',
 'Mk28=Stimulus,S  6,44834,1,0\n',
 'Mk29=Stimulus,S 11,44984,1,0\n',
 'Mk30=Stimulus,S 19,45767,1,0\n',
 'Mk31=Stimulus,S  8,46834,