# SEEG Brainstorm Challenge 2024

Thank you for joining us for the Brainstorm Challenge 2024! This script is designed to re-organize the data and save each participant's data into a CSV file. We developed this script to give our attendees direct access to the data, rather than needing to navigate the necessarily complex file structure of the raw data. As such, the result of this script - the CSV files - can be used directly.  

## 0. Setup

### Import Modules
First, we will need to import different modules for this script.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import itertools
import pickle
from tqdm import tqdm
import bz2
import _pickle as cPickle

### Define Functions

We will load some basic functions that Bryan Zheng wrote to read and organize the data.

In [None]:
def load_mat(fmat):
    """ read a MATLAB .mat file with scipy and return its contents

        structs are returned as mat_struct objects (struct_as_record=False)
        and unit dimensions are squeezed out (squeeze_me=True)
    """
    loader_options = {'struct_as_record': False, 'squeeze_me': True}
    return sio.loadmat(fmat, **loader_options)

def mat_to_dict(mat):
    """ convert mat_struct object to a dictionary recursively

        mat: a scipy mat_struct; its field names are read from mat._fieldnames
        returns a plain dictionary mapping each field name to its value, with
        nested mat_struct values converted to dictionaries as well
    """
    # named `converted` so the builtin `dict` is not shadowed
    converted = {}

    for field in mat._fieldnames:
        val = vars(mat)[field]
        if isinstance(val, sio.matlab.mat_struct):
            # nested struct: convert it too so the result holds no mat_struct
            converted[field] = mat_to_dict(val)
        else:
            converted[field] = val

    return converted

def read_bhv(behavior):
    """ load behavioral data from a .mat file and extract the trial data

        every entry named 'Trial<number>' is kept; other entries are ignored
        returns dictionary mapping the integer trial number to that trial's
        data converted to a plain dictionary
    """
    mat_contents = load_mat(behavior)

    return {
        int(name[5:]): mat_to_dict(entry)
        for name, entry in mat_contents.items()
        if name.startswith('Trial') and name[5:].isdigit()
    }

def load_pkl(path):
    """ open a bz2-compressed pickle file and return the unpickled object """
    # NOTE: unpickling can execute arbitrary code - only load trusted files
    with bz2.open(path, 'rb') as stream:
        return cPickle.load(stream)

def find_trials(events, verbose=False):
    """ finds *all* trials with associated codes in an events file

        events: path to the compressed pickle holding the event codes; the
            loaded object must provide 'signal' (the codes) and 'fs'
        verbose: when True, print a warning for every trial whose last code
            is not 18, plus a summary line

        returns (fs, codes, trials) where trials maps a running trial number
        to the list of codes of that trial; a code whose first element is 9
        starts a new trial, and key 0 collects any codes seen before the
        first 9
    """
    events_signal = load_pkl(events)
    codes = events_signal['signal']
    fs = events_signal['fs']

    trials = {0: []}

    count = 0
    for c in codes:
        if c[0] == 9:
            # A new trial starts here, so the trial that just finished
            # (number `count`) is complete and can be checked. The warning
            # names that finished trial, not the one about to start.
            if verbose and count >= 1 and trials[count][-1][0] != 18:
                print('WARNING: parsed trial {} does not end in 18'.format(count))
            count += 1
            trials[count] = [c]
        else:
            trials[count].append(c)

    if verbose:
        print('found {} trials with {} codes'.format(count, len(codes)))

    return fs, codes, trials

def ranges(i):
    """ collapse a sequence of integers into (first, last) pairs, one pair
        per run of consecutive values - used for compact print() output
    """
    # within a run of consecutive integers, value minus position is constant
    for _, run in itertools.groupby(enumerate(i), key=lambda item: item[1] - item[0]):
        members = [value for _, value in run]
        yield members[0], members[-1]

# this function does the heavy lifting: it lines up the trials of the behavioral file with the trials found in the neural events
def match_trials(bhv, codes, trials, fs, margin=0.1, verbose=False):
    """ matches codes from behavioral data with neural events - returns dictionary of trials

        bhv: behavioral trials; each needs ['BehavioralCodes']['CodeTimes']
        codes: accepted for interface compatibility, not used here
        trials: neural trials indexed 0..len(trials)-1, each a list of
            entries whose element 0 is the code and element 1 the sample
        fs: sampling rate used to convert neural samples to milliseconds
        margin: maximum relative difference in trial length for a match
        verbose: print per-trial alignment details

        a neural trial is a candidate only if it has more than one entry,
        starts with code 9 and contains code 18; its length is measured from
        the first entry to the first 18. behavioral trials are matched in
        order, and the search for the next one resumes right after the most
        recent match
    """
    matches = {}
    cursor = 0      # next neural trial to examine
    resume_at = 0   # position right after the most recent successful match
    n_trials = len(trials)

    for tr in bhv:
        code_times = bhv[tr]['BehavioralCodes']['CodeTimes']
        bhv_tr_len = np.asarray(code_times[-1] - code_times[0], dtype=int)

        found = False
        while cursor < n_trials and not found:
            candidate = trials[cursor]
            cursor += 1

            # skip trials that cannot be valid
            if len(candidate) <= 1:
                continue
            tr_codes = [entry[0] for entry in candidate]
            if candidate[0][0] != 9 or 18 not in tr_codes:
                continue

            tr_start = candidate[0][1]
            tr_end = candidate[tr_codes.index(18)][1]
            nrl_tr_len = (tr_end - tr_start) / fs * 1000 # convert to milliseconds
            diff = 1 - nrl_tr_len/bhv_tr_len

            if abs(diff) < margin:
                if verbose:
                    print('Aligned Trial {} (acc: {:.3f}%)'.format(tr, diff*100))
                matches[tr] = candidate
                found = True
                resume_at = cursor

        # rewind so an unmatched behavioral trial does not consume neural trials
        cursor = resume_at

        if verbose and not found:
            print('could not find match for {}'.format(tr))

    labels = [
        '{}'.format(first) if first == last else '{}-{}'.format(first, last)
        for first, last in ranges(matches.keys())
    ]
    print('found matches for trials {}'.format(', '.join(labels)))

    return matches

## 1. Task and Data Description

First, let's discuss the task. Participants underwent a memory task where they saw a series of 5-second movie clips. Each clip was accompanied by a circle that varied in color and location (the circles appeared in a location around the movie clip). This is the encoding phase. The participants then saw each clip again and were asked to recall the properties of the circle (color, location) that had accompanied the clip. This is the same-day recall phase. The recall was then repeated the next day, which is the next-day recall phase. Together, these three phases make up one full run of the task.

The data can be found in the following directory on Oscar:

In [None]:
#Root directory on Oscar that holds one folder per participant recording
data_path = os.path.abspath('/oscar/data/brainstorm-ws/seeg_data/Memory Task Data/Epilepsy/Monitoring/')

This directory contains a lot of folders corresponding to different participant data. The folders will look something like this: `2020-11-12_e0010GP_00` where `2020-11-12` corresponds to the date the data were recorded, `e0010GP` corresponds to the participant ID and `00` corresponds to the recording number of that participant (so, the first set of the participant's data is `00`, the second set is `01` and so forth). 

Let's see what folders exist in this directory:

In [None]:
#Show the entries of the data directory, leaving out CSV and notebook files
hidden_markers = ('.csv', '.ipynb')
[entry for entry in os.listdir(data_path) if not any(marker in entry for marker in hidden_markers)]

Remember that fully conducting the memory experiment takes 2 days, so to get a full dataset from one participant, we will need two of these folders. For example, `2020-11-12_e0010GP_00` corresponds to participant `e0010GP`'s first day of recording, which will contain the encoding phase and the same-day recall phase, while `2020-11-13_e0010GP_01` corresponds to participant `e0010GP`'s second day of recording, which contains the next-day recall phase.

You might have noticed, however, that some participants have more than just two folders. This can occur when the participants conducted the task more than one time. For example `2020-11-13_e0010GP_02` corresponds to participant `e0010GP`'s third day of recording, which contains the encoding phase and the same-day recall phase of this participant conducting the task a second time. Although you may use this data if you like, this script will only be organizing data from each participant the first time they conducted the task. 

Unfortunately, for various reasons, the post-fixes (`00`, `01`, `02`, ...) are not always consistent: `00` does not always indicate the encoding and same-day recall phase of the first time the task was conducted, and `01` does not always indicate the corresponding next-day recall phase. So, instead of relying on these post-fixes, we will here manually determine which folders correspond to the first and second day of the first time each participant conducted the task.

In [None]:
#Participants e0014VG, e0016YR, and e0022ZG do not have next-day recall, so will be ignored in this script. 
#They do all have encoding and same day recall, so they may still be useful to you if you would like to include them.

#Each entry is [day 0 recording, day 1 recording], written as "<participant ID>_<recording number>"
participants_of_interest = [
    ['e0010GP_00', 'e0010GP_01'],
    ['e0011XQ_00', 'e0011XQ_01'],
    ['e0017MC_00', 'e0017MC_01'],
    ['e0019VQ_00', 'e0019VQ_01'],
    ['e0020JA_00', 'e0020JA_01'],
    ['e0024DV_00', 'e0024DV_01'],
    ['e0013LW_02', 'e0013LW_03'],
    ['e0015TJ_01', 'e0015TJ_02']]

participants_of_interest_flattened = [participant_filename for participant_filenames in participants_of_interest for participant_filename in participant_filenames]

#Look up every folder by its "<participant ID>_<recording number>" suffix.
#os.listdir returns entries in arbitrary order, so the pairs are built from the list above
#instead of from the listing order; this keeps each pair as [day 0 folder, day 1 folder] of one participant.
folder_by_suffix = {'_'.join(filename.split('_')[-2:]): filename for filename in sorted(os.listdir(data_path))}

#Stop early with a clear message if an expected recording folder is absent
missing_recordings = [suffix for suffix in participants_of_interest_flattened if suffix not in folder_by_suffix]
if missing_recordings:
    raise FileNotFoundError(f"No recording folder found in {data_path} for: {missing_recordings}")

participant_fullnames = [[folder_by_suffix[suffix] for suffix in participant_filenames] for participant_filenames in participants_of_interest]
participant_fullnames_flattened = [participant_fullname for participant_pair in participant_fullnames for participant_fullname in participant_pair]

for participant_fullname in participant_fullnames:
    print(participant_fullname)

## 2. Re-Organizing the Data

The following code is the meat of this script. It will extract data for each participant, dependent on the list provided above, re-organize it, and save it into a CSV file. 

In [None]:
#Dictionary that collects every participant's dataframe, keyed by participant ID
all_data = {}

#Each row stores one 5-second clip; 5120 time columns corresponds to 5 seconds at 1024 samples per second
n_samples = 5120
times = [f"Time{str(datapoint).zfill(4)}" for datapoint in range(n_samples)]
columns = ['participant_id','Phase','Condition','Electrode']+times

#Iterates through all participants
for participant_filenames in participant_fullnames: 
    
    #Rows are collected in a list and turned into a dataframe once per participant,
    #because growing a dataframe one row at a time gets slower with every row added
    participant_rows = []
    
    #Iterates through Day 0 and Day 1. 
    #Day 0 includes the encoding phase and the same-day recall phase while Day 1 includes the next-day recall phase.
    for day, participant_filename in enumerate(participant_filenames):
        print(f"\nParticipant Filename: {participant_filename}")

        #Determine metadata and filenames
        participant_id = participant_filename.split('_')[-2]
        participant_dir = f"{data_path}/{participant_filename}"
        participant_neural_filenames = [p_file for p_file in os.listdir(participant_dir) if '.pbz2' in p_file and 'Events.pbz2' not in p_file]
        participant_electrodes = [p_file.split('-')[-1].replace('.pbz2','') for p_file in participant_neural_filenames]

        #Determine the phases.
        #Day 0 contains Phases A and B, which are encoding and same-day recall, respectively
        #Day 1 contains Phase A (which is different from Day 0's phase A), which is next-day recall.
        if day == 0:
            phases = ['A','B']
        elif day == 1 and participant_id == 'e0015TJ':
            phases = ['C'] #One participant has a different naming convention
        else:
            phases = ['A']

        #The events file is shared by all phases of a recording, so it is parsed once per day
        participant_events = f"{participant_dir}/{participant_filename[:-2]}Events.pbz2"
        fs, codes, trials = find_trials(participant_events)
        
        #Iterate through the phases we just defined
        for phase in phases:
            
            #Determine filenames
            participant_beh = f"{participant_dir}/{participant_filename[:-2]}{phase}.mat"
            
            #Create a label to better signify the current phase
            if day == 0 and phase == 'A':
                phase_dict = 'Encoding'
            elif day == 0 and phase == 'B':
                phase_dict = 'SameDayRecall'
            elif day == 1:
                phase_dict = 'NextDayRecall'

            #Print report
            print(f"\nID: {participant_id}")
            print(f"Day: {day}")
            print(f"Phase: {phase_dict}")

            #Load and navigate the data using Bryan Zheng's functions
            bhv = read_bhv(participant_beh)
            matched_trials = match_trials(bhv, codes, trials, fs)

            #Behavioral trials without a neural match cannot be segmented, so they are reported and skipped
            unmatched_trials = [trial for trial in bhv if trial not in matched_trials]
            if unmatched_trials:
                print(f"WARNING: skipping {len(unmatched_trials)} trial(s) without a neural match: {unmatched_trials}")

            #Iterate through electrodes
            for electrode, current_electrode in zip(tqdm(participant_neural_filenames), participant_electrodes):

                #Load current electrodes data using Bryan Zheng's function
                participant_neural = load_pkl(f"{participant_dir}/{electrode}")
                signal = participant_neural['signal']
                clip_samples = int(5*participant_neural['fs'])
                
                #Iterate through the matched trials (same order as the behavioral file)
                for trial, trial_codes in matched_trials.items():
                    
                    #Determine the video clip being watched (video clip labels range from 1 to 30)
                    trial_condition = bhv[trial]['Condition']

                    #Determine the start and end time of the trial (first and last code)
                    start_time = trial_codes[0][1]
                    end_time = trial_codes[-1][1]

                    #Segment the data to only contain the video clip
                    trial_neural = list(signal[start_time:end_time][:clip_samples])

                    #Every row must fill exactly the time columns defined above
                    if len(trial_neural) > n_samples:
                        raise ValueError(f"Trial {trial} of electrode {current_electrode} has {len(trial_neural)} samples but only {n_samples} time columns exist")
                    
                    #Trials shorter than the clip length are padded with NaN so the row still fits
                    trial_neural += [np.nan] * (n_samples - len(trial_neural))
                    
                    #Add metadata and data to the participant-specific rows to later be saved
                    participant_rows.append([participant_id, phase_dict, trial_condition, current_electrode]+trial_neural)

    #Create the participant specific dataframe
    participant_data = pd.DataFrame(participant_rows, columns=columns)

    #Save participant data to a csv file
    participant_data.to_csv(f"{data_path}/{participant_id}_combined.csv", index=False)
    
    #Add participant data to the dictionary that contains all participants
    all_data[participant_id] = participant_data
    