# New Approach:

For both:
- Sentence embeddings
- Constituent embeddings

Generate the embedding by iterating through them, instead of generating it from the whole txt file and chunking after.

In order to get the right chunking:
- get the metadata format from a normal analysis,
- get the frontiers of constituents / sentences from it, and generate the embeddings from there

## Testing metadata

In [1]:
from dataset import read_raw, get_subjects, get_path
from utils import decod_xy, mne_events
import mne
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import match_list
import spacy

modality = "auditory"
nlp = spacy.load("fr_core_news_sm")
all_evos = []
all_scores = []
path = get_path(modality)
subjects = get_subjects(path)
runs = 2
epoch_windows = {"word": {"onset_min": -0.3, "onset_max": 1.0, "offset_min": -1.0, "offset_max": 0.3},
                  "constituent": {"offset_min": -2.0, "offset_max": 0.5, "onset_min": -0.5, "onset_max": 2.0},
                  "sentence": {"offset_min": -4.0, "offset_max": 1.0, "onset_min": -1.0, "onset_max": 4.0}}
levels = ('word','constituent','sentence')
starts = ('onset', 'offset')
      
# Iterate on subjects to epochs, and mean later
for subject in subjects[2:3]:
    
    dict_epochs = dict() # DICT containing epochs grouped by conditions (start x level)
    
    # Initialization of the dictionary
    for start in starts: 
            for level in levels:
                epoch_key = f'{level}_{start}'
                dict_epochs[epoch_key] = [] 
                
    # Iterating on runs, building the metadata and re-epoching
    for run in range(1,runs+1):
        raw, meta_, events = read_raw(subject, run, events_return = True, modality=modality)
        meta = meta_.copy()
        
        # Metadata update
        meta['word_onset'] = True
        meta['word_stop'] = meta.start + meta.duration
        meta['sentence_onset'] = meta.word_id == 0
        meta['prev_closing'] = meta['n_closing'].shift(1)
        meta['constituent_onset'] = meta.apply(lambda x: x['prev_closing'] > x['n_closing'] and x['n_closing'] == 1, axis=1)
        meta['constituent_onset'].fillna(False, inplace=True)
        meta.drop('prev_closing', axis=1, inplace=True)
        
        # Adding the sentence stop info
        meta['sentence_id'] = np.cumsum(meta.sentence_onset)
        for s, d in meta.groupby('sentence_id'):
            meta.loc[d.index, 'sent_word_id'] = range(len(d))
            meta.loc[d.index, 'sentence_start'] = d.start.min()
            meta.loc[d.index, 'sentence_stop'] = d.start.max()
            
        # Adding the constituents stop info
        meta['constituent_id'] = np.cumsum(meta.constituent_onset)
        for s, d in meta.groupby('constituent_id'):
            meta.loc[d.index, 'constituent_start'] = d.start.min()
            meta.loc[d.index, 'constituent_stop'] = d.start.max()
            meta.loc[d.index, 'const_word_id'] = range(len(d))
        
        # Adding embeddings info
        meta = add_embeddings(meta, run, 'constituent')
        meta = add_embeddings(meta, run, 'sentence')
        for start in starts: 
            for level in levels:
                # Select only the rows containing the True for the conditions
                # Simplified to only get for the onset: sentence onset epochs, constituent onset epochs,etc
                start = 'onset' # DEBUG
                sel = meta.query(f'{level}_{start}==True')
                assert sel.shape[0] > 10  #
                
                # Do we need to do that ???
                """
                # Matchlist events and meta
                # So that we can epoch now that's we've sliced our metadata
                if modality == 'auditory':
                    word_events = events[events[:, 2] > 1]
                    meg_delta = np.round(np.diff(word_events[:, 0]/raw.info['sfreq']))
                    meta_delta = np.round(np.diff(sel.onset.values))
                    i, j = match_list(meg_delta, meta_delta)

                # For auditory, we match on the time difference between triggers
                elif modality == "visual":

                    i, j = match_list(events[:, 2], sel.wlength)
                    assert len(i) > (0.9 * len(events))
                    assert (events[i, 2] == sel.loc[j].wlength).mean() > 0.95
                sel = sel.reset_index().loc[j]
                # Making sure there is not hidden bug when matching
                assert sel.shape[0] > 0.5 *  (meta.query(f'{level}_onset==True')).shape[0]
                """
                
                # Epoching from the metadata having all onset events: if the start=Offset, the mne events
                # Function will epoch on the offset of each level instead of the onset
                # TODO: add adaptative baseline
                epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
                                     tmin = epoch_windows[f'{level}'][f'{start}_min'],
                                       tmax = epoch_windows[f'{level}'][f'{start}_max'],
                                         event_repeated = 'drop',
                                            preload=True,
                                                baseline=None)
                epoch_key = f'{level}_{start}'

                dict_epochs[epoch_key].append(epochs)
            
    # Once we have the dict of epochs per condition full (epoching for each run for a subject)
    # we can concatenate them, and fix the dev_head             
    for start_ in starts: 
        for level_ in levels:
            start_ = 'onset' # DEBUG
            epoch_key = f'{level_}_{start_}'
            all_epochs_chosen = dict_epochs[epoch_key]
            # Concatenate epochs

            for epo in all_epochs_chosen:
                epo.info["dev_head_t"] = all_epochs_chosen[1].info["dev_head_t"]

            dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


Reading raw files for modality: auditory

 Epoching for run 1, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-01_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 28000 ... 658999 =     28.000 ...   658.999 secs
Ready.
Rea

  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1954 events found
Event IDs: [  1 129]
Reading 0 ... 630999  =      0.000 ...   630.999 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 20 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 20.00 Hz
- Upper transition bandwidth: 5.00 Hz (-6 dB cutoff frequency: 22.50 Hz)
- Filter length: 6601 samples (6.601 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 306 out of 306 | elapsed:    7.6s finished


Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1597 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1597 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
484 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 484 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
155 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 155 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1597 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1597 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
484 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 484 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
155 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 155 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Reading raw files for modality: auditory

 Epoching for run 2, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-02_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 120000 ... 772999 =    120.000 ...   7

  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1769 events found
Event IDs: [  1 129]
Reading 0 ... 652999  =      0.000 ...   652.999 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 20 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 20.00 Hz
- Upper transition bandwidth: 5.00 Hz (-6 dB cutoff frequency: 22.50 Hz)
- Filter length: 6601 samples (6.601 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 306 out of 306 | elapsed:    9.0s finished


Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1765 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1765 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
510 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 510 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
141 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 141 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1765 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1765 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
510 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 510 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
141 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 141 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Adding metadata with 27 columns
6720 matching events found
No baseline correction applied


  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)
  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


Adding metadata with 27 columns
1984 matching events found
No baseline correction applied
Adding metadata with 27 columns
588 matching events found
No baseline correction applied


  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


AttributeError: 'numpy.ndarray' object has no attribute 'info'

## LASER embeddings for sentences (easy)

In [18]:
# Make it a function:
from dataset import get_code_path
from pathlib import Path
import os

def add_embeddings(meta, run, level):
    
    """
    Function made to generate laser embeddings, store them,
    and add them to the metadata 

    Does so for both constituent embeddings, and sentence ones


    """
    # Parse the metadata into constituents / sentences, 
    # and generate txt files for each constituents / sentence
    # So that it can be parsed by LASER
    sentences = []
    current_sentence = []
    
    meta['const_end'] = meta.constituent_onset.shift(-1)
    for index, row in meta.iterrows():

        # Append word to current sentence 
        current_sentence.append(row['word'])

        # Check if end of sentence 
        if level == 'sentence' and row['is_last_word']:
            # Join words into sentence string and append to list
            sentences.append(' '.join(current_sentence)) 
            # Reset current sentence   
            current_sentence = []
            
        if level == 'constituent' and row['const_end']:
            # Join words into sentence string and append to list
            sentences.append(' '.join(current_sentence)) 
            # Reset current sentence   
            current_sentence = []

    # Loop through sentences 
    for i, sentence in enumerate(sentences):
        # Get sentence number
        sentence_num = i + 1

        # Create file name
        file_name = f'./embeds/txt/run{run}_{level}_{sentence_num}.txt'

        # Open text file 
        with open(file_name, 'w') as f:
            # Write sentence to file
            f.write(sentence)
            
    # Run LASER using the run number
    path = Path('/home/is153802/github/LASER/tasks/embed')
    %env LASER=/home/is153802/github/LASER

    for i, _ in enumerate(sentences):
    # Get sentence number
        sentence_num = i + 1

        txt_file = f"/home/is153802/code/decoding/local_testing/embeds/txt/run{run}_{level}_{sentence_num}.txt"
        emb_file = f"/home/is153802/code/decoding/local_testing/embeds/emb/run{run}_{level}_{sentence_num}.bin"
        if os.path.exists(emb_file):
            continue
        else:
            !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}
        
    # Get the embeddings from the generated txt file, and add them to metadata
    dim = 1024
    embeddings = {}
    for index, sentence in enumerate(sentences):
        embeds = np.fromfile(
                f"{get_code_path()}/decoding/local_testing/embeds/emb/run{run}_{level}_{index+1}.bin",
                dtype=np.float32,
                count=-1,
                )
        embeds.resize(embeds.shape[0] // dim, dim)
        embeds = embeds.reshape(-1)
        embeddings[index] = embeds
    sent_index = 0
    embed_arrays = []
    for index, row in meta.iterrows():
        embed_arrays.append(embeddings[sent_index])
        # Check if end of sentence 
        if row['is_last_word']:
            sent_index += 1

    meta['embed'] = embed_arrays
    
    return meta


In [None]:
m = add_embeddings(meta,run, 'constituent')

env: LASER=/home/is153802/github/LASER
2023-06-08 14:52:28,328 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:52:28,328 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:52:28,328 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:52:28,858 | INFO | preprocess | SPM processing run2_constituent_1.txt  
2023-06-08 14:52:28,971 | INFO | embed | encoding /tmp/tmpqavadeej/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_1.bin
2023-06-08 14:52:28,986 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:52:32,121 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:52:32,122 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:52:32,122 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:52:32,645 | INFO | preprocess | SPM processing run2_constituent_2.txt  
2023-06

2023-06-08 14:53:20,509 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:53:20,509 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:53:20,509 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:53:21,037 | INFO | preprocess | SPM processing run2_constituent_15.txt  
2023-06-08 14:53:21,149 | INFO | embed | encoding /tmp/tmpkkfcajjy/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_15.bin
2023-06-08 14:53:21,164 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:53:24,299 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:53:24,299 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:53:24,299 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:53:24,867 | INFO | preprocess | SPM processing run2_constituent_16.txt  
2023-06-08 14:53:24,990 | INFO | embed | en

2023-06-08 14:54:13,899 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:54:13,899 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:54:13,899 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:54:14,428 | INFO | preprocess | SPM processing run2_constituent_29.txt  
2023-06-08 14:54:14,547 | INFO | embed | encoding /tmp/tmp98_0a4pr/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_29.bin
2023-06-08 14:54:14,563 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:54:17,703 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:54:17,703 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:54:17,703 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:54:18,232 | INFO | preprocess | SPM processing run2_constituent_30.txt  
2023-06-08 14:54:18,346 | INFO | embed | en

2023-06-08 14:55:07,224 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:55:07,224 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:55:07,224 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:55:07,778 | INFO | preprocess | SPM processing run2_constituent_43.txt  
2023-06-08 14:55:07,890 | INFO | embed | encoding /tmp/tmp1z9vg3y5/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_43.bin
2023-06-08 14:55:07,902 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:55:11,170 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:55:11,170 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:55:11,170 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:55:11,705 | INFO | preprocess | SPM processing run2_constituent_44.txt  
2023-06-08 14:55:11,823 | INFO | embed | en

2023-06-08 14:55:59,684 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:55:59,684 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:55:59,684 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:56:00,215 | INFO | preprocess | SPM processing run2_constituent_57.txt  
2023-06-08 14:56:00,331 | INFO | embed | encoding /tmp/tmphelwagf_/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_57.bin
2023-06-08 14:56:00,350 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:56:03,384 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:56:03,384 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:56:03,385 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:56:03,960 | INFO | preprocess | SPM processing run2_constituent_58.txt  
2023-06-08 14:56:04,079 | INFO | embed | en

2023-06-08 14:56:52,513 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:56:52,513 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:56:52,513 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:56:53,059 | INFO | preprocess | SPM processing run2_constituent_71.txt  
2023-06-08 14:56:53,168 | INFO | embed | encoding /tmp/tmpuqzyvukw/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_71.bin
2023-06-08 14:56:53,181 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:56:56,421 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:56:56,421 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:56:56,421 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:56:56,949 | INFO | preprocess | SPM processing run2_constituent_72.txt  
2023-06-08 14:56:57,064 | INFO | embed | en

2023-06-08 14:57:45,162 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:57:45,162 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:57:45,162 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:57:45,708 | INFO | preprocess | SPM processing run2_constituent_85.txt  
2023-06-08 14:57:45,819 | INFO | embed | encoding /tmp/tmp_1zmbt25/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_85.bin
2023-06-08 14:57:45,838 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:57:48,970 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:57:48,970 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:57:48,970 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:57:49,523 | INFO | preprocess | SPM processing run2_constituent_86.txt  
2023-06-08 14:57:49,635 | INFO | embed | en

2023-06-08 14:58:36,921 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:58:36,921 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:58:36,921 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:58:37,446 | INFO | preprocess | SPM processing run2_constituent_99.txt  
2023-06-08 14:58:37,551 | INFO | embed | encoding /tmp/tmpswyxdf8k/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_99.bin
2023-06-08 14:58:37,565 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:58:40,591 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:58:40,591 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:58:40,591 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:58:41,123 | INFO | preprocess | SPM processing run2_constituent_100.txt  
2023-06-08 14:58:41,231 | INFO | embed | e

2023-06-08 14:59:28,141 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:59:28,141 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:59:28,141 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:59:28,666 | INFO | preprocess | SPM processing run2_constituent_113.txt  
2023-06-08 14:59:28,769 | INFO | embed | encoding /tmp/tmpae3vsnei/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_113.bin
2023-06-08 14:59:28,783 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:59:31,739 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:59:31,739 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:59:31,739 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:59:32,265 | INFO | preprocess | SPM processing run2_constituent_114.txt  
2023-06-08 14:59:32,367 | INFO | embed |

2023-06-08 15:00:20,143 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:00:20,143 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:00:20,143 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:00:20,687 | INFO | preprocess | SPM processing run2_constituent_127.txt  
2023-06-08 15:00:20,797 | INFO | embed | encoding /tmp/tmpyezo8whq/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_127.bin
2023-06-08 15:00:20,817 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:00:24,022 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:00:24,022 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:00:24,022 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:00:24,549 | INFO | preprocess | SPM processing run2_constituent_128.txt  
2023-06-08 15:00:24,654 | INFO | embed |

2023-06-08 15:01:08,621 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:01:11,794 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:01:11,794 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:01:11,794 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:01:12,318 | INFO | preprocess | SPM processing run2_constituent_141.txt  
2023-06-08 15:01:12,423 | INFO | embed | encoding /tmp/tmpqybsn66v/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_141.bin
2023-06-08 15:01:12,439 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:01:15,421 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:01:15,421 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:01:15,421 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:01:15,976 | INFO | preprocess | SPM processing 

2023-06-08 15:02:02,798 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:02:02,798 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:02:02,799 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:02:03,331 | INFO | preprocess | SPM processing run2_constituent_155.txt  
2023-06-08 15:02:03,439 | INFO | embed | encoding /tmp/tmpl6fxigfi/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_155.bin
2023-06-08 15:02:03,450 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:02:06,472 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:02:06,472 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:02:06,472 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:02:06,995 | INFO | preprocess | SPM processing run2_constituent_156.txt  
2023-06-08 15:02:07,100 | INFO | embed |

2023-06-08 15:02:54,020 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:02:54,020 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:02:54,020 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:02:54,543 | INFO | preprocess | SPM processing run2_constituent_169.txt  
2023-06-08 15:02:54,652 | INFO | embed | encoding /tmp/tmpbqnq6smc/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_169.bin
2023-06-08 15:02:54,662 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:02:57,616 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:02:57,616 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:02:57,616 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:02:58,152 | INFO | preprocess | SPM processing run2_constituent_170.txt  
2023-06-08 15:02:58,262 | INFO | embed |

2023-06-08 15:03:41,820 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:03:45,051 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:03:45,052 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:03:45,052 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:03:45,588 | INFO | preprocess | SPM processing run2_constituent_183.txt  
2023-06-08 15:03:45,692 | INFO | embed | encoding /tmp/tmp3hsnf8sc/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_183.bin
2023-06-08 15:03:45,703 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:03:48,627 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:03:48,627 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:03:48,627 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:03:49,164 | INFO | preprocess | SPM processing 

2023-06-08 15:04:36,761 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:04:36,761 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:04:36,761 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:04:37,289 | INFO | preprocess | SPM processing run2_constituent_197.txt  
2023-06-08 15:04:37,399 | INFO | embed | encoding /tmp/tmpw6m2tf82/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_constituent_197.bin
2023-06-08 15:04:37,413 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 15:04:40,347 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 15:04:40,348 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 15:04:40,348 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 15:04:40,883 | INFO | preprocess | SPM processing run2_constituent_198.txt  
2023-06-08 15:04:40,986 | INFO | embed |

# Previous Approach

In [None]:
# First: generate the run{i}.txt file to input to LASER

# What was done previously: chunk the txt file raw by actual sentence (based on ., ?, !, etc..)
# Problem: the metadata in epochs (sentence_end calculated using the word onset difference) doesn't match, as there are
# Offsets that happen sometimes not at the end of sentences

# Solution: temporary: generate the line chunking for LASER by word onset difference from the metadata file
# Final: it will only work for read modality: for audio, an option could be to replicate the metadata file
# => supposing the shape of both metadata files are the same, we can add the sentence_end column to the audio one


In [None]:
import pandas as pd

# Open the events files to get the metadata, and then generate the txt file from there
for run in np.arange(1,10):

    file = f'/home/co/data/BIDS_lecture/sub-{sub}/ses-01/meg/sub-{sub}_ses-01_task-read_run-0{run}_events.tsv'



    # Load the TSV file into a pandas DataFrame
    df = pd.read_csv(file, sep='\t')

    # Keep track of the previous onset value
    prev_onset = None

    # Open the output file for writing
    with open(f'run{run}.txt', 'w') as output_file:

        # Loop through each row in the DataFrame
        for i, row in df.iterrows():

            # Get the onset value for this row
            onset = row['onset']

            # If this is the first row, or the onset difference with the previous row is less than 0.7, append the current column to the output
            if ((row.word).__contains__(".")
                or (row.word).__contains__("?")
                or (row.word).__contains__("!")):
                output_file.write(row['word'] +'\n')
                

            # Otherwise, start a new line in the output file
            else:
                
                output_file.write(row['word'] + ' ')

            # Remember the onset value for the next iteration
            prev_onset = onset


In [None]:
from pathlib import Path
import numpy as np
path = Path('/home/is153802/github/LASER/tasks/embed')

In [None]:
%env LASER=/home/is153802/github/LASER

In [None]:
CHAPTERS = {
        1: "1-3",
        2: "4-6",
        3: "7-9",
        4: "10-12",
        5: "13-14",
        6: "15-19",
        7: "20-22",
        8: "23-25",
        9: "26-27",
    }

for run in np.arange(1,10):
    ch = CHAPTERS[run]
    txt_file = f"/home/is153802/code/data/txt_laser/run{run}.txt"
    emb_file = f"/home/is153802/code/data/laser_embeddings/emb_{ch}.bin"
    !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}
