# New Approach:

For both:
- Sentence embeddings
- Constituent embeddings

Generate the embedding by iterating through them, instead of generating it from the whole txt file and chunking after.

In order to get the right chunking:
- get the metadata format from a normal analysis,
- get the frontiers of constituents / sentences from it, and generate the embeddings from there

## Testing metadata

In [58]:
from dataset import read_raw, get_subjects, get_path
from utils import decod_xy, mne_events
import mne
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import match_list
import spacy

modality = "auditory"
nlp = spacy.load("fr_core_news_sm")
all_evos = []
all_scores = []
path = get_path(modality)
subjects = get_subjects(path)
runs = 3
epoch_windows = {"word": {"onset_min": -0.3, "onset_max": 1.0, "offset_min": -1.0, "offset_max": 0.3},
                  "constituent": {"offset_min": -2.0, "offset_max": 0.5, "onset_min": -0.5, "onset_max": 2.0},
                  "sentence": {"offset_min": -4.0, "offset_max": 1.0, "onset_min": -1.0, "onset_max": 4.0}}
levels = ('word','constituent','sentence')
starts = ('onset', 'offset')
      
# Iterate on subjects to epochs, and mean later
for subject in subjects[2:4]:
    
    dict_epochs = dict() # DICT containing epochs grouped by conditions (start x level)
    
    # Initialization of the dictionary
    for start in starts: 
            for level in levels:
                epoch_key = f'{level}_{start}'
                dict_epochs[epoch_key] = [] 
                
    # Iterating on runs, building the metadata and re-epoching
    for run in range(1,runs+1):
        raw, meta_, events = read_raw(subject, run, events_return = True, modality=modality)
        meta = meta_.copy()
        
        # Metadata update
        meta['word_onset'] = True
        meta['word_stop'] = meta.start + meta.duration
        meta['sentence_onset'] = meta.word_id == 0
        meta['prev_closing'] = meta['n_closing'].shift(1)
        meta['constituent_onset'] = meta.apply(lambda x: x['prev_closing'] > x['n_closing'] and x['n_closing'] == 1, axis=1)
        meta['constituent_onset'].fillna(False, inplace=True)
        meta.drop('prev_closing', axis=1, inplace=True)
        
        # Adding the sentence stop info
        meta['sentence_id'] = np.cumsum(meta.sentence_onset)
        for s, d in meta.groupby('sentence_id'):
            meta.loc[d.index, 'sent_word_id'] = range(len(d))
            meta.loc[d.index, 'sentence_start'] = d.start.min()
            meta.loc[d.index, 'sentence_stop'] = d.start.max()
            
        # Adding the constituents stop info
        meta['constituent_id'] = np.cumsum(meta.constituent_onset)
        for s, d in meta.groupby('constituent_id'):
            meta.loc[d.index, 'constituent_start'] = d.start.min()
            meta.loc[d.index, 'constituent_stop'] = d.start.max()
            meta.loc[d.index, 'const_word_id'] = range(len(d))

        for start in starts: 
            for level in levels:
                # Select only the rows containing the True for the conditions
                # Simplified to only get for the onset: sentence onset epochs, constituent onset epochs,etc
                start = 'onset' # DEBUG
                sel = meta.query(f'{level}_{start}==True')
                assert sel.shape[0] > 10  #
                
                # Do we need to do that ???
                """
                # Matchlist events and meta
                # So that we can epoch now that's we've sliced our metadata
                if modality == 'auditory':
                    word_events = events[events[:, 2] > 1]
                    meg_delta = np.round(np.diff(word_events[:, 0]/raw.info['sfreq']))
                    meta_delta = np.round(np.diff(sel.onset.values))
                    i, j = match_list(meg_delta, meta_delta)

                # For auditory, we match on the time difference between triggers
                elif modality == "visual":

                    i, j = match_list(events[:, 2], sel.wlength)
                    assert len(i) > (0.9 * len(events))
                    assert (events[i, 2] == sel.loc[j].wlength).mean() > 0.95
                sel = sel.reset_index().loc[j]
                # Making sure there is not hidden bug when matching
                assert sel.shape[0] > 0.5 *  (meta.query(f'{level}_onset==True')).shape[0]
                """
                
                # Epoching from the metadata having all onset events: if the start=Offset, the mne events
                # Function will epoch on the offset of each level instead of the onset
                # TODO: add adaptative baseline
                epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
                                     tmin = epoch_windows[f'{level}'][f'{start}_min'],
                                       tmax = epoch_windows[f'{level}'][f'{start}_max'],
                                         event_repeated = 'drop',
                                            preload=True,
                                                baseline=None)
                epoch_key = f'{level}_{start}'

                dict_epochs[epoch_key].append(epochs)
            
    # Once we have the dict of epochs per condition full (epoching for each run for a subject)
    # we can concatenate them, and fix the dev_head             
    for start_ in starts: 
        for level_ in levels:
            start_ = 'onset' # DEBUG
            epoch_key = f'{level_}_{start_}'
            all_epochs_chosen = dict_epochs[epoch_key]
            # Concatenate epochs

            for epo in all_epochs_chosen:
                epo.info["dev_head_t"] = all_epochs_chosen[1].info["dev_head_t"]

            dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


auditory modality chosen

Reading raw files for modality: auditory
auditory modality chosen


 Epoching for run 1, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-01_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 28000 .

  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1954 events found
Event IDs: [  1 129]
visual modality chosen

Reading 0 ... 630999  =      0.000 ...   630.999 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 20 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 20.00 Hz
- Upper transition bandwidth: 5.00 Hz (-6 dB cutoff frequency: 22.50 Hz)
- Filter length: 6601 samples (6.601 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 306 out of 306 | elapsed:    8.1s finished


Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1597 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1597 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
484 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 484 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
155 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 155 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1597 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1597 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
484 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 484 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
155 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 155 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Reading raw files for modality: auditory
auditory modality chosen


 Epoching for run 2, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-02_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 120000 ... 7

  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1769 events found
Event IDs: [  1 129]
visual modality chosen

Reading 0 ... 652999  =      0.000 ...   652.999 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 20 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 20.00 Hz
- Upper transition bandwidth: 5.00 Hz (-6 dB cutoff frequency: 22.50 Hz)
- Filter length: 6601 samples (6.601 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 306 out of 306 | elapsed:    8.8s finished


Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1765 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1765 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
510 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 510 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
141 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 141 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1765 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1765 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
510 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 510 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
141 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 141 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Reading raw files for modality: auditory
auditory modality chosen


 Epoching for run 3, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-03_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 226000 ... 9

  raw = mne_bids.read_raw_bids(bids_path)
  raw = mne_bids.read_raw_bids(bids_path)


1862 events found
Event IDs: [  1 129]
visual modality chosen

Reading 0 ... 715999  =      0.000 ...   715.999 secs...
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 20 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 20.00 Hz
- Upper transition bandwidth: 5.00 Hz (-6 dB cutoff frequency: 22.50 Hz)
- Filter length: 6601 samples (6.601 sec)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 306 out of 306 | elapsed:    9.8s finished


Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1849 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1849 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
566 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 566 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
183 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 183 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
1849 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 1849 events and 1301 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
566 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 566 events and 2501 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped
Multiple event values for single event times found. Keeping the first occurrence and dropping all others.
Adding metadata with 27 columns
183 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 183 events and 5001 original time points (prior to decimation) ...


  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,
  epochs = mne.Epochs(raw, **mne_events(sel, raw ,start=start, level=level), decim = 100,


1 bad epochs dropped


  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


Adding metadata with 27 columns
10416 matching events found
No baseline correction applied
Adding metadata with 27 columns
3114 matching events found
No baseline correction applied


  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)
  dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


Adding metadata with 27 columns
952 matching events found
No baseline correction applied


AttributeError: 'numpy.ndarray' object has no attribute 'info'

## LASER embeddings for sentences (easy)

In [2]:
# From the metadata, go from sentence end to sentence end, 
# regroup all the words each time, create a txt file from it,
# And run LASER on it, generating an associated txt file containing the embeddings
# finally, add these embeddings to the metadata 

# Initialize sentence list
sentences = []

# Loop through rows
current_sentence = []
for index, row in meta.iterrows():
    
    # Append word to current sentence 
    current_sentence.append(row['word'])
    
    # Check if end of sentence 
    if row['is_last_word']:
        # Join words into sentence string and append to list
        sentences.append(' '.join(current_sentence)) 
        # Reset current sentence   
        current_sentence = []

# Loop through sentences 
for i, sentence in enumerate(sentences):
    # Get sentence number
    sentence_num = i + 1
    
    # Create file name
    file_name = f'./embeds/txt/run{run}_sentence_{sentence_num}.txt'
    
    # Open text file 
    with open(file_name, 'w') as f:
        # Write sentence to file
        f.write(sentence)

In [3]:
from pathlib import Path
import numpy as np
path = Path('/home/is153802/github/LASER/tasks/embed')
%env LASER=/home/is153802/github/LASER

# for run in np.arange(1,10): # to be addded later
for i, _ in enumerate(sentences):
# Get sentence number
    sentence_num = i + 1

    txt_file = f"/home/is153802/code/decoding/local_testing/embeds/txt/run{run}_sentence_{sentence_num}.txt"
    emb_file = f"/home/is153802/code/decoding/local_testing/embeds/emb/run{run}_sentence_{sentence_num}.bin"
    !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}

env: LASER=/home/is153802/github/LASER
2023-06-08 12:24:07,371 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:24:07,371 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:24:07,371 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:24:07,894 | INFO | preprocess | SPM processing run2_sentence_1.txt  
2023-06-08 12:24:08,012 | INFO | embed | encoding /tmp/tmp6mr4i9er/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_1.bin
2023-06-08 12:24:08,102 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:24:11,239 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:24:11,240 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:24:11,240 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:24:11,756 | INFO | preprocess | SPM processing run2_sentence_2.txt  
2023-06-08 12:24

2023-06-08 12:24:59,517 | INFO | preprocess | SPM processing run2_sentence_15.txt  
2023-06-08 12:24:59,630 | INFO | embed | encoding /tmp/tmp4un4y6j3/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_15.bin
2023-06-08 12:24:59,667 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:25:02,655 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:25:02,655 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:25:02,655 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:25:03,171 | INFO | preprocess | SPM processing run2_sentence_16.txt  
2023-06-08 12:25:03,276 | INFO | embed | encoding /tmp/tmp_hcsfj1i/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_16.bin
2023-06-08 12:25:03,294 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:25:06,275 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:25:06,276 | I

2023-06-08 12:25:50,541 | INFO | embed | encoding /tmp/tmpoywb2000/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_29.bin
2023-06-08 12:25:50,591 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:25:53,577 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:25:53,577 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:25:53,577 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:25:54,096 | INFO | preprocess | SPM processing run2_sentence_30.txt  
2023-06-08 12:25:54,212 | INFO | embed | encoding /tmp/tmp1d_40xus/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_30.bin
2023-06-08 12:25:54,222 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:25:57,197 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:25:57,197 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:25

2023-06-08 12:26:44,617 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:26:44,617 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:26:44,617 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:26:45,138 | INFO | preprocess | SPM processing run2_sentence_44.txt  
2023-06-08 12:26:45,250 | INFO | embed | encoding /tmp/tmps1vr269x/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_44.bin
2023-06-08 12:26:45,284 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:26:48,250 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:26:48,250 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:26:48,250 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:26:48,771 | INFO | preprocess | SPM processing run2_sentence_45.txt  
2023-06-08 12:26:48,883 | INFO | embed | encoding /t

2023-06-08 12:27:36,303 | INFO | preprocess | SPM processing run2_sentence_58.txt  
2023-06-08 12:27:36,408 | INFO | embed | encoding /tmp/tmp0lbtg_wf/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_58.bin
2023-06-08 12:27:36,430 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:27:39,412 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:27:39,412 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:27:39,412 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:27:39,925 | INFO | preprocess | SPM processing run2_sentence_59.txt  
2023-06-08 12:27:40,038 | INFO | embed | encoding /tmp/tmpm7i6xz8d/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_59.bin
2023-06-08 12:27:40,053 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:27:43,003 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:27:43,003 | I

2023-06-08 12:28:27,827 | INFO | embed | encoding /tmp/tmp35sdq3ez/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_72.bin
2023-06-08 12:28:27,851 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:28:30,890 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:28:30,890 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:28:30,890 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:28:31,405 | INFO | preprocess | SPM processing run2_sentence_73.txt  
2023-06-08 12:28:31,521 | INFO | embed | encoding /tmp/tmpb0ddpk00/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_73.bin
2023-06-08 12:28:31,565 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:28:34,568 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:28:34,568 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:28

2023-06-08 12:29:22,039 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:29:22,039 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:29:22,039 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:29:22,554 | INFO | preprocess | SPM processing run2_sentence_87.txt  
2023-06-08 12:29:22,664 | INFO | embed | encoding /tmp/tmp7cyi8bgj/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_87.bin
2023-06-08 12:29:22,688 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:29:25,644 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:29:25,645 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:29:25,645 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:29:26,157 | INFO | preprocess | SPM processing run2_sentence_88.txt  
2023-06-08 12:29:26,267 | INFO | embed | encoding /t

2023-06-08 12:30:13,327 | INFO | preprocess | SPM processing run2_sentence_101.txt  
2023-06-08 12:30:13,436 | INFO | embed | encoding /tmp/tmpc0hvmu6h/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_101.bin
2023-06-08 12:30:13,453 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:30:16,439 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:30:16,439 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:30:16,439 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:30:16,956 | INFO | preprocess | SPM processing run2_sentence_102.txt  
2023-06-08 12:30:17,066 | INFO | embed | encoding /tmp/tmph7843p6o/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_102.bin
2023-06-08 12:30:17,093 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:30:20,071 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:30:20,072

2023-06-08 12:31:04,140 | INFO | embed | encoding /tmp/tmp21n4bfmo/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_115.bin
2023-06-08 12:31:04,173 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:31:07,161 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:31:07,162 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:31:07,162 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:31:07,681 | INFO | preprocess | SPM processing run2_sentence_116.txt  
2023-06-08 12:31:07,790 | INFO | embed | encoding /tmp/tmpdukjbrqb/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_116.bin
2023-06-08 12:31:07,809 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:31:10,794 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:31:10,794 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12

2023-06-08 12:31:54,771 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:31:57,703 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:31:57,703 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:31:57,703 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:31:58,218 | INFO | preprocess | SPM processing run2_sentence_130.txt  
2023-06-08 12:31:58,327 | INFO | embed | encoding /tmp/tmpiqkehg36/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_130.bin
2023-06-08 12:31:58,342 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:32:01,346 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:32:01,347 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:32:01,347 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:32:01,861 | INFO | preprocess | SPM processing run2_s

2023-06-08 12:32:48,463 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:32:48,463 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:32:48,463 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:32:48,978 | INFO | preprocess | SPM processing run2_sentence_144.txt  
2023-06-08 12:32:49,088 | INFO | embed | encoding /tmp/tmpxt05_w10/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run2_sentence_144.bin
2023-06-08 12:32:49,098 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 12:32:52,059 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 12:32:52,059 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 12:32:52,059 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 12:32:52,574 | INFO | preprocess | SPM processing run2_sentence_145.txt  
2023-06-08 12:32:52,683 | INFO | embed | encoding

In [40]:
from dataset import get_code_path
dim = 1024
embeddings = {}
for index, sentence in enumerate(sentences):
    embeds = np.fromfile(
            f"{get_code_path()}/decoding/local_testing/embeds/emb/run{run}_sentence_{index+1}.bin",
            dtype=np.float32,
            count=-1,
            )
    embeds.resize(embeds.shape[0] // dim, dim)
    embeds = embeds.reshape(-1)
    embeddings[index] = embeds

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality chosen

visual modality 

In [45]:
sent_index = 0
embed_arrays = []
for index, row in meta.iterrows():
    embed_arrays.append(embeddings[sent_index])
    # Check if end of sentence 
    if row['is_last_word']:
        sent_index += 1

meta['embed'] = embed_arrays



In [48]:
# Make it a function:
from dataset import get_code_path

def add_embeddings_sentence(meta, run):
    
    """
    Function made to generate laser embeddings, store them,
    and add them to the metadata 

    Does so for both constituent embeddings, and sentence ones


    """
    # Parse the metadata into sentences, and generate txt files for each sentence
    # So that it can be parsed by LASER
    sentences = []
    current_sentence = []
    for index, row in meta.iterrows():

        # Append word to current sentence 
        current_sentence.append(row['word'])

        # Check if end of sentence 
        if row['is_last_word']:
            # Join words into sentence string and append to list
            sentences.append(' '.join(current_sentence)) 
            # Reset current sentence   
            current_sentence = []

    # Loop through sentences 
    for i, sentence in enumerate(sentences):
        # Get sentence number
        sentence_num = i + 1

        # Create file name
        file_name = f'./embeds/txt/run{run}_sentence_{sentence_num}.txt'

        # Open text file 
        with open(file_name, 'w') as f:
            # Write sentence to file
            f.write(sentence)
            
    # Run LASER using the run number
    path = Path('/home/is153802/github/LASER/tasks/embed')
    %env LASER=/home/is153802/github/LASER

    for i, _ in enumerate(sentences):
    # Get sentence number
        sentence_num = i + 1

        txt_file = f"/home/is153802/code/decoding/local_testing/embeds/txt/run{run}_sentence_{sentence_num}.txt"
        emb_file = f"/home/is153802/code/decoding/local_testing/embeds/emb/run{run}_sentence_{sentence_num}.bin"
        if os.path.exists(emb_file):
            continue
        else:
            !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}
        
    # Get the embeddings from the generated txt file, and add them to metadata
    dim = 1024
    embeddings = {}
    for index, sentence in enumerate(sentences):
        embeds = np.fromfile(
                f"{get_code_path()}/decoding/local_testing/embeds/emb/run{run}_sentence_{index+1}.bin",
                dtype=np.float32,
                count=-1,
                )
        embeds.resize(embeds.shape[0] // dim, dim)
        embeds = embeds.reshape(-1)
        embeddings[index] = embeds
    sent_index = 0
    embed_arrays = []
    for index, row in meta.iterrows():
        embed_arrays.append(embeddings[sent_index])
        # Check if end of sentence 
        if row['is_last_word']:
            sent_index += 1

    meta['embed'] = embed_arrays
    
    return meta


In [None]:
emebds = add_embeddings_sentence(meta,1)

env: LASER=/home/is153802/github/LASER
2023-06-08 14:33:17,566 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:33:17,566 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:33:17,566 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:33:18,094 | INFO | preprocess | SPM processing run1_sentence_1.txt  
2023-06-08 14:33:18,208 | INFO | embed | encoding /tmp/tmptn4i4ujq/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run1_sentence_1.bin
2023-06-08 14:33:18,239 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:33:21,236 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:33:21,236 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:33:21,236 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:33:21,765 | INFO | preprocess | SPM processing run1_sentence_2.txt  
2023-06-08 14:33

2023-06-08 14:34:10,289 | INFO | preprocess | SPM processing run1_sentence_15.txt  
2023-06-08 14:34:10,393 | INFO | embed | encoding /tmp/tmpk3wza2kv/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run1_sentence_15.bin
2023-06-08 14:34:10,413 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:34:13,517 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:34:13,517 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-06-08 14:34:13,517 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-06-08 14:34:14,058 | INFO | preprocess | SPM processing run1_sentence_16.txt  
2023-06-08 14:34:14,178 | INFO | embed | encoding /tmp/tmpp120gvan/spm to /home/is153802/code/decoding/local_testing/embeds/emb/run1_sentence_16.bin
2023-06-08 14:34:14,195 | INFO | embed | encoded 1 sentences in 0s
2023-06-08 14:34:17,247 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-06-08 14:34:17,247 | I

# Previous Approach

In [None]:
# First: generate the run{i}.txt file to input to LASER

# What was done previously: chunk the txt file raw by actual sentence (based on ., ?, !, etc..)
# Problem: the metadata in epochs (sentence_end calculated using the word onset difference) doesn't match, as there are
# Offsets that happen sometimes not at the end of sentences

# Solution: temporary: generate the line chunking for LASER by word onset difference from the metadata file
# Final: it will only work for read modality: for audio, an option could be to replicate the metadata file
# => supposing the shape of both metadata files are the same, we can add the sentence_end column to the audio one


In [None]:
import pandas as pd

# Open the events files to get the metadata, and then generate the txt file from there
for run in np.arange(1,10):

    file = f'/home/co/data/BIDS_lecture/sub-{sub}/ses-01/meg/sub-{sub}_ses-01_task-read_run-0{run}_events.tsv'



    # Load the TSV file into a pandas DataFrame
    df = pd.read_csv(file, sep='\t')

    # Keep track of the previous onset value
    prev_onset = None

    # Open the output file for writing
    with open(f'run{run}.txt', 'w') as output_file:

        # Loop through each row in the DataFrame
        for i, row in df.iterrows():

            # Get the onset value for this row
            onset = row['onset']

            # If this is the first row, or the onset difference with the previous row is less than 0.7, append the current column to the output
            if ((row.word).__contains__(".")
                or (row.word).__contains__("?")
                or (row.word).__contains__("!")):
                output_file.write(row['word'] +'\n')
                

            # Otherwise, start a new line in the output file
            else:
                
                output_file.write(row['word'] + ' ')

            # Remember the onset value for the next iteration
            prev_onset = onset


In [None]:
from pathlib import Path
import numpy as np
path = Path('/home/is153802/github/LASER/tasks/embed')

In [None]:
%env LASER=/home/is153802/github/LASER

In [None]:
CHAPTERS = {
        1: "1-3",
        2: "4-6",
        3: "7-9",
        4: "10-12",
        5: "13-14",
        6: "15-19",
        7: "20-22",
        8: "23-25",
        9: "26-27",
    }

for run in np.arange(1,10):
    ch = CHAPTERS[run]
    txt_file = f"/home/is153802/code/data/txt_laser/run{run}.txt"
    emb_file = f"/home/is153802/code/data/laser_embeddings/emb_{ch}.bin"
    !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}
