# New Approach:

For both:
- Sentence embeddings
- Constituent embeddings

Generate the embeddings by iterating through the sentences / constituents one at a time, instead of generating them from the whole txt file and chunking afterwards.

In order to get the right chunking:
- get the metadata format from a normal analysis,
- get the boundaries of the constituents / sentences from it, and generate the embeddings from there

## Testing metadata

In [1]:
from dataset import read_raw, get_subjects, get_path
from utils import decod_xy, mne_events
import mne
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import match_list
import spacy

# ---------------------------------------------------------------------------
# Build, for one subject, the MNE epochs of every (level, start) condition:
# level in {word, constituent, sentence} x start in {onset, offset}.
# For each run the word-level metadata is enriched with sentence/constituent
# boundaries, the raw recording is epoched once per condition, and the
# per-run epochs are finally concatenated per condition in `dict_epochs`.
# ---------------------------------------------------------------------------
modality = "auditory"
nlp = spacy.load("fr_core_news_sm")
all_evos = []
all_scores = []
path = get_path(modality)
subjects = get_subjects(path)
runs = 2

# Epoch limits handed to mne.Epochs as tmin / tmax, per level and per
# alignment (onset or offset).
epoch_windows = {
    "word": {"onset_min": -0.3, "onset_max": 1.0, "offset_min": -1.0, "offset_max": 0.3},
    "constituent": {"offset_min": -2.0, "offset_max": 0.5, "onset_min": -0.5, "onset_max": 2.0},
    "sentence": {"offset_min": -4.0, "offset_max": 1.0, "onset_min": -1.0, "onset_max": 4.0},
}
levels = ('word', 'constituent', 'sentence')
starts = ('onset', 'offset')

# Iterate on subjects to epochs, and mean later
for subject in subjects[2:3]:

    # One (initially empty) list of per-run epochs for every condition,
    # keyed '<level>_<start>'.
    dict_epochs = {f'{level}_{start}': [] for start in starts for level in levels}

    # Iterating on runs, building the metadata and re-epoching
    for run in range(1, runs + 1):
        raw, meta_, events = read_raw(subject, run, events_return=True, modality=modality)
        meta = meta_.copy()  # keep the metadata returned by read_raw untouched

        # Metadata update
        meta['word_onset'] = True  # every row is a word, hence a word onset
        meta['word_stop'] = meta.start + meta.duration
        meta['sentence_onset'] = meta.word_id == 0
        # A constituent starts where n_closing drops to 1 from a larger value
        # on the previous row. The comparison against the shifted column is
        # False on the first row (NaN), so no fillna is needed and no
        # temporary column has to be added and dropped.
        prev_closing = meta['n_closing'].shift(1)
        meta['constituent_onset'] = (prev_closing > meta['n_closing']) & (meta['n_closing'] == 1)

        # Adding the sentence stop info
        meta['sentence_id'] = np.cumsum(meta.sentence_onset)
        for _, d in meta.groupby('sentence_id'):
            meta.loc[d.index, 'sent_word_id'] = range(len(d))
            meta.loc[d.index, 'sentence_start'] = d.start.min()
            # NOTE(review): this is the *start* of the last word, not its end
            # (word_stop) — confirm this is the intended offset.
            meta.loc[d.index, 'sentence_stop'] = d.start.max()

        # Adding the constituents stop info
        meta['constituent_id'] = np.cumsum(meta.constituent_onset)
        for _, d in meta.groupby('constituent_id'):
            meta.loc[d.index, 'constituent_start'] = d.start.min()
            # NOTE(review): same remark as for sentence_stop above.
            meta.loc[d.index, 'constituent_stop'] = d.start.max()
            meta.loc[d.index, 'const_word_id'] = range(len(d))

        for start in starts:
            for level in levels:
                # Select only the rows containing the True for the conditions
                # Simplified to only get for the onset: sentence onset epochs,
                # constituent onset epochs, etc
                sel = meta.query(f'{level}_onset==True')
                n_candidates = sel.shape[0]
                assert n_candidates > 10
                # TODO check variance as well for sentences
                # Match the events with the selected metadata rows so that we
                # can epoch on the sliced metadata; `j` indexes the rows of
                # `sel` that were matched.
                i, j = match_list(events[:, 2], sel.word.apply(len))
                sel = sel.reset_index().loc[j]
                # Making sure there is no hidden bug when matching: at least
                # 80% of the candidate rows must survive.
                assert sel.shape[0] > 0.8 * n_candidates

                # Epoching from the metadata having all onset events: if
                # start='offset', the mne_events function will epoch on the
                # offset of each level instead of the onset
                # TODO: add adaptative baseline
                epochs = mne.Epochs(
                    raw,
                    **mne_events(sel, raw, start=start, level=level),
                    decim=100,
                    tmin=epoch_windows[f'{level}'][f'{start}_min'],
                    tmax=epoch_windows[f'{level}'][f'{start}_max'],
                    event_repeated='drop',
                    preload=True,
                    baseline=None,
                )
                epoch_key = f'{level}_{start}'

                dict_epochs[epoch_key].append(epochs)

    # Once we have the dict of epochs per condition full (epoching for each
    # run for a subject) we can concatenate them, and fix the dev_head
    for start_ in starts:
        for level_ in levels:
            epoch_key = f'{level_}_{start_}'
            all_epochs_chosen = dict_epochs[epoch_key]

            # All runs must share one dev_head_t before concatenation. The
            # second run is the reference, as before; fall back to the only
            # run available when a single run was epoched (index 1 would
            # otherwise raise IndexError).
            ref_idx = min(1, len(all_epochs_chosen) - 1)
            reference_dev_head_t = all_epochs_chosen[ref_idx].info["dev_head_t"]
            for epo in all_epochs_chosen:
                epo.info["dev_head_t"] = reference_dev_head_t

            # Concatenate epochs
            dict_epochs[epoch_key] = mne.concatenate_epochs(all_epochs_chosen)


auditory modality chosen

Reading raw files for modality: auditory
auditory modality chosen


 Epoching for run 1, subject: 3

Opening raw data file /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-01_meg.fif...
    Read a total of 13 projection items:
        grad_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        grad_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v1 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v2 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v3 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v4 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v5 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v6 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v7 (1 x 306)  idle
        mag_ssp_upright.fif : PCA-v8 (1 x 306)  idle
    Range : 28000 .

  raw = mne_bids.read_raw_bids(bids_path)


Reading events from /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-01_events.tsv.
Reading channel info from /home/is153802/data/LPP_MEG_auditory/sub-3/ses-01/meg/sub-3_ses-01_task-listen_run-01_channels.tsv.
Using 4 HPI coils: 293 307 314 321 Hz
Not fully anonymizing info - keeping his_id, sex, and hand info


  raw = mne_bids.read_raw_bids(bids_path)


1954 events found
Event IDs: [  1 129]


AttributeError: 'DataFrame' object has no attribute 'word'

In [None]:
%debug

> [0;32m/home/is153802/.pyenv/versions/meg-masc/lib/python3.10/site-packages/pandas/core/generic.py[0m(5575)[0;36m__getattr__[0;34m()[0m
[0;32m   5573 [0;31m        ):
[0m[0;32m   5574 [0;31m            [0;32mreturn[0m [0mself[0m[0;34m[[0m[0mname[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 5575 [0;31m        [0;32mreturn[0m [0mobject[0m[0;34m.[0m[0m__getattribute__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   5576 [0;31m[0;34m[0m[0m
[0m[0;32m   5577 [0;31m    [0;32mdef[0m [0m__setattr__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> u
> [0;32m/mnt/localdrive/workspace-LPP/code/neurospin-petit-prince/decoding/local_testing/dataset.py[0m(72)[0;36mread_raw[0;34m()[0m
[0;32m     70 [0;31m[0;34m[0m[0m
[0m[0;32m     71 [0;31m    

(354, Unnamed: 0                                354
onset                                  135.09
duration                                 0.25
trial_type    {'kind': 'word', 'word': 'des'}
Name: 354, dtype: object)
(355, Unnamed: 0                                355
onset                                  135.39
duration                                 0.26
trial_type    {'kind': 'word', 'word': 'tas'}
Name: 355, dtype: object)
(356, Unnamed: 0                               356
onset                                 135.65
duration                                0.13
trial_type    {'kind': 'word', 'word': 'de'}
Name: 356, dtype: object)
(357, Unnamed: 0                                     357
onset                                       135.78
duration                                      0.54
trial_type    {'kind': 'word', 'word': 'contacts'}
Name: 357, dtype: object)
(358, Unnamed: 0                                 358
onset                                   136.32
duration            

(790, Unnamed: 0                                 790
onset                                   298.58
duration                                  0.36
trial_type    {'kind': 'word', 'word': 'rien'}
Name: 790, dtype: object)
(791, Unnamed: 0                              791
onset                                298.94
duration                               0.11
trial_type    {'kind': 'word', 'word': 'l'}
Name: 791, dtype: object)
(792, Unnamed: 0                                      792
onset                                        299.05
duration                                        0.6
trial_type    {'kind': 'word', 'word': 'apparence'}
Name: 792, dtype: object)
(793, Unnamed: 0                              793
onset                                299.65
duration                               0.06
trial_type    {'kind': 'word', 'word': 'd'}
Name: 793, dtype: object)
(794, Unnamed: 0                               794
onset                                 299.71
duration                    

(1296, Unnamed: 0                                 1296
onset                                    486.46
duration                                   0.36
trial_type    {'kind': 'word', 'word': 'avion'}
Name: 1296, dtype: object)
(1297, Unnamed: 0                             1297
onset                                487.22
duration                               0.13
trial_type    {'kind': 'word', 'word': 'c'}
Name: 1297, dtype: object)
(1298, Unnamed: 0                               1298
onset                                  487.35
duration                                 0.05
trial_type    {'kind': 'word', 'word': 'est'}
Name: 1298, dtype: object)
(1299, Unnamed: 0                               1299
onset                                   487.4
duration                                 0.25
trial_type    {'kind': 'word', 'word': 'mon'}
Name: 1299, dtype: object)
(1300, Unnamed: 0                                 1300
onset                                    487.65
duration                 

ipdb> [print(row['trial_type']) for row in meta.iterrows()]
*** TypeError: tuple indices must be integers or slices, not str
ipdb> [print(row[3]) for row in meta.iterrows()]
*** IndexError: tuple index out of range
ipdb> [print(row[0]) for row in meta.iterrows()]
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
21

ipdb> meta
      Unnamed: 0   onset  duration                           trial_type  word
0              0    3.05      0.37  {'kind': 'word', 'word': 'lorsque'}  None
1              1    3.42      0.02        {'kind': 'word', 'word': 'j'}  None
2              2    3.53      0.23    {'kind': 'word', 'word': 'avais'}  None
3              3    3.93      0.25      {'kind': 'word', 'word': 'six'}  None
4              4    4.18      0.18      {'kind': 'word', 'word': 'ans'}  None
...          ...     ...       ...                                  ...   ...
1627        1627  609.51      0.14     {'kind': 'word', 'word': 'peut'}  None
1628        1628  609.65      0.22      {'kind': 'word', 'word': 'pas'}  None
1629        1629  609.87      0.17    {'kind': 'word', 'word': 'aller'}  None
1630        1630  610.04      0.16     {'kind': 'word', 'word': 'bien'}  None
1631        1631  610.20      0.18     {'kind': 'word', 'word': 'loin'}  None

[1632 rows x 5 columns]
ipdb> df['word'] = df['trial

In [None]:
import pandas as pd

# Open the events files to get the metadata, and then generate the txt file from there.
# For every run, the words are written in order to run{run}.txt: a word that
# contains sentence-final punctuation ('.', '?' or '!') is followed by a line
# break, any other word by a single space.
# NOTE(review): `sub` is not defined in this cell — it must be set beforehand.
# NOTE(review): the events rows printed in the debugging session of this
# notebook carry the word inside `trial_type`; confirm that a `word` column
# exists in these files before running.
for run in range(1, 10):

    # The task label was 'task-read'; the auditory files read in this notebook
    # are named 'task-listen' (see the read_raw log output), so it is used here.
    file = f'/home/co/data/LPP_MEG_auditory/sub-{sub}/ses-01/meg/sub-{sub}_ses-01_task-listen_run-0{run}_events.tsv'

    # Load the TSV file into a pandas DataFrame
    df = pd.read_csv(file, sep='\t')

    # Explicit UTF-8 so that accented characters are written identically
    # whatever the locale.
    with open(f'run{run}.txt', 'w', encoding='utf-8') as output_file:

        for word in df['word']:
            # End the current line after a sentence-final word, otherwise
            # keep appending words to the same line.
            ends_sentence = any(mark in word for mark in '.?!')
            output_file.write(word + ('\n' if ends_sentence else ' '))


## LASER embeddings for sentences (easy)

In [None]:
# From the metadata, go from one sentence end to the next,
# group all the words in between and write them to a txt file,
# then run LASER on it to generate an associated txt file containing the embeddings.
# Finally, add these embeddings to the metadata.





# Previous Approach

In [None]:
# First: generate the run{i}.txt file to input to LASER

# What was done previously: chunk the raw txt file by actual sentences (based on ., ?, !, etc.).
# Problem: the metadata in the epochs (sentence_end computed from the word onset difference) doesn't match,
# as some of these offsets do not fall at the end of a sentence.

# Temporary solution: generate the line chunking for LASER from the word onset differences in the metadata file.
# Final: this will only work for the read modality; for audio, an option could be to replicate the metadata file
# => supposing both metadata files have the same shape, we can add the sentence_end column to the audio one.


In [None]:
import pandas as pd

# Open the events files to get the metadata, and then generate the txt file from there.
# For every run, the words are written in order to run{run}.txt: a word that
# contains sentence-final punctuation ('.', '?' or '!') is followed by a line
# break, any other word by a single space.
# NOTE(review): `sub` is not defined in this cell — it must be set beforehand.
for run in range(1, 10):

    file = f'/home/co/data/BIDS_lecture/sub-{sub}/ses-01/meg/sub-{sub}_ses-01_task-read_run-0{run}_events.tsv'

    # Load the TSV file into a pandas DataFrame
    df = pd.read_csv(file, sep='\t')

    # Explicit UTF-8 so that accented characters are written identically
    # whatever the locale.
    with open(f'run{run}.txt', 'w', encoding='utf-8') as output_file:

        for word in df['word']:
            # End the current line after a sentence-final word, otherwise
            # keep appending words to the same line.
            ends_sentence = any(mark in word for mark in '.?!')
            output_file.write(word + ('\n' if ends_sentence else ' '))


In [None]:
from pathlib import Path
import numpy as np
# Filesystem location of LASER's `tasks/embed` directory.
path = Path('/home/is153802/github/LASER/tasks/embed')

In [None]:
%env LASER=/home/is153802/github/LASER

In [None]:
CHAPTERS = {
        1: "1-3",
        2: "4-6",
        3: "7-9",
        4: "10-12",
        5: "13-14",
        6: "15-19",
        7: "20-22",
        8: "23-25",
        9: "26-27",
    }

for run in np.arange(1,10):
    ch = CHAPTERS[run]
    txt_file = f"/home/is153802/code/data/txt_laser/run{run}.txt"
    emb_file = f"/home/is153802/code/data/laser_embeddings/emb_{ch}.bin"
    !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}
