In [None]:
# First: generate the run{i}.txt file to input to LASER

# Previous approach: split the raw txt file into actual sentences (based on ".", "?", "!", etc.).
# Problem: this does not match the metadata in epochs, where sentence_end is calculated from the word onset
# difference, because such offsets sometimes occur elsewhere than at the end of a sentence.

# Temporary solution: generate the line chunking for LASER from the word onset differences in the metadata file.
# Caveat: in the end this only works for the read modality. For audio, one option could be to replicate the metadata file
# => assuming both metadata files have the same shape, we can add the sentence_end column to the audio one.


In [7]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Using cached python_Levenshtein-0.20.9-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.20.9
  Downloading Levenshtein-0.20.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (174 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/174.1 kB[0m [31m61.8 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:02[0m
[?25hCollecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m114.6 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.20.9 python-Levenshtein-0.20.9 rapidfuzz-2.13.7
Note: you may need to restart the kernel to use updated packages.


In [24]:
# Read the events file of one run and write its words to a text file in which a new
# line starts wherever the gap between two consecutive word onsets is large.

import pandas as pd

run = 2
sub = 2
file = f'/home/co/data/BIDS_lecture/sub-{sub}/ses-01/meg/sub-{sub}_ses-01_task-read_run-0{run}_events.tsv'

# Two consecutive words stay on the same line when their onsets differ by less than
# this value (expressed in the unit of the `onset` column).
MAX_ONSET_GAP = 0.7


def write_onset_chunks(events, out_path, max_gap=MAX_ONSET_GAP):
    """Write the `word` column of `events` to `out_path`, chunked into lines by onset gap.

    Every word is written followed by a single space. A line break is written before a
    word unless its onset is less than `max_gap` after the onset of the previous row;
    the first word is never preceded by a line break.

    Parameters
    ----------
    events : pandas.DataFrame
        Must provide the columns `word` and `onset`.
    out_path : str or path-like
        File to create or overwrite (written as UTF-8).
    max_gap : float
        Onset difference below which two consecutive words share a line.

    Raises
    ------
    ValueError
        If any entry of `word` is missing. This is checked before the file is opened,
        so no partially written file is left behind.
    """
    words = events['word']

    # pandas turns tokens such as "NA" or "null" into missing values when reading a
    # file; concatenating such a value with ' ' would fail halfway through the write.
    n_missing = int(words.isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} row(s) have a missing 'word' value; cannot build the text file"
        )

    # A NaN gap (first row, or a missing onset) compares False and therefore breaks
    # the line, exactly like the comparison `onset - prev_onset < max_gap` would.
    same_line = events['onset'].diff().lt(max_gap).tolist()
    if same_line:
        # The first word opens the file, it never starts a new line
        same_line[0] = True

    # Explicit encoding: the words contain non-ASCII characters and must not depend
    # on the platform default.
    with open(out_path, 'w', encoding='utf-8') as output_file:
        for word, joined in zip(words.astype(str), same_line):
            output_file.write(('' if joined else '\n') + word + ' ')


# Load the TSV file into a pandas DataFrame
df = pd.read_csv(file, sep='\t')

# Generate the line-chunked text file from the events
write_onset_chunks(df, 'output.txt')


In [21]:
import numpy as np
# Distinct gaps between consecutive word onsets, rounded to the nearest integer
np.unique(np.diff(df['onset'].to_numpy()).round())

array([0., 1., 3.])

In [25]:
# Show rows 50 to 99 (by position) of the events table
df.iloc[50:100]

Unnamed: 0.1,Unnamed: 0,word,onset,duration,trial_type
50,50,des,16.7,0.25,"{'kind': 'word', 'word': 'des'}"
51,51,centaines,17.0,0.25,"{'kind': 'word', 'word': 'centaines'}"
52,52,d'autres,17.3,0.25,"{'kind': 'word', 'word': 'dautres'}"
53,53,qui,17.6,0.25,"{'kind': 'word', 'word': 'qui'}"
54,54,sont,17.9,0.25,"{'kind': 'word', 'word': 'sont'}"
55,55,quelquefois,18.2,0.25,"{'kind': 'word', 'word': 'quelquefois'}"
56,56,si,18.5,0.25,"{'kind': 'word', 'word': 'si'}"
57,57,petites,18.8,0.25,"{'kind': 'word', 'word': 'petites'}"
58,58,qu'on,19.1,0.25,"{'kind': 'word', 'word': 'quon'}"
59,59,a,19.4,0.25,"{'kind': 'word', 'word': 'a'}"


In [11]:
# NOTE(review): `meta` is not defined in any cell visible here — it presumably comes
# from an earlier session or a removed cell; define or load it before re-running.
meta[50]

Unnamed: 0.1,level_0,index,Unnamed: 0,word,onset,duration,trial_type,start,condition,n_closing,is_last_word,pos,content_word,sentence_end,label,kind
7,7,7,7,fois,2.8,0.25,"{'kind': 'word', 'word': 'fois'}",36.252,sentence,2,False,NC,True,False,run_1,word
8,8,8,8,une,3.1,0.25,"{'kind': 'word', 'word': 'une'}",36.519,sentence,1,False,DET,False,False,run_1,word
9,9,9,9,magnifique,3.4,0.25,"{'kind': 'word', 'word': 'magnifique'}",36.785,sentence,1,False,ADJ,True,False,run_1,word
10,10,10,10,image,3.7,0.25,"{'kind': 'word', 'word': 'image'}",37.052,sentence,2,False,NC,True,False,run_1,word
11,11,11,11,dans,4.0,0.25,"{'kind': 'word', 'word': 'dans'}",37.336,sentence,1,False,P,False,False,run_1,word
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406,1406,1459,1459,ne,508.9,0.25,"{'kind': 'word', 'word': 'ne'}",491.725,sentence,1,False,ADV,True,False,run_1,word
1407,1407,1460,1460,peut,509.2,0.25,"{'kind': 'word', 'word': 'peut'}",491.991,sentence,2,False,V,True,False,run_1,word
1408,1408,1461,1461,pas,509.5,0.25,"{'kind': 'word', 'word': 'pas'}",492.258,sentence,1,False,ADV,True,False,run_1,word
1409,1409,1462,1462,aller,509.8,0.25,"{'kind': 'word', 'word': 'aller'}",492.541,sentence,2,False,VINF,True,False,run_1,word


In [1]:
from pathlib import Path
import numpy as np
# Location of the `tasks/embed` directory inside the local LASER checkout.
# NOTE(review): `path` is not referenced by any cell visible here — confirm it is still needed.
path = Path('/home/is153802/github/LASER/tasks/embed')

In [22]:
# Set the LASER environment variable to the local LASER checkout for this kernel
# and the shell commands it launches (presumably read by embed.sh — TODO confirm).
%env LASER=/home/is153802/github/LASER

env: LASER=/home/is153802/github/LASER


In [24]:
!pip install transliterate tensorboardX

Collecting transliterate
  Using cached transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
Collecting tensorboardX
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting protobuf<4,>=3.8.0
  Using cached protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
Installing collected packages: transliterate, protobuf, tensorboardX
Successfully installed protobuf-3.20.3 tensorboardX-2.6 transliterate-1.10.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
CHAPTERS = {
        1: "1-3",
        2: "4-6",
        3: "7-9",
        4: "10-12",
        5: "13-14",
        6: "15-19",
        7: "20-22",
        8: "23-25",
        9: "26-27",
    }

for run in np.arange(1,10):
    ch = CHAPTERS[run]
    txt_file = f"/home/is153802/code/data/txt_laser/run{run}.txt"
    emb_file = f"/home/is153802/code/data/laser_embeddings/emb_{ch}.bin"
    !bash /home/is153802/github/LASER/tasks/embed/embed.sh {txt_file} {emb_file}


2023-03-23 00:48:06,276 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-03-23 00:48:06,276 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-03-23 00:48:06,276 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-03-23 00:48:06,811 | INFO | preprocess | SPM processing run1.txt  
2023-03-23 00:48:06,913 | INFO | embed | encoding /tmp/tmprciny7o3/spm to /home/is153802/code/data/laser_embeddings/emb_1-3.bin
2023-03-23 00:48:07,638 | INFO | embed | encoded 134 sentences in 0s
2023-03-23 00:48:10,364 | INFO | embed | spm_model: /home/is153802/github/LASER/laser2.spm
2023-03-23 00:48:10,364 | INFO | embed | spm_cvocab: /home/is153802/github/LASER/laser2.cvocab
2023-03-23 00:48:10,364 | INFO | embed | loading encoder: /home/is153802/github/LASER/laser2.pt
2023-03-23 00:48:10,899 | INFO | preprocess | SPM processing run2.txt  
2023-03-23 00:48:11,012 | INFO | embed | encoding /tmp/tmpvmp1hhb6/spm to /home/is153802/code/d

In [9]:
ls /home/is153802/code/data/laser_embeddings/

emb_10-12.bin  emb_1-3.bin    emb_20-22.bin  emb_26-27.bin  emb_7-9.bin
emb_13-14.bin  emb_15-19.bin  emb_23-25.bin  emb_4-6.bin


In [9]:
pwd

'/home/co/workspace_LPP/code/neurospin-petit-prince/testing/laser_embeddings'

In [16]:
ls /home/is153802/

[0m[01;34macas_participation[0m/  [01;34mDesktop[0m/    [01;34mgithub[0m/     source_reconstruction.ipynb
[01;34mafer[0m/                [01;34mDocuments[0m/  [01;34mmatlab[0m/     [01;34mtmp[0m/
[01;36mcode[0m@                [01;34mDownloads[0m/  [01;34mneurospin[0m/  [01;34mvideos[0m/
[01;36mdata[0m@                [01;34mexpe[0m/       [01;34mph[0m/         [01;36mworkspace_LPP[0m@
[01;34mdecode[0m/              [01;34mfun[0m/        [01;34msnap[0m/       [01;34mZotero[0m/
