# Notebook to extract embeddings from Basic Features 

In [13]:
import os
import glob
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from model import BasicFeaturesExtractor
from modeling_hacked_basic_features import BasicFeatures
from tokenizer import tokenize, rms_tokenizer
from utils import set_seed, wordrate, content_words, function_words, log_frequency, word_position, rms, sentence_to_words, create_onsets_files


In [2]:
def check_folder(path):
    """Create the directory `path`, including any missing parent directories.

    Does nothing when `path` is empty or is already a directory, so the call
    can safely be repeated (e.g. once per run inside a loop).

    Args:
        path (str): directory to create.

    Raises:
        OSError: if the directory cannot be created (permission denied, a
            regular file already occupies part of `path`, ...). Such failures
            were previously swallowed and only surfaced later, at write time.
    """
    if not path:
        # An empty path names no directory: nothing to create.
        return
    # exist_ok=True keeps the call idempotent and tolerant of another process
    # creating the same folder at the same time.
    os.makedirs(path, exist_ok=True)

Defining variables:

In [18]:
# Language of the stimuli; the data paths defined below are built from it.
language = "french"

In [19]:
# Glob patterns matching the per-run text transcripts and audio recordings.
template = (
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince'
    f'/data/text/{language}/text_{language}_run*.txt'
)  # path to text input
template_rms = (
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince'
    f'/data/wave/{language}/wave_{language}_run*.wav'
)  # path to audio input



In [20]:
# Feature functions to extract; each one gets its own extractor and its own
# output folder. Full set of features imported from utils:
#functions = [wordrate, content_words, function_words, log_frequency, word_position, rms]
functions = [rms]

In [21]:
# Language-specific folders: GloVe training vocabulary and the root under
# which the extracted stimuli representations are saved.
vocab_path = (
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince'
    f'/data/text/{language}/glove_training'
)
path_to_data = (
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince'
    f'/data/stimuli-representations/{language}'
)

In [22]:
#template = '/Users/alexpsq/Code/Parietal/data/text_english_run*.txt' # path to text input
#template_rms = '/Users/alexpsq/Code/Parietal/data/wave_english_run*.wav'
#path_to_data = '/Users/alexpsq/Code/Parietal/data/stimuli-representations'

In [23]:
# NOTE(review): config_paths is not read by any cell visible in this notebook.
config_paths = [None]
# One output folder per feature function, named after the function itself.
saving_path_folders = [
    os.path.join(path_to_data, feature.__name__)
    for feature in functions
]

Creating an iterator for each run:

In [24]:
# Resolve both glob patterns; sorting keeps the text and audio lists in the
# same run order.
paths, paths_rms = (
    sorted(glob.glob(pattern)) for pattern in (template, template_rms)
)

In [25]:
# Display the sorted audio paths to check which runs were found.
paths_rms

['/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run1.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run2.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run3.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run4.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run5.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run6.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run7.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/french/wave_french_run8.wav',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPri

In [26]:
# Period of one RMS slice passed to rms_tokenizer (0.01, i.e. 10e-3).
# NOTE(review): presumably seconds, which matches the 441-sample slices
# printed for the 44100 Hz recordings. Confirm against rms_tokenizer.
slice_period = 0.01

In [27]:
# Folder holding the word onset/offset CSV files of the selected language.
# It is defined first so that reading the tables and creating the onset files
# both use the same location.
path_to_onset_folder = f'/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/onsets-offsets/{language}'

# One tokenized text and one RMS iterator per run.
iterator_list = [tokenize(path, language, train=False) for path in paths]
rms_iterator_list = [rms_tokenizer(path_to_audio, slice_period=slice_period) for path_to_audio in paths_rms]

# One onset table per text run (files are numbered from 1). The number of
# tables follows the number of text files found instead of a hard-coded 9,
# because later cells pair onsets[i] with iterator_list[i].
onsets = [
    pd.read_csv(os.path.join(path_to_onset_folder, 'word_run{}.csv'.format(run_number)))
    for run_number in range(1, len(paths) + 1)
]

# Each rms_tokenizer result unpacks into four values; only the frame rate
# (2nd) and the number of frames (3rd) are needed here.
for run_number, (_, frame_rate, n_frames, _) in enumerate(rms_iterator_list, start=1):
    create_onsets_files(path_to_onset_folder, n_frames, frame_rate, slice_period, run_number)
    

100%|██████████| 135/135 [00:00<00:00, 536203.64it/s]
100%|██████████| 136/136 [00:00<00:00, 718419.83it/s]
100%|██████████| 184/184 [00:00<00:00, 732212.46it/s]
100%|██████████| 174/174 [00:00<00:00, 303202.70it/s]
100%|██████████| 177/177 [00:00<00:00, 770115.98it/s]
100%|██████████| 211/211 [00:00<00:00, 232527.10it/s]
100%|██████████| 192/192 [00:00<00:00, 765500.35it/s]
100%|██████████| 143/143 [00:00<00:00, 689408.59it/s]
100%|██████████| 197/197 [00:00<00:00, 839713.30it/s]


In [65]:
# Sanity check, one line per run: run position, every element of the
# rms_tokenizer result after the first one, and the length of the first one.
# Iterating over the list itself avoids assuming exactly 9 runs.
for i, rms_run in enumerate(rms_iterator_list):
    print(i, rms_run[1:], len(rms_run[0]))

0 (44100, 24462270, 441) 55470
1 (44100, 25921980, 441) 58780
2 (44100, 29608740, 441) 67140
3 (44100, 26380620, 441) 59820
4 (44100, 23029020, 441) 52220
5 (44100, 29904210, 441) 67810
6 (44100, 28312200, 441) 64200
7 (44100, 25392780, 441) 57580
8 (44100, 32065110, 441) 72710


In [14]:
def add_at_index(df, index, word, method='last'):
    """Return a copy of an onsets/offsets table with a row for `word` inserted.

    Rows are addressed by position throughout, so the function also works on
    tables whose index is not the default 0..n-1 range.

    Args:
        df (pd.DataFrame): table with the columns 'word', 'onsets' and
            'offsets'. It is not modified.
        index (int): 0-based position the new row occupies in the result.
        word (str): value stored in the 'word' column of the new row.
        method (str): how the timestamps of the new row are chosen:
            - 'last': onset and offset of the row just before `index`;
            - 'interpolate': from the offset of the row just before `index`
              to the onset of the row currently at `index`;
            - 'next': onset and offset of the row currently at `index`.

    Returns:
        pd.DataFrame: the new table, re-indexed from 0.

    Raises:
        ValueError: if `method` is not one of the three names above.
        IndexError: if a neighbouring row required by `method` does not exist
            (e.g. `index=0` with 'last' or 'interpolate'). Previously a
            missing previous row silently wrapped around to the last row.
    """
    if method not in ('last', 'interpolate', 'next'):
        raise ValueError('Method {} not implemented...'.format(method))

    # 'last' and 'interpolate' read the previous row; 'interpolate' and 'next'
    # read the row currently at `index`.
    lowest = 1 if method in ('last', 'interpolate') else 0
    highest = len(df) - 1 if method in ('interpolate', 'next') else len(df)
    if not lowest <= index <= highest:
        raise IndexError(
            "Cannot insert at position {} with method '{}': valid positions are {}..{}.".format(
                index, method, lowest, highest))

    onsets, offsets = df['onsets'], df['offsets']
    if method == 'last':
        onset, offset = onsets.iloc[index - 1], offsets.iloc[index - 1]
    elif method == 'interpolate':
        onset, offset = offsets.iloc[index - 1], onsets.iloc[index]
    else:  # 'next'
        onset, offset = onsets.iloc[index], offsets.iloc[index]

    new_row = pd.DataFrame({'word': [word],
                            'onsets': [onset],
                            'offsets': [offset]})
    # Positional slices keep the split consistent with the .iloc lookups above.
    return pd.concat([df.iloc[:index], new_row, df.iloc[index:]], axis=0, ignore_index=True)

In [16]:
# Align each run's onset table with its text: whenever the word at a given
# position differs from the table, insert the missing word at that position
# and print some context for manual inspection.
for i, text in enumerate(iterator_list):
    print(i, '-'*50)
    words = sentence_to_words(text)
    for index, value in enumerate(words):
        # The table is compared position by position with the text; an
        # insertion below shifts the following rows by one, which keeps the
        # two sequences lined up.
        if value.lower() == onsets[i]['word'].iloc[index]:
            continue
        onsets[i] = add_at_index(onsets[i], index, value.lower(), method='interpolate')
        print(index, value)
        # Context windows are clamped at 0: a negative slice start would
        # select nothing and index-1 would wrap to the last word.
        print(onsets[i].iloc[max(index - 2, 0):index + 4])
        # Slicing (rather than indexing index+1) also works on the last word.
        print(*words[max(index - 1, 0):index + 2])

0 --------------------------------------------------
1 --------------------------------------------------
2 --------------------------------------------------
3 --------------------------------------------------
4 --------------------------------------------------
5 --------------------------------------------------
6 --------------------------------------------------
7 --------------------------------------------------
8 --------------------------------------------------


In [17]:
# Save the (possibly completed) onset tables, one CSV per run, numbered from 1.
# The folder follows `language` instead of a hard-coded 'english', so tables
# of another language are no longer written into the English folder.
# NOTE(review): the tables are read from data/onsets-offsets/<language> but
# written to data/wave/<language>/onset-offsets. Confirm this is intended.
onsets_output_folder = f'/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/wave/{language}/onset-offsets'
os.makedirs(onsets_output_folder, exist_ok=True)
for run_number, onset in enumerate(onsets, start=1):
    onset.to_csv(os.path.join(onsets_output_folder, 'word_run{}.csv'.format(run_number)), index=False)

In [20]:
# Number of words in each run's onset table, one per line.
for onset_table in onsets:
    n_words = len(onset_table['word'])
    print(n_words)

1519 1519
1719 1719
1853 1853
1636 1636
1532 1532
1818 1818
1787 1787
1584 1584
1978 1978


In [21]:
# Final check: print (run, position) for every onset-table word that still
# differs from the lower-cased text word at the same position. No output
# means the tables and the texts agree.
for index, onset in enumerate(onsets):
    # Split the run into words once instead of once per compared word.
    words = sentence_to_words(iterator_list[index])
    for i, value in enumerate(onset['word']):
        if value != words[i].lower():
            print(index, i)

## Activation extraction

In [66]:
# Language-specific resources passed as keyword arguments to
# BasicFeaturesExtractor: function-word list, onset folder, lexical database.
kwargs = dict(
    path_to_function_words_list=f'/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/{language}/function_words.txt',
    path_to_onset_folder=f'/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/onsets-offsets/{language}',
    path_to_lexique_database=f'/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/{language}/lexique_database.tsv',
)

In [67]:
# Display the output folder(s) the activations will be written to.
# NOTE(review): the stored output below lists an `english` folder although
# `language` is set to 'french' above; it looks stale. Re-run to refresh.
saving_path_folders

['/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/rms']

In [68]:
# Extract and save the activations of every feature function for every run.
# Each function has its own extractor and its own output folder
# (saving_path_folders is built from `functions`, so the two lists line up).
for function, saving_path_folder in zip(functions, saving_path_folders):
    extractor = BasicFeaturesExtractor([function], language, **kwargs)
    print(extractor.name, ' - Extracting activations ...')
    # The output folder is the same for all runs: create it once.
    check_folder(saving_path_folder)
    # Runs are numbered from 1 in both the log line and the file name; `total`
    # lets tqdm draw a real progress bar over the enumerate() iterator.
    for run_number, iterator in tqdm(enumerate(iterator_list, start=1), total=len(iterator_list)):
        print("############# Run {} #############".format(run_number))
        activations = extractor.extract_activations(iterator, rms_iterator_list[run_number - 1])
        activations.to_csv(os.path.join(saving_path_folder, 'activations_run{}.csv'.format(run_number)), index=False)
        
        

0it [00:00, ?it/s]

rms  - Extracting activations ...
############# Run 0 #############


1it [00:03,  3.44s/it]

55470
############# Run 1 #############


2it [00:06,  3.45s/it]

58780
############# Run 2 #############


3it [00:10,  3.62s/it]

67140
############# Run 3 #############


4it [00:14,  3.59s/it]

59820
############# Run 4 #############


5it [00:17,  3.44s/it]

52220
############# Run 5 #############


6it [00:21,  3.61s/it]

67810
############# Run 6 #############


7it [00:25,  3.67s/it]

64200
############# Run 7 #############


8it [00:28,  3.60s/it]

57580
############# Run 8 #############


9it [00:33,  3.68s/it]

72710





In [13]:
# Inspect the activations left over from the last run of the extraction loop.
# NOTE(review): the stored output below shows 300 `embedding-*` columns and
# an out-of-order execution count, so it presumably comes from an earlier
# session rather than from the rms extraction above. Re-run to refresh.
activations

Unnamed: 0,embedding-1,embedding-2,embedding-3,embedding-4,embedding-5,embedding-6,embedding-7,embedding-8,embedding-9,embedding-10,...,embedding-291,embedding-292,embedding-293,embedding-294,embedding-295,embedding-296,embedding-297,embedding-298,embedding-299,embedding-300
0,0.300710,-0.468670,-0.206170,-0.809780,-0.238890,0.243290,0.016538,-0.035687,-0.223060,0.95189,...,0.119920,0.146110,0.160340,0.072431,-0.43760,-0.259790,0.581580,0.49267,-0.112760,-0.277750
1,-0.255390,-0.257230,0.131690,-0.042688,0.218170,-0.022702,-0.178540,0.107560,0.058936,-1.38540,...,0.075968,-0.014359,-0.073794,0.221760,0.14652,0.566860,0.053307,-0.23290,-0.122260,0.354990
2,-0.141540,0.027303,0.135940,-0.120160,0.316880,-0.002833,0.049514,0.012035,0.050774,-1.78970,...,0.016749,-0.279860,0.091358,-0.116660,0.10341,0.231110,-0.089390,-0.40974,0.126680,0.114250
3,0.300710,-0.468670,-0.206170,-0.809780,-0.238890,0.243290,0.016538,-0.035687,-0.223060,0.95189,...,0.119920,0.146110,0.160340,0.072431,-0.43760,-0.259790,0.581580,0.49267,-0.112760,-0.277750
4,0.065573,0.022011,-0.131820,-0.213300,-0.045275,-0.095786,-0.197060,0.008206,-0.292850,-1.82300,...,0.345770,-0.229280,0.243410,0.336540,0.29751,0.446170,0.300770,-0.21916,-0.431860,-0.080348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,-0.210040,-0.395560,-0.460310,-0.122070,-0.882160,-0.355770,-0.029017,0.435330,-0.467000,-1.21150,...,-0.243670,-0.040827,-0.500400,-0.137830,-0.19901,0.016252,0.881520,-0.44566,0.693910,0.171590
1890,0.007368,0.062532,-0.097432,0.282890,0.179070,0.155630,-0.060022,-0.187060,0.252200,-1.43660,...,-0.150380,0.084015,-0.055967,-0.013686,0.19127,-0.425410,-0.138320,-0.32432,0.196110,0.293620
1891,-0.451630,0.127710,0.028132,0.008648,0.085709,0.051218,-0.068144,-0.155410,0.101610,-1.91470,...,-0.304990,0.179810,0.761070,0.141640,0.32768,0.136020,-0.016393,-0.54141,0.159790,-0.020832
1892,0.233640,-0.399180,-0.236820,-0.032494,-0.419840,0.409520,-0.452690,0.254970,0.110240,-1.77100,...,-0.072694,-0.302780,-0.271480,0.242300,0.33633,0.022533,0.129580,-0.43302,0.072295,0.440340


In [13]:
# Load the tab-separated lexical database referenced in `kwargs`.
lexique_database_path = kwargs['path_to_lexique_database']
database = pd.read_csv(lexique_database_path, sep='\t')

In [22]:
from utils import sentence_to_words