# Notebook to extract embeddings from GloVe

In [1]:
import os
import glob
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from model import GloveExtractor
from modeling_hacked_glove import Glove
from tokenizer import tokenize
from utils import set_seed

In [2]:
def check_folder(path):
    """Create the directory `path`, including any missing parent directories.

    Does nothing when `path` is empty or already an existing directory.
    Filesystem failures do not raise (the helper stays best-effort for its
    callers); instead a `RuntimeWarning` naming the path and the underlying
    error is emitted, so a failed creation is no longer silent.

    Args:
        path (str): Directory to create.

    Returns:
        None
    """
    import warnings  # local import keeps this cell self-contained

    if not path:
        # Nothing to create (e.g. the dirname of a bare relative file name).
        return
    try:
        # exist_ok=True makes repeated or concurrent calls safe.
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        warnings.warn(
            "Could not create folder {!r}: {}".format(path, error),
            RuntimeWarning,
        )

Defining variables:

In [3]:
# Glob pattern matching the per-run text files to process.
# NOTE(review): `template` is reassigned further down in the notebook with a local path.
template = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/text_english_run*.txt' # path to text input
# Language label passed to the tokenizer.
language = 'english'

In [4]:
# File name of the GloVe vectors; joined onto a directory later on to build `vocab_path`.
name_template = 'glove.6B.300d.txt'

In [5]:
# Extra keyword arguments forwarded to GloveExtractor in the extraction loop.
kwargs = {'embedding_size': 300}
# NOTE(review): `vocab_path` and `path_to_data` are both reassigned in a later cell,
# so these cluster paths are not the values used by the rest of the notebook.
vocab_path = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/glove_training'
path_to_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations'

In [6]:
# Machine-specific overrides: these assignments replace the `template`,
# `path_to_data` and `vocab_path` values defined in the earlier cells.
# NOTE(review): absolute paths tied to one user's home directory — consider a
# single configurable data root so the notebook runs on other machines.
template = '/Users/alexpsq/Code/Parietal/data/text_english_run*.txt' # path to text input
path_to_data = '/Users/alexpsq/Code/data/stimuli-representations'
# Here `vocab_path` points at the GloVe vectors file itself (directory + `name_template`).
vocab_path = os.path.join('/Users/alexpsq/Code/Parietal/data/glove.6B', name_template)

In [14]:
# Output folders, indexed in parallel with `config_paths` by the extraction loop.
saving_path_folders = [os.path.join(path_to_data, 'glove_embeddings')]
# A single placeholder entry; the extraction loop only uses its position, not its value.
config_paths = [None]

Creating an iterator for each run:

In [8]:
# Collect the run files in lexicographic order so the run numbering is stable.
paths = sorted(glob.glob(template))
# Fail fast: with no input files the later cells would iterate over nothing
# and the notebook would finish without writing any output.
if not paths:
    raise FileNotFoundError("No text file matches the pattern: {}".format(template))

In [9]:
# Build the Glove wrapper from `vocab_path` and keep its `model` attribute as the
# vocabulary handed to the tokenizer in the next cell.
# NOTE(review): presumably `model` maps tokens to vectors — confirm in modeling_hacked_glove.
glove = Glove(vocab_path)
vocab = glove.model

In [10]:
# Tokenize every run file, keeping the order of `paths` (one entry per run).
iterator_list = list(
    map(lambda run_path: tokenize(run_path, language, train=False, vocab=vocab), paths)
)

100%|██████████| 135/135 [00:00<00:00, 404450.74it/s]
100%|██████████| 135/135 [00:00<00:00, 20423.86it/s]
100%|██████████| 135/135 [00:00<00:00, 668513.62it/s]
100%|██████████| 135/135 [00:00<00:00, 65948.18it/s]
100%|██████████| 176/176 [00:00<00:00, 429434.27it/s]
100%|██████████| 176/176 [00:00<00:00, 71001.01it/s]
100%|██████████| 173/173 [00:00<00:00, 735917.44it/s]
100%|██████████| 173/173 [00:00<00:00, 70550.76it/s]
100%|██████████| 177/177 [00:00<00:00, 648377.12it/s]
100%|██████████| 177/177 [00:00<00:00, 97683.13it/s]
100%|██████████| 216/216 [00:00<00:00, 950650.22it/s]
100%|██████████| 216/216 [00:00<00:00, 91688.05it/s]
100%|██████████| 196/196 [00:00<00:00, 416034.20it/s]
100%|██████████| 196/196 [00:00<00:00, 139288.98it/s]
100%|██████████| 145/145 [00:00<00:00, 707179.16it/s]
100%|██████████| 145/145 [00:00<00:00, 115359.27it/s]
100%|██████████| 207/207 [00:00<00:00, 838051.09it/s]
100%|██████████| 207/207 [00:00<00:00, 109402.84it/s]

Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.





## Activation extraction

In [16]:
# Extract GloVe activations for every run and write one CSV file per run.
# `config` itself is not read; only its position selects the output folder.
for index, config in enumerate(config_paths):
    # Output folder paired with this configuration; create it once, not once per run.
    saving_folder = saving_path_folders[index]
    check_folder(saving_folder)
    # Reuse the notebook-level `language` so tokenization and extraction stay in sync.
    extractor = GloveExtractor(vocab_path, language, **kwargs)
    print(extractor.name, ' - Extracting activations ...')
    # `total` lets tqdm render a real progress bar instead of a bare iteration counter.
    for run_index, iterator in tqdm(enumerate(iterator_list), total=len(iterator_list)):
        print("############# Run {} #############".format(run_index))
        activations = extractor.extract_activations(iterator)
        # Files are numbered from 1, so the run logged as 0 is saved as activations_run1.csv.
        output_path = os.path.join(saving_folder, 'activations_run{}.csv'.format(run_index + 1))
        activations.to_csv(output_path, index=False)
        
        

0it [00:00, ?it/s]
100%|██████████| 1894/1894 [00:00<00:00, 573947.82it/s]

GLOVE_embedding-size_300_language_english  - Extracting activations ...
############# Run 0 #############



1it [00:00,  1.08it/s]
100%|██████████| 2093/2093 [00:00<00:00, 798497.21it/s]


############# Run 1 #############


2it [00:01,  1.18it/s]
100%|██████████| 2297/2297 [00:00<00:00, 883801.15it/s]


############# Run 2 #############


3it [00:02,  1.27it/s]
100%|██████████| 2152/2152 [00:00<00:00, 1039638.59it/s]


############# Run 3 #############


4it [00:02,  1.37it/s]
100%|██████████| 2065/2065 [00:00<00:00, 1130267.23it/s]


############# Run 4 #############


5it [00:03,  1.46it/s]
100%|██████████| 2404/2404 [00:00<00:00, 1082342.94it/s]


############# Run 5 #############


6it [00:04,  1.46it/s]
100%|██████████| 2435/2435 [00:00<00:00, 1252069.42it/s]


############# Run 6 #############


7it [00:04,  1.45it/s]
100%|██████████| 2038/2038 [00:00<00:00, 1173207.73it/s]


############# Run 7 #############


8it [00:05,  1.52it/s]
100%|██████████| 2530/2530 [00:00<00:00, 1379561.77it/s]


############# Run 8 #############


9it [00:06,  1.37it/s]


In [15]:
# Display the output folder list to confirm where the CSV files are written.
saving_path_folders

['/Users/alexpsq/Code/data/stimuli-representations/glove_embeddings']

In [13]:
# Display the `activations` object left in the namespace by the extraction loop
# (the loop rebinds it on every run, so only one run's result is shown).
# NOTE(review): this cell's execution count (13) is lower than the extraction
# cell's (16), so the output below may be stale — re-run after extraction.
activations

Unnamed: 0,embedding-1,embedding-2,embedding-3,embedding-4,embedding-5,embedding-6,embedding-7,embedding-8,embedding-9,embedding-10,...,embedding-291,embedding-292,embedding-293,embedding-294,embedding-295,embedding-296,embedding-297,embedding-298,embedding-299,embedding-300
0,0.300710,-0.468670,-0.206170,-0.809780,-0.238890,0.243290,0.016538,-0.035687,-0.223060,0.95189,...,0.119920,0.146110,0.160340,0.072431,-0.43760,-0.259790,0.581580,0.49267,-0.112760,-0.277750
1,-0.255390,-0.257230,0.131690,-0.042688,0.218170,-0.022702,-0.178540,0.107560,0.058936,-1.38540,...,0.075968,-0.014359,-0.073794,0.221760,0.14652,0.566860,0.053307,-0.23290,-0.122260,0.354990
2,-0.141540,0.027303,0.135940,-0.120160,0.316880,-0.002833,0.049514,0.012035,0.050774,-1.78970,...,0.016749,-0.279860,0.091358,-0.116660,0.10341,0.231110,-0.089390,-0.40974,0.126680,0.114250
3,0.300710,-0.468670,-0.206170,-0.809780,-0.238890,0.243290,0.016538,-0.035687,-0.223060,0.95189,...,0.119920,0.146110,0.160340,0.072431,-0.43760,-0.259790,0.581580,0.49267,-0.112760,-0.277750
4,0.065573,0.022011,-0.131820,-0.213300,-0.045275,-0.095786,-0.197060,0.008206,-0.292850,-1.82300,...,0.345770,-0.229280,0.243410,0.336540,0.29751,0.446170,0.300770,-0.21916,-0.431860,-0.080348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,-0.210040,-0.395560,-0.460310,-0.122070,-0.882160,-0.355770,-0.029017,0.435330,-0.467000,-1.21150,...,-0.243670,-0.040827,-0.500400,-0.137830,-0.19901,0.016252,0.881520,-0.44566,0.693910,0.171590
1890,0.007368,0.062532,-0.097432,0.282890,0.179070,0.155630,-0.060022,-0.187060,0.252200,-1.43660,...,-0.150380,0.084015,-0.055967,-0.013686,0.19127,-0.425410,-0.138320,-0.32432,0.196110,0.293620
1891,-0.451630,0.127710,0.028132,0.008648,0.085709,0.051218,-0.068144,-0.155410,0.101610,-1.91470,...,-0.304990,0.179810,0.761070,0.141640,0.32768,0.136020,-0.016393,-0.54141,0.159790,-0.020832
1892,0.233640,-0.399180,-0.236820,-0.032494,-0.419840,0.409520,-0.452690,0.254970,0.110240,-1.77100,...,-0.072694,-0.302780,-0.271480,0.242300,0.33633,0.022533,0.129580,-0.43302,0.072295,0.440340
