# Notebook to extract hidden-state activations, surprisal and entropy from LSTM model predictions

In [1]:
import os
import glob
import torch
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from model import LSTMExtractor
from sklearn.preprocessing import StandardScaler
from numpy import linalg as la
from tokenizer import tokenize
from utils import set_seed
from data import Dictionary
from utils import read_yaml, save_yaml, batchify_text_with_memory_size

In [2]:
def check_folder(path):
    """Create the directory `path`, including any missing parent folders.

    Nothing happens when `path` is empty or is already an existing directory.

    Args:
        path (str): Directory to create.

    Raises:
        OSError: If the directory cannot be created (for instance because of
            missing permissions, or because `path` exists but is a file).
            Failing here gives a clearer error than a later write into a
            folder that was silently not created.
    """
    if not path:
        # An empty path names nothing to create.
        return
    # exist_ok=True makes the call idempotent and tolerant of the folder being
    # created concurrently by another process.
    os.makedirs(path, exist_ok=True)

In [3]:
def transform(activations, path, name, run_index, n_layers_hidden=1, hidden_size=300):
    """Center each column group of `activations` and save the result as CSV.

    The columns are split into `n_layers_hidden` consecutive groups of
    `hidden_size` columns, followed by two single-column groups (the last two
    columns). Every group is passed independently through a `StandardScaler`
    with mean-centering enabled; variance scaling is only enabled for the
    'std' order, which is not in the list of orders currently iterated over
    (only `None` is). The groups are then stacked back in their original
    order and written to `<path>_norm-<order>/<name>_run<run_index + 1>.csv`.

    Args:
        activations: DataFrame with exactly
            `n_layers_hidden * hidden_size + 2` columns.
        path (str): Prefix of the output folder.
        name (str): Prefix of the output file name.
        run_index (int): 0-based run index (the file name uses run_index + 1).
        n_layers_hidden (int): Number of hidden-layer column groups.
        hidden_size (int): Number of columns per hidden-layer group.
    """
    values = activations.values
    n_columns = values.shape[1]
    assert n_columns == ((n_layers_hidden) * hidden_size + 2)

    # Column bounds [start, stop) of each group: one per hidden layer ...
    bounds = []
    for layer in range(n_layers_hidden):
        bounds.append([layer * hidden_size, (layer + 1) * hidden_size])
    # ... followed by the two trailing single columns.
    bounds.append([-2, -1])
    bounds.append([-1, n_columns])

    for order in (None,):
        use_std = order == 'std'
        # Norm-based rescaling only applies to genuine norm orders.
        rescale_by_norm = not (order is None or order == 'std')
        blocks = []
        for start, stop in bounds:
            block = values[:, start:stop]
            block = StandardScaler(with_mean=True, with_std=use_std).fit(block).transform(block)
            if rescale_by_norm:
                block = block / np.mean(la.norm(block, ord=order, axis=1))
            blocks.append(block)
        new_data = pd.DataFrame(np.hstack(blocks), columns=activations.columns)
        new_path = path + '_norm-' + str(order).replace('np.', '')
        check_folder(new_path)
        file_name = name + '_run{}.csv'.format(run_index + 1)
        new_data.to_csv(os.path.join(new_path, file_name), index=False)


Defining variables:

In [4]:
# Language of the text stimuli.
language = 'english'
# Glob pattern matching the text file of every run (path to text input).
template = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/text_english_run*.txt'

In [5]:
# File-name pattern of the trained weights. Placeholders, in order: RNN type,
# embedding size, hidden size, number of layers, dropout code, memory size.
name_template = 'weights_{}_embedding-size-{}_nhid-{}_nlayers-{}_dropout-{}_memory-size-{}_wiki-kristina_english.pt'


In [6]:
# Memory sizes to sweep over (np.inf presumably means "no limit" — confirm in
# LSTMExtractor). This list drives the length of the whole sweep.
memory_sizes = [1, 2, 3, 4, 5, 7, 10, 12, 15, 17, 20, 25, 30, 35, 40, 50, np.inf]

# Every other hyper-parameter is constant across the sweep. The lists are
# sized from `memory_sizes` so they cannot drift out of sync with it (the
# cells below `zip` them together, which would silently truncate on mismatch).
rnn_types = ['LSTM'] * len(memory_sizes)
ninps = ['600'] * len(memory_sizes)
nhids = ['300'] * len(memory_sizes)
nlayers = ['1'] * len(memory_sizes)
dropouts = ['02'] * len(memory_sizes)

# Locations of the vocabulary, the model configs, the trained weights and the
# folder receiving the extracted representations.
vocab_path = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/lstm_training'
config_path_folder = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/models/LSTM/configs/'
trained_model_folder = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/models/english/'
path_to_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations'

In [7]:
# Disabled overrides for a local setup: uncommenting these lines re-points the
# variables of the same names at paths under /Users/alexpsq.
#template = '/Users/alexpsq/Code/Parietal/data/text_english_run*.txt' # path to text input
#config_path_folder = '/Users/alexpsq/Code/Parietal/data/configs/'
#trained_model_folder = '/Users/alexpsq/Code/Parietal'
#path_to_data = '/Users/alexpsq/Code/data/stimuli-representations'
#vocab_path = '/Users/alexpsq/Code/data/'

In [8]:
# Path to each model from which we want to retrieve the activations.
# os.path.join is used instead of string concatenation so the result is
# correct whether or not `trained_model_folder` ends with a path separator.
pretrained_lstm_models = [
    os.path.join(trained_model_folder, name_template.format(*hyper_parameters))
    for hyper_parameters in zip(rnn_types, ninps, nhids, nlayers, dropouts, memory_sizes)]
# '_'-separated tokens of each weights file name.
infos = [os.path.basename(model).split('_') for model in pretrained_lstm_models]
# Model name = weights file name without its extension and its leading 'weights' token.
names = ['_'.join(os.path.basename(model).split('.')[0].split('_')[1:]) for model in pretrained_lstm_models]
config_paths = [os.path.join(config_path_folder, 'config_' + name + '.yml') for name in names]
# Drop the third-from-last '_'-separated token (the memory-size part), so the
# config file name does not depend on the memory size.
config_paths = ['_'.join(config.split('_')[:-3]) + '_' + '_'.join(config.split('_')[-2:]) for config in config_paths]
# One output folder per model: <path_to_data>/<language>/<name>.
saving_path_folders = [
    os.path.join(path_to_data, '{}/{}'.format(language, name)) for name in names]
prediction_types = ['sequential'] * len(pretrained_lstm_models)

In [9]:
# Display the output folder computed for each model.
saving_path_folders

['/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-1_wiki-kristina_english',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-2_wiki-kristina_english',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-3_wiki-kristina_english',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-4_wiki-kristina_english',
 '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-5

In [10]:
def get_config(rnn_type='LSTM',
               language='english',
               ntoken=50001,
               ninp=650,
               nhid=650,
               nlayers=2,
               dropout='02',
               memory_size=np.inf,
               tie_weights=False,
               eos_separator='<eos>',
               cuda=True,
               weights_path=None,
               path_to_vocab=None,
               includ_surprisal=True,
               includ_entropy=True,
               parameters=['in', 'forget', 'out', 'c_tilde', 'hidden', 'cell']):
    """Build the configuration dictionary of one recurrent model.

    Args:
        rnn_type, language, ntoken, ninp, nhid, nlayers, memory_size,
        tie_weights, eos_separator, cuda, path_to_vocab, includ_surprisal,
        includ_entropy: Stored unchanged under the key of the same name.
        dropout (str): Digit string converted with `int(dropout) / 10`
            (e.g. '02' -> 0.2).
        weights_path (str): Folder containing the weights file. The file name
            is derived from the module-level `name_template`, without its
            memory-size token.
        parameters (list): Names stored under 'parameters'. The list is
            copied, so editing a returned config never alters the default
            value or the caller's list.

    Returns:
        dict: The configuration.

    Raises:
        TypeError: If `weights_path` is None (its default), since the weights
            file path cannot be built without a folder.
    """
    if weights_path is None:
        raise TypeError('weights_path must be the folder containing the model weights, got None')
    # The template always contains a memory-size token: format it with a dummy
    # value of 0, then strip that token from the resulting path.
    weights_file = name_template.format(rnn_type, ninp, nhid, nlayers, dropout, 0)
    config_template = {
        'rnn_type': rnn_type,
        'language': language,
        'ntoken': ntoken,
        'ninp': ninp,
        'nhid': nhid,
        'nlayers': nlayers,
        'dropout': int(dropout)/10,
        'memory_size': memory_size,
        'tie_weights': tie_weights,
        'eos_separator': eos_separator,
        'cuda': cuda,
        'weights_path': os.path.join(weights_path, weights_file).replace('_memory-size-0', ''),
        'path_to_vocab': path_to_vocab,
        'includ_surprisal': includ_surprisal,
        'includ_entropy': includ_entropy,
        # Copy to avoid sharing the mutable default list between configs.
        'parameters': list(parameters)}
    return config_template


In [11]:
# Write one YAML configuration per entry of the hyper-parameter sweep.
# NOTE(review): the entries of `config_paths` had their memory-size token
# removed, so they appear to all name the same file and each iteration
# overwrites the previous one — confirm that this is intended.
for index, (rnn_type, ninp, nhid, nlayer, dropout, memory_size) in enumerate(zip(rnn_types, ninps, nhids, nlayers, dropouts, memory_sizes)):
    config_template = get_config(
        rnn_type=rnn_type,
        # Use the notebook-level `language` like the other cells, rather than
        # a second hard-coded copy of its value.
        language=language,
        ntoken=50001,
        ninp=int(ninp),
        nhid=int(nhid),
        nlayers=int(nlayer),
        dropout=dropout,
        memory_size=memory_size,
        weights_path=trained_model_folder,
        path_to_vocab=vocab_path,
        parameters=['hidden'])
    save_yaml(config_template, config_paths[index])

In [12]:
# Show the configuration built by the last iteration of the loop above.
config_template

{'rnn_type': 'LSTM',
 'language': 'english',
 'ntoken': 50001,
 'ninp': 600,
 'nhid': 300,
 'nlayers': 1,
 'dropout': 0.2,
 'memory_size': inf,
 'tie_weights': False,
 'eos_separator': '<eos>',
 'cuda': True,
 'weights_path': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/derivatives/fMRI/models/english/weights_LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_wiki-kristina_english.pt',
 'path_to_vocab': '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/lstm_training',
 'includ_surprisal': True,
 'includ_entropy': True,
 'parameters': ['hidden']}

Creating an iterator for each run:

In [13]:
# Collect the run files matching the glob pattern, in lexicographic order.
paths = glob.glob(template)
paths.sort()

In [14]:
# Build the vocabulary object from `vocab_path` and `language`; it is passed
# to `tokenize` below (see `data.Dictionary` for how the files are loaded).
vocab = Dictionary(vocab_path, language)

In [15]:
# Tokenise every run file with the shared vocabulary (train=False); the result
# holds one entry per run, in the same order as `paths`.
iterator_list = [
    tokenize(run_file, language, train=False, vocab=vocab)
    for run_file in paths
]

100%|██████████| 135/135 [00:00<00:00, 154834.85it/s]
100%|██████████| 135/135 [00:00<00:00, 621.31it/s]
100%|██████████| 135/135 [00:00<00:00, 210338.42it/s]
100%|██████████| 135/135 [00:00<00:00, 533.69it/s]
100%|██████████| 176/176 [00:00<00:00, 243549.16it/s]
100%|██████████| 176/176 [00:00<00:00, 711.79it/s]
100%|██████████| 173/173 [00:00<00:00, 201727.72it/s]
100%|██████████| 173/173 [00:00<00:00, 772.33it/s]
100%|██████████| 177/177 [00:00<00:00, 231780.15it/s]
100%|██████████| 177/177 [00:00<00:00, 913.51it/s]
100%|██████████| 216/216 [00:00<00:00, 266461.67it/s]
100%|██████████| 216/216 [00:00<00:00, 1026.84it/s]
100%|██████████| 196/196 [00:00<00:00, 229120.29it/s]
100%|██████████| 196/196 [00:00<00:00, 740.46it/s]
100%|██████████| 145/145 [00:00<00:00, 221961.34it/s]
100%|██████████| 145/145 [00:00<00:00, 709.31it/s]
100%|██████████| 207/207 [00:00<00:00, 257403.18it/s]
100%|██████████| 207/207 [00:00<00:00, 1008.55it/s]


## Activation extraction

In [None]:
# For every model configuration, extract the activations of each run and save
# them as `activations_run<N>.csv` (N is 1-based) in the model's output folder.
for index, config in enumerate(config_paths):
    extractor = LSTMExtractor(
        config,
        language,
        names[index],
        prediction_types[index],
        output_hidden_states=True,
        memory_size=memory_sizes[index])
    print(extractor.name, ' - Extracting activations ...')
    # Wrap the list itself rather than the enumerate object: tqdm can then read
    # its length and display a real progress bar instead of a bare counter.
    for run_index, iterator in enumerate(tqdm(iterator_list)):
        # NOTE: this log line is 0-based whereas the file name below is 1-based.
        print("############# Run {} #############".format(run_index))
        check_folder(saving_path_folders[index])
        activations  = extractor.extract_activations(iterator, language)

        # Optional centering step, currently disabled:
        #transform(
        #    activations, 
        #    saving_path_folders[index], 
        #    'activations', 
        #    run_index=run_index,
        #    n_layers_hidden=1,
        #    hidden_size=300)  
        activations.to_csv(os.path.join(saving_path_folders[index], 'activations_run{}.csv'.format(run_index + 1)), index=False)
        
        
        

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
9it [00:00, 82.51it/s][A

LSTM_embedding-size_600_nhid_300_nlayers_1_dropout_02  - Extracting activations ...
############# Run 0 #############



17it [00:00, 78.61it/s][A
26it [00:00, 78.21it/s][A
36it [00:00, 82.96it/s][A
43it [00:00, 76.88it/s][A
62it [00:00, 91.77it/s][A
77it [00:00, 100.88it/s][A
88it [00:00, 88.77it/s] [A
98it [00:01, 79.87it/s][A
118it [00:01, 97.36it/s][A
130it [00:01, 95.03it/s][A
142it [00:01, 87.14it/s][A
152it [00:01, 87.01it/s][A
179it [00:01, 109.05it/s][A
194it [00:01, 118.39it/s][A
209it [00:01, 126.26it/s][A
232it [00:02, 145.27it/s][A
254it [00:02, 161.73it/s][A
273it [00:02, 135.56it/s][A
289it [00:02, 118.49it/s][A
303it [00:02, 110.18it/s][A
316it [00:02, 112.77it/s][A
329it [00:02, 106.27it/s][A
341it [00:03, 94.61it/s] [A
352it [00:03, 80.86it/s][A
362it [00:03, 76.07it/s][A
371it [00:03, 72.40it/s][A
381it [00:03, 76.45it/s][A
395it [00:03, 86.29it/s][A
406it [00:03, 91.38it/s][A
416it [00:03, 85.42it/s][A
426it [00:04, 81.24it/s][A
435it [00:04, 81.28it/s][A
444it [00:04, 78.95it/s][A
453it [00:04, 80.02it/s][A
463it [00:04, 83.95it/s][A
472it [00:04, 

############# Run 1 #############



22it [00:00, 108.60it/s][A
28it [00:00, 87.16it/s] [A
38it [00:00, 89.84it/s][A
47it [00:00, 89.43it/s][A
58it [00:00, 90.77it/s][A
66it [00:00, 78.86it/s][A
90it [00:00, 97.45it/s][A
116it [00:00, 119.86it/s][A
132it [00:01, 107.72it/s][A
146it [00:01, 94.16it/s] [A
161it [00:01, 104.19it/s][A
186it [00:01, 125.84it/s][A
213it [00:01, 149.64it/s][A
233it [00:01, 139.07it/s][A
251it [00:01, 131.16it/s][A
267it [00:02, 113.97it/s][A
281it [00:02, 96.86it/s] [A
293it [00:02, 94.68it/s][A
304it [00:02, 89.83it/s][A
314it [00:02, 80.12it/s][A
324it [00:02, 83.99it/s][A
336it [00:03, 91.64it/s][A
346it [00:03, 93.96it/s][A
356it [00:03, 90.46it/s][A
366it [00:03, 87.92it/s][A
376it [00:03, 89.02it/s][A
386it [00:03, 85.41it/s][A
398it [00:03, 90.65it/s][A
411it [00:03, 97.62it/s][A
424it [00:03, 103.23it/s][A
435it [00:04, 94.98it/s] [A
445it [00:04, 84.97it/s][A
454it [00:04, 80.23it/s][A
465it [00:04, 84.95it/s][A
492it [00:04, 106.77it/s][A
518it [00:0

############# Run 2 #############



13it [00:00, 53.02it/s][A
32it [00:00, 66.91it/s][A
42it [00:00, 72.96it/s][A
51it [00:00, 71.82it/s][A
67it [00:00, 85.68it/s][A
82it [00:00, 97.84it/s][A
94it [00:00, 102.12it/s][A
111it [00:00, 115.49it/s][A
137it [00:01, 138.47it/s][A
165it [00:01, 162.85it/s][A
186it [00:01, 172.32it/s][A
207it [00:01, 141.16it/s][A
225it [00:01, 128.46it/s][A
241it [00:01, 103.54it/s][A
254it [00:02, 90.09it/s] [A
265it [00:02, 80.44it/s][A
275it [00:02, 84.87it/s][A
285it [00:02, 86.99it/s][A
295it [00:02, 78.15it/s][A
304it [00:02, 77.08it/s][A
315it [00:02, 84.31it/s][A
325it [00:02, 85.19it/s][A
334it [00:03, 82.67it/s][A
344it [00:03, 86.85it/s][A
365it [00:03, 105.18it/s][A
393it [00:03, 128.08it/s][A

In [17]:
# Display the model names (one per memory size).
names

['LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-1_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-2_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-3_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-4_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-5_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-7_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-10_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-12_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-15_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_memory-size-17_wiki-kristina_english',
 'LSTM_embedding-size-600_nhid-300_nlayers-1_dropout-02_

In [26]:
# Inspect the most recently extracted activations: hidden-unit columns followed
# by `surprisal` and `entropy`.
# NOTE(review): the execution counts of the last cells are not sequential, so
# this output may be stale with respect to the extraction loop — re-run to confirm.
activations

Unnamed: 0,hidden-layer-1-1,hidden-layer-1-2,hidden-layer-1-3,hidden-layer-1-4,hidden-layer-1-5,hidden-layer-1-6,hidden-layer-1-7,hidden-layer-1-8,hidden-layer-1-9,hidden-layer-1-10,...,hidden-layer-1-293,hidden-layer-1-294,hidden-layer-1-295,hidden-layer-1-296,hidden-layer-1-297,hidden-layer-1-298,hidden-layer-1-299,hidden-layer-1-300,surprisal,entropy
0,-0.009343,0.004728,0.020219,0.001229,-0.002103,0.006810,0.023671,0.456862,0.020347,0.027560,...,-0.136505,-0.008620,-0.000140,0.019614,-0.019661,-0.008184,-0.002025,-0.021357,7.510410,3.288988
1,0.059053,-0.291021,-0.023248,-0.000256,-0.010412,-0.000283,0.649610,0.002013,0.011817,0.000040,...,-0.006221,-0.016686,0.031706,-0.024747,-0.013827,0.022457,0.143735,-0.028213,2.280959,4.822252
2,0.003183,-0.043191,-0.005218,-0.000076,-0.097851,0.000788,0.073366,-0.093171,0.028572,0.000750,...,-0.018625,-0.016255,0.002736,0.011992,-0.003812,-0.007940,0.048375,-0.019784,6.703503,6.749106
3,-0.075047,-0.260891,0.710530,-0.013033,-0.414856,0.008764,-0.004385,0.008527,-0.074609,-0.011487,...,0.004501,-0.022174,0.029474,-0.000377,0.279350,0.002559,0.232537,0.002798,9.189131,3.540587
4,0.000112,-0.271419,0.002731,-0.000031,-0.512300,-0.001794,0.322740,0.002040,0.039313,-0.039021,...,-0.043292,-0.020852,0.025520,0.000300,0.059670,-0.497802,0.041170,0.009986,0.830372,6.087246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,0.050916,-0.040301,-0.002676,-0.079301,-0.012413,-0.000150,0.019106,0.015590,-0.041344,0.000258,...,-0.070237,0.211453,0.000145,0.002055,-0.428825,-0.569217,-0.261878,-0.331965,4.751386,6.544256
2526,-0.075146,-0.021213,0.002198,-0.016276,-0.003278,-0.000257,0.319500,0.043155,-0.015755,0.000444,...,-0.002624,0.252582,0.000002,0.000143,-0.379103,0.065756,0.127261,-0.389973,9.103417,9.232536
2527,-0.099023,-0.063729,0.020083,-0.020541,0.005853,-0.004721,0.007316,0.000683,-0.007743,-0.001299,...,0.001139,0.137877,-0.000093,-0.001862,0.209638,0.090016,0.011607,-0.433547,10.239527,8.252920
2528,0.488984,0.042901,-0.002047,-0.009047,-0.086916,-0.023077,-0.424121,0.011148,0.004937,-0.014340,...,-0.044774,0.211255,0.000910,0.018582,0.088907,-0.060101,0.045650,-0.372656,8.623935,5.192456
