# Notebook to extract hidden-state and attention-head activations from RoBERTa model predictions

In [1]:
import os
import glob
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from model import RobertaExtractor
from tokenizer import tokenize


import torch
from sklearn.preprocessing import StandardScaler
from utils import set_seed
from numpy import linalg as la
import utils

In [2]:
def check_folder(path):
    """Create `path` and any missing parent folders.

    Does nothing when the folder already exists or when `path` is empty.
    The call is best-effort: a filesystem error (e.g. permission denied, or a
    regular file standing where a folder is expected) is reported as a warning
    rather than raised.

    Args:
        path: folder to create.
    """
    import warnings  # local import: only needed on the failure path

    if not path:
        # os.path.dirname() of a bare name is '', there is nothing to create.
        return
    try:
        # exist_ok=True also covers the race where another process creates
        # the folder between the check and the creation.
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        warnings.warn('Could not create folder {!r}: {}'.format(path, error))

In [3]:
def transform(activations, path, name, run_index, n_layers_hidden=13, n_layers_attention=12, hidden_size=768, orders=(2,)):
    """Rescale each layer's block of columns and save the result as CSV.

    The columns of `activations` are read as
    `n_layers_hidden + n_layers_attention` consecutive blocks of `hidden_size`
    columns. For every norm order in `orders`, each block is divided by the
    mean norm of its rows, the blocks are put back side by side, and the
    frame is written to `<path>_norm-<order>/<name>_run<run_index + 1>.csv`.

    Args:
        activations: pandas DataFrame with
            `(n_layers_hidden + n_layers_attention) * hidden_size` columns.
        path: prefix of the output folder; `_norm-<order>` is appended to it.
        name: prefix of the output file name.
        run_index: zero-based run index (the file name uses `run_index + 1`).
        n_layers_hidden: number of hidden-state blocks.
        n_layers_attention: number of attention blocks.
        hidden_size: number of columns per block.
        orders: norm orders forwarded to `numpy.linalg.norm(..., ord=order)`.
            `None` and `'std'` write the blocks unscaled.

    Raises:
        AssertionError: if the number of columns does not match the layout.
    """
    n_blocks = n_layers_hidden + n_layers_attention
    values = activations.values  # fetched once instead of once per block
    assert values.shape[1] == n_blocks * hidden_size, (
        'Expected {} columns ({} blocks of {}), got {}'.format(
            n_blocks * hidden_size, n_blocks, hidden_size, values.shape[1]))

    for order in orders:
        blocks = []
        for block_index in range(n_blocks):
            start = block_index * hidden_size
            block = values[:, start:start + hidden_size]
            if order is not None and order != 'std':
                mean_norm = np.mean(la.norm(block, ord=order, axis=1))
                # A zero mean norm means the block is all zeros; dividing
                # would turn it into NaN, so it is left untouched.
                if mean_norm != 0:
                    block = block / mean_norm
            blocks.append(block)
        new_data = pd.DataFrame(np.hstack(blocks), columns=activations.columns)
        new_path = path + '_norm-' + str(order).replace('np.', '')
        check_folder(new_path)
        new_data.to_csv(os.path.join(new_path, name + '_run{}.csv'.format(run_index + 1)), index=False)


Defining variables:

In [4]:
# Language of the stimuli; it appears both in the folder and in the file names.
language = 'english'
# Glob pattern matching the per-run input text files.
template = (
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/'
    '{lang}/text_{lang}_run*.txt'
).format(lang=language)
# Local alternative:
#template = '/Users/alexpsq/Code/Parietal/data/text_english_run*.txt' # path to text input


Creating an iterator for each run:

In [5]:
# Sorted so that list position follows the run number in the file name.
# NOTE(review): the order is lexicographic, so 'run10' would sort before
# 'run2' -- fine as long as there are fewer than 10 runs.
paths = sorted(glob.glob(template))
# Fail fast: with an empty list every later loop silently does nothing.
if not paths:
    raise FileNotFoundError('No input text file matches: {}'.format(template))

In [6]:
# One tokenized iterator per input text file, in the same order as `paths`.
iterator_list = [
    tokenize(text_path, language, train=False)
    for text_path in paths
]

100%|██████████| 135/135 [00:00<00:00, 529682.92it/s]
100%|██████████| 135/135 [00:00<00:00, 784253.52it/s]
100%|██████████| 176/176 [00:00<00:00, 967493.45it/s]
100%|██████████| 173/173 [00:00<00:00, 1012014.77it/s]
100%|██████████| 177/177 [00:00<00:00, 987223.15it/s]
100%|██████████| 216/216 [00:00<00:00, 1047363.77it/s]
100%|██████████| 196/196 [00:00<00:00, 1100513.50it/s]
100%|██████████| 145/145 [00:00<00:00, 716341.67it/s]
100%|██████████| 207/207 [00:00<00:00, 766302.67it/s]


In [7]:
#from transformers import AutoTokenizer, RobertaTokenizer
#
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#
#for index in [0]:
#    batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
#                iterator_list[index], 
#                1, 
#                1, 
#                0, 
#                'roberta-base', 
#                max_length=512)
#
#for index, batch in enumerate(batches):
#    batch = '<s> ' + batch + ' </s>'
#    tokenized_text = tokenizer.tokenize(batch, add_special_tokens=False)
#    print(batch)
#    beg = indexes[index][0] + 1 # because of the special token at the beginning
#    end = indexes[index][1] + 1
#    print(indexes[index][0], indexes[index][1])
#    print(tokenized_text)
#    print(tokenized_text[beg:end])
#    print()

## Activation extraction

In [8]:
# Checkpoints to run.
# NOTE(review): the extraction loop enumerates this list, so with a single
# entry only configuration index 0 (pre-1) is actually processed -- confirm
# whether one entry per configuration below was intended.
pretrained_roberta_models = ['roberta-base']

# One entry per configuration; passed to the extractor as
# `number_of_sentence_before`. Every other per-configuration list below is
# derived from it so that their lengths cannot drift apart.
number_of_sentence_before_list = [1, 2, 5, 7, 10, 15, 20] #
n_configs = len(number_of_sentence_before_list)

names = [
    'roberta-base_pre-{}_1_post-0'.format(n_before)
    for n_before in number_of_sentence_before_list
]
config_paths = [None] * n_configs
saving_path_folders = [
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/{}/roberta_pre-{}_1_post-0'.format(language, n_before)
    for n_before in number_of_sentence_before_list
]
prediction_types = ['control-context'] * n_configs
number_of_sentence_list = [1] * n_configs
number_of_sentence_after_list = [0] * n_configs


attention_length_before_list = [2] * n_configs
attention_length_after_list = [3] * n_configs
stop_attention_at_sent_before_list = [None] * n_configs
stop_attention_before_sent_list = [0] * n_configs


In [9]:
# For every configured model: build the extractor, then extract, normalise
# and save the hidden-state activations of each run.
for index, roberta_model in enumerate(pretrained_roberta_models):
    extractor = RobertaExtractor(
        roberta_model,
        language,
        names[index],
        prediction_types[index],
        output_hidden_states=True,
        output_attentions=False,
        config_path=config_paths[index],
        max_length=512,
        number_of_sentence=number_of_sentence_list[index],
        number_of_sentence_before=number_of_sentence_before_list[index],
        number_of_sentence_after=number_of_sentence_after_list[index],
        attention_length_before=attention_length_before_list[index],
        attention_length_after=attention_length_after_list[index],
    )
    print(extractor.name, ' - Extracting activations ...')
    # tqdm wraps the list (not the enumerate object) so that it knows the
    # total number of runs and can display real progress.
    for run_index, iterator in enumerate(tqdm(iterator_list)):
        print("############# Run {} #############".format(run_index+1))
        activations = extractor.extract_activations(iterator, language)
        # Element 0 holds the hidden-state activations; the attention-head
        # activations (element 1) are not saved by this notebook.
        hidden_states_activations = activations[0]

        transform(
            hidden_states_activations,
            saving_path_folders[index],
            'activations',
            run_index=run_index,
            n_layers_hidden=13,
            n_layers_attention=0,
            hidden_size=768)

        # Drop every reference to this run's data before collecting;
        # deleting `activations` alone would keep the frame alive through
        # `hidden_states_activations`.
        del activations, hidden_states_activations
        gc.collect()
        

        

0it [00:00, ?it/s]

roberta-base_pre-1_1_post-0  - Extracting activations ...
############# Run 1 #############
(13, 32, 768)
(13, 13, 768)
(13, 8, 768)
(13, 21, 768)
(13, 22, 768)


0it [00:06, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Applications/anaconda3/envs/parietal/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-82c11d23a320>", line 20, in <module>
    activations  = extractor.extract_activations(iterator, language)
  File "/Users/alexpsq/Code/Parietal/NLP_models/ROBERTA/model.py", line 118, in extract_activations
    activations = self.get_token_level_activations(iterator, language)
  File "/Users/alexpsq/Code/Parietal/NLP_models/ROBERTA/model.py", line 254, in get_token_level_activations
    encoded_layers = self.model(inputs_ids, attention_mask=attention_mask) # last_hidden_state, pooler_output, hidden_states, attentions
  File "/Applications/anaconda3/envs/parietal/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/alexpsq/Code/Parietal/NLP_models/ROBERTA/model

KeyboardInterrupt: 

In [None]:
# Scratch arithmetic: the quotient is only displayed, never assigned or reused.
# 294912 / 12 = 24576.0, which is the numerator of the next scratch cell.
294912/12

In [None]:
# Scratch arithmetic: 24576.0 / 12 = 2048.0, the numerator of the next scratch cell.
24576.0/12

In [None]:
# Scratch arithmetic: 2048.0 / 32 = 64.0.
# NOTE(review): presumably a sanity check on an activation dimension -- confirm or delete.
2048.0/32