# Notebook to extract hidden-state and attention-head activations from RoBERTa model predictions

In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm
from model import RobertaExtractor
from tokenizer import tokenize

In [3]:
def check_folder(path):
    """Create the directory `path`, including any missing parent directories.

    Does nothing when `path` is empty/None or already exists as a directory.
    Filesystem errors never propagate to the caller: if the directory cannot
    be created (e.g. missing permissions, or a regular file is in the way),
    a RuntimeWarning is emitted so the failure is visible instead of being
    silently ignored.

    Args:
        path (str): Directory to create.
    """
    import warnings  # local import keeps this helper self-contained

    if not path:
        # Nothing to create (e.g. the dirname of a bare file name is '').
        return
    try:
        # exist_ok=True makes the call idempotent and tolerant of another
        # process creating the same folder at the same time.
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        warnings.warn(
            "Could not create folder '{}': {}".format(path, error),
            RuntimeWarning,
        )

Defining variables:

In [4]:
# Language of the stimuli. It selects the input text files here and is also
# used to build the output folder in the model configuration cell.
language = 'english'
# Glob pattern matching the per-run input text files for `language`.
# NOTE(review): assumes every language follows the same
# text/<language>/text_<language>_run*.txt layout — confirm before switching language.
template = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/{0}/text_{0}_run*.txt'.format(language) # path to text input
# Alternative value noted by the author: 'sequential'.
# NOTE(review): the extraction loop reads the `prediction_types` list, not this variable.
prediction_type = 'sentence'


In [5]:
# Per-model configuration. The lists are parallel: entry i of each list
# describes model i, and the extraction loop indexes all of them with the
# same position.
pretrained_roberta_models = ['roberta-base']  # first argument given to RobertaExtractor
names = ['roberta-base']                      # name passed to the extractor for each model
config_paths = [None]                         # passed as `config_path` to the extractor
saving_path_folders = [
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/{}/roberta-base'.format(language)]
prediction_types = ['sentence']

# Fail fast on an incomplete configuration instead of hitting an IndexError
# in the extraction loop after a model has already been loaded.
assert all(
    len(per_model_values) >= len(pretrained_roberta_models)
    for per_model_values in (names, config_paths, saving_path_folders, prediction_types)
), 'Each per-model list needs one entry per model in pretrained_roberta_models.'

In [6]:
# Display the configured model names.
names

['roberta-base']

Creating an iterator for each run:

In [7]:
# Input text files matching `template`, sorted so the order is deterministic.
# NOTE(review): the sort is lexicographic, so 'run10' would come before 'run2';
# this is only safe while run numbers are single-digit or zero-padded.
paths = sorted(glob.glob(template))
# glob returns an empty list (no error) when nothing matches, which would turn
# every later cell into a silent no-op; surface the problem here instead.
if not paths:
    raise FileNotFoundError("No input text file matches '{}'".format(template))

In [8]:
# Tokenize each input file (with train=False); results keep the order of `paths`.
iterator_list = list(map(
    lambda text_path: tokenize(text_path, language, train=False),
    paths,
))

100%|██████████| 135/135 [00:00<00:00, 210808.28it/s]
100%|██████████| 135/135 [00:00<00:00, 143458.59it/s]
100%|██████████| 176/176 [00:00<00:00, 268826.48it/s]
100%|██████████| 173/173 [00:00<00:00, 232792.62it/s]
100%|██████████| 177/177 [00:00<00:00, 271242.90it/s]
100%|██████████| 216/216 [00:00<00:00, 282938.68it/s]
100%|██████████| 196/196 [00:00<00:00, 301239.86it/s]
100%|██████████| 145/145 [00:00<00:00, 272846.16it/s]
100%|██████████| 207/207 [00:00<00:00, 361307.09it/s]

Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.





## Activation extraction

In [11]:
# Extract activations for every configured model and every run, and save them as CSV files.
# NOTE(review): the output recorded with this notebook shows this cell stopping during the
# first run with a "input array dimensions for the concatenation axis must match" ValueError.
# The traceback is truncated, so the failing call (possibly inside extract_activations)
# has to be confirmed by re-running.
for index, roberta_model in enumerate(pretrained_roberta_models):
    extractor = RobertaExtractor(
        roberta_model,
        language,
        names[index],
        prediction_types[index],
        output_hidden_states=True,
        output_attentions=True,
        config_path=config_paths[index],
    )
    print(extractor.name, ' - Extracting activations ...')
    saving_folder = saving_path_folders[index]
    # The output folder only depends on the model, so create it once per model
    # rather than on every run.
    check_folder(saving_folder)
    # Wrap the list itself, not the enumerate object: tqdm then knows how many runs
    # there are and shows real progress instead of "0it". Runs are numbered from 1
    # so the printed run number matches the number used in the output file names.
    for run_number, iterator in enumerate(tqdm(iterator_list), start=1):
        print("############# Run {} #############".format(run_number))
        # Items 0 and 1 of the result are concatenated column-wise below; items 2 and 3
        # are unpacked as (hidden-state, attention) pairs — for the CLS and SEP tokens,
        # judging by the variable names. Presumably pandas objects; TODO confirm.
        extracted = extractor.extract_activations(iterator, language)
        hidden_states_activations = extracted[0]
        attention_heads_activations = extracted[1]
        (cls_hidden_states_activations, cls_attention_activations) = extracted[2]
        (sep_hidden_states_activations, sep_attention_activations) = extracted[3]
        # Put hidden-state and attention columns side by side in a single table.
        activations = pd.concat([hidden_states_activations, attention_heads_activations], axis=1)
        cls_activations = pd.concat([cls_hidden_states_activations, cls_attention_activations], axis=1)
        sep_activations = pd.concat([sep_hidden_states_activations, sep_attention_activations], axis=1)

        # One CSV per table and per run, without the index column.
        activations.to_csv(os.path.join(saving_folder, 'activations_run{}.csv'.format(run_number)), index=False)
        cls_activations.to_csv(os.path.join(saving_folder, 'cls_run{}.csv'.format(run_number)), index=False)
        sep_activations.to_csv(os.path.join(saving_folder, 'sep_run{}.csv'.format(run_number)), index=False)
        
        

0it [00:00, ?it/s]

roberta-base  - Extracting activations ...
############# Run 0 #############


0it [00:16, ?it/s]


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 19968 and the array at index 4 has size 29952