# Notebook to extract hidden-states and attention heads activations from bert model predictions

In [1]:
import os
import glob
import pandas as pd
from tqdm import tqdm
from model import BertExtractor
from tokenizer import tokenize

In [7]:
def check_folder(path):
    """Create adequate folders if necessary."""
    try:
        if not os.path.isdir(path):
            check_folder(os.path.dirname(path))
            os.mkdir(path)
    except:
        pass

Defining variables:

In [2]:
template = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/text_english_run*.txt' # path to text input
language = 'english'
prediction_type = 'sentence'

In [3]:
pretrained_bert_models = [
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/models/BERT/NER_CONLL2003_bert_base_cased/fine_tuned',
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/models/BERT/SENTENCE_CLASSIFICATION_SST-2_bert_base_cased/fine_tuned',
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/models/BERT/SENTENCE_CLASSIFICATION_COLA_bert_base_cased/fine_tuned',
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/models/BERT/POS_CONLL2003_bert_base_cased/fine_tuned'
] # path to the model from which we want to retrieve the activations
infos = [os.path.basename(os.path.dirname(model)).split('_') for model in pretrained_bert_models]
names = ['{}_{}_{}_{}'.format(info[2], info[0], info[1], os.path.basename(model)) for (info, model) in zip(infos, pretrained_bert_models)]
config_paths = [os.path.join(os.path.dirname(model), 'config.yml') for model in pretrained_bert_models]
saving_path_folders = [
    '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/{}/{}'.format(language, name) for name in names]

Creating iterator for each run:

In [4]:
paths = sorted(glob.glob(template))

In [5]:
iterator_list = [tokenize(path, language, train=False) for path in paths]

100%|██████████| 135/135 [00:00<00:00, 237513.02it/s]
100%|██████████| 135/135 [00:00<00:00, 252331.12it/s]
100%|██████████| 176/176 [00:00<00:00, 311607.22it/s]
100%|██████████| 173/173 [00:00<00:00, 319232.11it/s]
100%|██████████| 177/177 [00:00<00:00, 328491.95it/s]
100%|██████████| 216/216 [00:00<00:00, 332343.97it/s]
100%|██████████| 196/196 [00:00<00:00, 245471.36it/s]
100%|██████████| 145/145 [00:00<00:00, 145010.51it/s]
100%|██████████| 207/207 [00:00<00:00, 291153.90it/s]

Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.





## Activation extraction

In [6]:
for index, bert_model in enumerate(pretrained_bert_models):
    extractor = BertExtractor(bert_model, language, names[index], prediction_type, output_hidden_states=True, output_attentions=True, config_path=config_paths[index])
    print(extractor.name, ' - Extracting activations ...')
    for run_index, iterator in tqdm(enumerate(iterator_list)):
        print("############# Run {} #############".format(run_index))
        check_folder(saving_path_folders[index])
        hidden_states_activations, attention_heads_activations  = extractor.extract_activations(iterator, language)
        hidden_states_activations.to_csv(os.path.join(saving_path_folders[index], 'hidden_states_run{}.csv'.format(run_index)), index=False)
        attention_heads_activations.to_csv(os.path.join(saving_path_folders[index], 'attention_heads_run{}.csv'.format(run_index)), index=False)

0it [00:00, ?it/s]

bert_NER_CONLL2003_fine_tuned  - Extracting activations ...
############# Run 0 #############


0it [00:16, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/stimuli-representations/english/bert_NER_CONLL2003_fine_tuned/hidden_states_run0.csv'