# Notebook to evaluate model entropy

In [1]:
from transformers import BertTokenizer, GPT2Tokenizer,  GPT2LMHeadModel, GPT2Tokenizer, BertForMaskedLM
import scipy
import pandas as pd
import numpy as np
import os
import torch
import glob

In [7]:
from GPT2.tokenizer import tokenize
from LSTM.tokenizer import unk_transform
#from LSTM.model import LSTMExtractor
from LSTM.data import Dictionary

### Functions

In [8]:
def entropy(pk):
    """Shannon entropy (in bits) of a probability distribution.

    Arguments:
        - pk: probabilities, given as a torch tensor or as anything numpy
          can turn into an array (ndarray, list, ...). The sum runs over
          axis 0, so a 2-D input yields one entropy per column.
    Returns:
        - entropy: numpy scalar for 1-D input, ndarray for N-D input
    """
    # Accept torch tensors (also ones that require grad or live on another
    # device) as well as plain numpy arrays and sequences.
    if hasattr(pk, 'detach'):
        pk = pk.detach().cpu().numpy()
    pk = np.asarray(pk)
    # By convention 0 * log2(0) == 0. Replacing exact zeros by 1 before the
    # log makes those terms vanish instead of producing NaN.
    safe_pk = np.where(pk != 0, pk, 1.0)
    return -np.sum(pk * np.log2(safe_pk), axis=0)

In [9]:
def eval_output(out):
    """Sum of the per-position predictive entropies (in bits) of a model output.

    Arguments:
        - out: model output whose first element (out[0]) holds the logits,
          with the vocabulary on the last axis
          (assumes shape (1, seq_len, vocab_size) — TODO confirm)
    Returns:
        - result: float, entropy summed over every position
    """
    logits = out[0].detach().float()
    # Work in log space: log_softmax is numerically stable, whereas a plain
    # softmax can underflow to 0 and turn p * log(p) into NaN.
    log_probs = torch.log_softmax(logits, dim=-1)
    probs = log_probs.exp()
    # Terms with zero probability contribute nothing to the entropy.
    terms = torch.where(probs > 0, probs * log_probs, torch.zeros_like(probs))
    nats = -terms.sum()
    # Convert from nats to bits.
    return float(nats) / float(np.log(2.0))

### Model instantiation

In [38]:
# Load the pretrained 'gpt2' language-model head and its matching tokenizer.
model_base = GPT2LMHeadModel.from_pretrained('gpt2')
t_base = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Load the pretrained 'gpt2-medium' language-model head and its matching tokenizer.
model_medium = GPT2LMHeadModel.from_pretrained('gpt2-medium')
t_medium = GPT2Tokenizer.from_pretrained('gpt2-medium')

In [33]:
# Load the pretrained 'bert-base-cased' masked-LM model and its matching tokenizer.
model_bert = BertForMaskedLM.from_pretrained('bert-base-cased')
t_bert = BertTokenizer.from_pretrained('bert-base-cased') 

In [None]:
# NOTE(review): this cell cannot run as written. The LSTMExtractor import in
# the import cell is commented out, and `...` is a placeholder for the real
# constructor arguments. model_lstm is not used by any later cell shown here.
model_lstm = LSTMExtractor(...)

In [None]:
# Read the precomputed LSTM activations for run 1 and keep the 'entropy' column.
data = pd.read_csv('data/stimuli-representations/english/LSTM_embedding-size_600_nhid_300_nlayers_1_dropout_02_wiki_kristina_english/activations_run1.csv')
lstm_result = data['entropy']

### Data retrieval 

In [13]:
# Language identifier passed to tokenize() and Dictionary() in later cells.
language = 'english'

In [14]:
# Glob pattern matching the per-run text files used as input.
# NOTE(review): hard-coded absolute path; switch between the two lines below
# depending on the machine.
#template = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/text_english_run*.txt' # path to text input
template = '/Users/alexpsq/Code/Parietal/data/text_english_run*.txt'


In [15]:
# Expand the glob pattern into a sorted list of matching file paths.
paths = sorted(glob.glob(template))

In [16]:
# Tokenize every file with GPT2.tokenizer.tokenize (train=False); one entry per path.
# Judging by the outputs shown later, each entry is a list of sentence strings.
iterator_list = [tokenize(path, language, train=False) for path in paths]

100%|██████████| 135/135 [00:00<00:00, 464123.80it/s]
100%|██████████| 135/135 [00:00<00:00, 865796.70it/s]
100%|██████████| 176/176 [00:00<00:00, 951285.44it/s]
100%|██████████| 173/173 [00:00<00:00, 836925.71it/s]
100%|██████████| 177/177 [00:00<00:00, 618659.84it/s]
100%|██████████| 216/216 [00:00<00:00, 710007.57it/s]
100%|██████████| 196/196 [00:00<00:00, 556967.20it/s]
100%|██████████| 145/145 [00:00<00:00, 259459.93it/s]
100%|██████████| 207/207 [00:00<00:00, 497547.81it/s]

Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.
Tokenizing...
Preprocessing...
Preprocessed.
Tokenized.





In [18]:
# Flatten the first file's sentences into one space-separated word list and
# report how many entries it holds.
res = ' '.join(iterator_list[0]).split(' ')
print(len(res))

2015


In [None]:
# Build the LSTM Dictionary from vocab_path for the chosen language.
# NOTE(review): hard-coded absolute path; the commented line is the local alternative.
vocab_path = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LePetitPrince/data/text/english/lstm_training'
#vocab_path = '/Users/alexpsq/Code/data/'
vocab = Dictionary(vocab_path, language)

In [None]:
# For every file, split each stripped sentence on single spaces and pass each
# word through unk_transform with the LSTM vocabulary, giving one flat word
# list per file.
iterator_list_lstm = [[unk_transform(word, vocab) for item in iterator_ for word in item.strip().split(' ')] for iterator_ in iterator_list]


In [232]:
def batchity(iterator, context_length, pretrained_bert, max_length=512, tokenizer=None):
    """Group sentences into overlapping text batches that fit a token budget.

    The first batch takes as many leading sentences as fit. Every following
    batch starts with already-seen sentences (the context, at least
    `context_length` word-piece tokens when possible) followed by as many
    new sentences as fit.

    Arguments:
        - iterator: sentence iterator (strings; each one is stripped first)
        - context_length: int, minimum number of context tokens wanted
        - pretrained_bert: str, identifier given to BertTokenizer.from_pretrained
        - max_length: int, token budget per batch; 2 slots are reserved for
          special tokens
        - tokenizer: optional, already-loaded tokenizer exposing
          `wordpiece_tokenizer.tokenize`; when given, `pretrained_bert` is
          not loaded (avoids reloading the tokenizer on every call)
    Returns:
        - batch: list of str, one space-joined text per batch
        - indexes: list of (start, end) tuples, where `start` is the number of
          context tokens at the head of the batch and `end` is the batch's
          total token count
    Raises:
        - ValueError: if a single sentence cannot fit in the token budget
    """
    sentences = [item.strip() for item in iterator]
    max_length -= 2 # for special tokens
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(pretrained_bert)

    assert context_length < max_length

    def count_tokens(text):
        return len(tokenizer.wordpiece_tokenizer.tokenize(text))

    # Tokenize each sentence once instead of on every pass over it.
    lengths = [count_tokens(sentence) for sentence in sentences]
    n = len(sentences)

    # A sentence that alone reaches the budget can never be placed in a batch;
    # fail loudly instead of looping forever.
    for position, length in enumerate(lengths):
        if length >= max_length:
            raise ValueError(
                'Sentence {} has {} tokens and cannot fit in a batch of at most {} tokens.'.format(
                    position, length, max_length - 1))

    def fill(token_count, position):
        # Greedily take sentences while the running total stays below the budget.
        while position < n and token_count + lengths[position] < max_length:
            token_count += lengths[position]
            position += 1
        return position

    batch = []
    indexes = []

    # First batch: no context, just as many leading sentences as fit.
    sentence_count = fill(0, 0)
    batch.append(' '.join(sentences[:sentence_count]))
    indexes.append((0, count_tokens(batch[-1])))

    while sentence_count < n:
        first_new = sentence_count

        # Walk backwards to collect context, stopping at the start of the text
        # (a negative index would silently wrap around to the last sentences).
        context_start = first_new
        context_tokens = 0
        while context_start > 0 and context_tokens < context_length:
            context_start -= 1
            context_tokens += lengths[context_start]

        # If the context leaves no room for the next sentence, drop the oldest
        # context sentences so that every iteration makes progress.
        while context_start < first_new and context_tokens + lengths[first_new] >= max_length:
            context_tokens -= lengths[context_start]
            context_start += 1

        sentence_count = fill(context_tokens, first_new)
        batch.append(' '.join(sentences[context_start:sentence_count]))
        indexes.append((count_tokens(' '.join(sentences[context_start:first_new])),
                        count_tokens(batch[-1])))
    return batch, indexes

In [221]:
# Show the first five sentences of the first file, once joined into a single
# string and once as a flat list of their space-separated pieces.
print(' '.join(iterator_list[0][0:5]))
print([item for l in [iterator_list[0][index] for index in range(5)] for item in l.split(' ')])

Once , when I was six years old , I saw a magnificent picture in a book about the primeval forest called ‘ Real - life Stories . ’  It showed a boa constrictor swallowing a wild animal .  Here is a copy of the drawing . It said in the book : “ Boa constrictors swallow their prey whole , without chewing .  Then they are not able to move , and they sleep for the six months it takes for digestion . ”
['Once', ',', 'when', 'I', 'was', 'six', 'years', 'old', ',', 'I', 'saw', 'a', 'magnificent', 'picture', 'in', 'a', 'book', 'about', 'the', 'primeval', 'forest', 'called', '‘', 'Real', '-', 'life', 'Stories', '.', '’', '', 'It', 'showed', 'a', 'boa', 'constrictor', 'swallowing', 'a', 'wild', 'animal', '.', '', 'Here', 'is', 'a', 'copy', 'of', 'the', 'drawing', '.', 'It', 'said', 'in', 'the', 'book', ':', '“', 'Boa', 'constrictors', 'swallow', 'their', 'prey', 'whole', ',', 'without', 'chewing', '.', '', 'Then', 'they', 'are', 'not', 'able', 'to', 'move', ',', 'and', 'they', 'sleep', 'for', 't

In [222]:
# Display the sentence list of the first file.
iterator_list[0]

['Once , when I was six years old , I saw a magnificent picture in a book about the primeval forest called ‘ Real - life Stories . ’',
 ' It showed a boa constrictor swallowing a wild animal .',
 ' Here is a copy of the drawing .',
 'It said in the book : “ Boa constrictors swallow their prey whole , without chewing .',
 ' Then they are not able to move , and they sleep for the six months it takes for digestion . ”',
 ' So I thought a lot about the adventures of the jungle and , in turn , I managed , with a coloured pencil , to make my first drawing .',
 ' My Drawing Number one .',
 ' It looked like this : I showed my masterpiece to the grownups and I asked them if my drawing frightened them .',
 'They answered me : “ Why would anyone be frightened by a hat ? ”',
 ' My drawing was not of a hat .',
 ' It showed a boa constrictor digesting an elephant .',
 ' I then drew the inside of the boa constrictor , so that the grownups could understand .',
 ' They always need to have things explai

In [194]:
# Number of sentences in the slice [16:50] of the first file.
len(iterator_list[0][16:50])

34

In [244]:
# Print every prefix (first 1..N words) of a sample sentence.
for index in range(1, len('the dog runs . '.split()) +1):
    print(index)
    print('the dog runs . '.split()[:index])

1
['the']
2
['the', 'dog']
3
['the', 'dog', 'runs']
4
['the', 'dog', 'runs', '.']


In [235]:
# Tokenize the sample sentence with the GPT-2 tokenizer, passing add_prefix_space=True.
t_base.tokenize('the dog runs . ', add_prefix_space=True)

['Ġthe', 'Ġdog', 'Ġruns', 'Ġ.']

In [247]:
# Call batchity eight times with identical arguments and print the index pairs;
# presumably a check that the result is the same on every call.
for i in range(8):
    batch, indexes = batchity(iterator_list[0], 200, 'bert-base-cased', max_length=512)
    print(indexes)
    print()

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]

[(0, 499), (217, 504), (202, 489), (211, 508), (204, 487), (203, 502), (221, 268)]



In [159]:
# Print the text of every batch, separated by blank lines.
for i in batch:
    print(i)
    print()
    print()

I thus learned a second very important thing : that his home planet was barely bigger than a house ! It didn ' t surprise me much .  I knew that , apart from the large planets like the Earth , Jupiter , Mars , and Venus , which have been given names , there are hundreds of others that are sometimes so small that one has great difficulty in spotting them through the telescope .  When an astronomer discovers one of these , he gives it a number for a name .  He might call it for example “ asteroid three hundred and twenty - five . ”  I have serious reason to believe that the planet from where the little prince came is the asteroid B - six hundred and twelve .  This asteroid has only been seen through a telescope once , in one thousand , nine hundred and nine , by a Turkish astronomer . He had then given a big presentation on his discovery at an international astronomy conference . But nobody had believed him because of his outfit .  Grownups are like that . Fortunately for the reputation 

In [237]:
# Inspect the instance attributes of the BERT model (debugging aid).
vars(model_bert)

{'training': False,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('bert',
               BertModel(
                 (embeddings): BertEmbeddings(
                   (word_embeddings): Embedding(28996, 768, padding_idx=0)
                   (position_embeddings): Embedding(512, 768)
                   (token_type_embeddings): Embedding(2, 768)
                   (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                   (dropout): Dropout(p=0.1, inplace=False)
                 )
                 (encoder): BertEncoder(
                   (layer): ModuleList(
                     (0): BertLayer(
                       (attention): BertAttention(
                         (self): BertSelfAttention(
                           (query): Li

In [248]:
# For each batch, tokenize its text with the BERT tokenizer and print the
# tokens, their count, the first 200 tokens, the (end, start) indexes returned
# by batchity, and the tokens found at those boundaries.
# NOTE(review): `_` is used as the loop index here, which shadows IPython's
# "last output" variable.
for _, i in enumerate(batch):
    tokens = t_bert.tokenize(i)
    
    
    print(tokens)
    print()
    print(len(tokens))
    print(tokens[:200])
    print(indexes[_][1])
    print(indexes[_][0])
    print(tokens[indexes[_][0]])
    print(tokens[indexes[_][1]-1])
    print(len(tokens))
    print()
    print()

['Once', ',', 'when', 'I', 'was', 'six', 'years', 'old', ',', 'I', 'saw', 'a', 'magnificent', 'picture', 'in', 'a', 'book', 'about', 'the', 'prime', '##val', 'forest', 'called', '‘', 'Real', '-', 'life', 'Stories', '.', '’', 'It', 'showed', 'a', 'b', '##oa', 'con', '##st', '##ric', '##tor', 'swallowing', 'a', 'wild', 'animal', '.', 'Here', 'is', 'a', 'copy', 'of', 'the', 'drawing', '.', 'It', 'said', 'in', 'the', 'book', ':', '“', 'Bo', '##a', 'con', '##st', '##ric', '##tors', 'swallow', 'their', 'prey', 'whole', ',', 'without', 'chewing', '.', 'Then', 'they', 'are', 'not', 'able', 'to', 'move', ',', 'and', 'they', 'sleep', 'for', 'the', 'six', 'months', 'it', 'takes', 'for', 'dig', '##est', '##ion', '.', '”', 'So', 'I', 'thought', 'a', 'lot', 'about', 'the', 'adventures', 'of', 'the', 'jungle', 'and', ',', 'in', 'turn', ',', 'I', 'managed', ',', 'with', 'a', 'coloured', 'pencil', ',', 'to', 'make', 'my', 'first', 'drawing', '.', 'My', 'Drawing', 'Number', 'one', '.', 'It', 'looked', '

In [43]:
# Round-trip a sample string: word-piece tokenization -> token ids -> decoded text.
t_bert.decode(t_bert.convert_tokens_to_ids(t_bert.wordpiece_tokenizer.tokenize(' Straight ahead ... ”')))

'Straight ahead... ”'

In [40]:
# Tokenize the same sample string with the GPT-2 tokenizer for comparison.
t_base.tokenize(' Straight ahead ... ”')

['ĠStraight', 'Ġahead', 'Ġ...', 'ĠâĢ', 'Ŀ']

In [None]:
# Number of space-separated words in each (stripped) sentence of the first file.
result = []
for line in iterator_list[0]:
    result.append(len(line.strip().split(' ')))

In [None]:
# Sum the LSTM entropies sentence by sentence: `result` holds the number of
# words per sentence, and lstm_result is consumed in consecutive slices of
# that size (assumes lstm_result has one row per word — TODO confirm).
out_lstm = []
index = 0  # offset of the current sentence's first row in lstm_result
for i in result:
    out_lstm.append(np.sum(lstm_result[index:index+i]))
    index += i

### Evaluation

In [None]:
# Collects the per-sentence evaluation results appended by the loop in the next cell.
results = []

In [None]:
# Encoding options shared by the three tokenizers.
encode_kwargs = dict(add_special_tokens=False, max_length=128,
                     return_attention_mask=True, return_tensors='pt')

# Inference only: disabling autograd avoids building graphs for every sentence.
with torch.no_grad():
    for index, line in enumerate(iterator_list[0]):
        out_gpt2_base = model_base(**t_base.encode_plus(line, **encode_kwargs))
        out_gpt2_medium = model_medium(**t_medium.encode_plus(line, **encode_kwargs))
        out_bert = model_bert(**t_bert.encode_plus(line, **encode_kwargs))
        # list.append takes a single argument: store one tuple per sentence,
        # in the column order used to build the results table.
        results.append((eval_output(out_gpt2_base),
                        eval_output(out_gpt2_medium),
                        eval_output(out_bert),
                        out_lstm[index]))
    

In [None]:
# One row per sentence, one column per model.
result = pd.DataFrame(results, columns=['GPT2-base', 'GPT2-medium', 'BERT-base-cased', 'LSTM-E600-H300-L1'])

In [None]:
# Display the results table.
result