In [1]:
import torch

In [2]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [3]:
import matplotlib.pyplot as plt

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [5]:
import pickle as pkl

with open("../resources/annotated_data.pickle","rb") as pkl_in:
    data = pkl.load(pkl_in)
    text = pkl.load(pkl_in)
    lu = pkl.load(pkl_in)
    pos_tag = pkl.load(pkl_in)
    frame_name = pkl.load(pkl_in)
    frame_element = pkl.load(pkl_in)
    frame_element_lu = pkl.load(pkl_in)
    lang = pkl.load(pkl_in)

In [6]:
sentences_en = text['en']
sentences_pt = text['pt']
sentences_de = text['de']

In [7]:
def fetch_token_embedding(encoded_layers, tokenized_text):
    
    # Convert the hidden state embeddings into single token vectors

    # Holds the list of 12 layer embeddings for each token
    # Will have the shape: [# tokens, # layers, # features]
    token_embeddings = [] 
    token_dict = {}
    batch_i = 0

    # For each token in the sentence...
    for token_i in range(len(tokenized_text)):
        
        # Holds 12 layers of hidden states for each token 
        hidden_layers = [] 

        # For each of the 12 layers...
        for layer_i in range(len(encoded_layers)):

            # Lookup the vector for `token_i` in `layer_i`
            vec = encoded_layers[layer_i][batch_i][token_i]

            hidden_layers.append(vec)
            
        token_embeddings.append(hidden_layers)
        concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072]
        #key = token + tokenid
        #print(token_i, tokenized_text[token_i])
        token_dict[tokenized_text[token_i] + '_' + str(token_i)] = concatenated_last_4_layers
        
    # Sanity check of the dimensions:
    print ('Shape is: %d x %d' % (len(concatenated_last_4_layers), len(concatenated_last_4_layers[0])))
    print ("Number of tokens in sequence:", len(token_embeddings))
    print ("Number of layers per token:", len(token_embeddings[0]))
    return token_dict

In [8]:
def preprocessing(sent_list):
    
    token_dict = {}
    sent_dict = {}
    marked_text = ['[CLS] ' + sent + ' [SEP]' for sent in sent_list]
    
    for i, text in enumerate(marked_text):
        tokenized_text = tokenizer.tokenize(text)
        print (tokenized_text)
    
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        
        # Convert inputs to PyTorch tensors
        tokens_tensors = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensors, segments_tensors)
        
        #word vectors
        w_dict = fetch_token_embedding(encoded_layers, tokenized_text)
        
        for k, v in w_dict.items():
            token_dict [k + '_' + str(i)] = v
                
        #sentence vector
        temp = text.replace('[CLS] ', '')
        temp = temp.replace(' [SEP]', '')
        sent_dict[temp] = torch.mean(encoded_layers[11], 1)
        
    return token_dict, sent_dict
        

In [9]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

#pre-processing
#english
token_dict_en, sent_dict_en = preprocessing(sentences_en)

#portuguese
token_dict_pt, sent_dict_pt = preprocessing(sentences_pt)

#deutsche
token_dict_de, sent_dict_de = preprocessing(sentences_de)

['[CLS]', 'It', "'", 's', 'been', 'great', ',', 'has', '##n', "'", 't', 'it', '?', '[SEP]']
Shape is: 14 x 3072
Number of tokens in sequence: 14
Number of layers per token: 12
['[CLS]', 'I', "'", 've', 'been', 'blow', '##n', 'away', 'by', 'the', 'whole', 'thing', '.', '[SEP]']
Shape is: 14 x 3072
Number of tokens in sequence: 14
Number of layers per token: 12
['[CLS]', 'In', 'fact', ',', 'I', "'", 'm', 'leaving', '.', '[SEP]']
Shape is: 10 x 3072
Number of tokens in sequence: 10
Number of layers per token: 12
['[CLS]', 'There', 'have', 'been', 'three', 'themes', 'running', 'through', 'the', 'conference', 'which', 'are', 'relevant', 'to', 'what', 'I', 'want', 'to', 'talk', 'about', '.', '[SEP]']
Shape is: 22 x 3072
Number of tokens in sequence: 22
Number of layers per token: 12
['[CLS]', 'One', 'is', 'the', 'extraordinary', 'evidence', 'of', 'human', 'creat', '##ivity', 'in', 'all', 'of', 'the', 'presentation', '##s', 'that', 'we', "'", 've', 'had', 'and', 'in', 'all', 'of', 'the', 'peo

Shape is: 8 x 3072
Number of tokens in sequence: 8
Number of layers per token: 12
['[CLS]', 'Well', ',', 'I', 'was', 'born', '.', '.', '.', 'no', '.', '[SEP]']
Shape is: 12 x 3072
Number of tokens in sequence: 12
Number of layers per token: 12
['[CLS]', 'I', 'heard', 'a', 'great', 'story', 'recently', '-', '-', 'I', 'love', 'telling', 'it', '-', '-', 'of', 'a', 'little', 'girl', 'who', 'was', 'in', 'a', 'drawing', 'less', '##on', '.', '[SEP]']
Shape is: 28 x 3072
Number of tokens in sequence: 28
Number of layers per token: 12
['[CLS]', 'She', 'was', 'six', ',', 'and', 'she', 'was', 'at', 'the', 'back', ',', 'drawing', ',', 'and', 'the', 'teacher', 'said', 'this', 'girl', 'hard', '##ly', 'ever', 'paid', 'attention', ',', 'and', 'in', 'this', 'drawing', 'less', '##on', ',', 'she', 'did', '.', '[SEP]']
Shape is: 37 x 3072
Number of tokens in sequence: 37
Number of layers per token: 12
['[CLS]', 'The', 'teacher', 'was', 'fa', '##sci', '##nated', '.', '[SEP]']
Shape is: 9 x 3072
Number of t

Shape is: 10 x 3072
Number of tokens in sequence: 10
Number of layers per token: 12
['[CLS]', 'You', 'don', "'", 't', 'think', 'of', 'Shakespeare', 'having', 'a', 'father', ',', 'do', 'you', '?', '[SEP]']
Shape is: 16 x 3072
Number of tokens in sequence: 16
Number of layers per token: 12
['[CLS]', 'Being', 'sent', 'to', 'bed', 'by', 'his', 'dad', ',', 'you', 'know', ',', 'to', 'Shakespeare', ',', "'", 'Go', 'to', 'bed', ',', 'now', '!', '[SEP]']
Shape is: 23 x 3072
Number of tokens in sequence: 23
Number of layers per token: 12
['[CLS]', 'And', 'put', 'the', 'pen', '##cil', 'down', '.', "'", '[SEP]']
Shape is: 10 x 3072
Number of tokens in sequence: 10
Number of layers per token: 12
['[CLS]', 'Any', '##way', ',', 'we', 'moved', 'from', 'Stratford', 'to', 'Los', 'Angeles', ',', 'and', 'I', 'just', 'want', 'to', 'say', 'a', 'word', 'about', 'the', 'transition', '.', '[SEP]']
Shape is: 25 x 3072
Number of tokens in sequence: 25
Number of layers per token: 12
['[CLS]', 'My', 'son', 'didn',

['[CLS]', 'Have', 'you', 'heard', 'of', 'her', '?', '[SEP]']
Shape is: 8 x 3072
Number of tokens in sequence: 8
Number of layers per token: 12
['[CLS]', 'She', "'", 's', 'a', 'cho', '##reo', '##grapher', ',', 'and', 'every', '##body', 'knows', 'her', 'work', '.', '[SEP]']
Shape is: 17 x 3072
Number of tokens in sequence: 17
Number of layers per token: 12
['[CLS]', 'She', 'did', "'", 'Cats', "'", 'and', "'", 'Phantom', 'of', 'the', 'Opera', '.', "'", '[SEP]']
Shape is: 15 x 3072
Number of tokens in sequence: 15
Number of layers per token: 12
['[CLS]', 'She', "'", 's', 'won', '##der', '##ful', '.', '[SEP]']
Shape is: 9 x 3072
Number of tokens in sequence: 9
Number of layers per token: 12
['[CLS]', 'I', 'used', 'to', 'be', 'on', 'the', 'board', 'of', 'The', 'Royal', 'Ballet', ',', 'as', 'you', 'can', 'see', '.', '[SEP]']
Shape is: 19 x 3072
Number of tokens in sequence: 19
Number of layers per token: 12
['[CLS]', 'Any', '##way', ',', 'Gill', '##ian', 'and', 'I', 'had', 'lu', '##nch', 'one

Shape is: 23 x 3072
Number of tokens in sequence: 23
Number of layers per token: 12
['[CLS]', 'I', 'believe', 'our', 'only', 'hope', 'for', 'the', 'future', 'is', 'to', 'adopt', 'a', 'new', 'conception', 'of', 'human', 'e', '##cology', ',', 'one', 'in', 'which', 'we', 'start', 'to', 're', '##cons', '##titut', '##e', 'our', 'conception', 'of', 'the', 'rich', '##ness', 'of', 'human', 'capacity', '.', '[SEP]']
Shape is: 41 x 3072
Number of tokens in sequence: 41
Number of layers per token: 12
['[CLS]', 'There', 'was', 'a', 'won', '##der', '##ful', 'quo', '##te', 'by', 'Jonas', 'Sal', '##k', ',', 'who', 'said', ',', "'", 'If', 'all', 'the', 'insects', 'were', 'to', 'disa', '##ppe', '##ar', 'from', 'the', 'Earth', ',', 'within', '50', 'years', 'all', 'life', 'on', 'Earth', 'would', 'end', '.', '[SEP]']
Shape is: 42 x 3072
Number of tokens in sequence: 42
Number of layers per token: 12
['[CLS]', 'We', 'have', 'to', 'be', 'care', '##ful', 'now', 'that', 'we', 'use', 'this', 'gift', 'wise', '#

Shape is: 16 x 3072
Number of tokens in sequence: 16
Number of layers per token: 12
['[CLS]', 'A', 'im', '##pre', '##visi', '##bilidade', ',', 'pra', 'mi', '##m', ',', 'é', 'extra', '##ord', '##in', '##ária', '.', '[SEP]']
Shape is: 18 x 3072
Number of tokens in sequence: 18
Number of layers per token: 12
['[CLS]', 'A', 'terceira', 'coi', '##sa', 'é', 'que', 'nós', 'todos', 'con', '##cor', '##dam', '##os', ',', 'apesar', 'de', 'tudo', ',', 'com', 'a', 'capacidade', 'extra', '##ord', '##in', '##ária', 'que', 'as', 'crianças', 'têm', '.', '[SEP]']
Shape is: 31 x 3072
Number of tokens in sequence: 31
Number of layers per token: 12
['[CLS]', 'Sua', 'capacidade', 'de', 'in', '##ova', '##ção', '.', '[SEP]']
Shape is: 9 x 3072
Number of tokens in sequence: 9
Number of layers per token: 12
['[CLS]', 'Sir', '##ena', 'ont', '##em', 'a', 'noite', 'foi', 'uma', 'mara', '##vil', '##ha', ',', 'não', 'foi', '?', '[SEP]']
Shape is: 17 x 3072
Number of tokens in sequence: 17
Number of layers per token:

Shape is: 42 x 3072
Number of tokens in sequence: 42
Number of layers per token: 12
['[CLS]', 'E', 'ele', 'disse', ':', "'", 'Cl', '##aro', '!', 'Por', 'quê', '?', 'Esta', '##va', 'er', '##rado', '?', "'", '[SEP]']
Shape is: 19 x 3072
Number of tokens in sequence: 19
Number of layers per token: 12
['[CLS]', 'Eles', 'troca', '##ram', 'a', 'ordem', ',', 'só', 'isso', '.', '[SEP]']
Shape is: 11 x 3072
Number of tokens in sequence: 11
Number of layers per token: 12
['[CLS]', 'En', '##fim', ',', 'os', 'três', 'gar', '##oto', '##s', 'entrar', '##am', ',', 'crianças', 'de', 'quatro', 'anos', 'com', 'toa', '##lhas', 'na', 'cabeça', ',', 'e', 'colocar', '##am', 'as', 'caixa', '##s', 'no', 'ch', '##ão', '.', '[SEP]']
Shape is: 33 x 3072
Number of tokens in sequence: 33
Number of layers per token: 12
['[CLS]', 'O', 'primeiro', 'gar', '##oto', 'disse', ':', "'", 'Eu', 'tra', '##go', 'ouro', '.', "'", '[SEP]']
Shape is: 15 x 3072
Number of tokens in sequence: 15
Number of layers per token: 12
['[CL

Shape is: 28 x 3072
Number of tokens in sequence: 28
Number of layers per token: 12
['[CLS]', 'Me', '##u', 'filho', 'não', 'queria', 'vir', '.', '[SEP]']
Shape is: 9 x 3072
Number of tokens in sequence: 9
Number of layers per token: 12
['[CLS]', 'Eu', 'ten', '##ho', 'dois', 'filhos', '.', '[SEP]']
Shape is: 8 x 3072
Number of tokens in sequence: 8
Number of layers per token: 12
['[CLS]', 'Ele', 'agora', 'tem', '21', 'e', 'minh', '##a', 'filha', '16', '.', '[SEP]']
Shape is: 12 x 3072
Number of tokens in sequence: 12
Number of layers per token: 12
['[CLS]', 'Ele', 'não', 'queria', 'vir', 'para', 'Los', 'Angeles', '.', '[SEP]']
Shape is: 10 x 3072
Number of tokens in sequence: 10
Number of layers per token: 12
['[CLS]', 'Ele', 'ado', '##rava', ',', 'mas', 'tinha', 'uma', 'nam', '##ora', '##da', 'na', 'Inglaterra', '.', '[SEP]']
Shape is: 15 x 3072
Number of tokens in sequence: 15
Number of layers per token: 12
['[CLS]', 'Era', 'o', 'amor', 'de', 'sua', 'vida', ',', 'Sarah', '.', '[SEP]']

Shape is: 16 x 3072
Number of tokens in sequence: 16
Number of layers per token: 12
['[CLS]', 'Eles', 'vive', '##m', 'lá', 'em', 'cima', 'e', 'leve', '##mente', 'para', 'um', 'lado', '.', '[SEP]']
Shape is: 15 x 3072
Number of tokens in sequence: 15
Number of layers per token: 12
['[CLS]', 'Eles', 'sa', '##íram', 'do', 'corpo', ',', 'quase', 'literalmente', '.', '[SEP]']
Shape is: 11 x 3072
Number of tokens in sequence: 11
Number of layers per token: 12
['[CLS]', 'Eles', 'v', '##ê', '##em', 'o', 'próprio', 'corpo', 'como', 'uma', 'forma', 'de', 'transporte', 'para', 'a', 'cabeça', '.', '[SEP]']
Shape is: 18 x 3072
Number of tokens in sequence: 18
Number of layers per token: 12
['[CLS]', 'Não', 'é', 'assim', '?', '[SEP]']
Shape is: 6 x 3072
Number of tokens in sequence: 6
Number of layers per token: 12
['[CLS]', 'É', 'um', 'je', '##ito', 'de', 'levar', '##em', 'suas', 'cabeça', '##s', 'às', 'con', '##ferência', '##s', '.', '[SEP]']
Shape is: 17 x 3072
Number of tokens in sequence: 17
Nu

Shape is: 52 x 3072
Number of tokens in sequence: 52
Number of layers per token: 12
['[CLS]', 'É', 'um', 'processo', 'de', 'in', '##f', '##la', '##ção', 'ac', '##ad', '##êm', '##ica', '.', '[SEP]']
Shape is: 15 x 3072
Number of tokens in sequence: 15
Number of layers per token: 12
['[CLS]', 'E', 'é', 'um', 'indica', '##tivo', 'de', 'que', 'toda', 'a', 'estrutura', 'edu', '##ca', '##cional', 'está', 'muda', '##ndo', 'na', 'frente', 'do', 'nos', '##so', 'nar', '##iz', '.', '[SEP]']
Shape is: 26 x 3072
Number of tokens in sequence: 26
Number of layers per token: 12
['[CLS]', 'Pre', '##cisa', '##mos', 'rep', '##ensa', '##r', 'radical', '##mente', 'nos', '##sa', 'visão', 'de', 'intel', '##ig', '##ência', '.', '[SEP]']
Shape is: 18 x 3072
Number of tokens in sequence: 18
Number of layers per token: 12
['[CLS]', 'Sa', '##bem', '##os', 'três', 'coisas', 'sobre', 'intel', '##ig', '##ência', '.', '[SEP]']
Shape is: 12 x 3072
Number of tokens in sequence: 12
Number of layers per token: 12
['[CLS]

Shape is: 5 x 3072
Number of tokens in sequence: 5
Number of layers per token: 12
['[CLS]', 'Ela', 'é', 'uma', 'core', '##óg', '##raf', '##a', 'e', 'todo', 'mundo', 'con', '##he', '##ce', 'seu', 'trabalho', '.', '[SEP]']
Shape is: 18 x 3072
Number of tokens in sequence: 18
Number of layers per token: 12
['[CLS]', 'Ela', 'trabalhou', 'em', "'", 'Cats', "'", ',', 'e', "'", 'O', 'Fan', '##tas', '##ma', 'da', 'Ó', '##pera', "'", '.', '[SEP]']
Shape is: 20 x 3072
Number of tokens in sequence: 20
Number of layers per token: 12
['[CLS]', 'Ela', 'é', 'mara', '##vil', '##hosa', '.', '[SEP]']
Shape is: 8 x 3072
Number of tokens in sequence: 8
Number of layers per token: 12
['[CLS]', 'Eu', 'estava', 'no', 'con', '##sel', '##ho', 'do', 'Royal', 'Ballet', ',', 'na', 'Inglaterra', ',', 'como', 'podem', 'ver', '.', '[SEP]']
Shape is: 19 x 3072
Number of tokens in sequence: 19
Number of layers per token: 12
['[CLS]', 'Gill', '##ian', 'e', 'eu', 'al', '##mo', '##ça', '##mos', 'um', 'dia', 'e', 'eu', 'p

Shape is: 30 x 3072
Number of tokens in sequence: 30
Number of layers per token: 12
['[CLS]', 'Outra', 'pessoa', 'poderia', 'ter', 're', '##cei', '##tado', 'um', 're', '##mé', '##dio', 'e', 'dito', 'para', 'ela', 'se', 'ac', '##al', '##mar', '.', '[SEP]']
Shape is: 22 x 3072
Number of tokens in sequence: 22
Number of layers per token: 12
['[CLS]', 'Hoje', ',', 'eu', 'ach', '##o', '.', '.', '.', '[SEP]']
Shape is: 10 x 3072
Number of tokens in sequence: 10
Number of layers per token: 12
['[CLS]', 'Eu', 'ach', '##o', 'que', 'se', 'res', '##ume', 'a', 'isso', ':', 'Al', 'Gore', 'fa', '##lou', 'outra', 'noite', 'sobre', 'e', '##cologia', ',', 'e', 'a', 'rev', '##olu', '##ção', 'des', '##en', '##cade', '##ada', 'por', 'Rachel', 'Carson', '.', '[SEP]']
Shape is: 35 x 3072
Number of tokens in sequence: 35
Number of layers per token: 12
['[CLS]', 'Eu', 'acre', '##dito', 'que', 'nos', '##sa', 'única', 'espera', '##nça', 'para', 'o', 'futuro', 'é', 'a', 'ado', '##ção', 'de', 'uma', 'nova', 'con'

Shape is: 26 x 3072
Number of tokens in sequence: 26
Number of layers per token: 12
['[CLS]', 'Ich', 'habe', 'ein', 'großes', 'Interesse', 'an', 'Bildung', ',', 'und', 'ich', 'den', '##ke', ',', 'das', 'haben', 'wir', 'alle', '.', '[SEP]']
Shape is: 20 x 3072
Number of tokens in sequence: 20
Number of layers per token: 12
['[CLS]', 'Wir', 'haben', 'ein', 'großes', ',', 'pers', '##ön', '##liches', 'Interesse', ',', 'teilweise', 'Bildung', 'dazu', 'ge', '##dacht', 'ist', ',', 'uns', 'in', 'diese', 'Zukunft', 'zu', 'bringen', ',', 'die', 'wir', 'nicht', 'fa', '##ssen', 'können', '.', '[SEP]']
Shape is: 33 x 3072
Number of tokens in sequence: 33
Number of layers per token: 12
['[CLS]', 'Den', '##ken', 'Sie', 'nur', ':', 'Kinder', ',', 'die', 'dieses', 'Jahr', 'in', 'die', 'Schule', 'kommen', ',', 'werden', 'im', 'Jahr', '206', '##5', 'in', 'Ren', '##te', 'gehen', '.', '[SEP]']
Shape is: 27 x 3072
Number of tokens in sequence: 27
Number of layers per token: 12
['[CLS]', 'Kei', '##ner', 'hat

In [10]:
with open("../resources/bert_embeddings.pickle", "wb") as pkl_out:
    pkl.dump(token_dict_en, pkl_out)
    pkl.dump(sent_dict_en, pkl_out)
    pkl.dump(token_dict_pt, pkl_out)
    pkl.dump(sent_dict_pt, pkl_out)
    pkl.dump(token_dict_de, pkl_out)
    pkl.dump(sent_dict_de, pkl_out)