In [1]:
import torch

In [2]:
from pytorch_pretrained_bert import BertTokenizer, BertModel    #, BertForMaskedLM

In [3]:
import matplotlib.pyplot as plt

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [5]:
import pickle as pkl

with open("../resources/annotated_data.pickle","rb") as pkl_in:
    data = pkl.load(pkl_in)
    text = pkl.load(pkl_in)
    lu = pkl.load(pkl_in)
    pos_tag = pkl.load(pkl_in)
    frame_name = pkl.load(pkl_in)
    frame_element = pkl.load(pkl_in)
    frame_element_lu = pkl.load(pkl_in)
    lang = pkl.load(pkl_in)

In [6]:
sentences_en = text['en']
sentences_pt = text['pt']
sentences_de = text['de']

In [7]:
def sentwise_ft(lang, list_):
    ft_dict = {}
    for i in list_[lang] :
        if i[2] not in ft_dict:
            ft_dict[i[2]] = {i[1] : i[0]}
        else:
            ft_dict[i[2]][i[1]] = i[0]
            
    return ft_dict

In [8]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def lu_to_word_mapper(lu_list, sentence):
    words = word_tokenize(sentence)

    tokens = []
    for w in words:
        for piece in w.split('-'):
            tokens.append(piece)

    stem_dict = {ps.stem(w) : w for w in tokens}
    lu_to_word = {}
    #lu_to_word = {lu : [stem_dict[w] if lu in w or w in lu else stem_dict[w] if lu[:5] in w[:5] else None for w in stem_dict.keys()][0] for lu in lu_list}

    for lu in lu_list:
        choice = 0
        for w in stem_dict.keys():
            if len(lu) <= 2 or len(w) <= 2:
                if lu == w : 
                    lu_to_word[lu] = stem_dict[w]
            else:
                #print(lu, w)
                if lu == w :
                    lu_to_word[lu] = stem_dict[w]
                    choice = 1
                elif choice != 1:
                    if lu in w or w in lu :
                        lu_to_word[lu] = stem_dict[w]

                    elif w.lower() in lu or lu in w.lower():
                        lu_to_word[lu] = stem_dict[w]

                    elif lu[:2] in w.lower()[:2] :
                        lu_to_word[lu] = stem_dict[w]

                    elif lu not in lu_to_word:
                            lu_to_word[lu] = lu

    return lu_to_word

In [9]:
def fetch_token_embedding(encoded_layers, tokenized_text):
    
    # Convert the hidden state embeddings into single token vectors

    # Holds the list of 12 layer embeddings for each token
    # Will have the shape: [# tokens, # layers, # features]
    token_embeddings = [] 
    token_dict = {}
    batch_i = 0

    # For each token in the sentence...
    for token_i in range(len(tokenized_text)):
        
        # Holds 12 layers of hidden states for each token 
        hidden_layers = [] 
        concatenated_last_4_layers = []

        # For each of the 12 layers...
        for layer_i in range(len(encoded_layers)):

            # Lookup the vector for `token_i` in `layer_i`
            vec = encoded_layers[layer_i][batch_i][token_i]

            hidden_layers.append(vec)
            
        token_embeddings.append(hidden_layers)
        concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072]
        #key = token + tokenid
        token_dict[tokenized_text[token_i] + '_' + str(token_i)] = concatenated_last_4_layers[len(concatenated_last_4_layers) - 1]
        
    # Sanity check of the dimensions:
    '''print ('Shape is: %d x %d' % (len(concatenated_last_4_layers), len(concatenated_last_4_layers[0])))
    print ("Number of tokens in sequence:", len(token_embeddings))
    print ("Number of layers per token:", len(token_embeddings[0]))'''
    return token_dict

In [10]:
import numpy as np
def make_embed(v, w_dict, lu_to_word):
    
    embed = []
    problem = []
    
    for k1, v1 in w_dict.items():
        try:
            if k1.split('_')[0].replace('##','').lower() == v :
                v1 = v1.numpy()
                embed.append(v1)

            elif k1.split('_')[0].replace('##','').lower() in lu_to_word[v] and len(k1.split('_')[0].replace('##','')) > 2 :
                    v1 = v1.numpy()
                    embed.append(v1)
        except:
                continue

    if len(embed) > 1:
        embed = np.average(embed, axis = 0)

    elif len(embed) == 1:
        embed = list(embed)[0]

    if len(embed) == 0:
        problem.append((v, k1, lu_to_word))
        embed = list(np.zeros(3072))
    
    #print(len(embed))
        
    return embed, problem

In [11]:
import numpy as np
def preprocessing(sent_list, lu_dict, fe_lu_dict, extra_sent):
    
    token_dict = {}
    token_dict_fe = {}
    sent_dict = {}
    if extra_sent == 0:
        marked_text = ['[CLS] ' + sent[0] + ' [SEP]' for sent in sent_list]
        #print(marked_text)
        sent_id = {sent[0] : sent[1] for sent in sent_list}
    elif extra_sent == 1:
        marked_text = ['[CLS] ' + sent + ' [SEP]' for sent in sent_list]
    problem = []
    problem_fe = []
    
    for text in marked_text:
        tokenized_text = tokenizer.tokenize(text)
        #print(tokenized_text)
    
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        
        # Convert inputs to PyTorch tensors
        tokens_tensors = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers, _ = model(tokens_tensors, segments_tensors)
            
        #sentence vector
        temp = text.replace('[CLS] ', '')
        temp = temp.replace(' [SEP]', '')
        sent_dict[temp] = (torch.mean(encoded_layers[11], 1)).numpy()
        
        if extra_sent == 0 :
            #word vectors
            w_dict = fetch_token_embedding(encoded_layers, tokenized_text)

            lu_list = [v for k, v in lu_dict[sent_id[temp]].items()]
            lu_to_word = lu_to_word_mapper(lu_list, temp)
            #now key = token + token id + sent id 
            for k, v in lu_dict[sent_id[temp]].items():
                token_dict[v + '_' + k + '_' + sent_id[temp]], prob = make_embed(v, w_dict, lu_to_word)

            problem.extend(prob)

            try:
                fe_lu_list = [v for k, v in fe_lu_dict[sent_id[temp]].items()]
                fe_lu_to_word = lu_to_word_mapper(fe_lu_list, temp)
                for k, v in fe_lu_dict[sent_id[temp]].items():
                    token_dict_fe[v + '_' + k + '_' + sent_id[temp]], prob_fe = make_embed(v, w_dict, lu_to_word)
                    problem_fe.extend(prob_fe)
            except:
                pass


    if extra_sent == 0:
        print(len(problem))
        print(len(problem_fe))

    if extra_sent == 1:
        return sent_dict
    elif extra_sent == 0:
        return token_dict, token_dict_fe, sent_dict
        

In [12]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

#pre-processing
#english
lu_en = sentwise_ft('en', lu)
fe_lu_en = sentwise_ft('en', frame_element_lu)
token_dict_en, token_dict_en_fe, sent_dict_en = preprocessing(sentences_en, lu_en, fe_lu_en, 0)

#portuguese
lu_pt = sentwise_ft('pt', lu)
fe_lu_pt = sentwise_ft('pt', frame_element_lu)
token_dict_pt, token_dict_pt_fe, sent_dict_pt = preprocessing(sentences_pt, lu_pt, fe_lu_pt, 0)

#deutsche
lu_de = sentwise_ft('de', lu)
fe_lu_de = sentwise_ft('de', frame_element_lu)
token_dict_de, token_dict_de_fe, sent_dict_de = preprocessing(sentences_de, lu_de, fe_lu_de, 0)

8
424
18
1441
6
93


In [13]:
len(token_dict_de['gut_29470_1275'])

3072

In [14]:
#for non-frame-annotated sentences 
print(len(sent_dict_en), len(sent_dict_pt), len(sent_dict_de))
extra_sents_en = []
extra_sents_pt = []
extra_sents_de = []

import pandas as pd
path_name = "../resources/en-pt-de.csv"
df = pd.read_csv(path_name, skiprows = 1, names = ["En_Id","En_Sentence","Pt_Id","Pt_Sentence","De_Id","De_Sentence"], encoding = 'utf-8')

for index, rows in df.iterrows():
    if rows['En_Sentence'] not in sent_dict_en:
        extra_sents_en.append(rows['En_Sentence'])
        
    elif rows['Pt_Sentence'] not in sent_dict_pt:
        extra_sents_pt.append(rows['Pt_Sentence'])
    
    elif rows['De_Sentence'] not in sent_dict_de:
        extra_sents_de.append(rows['De_Sentence'])

157 265 42


In [15]:
#for non-frame annotated sentences
extra_sent_dict_en = preprocessing(extra_sents_en, {}, {}, 1)
extra_sent_dict_pt = preprocessing(extra_sents_pt, {}, {}, 1)
extra_sent_dict_de = preprocessing(extra_sents_de, {}, {}, 1)

for k, v in extra_sent_dict_en.items():
    sent_dict_en[k] = v

for k, v in extra_sent_dict_pt.items():
    sent_dict_pt[k] = v

for k, v in extra_sent_dict_de.items():
    sent_dict_de[k] = v
    
print(len(sent_dict_en), len(sent_dict_pt), len(sent_dict_de))

161 272 62


In [16]:
with open("../resources/bert_embeddings.pickle", "wb") as pkl_out:
    pkl.dump(token_dict_en, pkl_out)
    pkl.dump(sent_dict_en, pkl_out)
    pkl.dump(token_dict_pt, pkl_out)
    pkl.dump(sent_dict_pt, pkl_out)
    pkl.dump(token_dict_de, pkl_out)
    pkl.dump(sent_dict_de, pkl_out)
    pkl.dump(token_dict_en_fe, pkl_out)
    pkl.dump(token_dict_pt_fe, pkl_out)
    pkl.dump(token_dict_de_fe, pkl_out)