In [34]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
import pandas as pd
import math
import argparse
from tqdm import tqdm
import numpy as np

In [35]:
# Load the cased multilingual BERT checkpoint once: the WordPiece tokenizer and
# the masked-language-model head that `score` uses.
MODEL_NAME = 'bert-base-multilingual-cased'

# do_lower_case=False keeps the original casing of the input text.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
bertMaskedLM = BertForMaskedLM.from_pretrained(MODEL_NAME)
# NOTE(review): no explicit bertMaskedLM.eval() call is made here; from_pretrained
# is documented to return the model in evaluation mode - confirm for the installed
# transformers version.


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:

# Declined forms of the Russian reflexive possessive "свой". Most entries carry a
# combining acute stress mark (U+0301), which ordinary running text does not use.
_RU_REFLEXIVE_FORMS = (
    "свой,своя́,своё,свои́,своего́,свое́й,своего́,свои́х,своему́,свое́й,своему́,"
    "свои́м,своего́,свою,своего́,свои́х,свои́м,свое́й,свои́м,свои́ми,своём,свое́й,"
    "своём,свои́х,свои,своей,своем,своего,своего,свои"
)
_COMBINING_ACUTE = "\u0301"

# Reflexive possessive pronouns looked up token-by-token, per language.
# Chinese is handled separately because its pronoun spans two tokens.
_REFLEXIVE_PRONOUNS = {
    "da": frozenset(("sin", "sit", "sine")),
    "sv": frozenset(("sin", "sitt", "sina")),
    # Keep every listed form and add its stress-free spelling so that plain
    # (unaccented) text can match as well.
    "ru": frozenset(
        variant
        for form in _RU_REFLEXIVE_FORMS.lower().split(",")
        for variant in (form, form.replace(_COMBINING_ACUTE, ""))
    ),
}

# (male, female) possessive pronoun substituted for the reflexive one.
_GENDERED_PRONOUNS = {
    "da": ("hans", "hendes"),
    "sv": ("hans", "hennes"),
    "ru": ("его", "ее"),
    "zh": ("他", "她"),
}


def _pronoun_positions(tokens, lang):
    """Return the token indices at which a reflexive pronoun starts.

    For "zh" every adjacent 自 + 己 token pair is reported; this assumes the
    tokenizer emits CJK characters as single-character tokens. For the other
    languages only the first matching token is reported, so a single pronoun
    is substituted per sentence.
    """
    if lang == "zh":
        return [
            i for i in range(len(tokens) - 1)
            if tokens[i] == "自" and tokens[i + 1] == "己"
        ]
    forms = _REFLEXIVE_PRONOUNS[lang]
    for i, token in enumerate(tokens):
        if token in forms:
            return [i]
    return []


def _substitute(tokens, positions, replacement, filler=None):
    """Return a copy of `tokens` with `replacement` written at each position.

    When `filler` is given it is written to the following slot, so that a
    two-token pronoun is replaced without changing the sequence length.
    """
    swapped = list(tokens)
    for i in positions:
        swapped[i] = replacement
        if filler is not None:
            swapped[i + 1] = filler
    return swapped


def _sequence_loss(input_ids, target_ids, segment_ids):
    """Mean cross-entropy of the model's predictions for `input_ids` against `target_ids`."""
    with torch.no_grad():
        # token_type_ids must be passed by keyword: the second positional
        # parameter of the model's forward() is attention_mask, and an
        # all-zero attention mask would mark every token as padding.
        logits = bertMaskedLM(input_ids, token_type_ids=segment_ids)[0]
    return F.cross_entropy(logits[0], target_ids[0])


def score(sentence, lang):
    """Compare masked-LM losses for male, female and reflexive pronoun variants.

    The sentence is wrapped in "[CLS] ... [SEP]" and tokenized with the
    module-level `tokenizer`. Its reflexive possessive pronoun is replaced by
    the male and by the female possessive pronoun of the language, and each
    variant (plus the unmodified sentence) is fed to the module-level
    `bertMaskedLM`. For every variant the cross-entropy between the model's
    per-position predictions and the ORIGINAL token ids is computed, averaged
    over all positions (special tokens included).

    Args:
        sentence: Raw sentence text, without special tokens.
        lang: Language code; one of "da", "ru", "sv", "zh".

    Returns:
        A string of the form
        "male: <loss> <exp(loss)> female: <loss> <exp(loss)> refl: <loss> <exp(loss)>",
        or "no pronouns to replace" when no reflexive pronoun token is found.

    Raises:
        ValueError: If `lang` is not a supported language code.
    """
    if lang not in _GENDERED_PRONOUNS:
        raise ValueError(
            "unsupported language %r; expected one of %s"
            % (lang, sorted(_GENDERED_PRONOUNS))
        )

    sentence = "[CLS] "+sentence+" [SEP]"

    print("Tokenizing....")
    tokens = tokenizer.tokenize(sentence)

    positions = _pronoun_positions(tokens, lang)
    if not positions:
        return "no pronouns to replace"

    print("masking reflexive pronoun.....")
    male_word, female_word = _GENDERED_PRONOUNS[lang]
    # The Chinese reflexive spans two tokens while the gendered pronouns span
    # one, so the second slot is padded with the tokenizer's unknown token to
    # keep all three sequences aligned position by position.
    filler = tokenizer.unk_token if lang == "zh" else None
    male_tokens = _substitute(tokens, positions, male_word, filler)
    female_tokens = _substitute(tokens, positions, female_word, filler)

    # Single-segment input: every token belongs to segment 0.
    segments_tensors = torch.tensor([[0] * len(tokens)])
    tensor_truth = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    tensor_input_male = torch.tensor([tokenizer.convert_tokens_to_ids(male_tokens)])
    tensor_input_female = torch.tensor([tokenizer.convert_tokens_to_ids(female_tokens)])

    print("predicting...")
    loss_male = _sequence_loss(tensor_input_male, tensor_truth, segments_tensors)
    loss_female = _sequence_loss(tensor_input_female, tensor_truth, segments_tensors)
    loss_ref = _sequence_loss(tensor_truth, tensor_truth, segments_tensors)

    def _describe(loss):
        # "<loss> <exp(loss)>"; the exponentiated value keeps torch's tensor repr.
        return str(loss.item()) + " " + str(torch.exp(loss))

    return ("male: " + _describe(loss_male)
            + " female: " + _describe(loss_female)
            + " refl: " + _describe(loss_ref))


In [39]:
# Score the first sentence of every example group in the corpus and write one
# "<sentence> <scores>" line per sentence.
# NOTE(review): both paths are absolute and machine-specific; consider deriving
# them from a configurable data directory.
filename = "/work/cool-programmer-astrid/ABC-dataset/data/COREF_LM/coref_lm.da"
lang = "da"

# The corpus contains non-ASCII text, so the encoding is fixed explicitly
# instead of depending on the platform default.
with open(filename, "r", encoding="utf-8") as f:
    lines = f.readlines()

reflexive_sents = []
take_next_line = True  # the first content line of the file is collected too
for line in lines:
    if "--------------" in line:
        # Long dashed rules are skipped and do not start a new group.
        continue
    if "---" in line:
        # A short "---" separator starts a new group.
        take_next_line = True
    elif take_next_line:
        # Only the first content line after a separator is kept.
        reflexive_sents.append(line.strip())
        take_next_line = False

out_path = "/work/cool-programmer-astrid/ABC-dataset/outputs/lm/out_"+lang+"_replicate_new.txt"
with open(out_path, "w", encoding="utf-8") as f:
    # Wrapping the list itself (rather than enumerate(...)) lets tqdm know the
    # total and show real progress.
    for sent in tqdm(reflexive_sents):
        scores = score(sent, lang)
        f.write(sent +" "+ scores +"\n")

1it [00:00,  5.77it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


3it [00:00,  5.36it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


5it [00:00,  5.33it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


7it [00:01,  5.60it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


9it [00:01,  5.62it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


11it [00:01,  5.82it/s]

Tokenizing....
masking reflexive pronoun.....
predicting...
Tokenizing....
masking reflexive pronoun.....
predicting...


11it [00:02,  5.15it/s]


KeyboardInterrupt: 

In [11]:
# Sanity check: exponentiate the same value with torch and with math.
# Only the last expression (math.exp) is displayed by the cell.
loss_value = 6.381105899810791
torch.exp(torch.tensor(loss_value))  # 590.5804

math.exp(loss_value)

590.5804694768062

In [13]:
# Second check on a recorded pair of values:
# 2.408111333847046 -> 11.112952654759475
second_loss_value = 2.408111333847046
math.exp(second_loss_value)

TypeError: exp(): argument 'input' (position 1) must be Tensor, not float