In [1]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
import torch
from tqdm import tqdm

In [2]:
# Load the masked-LM weights and the tokenizer from the same local directory.
checkpoint_dir = './output/'
model = RobertaForMaskedLM.from_pretrained(checkpoint_dir)
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)

In [3]:
import os
import pandas as pd
from random import choice

import nltk

In [4]:
# The entries of data/raw are used as the intent names.
path_to_intents = os.path.join('data', 'raw')
intents = os.listdir(path_to_intents)


def get_path(x):
    """Return the CSV path for intent ``x``, i.e. data/raw/<x>/<x>.csv."""
    return os.path.join('data', 'raw', x, x + ".csv")

In [5]:
# `analyze` is imported, and its data loaded, with helpers/ as the working directory.
# The original directory is restored in `finally` so that a failing import or load does not
# leave the kernel stranded in helpers/ and break every relative path used afterwards.
# NOTE(review): get_data() presumably resolves its paths relative to helpers/ — confirm.
_notebook_dir = os.getcwd()
os.chdir('helpers')
try:
    from analyze import questions, entities, get_data
    data = get_data()
finally:
    os.chdir(_notebook_dir)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from collections import defaultdict

# Per-intent containers, all keyed by intent name.
text = defaultdict(list)          # masked prompt strings
encoded_text = defaultdict(list)  # token-id tensors built from the prompts

ans = defaultdict(list)           # reference answers, one word list per prompt
response = defaultdict(list)      # decoded model outputs

scores = defaultdict(list)        # one BLEU score per prompt

# One randomly chosen question per intent is reused for every row of that intent.
for (_questions, intent, entity) in zip(questions, intents, entities):
    question = choice(_questions)
    frame = data[intent]['df']

    for (utterance, value) in zip(frame['text'], frame[entity]):
        answer_words = str(value).split()
        # One <mask> placeholder per whitespace-separated word of the expected answer.
        mask_slots = " ".join("<mask>" for _ in answer_words)
        prompt = "<s> " + utterance.strip() + '. ' + question.strip() + mask_slots + " </s>"
        text[intent].append(prompt)
        ans[intent].append(answer_words)

In [7]:
# Encode every prompt as a tensor holding a single row of token ids. The prompts already
# carry explicit <s> ... </s> markers, hence add_special_tokens=False.
# Each per-intent list is rebuilt rather than appended to, so re-running this cell cannot
# duplicate entries and break the index alignment with `ans`.
for intent in intents:
    encoded_text[intent] = [
        torch.tensor([tokenizer.encode(row, add_special_tokens=False)])
        for row in text[intent]
    ]

In [8]:
# Sanity check: token ids of the first encoded BookRestaurant prompt.
encoded_text['BookRestaurant'][0]

tensor([[    0,  6298,    20,  2367,   953,  1437,  2391,    11,  2808,    13,
          5996,     4,  4820,   116, 50264, 50264, 50264, 50264, 50264,     2]])

In [9]:
# The raw prompt string that the tensor above was encoded from.
text['BookRestaurant'][0]

'<s> book The Middle East  restaurant in IN for noon. Where?<mask> <mask> <mask> <mask> <mask> </s>'

In [10]:
# Reference answer words for the same prompt (one <mask> was emitted per word).
ans['BookRestaurant'][0]

['The', 'Middle', 'East', 'in', 'IN']

In [11]:
# Score every prompt: take the argmax token at each position of the model output, decode it,
# and compare the resulting words with the reference answer using sentence-level BLEU.
# NOTE(review): the decode covers *every* position of the sequence, not only the <mask>
# slots, while the reference holds just the answer words — confirm this is intended.
smoothing = nltk.translate.bleu_score.SmoothingFunction().method4  # loop-invariant

with torch.no_grad():
    for intent in intents:
        # Start from empty lists so that re-running the cell (e.g. after an interrupt) does
        # not append new results on top of stale ones.
        response[intent] = []
        scores[intent] = []

        # Wrap the list itself (not enumerate) so tqdm knows the total and shows real progress.
        for i, row in enumerate(tqdm(encoded_text[intent], desc=intent)):
            out = model(row)
            predicted = tokenizer.decode(torch.argmax(out[0][0], dim=1).tolist()).split()
            response[intent].append(predicted)

            score = nltk.translate.bleu_score.sentence_bleu(
                    [ans[intent][i]],
                    predicted,
                    smoothing_function=smoothing,
                    auto_reweigh=True
                )
            scores[intent].append(score)
            

2042it [02:49, 12.03it/s]
596it [00:49, 10.32it/s]

KeyboardInterrupt: 

In [None]:
" ".join(response['BookRestaurant'][0])

In [None]:
# Average BLEU per intent. Intents without any scores (e.g. the scoring loop was interrupted
# before reaching them) are reported instead of raising ZeroDivisionError.
for intent in intents:
    intent_scores = scores[intent]
    if not intent_scores:
        print(f"For {intent}, no BLEU scores have been computed")
        continue
    print(f"For {intent}, avg. BLEU score is {sum(intent_scores) / len(intent_scores)}")

In [12]:
# `sample_sequence` is imported with transformers/ as the working directory. The original
# directory is restored in `finally` so a failing import does not leave the kernel there.
_notebook_dir = os.getcwd()
os.chdir('transformers')
try:
    from examples.run_generation import sample_sequence
finally:
    os.chdir(_notebook_dir)

# Sample 5 tokens for the first BookRestaurant prompt. The mask id is taken from the
# tokenizer instead of being hard-coded (it is 50264 in the encoded prompts shown earlier).
_out = sample_sequence(
    model, 5, encoded_text['BookRestaurant'][0][0], top_p = 0.9,
    is_xlm_mlm=True, xlm_mask_token=tokenizer.mask_token_id
)

  context = torch.tensor(context, dtype=torch.long, device=device)

  0%|                                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|█████████████████▍                                                                     | 1/5 [00:00<00:00,  5.75it/s][A
 40%|██████████████████████████████████▊                                                    | 2/5 [00:00<00:00,  5.78it/s][A
 60%|████████████████████████████████████████████████████▏                                  | 3/5 [00:00<00:00,  5.86it/s][A
 80%|█████████████████████████████████████████████████████████████████████▌                 | 4/5 [00:00<00:00,  5.97it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.90it/s][A


In [13]:
# The 1-D context tensor that was passed to sample_sequence.
encoded_text['BookRestaurant'][0][0]

tensor([    0,  6298,    20,  2367,   953,  1437,  2391,    11,  2808,    13,
         5996,     4,  4820,   116, 50264, 50264, 50264, 50264, 50264,     2])

In [14]:
# Show the tokenizer object's repr.
tokenizer

<transformers.tokenization_roberta.RobertaTokenizer at 0x1d9bb551dc8>

In [15]:
# Decode the ids returned by sample_sequence back to text to inspect the result.
tokenizer.decode(_out.squeeze(0).tolist())

'<s>book The Middle East  restaurant in IN for noon. Where?<mask><mask><mask><mask><mask></s>Ise</s> by<mask>'

In [None]:
# Check which token the id 50264 decodes to.
tokenizer.decode([50264])

In [None]:
# Raw token ids returned by sample_sequence, as a plain Python list.
_out.squeeze(0).tolist()

In [None]:
# Per-intent BLEU scores collected by the scoring loop.
scores

In [None]:
# Per-intent decoded model outputs collected by the scoring loop.
response

In [None]:
def fill_mask(masked_input, model, tokenizer, topk=5):
    """Return the ``topk`` most probable fillers for the single mask token in ``masked_input``.

    Args:
        masked_input: Text containing exactly one occurrence of ``tokenizer.mask_token``.
        model: Callable masked-LM; ``model(input_ids)[0]`` must be the per-position
            vocabulary logits of shape (1, seq_len, vocab_size).
        tokenizer: Object providing ``mask_token``, ``mask_token_id``, ``encode`` and
            ``convert_ids_to_tokens``.
        topk: Number of candidates to return.

    Returns:
        A list of ``(filled_text, probability, predicted_token)`` tuples, most probable first.

    Raises:
        AssertionError: If ``masked_input`` does not contain exactly one mask token.
    """
    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
    masked_token = tokenizer.mask_token
    assert masked_input.count(masked_token) == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    with torch.no_grad():  # inference only, no need to track gradients
        logits = model(input_ids)[0]  # Prediction scores are the first element of the output tuple
    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    prob = logits[0, masked_index, :].softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)

    # When the mask is preceded by a space, that space is replaced together with the mask,
    # because the predicted token carries its own leading space.
    spaced_mask = " {0}".format(masked_token)
    target = spaced_mask if spaced_mask in masked_input else masked_token

    topk_filled_outputs = []
    for value, index in zip(values, indices):
        predicted_token_bpe = tokenizer.convert_ids_to_tokens(index.item())
        # Word-start markers become a real space: U+2581 is the SentencePiece marker,
        # U+0120 ("Ġ") is the one used by RoBERTa/GPT-2 byte-level BPE.
        predicted_token = predicted_token_bpe.replace('\u2581', ' ').replace('\u0120', ' ')
        topk_filled_outputs.append((
            masked_input.replace(target, predicted_token),
            value.item(),
            predicted_token,
        ))
    return topk_filled_outputs


In [None]:
# Run the model on every blank-line-separated example in data/masked.txt and print the
# argmax token at each position, decoded back to text.
with open("data/masked.txt", 'r', encoding='utf-8') as f:
    examples = f.read().split("\n\n")

with torch.no_grad():  # inference only; same as the scoring loop
    for x in examples:
        if not x.strip():
            # A trailing blank line yields an empty chunk, which would encode to an empty
            # tensor and make the model call fail.
            continue
        _x = tokenizer.encode(x, add_special_tokens=False)
        out = model(torch.tensor([_x]))
        print(" ".join(tokenizer.decode(torch.argmax(out[0][0], dim=1).tolist()).split()))

In [None]:
# Token ids of the last example encoded by the data/masked.txt loop.
_x