In [1]:
import re
import numpy as np
import pandas as pd
import nltk
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
logging.basicConfig(level=logging.INFO)

In [2]:
# example
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')  # bert-large-cased-whole-word-masking
# model.eval()

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddi

In [3]:
# example
text = "[CLS] Who was Jim Henson ? Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

masked_index = 2
tokenized_text[masked_index] = '[MASK]'
tokenized_text[3] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])

with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == 'henson'
print(predicted_token)
print(tokenizer.convert_ids_to_tokens([torch.argmax(predictions[0, 3]).item()])[0])


['[CLS]', 'who', 'was', 'jim', 'henson', '?', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']
was
jim


In [4]:
w_tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking')
w_model = BertForMaskedLM.from_pretrained('bert-large-cased-whole-word-masking')  # bert-large-cased-whole-word-masking
# w_model.eval()

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt from cache at /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/d64950f174bc2864a79ac854dd0e76a0daa587610f43c47f24eb977d31bcec0c.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json from cache at /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/eed84da4538f127abd693ec6310279654f6b9ce6c3c367c73d8d46812172c942.e1d0cd972de64b28f3a5bee0ffccda07658b2b3e827e0ef38c5799e9aaa23f19
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediat

In [4]:
def generate_mask(words, mask=None, word_inds=None):
    if mask:
        return mask
    mask = [1] * len(words)
    for word_ind in word_inds:
        mask[word_ind] = 0
    return mask

def predict_masked(model, tokenizer, words, mask=None, word_inds=None):
    mask = generate_mask(words, mask, word_inds)
    tokenized_text = []
    tokenized_mask = []
    for word, m in zip(words, mask):
        tokenized_word = tokenizer.tokenize(word)
        tokenized_text.extend(tokenized_word)
        tokenized_mask.extend([m] * len(tokenized_word))
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    mask_tensor = torch.tensor([tokenized_mask])

    with torch.no_grad():
        outputs = model(tokens_tensor, masked_lm_labels=mask_tensor)
        predictions = outputs[1]

    tokenized_output = []
    for masked_index, is_masked in enumerate(tokenized_mask):
        if not is_masked:
            predicted_index = torch.argmax(predictions[0, masked_index]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            tokenized_output.append(predicted_token)
#             print("predict:", predicted_token)
        else:
            tokenized_output.append(tokenized_text[masked_index])
#     print("Tokenized:", tokenized_output, word_ind)
    return tokenizer.convert_tokens_to_string(tokenized_output)

def accuracy(model, tokenizer, texts, k=1):
    true_pred = 0
    all_pred = 0
    for text in texts:
        tokenized_text = nltk.word_tokenize(text.lower())
        words = ['[CLS]'] + tokenized_text + ['[SEP]'] # token_pattern.findall # nltk.word_tokenize
        tokenized_text = predict_masked(model, tokenizer, words, word_inds=[])[6:-6]
        for ind, word in enumerate(words[1:-k]):
            predicted_sentance = predict_masked(model, tokenizer, words, word_inds=list(range(ind + 1, ind + 1 + k)))[6:-6]
#             print('|' + tokenized_text + '|', '|' + predicted_sentance + '|')
            if predicted_sentance == tokenized_text:
                true_pred += 1
            all_pred += 1
    return true_pred / all_pred

In [5]:
predict_masked(model, tokenizer, ['[CLS]', 'who', 'was', 'jim', 'hendson', '?', '[SEP]'], [1, 1, 0, 1, 1, 1, 1])

'[CLS] who was jim hendson ? [SEP]'

In [6]:
accuracy(model, tokenizer, ["Who was Jim Henson ?", "Jim Henson was a puppeteer ."], k=2)

0.8888888888888888

In [8]:
def read_nips_data(path, documents_limit=None):
    df = pd.read_csv(path, compression='gzip', sep=',')
    docs = df['paper_text'].values.astype(np.str)
    random = np.random.RandomState(13)
    random.shuffle(docs)
    docs = docs if documents_limit is None else docs[:documents_limit]
    docs = [nltk.sent_tokenize(text) for text in docs]
    docs = [item for sublist in docs for item in sublist]
    return docs

In [9]:
# docs_100 = read_nips_data('../resources/datasets/nips-papers.csv.gz', 100)
docs_1 = read_nips_data('../resources/datasets/nips-papers.csv.gz', 1)

In [140]:
docs_1[:5]

['Boosting Density Estimation\n\nSaharon Rosset\nDepartment of Statistics\nStanford University\nStanford, CA, 94305\nsaharon@stat.stanford.edu\n\nEran Segal\nComputer Science Department\nStanford University\nStanford, CA, 94305\neran@cs.stanford.edu\n\nAbstract\nSeveral authors have suggested viewing boosting as a gradient descent search for\na good fit in function space.',
 'We apply gradient-based boosting methodology to\nthe unsupervised learning problem of density estimation.',
 'We show convergence\nproperties of the algorithm and prove that a strength of weak learnability property applies to this problem as well.',
 'We illustrate the potential of this approach\nthrough experiments with boosting Bayesian networks to learn density models.',
 '1 Introduction\nBoosting is a method for incrementally building linear\x02\x01\x04combinations\nof ?weak?']

In [150]:
accuracy(model, tokenizer, docs_1)
# docs_100

0.9488834612700628

In [160]:
accuracy(model, tokenizer, docs_1, k=2)

0.917276720351391

In [10]:
for k in range(1, 10):
    print(f"accuracy for {k}: {accuracy(model, tokenizer, docs_1, k=k)}")

accuracy for 1: 0.9488834612700628
accuracy for 2: 0.917276720351391
accuracy for 3: 0.8899365018279777
accuracy for 4: 0.8648593971272507
accuracy for 5: 0.8405704555129843
accuracy for 6: 0.8171223666517257
accuracy for 7: 0.7944785276073619
accuracy for 8: 0.771542090886516
accuracy for 9: 0.7511111111111111


In [None]:
# def mask_sentanse

In [6]:
w_tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking')
w_model = BertForMaskedLM.from_pretrained('bert-large-cased-whole-word-masking')  # bert-large-cased-whole-word-masking
w_model.eval()

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt not found in cache, downloading to /var/folders/sy/k57bmcxn26s1mbtgfzj4ff540000gp/T/tmpfwvavooj
100%|██████████| 213450/213450 [00:01<00:00, 191138.54B/s]
INFO:pytorch_transformers.file_utils:copying /var/folders/sy/k57bmcxn26s1mbtgfzj4ff540000gp/T/tmpfwvavooj to cache at /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/d64950f174bc2864a79ac854dd0e76a0daa587610f43c47f24eb977d31bcec0c.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:pytorch_transformers.file_utils:creating metadata file for /Users/aleksandr.khvorov/.cache/torch/pytorch_transformers/d64950f174bc2864a79ac854dd0e76a0daa587610f43c47f24eb977d31bcec0c.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:pytorch_transformers.file_utils:removing temp file /var/folders/sy/k57bmcxn26s1mbtgfzj4ff540000gp/T/tmpfwvavooj
INFO:pytorch_transformers.tokeniz

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate):