In [5]:
import pickle
import time

with open("../../data/article_texts.txt",'rb') as f:
    texts = pickle.load(f, encoding="UTF-8")
with open("../../data/english_anecs_list.pickle", "rb") as f:
    english_anecs_list = pickle.load(f, encoding="UTF-8")
with open("../../data/ids_to_labels.pickle", "rb") as f:
    ids_to_labels = pickle.load(f, encoding="utf-8")
with open("../../data/labels_to_ids.pickle", "rb") as f:
    labels_to_ids = pickle.load(f, encoding="utf-8")
with open("../../data/unique_labels.pickle", "rb") as f:
    unique_labels = pickle.load(f, encoding="UTF-8")
with open("../../data/translated_anecs.txt", "r") as f:
    translated_anecs = f.read().replace("<unk> ", "").replace("♪ ", "").split("\n")

In [11]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('dslim/bert-base-NER')


def tokenize(data: str):
    inputs = tokenizer(data, return_tensors="pt", truncation=True, padding=True)
    return inputs


def ids_to_tokens(text_input):
    return tokenizer.convert_ids_to_tokens(text_input)

In [13]:

from transformers import BertForTokenClassification
import torch


class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('dslim/bert-base-NER', num_labels=len(unique_labels),
                                                               ignore_mismatched_sizes=True)

    def forward(self, input_ids, label=None):
        output = self.bert(labels=label, input_ids=input_ids, return_dict=False)
        return output


model = BertModel()

model.load_state_dict(torch.load('../models/bert_trainedNEREnglish', map_location=torch.device('cpu')))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [14]:
import numpy as np


def get_ners(text: str) -> list:
    output = []
    B = np.asarray([tokenizer(text.replace("-", ""))["input_ids"]]).reshape(1, 1, -1)
    logits = model(torch.as_tensor(np.array(B))[0])[0]
    for j in range(logits.shape[0]):
        #print(logits[i])
        logits_clean = logits[j].argmax(dim=1)
        words = text.replace("-", "").split()
        tokenized_sentence = ids_to_tokens(tokenizer(text.replace("-", ""))["input_ids"])
        #for i in range(len(logits_clean)):
        #    print(tokenized_sentence[i], ids_to_labels[logits_clean[i].item()])
        #print([ids_to_labels[x.item()] for x in logits_clean])
        k = 0
        i = 0
        for el in logits_clean[1:-1]:
            if i == len(words):
                break
            elem = logits_clean[1:-1][k]
            if i + 1 <= len(logits_clean[1:-1]) and tokenized_sentence[i][:2] == "##":
                if elem.item() == "O":
                    label = logits_clean[1:-1][i]
                else:
                    label = elem.item()
                output.append({"word": words[i], "entity": ids_to_labels[label]})
                k += 2
            else:
                output.append({"word": words[i], "entity": ids_to_labels[elem.item()]})
                k += 1
            i += 1
    return output

In [15]:
import gensim.downloader

glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [16]:
def get_embeddings(list_of_tags: list):
    emeddings = []
    for tag in list_of_tags:
        try:
            #print(tokenizer.convert_ids_to_tokens(tokenizer(tag["word"])["input_ids"][1]))
            embed = glove_vectors[tag["word"]]
            emeddings.append({'entity': tag["entity"], 'word': tag["word"], "embedding": embed})
        except:
            emeddings.append({'entity': tag["entity"], 'word': tag["word"], "embedding": glove_vectors["base"]})
            #print("Broken embedding", tag["word"], tag["entity"])
    return emeddings


def get_non_o(ner_words):
    a = []
    for x in ner_words:
        if x["entity"] != "O":
            a.append(x)
    return a

In [7]:
get_embeddings(get_non_o(get_ners(english_anecs_list[0])))

[{'entity': 'B-per',
  'word': 'watching',
  'embedding': array([ 2.1202e-01,  2.8917e-01,  4.5201e-01, -1.3361e-04, -1.5867e-01,
          3.6638e-01, -2.4863e-01, -9.0621e-02, -1.6291e-01, -4.7911e-01,
          1.6346e-01, -3.9370e-02,  5.3345e-02, -1.0756e-01, -5.5540e-02,
         -1.3395e-01,  6.0352e-02, -4.6904e-02,  6.3587e-01, -3.3198e-02,
          3.4816e-01,  1.1032e-01,  1.9035e-01,  2.5777e-01,  9.0213e-02,
         -2.0360e-02,  1.9367e-01, -4.5360e-01,  4.4856e-01, -1.0442e-01,
         -1.5787e-01, -2.6481e-01,  1.2450e-01, -9.8333e-02, -1.2574e+00,
          3.9935e-01, -4.3491e-01,  8.3862e-02, -3.0352e-01,  2.3797e-02,
          3.3142e-01, -9.9772e-02, -1.5809e-01, -2.6553e-01, -1.4040e-01,
         -2.9482e-01,  3.3828e-01,  2.0326e-01,  4.8495e-01,  1.0957e-01,
         -2.7091e-01, -3.4331e-01, -1.7515e-01, -5.4136e-01, -1.7845e-01,
          5.1594e-01,  2.0518e-01,  2.5785e-01,  8.5071e-02,  2.8581e-01,
         -1.0776e-01, -1.9538e-01,  4.8971e-01,  7.9599e

In [17]:
import time
from tqdm.notebook import tqdm
def prepare_embeddings(anecs_list: list) -> list:
    start = time.time()
    anecs_prepared = []
    errors = []
    i = 0
    for anec in tqdm(anecs_list):
        i+=1
        try:
            filtred_anec = get_non_o(get_ners(anec))
            anec_embeddings = get_embeddings(filtred_anec)
            anecs_prepared.append((anec_embeddings, anec) )
        except Exception as e:
            errors.append(f"{e} : {anec}")
    print("This data was corrupted", errors)
    return anecs_prepared
#with open("../../data/translated_anecs_prepared.pickle", "wb") as f:
#   pickle.dump(prepare_embeddings(translated_anecs), f)
with open("../../data/english_anecs_prepared.pickle", "wb") as f:
    pickle.dump(prepare_embeddings(english_anecs_list), f)


  0%|          | 0/3042 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors




In [18]:
with open("../../data/translated_anecs_prepared.pickle", "rb") as f:
    translated_anecs_prepared = pickle.load(f)
with open("../../data/english_anecs_prepared.pickle", "rb") as f:
    english_anecs_prepared = pickle.load(f)

In [20]:
english_anecs_prepared[3]

([],
 'A duck walks into a shop and asks the manager:-Got any fresh fruit?-No.-Got any fresh vegetables?-No.We have only dry goods.')