In [1]:
import pickle


with open("../../data/article_texts.txt",'rb') as f:
    texts = pickle.load(f, encoding="UTF-8")
with open("../../data/english_anecs_list.pickle", "rb") as f:
    english_anecs_list = pickle.load(f, encoding="UTF-8")
with open("../../data/ids_to_labels.pickle", "rb") as f:
    ids_to_labels = pickle.load(f, encoding="utf-8")
with open("../../data/labels_to_ids.pickle", "rb") as f:
    labels_to_ids = pickle.load(f, encoding="utf-8")
with open("../../data/unique_labels.pickle", "rb") as f:
    unique_labels = pickle.load(f, encoding="UTF-8")
with open("../../data/translated_anecs.txt", "r") as f:
    translated_anecs = f.read().replace("<unk> ", "").replace("♪ ", "").split("\n")


In [2]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('dslim/bert-base-NER')


def tokenize(data: str):
    inputs = tokenizer(data, return_tensors="pt", truncation=True, padding=True)
    return inputs


def ids_to_tokens(text_input):
    return tokenizer.convert_ids_to_tokens(text_input)

In [3]:

from transformers import BertForTokenClassification
import torch


class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('dslim/bert-base-NER', num_labels=len(unique_labels),
                                                               ignore_mismatched_sizes=True)

    def forward(self, input_ids, label=None):
        output = self.bert(labels=label, input_ids=input_ids, return_dict=False)
        return output


model = BertModel()

model.load_state_dict(torch.load('../models/bert_trainedNEREnglish', map_location=torch.device('cpu')))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
import os
if "glove_vectors.pickle" in os.listdir("../../data/"):
    with open("../../data/glove_vectors.pickle", "rb") as f:
        glove_vectors = pickle.load(f)
else:
    import gensim.downloader
    glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')
    with open("../../data/glove_vectors.pickle", "wb") as f:
        pickle.dump(glove_vectors, f)


In [5]:
import numpy as np


def get_ners(text: str) -> list:
    output = []
    B = np.asarray([tokenizer(text.replace("-", ""))["input_ids"]]).reshape(1, 1, -1)
    logits = model(torch.as_tensor(np.array(B))[0])[0]
    for j in range(logits.shape[0]):
        #print(logits[i])
        logits_clean = logits[j].argmax(dim=1)
        words = text.replace("-", "").split()
        tokenized_sentence = ids_to_tokens(tokenizer(text.replace("-", ""))["input_ids"])
        #for i in range(len(logits_clean)):
        #    print(tokenized_sentence[i], ids_to_labels[logits_clean[i].item()])
        #print([ids_to_labels[x.item()] for x in logits_clean])
        i = 1
        for elem in logits_clean[1:-1]:
            if i > 1 and (tokenized_sentence[i][:2] == "##" or ids_to_labels[elem.item()][0] == "I"):
                if tokenized_sentence[i][:2] == "##":
                    output[-1]["word"] += tokenized_sentence[i][2:]
                else:
                    output[-1]["word"] += tokenized_sentence[i]
            else:
                output.append({"word": tokenized_sentence[i], "entity": ids_to_labels[elem.item()]})
            i += 1
    return output

In [6]:
def get_embeddings(list_of_tags: list):
    emeddings = []
    for tag in list_of_tags:
        try:
            #print(tokenizer.convert_ids_to_tokens(tokenizer(tag["word"])["input_ids"][1]))
            embed = glove_vectors[tag["word"]]
            emeddings.append({'entity': tag["entity"], 'word': tag["word"], "embedding": embed})
        except:
            emeddings.append({'entity': tag["entity"], 'word': tag["word"], "embedding": glove_vectors["base"]})
            #print("Broken embedding", tag["word"], tag["entity"])
    return emeddings


def get_non_o(ner_words):
    a = []
    for x in ner_words:
        if x["entity"] != "O":
            a.append(x)
    return a

In [7]:
get_embeddings(get_non_o(get_ners(english_anecs_list[0])))

[{'entity': 'B-per',
  'word': 'Jesus',
  'embedding': array([-0.85078  ,  0.14052  ,  0.19218  , -0.56941  ,  0.012045 ,
         -0.13574  ,  0.026476 ,  0.63774  ,  0.13145  , -1.5406   ,
          0.022069 , -0.093937 , -0.0075906,  0.18398  ,  0.098117 ,
          0.34659  , -0.29379  , -0.023845 ,  0.34028  , -0.16479  ,
         -0.4635   , -0.7426   , -0.098289 , -0.66905  , -0.20917  ,
         -0.097382 , -0.028347 ,  0.5289   , -0.1507   ,  0.52132  ,
          0.096284 , -0.20201  , -0.23403  ,  0.35867  ,  0.14002  ,
          0.13554  ,  0.15319  ,  0.32805  ,  0.020936 ,  0.1657   ,
         -0.0089688,  0.059181 , -0.1375   ,  0.035753 , -0.17763  ,
          0.12395  ,  0.041585 ,  0.51095  ,  0.11174  ,  0.030507 ,
         -0.079747 , -0.70903  , -0.60473  ,  0.36105  , -0.16732  ,
         -0.044104 ,  0.10538  ,  0.39964  ,  0.22989  ,  0.31049  ,
         -0.24369  , -0.33433  ,  0.3908   , -0.52597  , -0.44613  ,
         -0.055256 , -0.084394 , -0.28462  ,  0.36

In [8]:
import time
from tqdm.notebook import tqdm
def prepare_embeddings(anecs_list: list) -> list:
    start = time.time()
    anecs_prepared = []
    errors = []
    i = 0
    for anec in tqdm(anecs_list):
        i+=1
        try:
            ner_anec = get_ners(anec)
            a = []
            for x in ner_anec:
                a.append(x["word"])
            filtred_anec = get_non_o(ner_anec)
            anec_embeddings = get_embeddings(filtred_anec)
            anecs_prepared.append((anec_embeddings, " ".join(a)) )
        except Exception as e:
            errors.append(f"{e} : {anec}")
    print("This data was corrupted", errors)
    return anecs_prepared
with open("../../data/translated_anecs_prepared.pickle", "wb") as f:
   pickle.dump(prepare_embeddings(translated_anecs), f)
with open("../../data/english_anecs_prepared.pickle", "wb") as f:
    pickle.dump(prepare_embeddings(english_anecs_list), f)


  0%|          | 0/1143 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors


This data was corrupted ['The expanded size of the tensor (622) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 622].  Tensor sizes: [1, 512] : Dad: Do you want me to make you a roller coaster out of these beads? Son: Why not, Dad? Dad: Well, why? It\'s... It\'s... What do you need roller skates for? You\'ll be riding around the house... from the kitchen to the bedroom. Then back to the kitchen. To the pantry again. To the bathroom over there. It\'s... It\'s... I can make it. Fuck you! If you don\'t want to, you can say, "Fuck you, Dad and your roller coaster!" I don\'t want anything from you, and I don\'t give a fuck about your fatherly care! Well, you\'re thinking that now, aren\'t you? You did? Son: No, it was like... Dad: It was like! It\'s... It\'s... In the fucking garden! - What? - What? I think so! It\'s... It\'s... In the water! - What? - What? When you feel like it, you should be baptized! You got it? Do you know that wise saying, or are yo

  0%|          | 0/3042 [00:00<?, ?it/s]



In [15]:
with open("../../data/translated_anecs_prepared.pickle", "rb") as f:
    translated_anecs_prepared = pickle.load(f)
with open("../../data/english_anecs_prepared.pickle", "rb") as f:
    english_anecs_prepared = pickle.load(f)

In [16]:
english_anecs_prepared[7]

([{'entity': 'B-per',
   'word': 'Grandma',
   'embedding': array([-0.85078  ,  0.14052  ,  0.19218  , -0.56941  ,  0.012045 ,
          -0.13574  ,  0.026476 ,  0.63774  ,  0.13145  , -1.5406   ,
           0.022069 , -0.093937 , -0.0075906,  0.18398  ,  0.098117 ,
           0.34659  , -0.29379  , -0.023845 ,  0.34028  , -0.16479  ,
          -0.4635   , -0.7426   , -0.098289 , -0.66905  , -0.20917  ,
          -0.097382 , -0.028347 ,  0.5289   , -0.1507   ,  0.52132  ,
           0.096284 , -0.20201  , -0.23403  ,  0.35867  ,  0.14002  ,
           0.13554  ,  0.15319  ,  0.32805  ,  0.020936 ,  0.1657   ,
          -0.0089688,  0.059181 , -0.1375   ,  0.035753 , -0.17763  ,
           0.12395  ,  0.041585 ,  0.51095  ,  0.11174  ,  0.030507 ,
          -0.079747 , -0.70903  , -0.60473  ,  0.36105  , -0.16732  ,
          -0.044104 ,  0.10538  ,  0.39964  ,  0.22989  ,  0.31049  ,
          -0.24369  , -0.33433  ,  0.3908   , -0.52597  , -0.44613  ,
          -0.055256 , -0.084394 ,

In [23]:
for anec in english_anecs_prepared:
    if len(anec[0]) > 0:
        if len(anec[0][0]) > 0:
            if anec[0][0]["entity"] == "B-org":
                print(anec[0][0]["entity"], anec[0][0]["word"], anec[1])

B-org DobermanPinscher There's a guy with a Doberman Pinscher and a guy with a Chihuahua. The guy with the Doberman Pinscher says to the guy with a Chihuahua, 'Let's go over to that restaurant and get something to eat.' 
B-org Dane Three dogs are sitting in the waiting room of a vets office. One is a poodle, one is a schnauzer and the other is a great Dane. The poodle turns to the schnauzer and asks "why are you here?" The schnauzer responds, "I'm 17 years old. I don't see or hear very well. I've been having accidents in the house. My owner says I'm too old and sick so he brought me here to be put to sleep." The schnauzer asks the poodle "why are you here?" The poodle responds, "I've not been myself lately. I've been especially high strung. I've been barking all the time, I've been snapping at people and I even bit one of the neighbor's kids. Nobody knows why this has been happening. My owner says he can't risk me biting somebody else so he brought me here to be put to sleep." The pood