In [7]:
import pickle
import time

with open("../../data/article_texts.txt",'rb') as f:
    texts = pickle.load(f, encoding="UTF-8")
with open("../../data/english_anecs_list.pickle", "rb") as f:
    english_anecs_list = pickle.load(f, encoding="UTF-8")
with open("../../data/ids_to_labels.pickle", "rb") as f:
    ids_to_labels = pickle.load(f, encoding="utf-8")
with open("../../data/labels_to_ids.pickle", "rb") as f:
    labels_to_ids = pickle.load(f, encoding="utf-8")
with open("../../data/unique_labels.pickle", "rb") as f:
    unique_labels = pickle.load(f, encoding="UTF-8")

In [8]:
len(english_anecs_list)

3042

In [9]:
text = ['SpaceX Starship Blows Up Minutes After Launch',
 'SpaceX’s Starship rocket, the most powerful ever built, blasted off on an unpiloted maiden flight Thursday, flying for more than two minutes before exploding. What do you think?']

In [125]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize(data:str):
    inputs = tokenizer(data, return_tensors="pt", truncation=True, padding=True)
    return inputs
def ids_to_tokens(text_input):
    return tokenizer.convert_ids_to_tokens(text_input)

In [30]:
from transformers import BertForTokenClassification
import torch

class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('dslim/bert-base-NER', num_labels=len(unique_labels),
                                                               ignore_mismatched_sizes=True)

    def forward(self, input_ids, label=None):
        output = self.bert(labels=label, input_ids=input_ids, return_dict=False)
        return output



model = BertModel()

model.load_state_dict(torch.load('bert_trainedNEREnglish', map_location=torch.device('cpu')))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [90]:
import numpy as np
def get_ners(text:str) -> list:
    output = []
    B = np.asarray([tokenizer(text.replace("-", ""))["input_ids"]]).reshape(1,1,-1)
    logits = model(torch.as_tensor(np.array(B))[0])[0]
    for j in range(logits.shape[0]):
        #print(logits[i])
        logits_clean = logits[j].argmax(dim=1)
        words = text.replace("-", "").split()
        #for i in range(len(eval(answer))-1):
        #    print(words[i], eval(answer)[i])
        i = 0
        tokenized_sentence = ids_to_tokens(tokenizer(text.replace("-", ""))["input_ids"])
        #print([ids_to_labels[x.item()] for x in logits_clean])
        k = 0
        for el in logits_clean:
            if i == len(words):
                break
            elem = logits_clean[k]
            if i+1 <= len(logits_clean) and tokenized_sentence[i][:2] == "##":
                if elem.item() == "O":
                    label = logits_clean[i]
                else:
                    label = elem.item()
                output.append({"word":words[i], "entity":ids_to_labels[label]})
                k+=2
            else:
                output.append({"word":words[i], "entity":ids_to_labels[elem.item()]})
                k+=1
            i += 1
    return output
print(get_ners(text))

[{'word': 'NPR', 'entity': 'O'}, {'word': 'plans', 'entity': 'B-org'}, {'word': 'to', 'entity': 'B-org'}, {'word': 'shut', 'entity': 'O'}, {'word': 'down', 'entity': 'O'}, {'word': 'its', 'entity': 'O'}, {'word': 'official', 'entity': 'O'}, {'word': 'Twitter', 'entity': 'O'}, {'word': 'accounts', 'entity': 'O'}, {'word': 'after', 'entity': 'B-org'}, {'word': 'the', 'entity': 'O'}, {'word': 'Elon', 'entity': 'O'}, {'word': 'Muskowned', 'entity': 'O'}, {'word': 'platform’s', 'entity': 'O'}, {'word': 'decision', 'entity': 'I-org'}, {'word': 'to', 'entity': 'I-org'}, {'word': 'label', 'entity': 'O'}, {'word': 'it', 'entity': 'O'}, {'word': 'as', 'entity': 'O'}, {'word': '“stateaffiliated,”', 'entity': 'O'}, {'word': 'which', 'entity': 'O'}, {'word': 'categorizes', 'entity': 'O'}, {'word': 'all', 'entity': 'O'}, {'word': '52', 'entity': 'O'}, {'word': 'NPRrun', 'entity': 'O'}, {'word': 'Twitter', 'entity': 'O'}, {'word': 'accounts', 'entity': 'O'}, {'word': 'as', 'entity': 'I-tim'}, {'word'

In [23]:
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [100]:
def get_embeddings(list_of_tags: list):
    emeddings = []
    for tag in list_of_tags:
        try:
            #print(tokenizer.convert_ids_to_tokens(tokenizer(tag["word"])["input_ids"][1]))
            embed = glove_vectors[tag["word"]]
            emeddings.append({'entity': tag["entity"] , 'word': tag["word"], "embedding": embed})
        except:
            pass
    return emeddings
def get_non_o(ner_words):
    a = []
    for x in ner_words:
        if x["entity"] != "O" :
            a.append(x)
    return a


In [122]:
from numpy.linalg import norm
from collections import Counter

def count_cos_embeddings(text_embeddings, anec_embeddings) -> (float, dict):
    suitable_pairs = {0: []}
    anec_unique_tags_counter = Counter()
    embedding_cosine_sum = 0
    for embedding in text_embeddings:
        cosines = []
        pair = ""
        simmilarity_tags = {}
        for embed in anec_embeddings:
            anec_unique_tags_counter[embed["entity"]] += 1
            if embed["entity"] == embedding["entity"]:
                v1 = embedding["embedding"]
                v2 = embed["embedding"]
                cos = np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
                cosines.append((cos, embed["word"], embed["entity"] ))
        simmilarity_tags[embedding["word"]] = cosines
        suitable_pairs[0].append(simmilarity_tags)

    top_similarity = {}
    for key in suitable_pairs.keys():
        for tag in anec_unique_tags_counter.keys():
            for embed in anec_embeddings:
                top_similarity[embed["word"]] = (-10, "")
            for word_dict in suitable_pairs[key]:
                word = list(word_dict.keys())[0]
                for sim in word_dict[word]:
                    if top_similarity[sim[1]][0] < sim[0]:
                        top_similarity[sim[1]] = (sim[0], word)
    #print(top_similarity)
    #print(suitable_pairs)
    for key in top_similarity.keys():
        embedding_cosine_sum += top_similarity[key][0]
    embedding_cosine_sum /= len(top_similarity.keys()) if len(top_similarity.keys())>5 else 5
    return embedding_cosine_sum, top_similarity

In [None]:
anec = english_anecs_list[3]
text = texts[9][1]

In [106]:
filtred_anec = get_non_o(get_ners(anec))
filtred_text = get_non_o(get_ners(text))
anec_embeddings = get_embeddings(filtred_anec)
text_embeddings = get_embeddings(filtred_text)
count_cos_embeddings(text_embeddings, anec_embeddings)

Counter({'B-org': 32, 'I-org': 8})
{'walks': (0.27021483, 'you'), 'into': (0.46386227, 'to'), 'and': (0.53074193, 'to'), 'asks': (0.43194485, 'you'), 'any': (0.58032215, 'you')}


0.45541720390319823

In [123]:
# anec_embeddings - pickle embeddings of all anecs
best_anec = None
best_cos = -10
best_text_ner = None
best_simmilarity = None
for anec in english_anecs_list[0:100]:
    try:
        filtred_anec = get_non_o(get_ners(anec))
        filtred_text = get_non_o(get_ners(text))
        anec_embeddings = get_embeddings(filtred_anec)
        text_embeddings = get_embeddings(filtred_text)
        new_cosine, new_simmilarity = count_cos_embeddings(text_embeddings, anec_embeddings)
        if new_cosine > best_cos:
            best_cos = new_cosine
            best_anec = anec
            best_text_ner = get_non_o(get_ners(text))
            best_simmilarity = new_simmilarity
    except:
        print(anec)
print(best_anec,best_cos, best_text_ner, best_simmilarity)

Counter({'I-org': 8, 'B-org': 8, 'B-geo': 8, 'B-tim': 8})
Counter({'B-per': 8})
Counter({'B-org': 16})
Counter({'B-org': 32, 'I-org': 8})
Counter({'B-tim': 16, 'I-tim': 16, 'B-geo': 8, 'B-org': 8, 'I-org': 8})
Counter({'B-org': 32, 'B-geo': 24, 'I-org': 16, 'B-tim': 16, 'I-tim': 8})
Counter()
Counter({'B-tim': 32, 'B-org': 16, 'B-per': 16, 'I-per': 8, 'I-org': 8})
Counter({'B-org': 56, 'B-per': 24, 'I-org': 16, 'B-gpe': 8})
Counter({'B-org': 40, 'I-org': 32, 'B-geo': 32})
Counter({'B-org': 16, 'I-org': 8})
Counter({'B-geo': 8})
Counter({'B-org': 40, 'I-org': 32, 'B-tim': 24, 'B-geo': 16, 'B-per': 8, 'I-per': 8, 'B-gpe': 8})
Counter({'B-per': 8})
Counter({'B-org': 64, 'I-org': 48, 'B-geo': 8, 'I-tim': 8})
Counter({'B-org': 16, 'I-per': 16, 'B-per': 16, 'I-org': 16, 'B-tim': 8})
A man running a little behind schedule arrives at a picture theatre, goes  in to watch the movie that has already started, and as his eyes adjust to  the darkness, he is surprised to see a dog sitting beside its 

In [124]:
words = best_anec.split()
key_words = best_simmilarity.keys()
for i in range(len(words)):
    if words[i] in key_words:
        words[i] = best_simmilarity[words[i]][1]
resulted_text = " ".join(words)
print(resulted_text)

A little old lady buys a pair of parrots, do do identify their sexes. She calls the shop, and the man you do her to watch them carefully and all would become to in time.
