In [44]:
import pickle
import time

with open("../../data/article_texts.txt",'rb') as f:
    texts = pickle.load(f, encoding="UTF-8")
with open("../../data/english_anecs_list.pickle", "rb") as f:
    english_anecs_list = pickle.load(f, encoding="UTF-8")

In [3]:
text = ['SpaceX Starship Blows Up Minutes After Launch',
 'SpaceX’s Starship rocket, the most powerful ever built, blasted off on an unpiloted maiden flight Thursday, flying for more than two minutes before exploding. What do you think?']

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize(data:str):
    inputs = tokenizer(data, return_tensors="pt", truncation=True, padding=True)
    return inputs

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [33]:
tokenized_text = []
for i in text:
    tokenized_text.append(tokenize(i))
print(tokenized_text)
print(tokenizer.decode(tokenized_text[0].input_ids[0]))
print(tokenizer.convert_ids_to_tokens(tokenized_text[0].input_ids[0]))

[{'input_ids': tensor([[  101,  2686,  2595,  3340,  5605, 13783,  2039,  2781,  2044,  4888,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, {'input_ids': tensor([[  101,  2686,  2595,  1521,  1055,  3340,  5605,  7596,  1010,  1996,
          2087,  3928,  2412,  2328,  1010, 18461,  2125,  2006,  2019,  4895,
          8197, 10994,  2098, 10494,  3462,  9432,  1010,  3909,  2005,  2062,
          2084,  2048,  2781,  2077, 20728,  1012,  2054,  2079,  2017,  2228,
          1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}]
[CLS] spacex starship blows up minutes after launch [SEP]
['[CLS]', 'spac

In [39]:
from transformers import AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import pipeline

pos_model = AutoModelForTokenClassification.\
    from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=tokenizer)


ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer)

In [55]:
example = texts[1][1]
ner_results = ner_pipeline(example)

keywords = [result["word"] for result in ner_results]
print(ner_results, example, keywords)


[{'entity': 'B-PER', 'score': 0.9989837, 'index': 1, 'word': 'netflix', 'start': 0, 'end': 7}, {'entity': 'B-PER', 'score': 0.9984754, 'index': 21, 'word': 'trail', 'start': 94, 'end': 99}, {'entity': 'I-PER', 'score': 0.99533194, 'index': 22, 'word': '##bla', 'start': 99, 'end': 102}] Netflix announced it will be ending its DVD-by-mail rental service that set the stage for its trailblazing video streaming service, ending an era that began 25 years ago when delivering discs through the mail was considered a revolutionary concept. What do you think? ['netflix', 'trail', '##bla']


In [57]:
pos_tags = pos_pipeline(texts[1][1])
unique_pos_tags = []
for pos_tag in pos_tags:
    if pos_tag["word"] in keywords:
        unique_pos_tags.append(pos_tag["entity"])
unique_pos_tags

['NNP', 'FW', 'FW']

In [58]:
pos_template_result = pos_pipeline(english_anecs_list[0])
pos_template_tags = []
for pos_tag in pos_template_result:
    pos_template_tags.append(pos_tag["entity"])
pos_template_tags

['JJ',
 'JJ',
 'FW',
 'FW',
 'NN',
 'FW',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW',
 'NN',
 'FW',
 'CD',
 'NN',
 'FW',
 'FW',
 'FW',
 'CD',
 'CD',
 'CD',
 'FW',
 ':',
 'FW',
 'FW',
 'FW',
 'CD',
 'FW',
 'JJ',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW',
 'NN',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW',
 'FW',
 'FW',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW',
 'FW',
 'FW',
 'CD',
 'NN',
 ':',
 'FW',
 'FW',
 'FW',
 'FW',
 'CD',
 'FW',
 'JJ',
 'NN',
 'NN',
 'CD',
 'NN',
 'NN',
 'CD',
 'FW',
 'FW',
 'NN',
 'NN',
 'CD',
 'NN',
 'FW',
 'FW',
 'NN',
 'FW',
 ':',
 'FW',
 'FW',
 'FW',
 'NN',
 'NN',
 'NN',
 'CD',
 'FW',
 'FW',
 'NN',
 'NN',
 'CD',
 'NN',
 'NNS',
 ':',
 'FW',
 'NN',
 'FW',
 'FW',
 'NN',
 'NN',
 'CD',
 'FW',
 'FW',
 'FW',
 'NN',
 'FW']