In [7]:
# needed to load the REBEL model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch

# wrapper for wikipedia API
import wikipedia

# scraping of web articles
from newspaper import Article, ArticleException

# google news scraping
from GoogleNews import GoogleNews

# graph visualization
from pyvis.network import Network

# show HTML in notebook
import IPython

In [8]:
# Load the REBEL tokenizer and seq2seq model. The checkpoint id is defined once
# so the tokenizer and the model are always loaded from the same checkpoint.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

In [9]:
# from https://huggingface.co/Babelscape/rebel-large
def extract_relations_from_model_output(text):
    """Parse one decoded model output string into a list of relation dicts.

    The string is scanned token by token (whitespace split) after removing
    the ``<s>``, ``<pad>`` and ``</s>`` markers. Three marker tokens switch
    which part of the current triple is being filled:

    * ``<triplet>`` starts a new head (subject),
    * ``<subj>`` starts a new tail (object) for the current head,
    * ``<obj>`` starts the relation type for the current head/tail pair.

    A triple is recorded only when head, type and tail are all non-empty, so
    truncated or malformed output cannot yield partial triples or re-use a
    relation type that belonged to a previous tail.

    Args:
        text: Decoded output string that still contains the marker tokens.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triples were completed.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # 'x' = no marker seen yet; plain tokens are ignored

    def flush():
        # Record the pending triple, but only if all three parts are present.
        head, type_, tail = subject.strip(), relation.strip(), object_.strip()
        if head and type_ and tail:
            relations.append({'head': head, 'type': type_, 'tail': tail})

    text_replaced = (
        text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    )
    for token in text_replaced.split():
        if token == "<triplet>":
            flush()
            current = 't'
            # A new head invalidates everything collected so far.
            subject, relation, object_ = '', '', ''
        elif token == "<subj>":
            flush()
            current = 's'
            # Same head, new tail: the old relation type must not carry over,
            # otherwise output cut off before the next <obj> would pair the
            # stale type with the new tail.
            relation, object_ = '', ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # Flush whatever triple is still pending at the end of the string.
    flush()
    return relations

In [10]:
# knowledge base class
class KB:
    """Minimal knowledge base: an ordered list of unique relation dicts.

    Two relations count as the same when their ``"head"``, ``"type"`` and
    ``"tail"`` values are equal; any other keys are ignored for comparison.
    """

    def __init__(self):
        # Relations in insertion order, without duplicates.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when r1 and r2 agree on head, type and tail."""
        for field in ("head", "type", "tail"):
            if r1[field] == r2[field]:
                continue
            return False
        return True

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def add_relation(self, r):
        """Store r unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print a header line followed by one indented line per relation."""
        print("Relations:")
        for entry in self.relations:
            print(f"  {entry}")

In [11]:
# build a knowledge base from text
def from_small_text_to_kb(text, verbose=False):
    """Extract relations from a single short text and return them as a KB.

    Uses the module-level ``tokenizer`` and ``model``. The text is encoded
    with truncation at 512 tokens, so anything beyond that limit is not seen
    by the model.

    Args:
        text: Input text to extract relations from.
        verbose: When True, print the number of input tokens.

    Returns:
        A ``KB`` holding the relations parsed from every returned sequence;
        ``KB.add_relation`` drops duplicates.
    """
    # Tokenize the text
    encoded = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    if verbose:
        print(f"Num tokens: {len(encoded['input_ids'][0])}")

    # Generate with 3 beams and keep all 3 resulting sequences
    output_ids = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )

    # Special tokens are kept in the decoded text because the relation parser
    # relies on the marker tokens to split head / tail / type.
    predictions = tokenizer.batch_decode(output_ids, skip_special_tokens=False)

    # Collect the relations of every returned sequence into one knowledge base
    kb = KB()
    for prediction in predictions:
        for relation in extract_relations_from_model_output(prediction):
            kb.add_relation(relation)
    return kb

In [15]:
# Sample passage used as input for the relation-extraction demo; the bracketed
# citation markers (e.g. [3], [4][5]) are part of the text as-is.
text = """There had been numerous calls for the Games to be postponed, warning that the anticipated attendance of 500,000 international visitors could cause the virus to rapidly spread outside of the country.[3] In the first quarter of 2016, there were also more cases of the mosquito-borne Dengue fever than in 2015 alone.[4][5] Dr. Amir Attaran of the University of Ottawa, writing for the Harvard Public Health Review, noted that Rio had the highest concentration of Zika infections out of all Brazilian states. He argued that the Olympics could result in a "global catastrophe" of Zika outbreaks, and asserted that it was "socially irresponsible" and "ethically questionable" to allow them to continue. On the other hand, it has been argued that the threat of Zika will not be as high during the Games, citing computer models and simulations, as well as the fact that the Games will be held during Southern Hemisphere winter, which is when mosquitoes are least active. The initial outbreak of Zika in Brazil occurred during the winter months, but in Northeastern states near the equator, where there is no winter season throughout the year. Tom Frieden, director of the U.S. Centers for Disease Control and Prevention, stated that "there is no public health reason to cancel or delay the Olympics".[4][5][6][7]"""

In [16]:
# Length of the sample passage in characters (not tokens).
len(text)

1304

In [17]:
# Build a knowledge base from the sample passage and list its relations;
# verbose=True also prints the number of input tokens.
kb = from_small_text_to_kb(text, verbose=True)
kb.print()

Num tokens: 271
Relations:
  {'head': 'Amir Attaran', 'type': 'employer', 'tail': 'University of Ottawa'}
  {'head': 'Amir Attaran', 'type': 'educated at', 'tail': 'University of Ottawa'}
  {'head': 'Tom Frieden', 'type': 'employer', 'tail': 'Centers for Disease Control and Prevention'}
