# Mount Google Drive and import the relevant libraries

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells write their
# CSV and HTML outputs under /content/drive/MyDrive/REBEL_Outputs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network
import pandas as pd

# Load the REBEL model

In [None]:
# Load model and tokenizer from the "Babelscape/rebel-large" checkpoint.
# Both objects are used as module-level globals by from_text_to_kb.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

# From short text to KB

In [None]:
def extract_relations_from_model_output(text):
    """Parse one decoded REBEL output string into a list of relation dicts.

    The string is scanned token by token (whitespace-separated) after the
    "<s>", "<pad>" and "</s>" markers have been removed. Three marker tokens
    switch what the following plain tokens are accumulated into:

    - "<triplet>": tokens build the head entity,
    - "<subj>":    tokens build the tail entity,
    - "<obj>":     tokens build the relation type.

    A relation is emitted whenever a "<triplet>" or "<subj>" marker is met
    while a relation type is pending, and once more at the end of the string
    if head, type and tail are all non-empty.

    Args:
        text: decoded model output, special tokens included.

    Returns:
        A list of dicts with the keys 'head', 'type' and 'tail' (stripped).
    """
    found = []
    head, rel, tail = '', '', ''
    mode = 'x'  # no marker seen yet: plain tokens are ignored in this state

    def as_triple():
        # Snapshot of the three accumulators at the moment of the call.
        return {
            'head': head.strip(),
            'type': rel.strip(),
            'tail': tail.strip()
        }

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            mode = 't'
            if rel:
                found.append(as_triple())
                rel = ''
            head = ''
        elif piece == "<subj>":
            mode = 's'
            # Same head, new tail: flush the pending relation but keep `rel`
            # until the next "<obj>" resets it.
            if rel:
                found.append(as_triple())
            tail = ''
        elif piece == "<obj>":
            mode = 'o'
            rel = ''
        elif mode == 't':
            head = head + ' ' + piece
        elif mode == 's':
            tail = tail + ' ' + piece
        elif mode == 'o':
            rel = rel + ' ' + piece

    if head and rel and tail:
        found.append(as_triple())
    return found

# Filter and normalize entities with Wikipedia

- remove all entities that don't have a page on Wikipedia
- merge entities if they have the same Wikipedia page

In [None]:
class KB():
    """In-memory knowledge base of Wikipedia-backed entities and relations.

    Attributes:
        entities: dict mapping a Wikipedia page title to that page's data
            ({"url": ..., "summary": ...}).
        relations: list of dicts with the keys "head", "type", "tail" and
            "meta", where "meta" is {"spans": [...]}.

    Note: a later cell in this notebook defines another class named KB (with
    per-article sources), which replaces this one once that cell has run.
    """

    def __init__(self):
        self.entities = {}
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r1):
        """Add the spans of `r1` that are missing from the equal stored relation.

        Raises:
            IndexError: if no stored relation equals `r1`.
        """
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up `candidate_entity` on Wikipedia.

        Returns:
            {"title", "url", "summary"} for the page, or None when the lookup
            fails for any reason.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best-effort lookup: any lookup failure means "no entity".
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit still stop a long-running extraction.
            return None

    def add_entity(self, e):
        """Store entity `e` under its title (the title itself is not duplicated)."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r):
        """Add relation `r` if both of its endpoints resolve to Wikipedia pages.

        `r` is modified in place: its "head" and "tail" are replaced with the
        Wikipedia page titles. If an equal relation is already stored, only
        the spans are merged.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print every stored entity and relation."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

# Extract KB from web article

In [None]:
def _compute_span_boundaries(num_tokens, span_length):
    """Return [start, end] token windows of width `span_length` covering the input.

    Window starts are spread evenly between 0 and `num_tokens - span_length`,
    so the last window ends exactly at `num_tokens`. Consecutive starts are at
    most `span_length` apart, so no token falls between two windows, and every
    window has the same length (needed to stack them into one batch).
    Inputs no longer than `span_length` get the single window [0, span_length].
    """
    num_spans = math.ceil(num_tokens / span_length)
    if num_spans <= 1:
        return [[0, span_length]]
    last_start = num_tokens - span_length
    boundaries = []
    for i in range(num_spans):
        start = (i * last_start) // (num_spans - 1)
        boundaries.append([start, start + span_length])
    return boundaries


def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_publish_date=None, verbose=False):
    """Extract relations from `text` with the global model and return them as a KB.

    The text is tokenized as a whole with the global `tokenizer`, cut into
    overlapping windows of `span_length` tokens, and all windows are passed to
    the global `model` in one `generate` call. Each decoded sequence is parsed
    with `extract_relations_from_model_output`, and every relation is added to
    a fresh `KB` together with the window it came from.

    Args:
        text: the text to process.
        article_url: key under which the span metadata of each relation is
            stored (relation["meta"][article_url]["spans"]).
        span_length: number of tokens per window.
        article_title: forwarded to `KB.add_relation`.
        article_publish_date: forwarded to `KB.add_relation`.
        verbose: when True, print the token count, span count and boundaries.

    Returns:
        The populated `KB` instance.
    """
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    # The previous ceil-rounded overlap made the last window end before
    # num_tokens, so the tail of long texts was never seen by the model.
    spans_boundaries = _compute_span_boundaries(num_tokens, span_length)
    if verbose:
        print(f"Input has {len(spans_boundaries)} spans")
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    for i, sentence_pred in enumerate(decoded_preds):
        # each window yields num_return_sequences consecutive predictions
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                article_url: {
                    "spans": [spans_boundaries[current_span_index]]
                }
            }
            kb.add_relation(relation, article_title, article_publish_date)

    return kb

In [None]:
class KB():
    """Knowledge base of Wikipedia-backed entities, relations and their sources.

    Attributes:
        entities: { entity_title: {"url": ..., "summary": ...} }
        relations: list of
            { "head": entity_title, "type": ..., "tail": entity_title,
              "meta": { article_url: { "spans": [...] } } }
        sources: { article_url: {"article_title": ...,
                                 "article_publish_date": ...} }
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ { head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Add every relation of `kb2` to this KB.

        Each relation goes through `add_relation` again, using the source data
        that `kb2` holds for the first article URL in the relation's "meta".
        """
        for r in kb2.relations:
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Merge the metadata of `r2` into the equal stored relation.

        Only the first article URL in r2["meta"] is considered. If the stored
        relation has no entry for that URL, the entry is attached as is;
        otherwise only spans that are not yet present are appended.

        Raises:
            IndexError: if no stored relation equals `r2`.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        # if different article
        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            r1["meta"][article_url] = r2["meta"][article_url]

        # if existing article
        else:
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up `candidate_entity` on Wikipedia.

        Returns:
            {"title", "url", "summary"} for the page, or None when the lookup
            fails for any reason.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best-effort lookup: any lookup failure means "no entity".
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit still stop a long-running extraction.
            return None

    def add_entity(self, e):
        """Store entity `e` under its title (the title itself is not duplicated)."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Add relation `r` if both of its endpoints resolve to Wikipedia pages.

        `r` is modified in place: its "head" and "tail" are replaced with the
        Wikipedia page titles. The first article URL in r["meta"] is registered
        in `sources` with the given title and publish date unless it is already
        known. If an equal relation is already stored, only the metadata is
        merged.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print every stored entity, relation and source."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [None]:
def get_article(url):
    """Download and parse the article at `url`, returning the Article object."""
    fetched = Article(url)
    fetched.download()  # retrieve the page
    fetched.parse()     # populate the parsed fields read by from_url_to_kb
    return fetched

def from_url_to_kb(url):
    """Download the article at `url` and build a KB from its text.

    The article's title and publish date are forwarded to `from_text_to_kb`,
    which passes them on to `KB.add_relation`. Progress information is
    printed because the extraction runs with verbose=True.

    Args:
        url: address of the article to process.

    Returns:
        The KB produced by `from_text_to_kb`.
    """
    article = get_article(url)
    # Previously title/date were collected into an unused dict and never
    # passed on, leaving both as None downstream.
    kb = from_text_to_kb(
        article.text,
        article.url,
        article_title=article.title,
        article_publish_date=article.publish_date,
        verbose=True,
    )

    return kb

In [None]:
# Run the pipeline on a single URL: the entry with label 1 in the 'URMC' column.
# NOTE(review): `Websites` is not defined in any visible cell — it has to be
# loaded before this cell runs.
from_url_to_kb(Websites['URMC'][1])

In [None]:
def save_network_html(kb, filename="network.html"):
    """Render the KB as a directed pyvis network and show it via `filename`.

    Every key of `kb.entities` becomes a circular node; every item of
    `kb.relations` becomes an edge from its "head" to its "tail", titled and
    labelled with its "type".

    Args:
        kb: object exposing `entities` (iterable of node ids) and `relations`
            (iterable of dicts with "head", "type" and "tail").
        filename: name handed to `Network.show`.
    """
    graph = Network(directed=True, width="1400px", height="1400px", bgcolor="#eeeeee")

    # one node per entity, all in the same colour
    for entity_title in kb.entities:
        graph.add_node(entity_title, shape="circle", color="#FFFF00")

    # one edge per relation
    for rel in kb.relations:
        rel_type = rel["type"]
        graph.add_edge(rel["head"], rel["tail"], title=rel_type, label=rel_type)

    # layout settings, then write out / show the result
    layout_options = {
        "node_distance": 200,
        "central_gravity": 0.2,
        "spring_length": 200,
        "spring_strength": 0.05,
        "damping": 0.09,
    }
    graph.repulsion(**layout_options)
    graph.set_edge_smooth('dynamic')
    graph.show(filename)

In [None]:
# For every row of `Websites`, build one KB from the Wikipedia article and one
# from the URMC article, then save each KB's relations as CSV on Drive and its
# graph as HTML. Columns are read by position: 0 = topic name, 1 = Wikipedia
# URL, 2 = URMC URL.
# NOTE(review): `Websites` is not defined in any visible cell — it has to be
# loaded before this cell runs.
for i in Websites.iterrows():
  row = i[1]
  # .iloc makes the positional access explicit; plain row[0] relies on the
  # deprecated integer fallback of Series.__getitem__.
  topic = row.iloc[0]

  print(f'Working on {topic} - Wikipedia Article')

  # Wikipedia Articles
  data_obj_wiki = from_url_to_kb(row.iloc[1])
  # DataFrame.append was removed in pandas 2.0; appending to a fresh empty
  # frame added nothing, so the frame is built from the relations directly.
  df_wiki = pd.DataFrame(data_obj_wiki.relations)
  df_wiki.to_csv(f'/content/drive/MyDrive/REBEL_Outputs/Wiki_{topic}.csv')
  filename = f"WIKI_{topic}_Graph.html"
  save_network_html(data_obj_wiki, filename=filename)

  print(f'Working on {topic} - URMC Article')

  # URMC Articles
  data_obj_urmc = from_url_to_kb(row.iloc[2])
  df_urmc = pd.DataFrame(data_obj_urmc.relations)
  df_urmc.to_csv(f'/content/drive/MyDrive/REBEL_Outputs/URMC_{topic}.csv')
  filename = f"URMC_{topic}_Graph.html"
  save_network_html(data_obj_urmc, filename=filename)
  print()


Working on Liver cancer - Wikipedia Article


Token indices sequence length is longer than the specified maximum sequence length for this model (5393 > 1024). Running this sequence through the model will result in indexing errors


Input has 5393 tokens
Input has 43 spans
Span boundaries are [[0, 128], [125, 253], [250, 378], [375, 503], [500, 628], [625, 753], [750, 878], [875, 1003], [1000, 1128], [1125, 1253], [1250, 1378], [1375, 1503], [1500, 1628], [1625, 1753], [1750, 1878], [1875, 2003], [2000, 2128], [2125, 2253], [2250, 2378], [2375, 2503], [2500, 2628], [2625, 2753], [2750, 2878], [2875, 3003], [3000, 3128], [3125, 3253], [3250, 3378], [3375, 3503], [3500, 3628], [3625, 3753], [3750, 3878], [3875, 4003], [4000, 4128], [4125, 4253], [4250, 4378], [4375, 4503], [4500, 4628], [4625, 4753], [4750, 4878], [4875, 5003], [5000, 5128], [5125, 5253], [5250, 5378]]




  lis = BeautifulSoup(html).find_all('li')


Working on Liver cancer - URMC Article
Input has 1929 tokens
Input has 16 spans
Span boundaries are [[0, 128], [120, 248], [240, 368], [360, 488], [480, 608], [600, 728], [720, 848], [840, 968], [960, 1088], [1080, 1208], [1200, 1328], [1320, 1448], [1440, 1568], [1560, 1688], [1680, 1808], [1800, 1928]]

Working on Lung cancer - Wikipedia Article
Input has 11653 tokens
Input has 92 spans
Span boundaries are [[0, 128], [126, 254], [252, 380], [378, 506], [504, 632], [630, 758], [756, 884], [882, 1010], [1008, 1136], [1134, 1262], [1260, 1388], [1386, 1514], [1512, 1640], [1638, 1766], [1764, 1892], [1890, 2018], [2016, 2144], [2142, 2270], [2268, 2396], [2394, 2522], [2520, 2648], [2646, 2774], [2772, 2900], [2898, 3026], [3024, 3152], [3150, 3278], [3276, 3404], [3402, 3530], [3528, 3656], [3654, 3782], [3780, 3908], [3906, 4034], [4032, 4160], [4158, 4286], [4284, 4412], [4410, 4538], [4536, 4664], [4662, 4790], [4788, 4916], [4914, 5042], [5040, 5168], [5166, 5294], [5292, 5420], [5

In [None]:
# Move the generated graph HTML files from /content into the Drive output folder.
mv -f /content/*.html /content/drive/MyDrive/REBEL_Outputs/Graphs