A simple example of how to build a relational knowledge base from a collection of texts using NLP, then represent it in Streamlit and store it in a PostgreSQL database!

In [2]:
%pip install transformers 




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
%pip install wikipedia newspaper3k GoogleNews pyvis





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from transformers import AutoModelForSeq2SeqLM,  AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
import pyvis


# Load the pretrained REBEL checkpoint ("Babelscape/rebel-large") once at module level.
# Both the tokenizer and the seq2seq model are used by the extraction functions defined below.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

def extractRelationsFromModel(inputCorpus):
    """Parse one decoded model output string into relation triples.

    The sequence markers ``<s>``, ``<pad>`` and ``</s>`` are removed, the rest
    is split on whitespace and walked token by token. Words following
    ``<triplet>`` form the head, words following ``<subj>`` form the tail and
    words following ``<obj>`` form the relation type.

    Args:
        inputCorpus: Decoded model output with the special tokens kept.

    Returns:
        A list of dicts with the keys ``head``, ``type`` and ``tail``.
    """
    cleaned = inputCorpus.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    triples = []
    # Token buffers keyed by parser mode: "t" = head, "s" = tail, "o" = relation type.
    parts = {"t": [], "s": [], "o": []}
    mode = "x"  # no marker seen yet; plain words are ignored in this mode

    def emit():
        triples.append({
            'head': " ".join(parts["t"]),
            'type': " ".join(parts["o"]),
            'tail': " ".join(parts["s"]),
        })

    for piece in cleaned.split():
        if piece == "<triplet>":
            mode = "t"
            # A new head starts: flush the triple in progress, if it has a type.
            if parts["o"]:
                emit()
                parts["o"] = []
            parts["t"] = []
        elif piece == "<subj>":
            mode = "s"
            # Same head, new tail: flush the previous (head, type, tail) first.
            # The type buffer is intentionally left as-is here; "<obj>" resets it.
            if parts["o"]:
                emit()
            parts["s"] = []
        elif piece == "<obj>":
            mode = "o"
            parts["o"] = []
        elif mode in parts:
            parts[mode].append(piece)

    # Flush the trailing triple only when all three parts were collected.
    if parts["t"] and parts["o"] and parts["s"]:
        emit()

    return triples


In [None]:
#KB class for parsing both short and long input texts.

class KB(): #create a knowledge base object
    """Knowledge base holding a de-duplicated list of relation dicts.

    A relation is a dict with ``head``, ``type`` and ``tail`` keys. Relations
    passed to ``add_relation`` must also carry ``meta["spans"]`` (a list of
    span boundaries); those spans are merged when the same triple is added again.
    """

    def __init__(self):
        self.relations = []  # relation dicts, unique by (head, type, tail)

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share the same head, type and tail."""
        return all(r1[attribute] == r2[attribute] for attribute in ["head","type","tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relations(self,r):
        """Store ``r`` unless an equal relation exists; duplicates are dropped without merging."""
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line."""
        print("Relations: ")
        for r in self.relations:
            print(f" {r}")

    def merge_relations(self,r1):
        """Add the spans of ``r1`` that the stored equal relation does not have yet.

        Raises:
            IndexError: if no stored relation equals ``r1``.
        """
        r2 = [r for r in self.relations if self.are_relations_equal(r1,r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"] if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def add_relation(self,r):
        """Store ``r``, or merge its spans into the equal relation already stored."""
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def get_wikipedia_data(self, candiate_entity):
        """Look up ``candiate_entity`` on Wikipedia and return its title, url and summary.

        Returns None when ``wikipedia.page`` returns None. Exceptions raised by
        ``wikipedia.page`` are not handled here and propagate to the caller.
        """
        page = wikipedia.page(candiate_entity, auto_suggest=False)
        # Identity test is the correct way to check for None (PEP 8).
        if page is None:
            return None

        entity_data = {
            "title":page.title,
            "url":page.url,
            "summary":page.summary
        }

        return entity_data
    
    

In [None]:

#Updated KB class for parsing actual wikipedia page objects

class KB(): #create a knowledge base object
    """Knowledge base of relations whose head and tail resolve to Wikipedia pages.

    ``relations`` is a list of dicts with ``head``, ``type``, ``tail`` and
    ``meta["spans"]``; ``entities`` maps a Wikipedia page title to the
    remaining page data (``url`` and ``summary``).
    """

    def __init__(self):
        self.relations = []
        self.entities = {}

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share the same head, type and tail."""
        return all(r1[attribute] == r2[attribute] for attribute in ["head","type","tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relations(self,r):
        """Store ``r`` as-is unless an equal relation exists (no Wikipedia lookup, no merge)."""
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line."""
        print("Relations: ")
        for r in self.relations:
            print(f" {r}")

    def merge_relations(self,r1):
        """Add the spans of ``r1`` that the stored equal relation does not have yet.

        Raises:
            IndexError: if no stored relation equals ``r1``.
        """
        r2 = [r for r in self.relations if self.are_relations_equal(r1,r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"] if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def add_entity(self, e):
        """Register entity ``e`` under its ``title``, storing every other key as its data."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self,r):
        """Resolve head and tail on Wikipedia, then store or merge ``r``.

        The relation is dropped when either side cannot be resolved. On
        success ``r`` is modified in place: its head and tail are replaced by
        the Wikipedia page titles.
        """
        #check on wikipedia first
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(candidate_entity) for candidate_entity in candidate_entities]

        #if one of the two entities does not exist, stop: unresolved relations are not stored
        if any(entity is None for entity in entities):
            return

        for ent in entities:
            self.add_entity(ent)

        #rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        #store the relation if it is new, otherwise merge its spans into the stored one
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def get_wikipedia_data(self, candidate_entity):
        """Return ``{"title", "url", "summary"}`` for ``candidate_entity``, or None on failure.

        Lookup failures are treated as "entity not found" on a best-effort
        basis. Only ``Exception`` subclasses are swallowed, so that
        ``KeyboardInterrupt`` and ``SystemExit`` still stop the run.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            return None
    
    

In [None]:

#Updated KB class for parsing actual wikipedia pages, NOW with URl and date attributes

class KB(): #create a knowledge base object
    """Knowledge base of Wikipedia-resolved relations with per-article provenance.

    Each relation carries ``meta``, a dict keyed by article URL whose values
    hold the ``spans`` in which the relation was found. ``sources`` records
    the title and publish date of every article registered through
    ``add_relation``.
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share the same head, type and tail."""
        return all(r1[attribute] == r2[attribute] for attribute in ["head","type","tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relations(self,r):
        """Store ``r`` as-is unless an equal relation exists (no lookup, no merge)."""
        if not self.exists_relation(r):
            self.relations.append(r)

    def merge_relations(self,r1):
        """Merge the provenance of ``r1`` into the stored relation equal to it.

        Every article URL in ``r1["meta"]`` is handled: an unseen URL is
        adopted together with its spans, a known URL only gains the spans it
        does not have yet.

        Raises:
            IndexError: if no stored relation equals ``r1``.
        """
        r2 = [r for r in self.relations if self.are_relations_equal(r1,r)][0]

        for article_url, incoming in r1["meta"].items():
            if article_url not in r2["meta"]:
                #first time this article reports the relation
                r2["meta"][article_url] = incoming
            else:
                known_spans = r2["meta"][article_url]["spans"]
                known_spans += [span for span in incoming["spans"] if span not in known_spans]

    def add_entity(self, e):
        """Register entity ``e`` under its ``title``, storing every other key as its data."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self,r, article_title, article_publish_date):
        """Resolve head and tail on Wikipedia, register the source, then store or merge ``r``.

        The relation is dropped when either side cannot be resolved. On
        success ``r`` is modified in place (head and tail become the Wikipedia
        page titles). ``article_title`` and ``article_publish_date`` are
        recorded for the first URL found in ``r["meta"]`` if that URL is not
        registered yet.
        """
        #check on wikipedia first
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(candidate_entity) for candidate_entity in candidate_entities]

        #if one of the two entities does not exist, stop: unresolved relations are not stored
        if any(entity is None for entity in entities):
            return

        for ent in entities:
            self.add_entity(ent)

        #rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        #add source if not present in knowledge base kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date":article_publish_date
            }

        #store the relation if it is new, otherwise merge its provenance into the stored one
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print the stored relations, entities and sources."""
        print("Relations:\n ")
        for r in self.relations:
            print(f" {r}")

        print("Entities:\n")

        for e in self.entities.items():
            print(f" {e}") #each item is a (title, data) tuple

        print("Sources:\n")
        for s in self.sources.items():
            print(f" {s}")  #each item is an (article_url, data) tuple

    def get_wikipedia_data(self, candidate_entity):
        """Return ``{"title", "url", "summary"}`` for ``candidate_entity``, or None on failure.

        Lookup failures are treated as "entity not found" on a best-effort
        basis. Only ``Exception`` subclasses are swallowed, so that
        ``KeyboardInterrupt`` and ``SystemExit`` still stop the run.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            return None
    
    

In [None]:
def from_small_text_to_kb(text, verbose = False):
    """Build a knowledge base from a text that fits in a single model input.

    The text is tokenized (truncated to 512 tokens), the model generates three
    beam-search candidates, and every relation parsed from those candidates is
    stored through ``KB.add_relations``, which skips duplicates.

    Args:
        text: Input text.
        verbose: When True, print the number of input tokens.

    Returns:
        The populated ``KB`` instance.
    """
    encoded = tokenizer(text, max_length=512,padding=True,truncation=True,return_tensors='pt')

    if verbose:
        print(f"Num Tokens: {len(encoded['input_ids'][0])}")

    #beam search settings, passed straight to generate()
    candidates = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )

    #keep the special tokens: the relation parser relies on the <triplet>/<subj>/<obj> markers
    decoded = tokenizer.batch_decode(candidates, skip_special_tokens=False)

    kb = KB()
    for prediction in decoded:
        for found in extractRelationsFromModel(prediction):
            kb.add_relations(found)

    return kb


def from_text_to_kb(text, span_length =128, verbose = False):
    """Build a knowledge base from a long text by processing it in overlapping spans.

    The whole text is tokenized once, cut into ``num_spans`` windows of
    ``span_length`` tokens, and all windows are sent to the model as one batch.
    Each extracted relation is tagged with the boundaries of the window it
    came from (``relation["meta"]["spans"]``).

    Note: this calls ``kb.add_relation(relation)`` with a single argument,
    which matches the KB definitions whose ``add_relation`` takes only the
    relation. The KB definitions that also require ``article_title`` and
    ``article_publish_date`` are not compatible with this function.

    Args:
        text: Input text of any length.
        span_length: Number of tokens per window.
        verbose: When True, print token/span counts and the span boundaries.

    Returns:
        The populated ``KB`` instance.
    """
    #tokenize whole text first
    encoded = tokenizer([text],return_tensors="pt")
    token_ids = encoded["input_ids"][0]
    token_mask = encoded["attention_mask"][0]

    num_tokens = len(token_ids)
    num_spans = math.ceil(num_tokens/span_length)
    if verbose:
        print(f"Input has {num_tokens} tokens")
        print(f"Input has {num_spans} spans")

    #each window starts `overlap` tokens earlier than a back-to-back layout would place it
    overlap = math.ceil((num_spans * span_length - num_tokens) / max(num_spans-1,1))
    spans_boundaries = [
        [span_length * idx - overlap * idx, span_length * (idx + 1) - overlap * idx]
        for idx in range(num_spans)
    ]

    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    #one batch row per span
    inputs = {
        "input_ids":torch.stack([token_ids[lo:hi] for lo, hi in spans_boundaries]),
        "attention_mask":torch.stack([token_mask[lo:hi] for lo, hi in spans_boundaries]) # type: ignore
    }

    #generate relations
    num_return_sequences = 3
    generated_tokens = model.generate(
        **inputs,
        max_length=512,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    #decode the relations, keeping the special tokens the relation parser relies on
    decoded_preds = tokenizer.batch_decode(generated_tokens,skip_special_tokens = False)

    #create knowledge base kb
    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        #assumes predictions come back grouped per span, num_return_sequences at a time
        source_span = spans_boundaries[pred_index // num_return_sequences]
        for relation in extractRelationsFromModel(sentence_pred):
            relation["meta"] = {
                "spans": [source_span]
            }
            kb.add_relation(relation)

    return kb






In [17]:

#Part 4: updated KB class to allow for the merging of multiple articles!!! Across multiple sources.
#Updated KB class for parsing actual wikipedia pages, NOW with URl and date attributes

class KB(): #create a knowledge base object
    """Knowledge base of Wikipedia-resolved relations, mergeable across articles.

    Each relation carries ``meta``, a dict keyed by article URL whose values
    hold the ``spans`` in which the relation was found. ``sources`` records
    the title and publish date of the articles, and ``merge_with_kb`` folds
    another knowledge base into this one.
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share the same head, type and tail."""
        return all(r1[attribute] == r2[attribute] for attribute in ["head","type","tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relations(self,r):
        """Store ``r`` as-is unless an equal relation exists (no lookup, no merge)."""
        if not self.exists_relation(r):
            self.relations.append(r)

    def merge_relations(self,r1):
        """Merge the provenance of ``r1`` into the stored relation equal to it.

        Every article URL in ``r1["meta"]`` is handled: an unseen URL is
        adopted together with its spans, a known URL only gains the spans it
        does not have yet. This matters when ``r1`` comes from a knowledge
        base that was itself merged and therefore lists several articles.

        Raises:
            IndexError: if no stored relation equals ``r1``.
        """
        r2 = [r for r in self.relations if self.are_relations_equal(r1,r)][0]

        for article_url, incoming in r1["meta"].items():
            if article_url not in r2["meta"]:
                #first time this article reports the relation
                r2["meta"][article_url] = incoming
            else:
                known_spans = r2["meta"][article_url]["spans"]
                known_spans += [span for span in incoming["spans"] if span not in known_spans]

    def add_entity(self, e):
        """Register entity ``e`` under its ``title``, storing every other key as its data."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def _ingest_relation(self, r, article_title, article_publish_date):
        """Resolve, register and store ``r``; return True if kept, False if rejected."""
        #check on wikipedia first
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(candidate_entity) for candidate_entity in candidate_entities]

        #if one of the two entities does not exist, stop: unresolved relations are not stored
        if any(entity is None for entity in entities):
            return False

        for ent in entities:
            self.add_entity(ent)

        #rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        #add source if not present in knowledge base kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date":article_publish_date
            }

        #store the relation if it is new, otherwise merge its provenance into the stored one
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)
        return True

    def add_relation(self,r, article_title, article_publish_date):
        """Resolve head and tail on Wikipedia, register the source, then store or merge ``r``.

        The relation is dropped when either side cannot be resolved. On
        success ``r`` is modified in place (head and tail become the Wikipedia
        page titles). ``article_title`` and ``article_publish_date`` are
        recorded for the first URL found in ``r["meta"]`` if that URL is not
        registered yet.
        """
        self._ingest_relation(r, article_title, article_publish_date)

    def merge_with_kb(self, kb2): #merge with the knowledge base derived from a second article source
        """Add every relation of ``kb2`` to this knowledge base, keeping its provenance.

        Each relation goes through the same Wikipedia resolution as
        ``add_relation``. For relations that are kept, the sources of all
        article URLs in their ``meta`` are copied over, not just the first.
        """
        for r in kb2.relations:
            article_urls = list(r["meta"].keys())
            source_data = kb2.sources[article_urls[0]]
            kept = self._ingest_relation(r, source_data["article_title"], source_data["article_publish_date"])
            if not kept:
                continue
            #a relation from an already-merged kb can list further articles: register them too
            for extra_url in article_urls[1:]:
                if extra_url not in self.sources and extra_url in kb2.sources:
                    self.sources[extra_url] = dict(kb2.sources[extra_url])

    def print(self):
        """Print the stored relations, entities and sources."""
        print("Relations:\n ")
        for r in self.relations:
            print(f" {r}")

        print("Entities:\n")

        for e in self.entities.items():
            print(f" {e}") #each item is a (title, data) tuple

        print("Sources:\n")
        for s in self.sources.items():
            print(f" {s}")  #each item is an (article_url, data) tuple

    def get_wikipedia_data(self, candidate_entity):
        """Return ``{"title", "url", "summary"}`` for ``candidate_entity``, or None on failure.

        Lookup failures are treated as "entity not found" on a best-effort
        basis. Only ``Exception`` subclasses are swallowed, so that
        ``KeyboardInterrupt`` and ``SystemExit`` still stop the run.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            return None
    
    

In [7]:
%pip install pandas





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [18]:
from abc import ABC, abstractmethod

class NLPDriver(ABC):
    """Base class that turns article URLs into knowledge bases.

    Subclasses provide ``get_news_links``; downloading articles, running the
    module-level ``tokenizer``/``model`` over them and filling a ``KB`` is
    implemented here. Inheriting from ``ABC`` is what makes the
    ``@abstractmethod`` marker effective.
    """

    @abstractmethod
    def get_news_links(self):
        """Return the article URLs to process; must be overridden by subclasses."""
        pass

    #generating a knowledge base from multiple urls!
    def from_urls_to_kb(self,urls, verbose = False):
        """Build one knowledge base from several article URLs.

        Articles that raise ``ArticleException`` are skipped; any other
        exception propagates.

        Args:
            urls: Sized iterable of article URLs.
            verbose: When True, print the number of links once and report
                every URL that could not be downloaded.

        Returns:
            A ``KB`` holding the merged relations of all processed articles.
        """
        kb = KB()
        if verbose:
            #announce the workload once, not once per URL
            print(f"{len(urls)} links to visit!")
        for url in urls:
            try:
                kb_url = self.from_url_to_kb(url=url)
                kb.merge_with_kb(kb_url)
            except ArticleException:
                if verbose:
                    print(f"Could not download the specified article at url: {url}")

        return kb

    def from_text_to_kb(self,text,article_url,span_length =128, article_title=None, article_publish_date = None,verbose = False):
        """Build a knowledge base from one article text, processed in overlapping spans.

        Args:
            text: Article text of any length.
            article_url: URL used as the provenance key in each relation's ``meta``.
            span_length: Number of tokens per window.
            article_title: Title recorded in the KB sources.
            article_publish_date: Publish date recorded in the KB sources.
            verbose: When True, print token/span counts and the span boundaries.

        Returns:
            The populated ``KB`` instance.
        """
        #tokenize whole text first
        inputs = tokenizer([text],return_tensors="pt")

        num_tokens = len(inputs["input_ids"][0])
        num_spans = math.ceil(num_tokens/span_length)
        if verbose:
            print(f"Input has {num_tokens} tokens")
            print(f"Input has {num_spans} spans")

        #each window starts `overlap` tokens earlier than a back-to-back layout would place it
        overlap = math.ceil((num_spans * span_length - num_tokens) / max(num_spans-1,1))

        spans_boundaries = []
        start = 0
        for i in range(num_spans):
            spans_boundaries.append([start+ span_length * i, start + span_length * (i+1)])
            start -= overlap

        if verbose:
            print(f"Span boundaries are {spans_boundaries}")

        #transform input with spans: one batch row per window
        tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries]
        tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]] for boundary in spans_boundaries]
        inputs = {
            "input_ids":torch.stack(tensor_ids),
            "attention_mask":torch.stack(tensor_masks) # type: ignore
        }

        #generate relations
        num_return_sequences = 3
        gen_kwargs = {
            "max_length":256,
            "length_penalty":0,
            "num_beams":3,
            "num_return_sequences":num_return_sequences
        }

        generated_tokens = model.generate(**inputs, **gen_kwargs)

        #decode the relations, keeping the special tokens the relation parser relies on
        decoded_preds = tokenizer.batch_decode(generated_tokens,skip_special_tokens = False)

        #create knowledge base kb
        kb = KB()
        for i, sentence_pred in enumerate(decoded_preds):
            #assumes predictions come back grouped per span, num_return_sequences at a time
            current_span_index = i // num_return_sequences
            relations = extractRelationsFromModel(sentence_pred)
            for relation in relations:
                relation["meta"] = {
                    article_url:{
                        "spans": [spans_boundaries[current_span_index]] # spans are stored per article url
                    }
                }
                kb.add_relation(relation,article_title=article_title, article_publish_date=article_publish_date)

        return kb

    #using the newspaper library to download and parse the articles by URL.
    def get_article(self,url):
        """Download and parse the article at ``url`` and return the ``Article`` object."""
        article = Article(url=url)
        article.download()
        article.parse()
        return article

    def from_url_to_kb(self,url):
        """Download the article at ``url`` and build a knowledge base from its text."""
        article = self.get_article(url=url)
        config = {
            "article_title":article.title,
            "article_publish_date":article.publish_date
        }

        kb = self.from_text_to_kb(article.text, article.url, **config)
        return kb
    


class GoogleNewsArticlesDriver(NLPDriver):
    """Driver that collects article URLs from Google News search results."""

    def __init__(self):
        super().__init__()

    #using Google News library to get the URLs of recent news articles, and find relations between them
    def get_news_links(self,query, lang ="en", region = "US", pages=1, max_links=100000):
        """Search Google News for ``query`` and return up to ``max_links`` unique URLs.

        Args:
            query: Search string.
            lang: Language passed to ``GoogleNews``.
            region: Region passed to ``GoogleNews``.
            pages: Number of ``get_page`` calls to make after the initial search.
            max_links: Maximum number of URLs to return.

        Returns:
            A list of unique URLs in the order they were first seen.
        """
        #pass by keyword so each value reaches the parameter of the same name
        #NOTE(review): GoogleNews' constructor takes other options besides lang/region; confirm positional order
        googlenews = GoogleNews(lang=lang, region=region)
        googlenews.search(query)

        all_urls = [] #a list of all the urls
        for page in range(pages):
            #NOTE(review): page numbering starts at 0 here; confirm this is what get_page expects
            googlenews.get_page(page=page)
            all_urls += googlenews.get_links() #collect the links the client reports after each page
        #dict.fromkeys removes duplicates but keeps first-seen order, so the truncation below is reproducible
        return list(dict.fromkeys(all_urls))[:max_links]
    


import json
import pandas as pd
import numpy

class LocalArticleDatasetDriver(NLPDriver):
    """Driver that reads article links from a local JSON-lines dataset file."""

    def __init__(self, dataset_file_path):
        super().__init__()
        self.dataset_file_path = dataset_file_path

    def get_news_links(self, max_article_links):
        """Return the first ``max_article_links`` values of the dataset's ``link`` column.

        The file at ``self.dataset_file_path`` is read with
        ``pd.read_json(..., lines=True)``. The result is a flattened NumPy
        array slice, not a list.
        """
        records = pd.read_json(self.dataset_file_path, lines = True)
        links = records["link"].to_numpy().flatten()
        return links[:max_article_links]
        

        # count = 0
        # for data_entry in data["link"]:
        #     if count > max_article_links:
        #         break
        #     else:
        #         article_url_links.append(data_entry)
        #         count += 1

        # return article_url_links


        


In [19]:
#Part 1

'''
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 " \
"May 1821), and later known by his regnal name Napoleon I, was a French military " \
"and political leader who rose to prominence during the French Revolution and led " \
"several successful campaigns during the Revolutionary Wars. He was the de facto " \
"leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, " \
"he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's " \
"political and cultural legacy has endured, and he has been one of the most " \
"celebrated and controversial leaders in world history."
'''

#Part 2
'''
text = """
Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 May 1821), 
and later known by his regnal name Napoleon I, was a French military and political leader 
who rose to prominence during the French Revolution and led several successful campaigns during 
the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804. 
As Napoleon I, he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and 
cultural legacy has endured, and he has been one of the most celebrated and controversial leaders 
in world history. Napoleon was born on the island of Corsica not long after its annexation by the
 Kingdom of France.[5] He supported the French Revolution in 1789 while serving in the French army, 
 and tried to spread its ideals to his native Corsica. He rose rapidly in the Army after
 he saved the governing French Directory by firing on royalist insurgents. 
 In 1796, he began a military campaign against the Austrians and their Italian allies, 
 scoring decisive victories and becoming a national hero. Two years later, he led a military 
 expedition to Egypt that served as a springboard to political power. He engineered a coup in 
 November 1799 and became First Consul of the Republic. Differences with the British meant that 
 the French faced the War of the Third Coalition by 1805. Napoleon shattered this coalition with victories 
 in the Ulm Campaign, and at the Battle of Austerlitz, which led to the dissolving of the Holy Roman Empire. 
 In 1806, the Fourth Coalition took up arms against him because Prussia became worried about growing French 
 influence on the continent. Napoleon knocked out Prussia at the battles of Jena and Auerstedt, marched the 
 Grande Armée into Eastern Europe, annihilating the Russians in June 1807 at Friedland, and forcing the defeated nations 
 of the Fourth Coalition to accept the Treaties of Tilsit. Two years later, the Austrians challenged the French again during 
 the War of the Fifth Coalition, but Napoleon solidified his grip over Europe after triumphing at the Battle of Wagram. 
 Hoping to extend the Continental System, his embargo against Britain, Napoleon invaded the Iberian Peninsula and declared his brother 
 Joseph King of Spain in 1808. The Spanish and the Portuguese revolted in the Peninsular War, culminating in defeat for Napoleon's marshals. 
 Napoleon launched an invasion of Russia in the summer of 1812. The resulting campaign witnessed the catastrophic retreat of Napoleon's Grande Armée. 
 In 1813, Prussia and Austria joined Russian forces in a Sixth Coalition against France. A chaotic military campaign resulted in a large coalition 
 army defeating Napoleon at the Battle of Leipzig in October 1813. The coalition invaded France and captured Paris, forcing Napoleon to abdicate in April 1814. 
 He was exiled to the island of Elba, between Corsica and Italy. In France, the Bourbons were restored to power. However, Napoleon escaped Elba in 
 February 1815 and took control of France.[6][7] The Allies responded by forming a Seventh Coalition, which defeated Napoleon at 
 the Battle of Waterloo in June 1815. The British exiled him to the remote island of Saint Helena in the Atlantic, where he died in 1821 at the age of 51.
Napoleon had an extensive impact on the modern world, bringing liberal reforms to the many countries he conquered, especially the Low Countries, 
Switzerland, and parts of modern Italy and Germany. He implemented liberal policies in France and Western Europe.
"""
'''

# Part 3: URL of a single news article to convert into a knowledge base.
url = "https://www.straitstimes.com/business/battle-erupts-between-billionaire-kwek-leng-beng-and-son-sherman-for-control-of-cdl"

# Part 1 (disabled): build the KB from the short sample text.
#kb = from_small_text_to_kb(text=text, verbose=True)
# Part 2 (disabled): build the KB from the long sample text, processed in spans.
#kb = from_text_to_kb(text=text, verbose=True)
# Part 3: download the article at `url`, extract its relations and print the resulting KB.
articlesDriver = GoogleNewsArticlesDriver()
kb = articlesDriver.from_url_to_kb(url)
kb.print()

Token indices sequence length is longer than the specified maximum sequence length for this model (2551 > 1024). Running this sequence through the model will result in indexing errors


  lis = BeautifulSoup(html).find_all('li')


Relations:
 
 {'head': 'City Developments Limited', 'type': 'headquarters location', 'tail': 'Singapore', 'meta': {'https://www.straitstimes.com/business/battle-erupts-between-billionaire-kwek-leng-beng-and-son-sherman-for-control-of-cdl': {'spans': [[0, 128]]}}}
 {'head': 'City Developments Limited', 'type': 'country', 'tail': 'Singapore', 'meta': {'https://www.straitstimes.com/business/battle-erupts-between-billionaire-kwek-leng-beng-and-son-sherman-for-control-of-cdl': {'spans': [[0, 128]]}}}
 {'head': 'Kwek Leng Beng', 'type': 'child', 'tail': 'Sherman Kwek', 'meta': {'https://www.straitstimes.com/business/battle-erupts-between-billionaire-kwek-leng-beng-and-son-sherman-for-control-of-cdl': {'spans': [[0, 128]]}}}
 {'head': 'Sherman Kwek', 'type': 'father', 'tail': 'Kwek Leng Beng', 'meta': {'https://www.straitstimes.com/business/battle-erupts-between-billionaire-kwek-leng-beng-and-son-sherman-for-control-of-cdl': {'spans': [[0, 128]]}}}
 {'head': 'Committee', 'type': 'facet of', '

Now, on to Visualising the Network!

In [None]:
from pyvis.network import Network

#using pyvis to visualize the relationship network!

#Rudimentarily, after the KB class has updated its relations, entities, and sources dictionaries respectively with all of the relevant data,
#we now want to visualise this compendium of data. The Entities will become the nodes, the Relations will become the edges!

def save_network_html(kb, filename= "network.html"):
    """Render the knowledge base as a directed graph and write it to an HTML file.

    Entities become nodes and relations become labelled edges from
    ``r["head"]`` to ``r["tail"]``.

    Args:
        kb: Object exposing ``entities`` (iterable of entity titles) and
            ``relations`` (dicts with ``head``, ``type`` and ``tail``).
        filename: Path of the HTML file handed to ``Network.show``.
    """
    #create network; bgcolor must be a valid CSS colour, hence the leading '#'
    net = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")

    #create the nodes from the entities dictionary in KB class
    color_entity = "#00FF00"
    for e in kb.entities:
        net.add_node(e, shape="circle",color=color_entity)

    #create the edges from the relations list in KB class
    for r in kb.relations:
        #r["head"] is the source or start point, r["tail"] is the destination.
        net.add_edge(r["head"], r["tail"], title=r["type"], label=r["type"])

    #layout settings, then save the network as a previewable html page
    net.repulsion(
        node_distance=200, central_gravity=0.2,spring_length=200, spring_strength=0.05, damping=0.09)

    net.set_edge_smooth('dynamic') #sets the smoothing profile used to draw the edges

    net.show(filename, notebook=False)

    #

Simple network visualisation through Pyvis. No database implementation yet

In [None]:
#Now, test all of this on several news articles: either 20 Google News results (commented out below) or 5 links from the local dataset.


import IPython.display
import numpy



# googleNewsNLPDriver = GoogleNewsArticlesDriver()
# news_links = googleNewsNLPDriver.get_news_links("Google",pages=5,max_links=20)
# kb = googleNewsNLPDriver.from_urls_to_kb(news_links, verbose=True)
# filename = 'test.html'
# save_network_html(kb, filename=filename)
# IPython.display.HTML(filename=filename)


# Build a KB from the first 5 links of the local dataset, then render it as an HTML graph.
newsArticlesDatasetDriver = LocalArticleDatasetDriver("News_Category_Dataset_v3.json")
df = newsArticlesDatasetDriver.get_news_links(5)  # array of article links (not a DataFrame, despite the name)
kb = newsArticlesDatasetDriver.from_urls_to_kb(df,verbose=True)
filename = 'test.html'
save_network_html(kb, filename=filename)
# Last expression of the cell: display the saved HTML file inline.
IPython.display.HTML(filename=filename)

5 links to visit!




  lis = BeautifulSoup(html).find_all('li')


5 links to visit!
5 links to visit!
5 links to visit!
5 links to visit!


In [None]:
%pip install psycopg2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Exploring Databases

In [None]:
import psycopg2 

import os

#connect to a database
#The database name can be found under the databases section of the VS Code extension once the
#PostgreSQL server is running on the local machine (check via services.msc or pgAdmin).
#The password is read from the PGPASSWORD environment variable; the previous hard-coded value is
#only kept as a fallback so the notebook still runs unchanged on the original machine.
try:
    connection = psycopg2.connect(
        host = "localhost",
        dbname = "TestKb",
        user = "postgres",
        password = os.environ.get("PGPASSWORD", "1234"),
        port = "5432"
    )
except psycopg2.Error as error:
    #report the failure and re-raise: the statements below cannot run without a connection
    print(f"connection failed: {error}")
    raise
else:
    print("connection successful") #only printed when connect() actually succeeded

cur = connection.cursor()
#IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails once the table exists
cur.execute("CREATE TABLE IF NOT EXISTS people(name TEXT, age INT, height REAL)")
cur.execute("INSERT INTO people(name, age, height) VALUES(%s, %s, %s)", ("Justine",12,140))
connection.commit()


connection successful


Bring the NLP data into the PostgreSQL database

In [None]:
# newsArticlesDatasetDriver = LocalArticleDatasetDriver("News_Category_Dataset_v3.json")
# df = newsArticlesDatasetDriver.get_news_links(2)
# kb = newsArticlesDatasetDriver.from_urls_to_kb(df,verbose=True)


# Pull the entity and relation collections out of the `kb` built in an earlier cell.
entities = kb.entities
relations = kb.relations
print(entities)  # prints the whole entities dict, including each entity's summary text

{'COVID-19': {'url': 'https://en.wikipedia.org/wiki/COVID-19', 'summary': "Coronavirus disease 2019 (COVID-19) is a contagious disease caused by the coronavirus SARS-CoV-2. In January 2020, the disease spread worldwide, resulting in the COVID-19 pandemic.\nThe symptoms of COVID‑19 can vary but often include fever, fatigue, cough, breathing difficulties, loss of smell, and loss of taste. Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms. Of those who develop symptoms noticeable enough to be classified as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% develop critical symptoms (respiratory failure, shock, or multiorgan dysfunction). Older people have a higher risk of developing severe symptoms. Some complications result in death. Some people continue to e