In [None]:
# Uncomment and run the lines below the first time you use this notebook to install its dependencies

#!pip3 install openai
#!pip3 install requests
#!pip3 install numpy
#!pip3 install torch
#!pip3 install torchvision
#!pip3 install transformers
#!pip3 install span-marker
#!pip3 install langdetect
#!pip3 install regex  
#!pip3 install pickle5  
#!pip3 install spacy-langdetect  
#!pip3 install langdetect
#!pip3 install python-dotenv
#!pip3 install SPARQLWrapper
#!pip3 install urllib3
#!pip3 install py2neo


In [1]:
import openai
import os
import json
import requests
import numpy as np
import torch, torchvision
from transformers import AutoTokenizer, TFAutoModelForTokenClassification
from transformers import pipeline
from span_marker import SpanMarkerModel
from langdetect import detect
from langdetect import LangDetectException
import re
import pickle
import time
from langchain.text_splitter import SpacyTextSplitter
from dotenv import load_dotenv
from SPARQLWrapper import SPARQLWrapper, JSON
import urllib.error
from py2neo import Graph, Node, Relationship
from openai import AzureOpenAI


#nltk.download('punkt')


2024-02-14 19:07:59.934051: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load environment variables from a local .env file so that the os.getenv(...)
# lookups in the following cells can read them.

load_dotenv()

True

Initialize Entity Categories and Relation Labels

In [3]:

# Entity categories handed to the categorization prompt.
categories = [
    "Person", "Location", "Organization", "Event",
    "Product", "Project", "Skill", "Strategy",
]


In [4]:
# Relation labels available for label-based relation extraction.
relation_labels = (
    "implements funds focuses_on in partners_with contributes_to monitors targets "
    "addresses employs collaborates_with supports administers measures aligns_with "
    "an_instance_of"
).split()

# Setting up OpenAI connection

In [5]:
# OpenAI API configuration
# Module-level settings for the Azure-hosted OpenAI service; the key, base URL
# and API version are read from environment variables.
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): `api_base` looks like the pre-1.0 openai setting, while the
# AzureOpenAI client in the next cell reads AZURE_OPENAI_ENDPOINT -- confirm both are set.
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# Deployment name that get_answer passes as `model`.
openai_deployment = "sdgi-gpt-35-turbo-16k"

# Password used for the Neo4j connection further down.
neo4j_pass = os.getenv("NEO4JPASS")
#openai.api_key = os.getenv("OPENAI_KEY")


In [6]:
# Explicit Azure OpenAI client configured from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

In [7]:
# function to get answers from older version of openAI

def get_answer_old(user_question, timeout_seconds):
    """Ask a single question through the legacy ``openai.ChatCompletion`` interface.

    Args:
        user_question: Text sent as the only user message.
        timeout_seconds: Value forwarded as ``request_timeout``.

    Returns:
        The content of the first returned choice, or an empty list when
        ``requests.Timeout`` is raised.
    """
    conversation = [{'role': 'user', 'content': user_question}]
    try:
        completion = openai.ChatCompletion.create(
            engine="sdgi-gpt-35-turbo-16k",
            messages=conversation,
            temperature=0,
            request_timeout=timeout_seconds,
        )
    except requests.Timeout:
        print("Request timed out")
        return []
    return completion.choices[0].message["content"]
   

In [8]:
# function to get answers from GPT

def get_answer(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes to the deployment named by ``openai_deployment`` with
    temperature 0.

    Args:
        prompt: Full prompt text.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.chat.completions.create(
        model=openai_deployment,
        temperature=0,
        messages=[user_message],
    )
    return completion.choices[0].message.content


# Entity Extraction using Transformers 


In [10]:
# connecting with two transformer models using API Inference

WIKI_API = "https://api-inference.huggingface.co/models/Babelscape/wikineural-multilingual-ner"
BERT_API = "https://api-inference.huggingface.co/models/dslim/bert-base-NER"

# Security: an access token must never be stored in the notebook itself.
# The Hugging Face token is read from the HF_API_TOKEN environment variable
# (for example from the .env file loaded at the top of the notebook). Any token
# that was previously committed here should be treated as leaked and revoked.
# When no token is configured the Authorization header is simply omitted.
_hf_api_token = os.getenv("HF_API_TOKEN")
headers = {"Authorization": f"Bearer {_hf_api_token}"} if _hf_api_token else {}

#functions to get responses from the above models

def query_wiki(payload, timeout=60):
    """POST ``payload`` to the wikineural NER inference endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body (the notebook sends ``{"inputs": text}``).
        timeout: Seconds to wait for the server. Without a timeout ``requests``
            waits indefinitely, so a stalled connection would block the whole run.

    Returns:
        The JSON document the endpoint answered with.
    """
    response = requests.post(WIKI_API, headers=headers, json=payload, timeout=timeout)
    return response.json()
	

def query_bert(payload, timeout=60):
    """POST ``payload`` to the bert-base-NER inference endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body (the notebook sends ``{"inputs": text}``).
        timeout: Seconds to wait for the server. Without a timeout ``requests``
            waits indefinitely, so a stalled connection would block the whole run.

    Returns:
        The JSON document the endpoint answered with.
    """
    response = requests.post(BERT_API, headers=headers, json=payload, timeout=timeout)
    return response.json()


In [11]:
# function to extract acronyms from text using GPT prompt

def query_gpt(text):
    """Ask GPT for the acronyms and proper nouns found in ``text``.

    Args:
        text: Passage inserted into the prompt.

    Returns:
        The model reply parsed with ``json.loads``; a reply that is not valid
        JSON makes ``json.loads`` raise.
    """
    entities_prompt = f"""

    You will be given a >>>>>TEXT<<<<<. You have two tasks:
    
    1. Your first task is to detect acronyms with their names and store them in python dictionary.
    2. Your second task is to detect Proper Nouns in the text and store them in python list.
    
    Return a JSON array contaning dictionary and the list.

    >>>>>TEXT<<<<<
    {text}

    
    """
    raw_reply = get_answer(entities_prompt)
    return json.loads(raw_reply)
    


# Text Pre-Processing 

In [12]:
#functions to clean and pre-process data

def split_text_spacy(chunk_size, text):
    """Split ``text`` using a ``SpacyTextSplitter`` built with ``chunk_size``.

    Args:
        chunk_size: Passed to ``SpacyTextSplitter`` as ``chunk_size``.
        text: Text to split.

    Returns:
        The value returned by ``SpacyTextSplitter.split_text``.
    """
    return SpacyTextSplitter(chunk_size=chunk_size).split_text(text)


def get_text_section(limit, text):
    """Split ``text`` into consecutive sections of at most ``limit`` characters.

    A section ends just after the last '.', newline or ';' inside the current
    window so sentences are not cut in half. When the window holds no such
    delimiter the section is cut at exactly ``limit`` characters, which
    guarantees progress. Sections rejected by ``is_valid_section`` are reported
    on stdout and left out of the result.

    Args:
        limit: Maximum section length in characters; must be positive.
        text: Text to split.

    Returns:
        List of accepted sections, in document order.

    Raises:
        ValueError: If ``limit`` is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of characters")

    delimiters = ('.', '\n', ';')
    sections_list = []
    length = len(text)
    start = 0

    while start < length:
        # Never read past the end; the final window keeps the last character.
        end = min(start + limit, length)

        if end < length:
            # Search backwards only inside the current window, so the cut can
            # never move before `start`.
            last_delimiter = max(text.rfind(mark, start, end) for mark in delimiters)
            if last_delimiter != -1:
                end = last_delimiter + 1  # keep the delimiter with its sentence

        section = text[start:end]

        if is_valid_section(section):
            sections_list.append(section)
        else:
            print("INVALID SECTION DETECTED")
            print(section)
        start = end

    return sections_list


def is_valid_section(section):
    """Return True when ``section`` is non-empty and longer than 20 characters."""
    return bool(section) and len(section) > 20


def clean_text(input_text):
    """Blank out whitespace-only lines, upper-case heading lines and dotted contents-style lines.

    The substitutions run in a fixed order, each in MULTILINE mode, and each
    replaces its match with the empty string.

    Args:
        input_text: Raw text.

    Returns:
        The text after all substitutions.
    """
    removal_patterns = (
        # lines with only whitespace
        r'^\s*$',
        # lines containing only uppercase text (potential headings)
        r'^\s*[A-Z\s]+\s*$',
        # lines with multiple consecutive uppercase words (potential headings)
        r'^\s*(?:[A-Z]+\s*){2,}\s*$',
        # letters followed by a run of three or more dots and a number
        r'^\s*[A-Za-z\s]+\.{3,}\s*\d+\s*$',
    )

    cleaned = input_text
    for removal_pattern in removal_patterns:
        cleaned = re.sub(removal_pattern, '', cleaned, flags=re.MULTILINE)
    return cleaned

def is_english(line):
    """Return True when ``detect`` classifies ``line`` as 'en'.

    A ``LangDetectException`` is reported on stdout and treated as "not English".
    """
    try:
        language = detect(line)
    except LangDetectException as e:
        print(f"An exception occurred: {e} : {line}")
        return False
    return language == 'en'

# Functions to process extracted entities

In [13]:
# merging the broken entities received from the transformers into the needed format

def create_entities(lst):
    """Merge '##'-prefixed word pieces into the entry that precedes them, in place.

    For every entry (from the second one on) whose ``word`` starts with '##',
    the word becomes the previous entry's word plus the piece without the
    prefix, the score becomes the larger of the two scores, and the previous
    entry is removed from the list.

    Args:
        lst: List of dicts carrying at least ``word`` and ``score``; mutated in place.

    Returns:
        None.
    """
    position = 1
    while position < len(lst):
        current = lst[position]
        if not current["word"].startswith('##'):
            position += 1
            continue
        previous = lst[position - 1]
        current["word"] = previous["word"] + current["word"][2:]
        current["score"] = max(previous["score"], current["score"])
        # Dropping the previous entry shifts the merged one to position - 1,
        # so `position` already points at the next unread entry.
        lst.pop(position - 1)
            

# threshold score to eliminate unimportant entities
def apply_threshold(list_, threshold):
    """Return the ``word`` of every entry whose ``score`` is strictly above ``threshold``.

    Args:
        list_: Iterable of dicts with ``score`` and ``word`` keys.
        threshold: Exclusive lower bound for the score.

    Returns:
        List of words in their original order.
    """
    return [entry['word'] for entry in list_ if entry['score'] > threshold]


In [14]:
# function to get raw version of entities for comparison
def get_raw(list_):
    """Build, per section, a lookup from letters-only key to original entity string.

    The key is the entity with every non-alphabetic character removed; when two
    entities of a section collapse to the same key, the later one wins.

    Args:
        list_: Iterable of per-section iterables of entity strings.

    Returns:
        List with one dict per section.
    """
    return [
        {''.join(ch for ch in entity if ch.isalpha()): entity for entity in section}
        for section in list_
    ]

In [15]:
# function to find intersection of entities found from transformers 

def merge_extracted_entities(wiki, bert, gpt):
    """Combine all wiki entities with GPT entities that BERT (but not wiki) also found.

    Args:
        wiki: Dict of raw key -> entity from the wikineural model.
        bert: Dict of raw key -> entity from the BERT model.
        gpt: Dict of raw key -> entity from GPT.

    Returns:
        Set holding every wiki value plus the GPT value of each key present in
        both ``gpt`` and ``bert`` but absent from ``wiki``.
    """
    output = set(wiki.values())

    bert_only_keys = set(bert.keys()) - set(wiki.keys())
    shared_with_gpt = set(gpt.keys()).intersection(bert_only_keys)

    matched = list(set(shared_with_gpt))
    print ("GPT/BERT: " + str(matched))

    output.update(gpt[key] for key in matched)
    return output

In [40]:
def merge_extracted_entities_old(wiki, bert, gpt):
    """Return the GPT spelling of every entity whose raw key appears in both ``wiki`` and ``gpt``.

    Args:
        wiki: Dict of raw key -> entity from the wikineural model.
        bert: Accepted for signature compatibility; not used.
        gpt: Dict of raw key -> entity from GPT.

    Returns:
        List of GPT entity strings, in the iteration order of the key intersection.
    """
    shared_keys = set(wiki.keys()).intersection(set(gpt.keys()))
    return [gpt[key] for key in shared_keys]

In [16]:
# function to remove any unwanted characters
def validate_entities(list_):
    """Normalise spacing around the typographic apostrophe and the slash in each entity.

    Whitespace on either side of "’" or "/" is removed, e.g. "UNDP ’ s" becomes
    "UNDP’s" and "wind / solar" becomes "wind/solar".

    Args:
        list_: Iterable of entity strings.

    Returns:
        New list with the cleaned strings, in the original order.
    """
    apostrophe_pattern = re.compile(r'\s*{}\s*'.format(re.escape("’")))
    slash_pattern = re.compile(r'\s*{}\s*'.format(re.escape("/")))
    output_list = []

    for item in list_:
        item = apostrophe_pattern.sub("’", item)
        item = slash_pattern.sub("/", item)
        # Collect the cleaned value; without this the function returned an empty list.
        output_list.append(item)

    return output_list



#  Categorize entities

In [83]:
# Zero shot prompt to categorize entities

def categorize_entities(text, entities, categories):
    """Ask GPT (zero-shot) to assign one of ``categories`` to each entity.

    Args:
        text: Passage the entities were extracted from.
        entities: Entities to categorize; interpolated into the prompt.
        categories: Allowed categories; interpolated into the prompt.

    Returns:
        The model reply parsed with ``json.loads``.
    """
    categorization_prompt = f"""

    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and >>>>>Categories<<<<<. 
    Your task is to assign a sutiable category to each element of >>>>>EntityList<<<<<.
    
    Return a list of JSON objects of categorized entities. 


    >>>>>TEXT<<<<<
    {text}

    >>>>>Categories<<<<<
    {categories}

    >>>>>EntityList<<<<<
    {entities}
    """

    raw_reply = get_answer(categorization_prompt)
    return json.loads(raw_reply)


# Relation Extraction

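Two approaches to relation extraction were tried: prompting with a fixed list of relation labels (`extract_relation_details`) and prompting with the ontology (`extract_ontology_relations`). The ontology-based `extract_ontology_relations` is the one used in the final pipeline.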

In [84]:
# Read the whole ontology file into memory; leaving the `with` block closes the file.
with open("ontology.ttl") as f:
    ontology = f.read()

In [85]:
# Zero shot prompt to extract relations using relation labels


def extract_relation_details(text, entities, relation_labels):
    """Ask GPT (zero-shot) for relations between ``entities``, labelled from ``relation_labels``.

    Args:
        text: Passage to analyse.
        entities: Entity list interpolated into the prompt.
        relation_labels: Allowed relation labels interpolated into the prompt.

    Returns:
        The model reply parsed with ``json.loads``.
    """
    relation_extraction_prompt = f"""
    
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to extract a 
    Knowledge Graph from the UNDP dataset.
    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and a list of >>>>>RelationLabels<<<<<.

   [Task]
   
   Your task is to perform Relation Extraction on the given >>>>>TEXT<<<<< 
   to find relations between elements of provided >>>>>EntityList<<<<<.
   
   Please make sure to read these instructions and constraints carefully.

    [Instructions]
    1. Carefully read and store the >>>>>RelationLabels<<<<<.
    2. Scan the >>>>>TEXT<<<<< to find Named Entites from >>>>>EntityList<<<<< that are related.
    3. Scan the >>>>>RelationLabels<<<<< to select a suitable label to
    describe the relation between the above selected entities. Mark this label as "Relation".
    4. Assign "Subject" and "Object" to entities depending on the selected "Relation"
    selected in previous step to create a tuple.
    5. If available, select a small "Description" from the >>>>>TEXT<<<<< for the above relation.
    6. Assign a Relevance score between 1 to 10 to the extracted relation, with 10 being the most relevant.
    7. Repeat the process to extract remaining relations from >>>>>TEXT<<<<<.
    
    
    [Constraints]
    1. Values of 'Relation' key should belong to >>>>>RelationLabels<<<<<.
    
    [Output Format]
    Provide the result as a JSON array.

    Perform relation extraction on the below:
    
    >>>>>TEXT<<<<<
    {text}

    >>>>>EntityList<<<<<
    {entities}

    >>>>>RelationLabels<<<<<
    {relation_labels}
    
"""

    raw_reply = get_answer(relation_extraction_prompt)
    return json.loads(raw_reply)

In [86]:
# Zero shot prompt to extract relations using ontology


def extract_ontology_relations(text, entities, ontology):
    """Ask GPT (zero-shot) for relations between ``entities``, typed according to ``ontology``.

    Args:
        text: Passage to analyse.
        entities: Entity list interpolated into the prompt.
        ontology: Ontology text interpolated into the prompt.

    Returns:
        The model reply parsed with ``json.loads``.
    """
    relation_extraction_prompt = f"""
    
    [Context]
    You belong to a team of consultants at UNDP's Sustainable Energy Hub (SEH), working on a project to extract a 
    Knowledge Graph from the UNDP dataset.
    You will be given a >>>>>TEXT<<<<<, an >>>>>EntityList<<<<< and an >>>>>ONTOLOGY<<<<<.

   [Task]
   
   Your task is to perform Relation Extraction on the given >>>>>TEXT<<<<< 
   to find relations between elements of provided >>>>>EntityList<<<<<. Use the given >>>>>ONTOLOGY<<<<<
   for this purpose.
   
   Please make sure to read these instructions and constraints carefully.

    [Instructions]
    1. Carefully read and understand the >>>>>ONTOLOGY<<<<<.
    2. Scan the >>>>>TEXT<<<<< to find Named Entites in >>>>>EntityList<<<<< that are related.
    3. Read the >>>>>ONTOLOGY<<<<< to select a relationship type for the related entities. Mark this label as "Relation".
    4. Assign "Subject" and "Object" to entities depending on the "Relation"
    selected in previous step to create a tuple.
    5. If available, select a small "Description" from the >>>>>TEXT<<<<< for the above relation.
    6. Assign a Relevance score between 1 to 10 to the extracted relation, with 10 being the most relevant.
    7. Repeat the process to extract remaining relations from >>>>>TEXT<<<<<.
    
    
    [Constraints]
    1. Values of 'Relation' key should be a label from properties in >>>>>ONTOLOGY<<<<<.
    
    [Output Format]
    Provide the result as a JSON array.

    Perform relation extraction on the below:
    
    >>>>>TEXT<<<<<
    {text}

    >>>>>EntityList<<<<<
    {entities}

    >>>>>ONTOLOGY<<<<<
    {ontology}
    
"""

    raw_reply = get_answer(relation_extraction_prompt)
    return json.loads(raw_reply)

# Functions to get knowledge from Dbpedia

Used for entity enrichment. Provides summaries for entities found in DBpedia

In [87]:

# Define the DBpedia SPARQL endpoint
sparql_endpoint = "http://dbpedia.org/sparql"

# Create a SPARQLWrapper instance; this single module-level object is reused
# (setQuery / setReturnFormat / query) by search_entity and retrieve_entity_summary.
sparql = SPARQLWrapper(sparql_endpoint)

# Function to search for an entity by label and return its DBpedia URI
def search_entity(label):
    """Look up the DBpedia resource whose English ``rdfs:label`` equals ``label``.

    Args:
        label: Entity name; it is escaped before being placed inside the
            quoted SPARQL literal.

    Returns:
        The URI of the first matching resource, or None when nothing matches.
    """
    # Labels come from model output and may contain quotes, backslashes or line
    # breaks; unescaped, they would terminate the string literal and corrupt
    # (or alter) the query. Backslashes are escaped first so the later
    # escapes are not doubled.
    escaped_label = (
        label.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    query = f"""
    SELECT ?entity
    WHERE {{
      ?entity rdfs:label "{escaped_label}"@en.
    }}
    LIMIT 1
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()

    if "results" in results and "bindings" in results["results"] and results["results"]["bindings"]:
        entity_uri = results["results"]["bindings"][0]["entity"]["value"]
        return entity_uri
    else:
        return None

# Function to retrieve and return the abstract or comment of an entity
def retrieve_entity_summary(entity_uri):
    """Fetch an English summary for ``entity_uri`` from the SPARQL endpoint.

    The ``dbo:abstract`` is queried first; when that query yields no binding,
    ``rdfs:comment`` is queried instead.

    Args:
        entity_uri: URI placed between angle brackets in the queries.

    Returns:
        The value of the first binding found, or None when neither query
        returns one.
    """
    # First attempt: the abstract
    abstract_query = f"""
    SELECT ?abstract
    WHERE {{
      <{entity_uri}> dbo:abstract ?abstract.
      FILTER (LANGMATCHES(LANG(?abstract), "en"))
    }}
    """
    sparql.setQuery(abstract_query)
    sparql.setReturnFormat(JSON)
    abstract_results = sparql.query().convert()

    has_abstract_rows = "results" in abstract_results and "bindings" in abstract_results["results"]
    if has_abstract_rows:
        for row in abstract_results["results"]["bindings"]:
            return row["abstract"]["value"]

    # Second attempt: the comment
    comment_query = f"""
    SELECT ?comment
    WHERE {{
      <{entity_uri}> rdfs:comment ?comment.
      FILTER (LANGMATCHES(LANG(?comment), "en"))
    }}
    """
    sparql.setQuery(comment_query)
    sparql.setReturnFormat(JSON)
    comment_results = sparql.query().convert()

    has_comment_rows = "results" in comment_results and "bindings" in comment_results["results"]
    if has_comment_rows:
        for row in comment_results["results"]["bindings"]:
            return row["comment"]["value"]

    # Nothing found by either query
    return None



In [88]:

def dbpedia_summary(search_label):
    """Return the DBpedia summary for ``search_label``, or None when unavailable.

    Progress and failures are reported on stdout. A ``urllib.error.URLError``
    raised while retrieving the summary is printed and results in None.

    Args:
        search_label: Label passed to ``search_entity``.

    Returns:
        The summary when a non-empty one is found, otherwise None.
    """
    entity_uri = search_entity(search_label)

    if not entity_uri:
        print(f"No entity found with the label: {search_label}")
        return None

    print(f"Entity found with DBpedia URI: {entity_uri}")
    try:
        summary = retrieve_entity_summary(entity_uri)
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

    if summary:
        return summary
    print("No abstract or comment found for this entity.")
    return None


In [89]:
def extract_summaries(entities):
    """Attach a DBpedia summary to each entity for which one is found.

    Each item is updated in place (a ``summary`` key is added only when the
    lookup returns a non-empty value). Items whose processing raises are
    reported on stdout and left out of the result.

    Args:
        entities: Iterable of dicts with an ``entity`` key.

    Returns:
        List of the items processed without error, in their original order.
    """
    updated_entities = []

    for entity_record in entities:
        try:
            found_summary = dbpedia_summary(entity_record['entity'])
            if found_summary:
                entity_record['summary'] = found_summary
            # Kept whether or not a summary was found.
            updated_entities.append(entity_record)
        except Exception as e:
            print(f"Error: {e}")

    return updated_entities

# Creating Graph in Neo4j

In [90]:
# Connect to the local Neo4j instance over Bolt. Adjust the user name to match your
# database; the password is the value read from the NEO4JPASS environment variable.
graph = Graph(uri = 'bolt://localhost:7687',user='neo4j',password=neo4j_pass)

Functions to add entities and relations to Neo4j

In [91]:

class Document:
    """Plain container bundling a document's metadata, entities and relations."""

    def __init__(self, metadata, entities, relations):
        """Store the three arguments unchanged on the instance."""
        self.metadata, self.entities, self.relations = metadata, entities, relations

# Define a function to create or retrieve a node
def get_or_create_node(label, key, value):
    """Return the node matching ``label``/``key``/``value``, creating it when absent.

    The lookup is tried first on ``key`` and then on the ``acronym`` property
    with the same value; only when both fail is a new node created in ``graph``.

    Args:
        label: Node label.
        key: Property name used for the first lookup and for the new node.
        value: Property value to match or store.

    Returns:
        The existing or newly created node.
    """
    found = get_node(label, key, value) or get_node(label, 'acronym', value)
    if found:
        return found

    created = Node(label, **{key: value})
    graph.create(created)
    return created
    
def get_node(label, key, value):
    """Return the first node in ``graph`` with ``label`` whose ``key`` property equals ``value``."""
    return graph.nodes.match(label, **{key: value}).first()
def get_node_without_label(key, value):
    """Return the first node in ``graph``, regardless of label, whose ``key`` property equals ``value``."""
    return graph.nodes.match(**{key: value}).first()

# Define a function to insert relations 
def insert_relations_neo4j(entities, relations):
    """Write entities and their relations to the Neo4j ``graph``.

    Every entity becomes (or updates) an ``Entity`` node keyed by ``name`` with
    its ``category`` and, when present, ``acronym`` and ``summary``. Every
    relation becomes a relationship between the subject and object nodes,
    carrying a ``description`` when the relation has a ``Description``.

    Args:
        entities: Iterable of dicts with ``entity`` and ``category`` keys.
        relations: Iterable of dicts with ``Subject``, ``Object`` and ``Relation`` keys.
    """
    optional_properties = ("acronym", "summary")

    for entity_record in entities:
        node = get_or_create_node('Entity', "name", entity_record["entity"])
        node['category'] = entity_record["category"]
        for property_name in optional_properties:
            if property_name in entity_record:
                node[property_name] = entity_record[property_name]
        graph.push(node)

    for relation_record in relations:
        subject = get_or_create_node("Entity", "name", relation_record["Subject"])
        obj = get_or_create_node("Entity", "name", relation_record["Object"])
        relation = Relationship(subject, relation_record["Relation"], obj)
        if 'Description' in relation_record:
            relation["description"] = relation_record["Description"]
        graph.create(relation)
        
    
# Define a function to insert summaries 
def insert_summary_neo4j(data):
    """Store each item's summary on the ``Entity`` node that carries its name.

    Args:
        data: Iterable of objects exposing ``name`` and ``summary`` attributes.
            NOTE(review): other functions in this notebook pass entities as
            dicts -- confirm the expected item type with the caller.

    Items for which no node exists are reported on stdout and skipped.
    """
    for item in data:
        node = get_node("Entity", "name", item.name)
        if node is None:
            # The lookup yields None when nothing matches; assigning a property
            # to None would fail with an unhelpful TypeError.
            print(f"No Entity node found with name: {item.name}")
            continue
        node["Summary"] = item.summary
        graph.push(node)
       

# Extraction Pipeline

In [92]:
#read the files from Data folder
folder_path = 'Data/AFG'
file_list = os.listdir(folder_path)

# Keep only the plain-text documents (.txt)
text_files = [name for name in file_list if name.endswith(".txt")]

print(f"Number of files: {len(text_files)}\n")
print(text_files)

Number of files: 6

['AFG-NRER-2017-EN.txt', 'AFG-CPD-2014-EN.txt', 'AFG-NEP-2015-EN.txt', 'AFG-NEPro-2022-EN.txt', 'AFG-NREP-2013-EN.txt', 'AFG-NREP-2015-EN.txt']


In [93]:
# Select the file to process by its position in text_files.
# NOTE(review): text_files is derived from os.listdir, whose ordering is not guaranteed,
# so index 3 may point at a different file on another machine -- confirm.
file_path = os.path.join(folder_path, text_files[3])

In [94]:
# The first 11 lines form the header block, the 12th line is skipped and the
# remainder is the document body. The `with` statement closes the file.
with open(file_path, 'r') as file:
    head = [next(file) for _ in range(11)]
    next(file)
    raw_text = file.read()

print(f"Original text length: {len(raw_text)}")

Original text length: 7562


In [95]:
 # Open the file in read mode
with open(file_path, 'r') as file:
    # Lines containing a run of three dots are dropped
    pattern = re.compile(r'.*?\.{3}.*?$', re.MULTILINE)
    raw_text = ''

    # Header block (11 lines) followed by one line that is skipped
    head = [next(file) for _ in range(11)]
    next(file)

    # Keep only the body lines without a dotted run that are detected as English
    raw_text = ''.join(
        line for line in file
        if not pattern.search(line) and is_english(line)
    )

print(f"Read text length: {len(raw_text)}")

text = clean_text(raw_text)

print(f"Cleaned text length: {len(text)}")


An exception occurred: No features in text. : 

An exception occurred: No features in text. : 56%

An exception occurred: No features in text. : 2%

An exception occurred: No features in text. : 22%

An exception occurred: No features in text. : 16%1% 0%

An exception occurred: No features in text. : 83%

An exception occurred: No features in text. : 98%

An exception occurred: No features in text. : 33%

An exception occurred: No features in text. : 19%

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 20%

An exception occurred: No features in text. : 40%

An exception occurred: No features in text. : 60%

An exception occurred: No features in text. : 80%

An exception occurred: No features in text. : 100%

An exception occurred: No features in text. : 2014 2015 2016 2017 2018 2019 2020

An exception occurred: No features in text. : 75.6 74.3 74.5 74.9

An exception occurred: No features in text. : 82.5

An exception occurred: No feature

An exception occurred: No features in text. : 0%

An exception occurred: No features in text. : 20%

An exception occurred: No features in text. : 40%

An exception occurred: No features in text. : 60%

An exception occurred: No features in text. : 80%

An exception occurred: No features in text. : 100%

An exception occurred: No features in text. : <260 260-420 420-560 560-670 670-820 820-1060 >1060

An exception occurred: No features in text. : 200

An exception occurred: No features in text. : 0

An exception occurred: No features in text. : 1

An exception occurred: No features in text. : 2

An exception occurred: No features in text. : 3

An exception occurred: No features in text. : 4

An exception occurred: No features in text. : 5 6

An exception occurred: No features in text. : 7

An exception occurred: No features in text. : 8

An exception occurred: No features in text. : 9

An exception occurred: No features in text. : 10

An exception occurred: No features in text. : 11

A

In [96]:
metadata = {}

# Each header line has the form "Key: Value"
for item in head:
    # Split on the first ':' only, so values that themselves contain a colon
    # (titles, times, URLs) no longer raise "too many values to unpack".
    key, value = item.split(':', 1)
    key = key.strip()
    value = value.strip()

    metadata[key] = value

# 'Exists?' is dropped from the metadata when present
metadata.pop('Exists?', None)
print(metadata)


{'File Name': 'AFG-NEPro-2022-EN', 'Year': '2022', 'Country Name': 'Afghanistan', 'Country Code': 'AFG', 'Category': 'NEPro', 'Document Title': 'Energy Profile Afghanistan IRENA', 'Publication Date': '24th August, 2022', 'Start Year': '2014', 'End Year': '2022', 'Language': 'EN'}


In [97]:
#split the text from file into section of length 2000 for token limits

text_sections = split_text_spacy(2000, text)
text_length = len(text_sections)
print(f"The number of sections from the text: {text_length}")

The number of sections from the text: 3


In [98]:
# One placeholder slot per text section for each extractor; the extraction loop
# overwrites a slot once its section has been processed.
start_index = 0
wiki_entity_list, bert_entity_list, gpt_entity_list = (
    [''] * text_length for _ in range(3)
)
acronyms = {}

In [99]:
# NOTE(review): `wiki_output` is only assigned inside the entity-extraction loop in the
# following cell, so on a fresh kernel (Restart & Run All) this cell raises NameError.
# It displays the most recent raw response returned by query_wiki.
wiki_output

[{'entity_group': 'LOC',
  'score': 0.837629497051239,
  'word': 'Masdar City',
  'start': 339,
  'end': 350},
 {'entity_group': 'LOC',
  'score': 0.9379422068595886,
  'word': 'United Arab Emirates',
  'start': 351,
  'end': 371},
 {'entity_group': 'ORG',
  'score': 0.5029942989349365,
  'word': 'IRENA',
  'start': 381,
  'end': 386},
 {'entity_group': 'MISC',
  'score': 0.5442724227905273,
  'word': 'UN SDG Database',
  'start': 437,
  'end': 452},
 {'entity_group': 'ORG',
  'score': 0.9609768390655518,
  'word': 'WHO',
  'start': 473,
  'end': 476},
 {'entity_group': 'ORG',
  'score': 0.9636392593383789,
  'word': 'World Bank',
  'start': 478,
  'end': 488},
 {'entity_group': 'ORG',
  'score': 0.927504301071167,
  'word': 'IEA',
  'start': 490,
  'end': 493},
 {'entity_group': 'ORG',
  'score': 0.7723690867424011,
  'word': 'IRENA',
  'start': 495,
  'end': 500},
 {'entity_group': 'ORG',
  'score': 0.5234919190406799,
  'word': 'UNSD',
  'start': 506,
  'end': 510},
 {'entity_group'

In [100]:
# Extract entities by sending text sections to models

start_time = time.time()
for index in range(text_length):
    try:
        segment = text_sections[index]

        ## WIKINEURAL BILINGUAL MODEL
        wiki_output = query_wiki({"inputs": segment})
        create_entities(wiki_output)
        wiki_words = list(set(apply_threshold(wiki_output, 0.7)))
        wiki_entity_list[index] = wiki_words
        print("WIKI DONE")

        ## BERT BASE MODEL
        bert_output = query_bert({"inputs": segment})
        create_entities(bert_output)
        bert_words = list(set(apply_threshold(bert_output, 0.7)))
        bert_entity_list[index] = bert_words
        print("BERT DONE")

        ## GPT PROMPT
        gpt_output = query_gpt(segment)
        gpt_entity_list[index] = gpt_output['proper_nouns']
        print("GPT DONE")

        ## Acronyms extraction
        acronyms.update(gpt_output['acronyms'])

        print(f"NUMBER OF PROCESSED SECTIONS: {index}")

    except Exception as e:
        # A failing section is reported and skipped; the loop carries on with the next one.
        print(f"Error processing section {index}: {e}")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"TIME TAKEN TO EXTRACT ENTITIES from {text_length} section: {elapsed_time}")

WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 0
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 1
WIKI DONE
BERT DONE
GPT DONE
NUMBER OF PROCESSED SECTIONS: 2
TIME TAKEN TO EXTRACT ENTITIES from 3 section: 14.453114032745361


In [101]:
# View the extracted acronyms
print(acronyms)

{'TES': 'Total Energy Supply', 'TJ': 'Terajoules', 'GDP': 'Gross Domestic Product', 'PPP': 'Purchasing Power Parity', 'TFEC': 'Total Final Energy Consumption', 'GW': 'Gigawatts', 'MW': 'Megawatts', 'RE': 'Renewable Energy', 'CO2': 'Carbon Dioxide', 'Elec.': 'Electricity', 'Mt': 'Metric Tons', 'AFG': 'Afghanistan', 'GWh': 'Gigawatt-hours', 'PV': 'Photovoltaic', 'MWh': 'Megawatt-hours', 'kWp': 'Kilowatt-peak', 'NREL': 'National Renewable Energy Laboratory', 'NPP': 'Net primary production', 'IRENA': 'International Renewable Energy Agency', 'UN': 'United Nations', 'SDG': 'Sustainable Development Goals', 'WHO': 'World Health Organization', 'IEA': 'International Energy Agency', 'UNSD': 'United Nations Statistics Division', 'COMTRADE': 'United Nations Commodity Trade Statistics Database', 'EDGAR': 'Emissions Database for Global Atmospheric Research', 'REN21': 'Renewable Energy Policy Network for the 21st Century'}


In [102]:
# Show, section by section, what each extractor returned (wiki, then bert, then gpt)
i = 0
while i < text_length:
    for extractor_results in (wiki_entity_list, bert_entity_list, gpt_entity_list):
        print(extractor_results[i])
    print("--------")
    i += 1

['Total Energy Supply']
['TF']
['Total Energy Supply', 'Non-renewable', 'Renewable', 'Growth', 'Primary', 'Imports', 'Exports', 'Energy', 'Coal', 'Renewables', 'Geothermal', 'Access', "USD'000s", 'GDP', 'Real GDP', 'Public', 'Consumption', 'Industry', 'Households', 'Other', 'Capacity', 'Utilisation', 'Net capacity', 'Installed capacity', 'Electricity', 'Commercial heat', 'Bioenergy', 'Solar', 'Fossil fuels', 'Nuclear', 'Hydro and marine', 'Avoided emissions', 'CO2 emissions', 'Per capita electricity generation']
--------
[]
['Asia World Rene', 'NREL']
['Asia', 'World', 'Afghanistan', 'Renewable', 'Geothermal', 'Biomass', 'Solar', 'Onshore', 'NREL']
--------
['United Arab Emirates', 'WHO', 'Masdar City', 'IEA', 'IRENA', 'UN World Population Prospects', 'Harmonised System', 'World Bank']
['UNSD', 'IRENA Joint Policies and Measures Database', 'IRENA Global Atlas', 'Global Wind Atlas', 'UNSD Energy Ba', 'E', 'UN COMT', 'WHO', 'World Bank World Development Indicators', 'Haised System', 'UN 

Processing the Entities 

In [103]:
# Get raw version of entities for comparison
raw_wiki, raw_bert, raw_gpt = (
    get_raw(entity_lists)
    for entity_lists in (wiki_entity_list, bert_entity_list, gpt_entity_list)
)

In [104]:
# entity_objects keeps one list of merged entities per section;
# entity_filter is the flat list across all sections.
entity_objects, entity_filter, merged = [], [], []
i = 0

while i < len(wiki_entity_list):
    merged = merge_extracted_entities_old(raw_wiki[i], raw_bert[i], raw_gpt[i])

    print(f"\nThe number of matching entities in section {i}: {len(merged)}\n")
    print(merged)
    print("\n--------------")

    entity_filter.extend(merged)
    entity_objects.append(merged)
    i += 1


The number of matching entities in section 0: 1

['Total Energy Supply']

--------------

The number of matching entities in section 1: 0

[]

--------------

The number of matching entities in section 2: 3

['United Arab Emirates', 'World Bank', 'Masdar City']

--------------


In [105]:
# invert acronyms dict to ease look up
# Reverse lookup: full name -> acronym
acronyms_dict = dict(zip(acronyms.values(), acronyms.keys()))
print(acronyms_dict)

{'Total Energy Supply': 'TES', 'Terajoules': 'TJ', 'Gross Domestic Product': 'GDP', 'Purchasing Power Parity': 'PPP', 'Total Final Energy Consumption': 'TFEC', 'Gigawatts': 'GW', 'Megawatts': 'MW', 'Renewable Energy': 'RE', 'Carbon Dioxide': 'CO2', 'Electricity': 'Elec.', 'Metric Tons': 'Mt', 'Afghanistan': 'AFG', 'Gigawatt-hours': 'GWh', 'Photovoltaic': 'PV', 'Megawatt-hours': 'MWh', 'Kilowatt-peak': 'kWp', 'National Renewable Energy Laboratory': 'NREL', 'Net primary production': 'NPP', 'International Renewable Energy Agency': 'IRENA', 'United Nations': 'UN', 'Sustainable Development Goals': 'SDG', 'World Health Organization': 'WHO', 'International Energy Agency': 'IEA', 'United Nations Statistics Division': 'UNSD', 'United Nations Commodity Trade Statistics Database': 'COMTRADE', 'Emissions Database for Global Atmospheric Research': 'EDGAR', 'Renewable Energy Policy Network for the 21st Century': 'REN21'}


In [106]:
#initialize lists for storing entities and relations
# Accumulators filled by the categorization / relation-extraction loop
entities_list_unfiltered, relations_list = [], []
entities_with_sections, relations_with_section = [], []
# Names and acronyms already recorded, used to skip duplicates
seen_entities, seen_acronyms = set(), set()

In [107]:
# Categorize the merged entities of every text section and extract that
# section's relations. A failure in one section is reported and the loop
# simply moves on to the next section.
start_time = time.time()

for index, uncategorized_entities in enumerate(entity_objects):
    try:
        entities_subset = categorize_entities(text_sections[index], uncategorized_entities, categories)

        for item in entities_subset:
            raw_name = item["entity"]

            # Skip entities already recorded under their full name or acronym.
            if raw_name in seen_entities or raw_name in seen_acronyms:
                continue
            seen_entities.add(raw_name)

            if raw_name in acronyms_dict:
                # Known full name: attach the matching acronym.
                item["acronym"] = acronyms_dict[raw_name]
                seen_acronyms.add(item["acronym"])

            elif raw_name in acronyms:
                # The entity was extracted as an acronym: keep the acronym and
                # store the expanded full name as the entity.
                item["acronym"] = raw_name
                item["entity"] = acronyms[raw_name]
                seen_acronyms.add(raw_name)
                # Remember the expanded name as well; otherwise a later section
                # that mentions the full name would be appended a second time.
                seen_entities.add(item["entity"])

            entities_list_unfiltered.append(item)

        print ("CATEGORIZED ENTITIES of Section: " + str(index) + "\n")
        print (entities_subset)

        relations_subset = extract_ontology_relations(text_sections[index], entity_objects[index], ontology)

        print ("\n EXTRACTED RELATIONS: \n")
        print (relations_subset)

        relations_list.extend(relations_subset)

        print ("\n-------------------")

    except Exception as e:
        # Best effort: report the failing section and carry on with the next one.
        print(f"Error processing section {index}: {str(e)}")
        #save_checkpoint(index, wiki_entity_list, bert_entity_list, gpt_entity_list, acronyms)


end_time = time.time()
elapsed_time = end_time - start_time
print(f"TIME TAKEN TO EXTRACT RELATIONS FROM {text_length} SECTIONS: {elapsed_time}")

CATEGORIZED ENTITIES of Section: 0

[{'entity': 'Total Energy Supply', 'category': 'Organization', 'acronym': 'TES'}]

 EXTRACTED RELATIONS: 

[{'Subject': 'Total Energy Supply', 'Relation': 'addresses', 'Object': 'Renewable energy supply in 2019', 'Description': 'Total energy supply in 2019', 'Relevance': 8}]

-------------------
CATEGORIZED ENTITIES of Section: 1

[]

 EXTRACTED RELATIONS: 

[{'Relation': 'focuses on', 'Subject': 'Electricity generation trend', 'Object': 'Energy-related CO2 emissions by sector'}, {'Relation': 'contributes to', 'Subject': 'Electricity generation trend', 'Object': 'Per capita electricity generation (kWh)'}, {'Relation': 'contributes to', 'Subject': 'Electricity generation trend', 'Object': 'Renewable share (%)'}, {'Relation': 'contributes to', 'Subject': 'Electricity generation trend', 'Object': 'Gigawatt-hours (GWh)'}, {'Relation': 'contributes to', 'Subject': 'Electricity generation trend', 'Object': 'Annual generation per unit of installed PV capaci

In [108]:
# Keep only the categorized entities that were actually extracted from the
# text, i.e. whose name appears in entity_filter.
# An entity that was extracted as an acronym has had its "entity" field
# replaced by the expanded name during categorization, so it is matched through
# its acronym as well; comparing the expanded name alone would silently drop it.
extracted_names = set(entity_filter)  # a set avoids rescanning the list per entity
entities_list = [
    item for item in entities_list_unfiltered
    if item['entity'] in extracted_names or item.get('acronym') in extracted_names
]

In [109]:
# Extract summaries from DBpedia for the filtered entities.
# extract_summaries is defined elsewhere in this notebook; later cells read an
# 'entity' key (and optionally 'summary') from each returned record
# -- presumably one dict per entity, TODO confirm against its definition.
entity_summaries = extract_summaries(entities_list)

No entity found with the label: Total Energy Supply
Entity found with DBpedia URI: http://dbpedia.org/resource/United_Arab_Emirates
Entity found with DBpedia URI: http://dbpedia.org/resource/Category:World_Bank
No abstract or comment found for this entity.
Entity found with DBpedia URI: http://dbpedia.org/resource/Masdar_City


In [110]:
# Collect every name a relation endpoint may use when relations are validated:
# each entity's full name plus its acronym, when it has one.
entity_names = (
    {record['entity'] for record in entities_list}
    | {record['acronym'] for record in entities_list if 'acronym' in record}
)


# Write the output to files

In [None]:
# Index the summarised entities by name. A repeated name keeps only its last
# record, so final_entities ends up with one record per distinct entity.
entity_dic = {record['entity']: record for record in entity_summaries}
final_entities = list(entity_dic.values())

In [None]:
# Serialize the de-duplicated entity records as pretty-printed JSON (2-space indent).
json_entities = json.dumps(final_entities, indent=2)

In [None]:
# Write the entity JSON to Entities/<File Name>.json.
entities_path = 'Entities/' + metadata['File Name'] + '.json'
# Create the target folder on first use so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(entities_path), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed.
with open(entities_path, "w") as output_file:
    output_file.write(json_entities)

In [None]:
# Split the extracted relations into two groups:
#   * a relation whose subject and object are both known entity names and whose
#     label is one of relation_labels is kept in final_relations;
#   * any other relation that has a description and touches a known entity
#     (subject checked first, then object) contributes that description to the
#     entity record as its 'information' field.
# NOTE(review): labels are compared verbatim, so a label returned with spaces
# (e.g. 'focuses on') does not match 'focuses_on' and the relation is not kept.
# NOTE(review): the Entities JSON file is written in an earlier cell, so
# 'information' added here is not part of that file.
final_relations = []
for relation_record in relations_list:
    # Ignore anything that is not a relation dict.
    if not isinstance(relation_record, dict):
        continue

    # The relation dicts do not always carry every key, so read them with
    # .get() instead of indexing; a missing key simply fails the checks below
    # rather than raising KeyError.
    rel_subject = relation_record.get('Subject')
    rel_object = relation_record.get('Object')

    if (rel_subject in entity_names
            and rel_object in entity_names
            and relation_record.get('Relation') in relation_labels):
        final_relations.append(relation_record)
    elif 'Description' in relation_record and rel_subject in entity_dic:
        entity_dic[rel_subject]['information'] = relation_record['Description']
    elif 'Description' in relation_record and rel_object in entity_dic:
        entity_dic[rel_object]['information'] = relation_record['Description']

In [None]:
# Serialize the validated relations as pretty-printed JSON (2-space indent).
json_relations = json.dumps(final_relations, indent=2)

In [None]:
# Write the relation JSON to Relations/<File Name>.json.
relations_path = 'Relations/' + metadata['File Name'] + '.json'
# Create the target folder on first use so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(relations_path), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed.
with open(relations_path, "w") as output_file:
    output_file.write(json_relations)

In [None]:
# Report how many entities and relations made it into the final output.
print("No. of extracted entities:", len(final_entities), sep="")
print("No. of extracted relations:", len(final_relations), sep="")

In [None]:
# Insert the final entities and relations in the Neo4j database.
# insert_relations_neo4j is defined elsewhere in this notebook
# -- NOTE(review): connection settings are not visible in this cell; confirm where they are configured.
insert_relations_neo4j(final_entities, final_relations)

# Add Relations to Spreadsheet for Review

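Do not run this section unless you have a Google service-account key file (the JSON key referenced in the cell below) with access to the review spreadsheet.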

In [None]:
# These imports stay in this cell because the Google Sheets step is optional
# and needs packages the rest of the notebook does not use.
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Use the credentials from the service account key JSON file you downloaded
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name('energy-moonshot-ai-97aa9045e45f.json', scope)
client = gspread.authorize(creds)

# Open the Google Sheet by its title or URL
spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1yZ-XQQs52kaI5k9MjvV_CdbgWQi-GazjHHGqQUF8gko/edit')


# Enter relations in the first sheet
sheet = spreadsheet.get_worksheet(0)

# New rows are inserted starting at sheet row 5
# (presumably rows 1-4 hold headers -- confirm against the sheet).
start_row_index = 5

# Check if there's valid data to insert
if final_relations:
    # Create a list of lists where each inner list represents the values of a row
    batch_relations = []
    # Number the rows from 1. enumerate() owns the counter, so no manual
    # initialisation or increment is needed.
    for index, row_data in enumerate(final_relations, start=1):
        row = [index, row_data['Subject'], row_data['Relation'], row_data.get('Object', ''),
               row_data.get('Description', ''), row_data.get('Relevance', '')]

        batch_relations.append(row)

    # Insert the data into the Google Sheet starting from row 5
    sheet.insert_rows(batch_relations, start_row_index)

    print(f"{len(final_relations)} entries added to Google Sheet.")
else:
    print("No data to insert.")
    
    
# Enter entities in the second sheet
sheet = spreadsheet.get_worksheet(1)


# New rows are inserted starting at sheet row 5
# (presumably rows 1-4 hold headers -- confirm against the sheet).
start_row_index = 5

if final_entities:
    # One inner list per entity: row number, name, category, acronym, summary.
    batch_entities = []
    # Number the rows from 1. enumerate() owns the counter, so no manual
    # initialisation or increment is needed.
    for index, row_data in enumerate(final_entities, start=1):
        row = [index, row_data['entity'], row_data['category'], row_data.get('acronym', ''), row_data.get('summary', '')]
        batch_entities.append(row)

    sheet.insert_rows(batch_entities, start_row_index)