In [1]:
import langchain
from dotenv import load_dotenv
import json
from tqdm import tqdm

load_dotenv()

True

In [None]:
# Load the source article; later cells read its "paragraphs" list.
# JSON interchange text is UTF-8, so decode it explicitly instead of relying
# on the platform's default encoding (which is not UTF-8 everywhere).
with open("data.json", "r", encoding="utf-8") as json_file:
    dict_p = json.load(json_file)

In [None]:

# Chat model shared by the chains that get_basic_chat_chain builds.
chat = langchain.chat_models.ChatOpenAI(
    model_name="gpt-4",
    temperature=0.2,
)


def get_basic_chat_chain(system_template: str, user_template: str) -> langchain.LLMChain:
    """
    Build an LLMChain whose prompt is a system message followed by a human message.

    The chain is bound to the module-level `chat` model.

    Parameters:
        system_template (str): Template text for the system message
        user_template (str): Template text for the human (user) message

    Returns:
        langchain.LLMChain: Chain combining `chat` with the two-message prompt
    """
    prompts = langchain.prompts

    # Order matters: the system message comes first, then the user message.
    messages = [
        prompts.SystemMessagePromptTemplate.from_template(system_template),
        prompts.HumanMessagePromptTemplate.from_template(user_template),
    ]

    return langchain.LLMChain(
        llm=chat,
        prompt=prompts.ChatPromptTemplate.from_messages(messages),
    )



In [None]:
# Prompt pair for the extraction step: the system message fixes the output
# contract, the user message carries one paragraph through the {text} variable.
# The format examples use double quotes because the reply is parsed with
# json.loads, which rejects single-quoted strings.
system_promt = """
Your are an assistant and you are creating a knowledge graph from a scientific article the talks about the moon exploration.
You will be given as input a piece of text from the article. 
Your job is to identify the main concepts to use them as nodes in our knowledge graph. 
Then you have to identify the relations between each concept.

You will return the answer as a JSON object with the keys:
    - "nodes" and the value as a list of strings ["concept1","concept2","concept3",...].
    - "relations" and the value as a list of lists [["concept9","concept10","relation1"],["concept9","concept2","relation2"]...]
You have to only return the JSON object.
Your answer begins with a left curly bracket and ends with a right curly bracket.
No preface text.

"""
user_prompt = """
## TEXT:
{text}

"""
chain = get_basic_chat_chain(system_promt,user_prompt)

In [None]:
# Extract a graph for each of the first 12 paragraphs and store it under "graph".
# A reply that is not valid JSON is reported and the paragraph is left without a
# "graph" key; any other error (network failure, interrupt, ...) propagates
# instead of being swallowed.
for index, p in enumerate(tqdm(dict_p["paragraphs"][:12])):
    text = p["text"]
    response = chain.run(text=text)
    try:
        p["graph"] = json.loads(response)
    except json.JSONDecodeError as err:
        print(f"Paragraph {index}: reply is not valid JSON ({err})")
        print(response)



In [2]:
# Persist dict_p as JSON indented by four spaces.
with open("data_graph.json", "w") as f:
    f.write(json.dumps(dict_p, indent=4))

NameError: name 'dict_p' is not defined

In [4]:
# Read data_graph.json back into dict_p.
with open("data_graph.json", "r") as f:
    dict_p = json.loads(f.read())

In [5]:
# Flatten the per-paragraph graphs of the first 12 paragraphs into two lists.
nodes = []
relations = []
for p in dict_p["paragraphs"][:12]:
    graph = p.get("graph")
    if graph is None:
        # The extraction loop leaves a paragraph without "graph" when the
        # model reply could not be parsed; skip it instead of raising KeyError.
        continue
    nodes.extend(graph["nodes"])
    relations.extend(graph["relations"])
    

In [None]:
# Prompt pair asking the model to turn the node list into Cypher queries that
# create "concept" nodes.
# Named `cypher_chain` (not `cleaning_chain`) because a later cell binds
# `cleaning_chain` to the node-merging chain; sharing the name made this chain
# unreachable after that cell ran.
system_promt = """
Your are an assistant and you are creating a knowledge graph from a scientific article the talks about the moon exploration.
You will be given as input all the nodes.

You have to return a list of Cypher queries to create all this nodes, with the label "concept"
You will return the Cypher queries a List of strings:
['query1','quey2',...]
Just return the list, nothing more.
Your answer begins with a left  bracket and ends with a right  bracket.
No preface text.

"""
user_prompt = """
## NODES:
{nodes}

"""
cypher_chain = get_basic_chat_chain(system_promt,user_prompt)

In [None]:
# Prompt pair for the node-merging step: the model receives the node list via
# the {nodes} variable and is asked for a JSON mapping of node names.
system_promt = """
Your are an assistant and you are creating a knowledge graph from a scientific article the talks about the moon exploration.
You will be given as input all the nodes.
The nodes is a list of concepts.
Your job is to  merge nodes that are very similar in a final node.


You will return the mapping as a JSON object with the keys as the ancien node names and the values as the new node names.
You have to only return the JSON object.
Your answer begins with a left curly bracket and ends with a right curly bracket.
No preface text.

"""
user_prompt = """
## NODES:
{nodes}
"""
cleaning_chain = get_basic_chat_chain(
    system_template=system_promt,
    user_template=user_prompt,
)

In [None]:
# Run cleaning_chain with the stringified node list as the {nodes} template variable.
response = cleaning_chain.run(nodes=str(nodes))

In [None]:
# Show the reply returned by cleaning_chain.
print(response)

In [9]:
# Document outline. Each key is a node name; "label" is the node's label and the
# optional "childs" entry maps the names of nested nodes to their own entries.
hierarchy_dict = {
    "Article": {
        "label": "article",
        "childs": {
            "Abstract": {
                "label": "Section",
                "childs": {
                    "Abstract P1": {"label": "Paragraph"},
                },
            },
            "1. Introduction": {
                "label": "Section",
                "childs": {
                    "1. P1": {"label": "Paragraph"},
                },
            },
            "2. Science of the Moon": {
                "label": "Section",
                "childs": {
                    "2. P1": {"label": "Paragraph"},
                    "2. P2": {"label": "Paragraph"},
                    "2.1 The Bombardment History of the Inner Solar System": {
                        "label": "Subsection",
                        "childs": {
                            "2.1 P1": {"label": "Paragraph"},
                            "2.1 P2": {"label": "Paragraph"},
                            "2.1 P3": {"label": "Paragraph"},
                        },
                    },
                    "2.2 The structure and composition of the lunar interior": {
                        "label": "Subsection",
                        "childs": {
                            "2.2 P1": {"label": "Paragraph"},
                            "2.2 P2": {"label": "Paragraph"},
                            "2.2 P3": {"label": "Paragraph"},
                            "2.2 P4": {"label": "Paragraph"},
                            "2.2 P5": {"label": "Paragraph"},
                        },
                    },
                },
            },
        },
    },
}

# Accumulator for the generated Cypher statements; starts empty in this cell.
list_of_cypher_queries = []

def building_hierarchy_tree(parent,hierarchy_dict,list_of_cypher_queries):
    """
    Append the Cypher statements that create a hierarchy of nodes, depth first.

    For every entry a CREATE statement is emitted using the lower-cased label.
    When the entry has a parent, a MATCH ... CREATE statement linking the entry
    to that parent with an IS_PART_OF relationship follows. Entries with a
    "childs" mapping are then processed recursively.

    Parameters:
        parent (dict | None): {'name': ..., 'label': ...} of the enclosing node,
            or None for top-level entries
        hierarchy_dict (dict): Maps node names to dicts holding "label" and,
            optionally, "childs" (a mapping of the same shape)
        list_of_cypher_queries (list): List the statements are appended to;
            it is modified in place

    Returns:
        list: The same list object that was passed in
    """
    for title, content in hierarchy_dict.items():
        label = content["label"]
        tag = label.lower()

        list_of_cypher_queries.append(f'CREATE (:{tag} {{name: "{title}"}});')

        if parent is not None:
            parent_tag = parent['label'].lower()
            parent_name = parent['name']
            list_of_cypher_queries.append(
                f'MATCH (a:{parent_tag}{{name:"{parent_name}"}}), '
                f'(b:{tag} {{name:"{title}"}})'
                '  CREATE (b)-[:IS_PART_OF]->(a);'
            )

        if "childs" in content:
            # The recursive call appends to the same list, so its return value
            # does not need to be rebound.
            building_hierarchy_tree(
                {'name': title, 'label': label},
                content["childs"],
                list_of_cypher_queries,
            )

    return list_of_cypher_queries

# Fill the query list with the statements generated from hierarchy_dict,
# starting at the top level (no parent).
list_of_cypher_queries = building_hierarchy_tree(
    parent=None,
    hierarchy_dict=hierarchy_dict,
    list_of_cypher_queries=list_of_cypher_queries,
)


In [10]:

def build_node_name(title,number):
    """
    Build a paragraph node name from a section title and a paragraph number.

    Any title containing "Abstract" maps to "Abstract P1"; `number` is ignored
    in that case. Otherwise the name is a prefix of the title followed by
    " P<number>": the first three characters when the third character is
    numeric (e.g. "2.1 ..." -> "2.1 P<number>"), else the first two
    (e.g. "1. ..." -> "1. P<number>").

    Parameters:
        title (str): Section title
        number: Paragraph number; converted with str()

    Returns:
        str: The paragraph node name
    """
    if "Abstract" in title:
        return "Abstract P1"

    # Slice instead of indexing so that a title shorter than three characters
    # falls back to the two-character prefix rather than raising IndexError.
    prefix_length = 3 if title[2:3].isnumeric() else 2
    return title[:prefix_length] + " P" + str(number)
        
def _cypher_string(value):
    """Escape `value` for use inside a double-quoted Cypher string literal."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _relation_type(value):
    """Upper-case `value` and turn every character other than a letter, digit or "_" into "_"."""
    return "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in str(value).upper())


def _concept_queries(concept, paragraph_name, created_concepts, linked_concepts):
    """Return the statements that create `concept` (once overall) and link it to the paragraph (once per paragraph)."""
    concept = str(concept)
    escaped = _cypher_string(concept)
    queries = []
    if concept not in created_concepts:
        created_concepts.add(concept)
        queries.append('CREATE (:concept {name: "' + escaped + '"});')
    if concept not in linked_concepts:
        linked_concepts.add(concept)
        queries.append(
            'MATCH (a:paragraph {name:"' + paragraph_name + '"}), (b:concept {name:"' + escaped + '"})'
            + '  CREATE (a)-[:TALKS_ABOUT]->(b);'
        )
    return queries


# Concept names that already have a CREATE statement. A concept mentioned by
# several paragraphs must be created only once: with duplicate nodes, every
# later MATCH on the name would attach edges to all of the copies.
created_concepts = set()

# The per-paragraph data is kept in `graph` so that the module-level `nodes`
# and `relations` lists are not overwritten by this loop.
for p in dict_p["paragraphs"][:12]:
    graph = p.get("graph")
    if graph is None:
        # No graph was stored for this paragraph (its model reply could not be parsed).
        continue

    paragraph_name = _cypher_string(build_node_name(p["section"], p["number"]))
    linked_concepts = set()  # concepts already attached to this paragraph

    for node in graph["nodes"]:
        list_of_cypher_queries += _concept_queries(node, paragraph_name, created_concepts, linked_concepts)

    for relation in graph["relations"]:
        if len(relation) < 3:
            print(f"Skipping malformed relation in {paragraph_name}: {relation!r}")
            continue
        source, target, kind = relation[0], relation[1], relation[2]

        # Endpoints missing from the paragraph's node list still need a node
        # and a TALKS_ABOUT link; endpoints already handled yield nothing here.
        list_of_cypher_queries += _concept_queries(source, paragraph_name, created_concepts, linked_concepts)
        list_of_cypher_queries += _concept_queries(target, paragraph_name, created_concepts, linked_concepts)

        list_of_cypher_queries += [
            'MATCH (a:concept {name:"' + _cypher_string(source) + '"}), (b:concept {name:"' + _cypher_string(target) + '"})'
            + '  CREATE (a)-[:' + _relation_type(kind) + ']->(b);'
        ]

In [11]:
# Write one statement per line. The statements embed names taken from the
# article and the model replies, which may contain non-ASCII characters, so the
# encoding is fixed to UTF-8 instead of the platform default.
with open('../llm_functions/cypher_queries.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in list_of_cypher_queries)
