In [None]:
import re
from ollama import Client
import json
from pathlib import Path

# Ollama client used for the chat requests issued further down in this file.
client = Client(host='127.0.0.1:12345')
# Regex patterns are raw strings so that sequences such as "\s" and "\[" reach
# the regex engine untouched instead of being treated as (invalid) string escapes.
# Two groups: the text after "Nodes:" and the text after "Relationships:".
regex = r"Nodes:\s+(.*?)\s?\s?Relationships:\s+(.*)"
# One group: the content of a single bracketed [...] entry (non-greedy).
internalRegex = r"\[(.*?)\]"
# Greedy: spans from the first "{" to the last "}" it can reach.
jsonRegex = r"\{.*\}"


def nodesTextToListOfDict(nodes):
    """Convert raw node entries into node dictionaries.

    Each entry is a comma-separated string whose first field is the node
    name and whose second field is its label; double quotes are removed from
    both. The properties are taken from the span of the entry matched by the
    module-level ``jsonRegex`` and parsed as JSON.

    Args:
        nodes: Iterable of strings, one per node entry.

    Returns:
        A list of dicts with the keys ``"name"``, ``"label"`` and
        ``"properties"``. Entries with fewer than two comma-separated fields
        are skipped. ``"properties"`` is ``{}`` when no properties span is
        found or when it cannot be parsed.
    """
    # Python literals that are not valid JSON, mapped to their JSON spelling.
    pythonToJson = {"True": "true", "False": "false", "None": "null"}

    result = []
    for node in nodes:
        nodeList = node.split(",")
        if len(nodeList) < 2:
            continue

        name = nodeList[0].strip().replace('"', "")
        label = nodeList[1].strip().replace('"', "")

        match = re.search(jsonRegex, node)
        rawProperties = "{}" if match is None else match.group(0)
        try:
            # Parse as-is first so valid JSON (including string values that
            # happen to contain "True") is never rewritten.
            properties = json.loads(rawProperties)
        except ValueError:
            # Retry once with whole-word True/False/None mapped to JSON.
            normalized = re.sub(
                r"\b(True|False|None)\b",
                lambda m: pythonToJson[m.group(1)],
                rawProperties,
            )
            try:
                properties = json.loads(normalized)
            except ValueError:
                properties = {}
        result.append({"name": name, "label": label, "properties": properties})
    return result


def relationshipTextToListOfDict(relationships):
    """Convert raw relationship entries into relationship dictionaries.

    Each entry is a comma-separated string whose fields are, in order, the
    start node, the relationship type and the end node; double quotes are
    removed from all three. The properties are taken from the span of the
    entry matched by the module-level ``jsonRegex`` and parsed as JSON.

    Args:
        relationships: Iterable of strings, one per relationship entry.

    Returns:
        A list of dicts with the keys ``"start"``, ``"end"``, ``"type"`` and
        ``"properties"``. Entries with fewer than three comma-separated
        fields are skipped. ``"properties"`` is ``{}`` when no properties
        span is found or when it cannot be parsed.
    """
    # Python literals that are not valid JSON, mapped to their JSON spelling.
    pythonToJson = {"True": "true", "False": "false", "None": "null"}

    result = []
    for relation in relationships:
        relationList = relation.split(",")
        # Count the fields, not the characters of the entry: indexing
        # relationList[2] below requires at least three fields.
        if len(relationList) < 3:
            continue
        start = relationList[0].strip().replace('"', "")
        relationType = relationList[1].strip().replace('"', "")
        end = relationList[2].strip().replace('"', "")

        match = re.search(jsonRegex, relation)
        rawProperties = "{}" if match is None else match.group(0)
        try:
            # Parse as-is first so valid JSON (including string values that
            # happen to contain "True") is never rewritten.
            properties = json.loads(rawProperties)
        except ValueError:
            # Retry once with whole-word True/False/None mapped to JSON.
            normalized = re.sub(
                r"\b(True|False|None)\b",
                lambda m: pythonToJson[m.group(1)],
                rawProperties,
            )
            try:
                properties = json.loads(normalized)
            except ValueError:
                properties = {}
        result.append(
            {
                "start": start,
                "end": end,
                "type": relationType,
                "properties": properties,
            }
        )
    return result

# Short system prompt: asks for "Nodes:" / "Relationships:" lists and shows one
# worked example. The example's relationship start year matches the year
# stated in its data ("since 2001") so the example does not contradict itself.
sys_prompt_simple = """You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES].
It is IMPORTANT that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. Do not pair any relationship with non-existing nodes. If you can't pair a relationship with a pair of nodes don't add it.
When you find a node or relationship you want to add try to create a generic TYPE for it that  describes the entity you can also think of it as a label.
You will be given a list of types that you should try to use when creating the TYPE for a node. If you can't find a type that fits the node you can create a new one.
NO YAPPING before or after your answers. DO NOT add comments in your answers. Format your answer to strictly follow the rules in the example below.

Example:
Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the webpage www.bob.com.
Nodes: ["alice", "Person", {"age": 25, "occupation": "lawyer", "name":"Alice"}], ["bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url": "www.bob.com"}]
Relationships: ["alice", "roommate", "bob", {"start": 2001}], ["alice", "owns", "alice.com", {}], ["bob", "owns", "bob.com", {}]
"""

# Longer, rule-based system prompt built from adjacent string literals and
# closed by a worked example (Data / Nodes / Relationships). Because adjacent
# literals are concatenated verbatim, every fragment must carry its own
# trailing newline or space.
sys_prompt_cplx = ("# Knowledge Graph Instructions for GPT-4\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph.\n"
    "Try to capture as much information from the text as possible without "
    "sacrifing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    # Newline keeps the "Node IDs" bullet on its own line.
    "like 'mathematician' or 'scientist'\n"
    "  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    # Trailing space keeps this fragment from fusing with "always".
    'but is referred to by different names or pronouns (e.g., "Joe", "he"), '
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
    """
    Example:
    Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the webpage www.bob.com.
    Nodes: ["alice", "Person", {"age": 25, "occupation": "lawyer", "name":"Alice"}], ["bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url": "www.bob.com"}]
    Relationships: ["alice", "roommate", "bob", {"start": 2001}], ["alice", "owns", "alice.com", {}], ["bob", "owns", "bob.com", {}]
    """
)

def getNodesAndRelationshipsFromResult(result):
    """Parse model answers into node and relationship dictionaries.

    Every row is searched for a ``Nodes: ... Relationships: ...`` section.
    The bracketed ``[...]`` entries of each part are collected across all
    rows and converted with ``nodesTextToListOfDict`` and
    ``relationshipTextToListOfDict``.

    Args:
        result: Iterable of answer strings.

    Returns:
        A dict with the keys ``"nodes"`` and ``"relationships"``, each
        holding the list produced by the corresponding converter. Rows
        without a recognisable section contribute nothing.
    """
    # Raw strings keep "\s" and "\[" as regex syntax rather than string escapes.
    regex = r"Nodes:\s*(.*?)\s*Relationships:\s*(.*)"
    internalRegex = r"\[(.*?)\]"
    nodes = []
    relationships = []
    for row in result:
        print(row)
        # search (not match) so text placed before "Nodes:" does not cause
        # the whole answer to be discarded.
        parsing = re.search(regex, row, flags=re.S)
        if parsing is None:
            print("Skipping answer: no 'Nodes:' / 'Relationships:' section found")
            continue
        rawNodes = str(parsing.group(1))
        rawRelationships = parsing.group(2)
        nodes.extend(re.findall(internalRegex, rawNodes))
        relationships.extend(re.findall(internalRegex, rawRelationships))

    return {
        "nodes": nodesTextToListOfDict(nodes),
        "relationships": relationshipTextToListOfDict(relationships),
    }

def getTypesFromDict(result):
    """Return the distinct node labels of ``result`` in first-seen order.

    Args:
        result: Dict whose ``'nodes'`` entry is an iterable of dicts, each
            with a ``'label'`` key.

    Returns:
        A list holding every label once, ordered by first occurrence.
    """
    uniqueLabels = []
    for label in (entry['label'] for entry in result['nodes']):
        if label in uniqueLabels:
            continue
        uniqueLabels.append(label)
    return uniqueLabels

def mergeDicts(dict1, dict2):
    """Merge ``dict2`` into ``dict1`` in place and return ``dict1``.

    For a key present in both dicts the value from ``dict2`` is added to the
    existing value with ``+=``. For a key only present in ``dict2`` a shallow
    copy of the value is stored.

    Args:
        dict1: Dict that receives the merged content; it is modified.
        dict2: Dict whose items are merged in; it is left unchanged.

    Returns:
        ``dict1``, after merging.
    """
    import copy

    for key, value in dict2.items():
        if key in dict1:
            dict1[key] += value
        else:
            # Copy instead of aliasing: otherwise a later "+=" on dict1[key]
            # would also grow the container still owned by dict2.
            dict1[key] = copy.copy(value)
    return dict1

In [None]:
# Build a knowledge graph from the pages output_3.txt .. output_9.txt with the
# "llama3:70b" model. The labels found so far are passed to the model with
# every request.
toy_data = Path("/home/user/large-disk/toy_data")
kg = dict()
labels = []
data = []
for page_number in range(3, 10):
    with open(toy_data / f"output_{page_number}.txt", "r", encoding="utf-8") as f:
        text = f.readlines()
        text = " ".join(text)
        data = text.rstrip()
    stream = client.chat(
        model="llama3:70b",
        messages=[
            {"role": "system", "content": sys_prompt_simple},
            {"role": "user", "content": f"Data: {data}\nTypes: {labels}"},
        ],
        stream=False,
    )
    # The answer is flattened to one line before parsing.
    ans = [stream['message']['content'].replace('\n', ' ')]
    sub_kg = getNodesAndRelationshipsFromResult(ans)
    # Add only labels not seen on earlier pages so the "Types:" list sent to
    # the model does not fill up with repeats.
    labels = labels + [
        label for label in getTypesFromDict(sub_kg) if label not in labels
    ]
    kg = mergeDicts(kg, sub_kg)

In [None]:
# Write the graph held in `kg` to toy_kg_70b.json as 4-space-indented JSON.
with open("toy_kg_70b.json", "w") as kg_file:
    kg_file.write(json.dumps(kg, indent=4))

In [None]:
# Build a knowledge graph from the pages output_3.txt .. output_9.txt with the
# "llama3" model. The labels found so far are passed to the model with every
# request.
toy_data = Path("/home/user/large-disk/toy_data")
kg = dict()
labels = []
data = []
for page_number in range(3, 10):
    with open(toy_data / f"output_{page_number}.txt", "r", encoding="utf-8") as f:
        text = f.readlines()
        text = " ".join(text)
        data = text.rstrip()
    stream = client.chat(
        model="llama3",
        messages=[
            {"role": "system", "content": sys_prompt_simple},
            {"role": "user", "content": f"Data: {data}\nTypes: {labels}"},
        ],
        stream=False,
    )
    # The answer is flattened to one line before parsing.
    ans = [stream['message']['content'].replace('\n', ' ')]
    sub_kg = getNodesAndRelationshipsFromResult(ans)
    # Add only labels not seen on earlier pages so the "Types:" list sent to
    # the model does not fill up with repeats.
    labels = labels + [
        label for label in getTypesFromDict(sub_kg) if label not in labels
    ]
    kg = mergeDicts(kg, sub_kg)

In [None]:
# Write the graph held in `kg` to toy_kg.json as 4-space-indented JSON.
with open("toy_kg.json", "w") as kg_file:
    kg_file.write(json.dumps(kg, indent=4))