In [74]:
%%capture

%pip install requirements.txt

In [84]:
import sys
print(sys.executable)

/Users/timchen2/Desktop/Knowledge-Graph-Chat-demo/venv/bin/python


In [85]:
import os
import re

from string import Template
import json
from neo4j import GraphDatabase
import glob # search for files that match a specific file pattern or name

from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
import requests



### 1. Configuration

In [86]:
# Load env variables
load_dotenv()

True

In [87]:
# Neo4j configuration & constraints
neo4j_url = os.getenv("NEO4J_URI")
neo4j_user = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

### 2. Helper Function Outline

##### Flow : LLM process unstructured data into entities&relationship -> Generate Cypher to construct Knowledge Graph

#### LLM model
- ##### Function to call the OpenAI API (better performance but need $$)
    - def process_gpt(optional)
- ##### Function to call the Ollama API (Opensource free LLM model can be installed on Local as REST API)
    - def process_ollama(file_prompt, system_msg)

##### Function to pre-process unstructureed data in order to return JSON-object of all the entities and relationships for building Knowledge Graph
- def extract_entities_relationships(folder, prompt_template)

##### Function to take JSON-object of entities and relationships and generate cypher query for creating those entities
- def generate_cypher(json_obj)

In [83]:
def process_ollama(file_prompt, system_msg):
    # Combine system message and user prompt
    combined_prompt = f"{system_msg}\n\nHuman: {file_prompt}\n\n Assistant:"
    
    # Ollama API endpoint
    url ="http://localhost:11434/api/generate"
    payload = {
        "model": "mistral-custom",
        "prompt": combined_prompt,
        "stream": False
    }

    try:
        response = requests.post(url, json=payload)
        response.raise_for_status() # Raise an HTTPError for bad responses

        result = response.json()
        nlp_results = result['response']
    except requests.RequestException as e:
        nlp_results = f"Error: {str(e)}"
    
    sleep(8) # Sleep for 8 seconds to avoid overloading the server - Rate Limiting
    return nlp_results

def fix_json(raw_json):
    # Remove all comments (both single-line and multi-line)
    raw_json = re.sub(r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 
                      lambda m: '' if m.group(0).startswith('/') else m.group(0), 
                      raw_json, flags=re.DOTALL | re.MULTILINE)
    
    # Replace unescaped apostrophes with escaped ones, but only within string values
    raw_json = re.sub(r'(?<!\\)"(.*?)(?<!\\)"', lambda m: '"{}"'.format(m.group(1).replace("'", "\\'")), raw_json)
    
    # Replace single quotes with double quotes
    raw_json = raw_json.replace("'", '"')
    
    # Replace None with null
    raw_json = raw_json.replace("None", "null")
    
    # Remove trailing commas in arrays and objects
    raw_json = re.sub(r',\s*}', '}', raw_json)
    raw_json = re.sub(r',\s*]', ']', raw_json)
    
    # Remove any remaining whitespace at the end of the JSON
    raw_json = raw_json.strip()
    
    try:
        # Try to parse the fixed JSON
        return json.loads(raw_json)
    except json.JSONDecodeError as e:
        print(f"JSON decoding error after fixing: {e}")
        print("Partially fixed JSON:")
        print(raw_json)
        
        # If it still fails, try to extract valid JSON using regex
        match = re.search(r'\{.*\}', raw_json, re.DOTALL)
        if match:
            try:
                return json.loads(match.group(0))
            except json.JSONDecodeError as e:
                print(f"Failed to extract valid JSON: {e}")
                return None
        return None

def extract_entities_relationships(folder, prompt_template):
    start = timer()
    files = glob.glob(f"./data/{folder}/*")

    system_msg = "You are a helpful IT-project and account management expert who is extremely skillful to extract information from documents and always return VALID JSON. YOU ALWAY RETURN VALID JSON"
    
    print(f"Running pipeline for {len(files)} files in {folder} folder.")

    results = []
    for i, file in enumerate(files):
        print(f"\nProcessing file {i+1}/{len(files)}: {file} .")
        try:
            with open(file, 'r', encoding='utf-8') as f:
                # Read the file and remove any trailing whitespaces
                text = f.read().strip()
                # Replace the template with the actual text
                prompt = Template(prompt_template).substitute(ctext=text)
                result = process_ollama(prompt, system_msg = system_msg)
                
                # edge case for empty response
                if not result.strip():
                    print(f"Warning🌝: Empty response for file {file}")
                    continue
                
                # more tracker to track the progress
                print("Attemping to parse JSON...")
                try:
                    parsed_result = json.loads(result)
                    results.append(parsed_result)
                    print("JSON parsed successfully")
                
                except json.JSONDecodeError as json_err:
                    print(f"JSON parsing error: {json_err}")
                    print("Full raw result:")
                    print(result)
                    print("\nAttempting to fix JSON...")
                    fixed_result = fix_json(result)
                    if fixed_result:
                        results.append(parsed_result)
                        print("Fixed JSON successfully")
                    else:
                        print("Failed to fix JSON")
                        print("Full raw result:")
                        print(fixed_result)
        except Exception as e:
            print(f"Error processing {file} : {str(e)}")
    
    end = timer()
    print(f"\nPipeline completed in {end-start} seconds")
    return results
# Function to take JSON-object of entities and relationships. Generate valid cypher statements(entities&relationship) in order to load them into Neo4j database.
def generate_cypher(json_obj):
    en_statements = []
    rel_statements = []

    en_label_dict = {}

    # 
    for i, obj in enumerate(json_obj):
        print(f"Generating Cypher for file {i + 1} of {len(json_obj)}")

        # Loop through entities and create cypher statements for each entity(node)
        for entity in obj['entities']:
            label = entity['label']
            id = entity['id']
            id = id.replace("-", "").replace("_", "")
            # only extract properties that are not lable or id
            properties = {k: v for k, v in entity.items() if k not in ['label', 'id']}

            # MERGE - create if not exist, if exist, it will update
            # The reason why don't use CREATE is to avoid creating duplicate nodes
            cypher = f'MERGE (n:{label} {{id: "{id}" }})'
            if properties:
                props_str = ", ".join(
                    [f'n.{key} = "{value}"' for key, value in properties.items()]
                )
                cypher += f" ON CREATE SET {props_str}"
            
            en_statements.append(cypher)
            # This is entity for relationship - key=id: value=label
            en_label_dict[id] = label

        # Loop through relationships and create cypher statements by merging the ndoes and creating the relationships
        for rel in obj["relationships"]:
            source_id, rel_type, target_id = rel.split('|')
            source_id = source_id.replace("-", "").replace("_", "")
            target_id = target_id.replace("-", "").replace("_", "")

            # test to find whether the entities are in the en_label_dict
            try:
                source_label = en_label_dict[source_id]
                target_label = en_label_dict[target_id]
            except KeyError as e:
                print(f"Error: Entity not found in en_label_dict: {e}")
                print(f"Relationship: {rel}")
                print("Skipping this relationship...🚬")
                continue

            cypher = f"MERGE (a:{source_label} {{id: '{source_id}'}}) MERGE (b:{target_label} {{id: '{target_id}'}}) MERGE (a)-[:{rel_type}]->(b)" 
            rel_statements.append(cypher)      

    with open(f"cypher.txt", "w") as outfile:
        outfile.write("\n".join(en_statements + rel_statements))
    
    return en_statements + rel_statements


# Function to bring all steps together
def run_pipeline(folders):
    # Extracting the entites and relationships from each folder, append into one json_obj
    entities_relationships =[]

    # Unstructured raw data -> JSON-object
    for key, value in folders.items():
        entities_relationships.extend(extract_entities_relationships(key, value))
    
    # Generate and execute cypher statements
    # JSON-object -> Cypher statements
    cypher_statements = generate_cypher(entities_relationships)
    for _, stmts in enumerate(cypher_statements):
        print(f"Executing Cypher statements for file {_ + 1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stmts)
        except Exception as e:
            with open("falied_cypher.txt", "w") as f:
                f.write(f"{stmts} - Exception: {e}\n")



#### Test generate_cypher function

In [88]:
with open('test_slack_msg.json', 'r') as f:
    json_obj = json.load(f)

cypher = generate_cypher(json_obj)

Generating Cypher for file 1 of 7
Error: Entity not found in en_label_dict: 'noahWilson'
Relationship: noahWilson|SENT|6ec813a60
Skipping this relationship...🚬
Error: Entity not found in en_label_dict: 'aidenLewis'
Relationship: aidenLewis|SENT|4203972ad
Skipping this relationship...🚬
Error: Entity not found in en_label_dict: 'benjaminWhite'
Relationship: benjaminWhite|SENT|f230a0f7a
Skipping this relationship...🚬
Error: Entity not found in en_label_dict: 'sophiaAnderson'
Relationship: sophiaAnderson|SENT|843bab5fa
Skipping this relationship...🚬
Error: Entity not found in en_label_dict: 'sophiaAnderson'
Relationship: sophiaAnderson|SENT|408133de0
Skipping this relationship...🚬
Generating Cypher for file 2 of 7
Error: Entity not found in en_label_dict: 'noahWilson'
Relationship: noahWilson|SENT|6ec813a60
Skipping this relationship...🚬
Error: Entity not found in en_label_dict: 'aidenLewis'
Relationship: aidenLewis|SENT|4203972ad
Skipping this relationship...🚬
Error: Entity not found in e

#### Test Extract function can successfully extract entities and relationship from files in data folder

In [73]:
project_prompt_template = """
From the Project Brief below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Project entity under `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Project',id:string,name:string;summary:string //Project mentioned in the brief; `id` property is the full name of the project, in lowercase, with no capital letters, special characters, spaces or hyphens; Contents of original document must be summarized inside 'summary' property
    label:'Technology',id:string,name:string //Technology Entity; `id` property is the name of the technology, in camel-case. Identify as many of the technologies used as possible
    label:'Client',id:string,name:string;industry:string //Client that the project was done for; `id` property is the name of the Client, in camel-case; 'industry' is the industry that the client operates in, as mentioned in the project brief.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    project|USES_TECH|technology 
    project|HAS_CLIENT|client


3. The output should look like :
{
    "entities": [{"label":"Project","id":string,"name":string,"summary":string}],
    "relationships": ["projectid|USES_TECH|technologyid"]
}

Case Sheet:
$ctext
"""

people_prompt_template = """From the list of people below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that the data is about. `id` property is the name of the person, in camel-case. 'name' is the person's name, as spelled in the text.
    label:'Project',id:string,name:string;summary:string //Project mentioned in the profile; `id` property is the full lowercase name of the project, with no capital letters, special characters, spaces or hyphens.
    label:'Technology',id:string,name:string //Technology Entity, as listed in the "skills"-section of every person; `id` property is the name of the technology, in camel-case.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    person|HAS_SKILLS|technology 
    project|HAS_PEOPLE|person


3. The output should look like :
{
    "entities": [{"label":"Person","id":string,"name":string}],
    "relationships": ["projectid|HAS_PEOPLE|personid"]
}

Case Sheet:
$ctext
"""

# result = extract_entities_relationships('people_profiles', project_prompt_template)
# with open('test_pople_llama3.json', 'w') as f:
#     json.dump(result, f)

Running pipeline for 3 files in people_profiles folder
Extracting entities and relationships for ./data/people_profiles/people-profiles1.md
Error processing ./data/people_profiles/people-profiles1.md: [Errno 63] File name too long: 'Here is the extracted information in JSON format:\n\n```json\n{\n  "entities": [\n    {\n      "label": "Project",\n      "id": "betahealth-secure-healthcare-data-analytics-platform-on-azure",\n      "name": "BetaHealth Secure Healthcare Data Analytics Platform on Azure",\n      "summary": "A project for BetaHealth that involves secure healthcare data analytics platform on Azure."\n    },\n    {\n      "label": "Project",\n      "id": "gammatech-smart-logistics-platform-on-azure",\n      "name": "GammaTech Smart Logistics Platform on Azure",\n      "summary": "A project for GammaTech that involves smart logistics platform on Azure."\n    },\n    {\n      "label": "Project",\n      "id": "alphacorp-aws-powered-supply-chain-optimization-platform",\n      "nam

### Defining Prompts

In [89]:
project_prompt_template = """
From the Project Brief below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Project entity under `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Project',id:string,name:string;summary:string //Project mentioned in the brief; `id` property is the full name of the project, in lowercase, with no capital letters, special characters, spaces or hyphens; Contents of original document must be summarized inside 'summary' property
    label:'Technology',id:string,name:string //Technology Entity; `id` property is the name of the technology, in camel-case. Identify as many of the technologies used as possible
    label:'Client',id:string,name:string;industry:string //Client that the project was done for; `id` property is the name of the Client, in camel-case; 'industry' is the industry that the client operates in, as mentioned in the project brief.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    project|USES_TECH|technology 
    project|HAS_CLIENT|client


3. The output should look like :
{
    "entities": [{"label":"Project","id":string,"name":string,"summary":string}],
    "relationships": ["projectid|USES_TECH|technologyid"]
}

Case Sheet:
$ctext
"""

people_prompt_template = """From the list of people below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that the data is about. `id` property is the name of the person, in camel-case. 'name' is the person's name, as spelled in the text.
    label:'Project',id:string,name:string;summary:string //Project mentioned in the profile; `id` property is the full lowercase name of the project, with no capital letters, special characters, spaces or hyphens.
    label:'Technology',id:string,name:string //Technology Entity, as listed in the "skills"-section of every person; `id` property is the name of the technology, in camel-case.
    
2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    person|HAS_SKILLS|technology 
    project|HAS_PEOPLE|person


3. The output should look like :
{
    "entities": [{"label":"Person","id":string,"name":string}],
    "relationships": ["projectid|HAS_PEOPLE|personid"]
}

Case Sheet:
$ctext
"""

slack_prompt_template = """
From the list of messages below, extract the following Entities & relationships described in the mentioned format 
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that sent the message. `id` property is the name of the person, in camel-case; for example, "michaelClark", or "emmaMartinez"; 'name' is the person's name, as spelled in the text.
    label:'SlackMessage',id:string,text:string //The Slack-Message that was sent; 'id' property should be the message id, as spelled in the reference. 'text' property is the text content of the message, as spelled in the reference
    
3. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    personid|SENT|slackmessageid

The output should look like :
{
    "entities": [{"label":"SlackMessage","id":string,"text":string}],
    "relationships": ["personid|SENT|messageid"]
}

Case Sheet:
$ctext
""" 

#### Run the pipeline for all files in data folder

In [90]:
folders = {
    "people_profiles": people_prompt_template,
    "project_briefs": project_prompt_template,
    "slack_messages": slack_prompt_template
}

run_pipeline(folders)

Running pipeline for 3 files in people_profiles folder.

Processing file 1/3: ./data/people_profiles/people-profiles1.md .
Attemping to parse JSON...
JSON parsed successfully

Processing file 2/3: ./data/people_profiles/people-profiles3.md .
Attemping to parse JSON...
JSON parsed successfully

Processing file 3/3: ./data/people_profiles/people-profiles2.md .
Attemping to parse JSON...
JSON parsed successfully

Pipeline completed in 179.86132937500952 seconds
Running pipeline for 11 files in project_briefs folder.

Processing file 1/11: ./data/project_briefs/BetaHealth Telemedicine Platform on Microsoft Azure.md .
Attemping to parse JSON...
JSON parsed successfully

Processing file 2/11: ./data/project_briefs/DeltaEdu Virtual Classroom Platform on AWS.md .
Attemping to parse JSON...
JSON parsed successfully

Processing file 3/11: ./data/project_briefs/GammaTech Autonomous Fleet Management System on Azure.md .
Attemping to parse JSON...
JSON parsed successfully

Processing file 4/11: ./d