# Stage 1 : Entity & RelationShips Extraction

In [None]:
#Import required libraries

from expertai.settings import OPENAI_API_KEY
import os
from datetime import datetime
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
import json
import string
import random

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_aws import ChatBedrock
from langchain_core.prompts import PromptTemplate


# Warning control
import warnings
warnings.filterwarnings("ignore")

NEO4J_URI = 'neo4j+s://7fc9791f.databases.neo4j.io'
NEO4J_USERNAME = 'neo4j'
NEO4J_PASSWORD = 'YCCaF7BS6OGkTzKePeFIJ59l4Ai8HsaOntX4lAWz4XM'
NEO4J_DATABASE = 'neo4j'

#### Schema of Knowledge Graph

Node properties:

Person {type: STRING, uuid: STRING, name: STRING, occupation: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Company {type: STRING, uuid: STRING, name: STRING, country: STRING, creation_date: STRING, last_modified: STRING, color: STRING}

Concept {type: STRING, uuid: STRING, name: STRING, synonyms: LIST, description: STRING, textEmbedding: LIST, creation_date: STRING, last_modified: STRING, color: STRING}
Metric {type: STRING, uuid: STRING, name: STRING, description: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Formula {type: STRING, uuid: STRING, name: STRING, formula: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Rule {type: STRING, uuid: STRING, name: STRING, description: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Condition {type: STRING, uuid: STRING, name: STRING, condition: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Example {type: STRING, uuid: STRING, name: STRING, example: STRING, creation_date: STRING, last_modified: STRING, color: STRING}

Chunk {type: STRING, uuid: STRING, name: STRING, data: STRING, textEmbedding: LIST, source: STRING, creation_date: STRING, last_modified: STRING, color: STRING}


Relationship properties:

The relationships are the following:
(:Person)-[:WORKS_AT]->(:Company),
(:Concept)-[:IS_RELATED_TO]->(:Concept),
(:Concept)-[:IS_RELATED_TO]->(:Metric),
(:Metric)-[:FORMULA]->(:Formula),
(:Concept)-[:USES_RULE]->(:Rule),
(:Rule)-[:IS_CONDITIONED]->(:Condition),
(:Concept)-[:EXAMPLE]->(:Example),

#### Reduced Schema for LLM
Person {type: STRING, name: STRING, occupation: STRING}
Company {type: STRING, name: STRING}

Concept {type: STRING, name: STRING, synonyms: LIST, description: STRING}
Metric {type: STRING, name: STRING, description: STRING}
Formula {type: STRING, name: STRING, formula: STRING}
Rule {type: STRING, name: STRING, description: STRING}
Condition {type: STRING, name: STRING, condition: STRING}
Example {type: STRING, name: STRING, example: STRING}


#### Connect to Neo4J

In [2]:
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

llm_light = ChatBedrock(
        model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
        model_kwargs=dict(temperature=0, max_tokens=4096),
    )

llm = ChatBedrock(
        model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
        model_kwargs=dict(temperature=0, max_tokens=4096),
    )

#Probar los de amazon Nova si no haiku3.5

In [None]:
# Drop all constraints
constraints = kg.query("SHOW CONSTRAINTS")
for constraint in constraints:
    constraint_name = constraint["name"]
    print(f"Dropping constraint: {constraint_name}")
    kg.query(f"DROP CONSTRAINT {constraint_name}")

# Drop all indexes
indexes = kg.query("SHOW INDEXES")
for index in indexes:
    index_name = index["name"]
    print(f"Dropping index: {index_name}")
    kg.query(f"DROP INDEX {index_name}")

print("All schema properties (constraints and indexes) have been deleted.")

Dropping constraint: unique_company_name
Dropping constraint: unique_concept_name
Dropping constraint: unique_condition_name
Dropping constraint: unique_example_name
Dropping constraint: unique_formula_name
Dropping constraint: unique_metric_name
Dropping constraint: unique_person_name
Dropping constraint: unique_rule_name
Dropping constraint: unqiue_chunk_name
Dropping index: vector_chunks
Dropping index: vector_concepts
All schema properties (constraints and indexes) have been deleted.


In [None]:
#Uniqueness constraint to avoid duplicate nodes
def create_uniqueness_constraints(kg):
    kg.query("""
    CREATE CONSTRAINT unique_person_name IF NOT EXISTS 
        FOR (p:Person) REQUIRE p.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_company_name IF NOT EXISTS 
        FOR (c:Company) REQUIRE c.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_concept_name IF NOT EXISTS 
        FOR (co:Concept) REQUIRE co.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_metric_name IF NOT EXISTS 
        FOR (m:Metric) REQUIRE m.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_formula_name IF NOT EXISTS 
        FOR (f:Formula) REQUIRE f.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_rule_name IF NOT EXISTS 
        FOR (r:Rule) REQUIRE r.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_condition_name IF NOT EXISTS 
        FOR (cond:Condition) REQUIRE cond.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_example_name IF NOT EXISTS 
        FOR (e:Example) REQUIRE e.name IS UNIQUE;
    """)

    kg.query("""
    CREATE CONSTRAINT unique_chunk_name IF NOT EXISTS 
        FOR (e:Chunk) REQUIRE e.name IS UNIQUE;
    """)

[]

In [None]:
#Query to delete the graph
#kg.query("MATCH (n) DETACH DELETE n")


[]

In [234]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkSeqId: INTEGER, uuid: STRING, source: STRING, content: STRING, textEmbedding: LIST, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING}
Company {uuid: STRING, name: STRING, type: STRING, country: STRING, creation_date: STRING, last_modified: STRING, color: STRING}
Concept {uuid: STRING, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING, color: STRING, synonyms: LIST, description: STRING, descriptionEmbedding: LIST}
Metric {uuid: STRING, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING, color: STRING, description: STRING}
Formula {uuid: STRING, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING, color: STRING, formula: STRING}
Rule {uuid: STRING, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING, color: STRING, rule_description: STRING}
Condition {uuid: STRING, name: STRING, type: STRING, creation_date: STRING, last_modified: STRING, color: STRIN

#### Get the Documents 

In [235]:
roi_file_path = '/data/expertai/expertai/knowledge/knowledgebase/expertai.txt'

with open(roi_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

### Chunk the document

In [241]:
def generate_uuid():
    letters_and_digits = string.ascii_letters + string.digits
    def random_segment(length):
        return ''.join(random.choice(letters_and_digits) for _ in range(length))
    
    uuid_like = f"{random_segment(8)}-{random_segment(4)}-{random_segment(4)}-{random_segment(4)}-{random_segment(12)}"
    return uuid_like

In [242]:
#Chunk Documents
def chunk_text(file, name):
    """Chunks and generate metadata for a txt file

    Parameters
    ----------
    file : str
        The content of the file
    name : str
        The name of the file

    Returns
    ---------
    """
    chunks_with_metadata = [] #use this to accumulate chunk records
    
    #LangChain helper function for chunking
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000, #2000 characters
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
    ) #Antropic Contextual RAG sugiere Mario ContextualChunking

    item_text_chunks = text_splitter.split_text(file) # split the text into chunks
    chunk_seq_id = 0

    for chunk in item_text_chunks: 
        # finally, construct a record with metadata and the chunk text
        chunks_with_metadata.append({
            'type': 'chunk',
            'content': chunk, 
            # metadata from looping...
            'chunkSeqId': chunk_seq_id, 
            'name' : f'{name}-chunk{chunk_seq_id:04d}', #To identify each unique chunk
            # constructed metadata...
            'uuid': str(generate_uuid()),
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            # metadata from file...
            'source': f'{name}.txt',
            })
        chunk_seq_id += 1
    print(f'\tSplit into {chunk_seq_id} chunks')
    return chunks_with_metadata

In [243]:
#Change the filename on each iteration
chunks = chunk_text(content, 'expertai') 

	Split into 1 chunks


In [245]:
chunk_names = [chunk['name'] for chunk in chunks]

In [246]:
#Add each chunk node to the graph
def add_chunks(chunks, kg):
    """
    Add Chunk Nodes to Neo4J Knowledge Graph and Create NEXT Relationships.

    Parameters
    ----------
    chunks : list
        List of chunks with metadata
    kg : str
        Neo4j Driver

    Returns
    -------
    str
        Success string
    """
    # Sort chunks based on chunkSeqId to ensure proper order
    chunks = sorted(chunks, key=lambda x: x['chunkSeqId'])

    previous_chunk_seqid = None

    for chunk in chunks:
        chunk_type = chunk['type']
        chunk_name = chunk['name']

        if chunk_type == 'chunk':

            chunk_with_properties = {
                'type': chunk_type,
                'uuid': chunk['uuid'],
                'name': chunk_name,
                'content': chunk['content'],
                'chunkSeqId': chunk['chunkSeqId'],
                'creation_date': chunk['creation_date'],
                'last_modified': chunk['last_modified'],
                'source': chunk['source'],
            }

            merge_chunk_node_query = """
                MERGE(mergedChunk:Chunk {name: $chunkParam.name})
                    ON CREATE SET 
                        mergedChunk.type = $chunkParam.type,
                        mergedChunk.uuid = $chunkParam.uuid,
                        mergedChunk.content = $chunkParam.content, 
                        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
                        mergedChunk.creation_date = $chunkParam.creation_date, 
                        mergedChunk.last_modified = $chunkParam.last_modified,
                        mergedChunk.source = $chunkParam.source
                RETURN mergedChunk
            """
            # Add the chunk node to the graph
            kg.query(merge_chunk_node_query, params={'chunkParam': chunk_with_properties})

            # Create a NEXT relationship if there is a previous chunk
            if previous_chunk_seqid is not None:
                create_next_relationship_query = """
                    MATCH (prevChunk:Chunk {chunkSeqId: $prevSeqid})
                    MATCH (currentChunk:Chunk {chunkSeqId: $currentSeqid})
                    WHERE prevChunk.source = currentChunk.source
                    MERGE (prevChunk)-[:NEXT]->(currentChunk)
                """
                kg.query(create_next_relationship_query, 
                         params={'prevSeqid': previous_chunk_seqid, 'currentSeqid': chunk['chunkSeqId']})

            # Update previous_chunk_uuid to the current chunk's UUID
            previous_chunk_seqid = chunk['chunkSeqId']

        print(f'Entity {chunk_name} of type {chunk_type} added to the graph')

add_chunks(chunks, kg)

Entity expertai-chunk0000 of type chunk added to the graph


In [247]:
def chunk_relationships(chunk_name, node_name, kg):
    """
    Add Chunk Relationships to the Neo4J Knowledge Graph.

    Parameters
    ----------
    chunk_name : str
        The name of the chunk
    node_name : str
        The name of the Concept or Example node
    kg : Neo4j
        The connected Neo4j graph instance.

    Returns
    -------
    None
    """
    source = chunk_name
    target = node_name
    rel_type = "SOURCE"

        # Cypher query to create the relationship
    create_relationship_query = f"""
        MATCH (a {{name: $sourceName}})
        MATCH (b {{name: $targetName}})
        MERGE (a)-[r:{rel_type}]->(b)
        RETURN r
    """

    # Execute the query
    kg.query(
        create_relationship_query,
        params={
                'sourceName': source,
                'targetName': target,
        }
    )
    print(f"Added relationship: ({source})-[:{rel_type}]->({target})")

### Identify Entities with LLM

In [248]:
def Entity_extractor(
    chunk: str = "",
    filename: str = "",
    schema: str = "",
    llm: "RunnableChain" = None,
) -> str:
    
    """
    In JSON format outputs relevant entities and its properties.

    Parameters
    ----------.
    chunk : str
        Data of the chunk
    filename : str
        name of the source filename (Principal Entity)
    schema : str
        The actual schema of the knowledge graph
    Returns
    -------
    str
        JSON of the recognized Entities
    """

    if llm is None:
        raise ValueError("llm must be provided")

    persona = (
        """You are an experienced node entity recognizer.
        You will be provided with a chunk of data corresponding to a txt file.
        Your taks is to recognize most relevant entities inside the chunk of data
        based on the graph schema provided. After identifying the entities you must
        fill the necessary properties for each node. 

        You must output ONLY the JSON file
        """
    )


    context = (
        f"""
        <data chunk>
        {chunk}
        </data chunk>
        """
        if chunk
        else ""
    )

    best_practices = """
    Here’s a comprehensive list of rules for finding the node entities and populating its properties:

    1. **The filename is the source document. Always make a concept entity with the same name.
    2. ** Do not add information of data that is not inside the chunk of data.
    3. ** When an example node is identified, add to its example property every step do not try to summarize the example.
    4. ** For concept nodes if there are different synonyms in the chunk of data that refer to the same, use the most used as the name 
          property and the others
          add them to the list of synonyms.
    5. ** Never add node types not declared on the schema.
    6. ** Dont change the language of data in the chunk, let it as it is. 
    7. ** At example nodes, include all steps explicitly as they appear in the chunk.
    8. ** Each example must retain exact numerical values and computations.
    9. ** Identify all explicit rules, including those prefaced by 'Always', 'Never', or 'Do not', and ensure they are included as `Rule` nodes.
    10. ** Extract all explicit and implicit rules mentioned in the chunk of data.
 """

    user_prompt = """
        <role>
        {ROLE}
        </role>
        <best_practices>
        {BEST_PRACTICES}
        </best_practices>
        <source>
        {FILENAME}
        </source>
        {context}
        <schema>
        {SCHEMA}
        </schema>

    """
    prompt = PromptTemplate(
        template=user_prompt,
        input_variables=["context"],
        partial_variables={
            "BEST_PRACTICES": best_practices,
            "ROLE": persona,
            "FILENAME": filename,
            "SCHEMA": schema,
        },
    )

    chain = prompt | llm 
        
    response = chain.invoke({"context": context}).content.strip()

    # Validate and format JSON
    try:
        json_response = json.loads(response)
        return json.dumps(json_response, indent=4)  # Prettify JSON output
    
    except json.JSONDecodeError:
        raise ValueError("The LLM response is not valid JSON.")
    

In [249]:
schema = """
Person {
    type: STRING,
    name: STRING,
    occupation: STRING
}

Company {
    type: STRING,
    name: STRING,
    description: STRING
}

Concept {
    type: STRING,
    name: STRING,
    synonyms: LIST,
    description: STRING
}

Metric {
    type: STRING,
    name: STRING,
    description: STRING
}

Formula {
    type: STRING,
    name: STRING,
    formula: STRING
}

Rule {
    type: STRING,
    name: STRING,
    rule_description: STRING
}

Condition {
    type: STRING,
    name: STRING,
    condition: STRING
}

Example {
    type: STRING,
    name: STRING,
    example: STRING
}
"""


In [250]:
nodes = Entity_extractor(content, "expertai.txt", schema, llm=llm)

In [251]:
print(nodes)

{
    "nodes": [
        {
            "type": "Concept",
            "name": "expertai",
            "synonyms": [
                "asistente experto de Kuona"
            ],
            "description": "Nueva herramienta de la familia Kuona especializada en an\u00e1lisis de promociones y revenue management"
        },
        {
            "type": "Company",
            "name": "Kuona",
            "description": "Empresa que desarrolla herramientas de an\u00e1lisis de promociones y revenue management"
        },
        {
            "type": "Company",
            "name": "Arca Continental M\u00e9xico",
            "description": "Cliente inicial del asistente experto de Kuona"
        },
        {
            "type": "Metric",
            "name": "ROI",
            "description": "Retorno de inversi\u00f3n en an\u00e1lisis de promociones"
        },
        {
            "type": "Concept",
            "name": "elasticidad",
            "synonyms": [],
            "description": "M\u

### Create Relationships

In [255]:
def Relationship_extractor(
    Entities: str = "",
    filename: str = "",
    schema: str = "",
    llm: "RunnableChain" = None,
) -> str:
    
    """
    In JSON format outputs relevant relationships between entities.

    Parameters
    ----------.
    Entities : str
        Entities extracted form a chunk of data
    filename : str
        name of the source filename (Principal Entity)
    schema : str
        The actual schema of the knowledge graph
    Returns
    -------
    str
        JSON of relationships 
    """

    if llm is None:
        raise ValueError("llm must be provided")

    persona = (
        """You are an experienced Entity relationships recognizer.
        You will be provided with entities extracted form a chunk of data corresponding to a txt file.
        Your taks is to recognize relevant relationships between this entities
        based on the graph schema provided. After identifying the relationships 
        You must output ONLY the JSON file
        """
    )


    context = (
        f"""
        <Entities>
        {Entities}
        </Entities>
        """
        if Entities
        else ""
    )

    best_practices = """
    Here’s a comprehensive list of rules for finding the nodes relationships:

    1. ** Use only relationships defined in the schema.
    2. ** Dont add any relationship that is NOT DEFINED in schema.
    3. ** Source and Target nodes must be only the ones defined in the Entities context.
    4. ** To identify the nodes use only the "name" property of each one 
    5. ** An entity can't Relate back to itself
    6. ** A Source entity do not have to relate to a Destiny entity with the same name.
    7. ** Source and Destiny entities are defines on the SCHEMA. Do Not Mix them. 
    8. ** Example and Rule entity nodes must be related at least to one concept node.

    Example of a good output:

    {
    "relationships": [
        {
            "source": "Arca Continental",
            "type": "IS_RELATED_TO",
            "target": "Arca Mexico"
        },
        {
            "source": "Arca Mexico",
            "type": "IS_RELATED_TO",
            "target": "The Coca-Cola Company"
        },
        {
            "source": "sellin",
            "type": "IS_RELATED_TO",
            "target": "traditional channel"
        },
        {
            "source": "sellin",
            "type": "IS_RELATED_TO",
            "target": "client"
        },
        {
            "source": "Use sellin not sellout",
            "type": "USES_RULE",
            "target": "sellin"
        },
    ]
}

 """

    user_prompt = """
        <role>
        {ROLE}
        </role>
        <best_practices>
        {BEST_PRACTICES}
        </best_practices>
        <source>
        {FILENAME}
        </source>
        {context}
        <schema>
        {SCHEMA}
        </schema>

    """
    prompt = PromptTemplate(
        template=user_prompt,
        input_variables=["context"],
        partial_variables={
            "BEST_PRACTICES": best_practices,
            "ROLE": persona,
            "FILENAME": filename,
            "SCHEMA": schema,
        },
    )

    chain = prompt | llm 
        
    response = chain.invoke({"context": context}).content.strip()

    # Validate and format JSON
    try:
        json_response = json.loads(response)
        return json.dumps(json_response, indent=4)  # Prettify JSON output
    
    except json.JSONDecodeError:
        raise ValueError("The LLM response is not valid JSON.")

In [256]:
schema = """

The relationships are the following:

(:Person)-[:WORKS_AT]->(:Company),
(:Company)-[:IS_RELATED_TO]->(:Company),
(:Company)-[:IS_RELATED_TO]->(:Concept),
(:Concept)-[:IS_RELATED_TO]->(:Concept),
(:Concept)-[:IS_RELATED_TO]->(:Metric),
(:Metric)-[:FORMULA]->(:Formula),
(:Concept)-[:USES_RULE]->(:Rule),
(:Rule)-[:IS_CONDITIONED]->(:Condition),
(:Concept)-[:EXAMPLE]->(:Example),

"""

In [257]:
relationships = Relationship_extractor(nodes, "expertai.txt", schema, llm=llm_light)

In [258]:
relationships

'{\n    "relationships": [\n        {\n            "source": "Kuona",\n            "type": "IS_RELATED_TO",\n            "target": "expertai"\n        },\n        {\n            "source": "Kuona",\n            "type": "IS_RELATED_TO",\n            "target": "revenue management"\n        },\n        {\n            "source": "expertai",\n            "type": "IS_RELATED_TO",\n            "target": "Arca Continental M\\u00e9xico"\n        },\n        {\n            "source": "expertai",\n            "type": "IS_RELATED_TO",\n            "target": "revenue management"\n        },\n        {\n            "source": "revenue management",\n            "type": "IS_RELATED_TO",\n            "target": "elasticidad"\n        },\n        {\n            "source": "revenue management",\n            "type": "IS_RELATED_TO",\n            "target": "canibalizaci\\u00f3n"\n        },\n        {\n            "source": "revenue management",\n            "type": "IS_RELATED_TO",\n            "target": "vol\\

In [283]:
def modify_relationships(relationships, original_node, new_node):
    relationships = json.loads(relationships)
    for relationship in relationships['relationships']:
        if relationship["source"] == original_node:
            relationship["source"] = new_node
        if relationship["target"] == original_node:
            relationship["target"] = new_node
    return relationships

### Add Entities and Relationships to the Knowledge Graph

In [260]:
#Convert to JSON
nodes = json.loads(nodes)

In [261]:
# Now you can iterate over `json_data`
for node in nodes["nodes"]:
    print(node)
    node_type = node['type']
    print(node_type)


{'type': 'Concept', 'name': 'expertai', 'synonyms': ['asistente experto de Kuona'], 'description': 'Nueva herramienta de la familia Kuona especializada en análisis de promociones y revenue management'}
Concept
{'type': 'Company', 'name': 'Kuona', 'description': 'Empresa que desarrolla herramientas de análisis de promociones y revenue management'}
Company
{'type': 'Company', 'name': 'Arca Continental México', 'description': 'Cliente inicial del asistente experto de Kuona'}
Company
{'type': 'Metric', 'name': 'ROI', 'description': 'Retorno de inversión en análisis de promociones'}
Metric
{'type': 'Concept', 'name': 'elasticidad', 'synonyms': [], 'description': 'Métrica de análisis en promociones'}
Concept
{'type': 'Concept', 'name': 'canibalización', 'synonyms': [], 'description': 'Métrica de análisis en promociones'}
Concept
{'type': 'Concept', 'name': 'volúmenes incrementales', 'synonyms': [], 'description': 'Métrica de análisis en promociones'}
Concept
{'type': 'Concept', 'name': 'reve

### Deduplication

In [262]:

def deduplication_query(node_name, node_description):
    similarity_threshold = 0.9
    word_edit_distance = 5


    data = kg.query("""
    WITH genai.vector.encode($description, "OpenAI", {token: $openAiApiKey}) AS description_embedding

    CALL db.index.vector.queryNodes($index_name, $top_k, description_embedding)
    YIELD node, score

    WHERE score > toFloat($cutoff)
    AND (
        toLower(node.name) CONTAINS toLower($name)
        OR toLower($name) CONTAINS toLower(node.name)
        OR apoc.text.distance(toLower(node.name), toLower($name)) < $distance
    )

    RETURN node.name, node.description, score
    ORDER BY score DESC;

    """, params={'name': node_name,
                'description': node_description, 
                'openAiApiKey':OPENAI_API_KEY,
                'index_name':'vector_concepts', 
                'top_k': 3,
                'cutoff': similarity_threshold, 
                'distance': word_edit_distance})
    

    return data

In [263]:
def deduplication_llm(
    new_concept: str = "",
    similar_concepts: list = [],
    llm: "RunnableChain" = None,
) -> str:
    
    """
    LLM compares a list of very similar concept nodes with a new concept node identified previously.

    If LLM identifies new concept node as unique it will output the same name as it is.

    If LLM identifies a existing concept node as the same as new concept node, 
    new concept node name will be added to the list of synonyms of the existing concept node and 
    the output will be the name of the existing concept node.


    Parameters
    ----------.
    new_concept : str
        New Concept Entity identified 
    similar_concepts : list
        list of similar existing concept nodes previously extracted using vector search and Lebenshtein
    Returns
    -------
    str
        Name of the concept node 
    """

    if llm is None:
        raise ValueError("llm must be provided")

    persona = (
        """You are an experienced Entity Resolution recognizer.
        You will be provided with a new concept extracted previously from a document,
        and with a list of very similar concepts extracted from the knowledge graph.
        Based on name and the given description of each one you will define if the
        new concept provided is really a unique concept or if it should be added to
        the synonym list of an existing concept node. 
        
        After deciding You must output 
        ONLY the NAME of the concept.
        """
    )


    context = (
        f"""
        <List of Similar Existing Concept Nodes>
        {similar_concepts}
        </List of Similar Existing Concept Nodes>
        """
        if similar_concepts
        else ""
    )

    best_practices = """
    Here’s a comprehensive list of rules for deciding:

    1. **  If you identify the new concept node as unique you will output the same name of the concept as it is.
    2. ** If you identify a existing concept node as the same as new concept node, 
    new concept node name will be added to the list of synonyms of the existing concept node and 
    the output will be the name of the existing concept node.
    3. ** Take careful consideration on the description of each concept, and if they talk about the same concept.
    4. ** If the list of similar existing concept nodes is empty, then the output must be the name of the new concept.
    5. ** ONLY OUTPUT THE NAME OF THE CONCEPT NODE DONT EXPLAIN THE REASONING

    Example of a good output:

    <new_concept>
       node.name: Impacto de Canibalización, 
       node.description: El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros 
       productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los 
       clientes optaron por comprar el producto en promoción.
    </new_concept>

    <List of Similar Existing Concept Nodes>

    [{'node.name': 'Impacto de Canibalización', 
      'node.description': 'El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros 
      productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los 
      clientes optaron por comprar el producto en promoción.',
      'score': 0.996490478515625}, 
     {'node.name': 'Canibalización',
      'node.description': "La canibalización en el contexto de una promoción se refiere al fenómeno en el que una acción 
      promocional para un producto o servicio afecta negativamente las ventas de otros productos o servicios de la misma empresa. 
      En otras palabras, es cuando una promoción 'se come' las ventas de otros productos.", 
      'score': 0.9651641845703125}]

    </List of Similar Existing Concept Nodes>

    EXPECTED OUTPUT:

    Impacto de Canibalización


 """

    user_prompt = """
        <role>
        {ROLE}
        </role>
        <best_practices>
        {BEST_PRACTICES}
        </best_practices>
        <new_concept>
        {NEW_CONCEPT}
        </new_concept>
        {context}
    """
    prompt = PromptTemplate(
        template=user_prompt,
        input_variables=["context"],
        partial_variables={
            "BEST_PRACTICES": best_practices,
            "ROLE": persona,
            "NEW_CONCEPT": new_concept,
        },
    )

    chain = prompt | llm 
        
    response = chain.invoke({"context": context}).content.strip()

    return response

In [264]:
def add_synonym(concept, synonym, kg):
    change = kg.query("""
    WITH $synonym_to_add AS newSynonym
    MATCH (c:Concept {name: $concept_to_update})
        SET c.synonyms = CASE 
        WHEN c.synonyms IS NULL THEN [newSynonym] 
        ELSE apoc.coll.union(c.synonyms, [newSynonym]) 
    END
    RETURN c
    """, params={'synonym_to_add': synonym,
                'concept_to_update': concept})

In [265]:
def add_to_description(concept, addition, kg):
    change = kg.query("""
    WITH $string_to_add AS addition
    MATCH (c:Concept {name: $concept_to_update})
        SET c.description = CASE 
        WHEN c.description IS NULL THEN addition 
        ELSE c.description + ' ' + addition 
    END
    RETURN c
    """, params={'string_to_add': addition,
                 'concept_to_update': concept})

In [None]:
def add_entities(nodes, kg, chunks, relationships):
    """
    Add Node Entities to Neo4J Knowledge Graph.

    Parameters
    ----------
    nodes : json
        Entities extracted form a chunk of data
    kg : Neo4J
        Neo4J driver
    chunk: List
        List of chunk names retrieved from document
    relationships_ dict
        Json of the proposed relationships for the nodes
    Returns
    -------
    str
        Success string
    """

    for node in nodes['nodes']:

        node_type = node['type']
        node_name = node['name']

        if node_type == 'Person':

            node_with_properties = {
                'type': node_type,
                'uuid': generate_uuid(),
                'name': node['name'],
                'occupation': node['occupation'],
                'creation_date': str(datetime.now()),
                'last_modified': str(datetime.now()),
                'color': 'red',
            }

            merge_person_node_query = """
                MERGE(mergedPerson:Person {name: $personParam.name})
                    ON CREATE SET 
                        mergedPerson.type = $personParam.type,
                        mergedPerson.uuid = $personParam.uuid,
                        mergedPerson.occupation = $personParam.occupation, 
                        mergedPerson.creation_date = $personParam.creation_date, 
                        mergedPerson.last_modified = $personParam.last_modified,  
                        mergedPerson.color = $personParam.color 
                RETURN mergedPerson
                """
            #Only adding one node
            kg.query(merge_person_node_query, 
                    params={'personParam':node_with_properties})
        
        
        if node_type == 'Company':

            node_with_properties = {
                'type': node_type,
                'uuid': generate_uuid(),
                'name': node['name'],
                'country': 'Mexico',
                'description': node['description'],
                'creation_date': str(datetime.now()),
                'last_modified': str(datetime.now()),
                'color': 'orange',
            }

            merge_company_node_query = """
            MERGE(mergedCompany:Company {name: $companyParam.name})
                ON CREATE SET 
                    mergedCompany.type = $companyParam.type,
                    mergedCompany.uuid = $companyParam.uuid,
                    mergedCompany.country = $companyParam.country, 
                    mergedCompany.description = $companyParam.description, 
                    mergedCompany.creation_date = $companyParam.creation_date, 
                    mergedCompany.last_modified = $companyParam.last_modified,  
                    mergedCompany.color = $companyParam.color 
            RETURN mergedCompany
            """

            #Only adding one node
            kg.query(merge_company_node_query, 
                    params={'companyParam':node_with_properties})
            

        #Check for conept entity the vector Embedding property need to be applied 
        if node_type == 'Concept':
            node_description = node['description']

            similar_concepts = deduplication_query(node_name, node_description) #Get similar concept nodes from KG
            print(similar_concepts)

            concept = deduplication_llm(node, similar_concepts, llm=llm) #Outputs name of final concept node

            if concept != node_name:

                relationships = modify_relationships(relationships, node_name, concept) #Modify relationships
                add_synonym(concept, node_name, kg) #Add node name to the list of synonyms of the concept node
                add_to_description(concept, node_description, kg) #Concatenates New description to the existing concept
                #Pending to update the embedding

                print(f'Entity {node_name} identified as synonym of Entity {concept}')

                node_name = concept #For relating chunks with existing concept
                
            else:
                node_with_properties = {
                'type': node_type, 
                'uuid': generate_uuid(),
                'name': node['name'],
                'synonyms' : node['synonyms'], #Check if this is correct for a list type
                'description': node['description'],
                # constructed metadata...
                'creation_date': str(datetime.now()),
                'last_modified': str(datetime.now()),
                'color': 'blue',
                }

                merge_concept_node_query = """
                MERGE(mergedConcept:Concept {name: $conceptParam.name})
                    ON CREATE SET 
                        mergedConcept.type = $conceptParam.type,
                        mergedConcept.uuid = $conceptParam.uuid,
                        mergedConcept.synonyms = $conceptParam.synonyms, 
                        mergedConcept.description = $conceptParam.description, 
                        mergedConcept.creation_date = $conceptParam.creation_date, 
                        mergedConcept.last_modified = $conceptParam.last_modified,  
                        mergedConcept.color = $conceptParam.color 
                RETURN mergedConcept
                """

                #Only adding one node
                kg.query(merge_concept_node_query, 
                        params={'conceptParam':node_with_properties})
            
            #Add "SOURCE" Relationship to each corresponding chunk (Will probably be changed this on the future)
            for chunk in chunks:
                chunk_relationships(chunk, node_name, kg)

    
        if node_type == 'Metric':
            node_with_properties = {
            'type': node_type, 
            'uuid': generate_uuid(),
            'name': node['name'],
            'description': node['description'],
            # constructed metadata...
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            'color': 'purple',
            }

            merge_metric_node_query = """
            MERGE(mergedMetric:Metric {name: $metricParam.name})
                ON CREATE SET 
                    mergedMetric.type = $metricParam.type,
                    mergedMetric.uuid = $metricParam.uuid,
                    mergedMetric.description = $metricParam.description, 
                    mergedMetric.creation_date = $metricParam.creation_date, 
                    mergedMetric.last_modified = $metricParam.last_modified,  
                    mergedMetric.color = $metricParam.color 
            RETURN mergedMetric
            """

            #Only adding one node
            kg.query(merge_metric_node_query, 
                    params={'metricParam':node_with_properties})

        if node_type == 'Formula':
            node_with_properties = {
            'type': node_type, 
            'uuid': generate_uuid(),
            'name': node['name'],
            'formula' : node['formula'], 
            # constructed metadata...
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            'color': 'green',
            }


            merge_formula_node_query = """
            MERGE(mergedFormula:Formula {name: $formulaParam.name})
                ON CREATE SET 
                    mergedFormula.type = $formulaParam.type,
                    mergedFormula.uuid = $formulaParam.uuid,
                    mergedFormula.formula = $formulaParam.formula, 
                    mergedFormula.creation_date = $formulaParam.creation_date, 
                    mergedFormula.last_modified = $formulaParam.last_modified,  
                    mergedFormula.color = $formulaParam.color 
            RETURN mergedFormula
            """

            #Only adding one node
            kg.query(merge_formula_node_query, 
                    params={'formulaParam':node_with_properties})
        
        if node_type == 'Rule':
            node_with_properties = {
            'type': node_type, 
            'uuid': generate_uuid(),
            'name': node['name'],
            'rule_description' : node['rule_description'], 
            # constructed metadata...
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            'color': 'dark green',
            }

            merge_rule_node_query = """
            MERGE(mergedRule:Rule {name: $ruleParam.name})
                ON CREATE SET 
                    mergedRule.type = $ruleParam.type,
                    mergedRule.uuid = $ruleParam.uuid,
                    mergedRule.rule_description = $ruleParam.rule_description, 
                    mergedRule.creation_date = $ruleParam.creation_date, 
                    mergedRule.last_modified = $ruleParam.last_modified,  
                    mergedRule.color = $ruleParam.color 
            RETURN mergedRule
            """

            #Only adding one node
            kg.query(merge_rule_node_query, 
                    params={'ruleParam':node_with_properties})

        if node_type == 'Condition':
            node_with_properties = {
            'type': node_type, 
            'uuid': generate_uuid(),
            'name': node['name'],
            'condition' : node['condition'], 
            # constructed metadata...
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            'color': 'green',
            }

            merge_condition_node_query = """
            MERGE(mergedCondition:Condition {name: $conditionParam.name})
                ON CREATE SET 
                    mergedCondition.type = $conditionParam.type,
                    mergedCondition.uuid = $conditionParam.uuid,
                    mergedCondition.condition = $conditionParam.condition, 
                    mergedCondition.creation_date = $conditionParam.creation_date, 
                    mergedCondition.last_modified = $conditionParam.last_modified,  
                    mergedCondition.color = $conditionParam.color 
            RETURN mergedCondition
            """

            #Only adding one node
            kg.query(merge_condition_node_query, 
                    params={'conditionParam':node_with_properties})

        if node_type == 'Example':
            node_with_properties = {
            'type': node_type, 
            'uuid': generate_uuid(),
            'name': node['name'],
            'example' : node['example'], 
            # constructed metadata...
            'creation_date': str(datetime.now()),
            'last_modified': str(datetime.now()),
            'color': 'green',
            }

            merge_example_node_query = """
            MERGE(mergedExample:Example {name: $exampleParam.name})
                ON CREATE SET 
                    mergedExample.type = $exampleParam.type,
                    mergedExample.uuid = $exampleParam.uuid,
                    mergedExample.example = $exampleParam.example, 
                    mergedExample.creation_date = $exampleParam.creation_date, 
                    mergedExample.last_modified = $exampleParam.last_modified,  
                    mergedExample.color = $exampleParam.color 
            RETURN mergedExample
            """

            #Only adding one node
            kg.query(merge_example_node_query, 
                    params={'exampleParam':node_with_properties})
            
            for chunk in chunks:
                chunk_relationships(chunk, node_name, kg)

        
        print(f'Entity {node_name} of type {node_type} added to the graph')
       

add_entities(nodes, kg, chunk_names, relationships)


[]
Added relationship: (expertai-chunk0000)-[:SOURCE]->(expertai)
Entity expertai of type Concept added to the graph
Entity Kuona of type Company added to the graph
Entity Arca Continental México of type Company added to the graph
Entity ROI of type Metric added to the graph
[]
Added relationship: (expertai-chunk0000)-[:SOURCE]->(elasticidad)
Entity elasticidad of type Concept added to the graph
[{'node.name': 'Impacto de Canibalización', 'node.description': 'El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los clientes optaron por comprar el producto en promoción.', 'score': 0.91766357421875}]
Entity canibalización identified as synonym of Entity Impacto de Canibalización
Added relationship: (expertai-chunk0000)-[:SOURCE]->(Impacto de Canibalización)
Entity Impacto de Canibalización of type Concept added to the graph
[]
Added relationsh

In [278]:
#Convert to JSON
relationships = json.loads(relationships)

In [279]:
def add_relationships(relationships, kg):
    """
    Add Relationships to the Neo4J Knowledge Graph.

    Parameters
    ----------
    relationships : dict
        A dictionary containing the relationships to be added, 
        with each relationship defined by source, type, and target.
    kg : Neo4j
        The connected Neo4j graph instance.

    Returns
    -------
    None
    """
    for relationship in relationships['relationships']:
        source = relationship['source']
        target = relationship['target']
        rel_type = relationship['type']

        # Cypher query to create the relationship
        create_relationship_query = f"""
        MATCH (a {{name: $sourceName}})
        MATCH (b {{name: $targetName}})
        MERGE (a)-[r:{rel_type}]->(b)
        RETURN r
        """

        # Execute the query
        kg.query(
            create_relationship_query,
            params={
                'sourceName': source,
                'targetName': target,
            }
        )
        print(f"Added relationship: ({source})-[:{rel_type}]->({target})")


In [280]:
add_relationships(relationships, kg)

Added relationship: (Kuona)-[:IS_RELATED_TO]->(expertai)
Added relationship: (Kuona)-[:IS_RELATED_TO]->(revenue management)
Added relationship: (expertai)-[:IS_RELATED_TO]->(Arca Continental México)
Added relationship: (expertai)-[:IS_RELATED_TO]->(revenue management)
Added relationship: (revenue management)-[:IS_RELATED_TO]->(elasticidad)
Added relationship: (revenue management)-[:IS_RELATED_TO]->(canibalización)
Added relationship: (revenue management)-[:IS_RELATED_TO]->(volúmenes incrementales)
Added relationship: (revenue management)-[:IS_RELATED_TO]->(ROI)


### Vector Embeddings

In [None]:
#Tasks: 
#Generate the chunk nodes for each document - Complete
#Each Chunk node should be related to Concept and Example nodes - Complete
#Concept Nodes and Chunk Nodes should have a Vector embedding property for vector search - Pending

In [613]:
# Global constants
VECTOR_INDEX_NAME = 'vector_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [None]:
#Vector Index is called vector_chunks and will store embeddings for nodes labeled as Chunk in a property called textEmbedding
#Configuration is based on OpenAI Embedding Model
kg.query("""
         CREATE VECTOR INDEX `vector_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
""")

[]

In [614]:
#Add Embedding Property to the Nodes

kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.content, 
      "OpenAI", 
      {
        token: $openAiApiKey
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY} )

[]

In [573]:
#Vector Index is called vector_concepts and will store embeddings for nodes labeled as Concept in a property called descriptionEmbedding
#Configuration is based on OpenAI Embedding Model
kg.query("""
         CREATE VECTOR INDEX `vector_concepts` IF NOT EXISTS
          FOR (c:Concept) ON (c.descriptionEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

[]

In [42]:
#Add Embedding Property to the Nodes

kg.query("""
    MATCH (concept:Concept) WHERE concept.descriptionEmbedding IS NULL
    WITH concept, genai.vector.encode(
      concept.description, 
      "OpenAI", 
      {
        token: $openAiApiKey
      }) AS vector
    CALL db.create.setNodeVectorProperty(concept, "descriptionEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY} )

[]

### Busqueda por Concept Nodes

In [616]:
def neo4j_vector_search(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.name AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'index_name':'vector_concepts', 
                      'top_k': 3})
  return similar

In [617]:
search_results = neo4j_vector_search(
    'Cual es al elasticidad de mi promocion'
)

In [618]:
search_results

[{'score': 0.9372406005859375, 'text': 'Elasticidad'},
 {'score': 0.9013671875, 'text': 'accuracy'},
 {'score': 0.8984527587890625,
  'text': 'Cuadrantes de desempeño de promociones'}]

### Busqueda por Chunk Nodes

In [619]:
def neo4j_vector_search_chunks(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.name AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'index_name':'vector_chunks', 
                      'top_k': 3})
  return similar

In [622]:
search_results_chunks = neo4j_vector_search_chunks(
    'Que hace la empresa Arca Continental'
)

In [623]:
search_results_chunks

[{'score': 0.9392242431640625, 'text': 'company_info-chunk0000'},
 {'score': 0.9199676513671875, 'text': 'company_info-chunk0001'},
 {'score': 0.8665618896484375, 'text': 'elasticidad-chunk0002'}]

### Next Steps...

- Find a way for deduplication
- For new chunks if entities identified already exist try to concatenate only new data to the existing entity, avoid duplication
- For other type of nodes is there is a exact coincidence in KG, concatenate results

In [None]:
#WORKFLOW FOR SMART DEDUPLICATION

#Document is chunked - Completed
#Entity nodes with property are extracted with LLM - Completed
#Function that retrieve only Concept Nodes and their description - Completed
#Cypher Query that compares each concept node name and vector description with graph - Completed
#LLM that decides whether to concatenate new information with existing concept node or if it should be a new concept node


In [87]:
concept_nodes = []

for node in nodes['nodes']:

    node_type = node['type']
    node_name = node['name']

    if node_type == 'Concept':
        concept_nodes.append(node)

len(concept_nodes)

2

In [None]:
for concept in concept_nodes:
    node_name = concept['name']
    node_description = concept['description']

    similarity_threshold = 0.9
    word_edit_distance = 5


    data = kg.query("""
    WITH genai.vector.encode($description, "OpenAI", {token: $openAiApiKey}) AS description_embedding

    CALL db.index.vector.queryNodes($index_name, $top_k, description_embedding)
    YIELD node, score

    WHERE score > toFloat($cutoff)
    AND (
        toLower(node.name) CONTAINS toLower($name)
        OR toLower($name) CONTAINS toLower(node.name)
        OR apoc.text.distance(toLower(node.name), toLower($name)) < $distance
    )

    RETURN node.name, node.description, score
    ORDER BY score DESC;

    """, params={'name': node_name,
                'description': node_description, 
                'openAiApiKey':OPENAI_API_KEY,
                'index_name':'vector_concepts', 
                'top_k': 3,
                'cutoff': similarity_threshold, 
                'distance': word_edit_distance})
    

    print(f'concept node - {node_name}')
    print(data)

    
    


concept node - Impacto de Canibalización
[{'node.name': 'Impacto de Canibalización', 'node.description': 'El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los clientes optaron por comprar el producto en promoción.', 'score': 0.996490478515625}, {'node.name': 'Canibalización', 'node.description': "La canibalización en el contexto de una promoción se refiere al fenómeno en el que una acción promocional para un producto o servicio afecta negativamente las ventas de otros productos o servicios de la misma empresa. En otras palabras, es cuando una promoción 'se come' las ventas de otros productos.", 'score': 0.9651641845703125}]
concept node - Impacto de Overstock
[{'node.name': 'Impacto de Overstock', 'node.description': 'El impacto de overstock mide la pérdida causada por el exceso de inventario generado después de una promoción. Representa

In [None]:
#Check for common Concept Nodes using vector similarity and Levenstein distance

similarity_threshold = 0.9
word_edit_distance = 5

data = kg.query("""
MATCH (e:Concept)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('vector_concepts', 10, e.descriptionEmbedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1) 
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
RETURN combinedResult  
""", params={'cutoff': similarity_threshold, 'distance': word_edit_distance})
for row in data:
    print(row)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.vector.queryNodes`: Caused by: java.lang.NullPointerException: 'query' must not be null}

In [284]:
#Check for common Concept Nodes given as input the name of a potential node using vector similarity and Levenstein distance
#Encodes the description of the node generated by LLM


similarity_threshold = 0.9
word_edit_distance = 5

data = kg.query("""
WITH $conceptName AS conceptNameParam
MATCH (e:Concept {name: conceptNameParam})
CALL {
  WITH e
  CALL db.index.vector.queryNodes('vector_concepts', 10, e.descriptionEmbedding)
  YIELD node, score
  WITH node, score, e
  WHERE score > toFloat($cutoff)
    AND (toLower(node.name) CONTAINS toLower(e.name) 
         OR toLower(e.name) CONTAINS toLower(node.name)
         OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
    AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1) AS index
WITH results, index, results[index] AS result
WITH apoc.coll.sort(
  reduce(acc = result, index2 IN range(0, size(results)-1) |
    CASE WHEN index <> index2 AND size(apoc.coll.intersection(acc, results[index2])) > 0
         THEN apoc.coll.union(acc, results[index2])
         ELSE acc
    END
  )
) AS combinedResult
WITH distinct(combinedResult) AS combinedResult
WITH collect(combinedResult) AS allCombinedResults
UNWIND range(0, size(allCombinedResults)-1) AS combinedResultIndex
WITH allCombinedResults[combinedResultIndex] AS combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1) 
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
RETURN combinedResult

""", params={'cutoff': similarity_threshold, 'distance': word_edit_distance, 'conceptName': 'Canibalización'})
for row in data:
    print(row)



{'combinedResult': ['Canibalización', 'Impacto de Canibalización']}


In [141]:
similarity_threshold = 0.9
word_edit_distance = 5

concept_node = concept_nodes[0]
#node_name = concept_node['name']
#node_description = concept_node['description']

node_name = 'Canibalization'
node_description = 'In the context of promotions, "cannibalization" refers to a situation where a newly introduced promotion—or a discounted offer on a specific product—leads customers to shift their purchases away from other items or full-priced products that the company already sells. Essentially, rather than increasing overall sales, the promotion diverts existing demand from one area of the product lineup to another.'

data = kg.query("""
WITH genai.vector.encode($description, "OpenAI", {token: $openAiApiKey}) AS description_embedding

CALL db.index.vector.queryNodes($index_name, $top_k, description_embedding)
YIELD node, score

WHERE score > toFloat($cutoff)
  AND (
    toLower(node.name) CONTAINS toLower($name)
    OR toLower($name) CONTAINS toLower(node.name)
    OR apoc.text.distance(toLower(node.name), toLower($name)) < $distance
  )

RETURN node.name, node.description, score
ORDER BY score DESC;

""", params={'name': node_name,
             'description': node_description, 
              'openAiApiKey':OPENAI_API_KEY,
              'index_name':'vector_concepts', 
              'top_k': 3,
              'cutoff': similarity_threshold, 
              'distance': word_edit_distance})

print(data)

[{'node.name': 'Canibalización', 'node.description': "La canibalización en el contexto de una promoción se refiere al fenómeno en el que una acción promocional para un producto o servicio afecta negativamente las ventas de otros productos o servicios de la misma empresa. En otras palabras, es cuando una promoción 'se come' las ventas de otros productos.", 'score': 0.94403076171875}]


In [142]:
new_concept = (f'node.name: {node_name}, node.description: {node_description}')
new_concept

'node.name: Canibalization, node.description: In the context of promotions, "cannibalization" refers to a situation where a newly introduced promotion—or a discounted offer on a specific product—leads customers to shift their purchases away from other items or full-priced products that the company already sells. Essentially, rather than increasing overall sales, the promotion diverts existing demand from one area of the product lineup to another.'

In [143]:
def deduplication(
    new_concept: str = "",
    similar_concepts: list = [],
    llm: "RunnableChain" = None,
) -> str:
    
    """
    LLM compares a list of very similar concept nodes with a new concept node identified previously.

    If LLM identifies new concept node as unique it will output the same name as it is.

    If LLM identifies a existing concept node as the same as new concept node, 
    new concept node name will be added to the list of synonyms of the existing concept node and 
    the output will be the name of the existing concept node.


    Parameters
    ----------.
    new_concept : str
        New Concept Entity identified 
    similar_concepts : list
        list of similar existing concept nodes previously extracted using vector search and Lebenshtein
    Returns
    -------
    str
        Name of the concept node 
    """

    if llm is None:
        raise ValueError("llm must be provided")

    persona = (
        """You are an experienced Entity Resolution recognizer.
        You will be provided with a new concept extracted previously from a document,
        and with a list of very similar concepts extracted from the knowledge graph.
        Based on name and the given description of each one you will define if the
        new concept provided is really a unique concept or if it should be added to
        the synonym list of an existing concept node. 
        
        After deciding You must output 
        ONLY the NAME of the concept.
        """
    )


    context = (
        f"""
        <List of Similar Existing Concept Nodes>
        {similar_concepts}
        </List of Similar Existing Concept Nodes>
        """
        if similar_concepts
        else ""
    )

    best_practices = """
    Here’s a comprehensive list of rules for deciding:

    1. **  If you identify the new concept node as unique you will output the same name of the concept as it is.
    2. ** If you identify a existing concept node as the same as new concept node, 
    new concept node name will be added to the list of synonyms of the existing concept node and 
    the output will be the name of the existing concept node.
    3. ** Take careful consideration on the description of each concept, and if they talk about the same concept.
    4. ** If the list of similar existing concept nodes is empty, then the output must be the name of the new concept.
    5. ** ONLY OUTPUT THE NAME OF THE CONCEPT NODE DONT EXPLAIN THE REASONING

    Example of a good output:

    <new_concept>
       node.name: Impacto de Canibalización, 
       node.description: El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros 
       productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los 
       clientes optaron por comprar el producto en promoción.
    </new_concept>

    <List of Similar Existing Concept Nodes>

    [{'node.name': 'Impacto de Canibalización', 
      'node.description': 'El impacto de canibalización mide la pérdida causada por la disminución en ventas de otros 
      productos debido a una promoción. Representa lo que se dejó de percibir de productos no promocionados cuando los 
      clientes optaron por comprar el producto en promoción.',
      'score': 0.996490478515625}, 
     {'node.name': 'Canibalización',
      'node.description': "La canibalización en el contexto de una promoción se refiere al fenómeno en el que una acción 
      promocional para un producto o servicio afecta negativamente las ventas de otros productos o servicios de la misma empresa. 
      En otras palabras, es cuando una promoción 'se come' las ventas de otros productos.", 
      'score': 0.9651641845703125}]

    </List of Similar Existing Concept Nodes>

    EXPECTED OUTPUT:

    Impacto de Canibalización


 """

    user_prompt = """
        <role>
        {ROLE}
        </role>
        <best_practices>
        {BEST_PRACTICES}
        </best_practices>
        <new_concept>
        {NEW_CONCEPT}
        </new_concept>
        {context}
    """
    prompt = PromptTemplate(
        template=user_prompt,
        input_variables=["context"],
        partial_variables={
            "BEST_PRACTICES": best_practices,
            "ROLE": persona,
            "NEW_CONCEPT": new_concept,
        },
    )

    chain = prompt | llm 
        
    response = chain.invoke({"context": context}).content.strip()

    result = chain.invoke({"context": context})
    print(result)

    return response

In [144]:
response = deduplication(new_concept, data, llm=llm_light)

content='Canibalización' additional_kwargs={'usage': {'prompt_tokens': 973, 'completion_tokens': 9, 'total_tokens': 982}, 'stop_reason': 'end_turn', 'model_id': 'us.anthropic.claude-3-5-haiku-20241022-v1:0'} response_metadata={'usage': {'prompt_tokens': 973, 'completion_tokens': 9, 'total_tokens': 982}, 'stop_reason': 'end_turn', 'model_id': 'us.anthropic.claude-3-5-haiku-20241022-v1:0'} id='run-ab1c4516-5132-40ae-a6cf-914d7eec0d5c-0' usage_metadata={'input_tokens': 973, 'output_tokens': 9, 'total_tokens': 982}


In [145]:
response

'Canibalización'