In [1]:
# PROCESAMIENTO DE DATOS
from langchain_docling import DoclingLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# KGs
from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer


# AGENTES Y ORQUESTACIÓN
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from typing_extensions import TypedDict
from typing import List
from langgraph.graph import END, START, StateGraph


# DISPLAYS AND IMAGES
from IPython.display import Image, display, Markdown


  from .autonotebook import tqdm as notebook_tqdm


### Procesamiento de Grafos de Conocimiento

**Objetivo:**
# 
Desarrollar un sistema eficiente para transformar datos no estructurados en grafos de conocimiento, que sirvan como base para agentes inteligentes.

**Fases del Proceso:**

1. **Transformación de Datos:**
   - 1.1 Convertir datos no estructurados en grafos de conocimiento.
   - 1.2 Utilizar grafos de conocimiento como fuente de información para agentes.
   - 1.3 Implementar grafos de conocimiento como sistemas de memoria para agentes (en desarrollo).

2. **KG-Consumer:**
   - Arquitectura de agente reflexivo diseñado para consumir, analizar y responder preguntas de manera crítica.

#### Procesamiento de datos

##### Components

- Modelo y Embedings: `Llama 3.2`
- PDF Loaders (Docling), Chunks & Splitters


In [2]:
llm = ChatOllama(model="llama3.2", temperature=0, base_url="http://localhost:11434")
embeddings = OllamaEmbeddings(model="llama3.2", base_url="http://localhost:11434")

In [3]:
# PDF reader
path_file = "Data/genetics-in-osteoarthritis.pdf"
loader = DoclingLoader(file_path=path_file)
docs = loader.load()

Token indices sequence length is longer than the specified maximum sequence length for this model (1237 > 512). Running this sequence through the model will result in indexing errors


In [4]:
# splitting -Chunking
chunks = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200).split_documents(docs)


In [5]:
display(Markdown(chunks[5].page_content))


Osteoarthritis  is  a  degenerative  articular  disease  with  a complex pathogeny because diverse factors interact causing a  process  of  deterioration  of  the  cartilage  and  the  subchondral bone. It can be primary or secondary to diverse diseases, but it has  clinical, radiological,  and pathological manifestations in common. Its pathogenesis is complex due to genetic, metabolic and local factors, which interact and cause a process of deterioration of the cartilage, with a proliferative reaction  of  subchondral  bone  and  synovial  inflammation.  Apart from the classical  concept  of

In [6]:
# Embeddings
# Embedding content
# logica para crear los embedding de los chunks.page_content y guardarlos como parte de los metadatos.
for i, doc in enumerate(chunks):
    print(f"Processing chunk {i+1} of {len(chunks)}")
    doc.metadata["embedding"] = embeddings.embed_query(doc.page_content)

Processing chunk 1 of 135
Processing chunk 2 of 135
Processing chunk 3 of 135
Processing chunk 4 of 135
Processing chunk 5 of 135
Processing chunk 6 of 135
Processing chunk 7 of 135
Processing chunk 8 of 135
Processing chunk 9 of 135
Processing chunk 10 of 135
Processing chunk 11 of 135
Processing chunk 12 of 135
Processing chunk 13 of 135
Processing chunk 14 of 135
Processing chunk 15 of 135
Processing chunk 16 of 135
Processing chunk 17 of 135
Processing chunk 18 of 135
Processing chunk 19 of 135
Processing chunk 20 of 135
Processing chunk 21 of 135
Processing chunk 22 of 135
Processing chunk 23 of 135
Processing chunk 24 of 135
Processing chunk 25 of 135
Processing chunk 26 of 135
Processing chunk 27 of 135
Processing chunk 28 of 135
Processing chunk 29 of 135
Processing chunk 30 of 135
Processing chunk 31 of 135
Processing chunk 32 of 135
Processing chunk 33 of 135
Processing chunk 34 of 135
Processing chunk 35 of 135
Processing chunk 36 of 135
Processing chunk 37 of 135
Processing

#### Persistencia y Modelo


- Es posible almacenar la estructura de un documento para crear una topología que sirva como base en el modelo que se desea desarrollar. Por ejemplo, se puede representar mediante un patrón jerárquico las relaciones entre documentos, páginas y fragmentos (chunks).

$$
\text{(Documento)} \xrightarrow{\text{HAS}} \text{(Página)} \xrightarrow{\text{DIVIDED\_INTO}} \text{(Fragmento)}
$$

En cada nodo de esta estructura, se pueden almacenar diversos metadatos. En particular, nos interesa enriquecer los atributos asociados a los fragmentos, como los embeddings. Para lograr esto, se puede implementar una función utilizando pandas, el controlador de Neo4j y Cypher, que permita cargar los datos en un DataFrame o leer directamente los documentos para construir esta estructura.

In [7]:
from neo4j import GraphDatabase


URI = "neo4j://127.0.0.1:7687"
PASSWORD = "onto_research"
USER = "neo4j"

def create_graph(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    return driver

driver = create_graph(URI, USER, PASSWORD)



In [8]:
docs[0]

Document(metadata={'source': 'Data/genetics-in-osteoarthritis.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/3', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 63.84, 't': 690.528, 'r': 525.72, 'b': 680.628, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 90]}]}, {'self_ref': '#/texts/4', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 63.84, 't': 655.567, 'r': 550.546, 'b': 633.155, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 143]}]}, {'self_ref': '#/texts/5', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 106.32, 't': 618.136, 'r': 550.502, 'b': 533.77, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 933]}]}], 'headings': ['Genetics in Osteoarthritis'], 'origin': {'mi

In [9]:
def ingest_to_graph(driver, docs):
    with driver.session() as session:
        for doc in docs:
            # Extraer datos del documento
            document_id = doc.metadata['source']
            page_id = f"{doc.metadata['source']}_page_{doc.metadata['dl_meta']['doc_items'][0]['prov'][0]['page_no']}"
            chunk_id = f"{doc.metadata['source']}_chunk_{hash(doc.page_content)}"
            chunk_content = doc.page_content
            headings = doc.metadata['dl_meta']['headings']
            embedding = doc.metadata['embedding']

            query = """
            MERGE (d:Document {id: $document_id})
            MERGE (p:Page {id: $page_id})
            MERGE (c:Chunk {id: $chunk_id, content: $chunk_content, headings: $headings, embedding: $embedding})
            MERGE (d)-[:HAS]->(p)
            MERGE (p)-[:DIVIDED_IN]->(c)
            """

            # INGEST TO GRAPH
            session.run(query, document_id=document_id, page_id=page_id, chunk_id=chunk_id, 
                    chunk_content=chunk_content, headings=headings, embedding=embedding)
            

In [10]:
def create_vector_index(driver, index_name="chunk_embeddings", dimensions=4096):
    """Crea vector index solo si no existe"""
    with driver.session() as session:
        try:
            query = f"""
            CALL db.index.vector.createNodeIndex(
                '{index_name}',
                'Chunk',
                'embedding',
                {dimensions},
                'cosine'
            )
            """
            session.run(query)
            print("Vector index creado")
        except Exception as e:
            if "already exists" in str(e):
                print("Vector index ya existe")
            else:
                print(f"Error: {e}")


In [11]:
    def create_text_index(driver):
        """Crea índice de texto simple"""
        with driver.session() as session:
            query = """
            CREATE INDEX chunk_content_text IF NOT EXISTS 
            FOR (c:Chunk) ON (c.pag_content)
            """
            session.run(query)

In [12]:
def setup_indexes(driver):
    """Configura todos los índices necesarios"""
    create_vector_index(driver)
    create_text_index(driver)
    print("Índices creados")


In [13]:
setup_indexes(driver)
ingest_to_graph(driver, docs)

Vector index ya existe
Índices creados


KeyError: 'embedding'

### Graph RAG

- a. De la estructura creada del documento.
- a.1 De la extracción de entidades que puede ejercerse sobre los page content en cada chunk.
- b. De Grafos curados con sus debidas ontologías

#### Graph RAG De la estructura creada del documento.

In [None]:
from langchain.chains import GraphSparqlQAChain # DB
from langchain_community.graphs import RdfGraph
from langchain_neo4j import GraphCypherQAChain, Neo4jVector

## From external KG

What is a "hetnet"?
A network (also known as a graph) is a conceptual representation of a group of things — called nodes — and the relationships between them — called edges. Typically, a network has only one type of node and one type of edge. But in many cases, it is necessary to be able to distinguish between different types of entities and relationships.

A hetnet (short for heterogeneous information network) is a network where nodes and edges can be multiple types. This additional dimension allows a hetnet to accurately describe more complex data. Hetnets are particularly useful in biomedicine, where it is important to capture the conceptual distinctions between various components and mechanisms, such as genes and diseases, or upregulation and binding.

The prefix meta is used on this site to refer to the type of the node/edge (e.g. compound), as opposed to the specific node/edge itself (e.g. acetaminophen).

What is Hetionet?
Hetionet is a hetnet of biomedical knowledge. It encodes relationships uncovered by millions of studies conducted over the last half-century into a single resource. The network is constructed from a collection of publicly available databases, and is itself open-source and free to use, barring any upstream restrictions.

Hetionet enables scientists and biologists to formulate novel hypotheses, predictions, and other valuable insights by connecting an existing body of biomedical data across multiple levels and types in a convenient, accessible, holistic way.

Why was Hetionet made?
Hetionet was originally created as part of Project Rephetio, a study that utilized the benefits of hetnets to predict new uses for existing drugs. Although the original resources for the network were selected for drug repurposing, Hetionet is now useful in a much broader sense, and has been used for a variety of purposes.

Hetionet was also made to alleviate some of the inaccuracies and inconveniences of using other integrative networks, or trying to use multiple, separate databases in the same analysis. It unifies data from several different, disparate sources into a single, comprehensive, accessible, common-format network.

What's in Hetionet?
Hetionet combines information from 29 public databases. The network contains 47,031 nodes of 11 types and 2,250,197 edges of 24 types.

Metanode	Description
Gene	Protein-coding human genes. From Entrez Gene.
Compound	Approved small molecule compounds with documented chemical structures. From DrugBank.
Anatomy	Anatomical structures, excluding structures that are known not to be found in humans. From Uberon.
Disease	Complex diseases, selected to be distinct and specific enough to be clinically relevant yet general enough to be well annotated. From Disease Ontology.
Symptom	Signs and Symptoms (i.e. clinical abnormalities that can indicate a medical condition). From the MeSH ontology.
Side Effect	Adverse drug reactions. From SIDER/UMLS.
Biological Process	Larger processes or biological programs accomplished by multiple molecular activities. From Gene Ontology.
Cellular Component	The locations relative to cellular structures in which a gene product performs a function. From Gene Ontology.
Molecular Function	Activities that occur at the molecular level, such as "catalysis" or "transport". From Gene Ontology.
Pathway	A series of actions among molecules in a cell that leads to a certain product or change in the cell. From WikiPathways, Reactome, and Pathway Interaction Database.
Pharmacologic Class	"Chemical/Ingredient", "Mechanism of Action", and "Physiologic Effect" FDA class types. From DrugCentral.


In [None]:
# TOOLS
hetionet_url = "https://neo4j.het.io/"



In [None]:
# AGENTE GENERADOR
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data modelling expert capable of creating high quality entity-relationship models from denormalised datasets. "
            "You always follow these modeling principles: "
            "You don't overnormalize the model. "
            "You don't use the same name for realtionships connecting different types of entities. "
            "You make sure that all features in the dataset are included in the model. "
            "You make sure there is a one to one mapping between the attributes in the extracted entities and the features in the dataset provided as input. ",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)
model_generate = prompt | llm

In [None]:
# AGENTE REFLECTOR
reflection_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data modelling expert capable of analysing entity-relationship models and suggest changes that can improve them. " +
            "You are not supposed to generate a new model, just provide suggestions for changes when pertinent. " +
            "You always pay extra attention at the following: " +
            "Detect under-normalized in the model and recommend they are extracted as new entities connected to the existing ones through relevant relationships. " +
            "Detect over-normalized entities in the model and recommend they are merged as part of existing ones. " +
            "Suggest alternative names for terms (property names, entity names, relationship names) used in the model if the proposed ones are not adequate or expressive enough" +
            "You do not recommend combining or merging attributes into composite ones. " +
            "You don't always need to propose changes, if a model is good as-is just do not propose changes. "
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)
model_reflect = reflection_prompt | llm