# Notebook 1.2: Lexical Graph Pattern

Este notebook prueba el patrón **LEXICAL_GRAPH** con datos reales.

## Objetivos

1. **Limpiar el grafo** antes de empezar
2. **Crear patrón LEXICAL_GRAPH** con entidades y chunks
3. **Ingerir 3 documentos** usando el patrón léxico
4. **Explorar el grafo** creado
5. **Probar búsquedas** con los patrones GraphRAG implementados

## Patrón LEXICAL_GRAPH

- **Entity**: Entidades extraídas (PERSON, ORGANIZATION, CONCEPT, etc.)
- **Chunk**: Fragmentos de texto
- **MENTIONS**: Chunk menciona Entity
- **RELATED_TO**: Entity relacionada con otra Entity

**Estructura:**
```
Chunk -[:MENTIONS]-> Entity -[:RELATED_TO]-> Entity
```

In [ ]:
def add_src_to_path(path_folder: str):
    """Make ``path_folder`` importable from notebooks and scripts.

    Starting at the current working directory and walking up through its
    ancestors, the first directory that contains ``path_folder`` wins. The
    directory holding the match is inserted at the front of ``sys.path`` and
    the match itself is appended at the end; entries already present are
    left untouched. A message is printed for every entry added, or a
    "not found" message when no ancestor contains ``path_folder``.

    Args:
        path_folder: Relative folder path to look for (e.g. ``"src"`` or
            ``"src/utils"``).
    """
    import sys
    from pathlib import Path

    start = Path().resolve()
    for ancestor in (start, *start.parents):
        target = ancestor / path_folder
        if not target.exists():
            continue

        # Front of sys.path: the folder that contains the match
        container = str(target.parent)
        if container not in sys.path:
            sys.path.insert(0, container)
            print(f"Path Folder parent added: {target.parent}")

        # End of sys.path: the matched folder itself
        target_entry = str(target)
        if target_entry not in sys.path:
            sys.path.append(target_entry)
            print(f"Path Folder {path_folder} added: {target}")
        return

    print(f"Not found '{path_folder}' folder on the hierarchy of directories")

# Put the project folders needed by this notebook on sys.path
for folder in ("src", "src/utils", "src/data"):
    add_src_to_path(path_folder=folder)

In [ ]:
# Import the required libraries
import sys
from pathlib import Path
from typing import List, Dict, Any

# Import handlers
from src.utils.handlers import find_in_project

# Import ungraph: use the installed package when it is importable,
# otherwise fall back to the local ``src`` package and alias it as ``ungraph``
try:
    import ungraph
    print("‚úÖ Ungraph importado como paquete instalado")
except ImportError:
    import src
    ungraph = src
    print("‚úÖ Ungraph importado desde src/ (modo desarrollo)")

# Import the service used for cleanup
from infrastructure.services.neo4j_index_service import Neo4jIndexService

# Import the pattern value objects
from domain.value_objects.graph_pattern import GraphPattern, NodeDefinition, RelationshipDefinition

# NOTE(review): in the fallback case this reads ``src.__version__`` — confirm the package defines it
print(f"üì¶ Ungraph version: {ungraph.__version__}")

## Parte 1: Configuración y Limpieza

Configuramos Neo4j y limpiamos el grafo antes de empezar.

In [ ]:
# Configure Neo4j.
# Connection settings are taken from the NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD and NEO4J_DATABASE environment variables when they are set,
# so real credentials do not have to be written into the notebook. The
# literals below are the notebook's original defaults and are used otherwise.
import os

ungraph.configure(
    neo4j_uri=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    neo4j_user=os.getenv("NEO4J_USER", "neo4j"),
    # WARNING: set NEO4J_PASSWORD (or change this default) to your real password
    neo4j_password=os.getenv("NEO4J_PASSWORD", "Ungraph22"),
    neo4j_database=os.getenv("NEO4J_DATABASE", "neo4j"),
    embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

print("‚úÖ Configuraci√≥n completada")

In [ ]:
# Wipe the graph before starting
print("üßπ Limpiando grafo...")
print("=" * 80)

index_service = Neo4jIndexService()

# Remove every node and relationship; a failure is reported and the cell continues
try:
    index_service.clean_graph()
except Exception as exc:
    print(f"‚ö†Ô∏è  Error al limpiar grafo: {exc}")
else:
    print("‚úÖ Grafo limpiado (todos los nodos y relaciones eliminados)")

# Drop every index, with the same report-and-continue handling
try:
    index_service.drop_all_indexes()
except Exception as exc:
    print(f"‚ö†Ô∏è  Error al eliminar √≠ndices: {exc}")
else:
    print("‚úÖ √çndices eliminados")

print("\n‚úÖ Limpieza completada. Listo para ingesta.")

## Parte 2: Preparar Documentos

Localizamos los 3 documentos de prueba.

In [ ]:
# Locate the data folder inside the project
data_path = find_in_project(target="data", search_type="folder", project_root=None)

if not data_path:
    # Without a data folder there is nothing to ingest later on
    print("‚ùå Carpeta de datos no encontrada")
    available_files = []
else:
    print(f"‚úÖ Carpeta de datos encontrada: {data_path}")

    # The three test documents used by this notebook
    test_files = [
        data_path / file_name
        for file_name in (
            "110225.md",
            "AnnyLetter.txt",
            "Usar s√≠mboles de silencio de corchea.docx",
        )
    ]

    # Keep only the documents that are actually on disk
    available_files = [candidate for candidate in test_files if candidate.exists()]
    print(f"\nüìÑ Archivos disponibles ({len(available_files)}/{len(test_files)}):")
    for found in available_files:
        print(f"   ‚úÖ {found.name}")

    # Report the missing documents after the available ones
    for missing in (candidate for candidate in test_files if not candidate.exists()):
        print(f"   ‚ö†Ô∏è  No encontrado: {missing.name}")

## Parte 3: Crear Patrón LEXICAL_GRAPH

Definimos el patrón léxico con entidades y chunks.

In [ ]:
# Build the LEXICAL_GRAPH pattern
print("üìù CREANDO PATR√ìN LEXICAL_GRAPH")
print("=" * 80)

# Entity node; "type" holds values such as "PERSON", "ORGANIZATION", "CONCEPT"
entity_node = NodeDefinition(
    label="Entity",
    required_properties=dict(name=str, type=str),
    optional_properties=dict(description=str, frequency=int),
    indexes=["name", "type"],
)

# Chunk node
chunk_node = NodeDefinition(
    label="Chunk",
    required_properties=dict(
        chunk_id=str,
        content=str,
        embeddings=list,
        embeddings_dimensions=int,
    ),
    optional_properties=dict(chunk_id_consecutive=int, source_file=str),
    indexes=["chunk_id"],
)

# Relationship: a Chunk mentions an Entity ("count" = number of mentions)
mentions_rel = RelationshipDefinition(
    from_node="Chunk",
    to_node="Entity",
    relationship_type="MENTIONS",
    properties=dict(count=int),
    direction="OUTGOING",
)

# Relationship: an Entity is related to another Entity ("strength" = relation strength)
related_rel = RelationshipDefinition(
    from_node="Entity",
    to_node="Entity",
    relationship_type="RELATED_TO",
    properties=dict(strength=float),
    direction="OUTGOING",
)

LEXICAL_GRAPH_PATTERN = GraphPattern(
    name="LEXICAL_GRAPH",
    description="Grafo l√©xico con entidades extra√≠das y sus relaciones. √ötil para an√°lisis sem√°ntico.",
    node_definitions=[entity_node, chunk_node],
    relationship_definitions=[mentions_rel, related_rel],
    search_patterns=["basic", "hybrid", "pattern_matching"],
)

# Summarize what was just defined
node_labels = [node.label for node in LEXICAL_GRAPH_PATTERN.node_definitions]
relationship_types = [
    rel.relationship_type for rel in LEXICAL_GRAPH_PATTERN.relationship_definitions
]
print(f"‚úÖ Patr√≥n creado: {LEXICAL_GRAPH_PATTERN.name}")
print(f"   Nodos: {node_labels}")
print(f"   Relaciones: {relationship_types}")

## Parte 4: Ingesta con Patrón LEXICAL_GRAPH

Ingerimos los documentos usando el patrón léxico.

In [ ]:
# Ingest the documents with the LEXICAL_GRAPH pattern
print("üì• INGESTA CON PATR√ìN LEXICAL_GRAPH")
print("=" * 80)

all_chunks_lexical = []

# Same ingestion settings for every document
ingest_options = {
    "pattern": LEXICAL_GRAPH_PATTERN,
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "clean_text": True,
}

for file_path in available_files:
    print(f"\nüìÑ Procesando: {file_path.name}")
    # A failing document is reported and skipped so the remaining ones still run
    try:
        chunks = ungraph.ingest_document(file_path, **ingest_options)
        all_chunks_lexical += chunks
        print(f"   ‚úÖ {len(chunks)} chunks creados")
    except Exception as exc:
        print(f"   ‚ùå Error: {exc}")

print(f"\n‚úÖ Total chunks con patr√≥n LEXICAL_GRAPH: {len(all_chunks_lexical)}")
print("\nüí° Nota: Este patr√≥n requiere extracci√≥n de entidades.")
print("   En una implementaci√≥n completa, se usar√≠a NER para extraer entidades.")

## Parte 4: Explorar Grafo

Exploramos la estructura del grafo creado.

In [ ]:
# Explore the structure of the graph
from src.utils.graph_operations import graph_session

driver = graph_session()
# try/finally guarantees the driver is closed even when a query raises
try:
    with driver.session() as session:
        # Count nodes grouped by their first label
        result = session.run("""
            MATCH (n)
            RETURN labels(n)[0] as label, count(n) as count
            ORDER BY count DESC
        """)

        print("üìä ESTRUCTURA DEL GRAFO:")
        print("=" * 80)
        for record in result:
            print(f"   {record['label']}: {record['count']} nodos")

        # Count relationships grouped by type
        result = session.run("""
            MATCH ()-[r]->()
            RETURN type(r) as rel_type, count(r) as count
            ORDER BY count DESC
        """)

        print("\nüîó RELACIONES:")
        for record in result:
            print(f"   {record['rel_type']}: {record['count']} relaciones")
finally:
    driver.close()

## Parte 4.1: Visualizar Grafo

Visualizamos el grafo usando yFiles for Jupyter.

In [ ]:
# Import the visualization helpers
from src.notebooks.graph_visualization import (
    visualize_file_page_chunk_pattern,
    visualize_simple_chunk_pattern,
    visualize_lexical_graph_pattern,
    visualize_hierarchical_pattern,
    visualize_sequential_chunks_pattern,
    visualize_pattern_structure,
    visualize_custom_query
)

print("‚úÖ Funciones de visualizaci√≥n importadas")

## Parte 5: Probar Búsquedas

Probamos los patrones de búsqueda GraphRAG implementados.

In [ ]:
# Exercise the search patterns with a fixed query
test_query = "test"
print(f"üîç PROBANDO B√öSQUEDAS CON QUERY: '{test_query}'")
print("=" * 80)

# 1. Basic Retriever
print("\n1. Basic Retriever:")
try:
    results = ungraph.search_with_pattern(
        test_query,
        pattern_type="basic",
        limit=3
    )
    print(f"   ‚úÖ {len(results)} resultados")
    if results:
        # Average only when there are results, to avoid dividing by zero
        print(f"   Score promedio: {sum(r.score for r in results) / len(results):.3f}")
except Exception as e:
    print(f"   ‚ùå Error: {e}")

# 2. Metadata Filtering
print("\n2. Metadata Filtering:")
try:
    # Fetch one filename from the graph to filter by
    driver = graph_session()
    # Inner try/finally closes the driver on every path, including failures
    try:
        with driver.session() as session:
            result = session.run("MATCH (f:File) RETURN f.filename as filename LIMIT 1")
            record = result.single()
            if not record:
                # Fall back to Chunk nodes when there is no File node
                result = session.run("MATCH (c:Chunk) RETURN c.source_file as filename LIMIT 1")
                record = result.single()
            if record:
                filename = record["filename"]
                results = ungraph.search_with_pattern(
                    test_query,
                    pattern_type="metadata_filtering",
                    metadata_filters={"filename": filename},
                    limit=3
                )
                print(f"   ‚úÖ {len(results)} resultados (filtrado por '{filename}')")
            else:
                print("   ‚ö†Ô∏è  No hay archivos en el grafo")
    finally:
        driver.close()
except Exception as e:
    print(f"   ‚ùå Error: {e}")

# 3. Parent-Child Retriever (when applicable)
print("\n3. Parent-Child Retriever:")
try:
    results = ungraph.search_with_pattern(
        test_query,
        pattern_type="parent_child",
        parent_label="Page",
        child_label="Chunk",
        relationship_type="HAS_CHUNK",
        limit=3
    )
    print(f"   ‚úÖ {len(results)} resultados")
except Exception as e:
    # A failure here is reported as "not applicable" rather than as an error
    print(f"   ‚ö†Ô∏è  No aplica para este patr√≥n: {e}")

In [ ]:
# Render the LEXICAL_GRAPH pattern
print("üé® VISUALIZANDO PATR√ìN LEXICAL_GRAPH")
print("=" * 80)

# Caps on how many entities and chunks are drawn
view_limits = {"limit_entities": 10, "limit_chunks": 15}

driver = graph_session()
try:
    visualize_lexical_graph_pattern(driver, **view_limits)
except Exception as exc:
    print(f"‚ö†Ô∏è  Error al visualizar: {exc}")
    print("üí° Aseg√∫rate de tener yfiles_jupyter_graphs_for_neo4j instalado")
finally:
    # Always release the driver, whether or not rendering worked
    driver.close()

## Parte 6: Resumen

Resumen del patrón LEXICAL_GRAPH.

In [ ]:
# Final summary of the run; only the chunk count is interpolated, so the
# other lines are plain strings rather than placeholder-free f-strings
print("üìä RESUMEN DEL PATR√ìN LEXICAL_GRAPH")
print("=" * 80)
print(f"Chunks creados: {len(all_chunks_lexical)}")
print("Estructura: Chunk -[:MENTIONS]-> Entity -[:RELATED_TO]-> Entity")
print("Uso recomendado: An√°lisis sem√°ntico, extracci√≥n de entidades")
print("\n‚úÖ Notebook completado exitosamente")