# TIEnc

<div>

<img src="imgs/new_rotated.jpg" alt="Description" style="display: block; margin: 20px auto; width: 80%;" />

</div>

<center>

| Generator | Validator | Status | Nodes | Chapters | Concepts | Relations | Prereqs | Part-Of | Defs | Props | Orphans | AvgRel/Node |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| claude-opus-4-5 | gpt5-1 | ✅ OK | 450 | 104 | 346 | 1395 | 71 | 632 | 30 | 143 | 0 | 3.1 |
| claude-opus-4-5 | claude-opus-4-5 | ✅ OK | 460 | 104 | 356 | 1432 | 69 | 600 | 26 | 198 | 0 | 3.11 |
| gpt5-1 | gpt5-1 | ✅ OK | 459 | 104 | 355 | 1726 | 119 | 536 | 209 | 242 | 0 | 3.76 |
| gpt5-1 | claude-opus-4-5 | ✅ OK | 423 | 104 | 319 | 1601 | 137 | 490 | 117 | 249 | 0 | 3.78 |

</center>



**Estruturas & Modelos**

In [1]:
from enum import Enum
from langchain.chat_models import init_chat_model, BaseChatModel

class Models(str, Enum):
    """Chat-model identifiers in ``provider:model`` form; ``get_llm`` passes the value to ``init_chat_model``."""
    GPT4_o = "openai:gpt-4o"
    GPT5_1 = "openai:gpt-5.1"
    CLAUDE4_5 = "anthropic:claude-opus-4-5"

def get_llm(model_name: Models) -> BaseChatModel:
    """
    Build the chat model identified by ``model_name``.

    :param model_name: Member of ``Models``; its string value is handed to ``init_chat_model``.
    :return: The object produced by ``init_chat_model`` for that identifier.
    """
    identifier = model_name.value
    chat_model = init_chat_model(identifier)
    return chat_model

In [98]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

class Concept(BaseModel):
    """Represents a single educational concept found in the text."""
    concept_name: str = Field(description="The formal name of the concept (e.g., 'Inductive Logic', 'Backpropagation').")
    chapter: List[int] = Field(description="The number of the current chapter, subchapter, etc (e.g., [1] for chapter 1, [1, 2] for subchapter 1.2, [1,2,5] for subsubchapter 1.2.5)")
    description: str = Field(description="A concise definition or summary of the concept based on the text.")
    # Restored field: ``PDFConversor.generate_markdown`` and the chapter-listing cell both read
    # ``page_start``, so leaving it commented out makes them fail with AttributeError.
    # It is optional (default None) so code that builds a Concept from the other three fields keeps working.
    page_start: Optional[int] = Field(default=None, description="The page number where this concept is first introduced.")
    # page_end: Optional[int] = Field(default=None, description="The page number where the discussion of this concept seems to end (or current page if ongoing).")
    # is_main_chapter: bool = Field(default=False, description="True if this is a chapter or main topic, False if it is a subchapter or subtopic.")

# Structured-output schema passed to ``with_structured_output`` by ``EbookExtractor.extract_toc_structure``.
class PageExtraction(BaseModel):
    """Container for multiple concepts found on a specific page processing step."""
    concepts: List[Concept] = Field(description="List of concepts extracted from the current text window.")
    
# Directed relation between two concept names.
# NOTE(review): a later cell defines another ``Relation`` under the same name.
class Relation(BaseModel):
    source: str = Field(description="The subject concept.")
    target: str = Field(description="The object concept.")
    # Wider set of relation types that was considered; only 'prerequisite' is currently accepted.
    # relation_type: Literal['prerequisite', 'including', 'part-of', 'property', 'definition']
    relation_type: Literal['prerequisite']
    # NOTE(review): Optional but declared without a default - under pydantic v2 this is
    # required-but-nullable; confirm that is intended.
    context: Optional[str] = Field(description="Justification text.")

# Verdict on one (source, target) relation; the pair is identified by the two name strings.
class Critique(BaseModel):
    """A critique of a specific relation."""
    source: str
    target: str
    is_valid: bool = Field(description="True if the relation is supported by text and logic, False otherwise.")
    reasoning: str = Field(description="Why this relation is valid, invalid, or redundant.")

# Final approve/reject verdict on one (source, target) relation.
class ModerationDecision(BaseModel):
    """The Moderator's final decision on a specific relation."""
    source: str
    target: str
    # Closed set of outcomes: anything other than 'approved' or 'rejected' fails validation.
    status: Literal['approved', 'rejected'] = Field(description="Final verdict.")
    comments: str = Field(description="Why the moderator made this decision.")

# Wrapper models that each hold a single list field of the corresponding item model.
# NOTE(review): ``ConceptList`` and ``RelationList`` are defined again, under the same names, in later cells.
class ConceptList(BaseModel):
    concepts: List[Concept]
    
class RelationList(BaseModel):
    relations: List[Relation]

class CritiqueList(BaseModel):
    critiques: List[Critique]

class ModerationList(BaseModel):
    decisions: List[ModerationDecision]


# Holds only the relations list; the ``new_concepts`` field below is disabled.
class ConceptAnalysis(BaseModel):
    """LLM Output for a full chapter/concept file."""
    # Disabled field: main concepts that would be added to the registry.
    # new_concepts: List[str] = Field(description="List of MAIN concepts defined in this text.")
    relations: List[Relation] = Field(description="Semantic connections found.")
    
# Splits a set of relations into the ones kept and the ones removed by validation.
class ValidationResult(BaseModel):
    valid_relations: List[Relation] = Field(description="The filtered list of strictly valid educational relations.")
    rejected_relations: List[Relation] = Field(description="List of relations that were removed.")

## Extração de dados

O SINKT já pressupõe um dataset pronto para uso. Sendo assim, esta seção busca extrair os conceitos de um ebook em PDF. Primeiramente, o PDF é convertido para Markdown, visto que é melhor utilizar texto puro em vez de páginas de PDF. Além disso, essa proposta facilita a própria extração para o MAIC, posteriormente.

In [3]:
import unicodedata
import re
import xml.etree.ElementTree as ET
from xml.dom import minidom

def normalize_filename(s: str) -> str:
    """
    Turn a title into a lower-case, filesystem-friendly name.

    Accents are stripped, each run of whitespace becomes a single underscore and
    every remaining character that is not a word character is removed.

    :param s: String to be normalized
    :return: The normalized, lower-cased string
    """
    decomposed = unicodedata.normalize('NFKD', s)
    # After NFKD decomposition the accents are separate combining marks; keep only the base characters.
    without_marks = ''.join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
    underscored = re.sub(r'\s+', '_', without_marks)
    return re.sub(r'[^\w_]', '', underscored).lower()

def prettify_xml(elem: ET.Element) -> str:
    """
    Serialize ``elem`` as indented XML text without blank lines.

    :param elem: Element (``ET.Element``) to serialize
    :return: Pretty-printed XML, three spaces per nesting level
    """
    serialized = ET.tostring(elem, 'utf-8')
    pretty = minidom.parseString(serialized).toprettyxml(indent="   ")
    # minidom emits whitespace-only lines; keep only lines that have visible content.
    visible_lines = filter(str.strip, pretty.split('\n'))
    return '\n'.join(visible_lines)

Configuração inicial.

In [4]:
from dotenv import load_dotenv
import os
from pathlib import Path
from langsmith import Client
from openai import OpenAI
from langsmith.wrappers import wrap_openai

# Load credentials from ../.env before any API client is created.
load_dotenv("../.env")

# Fail fast when a credential is missing; each message names the variable that was actually checked.
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not found"
assert os.getenv("ANTHROPIC_API_KEY"), "ANTHROPIC_API_KEY not found"
assert os.getenv("LANGSMITH_API_KEY"), "LANGSMITH_API_KEY not found"

# Output layout: ebooks/<BOOK_NAME>/ is created up front so later cells can write into it.
BOOK_NAME = 'LinuxFundamentals'
EBOOKS_PATH = Path('ebooks')
base_output_dir = EBOOKS_PATH / BOOK_NAME
os.makedirs(base_output_dir, exist_ok=True)

PDF_PATH = Path('../data/701-LinuxFundamentals_material_full_v14.pdf')

# OpenAI client wrapped by LangSmith's ``wrap_openai``.
client = wrap_openai(OpenAI())

Primeiramente, é criada a classe de conversão do PDF para Markdown, que utiliza a biblioteca Docling para realizar a conversão. Essa biblioteca permite extrair as imagens e tabelas do texto; posteriormente, elas são incluídas no Markdown final, além de serem salvas junto com ele.

In [5]:
import os
from PyPDF2 import PdfReader

from docling.document_converter import DocumentConverter
import logging
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat, OutputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions
    )
from docling.document_converter import DocumentConverter, PdfFormatOption, MarkdownFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, DoclingDocument
from tqdm.notebook import tqdm_notebook

class PDFConversor():
    """
    Convert a PDF to markdown, one output folder per chapter.

    :param pdf_path: Path of the input pdf.
    :param output_dir: Base directory; one sub-folder per chapter is created inside it.
    """
    def __init__(self, pdf_path: Path, output_dir: Path):
        self.input_doc_path: Path = pdf_path
        self.base_output_dir: Path = output_dir
        self.pipeline_options: PdfPipelineOptions = self._set_pipeline_options()
        self.document_converter: DocumentConverter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options),
                OutputFormat.MARKDOWN: MarkdownFormatOption(image_mode=ImageRefMode.REFERENCED)
            },
        )
        self.last_page: int = self._get_no_pages()
        self.doc = None

    def _set_pipeline_options(self) -> PdfPipelineOptions:
        """Build the Docling PDF pipeline options used by the converter."""
        IMAGE_SCALE = 2.0

        pipeline_options = PdfPipelineOptions()
        pipeline_options.generate_picture_images = True
        pipeline_options.generate_page_images = True
        pipeline_options.images_scale = IMAGE_SCALE
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["pt"]
        # NOTE(review): the device is fixed to CUDA - confirm that a GPU is always available where this runs.
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=4, device=AcceleratorDevice.CUDA
        )
        return pipeline_options

    def _get_no_pages(self) -> int:
        """Return the number of pages PyPDF2 reports for the input PDF."""
        reader = PdfReader(self.input_doc_path)
        return len(reader.pages)

    def _replace_image_placeholders(self, md_str: str, image_files: List[Path]) -> str:
        """
        Replace ``<!-- image -->`` placeholders with markdown image links, in order.

        The n-th placeholder receives the file name (no directory part) of the n-th
        entry of ``image_files``; surplus placeholders are left untouched.

        :param md_str: Markdown text containing the placeholders.
        :param image_files: Image paths, in the order the images appear in the text.
        :return: The markdown with the placeholders substituted.
        """
        content = md_str
        for img in image_files:
            # ``Path.name`` instead of splitting on '/', so the result does not depend on the path separator.
            content = content.replace("<!-- image -->", f"![]({Path(img).name})", 1)
        return content

    def save_images(self, doc: DoclingDocument, output_dir: Path) -> List[Path]:
        """
        Save every picture of ``doc`` as a PNG file inside ``output_dir``.

        :param doc: Converted Docling document whose ``pictures`` are written out.
        :param output_dir: Folder that receives the PNG files.
        :return: Saved paths, relative to the converter's base output directory.
        """
        filenames = []
        for picture in doc.pictures:
            # The last segment of ``self_ref`` is used as the file stem.
            picture_id = picture.self_ref.split('/')[-1]
            picture_path = output_dir / f"{picture_id}.png"
            print(picture_path)
            with picture_path.open("wb") as fp:
                picture.image.pil_image.save(fp, format="PNG")
            filenames.append(picture_path.relative_to(self.base_output_dir))
        return filenames

    def generate_markdown(self, concepts: List[Concept]) -> None:
        """
        Generate a folder for each concept, with the images captured and a ``document.md`` file.

        Each element must expose ``concept_name`` and an integer ``page_start``.

        :param concepts: ``List[Concept]`` List of concepts; their pages must be in ascending order
        and sequential (e.g. Chapter 1, 2, 3...).
        """
        last_idx = len(concepts) - 1
        for idx, curr_chap in enumerate(tqdm_notebook(concepts)):
            init_page = curr_chap.page_start
            chap_name = normalize_filename(curr_chap.concept_name)

            output_concept_dir = self.base_output_dir / chap_name
            os.makedirs(output_concept_dir, exist_ok=True)

            if idx == last_idx:
                # The last chapter runs to the final page of the PDF (not one page past it).
                end_page = self.last_page
            else:
                # Stop on the page before the next chapter; never let the range end before it starts.
                end_page = max(init_page, concepts[idx + 1].page_start - 1)

            doc = self.document_converter.convert(self.input_doc_path, page_range=(init_page, end_page)).document
            md_str = doc.export_to_markdown()

            img_filenames = self.save_images(doc, output_concept_dir)
            raw_markdown = self._replace_image_placeholders(md_str, img_filenames)

            # Explicit UTF-8 so accented text is written the same way on every platform.
            with open(output_concept_dir / "document.md", "w", encoding="utf-8") as f:
                f.write(raw_markdown)

``EbookExtractor`` é a classe principal, encapsulando a classe criada anteriormente e servindo como uma interface de mais alto nível.

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
# from langchain_openai import ChatOpenAI

class EbookExtractor():
    """
    Extract data from an ebook.

    :param pdf_file_path: The path of the desired pdf book.
    :param base_output_dir: Directory where the book is going to be saved.
    :param llm: Chat model used for the structured Table-of-Contents extraction.
    """
    def __init__(self, pdf_file_path: Path, base_output_dir: Path, llm: BaseChatModel):
        self.pages: List[Document] = []
        self.llm = llm
        self.pdf_conversor: PDFConversor = PDFConversor(pdf_file_path, base_output_dir)
        self.file_path: Path = pdf_file_path
        self._load_pdf_pages()

    def _load_pdf_pages(self) -> None:
        """Load the PDF and store the resulting Document objects (one per page) in ``self.pages``."""
        print(f"Loading PDF: {self.file_path}...")
        loader = PyMuPDFLoader(self.file_path)
        pages = loader.load()
        print(f"Loaded {len(pages)} pages.")
        self.pages = pages

    def extract_toc_structure(self, end_toc_page: int = 5) -> List[Concept]:
        """
        Scans the first ``end_toc_page`` pages to find a Table of Contents or Summary.
        Returns a list of 'known concepts' to prime the main extractor.

        :param end_toc_page: Number of leading pages to scan for the summary. Defaults to 5.
        :return: The concepts found, or an empty list when the extraction call fails.
        """
        print(f"Scouting Table of Contents (Pages 1-{end_toc_page})...")

        # Combine first pages (or fewer if small doc)
        limit = min(len(self.pages), end_toc_page)
        toc_text = "\n".join([p.page_content for p in self.pages[:limit]])

        # Simple chain for ToC extraction
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert content analyzer. Look at the beginning of this book."),
            ("human", """Identify the Table of Contents. 
            Extract ALL chapters, sections, and sub-sections (e.g., 1.1, 1.2.1, 1.2.2) as individual Concepts.
            Do NOT summarize or skip detailed sub-topics. Capture the full hierarchy. Do NOT include chapter number on
            concept_name.
            
            Text:
            {text}""")
        ])

        # We reuse the PageExtraction model, though we only care about names/start pages here
        chain = prompt | self.llm.with_structured_output(PageExtraction)

        try:
            result = chain.invoke({"text": toc_text})
            print(f"ToC Analysis found {len(result.concepts)} potential concepts.")
            return result.concepts
        except Exception as e:
            # Deliberate best-effort: any failure falls back to an empty seed list, and the error is printed.
            print(f"Could not extract ToC (might be missing or unstructured). Proceeding with empty seed. Error: {e}")
            return []
    

Executando o pipeline de extração.

In [7]:
# Build the chat model; the agent functions in later cells read this module-level ``llm`` directly.
llm = get_llm(Models.GPT5_1)
# Constructing the extractor also loads the PDF pages.
extractor = EbookExtractor(PDF_PATH, base_output_dir, llm)
# Ask the model for the table-of-contents hierarchy found in the first 5 pages.
toc_concepts = extractor.extract_toc_structure(end_toc_page=5)

Loading PDF: ../data/701-LinuxFundamentals_material_full_v14.pdf...
Loaded 127 pages.
Scouting Table of Contents (Pages 1-5)...


2025-12-14 11:38:02,961 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


ToC Analysis found 104 potential concepts.


Filtrando apenas os capítulos, assim podemos gerar uma pasta para cada, contendo arquivo markdown e imagens.

In [24]:
# A top-level chapter is a concept whose ``chapter`` path has exactly one number.
chapters = [c for c in toc_concepts if len(c.chapter) == 1]
for c in chapters:
    print(c.chapter, c.concept_name, c.page_start)

[1] Introdução ao Linux 6
[2] Certificações Linux 13
[3] História do Linux 16
[4] Licenças Open Source 20
[5] Evolução do Linux: distribuições 23
[6] Conhecendo o Linux 34
[7] Tópicos para revisão do capítulo 41
[8] Estrutura do sistema operacional 43
[9] O que é um Shell 52
[10] Variáveis 55
[11] Arquivos de configuração do shell 62
[12] Caminhos de Diretorios 68
[13] Tópicos para revisão do capítulo 74
[14] Como obter ajuda 76
[15] Formas de documentação 77
[16] Comando help 79
[17] Comando apropos 81
[18] Comando whatis 84
[19] Comando man 86
[20] Comando info 89
[21] Comando whereis 91
[22] Comando which 94
[23] FHS, Hierarquia dos Diretórios 96
[24] Aprendendo Comandos do GNU/Linux 110
[25] Localização no sistema 120
[26] Tópicos para revisão do capítulo 127


In [25]:
# extractor.pdf_conversor.generate_markdown(chapters)

# Agentes

**AgentState** será o estado utilizado pelos agentes como uma espécie de memória a curto prazo. Essa estrutura será atualizada conforme os agentes executam e capturam informações.

In [None]:
# Agent-phase concept schema. NOTE: this rebinds the name ``Concept`` used by an earlier cell;
# the earlier model exposes ``concept_name``/``chapter``, this one exposes ``name``.
class Concept(BaseModel):
    name: str = Field(description="The formal name of the concept.")
    description: str = Field(description="Brief definition.")
    # Equality and hashing both use the lower-cased name, so concepts that differ
    # only in letter case (or only in description) compare equal.
    def __eq__(self, other):
        return isinstance(other, Concept) and self.name.lower() == other.name.lower()
    def __hash__(self):
        return hash(self.name.lower())

# One critic verdict; ``concept_moderator`` pairs it with a concept through ``concept_name``.
class ConceptCritique(BaseModel):
    """Critique for a specific concept."""
    concept_name: str
    is_valid: bool = Field(description="Is this a valid educational concept (not a stopword, generic term, or proper noun unrelated to the topic)?")
    reasoning: str = Field(description="Why valid or invalid? Mention redundancy if it exists.")

# Moderator verdict for one concept, consumed by ``concept_moderator``.
class ConceptModerationDecision(BaseModel):
    concept_name: str
    # 'drop' discards the concept; 'rename' replaces its name with ``new_name`` when one is given.
    action: Literal['keep', 'drop', 'rename']
    # NOTE(review): Optional but declared without a default - under pydantic v2 the field
    # must still be present (it may be None); confirm that is intended.
    new_name: Optional[str] = Field(description="If action is rename, provide the new name here.")
    reason: str

# Single-field wrappers used as ``with_structured_output`` schemas by the concept agents,
# so that one model call returns a whole list.
class ConceptList(BaseModel):
    concepts: List[Concept]

class ConceptCritiqueList(BaseModel):
    critiques: List[ConceptCritique]

class ConceptModerationList(BaseModel):
    decisions: List[ConceptModerationDecision]

In [126]:
# Agent-phase relation schema: a directed 'prerequisite' edge between two concept names.
class Relation(BaseModel):
    source: str
    target: str
    relation_type: Literal['prerequisite']
    context: Optional[str]

    # Two relations are the same edge when source and target match case-insensitively;
    # ``relation_type`` and ``context`` do not take part in the comparison.
    def __eq__(self, other):
        return (isinstance(other, Relation) and
                self.source.lower() == other.source.lower() and
                self.target.lower() == other.target.lower())

    # Defined together with ``__eq__`` (as ``Concept`` does) so equal relations hash equally
    # and instances can be used in sets and as dict keys.
    def __hash__(self):
        return hash((self.source.lower(), self.target.lower()))

# One critic verdict; ``relation_moderator`` pairs it with a relation through (source, target).
class RelationCritique(BaseModel):
    source: str
    target: str
    is_valid: bool
    reasoning: str

# Moderator verdict for one relation; only 'approved' decisions are kept by ``relation_moderator``.
class RelationModerationDecision(BaseModel):
    source: str
    target: str
    status: Literal['approved', 'rejected']
    comments: str

# Single-field wrappers used as ``with_structured_output`` schemas by the relation agents,
# so that one model call returns a whole list.
class RelationList(BaseModel):
    relations: List[Relation]

class RelationCritiqueList(BaseModel):
    critiques: List[RelationCritique]

class RelationModerationList(BaseModel):
    decisions: List[RelationModerationDecision]
    
# Used both as the accumulated knowledge base read by the delta filters and as the
# structured-output schema of ``consensus_agent``.
class KnowledgeGraph(BaseModel):
    """The final, polished output from the Consensus Agent."""
    concepts: List[Concept]
    relations: List[Relation]

In [151]:
from typing_extensions import TypedDict
from typing import Annotated
import operator

class AgentState(TypedDict):
    """
    Shared state passed between the nodes of the extraction graph.

    Each node reads the keys it needs and returns a dict with the keys it updates,
    so every agent's intermediate result stays available to the nodes after it.
    """
    text_segment: str                 # source text analysed by the agents
    knowledge_base: KnowledgeGraph    # already-accepted graph; read by both delta filters
    
    # Concepts
    extracted_concepts: List[Concept] # raw extraction (written by concept_extractor)

    # Delta logic (written by concept_delta_filter)
    new_concepts: List[Concept]       # not yet in the knowledge base
    known_concepts: List[Concept]     # already in the knowledge base
    
    concept_critiques: List[ConceptCritique]  # written by concept_critic
    moderated_new_concepts: List[Concept]     # new concepts kept by concept_moderator

    active_concepts: List[Concept]    # known + kept new concepts; input of relation_proposer

    # Relation
    proposed_relations: List[Relation]  # written by relation_proposer
    
    # Delta logic (written by relation_delta_filter)
    new_relations: List[Relation]
    
    relation_critiques: List[RelationCritique]  # read by relation_moderator
    moderated_new_relations: List[Relation]     # relations approved by relation_moderator
    
    # Graph
    final_graph_update: KnowledgeGraph  # written by consensus_agent
    workflow_trace: Annotated[List[str], operator.add] # Append-only log of visited nodes

### Fase 1 - Extração de Conceitos

**Agente extrator de conceitos**

In [152]:
def concept_extractor(state: AgentState):
    """
    Agent 0: Concept Extractor.

    Sends ``state['text_segment']`` to the module-level ``llm`` with ``ConceptList`` as the
    structured-output schema and returns the concepts it finds as ``extracted_concepts``.
    """
    print("--- Concept Extractor ---")

    system_message = ("system", """
         You are an expert Ontology Engineer. Identify the key educational concepts or technical terms in the text.
         
         ### RULES: 
         1. Ignore general words.
         2. Focus on specific subjects that would be nodes in a knowledge graph.
         3. Concepts must be small words (such as: Linux, Integrals, Differential Equations, Kernels), avoid sentences.
         4. Ignore generic words (e.g., 'Chapter', 'Diagram'). Focus on technical terms.
         """)
    user_message = ("user", "Text: {text}")
    extraction_prompt = ChatPromptTemplate.from_messages([system_message, user_message])

    structured_llm = llm.with_structured_output(ConceptList)
    extraction = (extraction_prompt | structured_llm).invoke({"text": state['text_segment']})

    return {"extracted_concepts": extraction.concepts, "workflow_trace": ["concept_extractor"]}


**Agente crítico de conceitos**

In [153]:
def concept_delta_filter(state: AgentState):
    """
    [LOGIC NODE] Split the extracted concepts into 'Known' and 'New'.

    A concept is 'Known' when its lower-cased name already appears among the
    knowledge-base concepts; every other concept is 'New'. Input order is kept.
    """
    print("--- [1.5] Concept Delta Filter ---")
    registry_names = {entry.name.lower() for entry in state['knowledge_base'].concepts}

    # One pass over the extraction; the boolean membership test selects the bucket.
    buckets = {True: [], False: []}
    for candidate in state['extracted_concepts']:
        buckets[candidate.name.lower() in registry_names].append(candidate)
    known, new = buckets[True], buckets[False]

    print(f"  > Known (Skipping validation): {[c.name for c in known]}")
    print(f"  > New (Sending to Critic): {[c.name for c in new]}")

    return {
        "known_concepts": known,
        "new_concepts": new,
        "workflow_trace": ["concept_delta_filter"]
    }

In [154]:
def concept_critic(state: AgentState):
    """
    Agent 2: Concept Critic.

    Asks the module-level ``llm`` to critique the names of the new concepts and returns
    the critiques as ``concept_critiques``. With no new concepts the model is not called.
    """
    print("--- [2] Concept Critic ---")
    candidates = state['new_concepts']

    # Nothing new to judge: report an empty critique list.
    if not candidates:
        return {"concept_critiques": [], "workflow_trace": ["concept_critic"]}

    names_csv = ", ".join(candidate.name for candidate in candidates)
    critic_prompt = ChatPromptTemplate.from_messages([
        ("system", "Critique these concepts. Mark as INVALID if they are: Too generic (e.g. 'Solution'), Verbs, or duplicates."),
        ("user", "Concepts: {c_list}\n\nCritique each:")
    ])

    structured_llm = llm.with_structured_output(ConceptCritiqueList)
    verdicts = (critic_prompt | structured_llm).invoke({"c_list": names_csv})

    return {"concept_critiques": verdicts.critiques, "workflow_trace": ["concept_critic"]}

**Agente moderador de conceitos**

In [155]:
def concept_moderator(state: AgentState):
    """
    Agent 3: Concept Moderator.

    Presents each new concept together with its critique to the module-level ``llm`` and
    applies the returned decisions: 'drop' discards the concept, 'rename' changes its name,
    'keep' leaves it as is. Concepts that receive no decision are not kept.

    Returns ``moderated_new_concepts`` (the kept new concepts) and ``active_concepts``
    (known concepts followed by the kept new ones).
    """
    print("--- [3] Concept Moderator ---")
    new_concepts = state['new_concepts']
    critiques = state['concept_critiques']

    if not new_concepts:
        return {"moderated_new_concepts": [], "active_concepts": state['known_concepts'], "workflow_trace": ["concept_moderator"]}

    # Names are matched case-insensitively, like Concept.__eq__ and the delta filter do,
    # because the model may echo a name with different capitalisation.
    # ``setdefault`` keeps the first entry when a name occurs more than once.
    critique_by_name = {}
    for crit in critiques:
        critique_by_name.setdefault(crit.concept_name.lower(), crit)

    # Prepare Case File
    cases = []
    for c in new_concepts:
        crit = critique_by_name.get(c.name.lower())
        crit_text = f"Valid: {crit.is_valid}, Reason: {crit.reasoning}" if crit else "No critique."
        cases.append(f"CONCEPT: {c.name}\nCRITIC: {crit_text}")

    cases_str = "\n".join(cases)

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are the Concept Judge. Decide to 'keep', 'drop', or 'rename' concepts based on critiques."),
        ("user", "<CONCEPTS>:\n{cases}")
    ])

    chain = prompt | llm.with_structured_output(ConceptModerationList)
    result = chain.invoke({"cases": cases_str})

    # Index the concepts under their original names before any rename is applied.
    concept_by_name = {}
    for c in new_concepts:
        concept_by_name.setdefault(c.name.lower(), c)

    # Filter and Rename
    approved_new = []
    approved_ids = set()
    for decision in result.decisions:
        if decision.action == 'drop':
            continue
        orig = concept_by_name.get(decision.concept_name.lower())
        # Skip decisions about unknown names, and repeated decisions for a concept already kept.
        if orig is None or id(orig) in approved_ids:
            continue
        if decision.action == 'rename' and decision.new_name:
            orig.name = decision.new_name
        approved_ids.add(id(orig))
        approved_new.append(orig)

    active = state['known_concepts'] + approved_new

    return {
        "moderated_new_concepts": approved_new,
        "active_concepts": active,
        "workflow_trace": ["concept_moderator"]
    }

### Fase 2 - Extração de relações

**Agente proponente**

In [156]:
def relation_proposer(state: AgentState):
    """
    Agent 4: Relation Proposer.

    Asks the module-level ``llm`` for 'prerequisite' relations among the active concepts,
    based on the text segment, and returns them as ``proposed_relations``.
    With no active concepts the model is not called.
    """
    print("--- [4] Relation Proposer ---")
    segment = state['text_segment']
    concepts_in_play = state['active_concepts']

    # Without concepts there is nothing to relate.
    if not concepts_in_play:
        return {"proposed_relations": [], "workflow_trace": ["relation_proposer"]}

    names_csv = ", ".join(concept.name for concept in concepts_in_play)

    system_message = ("system", """
         You are an expert Knowledge Graph engineer. 
         Identify 'prerequisite' relations between the following <KNOWN CONCEPTS> based strictly on the provided <TEXT>.
         Make sure these relations are essential (you need to know the prerequisite to understand the concept).
         
         ### RULES
         1. If target concept (the prerequisite) is being presented within the <TEXT> for the first time.
         It MUST COME before the source concept
         """)
    user_message = ("user", "<KNOWN CONCEPTS>: {c_names}\n\n<TEXT>: {text}")
    proposer_prompt = ChatPromptTemplate.from_messages([system_message, user_message])

    # The RelationList schema forces the reply into a list of Relation objects.
    structured_llm = llm.with_structured_output(RelationList)
    proposal = (proposer_prompt | structured_llm).invoke({"c_names": names_csv, "text": segment})

    return {
        "proposed_relations": proposal.relations,
        "workflow_trace": ["relation_proposer"]
    }

**Agente crítico**

In [157]:
def relation_delta_filter(state: AgentState):
    """
    [LOGIC NODE] Keep only the proposed relations that are not in the knowledge base yet.

    Two relations count as the same when their lower-cased (source, target) pairs match.
    """
    print("--- [4.5] Relation Delta Filter ---")
    candidates = state['proposed_relations']
    existing_pairs = {
        (known.source.lower(), known.target.lower())
        for known in state['knowledge_base'].relations
    }

    fresh = []
    for candidate in candidates:
        pair = (candidate.source.lower(), candidate.target.lower())
        if pair not in existing_pairs:
            fresh.append(candidate)
            continue
        print(f"  [SKIP] Relation {candidate.source}->{candidate.target} already exists.")

    return {"new_relations": fresh, "workflow_trace": ["relation_delta_filter"]}

In [None]:
def relation_critic(state: AgentState):
    """
    Agent 5: Critic
    Checks for inconsistency, redundancy, or hallucinations in the proposed relations.

    Reads ``new_relations`` and ``text_segment`` and returns one critique per relation
    under ``relation_critiques``. With no new relations the model is not called.
    """
    print("--- Critic Agent ---")

    new_r = state['new_relations']
    if not new_r:
        return {"relation_critiques": [], "workflow_trace": ["relation_critic"]}

    # Format proposed relations for the prompt
    proposed_str = "\n".join([f"- {r.source} -> {r.target} ({r.relation_type}): {r.context}" for r in new_r])

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a harsh data auditor. specific <PROPOSED RELATIONS>. Check if they are supported by the text and logically sound. Look for redundancy (A->B and A->B) or cycles (A->B and B->A) which are bad for prerequisite trees."),
        ("user", "Text: {text}\n\n<PROPOSED RELATIONS>:\n\n{proposed_str}\n\nProvide a critique for EACH relation.")
    ])

    chain = prompt | llm.with_structured_output(RelationCritiqueList)
    result = chain.invoke({"text": state['text_segment'], "proposed_str": proposed_str})

    # The key must be ``relation_critiques``: that is the AgentState field relation_moderator reads.
    # Returning the list under ``critiques`` meant the moderator never received it.
    return {
        "relation_critiques": result.critiques,
        "workflow_trace": ["relation_critic"]
    }


**Agente moderador**

In [159]:
def relation_moderator(state: AgentState):
    """
    Agent 6: Relation Moderator.

    Presents each new relation together with its critique to the module-level ``llm`` and
    returns the relations whose decision is 'approved' as ``moderated_new_relations``.
    With no new relations the model is not called.
    """
    print("--- [6] Relation Moderator ---")
    new_r = state['new_relations']
    critiques = state['relation_critiques']

    if not new_r:
        return {"moderated_new_relations": [], "workflow_trace": ["relation_moderator"]}

    def _pair(item):
        # Case-insensitive (source, target) key, matching Relation.__eq__ and the delta filter.
        return (item.source.lower(), item.target.lower())

    # ``setdefault`` keeps the first critique when a pair is critiqued more than once.
    critique_by_pair = {}
    for crit in critiques:
        critique_by_pair.setdefault(_pair(crit), crit)

    cases = []
    for r in new_r:
        c = critique_by_pair.get(_pair(r))
        cases.append(f"REL: {r.source}->{r.target}, VALID: {c.is_valid if c else '?'}, REASON: {c.reasoning if c else ''}")

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are the Moderator. Your job is to resolve conflicts between a Proposer and a Critic regarding knowledge graph relations."),
        ("user", "Original Text: {text}\n\nCASES TO JUDGE:\n{cases}\n\nFor each case, decide if the relation should be KEPT (approved) or DROPPED (rejected). If the Critic found a valid error, reject it. If the Critic is being too pedantic but the relation is useful, approve it.")
    ])

    # We expect a list of decisions
    chain = prompt | llm.with_structured_output(RelationModerationList)
    # Every case gets its own "- " bullet (joining with '\n- ' left the first case without one).
    cases_str = "\n".join(f"- {case}" for case in cases)
    result = chain.invoke({"text": state['text_segment'], "cases": cases_str})

    relation_by_pair = {}
    for r in new_r:
        relation_by_pair.setdefault(_pair(r), r)

    approved = []
    approved_ids = set()
    for d in result.decisions:
        if d.status != 'approved':
            continue
        orig = relation_by_pair.get(_pair(d))
        # Ignore decisions about unknown pairs and repeated approvals of the same relation.
        if orig is None or id(orig) in approved_ids:
            continue
        approved_ids.add(id(orig))
        approved.append(orig)

    return {"moderated_new_relations": approved, "workflow_trace": ["relation_moderator"]}

**Agente de consenso**

In [165]:
def consensus_agent(state: AgentState):
    """
    Agent 7: Consensus (Refinement & Polishing)
    Ensures high quality by merging duplicates and polishing descriptions/contexts.

    Reads the moderated concepts and relations plus the text segment, asks the module-level
    ``llm`` for a polished ``KnowledgeGraph`` and returns it as ``final_graph_update``.
    When there is nothing to polish an empty graph is returned without calling the model.
    """
    print("--- [7] Consensus Agent ---")

    # We take the output from the moderators
    m_concepts = state['moderated_new_concepts']
    m_relations = state['moderated_new_relations']
    text = state['text_segment']

    # Skip the model only when BOTH lists are empty: approved relations between already-known
    # concepts must still reach the final graph. The result is written to ``final_graph_update``,
    # the field declared in AgentState (``final_concepts``/``final_relations`` are not state keys).
    if not m_concepts and not m_relations:
        return {
            "final_graph_update": KnowledgeGraph(concepts=[], relations=[]),
            "workflow_trace": ["consensus"]
        }

    # Convert to string for the prompt
    c_str = "\n".join([f"- {c.name}: {c.description}" for c in m_concepts])
    r_str = "\n".join([f"- {r.source} -> {r.target} (Context: {r.context})" for r in m_relations])

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are the Final Quality Assurance Board. Your goal is to produce a pristine Knowledge Graph.
        
        1. Review the 'Approved Concepts': 
           - Merge synonyms (e.g., 'ANN' and 'Artificial Neural Network').
           - Ensure descriptions are concise and high-quality.
        
        2. Review the 'Approved Relations': 
           - Ensure the context explains *why* A -> B. 
           - If the context is weak, improve it using the source text.
           - Ensure the source/target names match the final concept names exactly.

        3. Output the final, polished list of concepts and relations."""),
        ("user", "Original Text: {text}\n\nDraft Concepts:\n{c_str}\n\nDraft Relations:\n{r_str}\n\nProduce Final Graph:")
    ])

    # We expect a combined object containing both refined lists
    chain = prompt | llm.with_structured_output(KnowledgeGraph)
    result = chain.invoke({"text": text, "c_str": c_str, "r_str": r_str})

    return {
        "final_graph_update": result,
        "workflow_trace": ["consensus"]
    }

**Agente auditor**

In [166]:
def auditor_node(state: AgentState):
    """
    Agent 8: Auditor.

    Confirms that the consensus node ran by looking for "consensus" in ``workflow_trace``.

    :raises ValueError: when "consensus" is missing from the trace.
    """
    print("--- [8] Auditor ---")
    reached_consensus = "consensus" in state['workflow_trace']
    if reached_consensus:
        print("  [AUDIT PASSED] Quality Assurance complete.")
        return {"workflow_trace": ["auditor"]}
    raise ValueError("Pipeline failed to reach Consensus.")


Workflow final

In [None]:
from langgraph.graph import StateGraph, START, END

workflow = StateGraph(AgentState)

# Nodes in execution order; the two delta filters are plain logic, the rest call the model.
_PIPELINE_NODES = (
    ("concept_extractor", concept_extractor),
    ("concept_delta_filter", concept_delta_filter),
    ("concept_critic", concept_critic),
    ("concept_moderator", concept_moderator),
    ("relation_proposer", relation_proposer),
    ("relation_delta_filter", relation_delta_filter),
    ("relation_critic", relation_critic),
    ("relation_moderator", relation_moderator),
    ("consensus", consensus_agent),
    ("auditor", auditor_node),
)
for _node_name, _node_fn in _PIPELINE_NODES:
    workflow.add_node(_node_name, _node_fn)

# The flow is strictly linear: START -> each node in order -> END.
_route = [START, *(_name for _name, _fn in _PIPELINE_NODES), END]
for _src, _dst in zip(_route, _route[1:]):
    workflow.add_edge(_src, _dst)

app = workflow.compile()

In [168]:
from langsmith import traceable

@traceable(run_type="chain", name="Chapter run")
def invoke_extractor(initial_state: AgentState):
    """Run the compiled graph ``app`` on ``initial_state`` and return the state it produces."""
    final_state = app.invoke(initial_state)
    return final_state

In [None]:
# Smoke test: push one short paragraph through the whole workflow.
sample_text = """
To understand Backpropagation, one must first grasp the Chain Rule of calculus. 
However, before diving into calculus, a basic understanding of Functions is required.
Inductive Logic is unrelated to this specific derivation tree.
"""

known_concepts = ["Backpropagation", "Chain Rule", "Functions", "Inductive Logic"]

global_kb = KnowledgeGraph(concepts=[], relations=[])

# Intermediate fields that must be present and start out as (distinct) empty lists.
_empty_list_fields = (
    "extracted_concepts", "new_concepts", "known_concepts",
    "concept_critiques", "moderated_new_concepts", "active_concepts",
    "proposed_relations", "new_relations", "relation_critiques",
    "moderated_new_relations",
)
initial_state = {
    "text_segment": sample_text,
    "knowledge_base": global_kb,
    **{_field: [] for _field in _empty_list_fields},
    "final_graph_update": KnowledgeGraph(concepts=[], relations=[]),
    "workflow_trace": [],
}


final_state = invoke_extractor(initial_state)

# Show the resulting graph update: concepts first, then relations.
for _concept in final_state['final_graph_update'].concepts:
    print(_concept)

print('\n')

for _relation in final_state['final_graph_update'].relations:
    print(_relation)

name='Backpropagation' description='An algorithm for computing gradients of loss functions with respect to parameters in neural networks using reverse-mode automatic differentiation.'
name='Chain Rule' description='A rule in calculus for computing the derivative of a composite function.'
name='Calculus' description='A branch of mathematics focused on limits, derivatives, integrals, and infinite series.'
name='Functions' description='Mathematical mappings that assign each element of a domain to exactly one element of a codomain.'
name='Inductive Logic' description='A branch of logic concerned with reasoning from specific observations to general conclusions or theories.'


source='Chain Rule' target='Backpropagation' relation_type='prerequisite' context='Understanding the Chain Rule is required to follow how backpropagation computes derivatives through composed functions in a neural network.'
source='Functions' target='Calculus' relation_type='prerequisite' context='A basic understanding

In [28]:
from langchain.agents import create_agent
from langchain_core.messages import SystemMessage, HumanMessage

# System prompt for the validator agent, which reviews candidate prerequisite relations.
# NOTE(review): the acceptance-criteria list holds a single item and the wording has
# a few typos; the text is sent to the model verbatim, so it is left untouched here.
prompt = SystemMessage("""You are a Strict Quality Control Agent for an Educational Knowledge Graph.
    
    Your Goal: Review the 'Prerequisite Relations' extracted from a text'.
    Filter out noise to ensure high-quality graph nodes.
    
    **ACCEPTANCE CRITERIA**:
    1. **Correlation**: The concepts must be strong related, such that someone would struggle to learn the source
    concept without understand the target  
    """)

# Agent with no tools and no middleware, running on Models.GPT5_1; its answer is
# requested in the ValidationResult shape via response_format.
validator_agent = create_agent(
    name="ValidatorAgent",
    model=get_llm(Models.GPT5_1),
    middleware=[],
    tools=[],
    system_prompt=prompt,
    response_format=ValidationResult
)

def invoke_validator(relations: List[Relation]) -> ValidationResult:
    """Ask the validator agent to review a list of candidate prerequisite relations.

    Each relation is rendered as ``source-[PREREQUISITE]->target`` (one per line)
    and sent to ``validator_agent`` as a single human message.

    Args:
        relations: Candidate relations; only ``source`` and ``target`` are read.

    Returns:
        The value stored under ``'structured_response'`` in the state returned by
        the agent (requested as a ``ValidationResult`` through ``response_format``).
    """
    relations_str = "\n".join(f"{rel.source}-[PREREQUISITE]->{rel.target}" for rel in relations)
    # The agent is invoked with a state dict holding a "messages" list, not a bare HumanMessage.
    # What comes back is the agent state (a mapping), not the ValidationResult itself.
    agent_state = validator_agent.invoke(input={
        "messages": [HumanMessage(content=f"Candidate relations: {relations_str}")]
    })
    return agent_state['structured_response']
        

In [29]:
from langchain.agents.middleware import SummarizationMiddleware

# System prompt for the creator agent, which proposes prerequisite relations.
# NOTE(review): the tags named here (<CONTENT>, <CURRENT CONCEPTS>,
# <PREVIOUSLY LEARNED CONCEPTS>) are expected to match the ones written into the
# human message by invoke_creator.
prompt = SystemMessage("""You are a Knowledge Graph Architect. 
    Your goal is to identify the fundamental **Prerequisite Relations** within <CONTENT>.
    You can create between <CURRENT CONCEPTS> and <PREVIOUSLY LEARNED CONCEPTS>.
    
    ### RULES:
    1. If you are creating a prerequisite relation between current concepts, the
    target must have been taught before source.
    
    ### YOUR TASK:
    1. Check the relations:
    - **prerequisite**: Does the current concept require knowing a concept from the PREVIOUSLY LEARNED CONCEPTS list? Answer yes/no and which one.
""")

# Tool-less agent on Models.GPT5_1; its answer is requested in the ConceptAnalysis
# shape via response_format.
creator_agent = create_agent(
    name="CreatorAgent",
    model=get_llm(Models.GPT5_1),
    tools=[],
    # Summarization middleware is currently disabled:
    # middleware=[SummarizationMiddleware(
    #         model=Models.GPT5_1,
    #         trigger=("tokens", 10),
    #         keep=("messages", 20),)],
    system_prompt=prompt,
    response_format=ConceptAnalysis
)

def invoke_creator(current_concepts_str: str, previous_concepts_str: str, text_content: str) -> ConceptAnalysis:
    """Ask the creator agent to propose prerequisite relations for a piece of content.

    The three inputs are packed into one human message under the tags the creator's
    system prompt refers to: ``<CURRENT CONCEPTS>``, ``<PREVIOUSLY LEARNED CONCEPTS>``
    and ``<CONTENT>``.

    Args:
        current_concepts_str: Concepts introduced by the content, as text.
        previous_concepts_str: Concepts learned earlier, as text.
        text_content: The content to analyze.

    Returns:
        The value stored under ``'structured_response'`` in the state returned by
        the agent (requested as a ``ConceptAnalysis`` through ``response_format``).
    """
    # The tag must read PREVIOUSLY (not PREVIOUS) so it matches the name used in the system prompt.
    human_message = HumanMessage(content=f"""
        <CURRENT CONCEPTS>:\n{current_concepts_str}\n\n<PREVIOUSLY LEARNED CONCEPTS>:\n{previous_concepts_str}\n\n<CONTENT>:\n{text_content}""")
    agent_state = creator_agent.invoke(input={"messages": [human_message]})
    return agent_state['structured_response']
    

In [31]:
def run_creator_then_validator(current_concepts_str, previous_concepts_str, text_content):
    """Chain the two agents: the creator proposes relations, the validator reviews them.

    Returns the validator's result for the relations found in the creator's analysis.
    """
    proposal: ConceptAnalysis = invoke_creator(current_concepts_str, previous_concepts_str, text_content)
    return invoke_validator(proposal.relations)

In [30]:
# Hand-written fixture for trying the creator -> validator chain on a tiny example.
# NOTE(review): page_start is commented out in the Concept model at the top of the
# file; presumably it is ignored as an extra field here — confirm.
concepts_test = [
    Concept(concept_name="File Permissions", chapter=[1], description="", page_start=10),
    Concept(concept_name="Linux File System", chapter=[2], description="", page_start=20),
]

# Concept lists are handed to the agents as plain comma-separated text.
previous_concepts_str = "Binary Notation, Kernel, Operating System, Drivers, Shell"
current_concepts_str = "File Permissions, Linux File System"

text_content = """In this chapter, we explore the Linux File System and its permissions model. 
Understanding file permissions is crucial for managing access to files and directories in a multi-user environment.
File permissions in Linux are defined using a combination of read, write, and execute flags for the owner, group, and others."""

# Disabled so the cell can run without calling the agents; uncomment to try the chain.
# run_creator_then_validator(current_concepts_str, previous_concepts_str, text_content)

In [80]:
# Locations of the serialized graph files, rooted at base_output_dir.
ROOT_DIRECTORY = base_output_dir
NODES_XML = ROOT_DIRECTORY / 'nodes.xml'
RELATIONS_XML = ROOT_DIRECTORY / 'relations.xml'

In [211]:
def build(concepts: List[Concept], kg: KnowledgeGraph = None, start_chapter = 0, chapters_to_extract = 1):
    """
    Build the graph containing the initial concepts, new concepts learned and the relation
    between them.

    Every concept whose ``chapter`` has length 1 is treated as a top-level chapter; its
    normalized name is the folder (under ``base_output_dir``) whose ``document.md`` is fed
    to the extraction pipeline. Only the chapters in the window
    ``[start_chapter, start_chapter + chapters_to_extract)`` are processed. Each chapter's
    delta is appended to the knowledge graph before the next chapter runs, and that same
    graph object is handed to the pipeline as ``knowledge_base``.

    Args:
        concepts: Table-of-contents concepts, in order.
        kg: Knowledge graph to extend. It is mutated in place; when ``None`` a new empty
            graph is created.
        start_chapter: Zero-based index of the first top-level chapter to process.
        chapters_to_extract: Number of consecutive top-level chapters to process.

    Returns:
        The knowledge graph (``kg`` itself when one was passed) with every delta merged in.
    """
    global_kb = kg if kg is not None else KnowledgeGraph(concepts=[], relations=[])

    output_dir = base_output_dir

    # One (folder, 1-based position) pair per top-level chapter, in table-of-contents order.
    # Sub-chapter concepts are skipped on purpose: only the chapter folder is ever read, and
    # tracking them used to raise KeyError when one appeared before the first chapter.
    chapter_folders = []
    for concept in concepts:
        if len(concept.chapter) == 1:
            chapter_folders.append((normalize_filename(concept.concept_name), len(chapter_folders) + 1))

    files_to_extract = chapter_folders[start_chapter : start_chapter + chapters_to_extract]

    for folder, idx in tqdm_notebook(files_to_extract, desc="Creating relations and new nodes..."):

        # Reading content
        # NOTE(review): chapters whose titles normalize to the same name resolve to the same
        # folder, so the same document.md would be analyzed more than once — confirm layout.
        file_path = os.path.join(output_dir, folder, "document.md")
        if not os.path.exists(file_path):
            # Best-effort: a chapter without a document is skipped, but visibly.
            print(f"Skipping {folder}: {file_path} not found")
            continue

        print(f"Analyzing: {folder}")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Calling the pipeline
        state_input = {
            "text_segment": content,
            "knowledge_base": global_kb,
            # Init empty lists for required fields
            "extracted_concepts": [], "new_concepts": [], "known_concepts": [],
            "concept_critiques": [], "moderated_new_concepts": [], "active_concepts": [],
            "proposed_relations": [], "new_relations": [], "relation_critiques": [],
            "moderated_new_relations": [],
            "final_graph_update": KnowledgeGraph(concepts=[], relations=[]),
            "workflow_trace": []
        }

        final_state: AgentState = invoke_extractor(state_input, langsmith_extra={"name": folder})

        # EXTRACT DELTA
        delta = final_state['final_graph_update']

        # NOTE(review): idx is already 1-based, so this label is position + 1; kept as-is
        # to preserve the existing log output — confirm the intended numbering.
        print(f"--- Chapter {idx+1} Delta ---")
        print(f"New Concepts: {len(delta.concepts)}")
        print(f"New Relations: {len(delta.relations)}")

        # UPDATE GLOBAL MEMORY (Manual Merge)
        global_kb.concepts.extend(delta.concepts)
        global_kb.relations.extend(delta.relations)

        print("--- Global KB Status ---")
        print(f"Total Concepts: {len(global_kb.concepts)}")
        print(f"Total Relations: {len(global_kb.relations)}")

    return global_kb


In [212]:
# Extend the existing graph `res` with 10 chapters, starting at chapter index 4.
res = build(toc_concepts, res ,4, 10)

Creating relations and new nodes...:   0%|          | 0/10 [00:00<?, ?it/s]

Analyzing: evolucao_do_linux_distribuicoes
--- Concept Extractor ---


2025-12-14 17:50:26,951 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['GNU/Linux', 'Kernel Linux', 'Distribuição Linux', 'GPL', 'Red Hat Enterprise Linux', 'Ubuntu', 'Linux Mint', 'Fedora', 'openSUSE', 'Android', 'DevOps']
  > New (Sending to Critic): ['Live distribution', 'From scratch distribution', 'Distribuição derivada', 'Rolling release', 'Servidor', 'Desktop', 'RPM', 'DEB', 'Pacman', 'YUM', 'AUR', 'Red Hat Linux', 'Fedora Project', 'CentOS', 'CentOS Stream', 'CentOS Upstream', 'Oracle Linux', 'Ksplice', 'DTrace', 'Slackware Linux', 'Debian GNU/Linux', 'Ubuntu LTS', 'Canonical', 'Cinnamon', 'GNOME', 'KDE', 'Xfce', 'Kubuntu', 'Xubuntu', 'Lubuntu', 'Arch Linux', 'Manjaro Linux', 'SUSE Linux Enterprise', 'YaST', 'Knoppix', 'Gentoo Linux', 'Portage', 'Emerge', 'Metadistribuição', 'MPlayer', 'Amarok', 'OpenOffice', 'RPMFusion', 'EPEL', 'Sistema embarcado', 'Software embarcado', 'Internet das Coisas', 'Raspbian', 'Android Things', 'Debian Tinker', 'OpenWrt', 'Tizen', 'Ubuntu Core', 'Yoc

2025-12-14 17:50:53,695 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 17:51:18,868 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 17:51:53,887 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
--- Critic Agent ---


2025-12-14 17:53:57,073 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 17:55:00,662 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 17:57:21,965 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 6 Delta ---
New Concepts: 90
New Relations: 97
--- Global KB Status ---
Total Concepts: 200
Total Relations: 211
Analyzing: conhecendo_o_linux
--- Concept Extractor ---


2025-12-14 17:58:45,428 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['GNU/Linux', 'Unix', 'Unix-like', 'sistema operacional', 'kernel', 'multitarefa', 'multiusuário', 'software livre', 'open source', 'GIMP', 'KDE', 'Amarok', 'Docker', 'GNOME', 'xfce']
  > New (Sending to Critic): ['MAC OS X', 'arquivo', 'diretório', 'arquivo binário', 'arquivo de texto', 'arquivo especial', 'dispositivo de armazenamento', 'memória física', 'porta serial', 'porta paralela', 'case-sensitive', 'permissão de execução', 'shell script', 'arquivo oculto', 'ls', 'processo', 'superusuário', 'root', 'UID', 'Firefox', 'Chrome', 'Vivaldi', 'Opera', 'Thunderbird', 'SMTP', 'POP3', 'IMAP', 'S/MIME', 'criptografia de mensagens', 'certificado digital', 'desenho vetorial', 'Inkscape', 'SVG', 'DTD', 'XML', 'PNG', 'TIFF', 'GIF', 'JPG', 'AI', 'PDF', 'PS', 'Kdenlive', 'MLT', 'GNU General Public License', 'VirtualBox', 'VMware Workstation', 'virtualização', 'sistema operacional hospedeiro', 'sistema operacional convidado', '

2025-12-14 17:59:44,206 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:00:34,895 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:01:13,449 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
  [SKIP] Relation sistema operacional->kernel already exists.
--- Critic Agent ---


2025-12-14 18:03:19,876 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:04:22,017 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:05:54,672 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 7 Delta ---
New Concepts: 148
New Relations: 113
--- Global KB Status ---
Total Concepts: 348
Total Relations: 324
Analyzing: topicos_para_revisao_do_capitulo
--- Concept Extractor ---


2025-12-14 18:05:59,839 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Linux']
  > New (Sending to Critic): ['FHS', 'touch', 'file', 'mkdir', 'rm', 'find', 'xargs', 'locate', 'updatedb']
--- [2] Concept Critic ---


2025-12-14 18:06:06,565 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:06:13,429 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:06:15,618 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
--- Critic Agent ---


2025-12-14 18:06:22,848 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:06:26,944 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:06:32,680 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 8 Delta ---
New Concepts: 4
New Relations: 3
--- Global KB Status ---
Total Concepts: 352
Total Relations: 327
Analyzing: estrutura_do_sistema_operacional
--- Concept Extractor ---


2025-12-14 18:06:52,441 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['GNU/Linux', 'Sistema operacional', 'Kernel Linux', 'Bash', 'Multiusuário', 'Multitarefa', 'Unix', 'Debian GNU/Linux', 'Gerenciador de janelas', 'KDE', 'GNOME', 'Case-sensitive', 'Root']
  > New (Sending to Critic): ['Camadas', 'Desktop environment', 'Display manager', 'ttyn', 'Hardware', 'Sessão', 'Login', 'Terminal virtual', 'Console', 'rlogin', 'ssh', 'rsh', 'rdesktop', 'telnet', 'Pseudoterminal', 'Konsole', 'Shell', 'Prompt', 'sudo', 'su', 'whoami', 'who am i', 'Variáveis de ambiente', 'Logout', 'shutdown', 'halt', 'poweroff', 'init 0', 'Sistema de arquivos', 'Nobreak', 'reboot', 'CTRL+ALT+DEL']
--- [2] Concept Critic ---


2025-12-14 18:07:11,282 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:07:42,208 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:08:09,445 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
--- Critic Agent ---


2025-12-14 18:09:05,458 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:09:38,004 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:10:29,952 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 9 Delta ---
New Concepts: 44
New Relations: 50
--- Global KB Status ---
Total Concepts: 396
Total Relations: 377
Analyzing: o_que_e_um_shell
--- Concept Extractor ---


2025-12-14 18:10:35,267 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Shell', 'GNU/Linux', 'bash', 'UNIX', 'Shell script']
  > New (Sending to Critic): ['Terminal de comandos', 'Bourne shell', 'sh', 'csh', 'tcsh', 'ksh', 'zsh', 'Shell de login', 'chsh']
--- [2] Concept Critic ---


2025-12-14 18:10:40,695 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:10:47,152 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:10:56,569 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
--- Critic Agent ---


2025-12-14 18:11:20,942 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:11:30,262 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:11:44,706 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 10 Delta ---
New Concepts: 14
New Relations: 15
--- Global KB Status ---
Total Concepts: 410
Total Relations: 392
Analyzing: variaveis
--- Concept Extractor ---


2025-12-14 18:12:00,056 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Shell', 'Variáveis de ambiente', 'SHELL', 'bash']
  > New (Sending to Critic): ['Variáveis', 'Variáveis locais', 'HOME', 'HOSTTYPE', 'TERM', 'USER', 'PATH', 'PS1', 'PS2', 'MAIL', 'LOGNAME', 'OSTYPE', 'echo', 'export', 'set', 'env', 'printenv', 'unset', '/etc/profile', '/etc/environment', '~/.bashrc', '~/.bash_profile', '~/.bash_login', '~/.profile']
--- [2] Concept Critic ---


2025-12-14 18:12:10,611 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:12:23,825 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:12:41,028 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
  [SKIP] Relation bash->Shell already exists.
--- Critic Agent ---


2025-12-14 18:13:36,229 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:14:03,570 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:14:41,871 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 11 Delta ---
New Concepts: 26
New Relations: 26
--- Global KB Status ---
Total Concepts: 436
Total Relations: 418
Analyzing: arquivos_de_configuracao_do_shell
--- Concept Extractor ---


2025-12-14 18:14:53,136 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['bash', '/etc/profile', '~/.bash_profile', '~/.bash_login', '~/.profile', '~/.bashrc', '/etc/environment', 'ls']
  > New (Sending to Critic): ['~/.bash_logout', '/etc/bash.bashrc', 'aliases', 'alias', 'unalias', '--color', '-l', '-a', '/etc/issue', '/etc/motd', '/etc/issue.net', '.bash_history', 'HISTFILE', 'HISTSIZE', 'HISTFILESIZE', 'history', 'fc']
--- [2] Concept Critic ---


2025-12-14 18:15:09,112 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:15:21,607 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:15:47,003 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
--- Critic Agent ---


2025-12-14 18:16:18,340 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:16:42,302 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:17:07,804 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 12 Delta ---
New Concepts: 23
New Relations: 23
--- Global KB Status ---
Total Concepts: 459
Total Relations: 441
Analyzing: caminhos_de_diretorios
--- Concept Extractor ---


2025-12-14 18:17:28,490 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Linux', 'ls', 'PATH', 'shell', 'GNU/Linux']
  > New (Sending to Critic): ['linha de comando', 'estrutura de arquivos', 'diretório raiz', 'caminho absoluto', 'caminho relativo', 'diretório corrente', 'diretório pai', 'pwd', 'cd', 'variável de ambiente', '/bin', '/home', '/usr', '/tmp', '/etc', '/dev', '/proc', 'cpuinfo', 'home do usuário', 'curinga', 'asterisco (*)', 'ponto de interrogação (?)', 'expansão de colchetes', 'atalhos de teclado', 'clear', 'exit']
--- [2] Concept Critic ---


2025-12-14 18:17:40,777 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:18:00,848 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:18:18,053 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
  [SKIP] Relation Linux->GNU/Linux already exists.
--- Critic Agent ---


2025-12-14 18:19:00,039 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:19:38,936 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:20:06,811 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 13 Delta ---
New Concepts: 30
New Relations: 34
--- Global KB Status ---
Total Concepts: 489
Total Relations: 475
Analyzing: topicos_para_revisao_do_capitulo
--- Concept Extractor ---


2025-12-14 18:20:13,307 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Linux', 'xargs', 'updatedb']
  > New (Sending to Critic): ['FHS', 'Filesystem Hierarchy Standard', 'touch', 'file', 'mkdir', 'rm', 'find', 'locate']
--- [2] Concept Critic ---


2025-12-14 18:20:22,480 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:20:28,005 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:20:32,409 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
  [SKIP] Relation xargs->Linux already exists.
  [SKIP] Relation updatedb->Linux already exists.
--- Critic Agent ---


2025-12-14 18:20:47,155 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:20:56,877 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:21:09,275 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 14 Delta ---
New Concepts: 8
New Relations: 7
--- Global KB Status ---
Total Concepts: 497
Total Relations: 482
Analyzing: como_obter_ajuda
--- Concept Extractor ---


2025-12-14 18:21:15,114 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [1.5] Concept Delta Filter ---
  > Known (Skipping validation): ['Sistema Operacional', 'GNU/Linux', 'Software Livre']
  > New (Sending to Critic): ['documentação', 'serviços', 'comandos', 'Man Pages']
--- [2] Concept Critic ---


2025-12-14 18:21:20,893 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [3] Concept Moderator ---


2025-12-14 18:21:28,527 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4] Relation Proposer ---


2025-12-14 18:21:36,105 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [4.5] Relation Delta Filter ---
  [SKIP] Relation GNU/Linux->Sistema Operacional already exists.
--- Critic Agent ---


2025-12-14 18:21:47,881 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [6] Relation Moderator ---


2025-12-14 18:21:51,630 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [7] Consensus Agent ---


2025-12-14 18:21:58,224 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


--- [8] Auditor ---
  [AUDIT PASSED] Quality Assurance complete.
--- Chapter 15 Delta ---
New Concepts: 5
New Relations: 4
--- Global KB Status ---
Total Concepts: 502
Total Relations: 486


In [197]:
# Display the concepts currently held by the graph.
res.concepts

[Concept(name='Linux', description='Sistema operacional de código aberto baseado no kernel Linux, amplamente utilizado em servidores, desktops, dispositivos móveis e na nuvem.'),
 Concept(name='Open Source', description='Modelo de desenvolvimento e licenciamento de software em que o código-fonte é aberto para uso, modificação e distribuição.'),
 Concept(name='Android', description='Sistema operacional móvel baseado no kernel Linux, amplamente utilizado em smartphones e outros dispositivos móveis.'),
 Concept(name='Sistema operacional', description='Software básico que gerencia o hardware do computador e fornece serviços para programas aplicativos.'),
 Concept(name='Segurança da informação', description='Área que trata da proteção de dados e sistemas contra acessos não autorizados, uso indevido, falhas e ataques.'),
 Concept(name='Testes de invasão', description='Prática de simular ataques a sistemas de informação para identificar vulnerabilidades de segurança, também conhecida como pen

In [213]:
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
from pathlib import Path

class GraphXMLBuilder:
    """Receives a KnowledgeGraph instance and builds the XML files nodes.xml and relations.xml."""

    def __init__(self, kg: KnowledgeGraph, output_dir: Optional[Path] = None):
        """Remember the graph and work out where the two XML files go.

        Args:
            kg: Graph whose ``concepts`` and ``relations`` are serialized by ``save``.
            output_dir: Directory receiving both files; defaults to the current
                directory. A plain string path is accepted as well.
        """
        self.kg = kg
        # Path(...) so that a str directory also works with the "/" joins below.
        self.output_dir = Path(output_dir) if output_dir else Path('.')
        self.nodes_path = self.output_dir / "nodes.xml"
        self.relations_path = self.output_dir / "relations.xml"

    @staticmethod
    def _node_id(name: str) -> str:
        """Return the XML id for a concept name: spaces become underscores, then lower-case."""
        return name.replace(' ', '_').lower()

    def save(self):
        """Write nodes.xml and relations.xml for the current graph.

        Each concept becomes a ``<node>`` with ``id``, ``name``, ``description`` and
        ``order`` (its position in ``kg.concepts``). Each relation becomes a
        ``<relation>`` with ``type``, ``source`` and ``target`` (both as node ids) and,
        when the relation has a context, a ``<context>`` child holding it.
        The output directory is created if it does not exist yet.
        """
        import xml.etree.ElementTree as ET

        # Build nodes.xml
        root_nodes = ET.Element("nodes")
        for idx, concept in enumerate(self.kg.concepts):
            node = ET.SubElement(root_nodes, "node")
            node.set("id", self._node_id(concept.name))
            node.set("name", concept.name)
            # ElementTree cannot serialize a None attribute value; fall back to "".
            node.set("description", getattr(concept, 'description', '') or '')
            node.set("order", str(idx))

        # Build relations.xml
        root_rels = ET.Element("relations")
        for rel in self.kg.relations:
            rel_elem = ET.SubElement(root_rels, "relation")
            rel_elem.set("type", rel.relation_type)
            # Same id scheme as the nodes, so relations point at existing node ids.
            rel_elem.set("source", self._node_id(rel.source))
            rel_elem.set("target", self._node_id(rel.target))
            if rel.context:
                context_elem = ET.SubElement(rel_elem, "context")
                context_elem.text = rel.context

        # Save pretty XML
        self.output_dir.mkdir(parents=True, exist_ok=True)

        xml_nodes_str = prettify_xml(root_nodes)
        with open(self.nodes_path, "w", encoding="utf-8") as f:
            f.write(xml_nodes_str)

        xml_rels_str = prettify_xml(root_rels)
        with open(self.relations_path, "w", encoding="utf-8") as f:
            f.write(xml_rels_str)

        print(f"Updated {self.nodes_path} and {self.relations_path}")

In [214]:
# Serialize the graph held in `res` to nodes.xml / relations.xml under base_output_dir.
kg_builder = GraphXMLBuilder(res, base_output_dir)
kg_builder.save()

Updated ebooks/LinuxFundamentals/nodes.xml and ebooks/LinuxFundamentals/relations.xml


In [None]:
# Path of the rendered graph page, kept as a plain string.
OUTPUT_HTML = str(base_output_dir / 'graph.html')


In [None]:
import os
import xml.etree.ElementTree as ET

from pyvis.network import Network
import networkx as nx

# Define paths (Update these if your files are in a different directory)
# Stored as plain strings rather than Path objects.
NODES_XML = str(base_output_dir / "nodes.xml")
RELATIONS_XML = str(base_output_dir / "relations.xml")
OUTPUT_HTML = str(base_output_dir / "graph.html")


def get_node_color_by_level(level_str):
    """Map a hierarchy level to a hex color.

    ``level_str`` is converted with ``int()``; when that fails with ValueError or
    TypeError the level falls back to 1. Levels 0-5 have dedicated colors, any
    other level gets the palest green.
    """
    # Indexed by level: Gold (Root), Dark Blue (Chapters), Teal (Sections),
    # Mint (Sub-sections), Light Green, Pale Green.
    shades = ("#f0a202", "#22577a", "#38a3a5", "#57cc99", "#80ed99", "#c7f9cc")

    try:
        depth = int(level_str)
    except (ValueError, TypeError):
        depth = 1

    if 0 <= depth < len(shades):
        return shades[depth]
    # Anything outside the table defaults to the lightest shade.
    return shades[-1]

def create_interactive_graph():
    """Render the node/relation XML files as an interactive PyVis HTML page.

    Reads the module-level ``NODES_XML`` and ``RELATIONS_XML`` files, builds a
    ``networkx.DiGraph`` from them and writes the visualization to
    ``OUTPUT_HTML``. Nodes are coloured by type/level, edges are styled by
    relation type, and each node's size grows linearly with its degree.
    Progress is reported with ``print``.

    Relations whose source or target id is not present in the nodes file are
    not drawn; the number of such relations is printed so the loss is visible.

    Returns:
        None. If either XML file does not exist, a message is printed and
        nothing is written. XML parse errors are not caught here.
    """
    if not os.path.exists(NODES_XML) or not os.path.exists(RELATIONS_XML):
        print(f"❌ XML files not found: Check {NODES_XML} and {RELATIONS_XML}")
        return

    print("📊 Constructing NetworkX Graph...")
    G = nx.DiGraph()

    # 1. Parse Nodes
    # Nodes are added first to establish colors/tooltips; sizes are set in step 3.
    tree_nodes = ET.parse(NODES_XML)
    for node in tree_nodes.findall("node"):
        node_id = node.get("id")
        name = node.get("name")
        node_type = node.get("type", "chapter")
        level = node.get("level", "1")

        # Hover tooltip; extended below with extra details.
        title = f"Type: {node_type}\nLevel: {level}"

        if node_type == "extracted":
            color = "#ffafcc"  # Pink/Pastel for Extracted Concepts
            found_in = node.get("found_in_chapter", "unknown")
            title += f"\nFound in: {found_in}"
        elif node_type == "root":
            color = get_node_color_by_level(0)
        else:
            # Structural Node (Chapter/ToC) -> Color by Level
            color = get_node_color_by_level(level)

        G.add_node(node_id, label=name, title=title, color=color, shape="dot")

    # 2. Parse Relations
    # Every edge starts from the default style; known relation types override
    # only the attributes that differ.
    default_edge_style = {"color": "#888888", "width": 1, "dashes": False}
    edge_style_overrides = {
        "prerequisite": {"color": "#ff3366", "width": 3},   # Bright Red/Pink (Critical Path)
        "part-of": {"color": "#4a90e2", "dashes": True},    # Blue, dashed to show hierarchy
        "including": {"color": "#00b4d8", "dashes": True},  # Cyan
        "definition": {"color": "#9b5de5", "width": 2},     # Purple (Semantic Definition)
        "property": {"color": "#f15bb5"},                   # Magenta (Attribute/Property)
    }

    skipped_relations = 0
    tree_rels = ET.parse(RELATIONS_XML)
    for rel in tree_rels.findall("relation"):
        source = rel.get("source")
        target = rel.get("target")
        rel_type = rel.get("type")

        # Only connect nodes that were declared in the nodes file; adding the
        # edge anyway would create unstyled nodes with no tooltip.
        if not (G.has_node(source) and G.has_node(target)):
            skipped_relations += 1
            continue

        style = {**default_edge_style, **edge_style_overrides.get(rel_type, {})}
        # NOTE: a DiGraph holds one edge per (source, target) pair, so a later
        # relation between the same pair replaces the earlier edge's attributes.
        G.add_edge(source, target, title=rel_type, **style)

    if skipped_relations:
        print(f"⚠️  Skipped {skipped_relations} relation(s) whose source or target is not a known node.")

    # -----------------------------------------------------------------
    # 3. Update Node Sizes based on Degree (Connections)
    # -----------------------------------------------------------------
    print("⚖️  Recalculating node sizes based on connection count...")

    # Formula: Base Size + (Degree * Multiplier)
    # Raise 'multiplier' to make the size difference more dramatic.
    base_size = 10
    multiplier = 4

    for node_id in G.nodes():
        # Total connections (In + Out)
        degree = G.degree[node_id]
        G.nodes[node_id]['size'] = base_size + (degree * multiplier)

        # Show the count on hover
        G.nodes[node_id]['title'] += f"\nConnections: {degree}"

    print(f"🕸️  Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    # 4. Generate PyVis Visualization
    print("🎨 Generating Obsidian-like HTML Visualization...")

    # cdn_resources='in_line' is used to ensure dependencies load correctly
    net = Network(height="900px", width="100%", bgcolor="#1e1e1e", font_color="#cccccc", select_menu=False, filter_menu=False, cdn_resources='in_line')

    net.from_nx(G)

    options = """
    var options = {
      "nodes": {
        "borderWidth": 0,
        "borderWidthSelected": 2,
        "font": {
          "size": 14,
          "face": "tahoma",
          "color": "#eeeeee",
          "strokeWidth": 2,
          "strokeColor": "#1e1e1e"
        },
        "shadow": {
            "enabled": true,
            "color": "black",
            "size": 5,
            "x": 2,
            "y": 2
        }
      },
      "edges": {
        "smooth": {
          "type": "continuous",
          "forceDirection": "none"
        },
        "arrows": {
            "to": {
                "enabled": true,
                "scaleFactor": 0.5
            }
        },
        "color": {
            "inherit": false,
            "opacity": 1.0
        }
      },
      "interaction": {
        "hover": true,
        "hoverConnectedEdges": true,
        "selectConnectedEdges": true,
        "navigationButtons": true,
        "keyboard": true,
        "tooltipDelay": 200
      },
      "physics": {
        "stabilization": {
            "enabled": true,
            "iterations": 1000
        },
        "barnesHut": {
          "gravitationalConstant": -8000,
          "springConstant": 0.001,
          "springLength": 200
        }
      }
    }
    """
    net.set_options(options)

    net.save_graph(OUTPUT_HTML)
    print(f"✅ Visualization saved to: {os.path.abspath(OUTPUT_HTML)}")

In [225]:
# Build the graph from NODES_XML / RELATIONS_XML and save the HTML page to OUTPUT_HTML.
create_interactive_graph()

📊 Constructing NetworkX Graph...
⚖️  Recalculating node sizes based on connection count...
🕸️  Graph created with 439 nodes and 485 edges.
🎨 Generating Obsidian-like HTML Visualization...
✅ Visualization saved to: /home/pras/Documents/LAMIA/EMBRAPII/4LINUX/notebooks/ebooks/LinuxFundamentals/graph.html


In [208]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
from typing import List, Tuple, Dict, Any

def get_single_run_stats(nodes_path: str, relations_path: str) -> Dict[str, Any]:
    """Collect raw counts for one nodes.xml / relations.xml pair.

    Args:
        nodes_path: Path to the XML file holding ``<node>`` elements.
        relations_path: Path to the XML file holding ``<relation>`` elements.

    Returns:
        A dict with a ``status`` string plus counters for nodes, relations per
        type, orphan nodes and the relations-per-node average. ``status`` is
        ``"OK"`` on success, ``"Missing Files"`` when either path does not
        exist, or ``"Error: <first 20 chars of the message>..."`` when
        anything raises during parsing. Counters keep whatever values were
        already filled in when an error occurs.
    """
    report = {
        "status": "OK",
        "nodes_total": 0,
        "nodes_chapter": 0,
        "nodes_extracted": 0,
        "relations_total": 0,
        "rel_prereq": 0,
        "rel_partof": 0,
        "rel_def": 0,
        "rel_prop": 0,
        "orphans": 0,
        "avg_rel": 0.0
    }

    if not (os.path.exists(nodes_path) and os.path.exists(relations_path)):
        report["status"] = "Missing Files"
        return report

    # Relation type -> counter key; other types only count towards the total.
    counter_for_type = {
        "prerequisite": "rel_prereq",
        "part-of": "rel_partof",
        "definition": "rel_def",
        "property": "rel_prop",
    }

    try:
        # 1. Nodes: everything that is not "extracted" (root, chapters,
        #    untyped nodes) is counted as a chapter.
        node_elems = ET.parse(nodes_path).getroot().findall("node")
        report["nodes_total"] = len(node_elems)

        declared_ids = set()
        for elem in node_elems:
            is_extracted = elem.get("type", "chapter") == "extracted"
            report["nodes_extracted" if is_extracted else "nodes_chapter"] += 1
            declared_ids.add(elem.get("id"))

        # 2. Relations: count per type and remember every endpoint seen.
        rel_elems = ET.parse(relations_path).getroot().findall("relation")
        report["relations_total"] = len(rel_elems)

        endpoints = set()
        for elem in rel_elems:
            counter_key = counter_for_type.get(elem.get("type", "unknown"))
            if counter_key is not None:
                report[counter_key] += 1
            endpoints.update((elem.get("source"), elem.get("target")))

        # 3. Health metrics: orphans are declared nodes no relation touches.
        report["orphans"] = len(declared_ids - endpoints)

        if report["nodes_total"] > 0:
            report["avg_rel"] = round(report["relations_total"] / report["nodes_total"], 2)

    except Exception as exc:
        report["status"] = f"Error: {str(exc)[:20]}..."

    return report

def analyze_graph_files(graph_runs):
    """Build a Markdown report table comparing several knowledge-graph runs.

    Args:
        graph_runs: Iterable of ``(run_name, nodes_path, relations_path)``
            tuples. ``run_name`` is split on ``'_'``: the first piece is shown
            in the Generator column and the second in the Validator column
            (``"-"`` when the name has no underscore).

    Returns:
        The report as a single string: a heading, the table header, one row
        per run and a closing footnote. A run whose stats status is not
        ``"OK"`` keeps its Generator/Validator names, shows that status in
        the Status column and ``"-"`` in every numeric column.
    """
    headers = [
        "Generator", "Validator", "Status", "Nodes", "Chapters", "Concepts",
        "Relations", "Prereqs", "Part-Of", "Defs", "Props", "Orphans", "Avg Rel/Node"
    ]
    # Stats keys in the same order as the numeric columns after "Status".
    stat_columns = (
        "nodes_total", "nodes_chapter", "nodes_extracted", "relations_total",
        "rel_prereq", "rel_partof", "rel_def", "rel_prop", "orphans", "avg_rel",
    )
    leading_columns = len(headers) - len(stat_columns)

    lines = [
        "### Knowledge Graph Analysis Report",
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join(["---"] * len(headers)) + " |",
    ]

    for run_name, n_path, r_path in graph_runs:
        data = get_single_run_stats(n_path, r_path)

        name_parts = run_name.split('_')
        generator = name_parts[0]
        # Tolerate run names without an underscore instead of raising IndexError.
        validator = name_parts[1] if len(name_parts) > 1 else "-"

        if data["status"] != "OK":
            # Keep the status under "Status" so the columns stay aligned and
            # the failing run can still be identified.
            row = [generator, validator, data["status"]]
            row += ["-"] * (len(headers) - leading_columns)
        else:
            row = [generator, validator, "✅ OK"]
            row += [str(data[key]) for key in stat_columns]
        lines.append("| " + " | ".join(row) + " |")

    lines.append("\n*Note: 'Chapters' includes structural nodes (Root, Sections, Chapters). 'Concepts' are LLM extracted entities.*")
    return "\n".join(lines)

In [72]:
import os
from pathlib import Path

# Collect (run name, nodes.xml path, relations.xml path) for every
# sub-directory of graph_runs/ that contains both XML files.
GRAPH_RUNS_DIR = Path("graph_runs")
graph_runs = []

for run_dir in GRAPH_RUNS_DIR.iterdir():
    if not run_dir.is_dir():
        continue

    nodes_path = run_dir / "nodes.xml"
    relations_path = run_dir / "relations.xml"
    if not nodes_path.exists() or not relations_path.exists():
        continue

    graph_runs.append((run_dir.name, nodes_path, relations_path))
graph_runs

[('claude-opus-4-5_gpt5-1',
  PosixPath('graph_runs/claude-opus-4-5_gpt5-1/nodes.xml'),
  PosixPath('graph_runs/claude-opus-4-5_gpt5-1/relations.xml')),
 ('claude-opus-4-5_claude-opus-4-5',
  PosixPath('graph_runs/claude-opus-4-5_claude-opus-4-5/nodes.xml'),
  PosixPath('graph_runs/claude-opus-4-5_claude-opus-4-5/relations.xml')),
 ('gpt5-1_gpt5-1',
  PosixPath('graph_runs/gpt5-1_gpt5-1/nodes.xml'),
  PosixPath('graph_runs/gpt5-1_gpt5-1/relations.xml')),
 ('gpt5-1_claude-opus-4-5',
  PosixPath('graph_runs/gpt5-1_claude-opus-4-5/nodes.xml'),
  PosixPath('graph_runs/gpt5-1_claude-opus-4-5/relations.xml'))]