# Extra√ß√£o de dados

O SINKT j√° considera um dataset pronto para uso. Sendo assim essa se√ß√£o busca extrair os conceitos de um ebook PDF. Primeiramente iremos transformar em Markdown, visto que √© melhor utilizar texto puro ao inv√©s de p√°ginas de PDF. Al√©m disso, essa proposta facilita a pr√≥pria extra√ß√£o para o MAIC, posteriormente.

In [1]:
import unicodedata
import re

def normalize_filename(s):
    """Remove accents, replace underscores and remove non-alphanumeric characters."""
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = re.sub(r'\s+', '_', s)
    s = re.sub(r'[^\w_]', '', s)
    return s.lower()

Configura√ß√£o inicial.

In [2]:
from dotenv import load_dotenv
import os
from pathlib import Path

load_dotenv("../.env")

assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not found"

MODEL_NAME = "gpt-4o"

BOOK_NAME = 'LinuxFundamentals'
EBOOKS_PATH = Path('ebooks')
base_output_dir = EBOOKS_PATH / BOOK_NAME
os.makedirs(base_output_dir, exist_ok=True)

PDF_PATH = Path('../data/701-LinuxFundamentals_material_full_v14.pdf')
OUTPUT_CSV = Path('../concepts.csv')

Definindo as estruturas de dados que iremos trabalhar.

In [3]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Concept(BaseModel):
    """Represents a single educational concept found in the text."""
    concept_name: str = Field(description="The formal name of the concept (e.g., 'Inductive Logic', 'Backpropagation').")
    chapter: List[int] = Field(description="The number of the current chapter, subchapter, etc (e.g., [1] for chapter 1, [1, 2] for subchapter 1.2, [1,2,5] for subsubchapter 1.2.5)")
    description: str = Field(description="A concise definition or summary of the concept based on the text.")
    page_start: int = Field(description="The page number where this concept is first introduced.")
    # page_end: Optional[int] = Field(default=None, description="The page number where the discussion of this concept seems to end (or current page if ongoing).")
    is_main_chapter: bool = Field(default=False, description="True if this is a chapter or main topic, False if it is a subchapter or subtopic.")

class PageExtraction(BaseModel):
    """Container for multiple concepts found on a specific page processing step."""
    concepts: List[Concept] = Field(description="List of concepts extracted from the current text window.")


In [4]:
import xml.etree.ElementTree as ET
from typing import Dict, Tuple 
from xml.dom import minidom

class StructuralGraphBuilder:
    """
    Responsible for creating the initial 'part-of' and 'including' relationships
    based strictly on the Table of Contents structure.
    """
    def __init__(self, output_dir: Path):
        self.output_dir = output_dir

    def build_graph(self, concepts: List[Concept]) -> Tuple[Path, Path]:
        """
        Generates two XML files:
        1. nodes.xml: Definitions of concepts/chapters.
        2. relations.xml: Relationships referencing node IDs.
        """
        nodes_root = ET.Element("nodes")
        relations_root = ET.Element("relations")
        
        # Map tuple(chapter_list) -> node_id for easy parent lookup
        # e.g. (1, 1) -> "1_History"
        hierarchy_map: Dict[Tuple[int, ...], str] = {}

        print(f"üèóÔ∏è Building Graph for {len(concepts)} concepts...")

        # First pass: Create all nodes and populate map
        for idx, concept in enumerate(concepts):
            safe_name = normalize_filename(concept.concept_name)
            node_id = f"{safe_name}"
            
            # Save to map for relationship building
            hierarchy_map[tuple(concept.chapter)] = node_id

            # Create Node Element
            node = ET.SubElement(nodes_root, "node")
            node.set("id", node_id)
            node.set("name", concept.concept_name)
            node.set("folder", node_id)
            node.set("order", str(idx))
            node.set("level", str(len(concept.chapter)))
            node.set("page_start", str(concept.page_start))

        # Second pass: Build relationships based on hierarchy
        for concept in concepts:
            current_id = hierarchy_map[tuple(concept.chapter)]
            
            # Infer Parent based on chapter list
            # If current is [1, 2, 1], parent should be [1, 2]
            if len(concept.chapter) > 1:
                parent_key = tuple(concept.chapter[:-1])
                parent_id = hierarchy_map.get(parent_key)
                
                if parent_id:
                    # Relation 1: Parent INCLUDES Child
                    rel1 = ET.SubElement(relations_root, "relation")
                    rel1.set("type", "including")
                    rel1.set("source", parent_id)
                    rel1.set("target", current_id)
                    ET.SubElement(rel1, "context").text = "Structural Hierarchy (ToC)"

                    # Relation 2: Child PART-OF Parent
                    rel2 = ET.SubElement(relations_root, "relation")
                    rel2.set("type", "part-of")
                    rel2.set("source", current_id)
                    rel2.set("target", parent_id)
                    ET.SubElement(rel2, "context").text = "Structural Hierarchy (ToC)"

        # Save Nodes XML
        nodes_str = minidom.parseString(ET.tostring(nodes_root)).toprettyxml(indent="   ")
        nodes_path = self.output_dir / "initial_nodes.xml"
        with open(nodes_path, "w", encoding="utf-8") as f:
            f.write(nodes_str)

        # Save Relations XML
        relations_str = minidom.parseString(ET.tostring(relations_root)).toprettyxml(indent="   ")
        relations_path = self.output_dir / "relations.xml"
        with open(relations_path, "w", encoding="utf-8") as f:
            f.write(relations_str)
            
        print(f"‚úÖ Graph saved: {nodes_path} and {relations_path}")
        return nodes_path, relations_path


Primeiramente √© criada a classe de convers√£o do PDF para markdown, utiliza-se da biblioteca Docling para realizar a convers√£o. Essa biblioteca permite extrair as imagens e tabelas do texto, posteriormente elas s√£o inclu√≠das no markdown final al√©m de serem salvas juntas.

In [5]:
import os
from PyPDF2 import PdfReader

from docling.document_converter import DocumentConverter
import logging
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat, OutputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions
    )
from docling.document_converter import DocumentConverter, PdfFormatOption, MarkdownFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, DoclingDocument
from tqdm.notebook import tqdm_notebook

class PDFConversor():
    """
    Convert PDF to markdown.
    
    :param pdf_path: Path of the input pdf.
    :param output_dir: Path of the output.
    """
    def __init__(self, pdf_path: Path, output_dir: Path):
        self.input_doc_path: Path = pdf_path
        self.base_output_dir: Path = output_dir
        self.pipeline_options: PdfPipelineOptions = self._set_pipeline_options()
        self.document_converter: DocumentConverter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options),
                OutputFormat.MARKDOWN: MarkdownFormatOption(image_mode=ImageRefMode.REFERENCED)
            },
        )
        self.last_page: int = self._get_no_pages()
        self.doc = None
       
    def _set_pipeline_options(self) -> PdfPipelineOptions:
        IMAGE_SCALE = 2.0
        
        pipeline_options = PdfPipelineOptions()
        pipeline_options.generate_picture_images = True
        pipeline_options.generate_page_images = True
        pipeline_options.images_scale = IMAGE_SCALE
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["pt"]
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=4, device=AcceleratorDevice.CUDA
        )
        return pipeline_options

    
    def _get_no_pages(self) -> int:
        reader = PdfReader(self.input_doc_path)
        return len(reader.pages)
    
    def _replace_image_placeholders(selg, md_str: str, image_files: List[Path]) -> None:
        content = md_str
        for img in image_files:
            content = content.replace("<!-- image -->", f"![]({str(img).split('/')[-1]})", 1)
        return content
        
    def save_images(self, doc: DoclingDocument, output_dir: Path) -> List[str]:
        filenames = []
        for page in doc.pictures:
            # print(page)
            page_no = page.self_ref.split('/')[-1]
            page_image_filename = output_dir / f"{page_no}.png"
            print(page_image_filename)
            with page_image_filename.open("wb") as fp:
                page.image.pil_image.save(fp, format="PNG")
            filenames.append(page_image_filename.relative_to(self.base_output_dir))
        return filenames 
    
    def generate_markdown(self, concepts: List[Concept]) -> None:
        """
        Generate a folder for each concept, with the images captured and a ``document.md`` file.
        
        :param concepts: ``List[Concept]`` List of concepts, their pages must in crescent order and sequentially
        (e.g. Chapter 1, 2, 3...).
        """
        for idx in tqdm_notebook(range(len(concepts))):
            curr_chap: Concept = concepts[idx]
            init_page = curr_chap.page_start
            chap_name = normalize_filename(curr_chap.concept_name)

            output_concept_dir = self.base_output_dir / chap_name
            os.makedirs(output_concept_dir, exist_ok=True)
            
            next_page = self.last_page + 1 if idx == len(concepts) - 1 else concepts[idx + 1].page_start - 1

            doc = self.document_converter.convert(self.input_doc_path, page_range=[init_page, next_page]).document
            md_str = doc.export_to_markdown()

            img_filenames = self.save_images(doc, output_concept_dir)
            raw_markdown = self._replace_image_placeholders(md_str, img_filenames)

            with open(output_concept_dir / "document.md", "w") as f:
                f.write(raw_markdown)

``EBookExtractor`` √© a classe principal, encapsulando a classe criada anteriormente e servindo como uma interface de mais alto n√≠vel.

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

class EbookExtractor():
    """
    Extract data from an ebook.
    
    :param pdf_file_path: The path of the desired pdf book.
    :param base_output_dir: Directory where the book is going to be saved.
    """
    def __init__(self, pdf_file_path: Path, base_output_dir: Path):
        self.pages: List[Document] = None
        self.llm = ChatOpenAI(temperature=0, model=MODEL_NAME)
        self.pdf_conversor: PDFConversor = PDFConversor(pdf_file_path, base_output_dir)
        self.file_path: Path = pdf_file_path
        self._load_pdf_pages()
    
    def _load_pdf_pages(self) -> None:
        """Loads PDF and returns a list of Document objects (one per page)."""
        print(f"Loading PDF: {self.file_path}...")
        loader = PyMuPDFLoader(self.file_path)
        pages = loader.load()
        last_page = len(pages)
        print(f"Loaded {len(pages)} pages.")
        self.pages = pages
        
    
    def extract_toc_structure(self, end_toc_page = 5) -> PageExtraction:
        """
        Scans the first ``end_toc_page`` pages to find a Table of Contents or Summary.
        Returns a list of 'known concepts' to prime the main extractor.

        :param end_toc_page: The first pages where the summary appears. Default to 5.
        """
        print(f"Scouting Table of Contents (Pages 1-{end_toc_page})...")
        
        # Combine first pages (or fewer if small doc)
        limit = min(len(self.pages), end_toc_page)
        toc_text = "\n".join([p.page_content for p in self.pages[:limit]])
        
        # Simple chain for ToC extraction
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert content analyzer. Look at the beginning of this book."),
            ("human", """Identify the Table of Contents. 
            Extract ALL chapters, sections, and sub-sections (e.g., 1.1, 1.2.1, 1.2.2) as individual Concepts.
            Do NOT summarize or skip detailed sub-topics. Capture the full hierarchy.
            
            Text:
            {text}""")
        ])
        
        # We reuse the PageExtraction model, though we only care about names/start pages here
        chain = prompt | self.llm.with_structured_output(PageExtraction)
    
        try:
            result = chain.invoke({"text": toc_text})
            print(f"üìã ToC Analysis found {len(result.concepts)} potential concepts.")
            return result.concepts
        except Exception as e:
            print(f"‚ö†Ô∏è Could not extract ToC (might be missing or unstructured). Proceeding with empty seed. Error: {e}")
            return []
    

## Executando o pipeline

In [7]:
extractor = EbookExtractor(PDF_PATH, base_output_dir)

Loading PDF: ../data/701-LinuxFundamentals_material_full_v14.pdf...
Loaded 127 pages.


In [8]:
toc_concepts = extractor.extract_toc_structure(end_toc_page=5)

Scouting Table of Contents (Pages 1-5)...


2025-12-08 22:53:59,864 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üìã ToC Analysis found 104 potential concepts.


Filtrando apenas os cap√≠tulos, assim podemos gerar uma pasta para cada, contendo arquivo markdown e imagens.

In [9]:
chapters = []
for c in toc_concepts:
    if len(c.chapter) == 1:
        chapters.append(c)
        print(c.chapter, c.concept_name, c.page_start)

[1] Introdu√ß√£o ao Linux 6
[2] CertiÔ¨Åca√ß√µes Linux 13
[3] Hist√≥ria do Linux 16
[4] Licen√ßas Open Source 20
[5] Evolu√ß√£o do Linux: distribui√ß√µes 23
[6] Conhecendo o Linux 34
[7] T√≥picos para revis√£o do cap√≠tulo 41
[8] Estrutura do sistema operacional 43
[9] O que √© um Shell 52
[10] Vari√°veis 55
[11] Arquivos de conÔ¨Ågura√ß√£o do shell 62
[12] Caminhos de Diretorios 68
[13] T√≥picos para revis√£o do cap√≠tulo 74
[14] Como obter ajuda 76
[15] Formas de documenta√ß√£o 77
[16] Comando help 79
[17] Comando apropos 81
[18] Comando whatis 84
[19] Comando man 86
[20] Comando info 89
[21] Comando whereis 91
[22] Comando which 94
[23] FHS, Hierarquia dos Diret√≥rios 96
[24] Aprendendo Comandos do GNU/Linux 110
[25] Localiza√ß√£o no sistema 120
[26] T√≥picos para revis√£o do cap√≠tulo 127


In [10]:
graph_builder = StructuralGraphBuilder(base_output_dir)
graph_builder.build_graph(chapters)

üèóÔ∏è Building Graph for 26 concepts...
‚úÖ Graph saved: ebooks/LinuxFundamentals/initial_nodes.xml and ebooks/LinuxFundamentals/relations.xml


(PosixPath('ebooks/LinuxFundamentals/initial_nodes.xml'),
 PosixPath('ebooks/LinuxFundamentals/relations.xml'))

In [None]:
extractor.pdf_conversor.generate_markdown(chapters)

# Gerando grafos

In [11]:
ROOT_DIRECTORY = base_output_dir
OUTPUT_XML = ROOT_DIRECTORY / "global_knowledge_graph.xml"
INITIAL_NODES = ROOT_DIRECTORY / 'initial_nodes.xml'
NODES_XML = ROOT_DIRECTORY / 'nodes.xml'
RELATIONS_XML = ROOT_DIRECTORY / 'relations.xml'

In [12]:
from pydantic import BaseModel, Field
from typing import List, Literal, Optional

class Relation(BaseModel):
    source: str = Field(description="The subject concept.")
    target: str = Field(description="The object concept.")
    relation_type: Literal['prerequisite', 'including', 'part-of', 'property', 'definition']
    context: Optional[str] = Field(description="Justification text.")

class ConceptAnalysis(BaseModel):
    """LLM Output for a full chapter/concept file."""
    # We map 'new_concepts' to add to registry
    new_concepts: List[str] = Field(description="List of MAIN concepts defined in this text.")
    relations: List[Relation] = Field(description="Semantic connections found.")

In [13]:
class GlobalRegistry:
    def __init__(self):
        # Stores simple strings: {"Binary Notation", "Kernel", "File Permissions"}
        self.known_concepts = set()

    def add_concepts(self, concepts: List[str]):
        for c in concepts:
            self.known_concepts.add(c)
    
    def get_context_string(self):
        """Returns a comma-separated string of known concepts for the prompt."""
        return ", ".join(sorted(list(self.known_concepts)))

In [14]:
def analyze_concept_content(current_node_name, text_content, registry, llm):
    """
    Analyzes the entire markdown content for a specific concept node.
    """
    if not text_content.strip():
        return ConceptAnalysis(new_concepts=[], relations=[])

    previous_concepts_str = registry.get_context_string()

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Knowledge Graph Expert.
        
        Current Concept: '{current_node}'
        
        Your Goal:
        1. List NEW concepts explicitly taught/defined here.
        2. Extract Semantic Relations:
           - **definition**: If '{current_node}' is defined here.
           - **property**: Key attributes of '{current_node}'.
           - **prerequisite**: Does this text require knowing a concept from the PREVIOUSLY LEARNED list?
        
        PREVIOUSLY LEARNED CONCEPTS:
        [{history}]
        """),
        ("human", "{text}")
    ])

    chain = prompt | llm.with_structured_output(ConceptAnalysis)
    
    try:
        # We assume text_content fits in context window (usually fine for single concept sections)
        return chain.invoke({
            "current_node": current_node_name, 
            "text": text_content[:15000], # Safety cap for tokens
            "history": previous_concepts_str
        })
    except Exception as e:
        print(f"   ‚ö†Ô∏è LLM Error: {e}")
        return ConceptAnalysis(new_concepts_taught=[], relations=[])

In [26]:
def sanitize_id(text):
    """Helper to create XML-safe IDs from concept names."""
    return re.sub(r'[^a-zA-Z0-9_]', '_', text.strip())

def prettify_xml(elem: ET.Element) -> str:
    """
    Return a pretty-printed XML string for the Element.
    Strips the annoying extra newlines minidom likes to add.
    """
    rough_string = ET.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    # Filter out lines that are purely whitespace
    return '\n'.join([line for line in reparsed.toprettyxml(indent="   ").split('\n') if line.strip()])

def process_book_sequentially():
    print("üöÄ Starting Semantic Extraction Agent...")
    
    # 1. SETUP
    llm = ChatOpenAI(temperature=0, model=MODEL_NAME)
    registry = GlobalRegistry() 
    
    if not os.path.exists(INITIAL_NODES) or not os.path.exists(RELATIONS_XML):
        print("‚ùå XML manifests not found. Run the structural builder first.")
        return

    # 2. LOAD MANIFESTS
    # We read nodes to know the order AND to append new nodes later
    tree_nodes = ET.parse(INITIAL_NODES)
    root_nodes = tree_nodes.getroot()
    node_elements = root_nodes.findall("node")
    
    # Sort strictly by order to respect book narrative
    node_elements.sort(key=lambda x: int(x.get("order", 0)))
    
    # Create a lookup of existing names to avoid duplicates when adding new ones
    existing_node_names = {node.get("name").lower().strip() for node in node_elements}
    
    # We read relations to APPEND to it
    tree_rels = ET.parse(RELATIONS_XML)
    root_rels = tree_rels.getroot()

    print(f"üìö Loaded {len(node_elements)} concepts in logical order.")

    # 3. PROCESSING LOOP
    for node in node_elements:
        node_id = node.get("id")
        node_name = node.get("name")
        folder_name = node.get("folder")
        
        # Path to the granular MD file
        file_path = os.path.join(ROOT_DIRECTORY, folder_name, "document.md")
        
        if not os.path.exists(file_path):
            continue
            
        print(f"\nüìñ Analyzing: {node_name}")
        
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            
        # A. Semantic Analysis
        analysis = analyze_concept_content(node_name, content, registry, llm)
        
        # B. Append New Relations to XML in memory
        for sem_rel in analysis.relations:
            rel_elem = ET.SubElement(root_rels, "relation")
            rel_elem.set("type", sem_rel.relation_type)
            
            if sem_rel.relation_type == 'prerequisite':
                # Prerequisite Flow: The concept in history (Target of extraction) -> Current Node
                rel_elem.set("source", sem_rel.target) # The old concept
                rel_elem.set("target", node_id)                # The current concept
            else:
                # Definition/Property Flow: Current Node -> Attribute
                rel_elem.set("source", node_id)
                rel_elem.set("target", sem_rel.target)
            
            ET.SubElement(rel_elem, "context").text = sem_rel.context

        # C. Update Registry (Learning) AND Nodes XML
        if analysis.new_concepts:
            new_count = 0
            for concept in analysis.new_concepts:
                clean_name = concept.strip()
                if clean_name.lower() not in existing_node_names:
                    # Create new Node entry
                    new_node_id = f"{sanitize_id(clean_name)}"
                    
                    new_node = ET.SubElement(root_nodes, "node")
                    new_node.set("id", new_node_id)
                    new_node.set("name", clean_name)
                    new_node.set("type", "extracted")
                    new_node.set("found_in_chapter", node_id)
                    
                    existing_node_names.add(clean_name.lower())
                    new_count += 1
            
            registry.add_concepts(analysis.new_concepts)
            print(f"üß† Learned: {len(analysis.new_concepts)} concepts ({new_count} new to XML)")
            
            # Implicitly, the current node itself is now 'known'
            registry.add_concepts([node_name])

    # 4. SAVE FINAL XML FILES
    # Save Relations
    xml_rels_str = prettify_xml(root_rels)
    with open(RELATIONS_XML, "w", encoding="utf-8") as f:
        f.write(xml_rels_str)
        
    # Save Nodes
    xml_nodes_str = prettify_xml(root_nodes)
    with open(NODES_XML, "w", encoding="utf-8") as f:
        f.write(xml_nodes_str)
        
    print(f"\n‚úÖ Updated {NODES_XML} and {RELATIONS_XML}")

In [27]:
process_book_sequentially()

üöÄ Starting Semantic Extraction Agent...
üìö Loaded 26 concepts in logical order.

üìñ Analyzing: Introdu√ß√£o ao Linux


2025-12-08 23:14:35,272 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 9 concepts (9 new to XML)

üìñ Analyzing: CertiÔ¨Åca√ß√µes Linux


2025-12-08 23:14:41,532 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 5 concepts (5 new to XML)

üìñ Analyzing: Hist√≥ria do Linux


2025-12-08 23:14:47,392 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 7 concepts (6 new to XML)

üìñ Analyzing: Licen√ßas Open Source


2025-12-08 23:14:52,502 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 3 concepts (3 new to XML)

üìñ Analyzing: Evolu√ß√£o do Linux: distribui√ß√µes


2025-12-08 23:14:59,306 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 11 concepts (11 new to XML)

üìñ Analyzing: Conhecendo o Linux


2025-12-08 23:15:03,460 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: T√≥picos para revis√£o do cap√≠tulo


2025-12-08 23:15:07,313 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 9 concepts (9 new to XML)

üìñ Analyzing: Estrutura do sistema operacional


2025-12-08 23:15:12,205 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 8 concepts (7 new to XML)

üìñ Analyzing: O que √© um Shell


2025-12-08 23:15:17,233 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 7 concepts (7 new to XML)

üìñ Analyzing: Vari√°veis


2025-12-08 23:15:25,081 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 5 concepts (4 new to XML)

üìñ Analyzing: Arquivos de conÔ¨Ågura√ß√£o do shell


2025-12-08 23:15:33,849 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 8 concepts (8 new to XML)

üìñ Analyzing: Caminhos de Diretorios


2025-12-08 23:15:38,436 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 3 concepts (3 new to XML)

üìñ Analyzing: T√≥picos para revis√£o do cap√≠tulo


2025-12-08 23:15:43,351 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



üìñ Analyzing: Como obter ajuda


2025-12-08 23:15:44,865 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (1 new to XML)

üìñ Analyzing: Formas de documenta√ß√£o


2025-12-08 23:15:49,627 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 4 concepts (3 new to XML)

üìñ Analyzing: Comando help


2025-12-08 23:15:52,758 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: Comando apropos


2025-12-08 23:16:00,508 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: Comando whatis


2025-12-08 23:16:03,703 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: Comando man


2025-12-08 23:16:06,887 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



üìñ Analyzing: Comando info


2025-12-08 23:16:11,759 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 2 concepts (1 new to XML)

üìñ Analyzing: Comando whereis


2025-12-08 23:16:16,358 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: Comando which


2025-12-08 23:16:18,054 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 1 concepts (0 new to XML)

üìñ Analyzing: FHS, Hierarquia dos Diret√≥rios


2025-12-08 23:16:25,361 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



üìñ Analyzing: Aprendendo Comandos do GNU/Linux


2025-12-08 23:16:29,786 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üß† Learned: 9 concepts (9 new to XML)

üìñ Analyzing: Localiza√ß√£o no sistema


2025-12-08 23:16:36,749 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



üìñ Analyzing: T√≥picos para revis√£o do cap√≠tulo


2025-12-08 23:16:40,947 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



‚úÖ Updated ebooks/LinuxFundamentals/nodes.xml and ebooks/LinuxFundamentals/relations.xml


In [28]:
OUTPUT_HTML = base_output_dir / 'graph.html'
OUTPUT_HTML = str(OUTPUT_HTML)

In [29]:
from pyvis.network import Network
import networkx as nx


def create_interactive_graph():
    if not os.path.exists(NODES_XML) or not os.path.exists(RELATIONS_XML):
        print("‚ùå XML files not found.")
        return

    print("üìä Constructing NetworkX Graph...")
    G = nx.DiGraph()

    # 1. Parse Nodes
    tree_nodes = ET.parse(NODES_XML)
    for node in tree_nodes.findall("node"):
        node_id = node.get("id")
        name = node.get("name")
        node_type = node.get("type", "chapter") 
        
        # Obsidian Style: Dots with specific colors
        color = "#8bd3dd"  # Cyan/Blue for Chapters
        size = 15          # Standard size
        title = f"Type: {node_type}"
        
        if node_type == "extracted":
            color = "#ffafcc" # Pink/Pastel for Concepts
            size = 10         # Smaller for concepts
            found_in = node.get("found_in_chapter", "unknown")
            title += f"\nFound in: {found_in}"
        elif node_type == "root":
            color = "#f0a202" # Gold for Root
            size = 25

        G.add_node(node_id, label=name, title=title, color=color, size=size, shape="dot")

    # 2. Parse Relations
    tree_rels = ET.parse(RELATIONS_XML)
    for rel in tree_rels.findall("relation"):
        source = rel.get("source")
        target = rel.get("target")
        rel_type = rel.get("type")
        
        # Edges: Visibility Fix -> Brighter, Solid Colors
        # color = "#666666" # Solid lighter gray for default edges
        # width = 1
        # dashes = False
        color = "#4a90e2" # Solid Blue (instead of faint cyan)
        dashes = True     # Keep dashes to distinguish structure
        width = 2
        
        if rel_type == "prerequisite":
          color = "#ff4d6d" # Bright Red/Pink
          width = 3         # Thicker to stand out
        # elif rel_type in ["part-of", "including"]:

        # if G.has_node(source) and G.has_node(target):
        G.add_edge(source, target, title=rel_type, color=color, width=width, dashes=dashes)

    print(f"üï∏Ô∏è  Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    # 3. Generate PyVis Visualization (Obsidian Style)
    print("üé® Generating Obsidian-like HTML Visualization...")
    
    # Removed filter_menu and select_menu as requested
    net = Network(height="900px", width="100%", bgcolor="#1e1e1e", font_color="#cccccc", select_menu=False, filter_menu=False)
    
    net.from_nx(G)
    
    # Physics & Interaction Options
    # Removed transparency from shadow colors to improve crispness
    options = """
    var options = {
      "nodes": {
        "borderWidth": 0,
        "borderWidthSelected": 2,
        "font": {
          "size": 14,
          "face": "tahoma",
          "color": "#eeeeee",
          "strokeWidth": 2,
          "strokeColor": "#1e1e1e"
        },
        "shadow": {
            "enabled": true,
            "color": "black",
            "size": 5,
            "x": 2,
            "y": 2
        }
      },
      "edges": {
        "smooth": {
          "type": "continuous",
          "forceDirection": "none"
        },
        "arrows": {
            "to": {
                "enabled": true,
                "scaleFactor": 0.5
            }
        },
        "color": {
            "inherit": false,
            "opacity": 1.0
        }
      },
      "interaction": {
        "hover": true,
        "hoverConnectedEdges": true,
        "selectConnectedEdges": true,
        "navigationButtons": true,
        "keyboard": true,
        "tooltipDelay": 200
      }
    }
    """
    net.set_options(options)
    
    # net.show_buttons(filter_=['physics'])

    # Save
    net.save_graph(OUTPUT_HTML)
    print(f"‚úÖ Visualization saved to: {os.path.abspath(OUTPUT_HTML)}")


In [30]:
create_interactive_graph()

üìä Constructing NetworkX Graph...
üï∏Ô∏è  Graph created with 280 nodes and 214 edges.
üé® Generating Obsidian-like HTML Visualization...
‚úÖ Visualization saved to: /home/pras/EMBRAPII/4linux/notebooks/ebooks/LinuxFundamentals/graph.html
