# Extra√ßao de dados

O SINKT j√° considera um dataset pronto para uso. Sendo assim essa se√ß√£o busca extrair os conceitos de um ebook PDF. Primeiramente iremos transformar em Markdown, visto que √© melhor utilizar texto puro ao inv√©s de p√°ginas de PDF. Al√©m disso, essa proposta facilita a pr√≥pria extra√ß√£o para o MAIC, posteriormente.

In [None]:
import unicodedata
import re

def normalize_filename(s):
    """Remove accents, replace underscores and remove non-alphanumeric characters."""
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = re.sub(r'\s+', '_', s)
    s = re.sub(r'[^\w_]', '', s)
    return s.lower()

Configura√ß√£o inicial.

In [78]:
from dotenv import load_dotenv
import os
from pathlib import Path

load_dotenv("../.env")

assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not found"

MODEL_NAME = "gpt-4o"

BOOK_NAME = 'LinuxFundamentals'
EBOOKS_PATH = Path('ebooks')
base_output_dir = EBOOKS_PATH / BOOK_NAME
os.makedirs(base_output_dir, exist_ok=True)

PDF_PATH = Path('../data/701-LinuxFundamentals_material_full_v14.pdf')
OUTPUT_CSV = Path('../concepts.csv')

Definindo as estruturas de dados que iremos trabalhar.

In [None]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Concept(BaseModel):
    """Represents a single educational concept found in the text."""
    concept_name: str = Field(description="The formal name of the concept (e.g., 'Inductive Logic', 'Backpropagation').")
    chapter: List[int] = Field(description="The number of the current chapter, subchapter, etc (e.g., [1] for chapter 1, [1, 2] for subchapter 1.2, [1,2,5] for subsubchapter 1.2.5)")
    description: str = Field(description="A concise definition or summary of the concept based on the text.")
    page_start: int = Field(description="The page number where this concept is first introduced.")
    # page_end: Optional[int] = Field(default=None, description="The page number where the discussion of this concept seems to end (or current page if ongoing).")
    is_main_chapter: bool = Field(default=False, description="True if this is a chapter or main topic, False if it is a subchapter or subtopic.")

class PageExtraction(BaseModel):
    """Container for multiple concepts found on a specific page processing step."""
    concepts: List[Concept] = Field(description="List of concepts extracted from the current text window.")


In [None]:
import os
from PyPDF2 import PdfReader

from docling.document_converter import DocumentConverter
import logging
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat, OutputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions
    )
from docling.document_converter import DocumentConverter, PdfFormatOption, MarkdownFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, DoclingDocument
from tqdm.notebook import tqdm_notebook

class PDFConversor():
    """
    Convert PDF to markdown.
    
    :param pdf_path: Path of the input pdf.
    :param output_dir: Path of the output.
    """
    def __init__(self, pdf_path: Path, output_dir: Path):
        self.input_doc_path: Path = pdf_path
        self.base_output_dir: Path = output_dir
        self.pipeline_options: PdfPipelineOptions = self._set_pipeline_options()
        self.document_converter: DocumentConverter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options),
                OutputFormat.MARKDOWN: MarkdownFormatOption(image_mode=ImageRefMode.REFERENCED)
            },
        )
        self.last_page: int = self._get_no_pages()
        self.doc = None
       
    def _set_pipeline_options(self) -> PdfPipelineOptions:
        IMAGE_SCALE = 2.0
        
        pipeline_options = PdfPipelineOptions()
        pipeline_options.generate_picture_images = True
        pipeline_options.generate_page_images = True
        pipeline_options.images_scale = IMAGE_SCALE
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = ["pt"]
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=4, device=AcceleratorDevice.CUDA
        )
        return pipeline_options

    
    def _get_no_pages(self) -> int:
        reader = PdfReader(self.input_doc_path)
        return len(reader.pages)
    
    def _replace_image_placeholders(selg, md_str: str, image_files: List[Path]) -> None:
        content = md_str
        for img in image_files:
            content = content.replace("<!-- image -->", f"![]({str(img).split('/')[-1]})", 1)
        return content
        
    def save_images(self, doc: DoclingDocument, output_dir: Path) -> List[str]:
        filenames = []
        for page in doc.pictures:
            # print(page)
            page_no = page.self_ref.split('/')[-1]
            page_image_filename = output_dir / f"{page_no}.png"
            print(page_image_filename)
            with page_image_filename.open("wb") as fp:
                page.image.pil_image.save(fp, format="PNG")
            filenames.append(page_image_filename.relative_to(self.base_output_dir))
        return filenames 
    
    def generate_markdown(self, concepts: List[Concept]) -> None:
        """
        Generate a folder for each concept, with the images captured and a ``document.md`` file.
        
        :param concepts: ``List[Concept]`` List of concepts, their pages must in crescent order and sequentially
        (e.g. Chapter 1, 2, 3...).
        """
        for idx in tqdm_notebook(range(len(concepts))):
            curr_chap: Concept = concepts[idx]
            init_page = curr_chap.page_start
            chap_name = normalize_filename(curr_chap.concept_name)

            output_concept_dir = self.base_output_dir / chap_name
            os.makedirs(output_concept_dir, exist_ok=True)
            
            next_page = self.last_page + 1 if idx == len(concepts) - 1 else concepts[idx + 1].page_start - 1

            doc = self.document_converter.convert(self.input_doc_path, page_range=[init_page, next_page]).document
            md_str = doc.export_to_markdown()

            img_filenames = self.save_images(doc, output_concept_dir)
            raw_markdown = self._replace_image_placeholders(md_str, img_filenames)

            with open(output_concept_dir / "document.md", "w") as f:
                f.write(raw_markdown)

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

class EbookExtractor():
    """
    Extract data from an ebook.
    
    :param pdf_file_path: The path of the desired pdf book.
    :param base_output_dir: Directory where the book is going to be saved.
    """
    def __init__(self, pdf_file_path: Path, base_output_dir: Path):
        self.pages: List[Document] = None
        self.llm = ChatOpenAI(temperature=0, model=MODEL_NAME)
        self.pdf_conversor: PDFConversor = PDFConversor(pdf_file_path, base_output_dir)
        self.file_path: Path = pdf_file_path
        self._load_pdf_pages()
    
    def _load_pdf_pages(self) -> None:
        """Loads PDF and returns a list of Document objects (one per page)."""
        print(f"Loading PDF: {self.file_path}...")
        loader = PyMuPDFLoader(self.file_path)
        pages = loader.load()
        last_page = len(pages)
        print(f"Loaded {len(pages)} pages.")
        self.pages = pages
        
    
    def extract_toc_structure(self, end_toc_page = 5) -> PageExtraction:
        """
        Scans the first ``end_toc_page`` pages to find a Table of Contents or Summary.
        Returns a list of 'known concepts' to prime the main extractor.

        :param end_toc_page: The first pages where the summary appears. Default to 5.
        """
        print(f"Scouting Table of Contents (Pages 1-{end_toc_page})...")
        
        # Combine first pages (or fewer if small doc)
        limit = min(len(self.pages), end_toc_page)
        toc_text = "\n".join([p.page_content for p in self.pages[:limit]])
        
        # Simple chain for ToC extraction
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert content analyzer. Look at the beginning of this book."),
            ("human", """Identify the Table of Contents. 
            Extract ALL chapters, sections, and sub-sections (e.g., 1.1, 1.2.1, 1.2.2) as individual Concepts.
            Do NOT summarize or skip detailed sub-topics. Capture the full hierarchy.
            
            Text:
            {text}""")
        ])
        
        # We reuse the PageExtraction model, though we only care about names/start pages here
        chain = prompt | self.llm.with_structured_output(PageExtraction)
    
        try:
            result = chain.invoke({"text": toc_text})
            print(f"üìã ToC Analysis found {len(result.concepts)} potential concepts.")
            return result.concepts
        except Exception as e:
            print(f"‚ö†Ô∏è Could not extract ToC (might be missing or unstructured). Proceeding with empty seed. Error: {e}")
            return []
    

In [81]:
extractor = EbookExtractor(PDF_PATH, base_output_dir)

üìÇ Loading PDF: ../data/701-LinuxFundamentals_material_full_v14.pdf...
‚úÖ Loaded 127 pages.


In [82]:
toc_concepts = extractor.extract_toc_structure(end_toc_page=5)

üïµÔ∏è‚Äç‚ôÇÔ∏è Scouting Table of Contents (Pages 1-5)...


2025-12-07 15:10:49,155 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


üìã ToC Analysis found 104 potential concepts.


Filtrando apenas os cap√≠tulos, assim podemos gerar uma pasta para cada, contendo arquivo markdown e imagens.

In [83]:
chapters = []
for c in toc_concepts:
    if len(c.chapter) == 1:
        chapters.append(c)
        print(c.chapter, c.concept_name, c.page_start)

[1] Introdu√ß√£o ao Linux 6
[2] CertiÔ¨Åca√ß√µes Linux 13
[3] Hist√≥ria do Linux 16
[4] Licen√ßas Open Source 20
[5] Evolu√ß√£o do Linux: distribui√ß√µes 23
[6] Conhecendo o Linux 34
[7] T√≥picos para revis√£o do cap√≠tulo 41
[8] Estrutura do sistema operacional 43
[9] O que √© um Shell 52
[10] Vari√°veis 55
[11] Arquivos de conÔ¨Ågura√ß√£o do shell 62
[12] Caminhos de Diretorios 68
[13] T√≥picos para revis√£o do cap√≠tulo 74
[14] Como obter ajuda 76
[15] Formas de documenta√ß√£o 77
[16] Comando help 79
[17] Comando apropos 81
[18] Comando whatis 84
[19] Comando man 86
[20] Comando info 89
[21] Comando whereis 91
[22] Comando which 94
[23] FHS, Hierarquia dos Diret√≥rios 96
[24] Aprendendo Comandos do GNU/Linux 110
[25] Localiza√ß√£o no sistema 120
[26] T√≥picos para revis√£o do cap√≠tulo 127


In [95]:
extractor.pdf_conversor.generate_markdown(chapters)

  0%|          | 0/26 [00:00<?, ?it/s]

2025-12-07 15:17:36,282 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:17:36,289 - INFO - Going to convert document batch...
2025-12-07 15:17:36,290 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


Generating Markdown file pages [6 - 12]


2025-12-07 15:17:49,570 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 10.10 sec.


ebooks/LinuxFundamentals/introducao_ao_linux/0.png
ebooks/LinuxFundamentals/introducao_ao_linux/1.png
ebooks/LinuxFundamentals/introducao_ao_linux/2.png
ebooks/LinuxFundamentals/introducao_ao_linux/3.png


2025-12-07 15:17:49,828 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:17:49,839 - INFO - Going to convert document batch...
2025-12-07 15:17:49,840 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/introducao_ao_linux/4.png
Generating Markdown file pages [13 - 15]


2025-12-07 15:17:53,869 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.04 sec.
2025-12-07 15:17:53,939 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:17:53,947 - INFO - Going to convert document batch...
2025-12-07 15:17:53,949 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/certificacoes_linux/0.png
ebooks/LinuxFundamentals/certificacoes_linux/1.png
Generating Markdown file pages [16 - 19]


2025-12-07 15:17:58,820 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.88 sec.
2025-12-07 15:17:58,827 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:17:58,834 - INFO - Going to convert document batch...
2025-12-07 15:17:58,836 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


Generating Markdown file pages [20 - 22]


2025-12-07 15:18:03,718 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.89 sec.
2025-12-07 15:18:03,735 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:03,743 - INFO - Going to convert document batch...
2025-12-07 15:18:03,744 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/licencas_open_source/0.png
Generating Markdown file pages [23 - 33]


2025-12-07 15:18:19,725 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 13.94 sec.
2025-12-07 15:18:19,782 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:19,791 - INFO - Going to convert document batch...
2025-12-07 15:18:19,792 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/evolucao_do_linux_distribuicoes/0.png
Generating Markdown file pages [34 - 40]


2025-12-07 15:18:29,270 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 9.49 sec.
2025-12-07 15:18:29,289 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:29,298 - INFO - Going to convert document batch...
2025-12-07 15:18:29,299 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/conhecendo_o_linux/0.png
Generating Markdown file pages [41 - 42]


2025-12-07 15:18:32,169 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.88 sec.
2025-12-07 15:18:32,179 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:32,185 - INFO - Going to convert document batch...
2025-12-07 15:18:32,186 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/topicos_para_revisao_do_capitulo/0.png
Generating Markdown file pages [43 - 51]


2025-12-07 15:18:45,552 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 13.37 sec.
2025-12-07 15:18:45,629 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:45,636 - INFO - Going to convert document batch...
2025-12-07 15:18:45,640 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/estrutura_do_sistema_operacional/0.png
ebooks/LinuxFundamentals/estrutura_do_sistema_operacional/1.png
ebooks/LinuxFundamentals/estrutura_do_sistema_operacional/2.png
Generating Markdown file pages [52 - 54]


2025-12-07 15:18:53,116 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.28 sec.
2025-12-07 15:18:53,184 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:18:53,190 - INFO - Going to convert document batch...
2025-12-07 15:18:53,191 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/o_que_e_um_shell/0.png
ebooks/LinuxFundamentals/o_que_e_um_shell/1.png
Generating Markdown file pages [55 - 61]


2025-12-07 15:19:04,314 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 11.13 sec.
2025-12-07 15:19:04,332 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:04,338 - INFO - Going to convert document batch...
2025-12-07 15:19:04,339 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/variaveis/0.png
Generating Markdown file pages [62 - 67]


2025-12-07 15:19:12,526 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 8.19 sec.
2025-12-07 15:19:12,631 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:12,638 - INFO - Going to convert document batch...
2025-12-07 15:19:12,639 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/arquivos_de_configuracao_do_shell/0.png
ebooks/LinuxFundamentals/arquivos_de_configuracao_do_shell/1.png
ebooks/LinuxFundamentals/arquivos_de_configuracao_do_shell/2.png
Generating Markdown file pages [68 - 73]


2025-12-07 15:19:19,472 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 6.84 sec.
2025-12-07 15:19:19,487 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:19,496 - INFO - Going to convert document batch...
2025-12-07 15:19:19,496 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/caminhos_de_diretorios/0.png
Generating Markdown file pages [74 - 75]


2025-12-07 15:19:21,931 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.44 sec.
2025-12-07 15:19:21,940 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:21,946 - INFO - Going to convert document batch...
2025-12-07 15:19:21,947 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/topicos_para_revisao_do_capitulo/0.png
Generating Markdown file pages [76 - 76]


2025-12-07 15:19:25,671 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 1.29 sec.
2025-12-07 15:19:25,682 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:25,688 - INFO - Going to convert document batch...
2025-12-07 15:19:25,690 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/como_obter_ajuda/0.png
Generating Markdown file pages [77 - 78]


2025-12-07 15:19:28,256 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.57 sec.
2025-12-07 15:19:28,264 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:28,272 - INFO - Going to convert document batch...
2025-12-07 15:19:28,273 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/formas_de_documentacao/0.png
Generating Markdown file pages [79 - 80]


2025-12-07 15:19:31,718 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 3.45 sec.
2025-12-07 15:19:31,729 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:31,737 - INFO - Going to convert document batch...
2025-12-07 15:19:31,738 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_help/0.png
Generating Markdown file pages [81 - 83]


2025-12-07 15:19:36,031 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.30 sec.
2025-12-07 15:19:36,042 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:36,048 - INFO - Going to convert document batch...
2025-12-07 15:19:36,050 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_apropos/0.png
Generating Markdown file pages [84 - 85]


2025-12-07 15:19:38,646 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.60 sec.
2025-12-07 15:19:38,658 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:38,663 - INFO - Going to convert document batch...
2025-12-07 15:19:38,665 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_whatis/0.png
Generating Markdown file pages [86 - 88]


2025-12-07 15:19:42,421 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 3.76 sec.
2025-12-07 15:19:42,432 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:42,439 - INFO - Going to convert document batch...
2025-12-07 15:19:42,440 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_man/0.png
Generating Markdown file pages [89 - 90]


2025-12-07 15:19:45,391 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.96 sec.
2025-12-07 15:19:45,405 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:45,415 - INFO - Going to convert document batch...
2025-12-07 15:19:45,417 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_info/0.png
Generating Markdown file pages [91 - 93]


2025-12-07 15:19:50,043 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 4.64 sec.
2025-12-07 15:19:50,053 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:50,061 - INFO - Going to convert document batch...
2025-12-07 15:19:50,062 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_whereis/0.png
Generating Markdown file pages [94 - 95]


2025-12-07 15:19:52,923 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 2.87 sec.
2025-12-07 15:19:52,937 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:19:52,951 - INFO - Going to convert document batch...
2025-12-07 15:19:52,954 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/comando_which/0.png
Generating Markdown file pages [96 - 109]


2025-12-07 15:20:18,659 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 22.54 sec.
2025-12-07 15:20:18,765 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:20:18,775 - INFO - Going to convert document batch...
2025-12-07 15:20:18,777 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/fhs_hierarquia_dos_diretorios/0.png
ebooks/LinuxFundamentals/fhs_hierarquia_dos_diretorios/1.png
Generating Markdown file pages [110 - 119]


2025-12-07 15:20:32,755 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 13.99 sec.
2025-12-07 15:20:32,776 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:20:32,784 - INFO - Going to convert document batch...
2025-12-07 15:20:32,786 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/aprendendo_comandos_do_gnulinux/0.png
Generating Markdown file pages [120 - 126]


2025-12-07 15:20:44,173 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 8.49 sec.
2025-12-07 15:20:44,195 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-07 15:20:44,201 - INFO - Going to convert document batch...
2025-12-07 15:20:44,202 - INFO - Processing document 701-LinuxFundamentals_material_full_v14.pdf


ebooks/LinuxFundamentals/localizacao_no_sistema/0.png
Generating Markdown file pages [127 - 128]


2025-12-07 15:20:45,590 - INFO - Finished converting document 701-LinuxFundamentals_material_full_v14.pdf in 1.40 sec.


ebooks/LinuxFundamentals/topicos_para_revisao_do_capitulo/0.png


In [28]:
def get_sliding_window_text(pages, current_index):
    """
    Implements the 1.5 page logic:
    - Takes 50% of the previous page (if exists).
    - Takes 100% of the current page.
    """
    current_page = pages[current_index]
    current_text = current_page.page_content
    
    # Context header to help LLM understand where it is
    context_text = f"--- PAGE {current_page.metadata.get('page', current_index) + 1} ---\n{current_text}"

    if current_index > 0:
        prev_page = pages[current_index - 1]
        prev_full_text = prev_page.page_content
        
        # approximate 'half' by character count
        half_point = len(prev_full_text) // 2
        prev_half_text = prev_full_text[half_point:]
        
        # Prepend the previous half
        context_text = f"--- CONTEXT FROM PREVIOUS PAGE ({prev_page.metadata.get('page', current_index - 1) + 1}) ---\n{prev_half_text}\n\n" + context_text
        
    return context_text