In [1]:
from typing import List, Dict, Any, Optional, Union
import aiofiles
from pathlib import Path
import pipmaster as pm

In [2]:
import json
from pathlib import Path

def add_file_metadata(path: str | Path, metadata: dict, output_path: str | Path | None = None) -> Path:
    """
    Add/merge metadata to a file.
    - PDF: writes a new PDF with updated document info.
    - Other: writes sidecar JSON (original.ext.meta.json).
    Returns path to updated artifact (PDF or sidecar).
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(p)
    
    content = ""
    # Get file extension in lowercase
    ext = p.suffix.lower()
    match ext:
        case ".pdf":
            if not pm.is_installed("pypdf2"):
                pm.install("pypdf2")
            from PyPDF2 import PdfReader, PdfWriter

            reader = PdfReader(str(p))
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            # Start from existing metadata (may be None)
            base_meta = {}
            if reader.metadata:
                for k, v in reader.metadata.items():
                    if v is None:
                        continue
                    base_meta[k] = v

            # Normalize incoming keys: accept both '/Title' or 'Title'
            new_meta = {}
            for k, v in metadata.items():
                key = k if k.startswith("/") else f"/{k}"
                new_meta[key] = v

            base_meta.update(new_meta)
            writer.add_metadata(base_meta)

            out = Path(output_path) if output_path else p.with_stem(p.stem + "_with_meta")
            with open(out, "wb") as f:
                writer.write(f)
            return out
        case _:
            raise ValueError(f"Unsupported file format: {ext}")

# Example usage:

# add_file_metadata('C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\dataset\\Own\\98823-Rui-Melo-dissertacao.pdf', {"Title": "My Title", "Author": "Me"})

In [3]:
async def index_file(file_path: Union[str, Path], generate: bool = False) -> None:
    """Index all files inside the folder with support for multiple file formats

    Args:
        file_path: Path to the file to be indexed (str or Path object)

    Raises:
        ValueError: If file format is not supported
        FileNotFoundError: If file doesn't exist
    """
    if not pm.is_installed("aiofiles"):
        pm.install("aiofiles")

    # Convert to Path object if string
    file_path = Path(file_path)

    # Check if file exists
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    content = ""
    # Get file extension in lowercase
    ext = file_path.suffix.lower()

    match ext:
        case ".txt" | ".md":
            # Text files handling
            async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
                content = await f.read()

        case ".pdf":
            if not pm.is_installed("pypdf2"):
                pm.install("pypdf2")
            from PyPDF2 import PdfReader

            # PDF handling
            reader = PdfReader(str(file_path))
            content = ""
            for page in reader.pages:
                content += page.extract_text() + "\n"
            metadata = reader.metadata or {}
        case _:
            raise ValueError(f"Unsupported file format: {ext}")
    if generate:
        from minirag.metadata_plugin import minirag_generate_metadata
        meta = minirag_generate_metadata("DOC-", content, file_path)
        print("Generated metadata:")
        print(meta)

    print(f"Indexing file: {file_path}")
    print(f"Content: {content[:100]}...")  # Print first 100 characters for debugging
    print(f"Metadata: {metadata if 'metadata' in locals() else 'N/A'}")

In [4]:
# Smoke-test index_file on a local sample PDF with metadata generation disabled.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine where that file exists.
file_path = "C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\O Acesso a documentos administrativos.pdf"
await index_file(Path(file_path), generate=False)

Indexing file: C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\notebooks\O Acesso a documentos administrativos.pdf
Content:  
Página 1 
 
O Acesso a documentos administrativos  
Carla Rodrigues  
                            ...
Metadata: {'/Author': 'alexandra.cabrelon', '/Creator': 'Microsoft® Office Word 2007', '/CreationDate': 'D:20160418133213', '/ModDate': 'D:20160418133213', '/Producer': 'Microsoft® Office Word 2007'}


In [5]:
async def get_text(file_path: Union[str, Path]) -> str:
    """Return the textual content of a ``.txt``, ``.md`` or ``.pdf`` file.

    Text and Markdown files are read as UTF-8. For PDFs the text of every
    page is extracted with PyPDF2 and each page is followed by a newline.

    Args:
        file_path: Location of the file (str or Path object).

    Returns:
        The extracted text.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the extension is not one of the supported formats.
    """
    if not pm.is_installed("aiofiles"):
        pm.install("aiofiles")

    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    # Lower-case the suffix so the comparison is case-insensitive.
    suffix = source.suffix.lower()

    if suffix in (".txt", ".md"):
        # Plain-text formats: read the whole file asynchronously.
        async with aiofiles.open(source, "r", encoding="utf-8") as handle:
            return await handle.read()

    if suffix == ".pdf":
        # PyPDF2 is installed on demand and imported lazily.
        if not pm.is_installed("pypdf2"):
            pm.install("pypdf2")
        from PyPDF2 import PdfReader

        pdf = PdfReader(str(source))
        return "".join(page.extract_text() + "\n" for page in pdf.pages)

    raise ValueError(f"Unsupported file format: {suffix}")
# Extract the text of the sample PDF and preview its first 200 characters.
# `file_path` and `content` are left as notebook globals; the metadata cell
# further down reads both.
# NOTE(review): the path is absolute and machine-specific.
file_path = "C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\O Acesso a documentos administrativos.pdf"
content = await get_text(file_path)
print(content[:200])

 
Página 1 
 
O Acesso a documentos administrativos  
Carla Rodrigues  
                                                                                                                  Jurista  
I. I


In [7]:
# Generate metadata for the previously extracted text. Relies on the notebook
# globals `content` and `file_path` set by the get_text cell, so that cell
# must have been run first.
from minirag.metadata_plugin import minirag_generate_metadata
meta = minirag_generate_metadata("DOC-", content, file_path)
print(meta)

# check memory usage of meta variable (deep size)
import sys
from collections import deque

def deep_getsizeof(o):
    """Sum ``sys.getsizeof`` over ``o`` and everything reachable through built-in containers.

    The walk is breadth-first and descends into dicts (keys and values),
    lists, tuples, sets and frozensets. Each object is counted once, keyed by
    ``id``, so shared references and cycles do not inflate the total. Objects
    for which ``sys.getsizeof`` raises contribute 0 bytes.

    Args:
        o: Root object to measure.

    Returns:
        Total size in bytes as an int.
    """
    visited_ids = set()
    pending = deque((o,))
    byte_count = 0

    while pending:
        current = pending.popleft()
        marker = id(current)
        if marker not in visited_ids:
            visited_ids.add(marker)

            try:
                byte_count += sys.getsizeof(current)
            except Exception:
                # Unsizable object: count it as zero bytes and keep walking.
                pass

            if isinstance(current, dict):
                pending.extend(current.keys())
                pending.extend(current.values())
            elif isinstance(current, (list, tuple, set, frozenset)):
                pending.extend(current)

    return byte_count

# Measure once and reuse the value: deep_getsizeof walks the whole structure,
# so calling it twice for one print doubles the work.
meta_size_bytes = deep_getsizeof(meta)
print(f"Deep size of meta: {meta_size_bytes:,} bytes (~{meta_size_bytes/1024:.1f} KB)")

{'doc_id': 'DOC-', 'mime': 'application/pdf', 'source_path': 'C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\O Acesso a documentos administrativos.pdf', 'pii': ['DATE_GENERIC', 'LOCATION', 'ORGANIZATION', 'PERSON'], 'pii_detections': [{'entity_type': 'DATE_GENERIC', 'start': 950, 'end': 974, 'score': 0.9, 'text': '46/2007, de 24 de agosto', 'context': 'n.º 46/2007, de 24 de agosto, reg'}, {'entity_type': 'DATE_GENERIC', 'start': 6836, 'end': 6850, 'score': 0.9, 'text': '14, março 2014', 'context': 'n.º 14, março 2014'}, {'entity_type': 'DATE_GENERIC', 'start': 26, 'end': 27, 'score': 0.86, 'text': 'm', 'context': 'documentos'}, {'entity_type': 'DATE_GENERIC', 'start': 1069, 'end': 1084, 'score': 0.86, 'text': 'de 26 de agosto', 'context': '/93, de 26 de agosto, com'}, {'entity_type': 'DATE_GENERIC', 'start': 1127, 'end': 1131, 'score': 0.86, 'text': '8/95', 'context': 'n.º 8/95, de'}, {'entity_type': 'DATE_GENERIC', 'start': 1133, '