In [2]:
import sys, os

# Get project root — one level up from "research-report-generation"
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


## Testing Hybrid Chunker

In [3]:
import pandas as pd
from IPython.display import display, Markdown

def chunks_to_df(chunks, preview_chars=250):
    """Convert list[DocumentChunk] -> pandas dataframe for clean display."""
    rows = []
    for c in chunks:
        rows.append({
            "index": c.index,
            "chars": len(c.content),
            "tokens": c.token_count,
            "chunk_method": c.metadata.get("chunk_method"),
            "title": c.metadata.get("title"),
            "source": c.metadata.get("source"),
            "has_context": c.metadata.get("has_context", False),
            "preview": (c.content[:preview_chars] + "…") if len(c.content) > preview_chars else c.content
        })
    return pd.DataFrame(rows)

def show_chunks(chunks, n=10, preview_chars=250):
    """Show top N chunks as a table."""
    df = chunks_to_df(chunks, preview_chars=preview_chars)
    display(df.head(n))

def show_chunk_detail(chunks, idx):
    """Show a single chunk in full detail."""
    chunk = chunks[idx]
    display(Markdown(f"## Chunk #{chunk.index}"))
    display(Markdown(f"**Chars:** {len(chunk.content)} | **Tokens:** {chunk.token_count}"))
    display(Markdown(f"**Method:** `{chunk.metadata.get('chunk_method')}` | **Source:** `{chunk.metadata.get('source')}`"))
    display(Markdown("### Metadata"))
    display(pd.DataFrame([chunk.metadata]).T.rename(columns={0: "value"}))
    display(Markdown("### Content"))
    display(Markdown(f"```markdown\n{chunk.content}\n```"))


In [6]:
import asyncio
from docling.document_converter import DocumentConverter
from chunker import ChunkingConfig, create_chunker

PDF_PATH = r"C:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R0r4e.pdf"  # update

async def run_hybrid_test():
    converter = DocumentConverter()
    result = converter.convert(PDF_PATH)

    markdown = result.document.export_to_markdown()
    docling_doc = result.document

    config = ChunkingConfig(use_semantic_splitting=True, max_tokens=512)
    chunker = create_chunker(config)

    chunks = await chunker.chunk_document(
        content=markdown,
        title="Hybrid Test",
        source=PDF_PATH,
        docling_doc=docling_doc
    )

    show_chunks(chunks, n=10)
    show_chunk_detail(chunks, 0)

await run_hybrid_test()


2026-01-17 17:27:30,761 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-17 17:27:30,763 - INFO - Going to convert document batch...
2026-01-17 17:27:30,763 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-17 17:27:30,763 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-01-17 17:27:30,763 - INFO - easyocr cannot be used because it is not installed.
2026-01-17 17:27:30,763 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2026-01-17 17:27:30,779 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-17 17:27:30,780 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-17 17:27:30,781 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\birok\Python\LLMOPs\regulatory-rag-system\venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-17 17:27:30,781 [RapidOCR] main.py:50: Using C:\Users\birok\Python\LLMOPs\

Unnamed: 0,index,chars,tokens,chunk_method,title,source,has_context,preview
0,0,1,3,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,*
1,1,394,76,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Agreement\nConcerning the Adoption of Harmoniz...
2,2,177,42,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Revision 4\nIncorporating all valid text up to...
3,3,212,57,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Uniform provisions concerning the Internationa...
4,4,572,105,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,UNITED NATIONS\nFormer titles of the Agreement...
5,5,554,450,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,"Contents\nRegulation, = . Regulation, Page = ..."
6,6,513,443,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Approval.............................
7,7,614,431,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Modification of IWVTA type and mod...
8,8,744,452,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Names and addresses of Technical S...
9,9,722,481,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Arrangement of the type approval m...


## Chunk #0

**Chars:** 1 | **Tokens:** 3

**Method:** `hybrid` | **Source:** `C:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R0r4e.pdf`

### Metadata

Unnamed: 0,value
title,Hybrid Test
source,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...
chunk_method,hybrid
total_chunks,70
token_count,3
has_context,True


### Content

```markdown
*
```

## Testing Embedder.py

In [3]:
from dotenv import load_dotenv
from src.parsing.chunker import DocumentChunk, create_chunker
from src.parsing.embedder import create_embedder
from utils.providers import get_embedding_client, get_embedding_model
# Load environment variables
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [4]:
chunks = [
    DocumentChunk(
        content="RBI requires KYC verification of customer identity using valid documents.",
        index=0,
        start_char=0,
        end_char=80,
        metadata={"title": "RBI Guidelines", "source": "test"},
        token_count=20
    ),
    DocumentChunk(
        content="All suspicious transactions should be reported to FIU-IND within prescribed timelines.",
        index=1,
        start_char=81,
        end_char=170,
        metadata={"title": "RBI Guidelines", "source": "test"},
        token_count=22
    )
]

print("Chunks created:", len(chunks))
print("Sample chunk:", chunks[0].content)


Chunks created: 2
Sample chunk: RBI requires KYC verification of customer identity using valid documents.


In [5]:
import boto3, os, json

region = os.environ["AWS_DEFAULT_REGION"]
client = boto3.client("bedrock", region_name=region)

resp = client.list_foundation_models()

# show only embedding models
embed_models = [
    m for m in resp["modelSummaries"]
    if "EMBEDDING" in m.get("outputModalities", []) or "EMBEDDING" in m.get("inferenceTypesSupported", [])
]

for m in resp["modelSummaries"]:
    if "embed" in m["modelId"]:
        print(m["modelId"])


cohere.embed-v4:0
amazon.titan-embed-text-v2:0


In [6]:
import asyncio

async def test_embedder_chunks():
    embedder = create_embedder(batch_size=2)  # small batch for test
    
    def progress(current, total):
        print(f"Embedding batch {current}/{total}")

    embedded = await embedder.embed_chunks(chunks, progress_callback=progress)
    return embedded

embedded_chunks = await test_embedder_chunks()
print("Embedded chunks:", len(embedded_chunks))

Embedding batch 1/1
Embedded chunks: 2


In [7]:
for c in embedded_chunks:
    print("="*90)
    print("Index:", c.index)
    print("Text:", c.content[:80], "...")
    print("Embedding dim:", len(c.embedding))
    print("Provider:", c.metadata.get("embedding_provider"))
    print("Model:", c.metadata.get("embedding_model"))



Index: 0
Text: RBI requires KYC verification of customer identity using valid documents. ...
Embedding dim: 1024
Provider: aws_bedrock_titan
Model: amazon.titan-embed-text-v1
Index: 1
Text: All suspicious transactions should be reported to FIU-IND within prescribed time ...
Embedding dim: 1024
Provider: aws_bedrock_titan
Model: amazon.titan-embed-text-v1


In [8]:
async def test_query():
    embedder = create_embedder()
    v = await embedder.embed_query("KYC verification requirements")
    return v

qvec = await test_query()
print("Query embedding dim:", len(qvec))
print("First 10 values:", qvec[:10])


Query embedding dim: 1024
First 10 values: [-0.018093587830662727, 0.02583203837275505, 0.015874629840254784, -0.008753212168812752, -0.010822040028870106, -0.04115016385912895, 1.2576455446833279e-05, -0.052612025290727615, 0.019322151318192482, -0.02216285839676857]


## Testing Ingest.py

In [1]:
import sys, os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [2]:
data_dir=os.path.join(project_root, "data")
data_dir

'c:\\Users\\birok\\Python\\LLMOPs\\regulatory-rag-system\\data'

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import logging
import asyncio

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)


In [5]:
from utils.models import IngestionConfig
from src.indexing.ingest import DocumentIngestionPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
config = IngestionConfig(
    chunk_size=1000,
    chunk_overlap=200,
    use_semantic_chunking=True
)


In [7]:
#Create pipeline instance
pipeline = DocumentIngestionPipeline(
    config=config,
    documents_folder=str(data_dir)
)



2026-02-10 00:24:59,568 - INFO - Initializing tokenizer: sentence-transformers/all-MiniLM-L6-v2
2026-02-10 00:25:00,470 - INFO - HybridChunker initialized (max_tokens=512)
2026-02-10 00:25:00,473 - INFO - {"available_keys": ["GROQ_API_KEY", "AWS_SECRET_ACCESS_KEY", "AWS_ACCESS_KEY_ID", "AWS_DEFAULT_REGION"], "timestamp": "2026-02-09T18:55:00.473756Z", "level": "info", "event": "Environment variables validated"}
2026-02-10 00:25:00,475 - INFO - {"config_keys": ["embedding_model", "retriver", "llm"], "timestamp": "2026-02-09T18:55:00.475165Z", "level": "info", "event": "Configuration loaded successfully"}
2026-02-10 00:25:00,475 - INFO - {"timestamp": "2026-02-09T18:55:00.475165Z", "level": "info", "event": "Loading embedding model...."}


In [8]:
await pipeline.initialize()

results = await pipeline.ingest_documents(
    progress_callback=lambda c, t: print(f"Ingested {c}/{t}")
)

await pipeline.close()

results


2026-02-10 00:25:01,323 - INFO - PostgreSQL connection pool initialized
2026-02-10 00:25:01,324 - INFO - Ingestion pipeline initialized
2026-02-10 00:25:01,324 - INFO - Processing 1/1: c:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R141e.pdf
2026-02-10 00:25:01,324 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-10 00:25:01,343 - INFO - Going to convert document batch...
2026-02-10 00:25:01,343 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-02-10 00:25:01,354 - INFO - Loading plugin 'docling_defaults'
2026-02-10 00:25:01,356 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-02-10 00:25:01,365 - INFO - Loading plugin 'docling_defaults'
2026-02-10 00:25:01,365 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-02-10 00:25:01,365 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-02-10 00:25:01,365 - INFO - easyo

Ingested 1/1


[IngestionResult(document_id='fc875e83-c9d4-4f2c-9d25-ff460bfb7202', title='R141e', chunks_created=34, processing_time_ms=30555.884, errors=[])]

In [9]:
import asyncpg
import os

conn = await asyncpg.connect(os.getenv("DATABASE_URL"))

docs = await conn.fetch("SELECT title FROM documents")
chunks = await conn.fetch("SELECT COUNT(*) FROM chunks")

docs, chunks


([<Record title='r020r3e'>,
  <Record title='R023r5e'>,
  <Record title='R0r5e'>,
  <Record title='R141e'>],
 [<Record count=283>])

## Testing the PIpeline

In [2]:
import sys, os

# Get project root — one level up from "research-report-generation"
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [3]:
from src.pipelines.rag_pipeline import RAGPipeline

rag = RAGPipeline()


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
THREAD_ID = "test-session-001"

In [6]:
result = await rag.run(
    query="What is approval process?",
    thread_id=THREAD_ID,
    filters={"title": "R0r5e"},
)

print("ANSWER:\n")
print(result["answer"])



ANSWER:

The approval process for IWVTA is described in the sources as follows:

1. The approval authority shall verify that all type approval certificates issued pursuant to the UN Regulations which are applicable for IWVTA cover the IWVTA type and correspond to the prescribed requirements. [1], [2]
2. The approval authority shall ensure that the vehicle specifications and data contained in the information document (Part II of Annex 5) are included in the data in the information packages and in the type approval certificates in respect of the relevant UN Regulations. [1], [2]
3. The approval authority shall carry out or arrange to be carried out inspections to verify that the vehicle(s) is/are built in accordance with the relevant data contained in the information package in respect of the relevant type approval certificates. [1], [2]
4. The approval authority shall carry out or arrange to be carried out relevant installation checks in respect of systems, equipment and parts where app

## Testing the updated chunking with data cleaning

In [1]:
import sys, os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [2]:
from pathlib import Path
from pprint import pprint

from docling.document_converter import DocumentConverter

from src.indexing.cleaning import DocumentCleaner
from src.indexing.section_splitter import SectionSplitter
from src.chunking.chunker import create_chunker
from utils.models import ChunkingConfig


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
chunk_config = ChunkingConfig(
    chunk_size=800,
    chunk_overlap=100,
    max_chunk_size=1200,
    use_semantic_splitting=False,
)

chunker = create_chunker(chunk_config)
cleaner = DocumentCleaner()
section_splitter = SectionSplitter()
converter = DocumentConverter()


2026-02-14 15:59:51,462 - INFO - {"chunk_size": 800, "overlap": 100, "timestamp": "2026-02-14T10:29:51.462282Z", "level": "info", "event": "simple_chunker_initialized"}


In [5]:
pdf_path = "R0r5e.pdf"  # or any of your samples

result = converter.convert(pdf_path)
doc = result.document

raw_markdown = doc.export_to_markdown()

cleaned_text, cleaning_metadata = cleaner.clean(
    raw_markdown,
    doc
)

print("Cleaning metadata:")
pprint(cleaning_metadata)
print("\nPreview (first 1000 chars):\n")
print(cleaned_text[:1000])


2026-02-14 15:59:52,467 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-14 15:59:52,511 - INFO - Going to convert document batch...
2026-02-14 15:59:52,511 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-02-14 15:59:52,531 - INFO - Loading plugin 'docling_defaults'
2026-02-14 15:59:52,531 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-02-14 15:59:52,544 - INFO - Loading plugin 'docling_defaults'
2026-02-14 15:59:52,549 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-02-14 15:59:52,549 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-02-14 15:59:52,549 - INFO - easyocr cannot be used because it is not installed.
2026-02-14 15:59:52,692 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2026-02-14 15:59:52,711 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-02-14 15:59:52,720 [RapidOCR] device_con

AttributeError: 'PageItem' object has no attribute 'blocks'

#### Debugging to identify the structure

In [6]:
print(type(doc.pages))
print(type(list(doc.pages.values())[0]))


<class 'dict'>
<class 'docling_core.types.doc.document.PageItem'>


In [8]:
page = list(doc.pages.values())[0]
print(dir(page))

['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_on_complete__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_setattr_handlers__', '__pydantic_

In [9]:
print(dir(doc))


['_DocIndex', '__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_on_complete__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_setattr_handlers__',

In [11]:
if hasattr(doc, "items"):
    print(len(doc.items))
    print(type(doc.items[0]))
    print(dir(doc.items[0]))

In [12]:
for item in doc.iterate_items():
    print(type(item))
    print(dir(item))
    break


<class 'tuple'>
['__add__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'count', 'index']


In [13]:
for item in doc.iterate_items():
    if hasattr(item, "text"):
        print("TEXT:", item.text[:100])
        print("PAGE:", getattr(item, "page_no", None))
        print("BBOX:", getattr(item, "bbox", None))
        print("TYPE:", getattr(item, "type", None))
        print("-----")
        break


In [14]:
item = next(doc.iterate_items())

print("Tuple length:", len(item))
print("Types inside tuple:")

for i, element in enumerate(item):
    print(i, type(element))


Tuple length: 2
Types inside tuple:
0 <class 'docling_core.types.doc.document.TextItem'>
1 <class 'int'>


## Testing the updated chunking with data cleaning- V2

In [1]:
import sys, os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [2]:
from pathlib import Path
from pprint import pprint

from docling.document_converter import DocumentConverter

from src.indexing.cleaning import DocumentCleaner
from src.chunking.chunker import create_chunker
from utils.models import ChunkingConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
chunk_config = ChunkingConfig(
    chunk_size=800,
    chunk_overlap=100,
    max_chunk_size=1200,
    use_semantic_splitting=False,
)

chunker = create_chunker(chunk_config)
cleaner = DocumentCleaner()
converter = DocumentConverter()

2026-02-14 16:37:19,974 - INFO - {"chunk_size": 800, "overlap": 100, "timestamp": "2026-02-14T11:07:19.974906Z", "level": "info", "event": "simple_chunker_initialized"}


In [5]:
pdf_path = "R0r5e.pdf"   # change to your file

result = converter.convert(pdf_path)
doc = result.document

raw_markdown = doc.export_to_markdown()

print("Pages:", doc.num_pages())
print("Raw length:", len(raw_markdown))


2026-02-14 16:38:01,498 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-14 16:38:01,516 - INFO - Going to convert document batch...
2026-02-14 16:38:01,519 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-02-14 16:38:01,531 - INFO - Loading plugin 'docling_defaults'
2026-02-14 16:38:01,538 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-02-14 16:38:01,548 - INFO - Loading plugin 'docling_defaults'
2026-02-14 16:38:01,550 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-02-14 16:38:01,555 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-02-14 16:38:01,555 - INFO - easyocr cannot be used because it is not installed.
2026-02-14 16:38:01,657 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2026-02-14 16:38:01,680 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-02-14 16:38:01,687 [RapidOCR] device_con

Pages: 31
Raw length: 84489


In [6]:
cleaned_text, cleaning_metadata = cleaner.clean(
    raw_markdown,
    doc
)
print("Cleaning metadata:")
pprint(cleaning_metadata)

print("\nCleaned preview:\n")
print(cleaned_text[:1000])


Cleaning metadata:
{'layout_cleaning': True}

Cleaned preview:

*
Agreement
Concerning the Adoption of Harmonized Technical United Nations Regulations for Wheeled Vehicles, Equipment and Parts which can be Fitted and/or be Used on Wheeled Vehicles and the Conditions for Reciprocal Recognition of Approvals Granted on the Basis of these United Nations Regulations *
(Revision 3, including the amendments which entered into force on 14 September 2017)
_________
Addendum 0 - UN Regulation No. 0
Revision 5
Incorporating all valid text up to:
Revision 4 - Amendment 1 - Date of entry into force: 8 October 2022
05 series of amendments - Date of entry into force: 5 June 2023
Uniform provisions concerning the International Whole Vehicle Type Approval (IWVTA)
This document is meant purely as documentation tool. The authentic and legal binding texts are: ECE/TRANS/WP.29/2022/2 and
ECE/TRANS/WP.29/2022/111.
_________
UNITED NATIONS
Former titles of the Agreement: Agreement concerning the Adoption of 

In [7]:
chunks = await chunker.chunk_document(
    content=cleaned_text,
    title="Test Regulation",
    source=pdf_path,
    metadata={"test_mode": True},
    docling_doc=doc
)

print("Total chunks created:", len(chunks))


2026-02-14 16:39:12,942 - INFO - {"title": "Test Regulation", "chunk_count": 80, "timestamp": "2026-02-14T11:09:12.940436Z", "level": "info", "event": "simple_chunking_completed"}


Total chunks created: 80


In [8]:
for i, chunk in enumerate(chunks[:5]):
    print("=" * 80)
    print(f"Chunk #{i}")
    print("Chunk Index:", chunk.index)
    print("Token Count:", chunk.token_count)
    print("Metadata:", chunk.metadata)
    print("\nContent Preview:\n")
    print(chunk.content[:500])


Chunk #0
Chunk Index: 0
Token Count: 173
Metadata: {'title': 'Test Regulation', 'source': 'R0r5e.pdf', 'chunk_method': 'recursive', 'total_chunks': 80, 'test_mode': True}

Content Preview:

*
Agreement
Concerning the Adoption of Harmonized Technical United Nations Regulations for Wheeled Vehicles, Equipment and Parts which can be Fitted and/or be Used on Wheeled Vehicles and the Conditions for Reciprocal Recognition of Approvals Granted on the Basis of these United Nations Regulations *
(Revision 3, including the amendments which entered into force on 14 September 2017)
_________
Addendum 0 - UN Regulation No. 0
Revision 5
Incorporating all valid text up to:
Revision 4 - Amendment 
Chunk #1
Chunk Index: 1
Token Count: 64
Metadata: {'title': 'Test Regulation', 'source': 'R0r5e.pdf', 'chunk_method': 'recursive', 'total_chunks': 80, 'test_mode': True}

Content Preview:

Uniform provisions concerning the International Whole Vehicle Type Approval (IWVTA)
This document is meant purely as doc

In [9]:
from collections import Counter

section_counts = Counter(
    chunk.metadata.get("test_mode")
    for chunk in chunks
)

print(section_counts)


Counter({True: 80})


In [10]:
header_hits = [c for c in chunks if "E/ECE" in c.content]

print("Header chunks found:", len(header_hits))


Header chunks found: 0


In [11]:
for c in chunks:
    if "Article 1" in c.content and "Article 2" in c.content:
        print("Cross-article chunk detected!")

## Testing the updated ingestion pipeline

In [1]:
import sys, os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [2]:
data_dir=os.path.join(project_root, "data")
data_dir

'c:\\Users\\birok\\Python\\LLMOPs\\regulatory-rag-system\\data'

In [3]:
import asyncio
from pprint import pprint

from utils.models import IngestionConfig
from src.indexing.ingest import DocumentIngestionPipeline
from utils.db_utils import init_db_pool, close_db_pool, get_pool


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
config = IngestionConfig(
    chunk_size=800,
    chunk_overlap=100,
    max_chunk_size=1200,
    use_semantic_chunking=False
)


In [5]:
pipeline = DocumentIngestionPipeline(
    config=config,
    documents_folder=str(data_dir)   # make sure your test PDF is here
)


{"chunk_size": 800, "overlap": 100, "timestamp": "2026-02-14T11:21:04.809905Z", "level": "info", "event": "simple_chunker_initialized"}
{"available_keys": ["GROQ_API_KEY", "AWS_SECRET_ACCESS_KEY", "AWS_ACCESS_KEY_ID", "AWS_DEFAULT_REGION"], "timestamp": "2026-02-14T11:21:04.812106Z", "level": "info", "event": "Environment variables validated"}
{"config_keys": ["embedding_model", "retriver", "llm"], "timestamp": "2026-02-14T11:21:04.812614Z", "level": "info", "event": "Configuration loaded successfully"}
{"timestamp": "2026-02-14T11:21:04.812614Z", "level": "info", "event": "Loading embedding model...."}
{"timestamp": "2026-02-14T11:21:04.877060Z", "level": "info", "event": "embedding_client_initialized"}


In [6]:
await pipeline.initialize()

results = await pipeline.ingest_documents(
    progress_callback=lambda c, t: print(f"Ingested {c}/{t}")
)

await pipeline.close()

results


{"min_size": 1, "max_size": 10, "timestamp": "2026-02-14T11:21:05.841067Z", "level": "info", "event": "db_pool_initialized"}
{"timestamp": "2026-02-14T11:21:05.841067Z", "level": "info", "event": "ingestion_pipeline_initialized"}
{"document_count": 1, "timestamp": "2026-02-14T11:21:05.841067Z", "level": "info", "event": "ingestion_started"}
{"file": "c:\\Users\\birok\\Python\\LLMOPs\\regulatory-rag-system\\data\\R0r5e.pdf", "index": 1, "total": 1, "timestamp": "2026-02-14T11:21:05.845605Z", "level": "info", "event": "document_processing_started"}
detected formats: [<InputFormat.PDF: 'pdf'>]
Going to convert document batch...
Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
Loading plugin 'docling_defaults'
Registered picture descriptions: ['vlm', 'api']
Loading plugin 'docling_defaults'
Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
rapidocr cannot be used because onnxruntime is not installed.


Ingested 1/1


[IngestionResult(document_id='8bbb5b8e-469d-4f38-80ae-45d5577749c7', title='R0r5e', chunks_created=81, processing_time_ms=74999.767, errors=[])]

In [11]:
from utils.db_utils import init_db_pool, get_pool

async def inspect_db():
    # Initialize pool (await it)
    await init_db_pool()
    
    pool = get_pool()   # get actual pool
    
    async with pool.acquire() as conn:
        doc_count = await conn.fetchval("SELECT COUNT(*) FROM documents")
        chunk_count = await conn.fetchval("SELECT COUNT(*) FROM chunks")
        
        print("Documents:", doc_count)
        print("Chunks:", chunk_count)
        
        sample = await conn.fetchrow(
            "SELECT id, title FROM documents LIMIT 5"
        )
        
        if sample:
            print("Sample Document:", dict(sample))
        else:
            print("No documents found.")


In [12]:
await inspect_db()


Documents: 5
Chunks: 337
Sample Document: {'id': UUID('f32e37ed-9670-4026-b4f9-a3b7aa5e9dec'), 'title': 'r020r3e'}
