In [2]:
import sys, os

# Resolve the project root: two directory levels above the notebook's
# current working directory ("../.."), so project packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


## Testing Hybrid Chunker

In [3]:
import pandas as pd
from IPython.display import display, Markdown

def chunks_to_df(chunks, preview_chars=250):
    """Convert a list of chunk objects into a pandas DataFrame for display.

    Args:
        chunks: Iterable of chunk objects exposing ``index``, ``content``,
            ``token_count`` and a dict-like ``metadata`` attribute.
        preview_chars: Maximum number of content characters shown in the
            ``preview`` column; longer content is cut and suffixed with "…".

    Returns:
        DataFrame with one row per chunk. The column set is fixed, so an
        empty ``chunks`` input still yields a frame with the expected
        columns instead of a column-less one.
    """
    columns = [
        "index",
        "chars",
        "tokens",
        "chunk_method",
        "title",
        "source",
        "has_context",
        "preview",
    ]
    rows = [
        {
            "index": c.index,
            "chars": len(c.content),
            "tokens": c.token_count,
            "chunk_method": c.metadata.get("chunk_method"),
            "title": c.metadata.get("title"),
            "source": c.metadata.get("source"),
            "has_context": c.metadata.get("has_context", False),
            # Truncate long content so the table stays readable.
            "preview": (c.content[:preview_chars] + "…") if len(c.content) > preview_chars else c.content,
        }
        for c in chunks
    ]
    return pd.DataFrame(rows, columns=columns)

def show_chunks(chunks, n=10, preview_chars=250):
    """Display the first ``n`` chunks as a summary table.

    The table is built by ``chunks_to_df`` with the given ``preview_chars``
    and rendered through IPython's ``display``.
    """
    summary = chunks_to_df(chunks, preview_chars=preview_chars)
    top_rows = summary.head(n)
    display(top_rows)

def show_chunk_detail(chunks, idx):
    """Render one chunk in full: header, stats, metadata table, and content.

    Args:
        chunks: Indexable collection of chunk objects.
        idx: Position of the chunk to render; indexed directly, so an
            out-of-range value raises the collection's own error.
    """
    def render(text):
        # Every textual section is shown as rendered Markdown.
        display(Markdown(text))

    picked = chunks[idx]

    render(f"## Chunk #{picked.index}")
    render(f"**Chars:** {len(picked.content)} | **Tokens:** {picked.token_count}")
    render(f"**Method:** `{picked.metadata.get('chunk_method')}` | **Source:** `{picked.metadata.get('source')}`")

    render("### Metadata")
    # One-row frame transposed so each metadata key becomes a row.
    metadata_table = pd.DataFrame([picked.metadata]).T.rename(columns={0: "value"})
    display(metadata_table)

    render("### Content")
    render(f"```markdown\n{picked.content}\n```")


In [6]:
import asyncio
from docling.document_converter import DocumentConverter
from chunker import ChunkingConfig, create_chunker

PDF_PATH = r"C:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R0r4e.pdf"  # update

async def run_hybrid_test():
    converter = DocumentConverter()
    result = converter.convert(PDF_PATH)

    markdown = result.document.export_to_markdown()
    docling_doc = result.document

    config = ChunkingConfig(use_semantic_splitting=True, max_tokens=512)
    chunker = create_chunker(config)

    chunks = await chunker.chunk_document(
        content=markdown,
        title="Hybrid Test",
        source=PDF_PATH,
        docling_doc=docling_doc
    )

    show_chunks(chunks, n=10)
    show_chunk_detail(chunks, 0)

await run_hybrid_test()


2026-01-17 17:27:30,761 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-17 17:27:30,763 - INFO - Going to convert document batch...
2026-01-17 17:27:30,763 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-17 17:27:30,763 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-01-17 17:27:30,763 - INFO - easyocr cannot be used because it is not installed.
2026-01-17 17:27:30,763 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2026-01-17 17:27:30,779 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-17 17:27:30,780 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-17 17:27:30,781 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\birok\Python\LLMOPs\regulatory-rag-system\venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-17 17:27:30,781 [RapidOCR] main.py:50: Using C:\Users\birok\Python\LLMOPs\

Unnamed: 0,index,chars,tokens,chunk_method,title,source,has_context,preview
0,0,1,3,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,*
1,1,394,76,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Agreement\nConcerning the Adoption of Harmoniz...
2,2,177,42,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Revision 4\nIncorporating all valid text up to...
3,3,212,57,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Uniform provisions concerning the Internationa...
4,4,572,105,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,UNITED NATIONS\nFormer titles of the Agreement...
5,5,554,450,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,"Contents\nRegulation, = . Regulation, Page = ..."
6,6,513,443,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Approval.............................
7,7,614,431,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Modification of IWVTA type and mod...
8,8,744,452,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Names and addresses of Technical S...
9,9,722,481,hybrid,Hybrid Test,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...,True,Contents\n= Arrangement of the type approval m...


## Chunk #0

**Chars:** 1 | **Tokens:** 3

**Method:** `hybrid` | **Source:** `C:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R0r4e.pdf`

### Metadata

Unnamed: 0,value
title,Hybrid Test
source,C:\Users\birok\Python\LLMOPs\regulatory-rag-sy...
chunk_method,hybrid
total_chunks,70
token_count,3
has_context,True


### Content

```markdown
*
```

## Testing Embedder.py

In [3]:
from dotenv import load_dotenv
from src.parsing.chunker import DocumentChunk, create_chunker
from src.parsing.embedder import create_embedder
from utils.providers import get_embedding_client, get_embedding_model
# Load variables from a .env file into the process environment; a later
# cell reads AWS_DEFAULT_REGION from os.environ.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [4]:
# Hand-built fixture: two short regulatory sentences wrapped as
# DocumentChunk objects so the embedder can be exercised without parsing
# a real document.
# NOTE(review): start_char / end_char / token_count are hardcoded and do
# not match len(content) (73 and 86 characters) — confirm whether anything
# downstream relies on them.
chunks = [
    DocumentChunk(
        content=text,
        index=position,
        start_char=start,
        end_char=end,
        # A fresh dict per chunk, so the chunks never share metadata state.
        metadata={"title": "RBI Guidelines", "source": "test"},
        token_count=tokens,
    )
    for position, (text, start, end, tokens) in enumerate([
        ("RBI requires KYC verification of customer identity using valid documents.", 0, 80, 20),
        ("All suspicious transactions should be reported to FIU-IND within prescribed timelines.", 81, 170, 22),
    ])
]

print("Chunks created:", len(chunks))
print("Sample chunk:", chunks[0].content)


Chunks created: 2
Sample chunk: RBI requires KYC verification of customer identity using valid documents.


In [5]:
import boto3, os, json

# Region comes from the environment; raises KeyError if
# AWS_DEFAULT_REGION is not set.
region = os.environ["AWS_DEFAULT_REGION"]
client = boto3.client("bedrock", region_name=region)

resp = client.list_foundation_models()

# Keep only models whose output modality is EMBEDDING. The previous extra
# check against "inferenceTypesSupported" was dropped: that field lists
# inference types, not modalities, so it could not identify embedders.
embed_models = [
    m for m in resp["modelSummaries"]
    if "EMBEDDING" in m.get("outputModalities", [])
]

# Print from the filtered list so the filter above is the single source
# of truth (it was previously computed but never used).
for m in embed_models:
    print(m["modelId"])


cohere.embed-v4:0
amazon.titan-embed-text-v2:0


In [6]:
import asyncio

async def test_embedder_chunks():
    embedder = create_embedder(batch_size=2)  # small batch for test
    
    def progress(current, total):
        print(f"Embedding batch {current}/{total}")

    embedded = await embedder.embed_chunks(chunks, progress_callback=progress)
    return embedded

embedded_chunks = await test_embedder_chunks()
print("Embedded chunks:", len(embedded_chunks))

Embedding batch 1/1
Embedded chunks: 2


In [7]:
# Per-chunk report: separator, index, first 80 characters of the text,
# embedding length, and the provider/model recorded in the metadata.
for c in embedded_chunks:
    print("=" * 90)
    print(f"Index: {c.index}")
    print(f"Text: {c.content[:80]} ...")
    print(f"Embedding dim: {len(c.embedding)}")
    print(f"Provider: {c.metadata.get('embedding_provider')}")
    print(f"Model: {c.metadata.get('embedding_model')}")



Index: 0
Text: RBI requires KYC verification of customer identity using valid documents. ...
Embedding dim: 1024
Provider: aws_bedrock_titan
Model: amazon.titan-embed-text-v1
Index: 1
Text: All suspicious transactions should be reported to FIU-IND within prescribed time ...
Embedding dim: 1024
Provider: aws_bedrock_titan
Model: amazon.titan-embed-text-v1


In [8]:
async def test_query():
    embedder = create_embedder()
    v = await embedder.embed_query("KYC verification requirements")
    return v

qvec = await test_query()
print("Query embedding dim:", len(qvec))
print("First 10 values:", qvec[:10])


Query embedding dim: 1024
First 10 values: [-0.018093587830662727, 0.02583203837275505, 0.015874629840254784, -0.008753212168812752, -0.010822040028870106, -0.04115016385912895, 1.2576455446833279e-05, -0.052612025290727615, 0.019322151318192482, -0.02216285839676857]


## Testing Ingest.py

In [1]:
import sys, os

# Resolve the project root: two directory levels above the notebook's
# current working directory ("../.."), so project packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [2]:
# Folder holding the documents to ingest: <project_root>/data. It is passed
# to the ingestion pipeline as documents_folder in a later cell.
data_dir=os.path.join(project_root, "data")
data_dir

'c:\\Users\\birok\\Python\\LLMOPs\\regulatory-rag-system\\data'

In [3]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; a later
# cell reads DATABASE_URL via os.getenv.
load_dotenv()

True

In [4]:
import logging
import asyncio

# Configure the root logger at INFO level with a
# "timestamp - LEVEL - message" line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)


In [5]:
from utils.models import IngestionConfig
from src.indexing.ingest import DocumentIngestionPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Ingestion settings handed to DocumentIngestionPipeline in the next cell.
# NOTE(review): the units of chunk_size / chunk_overlap (characters vs
# tokens) are not visible in this notebook — confirm in IngestionConfig.
config = IngestionConfig(
    chunk_size=1000,
    chunk_overlap=200,
    use_semantic_chunking=True
)


In [7]:
# Create the ingestion pipeline over the data folder, using the config
# built in the previous cell. data_dir is already a str (os.path.join
# result), so str() leaves it unchanged.
pipeline = DocumentIngestionPipeline(
    config=config,
    documents_folder=str(data_dir)
)



2026-02-10 00:24:59,568 - INFO - Initializing tokenizer: sentence-transformers/all-MiniLM-L6-v2
2026-02-10 00:25:00,470 - INFO - HybridChunker initialized (max_tokens=512)
2026-02-10 00:25:00,473 - INFO - {"available_keys": ["GROQ_API_KEY", "AWS_SECRET_ACCESS_KEY", "AWS_ACCESS_KEY_ID", "AWS_DEFAULT_REGION"], "timestamp": "2026-02-09T18:55:00.473756Z", "level": "info", "event": "Environment variables validated"}
2026-02-10 00:25:00,475 - INFO - {"config_keys": ["embedding_model", "retriver", "llm"], "timestamp": "2026-02-09T18:55:00.475165Z", "level": "info", "event": "Configuration loaded successfully"}
2026-02-10 00:25:00,475 - INFO - {"timestamp": "2026-02-09T18:55:00.475165Z", "level": "info", "event": "Loading embedding model...."}


In [8]:
await pipeline.initialize()

results = await pipeline.ingest_documents(
    progress_callback=lambda c, t: print(f"Ingested {c}/{t}")
)

await pipeline.close()

results


2026-02-10 00:25:01,323 - INFO - PostgreSQL connection pool initialized
2026-02-10 00:25:01,324 - INFO - Ingestion pipeline initialized
2026-02-10 00:25:01,324 - INFO - Processing 1/1: c:\Users\birok\Python\LLMOPs\regulatory-rag-system\data\R141e.pdf
2026-02-10 00:25:01,324 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-10 00:25:01,343 - INFO - Going to convert document batch...
2026-02-10 00:25:01,343 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-02-10 00:25:01,354 - INFO - Loading plugin 'docling_defaults'
2026-02-10 00:25:01,356 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-02-10 00:25:01,365 - INFO - Loading plugin 'docling_defaults'
2026-02-10 00:25:01,365 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-02-10 00:25:01,365 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2026-02-10 00:25:01,365 - INFO - easyo

Ingested 1/1


[IngestionResult(document_id='fc875e83-c9d4-4f2c-9d25-ff460bfb7202', title='R141e', chunks_created=34, processing_time_ms=30555.884, errors=[])]

In [9]:
import asyncpg
import os

conn = await asyncpg.connect(os.getenv("DATABASE_URL"))

docs = await conn.fetch("SELECT title FROM documents")
chunks = await conn.fetch("SELECT COUNT(*) FROM chunks")

docs, chunks


([<Record title='r020r3e'>,
  <Record title='R023r5e'>,
  <Record title='R0r5e'>,
  <Record title='R141e'>],
 [<Record count=283>])

## Testing the Pipeline

In [2]:
import sys, os

# Resolve the project root: two directory levels above the notebook's
# current working directory ("../.."), so project packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\regulatory-rag-system


In [3]:
from src.pipelines.rag_pipeline import RAGPipeline

# Build the RAG pipeline with its default constructor arguments.
rag = RAGPipeline()


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Identifier passed as thread_id to rag.run in the query cell.
THREAD_ID = "test-session-001"

In [6]:
# Ask one question, passing a title filter ("R0r5e") and the thread id
# defined above, then print the "answer" entry of the returned result.
result = await rag.run(
    query="What is approval process?",
    thread_id=THREAD_ID,
    filters={"title": "R0r5e"},
)

print("ANSWER:\n")
print(result["answer"])



ANSWER:

The approval process for IWVTA is described in the sources as follows:

1. The approval authority shall verify that all type approval certificates issued pursuant to the UN Regulations which are applicable for IWVTA cover the IWVTA type and correspond to the prescribed requirements. [1], [2]
2. The approval authority shall ensure that the vehicle specifications and data contained in the information document (Part II of Annex 5) are included in the data in the information packages and in the type approval certificates in respect of the relevant UN Regulations. [1], [2]
3. The approval authority shall carry out or arrange to be carried out inspections to verify that the vehicle(s) is/are built in accordance with the relevant data contained in the information package in respect of the relevant type approval certificates. [1], [2]
4. The approval authority shall carry out or arrange to be carried out relevant installation checks in respect of systems, equipment and parts where app