In [1]:
import sys, os

# Project root is two directory levels above the notebook's working directory
# (the "research-report-generation" checkout), so that `src.*` imports resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to path:", project_root)

Project root added to path: c:\Users\birok\Python\LLMOPs\research-report-generation


In [2]:
from src.api.retriever.chunking import DocumentChunker
from src.api.retriever.vectorstore import VectorStoreManager
from src.api.retriever.embedding import EmbeddingService
from src.api.retriever.ingestion import DocumentIngestor

In [3]:
# The sample PDFs live in <project_root>/temp_uploads. Building the paths from
# `project_root` (computed in the first cell) keeps this cell runnable on any
# checkout instead of depending on one user's absolute Windows path.
uploads_dir = os.path.join(project_root, "temp_uploads")

pdf_names = [
    "Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf",
    "Design Document – Internal Ml Research Copilot.pdf",
    "Internal Research Notes – Evaluation Of Retrieval-augmented Multi-agent Systems.pdf",
    "Internal Ticket – Mlplat-2478 Gpu Training Instability.pdf",
    "Postmortem – Gpu Training Pipeline Outage (sev-2).pdf",
    "Rfc-012 – Deprecation Of Tensor Flow Training Pipelines.pdf",
]

ingestor = DocumentIngestor()
docs = ingestor.ingest_files([os.path.join(uploads_dir, name) for name in pdf_names])

print("Total documents loaded:", len(docs))

# Fail with a clear message rather than an IndexError when nothing was loaded.
assert docs, f"No documents were loaded from {uploads_dir}"

# Preview the first loaded document: its metadata and the start of its text.
print(docs[0].metadata)
print(docs[0].page_content[:300])


✅ Loaded 4 Document objects from Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf
✅ Loaded 5 Document objects from Design Document – Internal Ml Research Copilot.pdf
✅ Loaded 4 Document objects from Internal Research Notes – Evaluation Of Retrieval-augmented Multi-agent Systems.pdf
✅ Loaded 3 Document objects from Internal Ticket – Mlplat-2478 Gpu Training Instability.pdf
✅ Loaded 4 Document objects from Postmortem – Gpu Training Pipeline Outage (sev-2).pdf
✅ Loaded 4 Document objects from Rfc-012 – Deprecation Of Tensor Flow Training Pipelines.pdf

✅ Total loaded documents: 24
Total documents loaded: 24
{'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System', 'author': 'ChatGPT Canvas', 'source': 'C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\temp_uploads\\Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf', 'total_pages': 4

In [4]:
# Split the loaded documents into chunks and report the before/after counts.
chunker = DocumentChunker()
chunks = chunker.chunk(docs)

doc_total = len(docs)
print("✅ Total Docs Loaded:", doc_total)
chunk_total = len(chunks)
print("✅ Total Chunks:", chunk_total)

# Show the leading text and the metadata of the first chunk.
print("\n--- First Chunk Preview ---")
first_chunk = chunks[0]
print(first_chunk.page_content[:300])
print("\n--- First Chunk Metadata ---")
print(first_chunk.metadata)


{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:26:01.947447Z", "level": "info", "event": "Configuration loaded successfully"}
{"module": "DocumentChunker", "timestamp": "2026-01-11T13:26:01.953906Z", "level": "info", "event": "Chunked 24 documents into 79 chunks."}


✅ Total Docs Loaded: 24
✅ Total Chunks: 79

--- First Chunk Preview ---
ADR-021: Adoption of a Retrieval-Augmented
Multi-Agent Research System
Status: Accepted
Date: 2026-01-08
Deciders: AI Platform Lead, ML Infrastructure Lead, Research Director
Technical Story: Internal Research Enablement
1. Context
The organization relies heavily on internal technical knowledge to m

--- First Chunk Metadata ---
{'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System', 'author': 'ChatGPT Canvas', 'source': 'C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\temp_uploads\\Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'file_name': 'Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf', 'file_type': '.pdf', 'chunk_id': 0, 'total_chunks': 4}


In [5]:
from collections import defaultdict

# Bucket chunks by the PDF they came from, keeping only chunks whose source
# path mentions the temp_uploads folder.
chunks_per_pdf = defaultdict(list)

for chunk in chunks:
    normalized_source = chunk.metadata.get("source", "").replace("\\", "/")
    if "temp_uploads" not in normalized_source:
        continue
    bucket_name = chunk.metadata.get("file_name") or "unknown"
    chunks_per_pdf[bucket_name].append(chunk)

print("✅ Chunks per PDF (temp_uploads only):\n")

# Largest PDFs first; equal counts keep their insertion order (stable sort).
ranked_pdfs = sorted(
    chunks_per_pdf.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
for pdf_name, pdf_chunks in ranked_pdfs:
    print(f"- {pdf_name}: {len(pdf_chunks)} chunks")


✅ Chunks per PDF (temp_uploads only):

- Design Document – Internal Ml Research Copilot.pdf: 16 chunks
- Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf: 14 chunks
- Internal Research Notes – Evaluation Of Retrieval-augmented Multi-agent Systems.pdf: 14 chunks
- Postmortem – Gpu Training Pipeline Outage (sev-2).pdf: 13 chunks
- Rfc-012 – Deprecation Of Tensor Flow Training Pipelines.pdf: 12 chunks
- Internal Ticket – Mlplat-2478 Gpu Training Instability.pdf: 10 chunks


In [6]:
# Collect the distinct `page` values seen for each temp_uploads PDF.
pages_per_pdf = defaultdict(set)

for loaded_doc in docs:
    meta = loaded_doc.metadata
    if "temp_uploads" not in meta.get("source", "").replace("\\", "/"):
        continue
    page = meta.get("page")
    if page is None:
        # Documents without a page number contribute nothing to the count.
        continue
    pages_per_pdf[meta.get("file_name") or "unknown"].add(page)

print("\n✅ Pages per PDF (temp_uploads only):\n")

# PDFs with the most pages first; equal counts keep their insertion order.
ranked_pages = sorted(
    pages_per_pdf.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
for pdf_name, pages in ranked_pages:
    print(f"- {pdf_name}: {len(pages)} pages")



✅ Pages per PDF (temp_uploads only):

- Design Document – Internal Ml Research Copilot.pdf: 5 pages
- Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf: 4 pages
- Internal Research Notes – Evaluation Of Retrieval-augmented Multi-agent Systems.pdf: 4 pages
- Postmortem – Gpu Training Pipeline Outage (sev-2).pdf: 4 pages
- Rfc-012 – Deprecation Of Tensor Flow Training Pipelines.pdf: 4 pages
- Internal Ticket – Mlplat-2478 Gpu Training Instability.pdf: 3 pages


In [7]:
from src.api.retriever.embedding import EmbeddingService

# Embed every chunk, then check that there is one vector per chunk and
# report the vector length.
embedding_service = EmbeddingService()
embeddings = embedding_service.embed_documents(chunks)

chunk_total = len(chunks)
print("Total chunks:", chunk_total)
embedding_total = len(embeddings)
print("Total embeddings:", embedding_total)
embedding_dim = len(embeddings[0])
print("Embedding dimension:", embedding_dim)

{"timestamp": "2026-01-11T13:26:05.750133Z", "level": "info", "event": "GROQ_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:05.750133Z", "level": "info", "event": "AWS_SECRET_ACCESS_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:05.754242Z", "level": "info", "event": "AWS_ACCESS_KEY_ID loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:05.756443Z", "level": "info", "event": "AWS_DEFAULT_REGION loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:05.757845Z", "level": "info", "event": "OPENAI_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:05.757845Z", "level": "info", "event": "OPENAI_ENDPOINT loaded successfully from environment"}
{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:26:05.764805Z", "level": "info", "even

Total chunks: 79
Total embeddings: 79
Embedding dimension: 1024


In [8]:
from src.api.retriever.vectorstore import VectorStoreManager

# Hand the chunks and their already-computed embeddings to the vector store
# manager; the returned store is reused by the retrieval cell below.
vs_manager = VectorStoreManager()
vectorstore = vs_manager.add_documents_with_embeddings(documents=chunks, embeddings=embeddings)

print("✅ Stored chunks into FAISS.")
store_type = type(vectorstore)
print("Vectorstore type:", store_type)


{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:26:41.608748Z", "level": "info", "event": "Configuration loaded successfully"}
{"timestamp": "2026-01-11T13:26:41.618798Z", "level": "info", "event": "GROQ_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:41.620832Z", "level": "info", "event": "AWS_SECRET_ACCESS_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:41.620832Z", "level": "info", "event": "AWS_ACCESS_KEY_ID loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:41.624615Z", "level": "info", "event": "AWS_DEFAULT_REGION loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:41.625950Z", "level": "info", "event": "OPENAI_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:41.625950Z", "level": "info", "event": "OPENAI_ENDPOIN

✅ Stored chunks into FAISS.
Vectorstore type: <class 'langchain_community.vectorstores.faiss.FAISS'>


In [9]:
# Question used to sanity-check retrieval against the store built above.
query = "What are the goals and non-goals of the Internal ML Research Copilot?"

# Wrap the vectorstore in a retriever that returns the top 5 chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Run the query through the retriever.
results = retriever.invoke(query)

# Report how many chunks came back, then show each hit in rank order.
print(f"Retrieved {len(results)} chunks\n")

separator = "=" * 80
for rank, hit in enumerate(results, start=1):
    hit_meta = hit.metadata
    print(separator)
    print(f"Result {rank}")
    print("File:", hit_meta.get("file_name"))
    print("Chunk ID:", hit_meta.get("chunk_id"))
    print(hit.page_content[:500])
    print()


Successfully invoked model amazon.titan-embed-text-v2:0. ResponseMetadata: {'RequestId': 'eadb1742-f3b1-4bf4-ac88-78115a6c72cf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 11 Jan 2026 12:32:58 GMT', 'content-type': 'application/json', 'content-length': '43303', 'connection': 'keep-alive', 'x-amzn-requestid': 'eadb1742-f3b1-4bf4-ac88-78115a6c72cf', 'x-amzn-bedrock-invocation-latency': '75', 'x-amzn-bedrock-input-token-count': '17'}, 'RetryAttempts': 0}


Retrieved 5 chunks

Result 1
File: Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf
Chunk ID: 0
ADR-021: Adoption of a Retrieval-Augmented
Multi-Agent Research System
Status: Accepted
Date: 2026-01-08
Deciders: AI Platform Lead, ML Infrastructure Lead, Research Director
Technical Story: Internal Research Enablement
1. Context
The organization relies heavily on internal technical knowledge to make architecture, infrastructure, and
research decisions. This knowledge is distributed across multiple formats and systems, including design
documents, architecture decision records (ADRs), RFCs, pos

Result 2
File: Adr-021 – Adoption Of Retrieval-augmented Multi-agent Research System.pdf
Chunk ID: 0
Cons: -  Higher  implementation  complexity  -  Requires  careful  prompt  and  workflow  design  -  Initial
operational overhead
5. Decision
The  organization  will  adopt  a  retrieval-augmented,  multi-agent  research  system as  the  primary
approach for internal technica

In [9]:
from src.api.retriever.retriever import SimpleRAGRetriever, SimpleRAGRetrieverWithMMR, SimpleRAGRetrieverWithThreshold
# Same question as the ad-hoc FAISS retrieval check, restated here so the
# retriever-comparison cells below do not depend on that cell having run.
query = "What are the goals and non-goals of the Internal ML Research Copilot?"

In [10]:
# One retriever per strategy under comparison. The `retreiver*` spelling (sic)
# is kept because the cells below reference these exact names.
retreiver1 = SimpleRAGRetriever(top_k=3)
retreiver2 = SimpleRAGRetrieverWithMMR(top_k=2)
# NOTE: `retreiver3` is rebound in a later cell with an explicit
# score_threshold; this instance is created without one.
retreiver3 = SimpleRAGRetrieverWithThreshold(top_k=2)

{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:26:56.235507Z", "level": "info", "event": "Configuration loaded successfully"}
{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:26:56.243199Z", "level": "info", "event": "Configuration loaded successfully"}
{"timestamp": "2026-01-11T13:26:56.246075Z", "level": "info", "event": "GROQ_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:56.246075Z", "level": "info", "event": "AWS_SECRET_ACCESS_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:56.246075Z", "level": "info", "event": "AWS_ACCESS_KEY_ID loaded successfully from environment"}
{"timestamp": "2026-01-11T13:26:56.246075Z", "level": "info", "event":

In [11]:
# Baseline: run the query through the plain similarity retriever. The bare
# expression makes the notebook display the returned Document list.
retreiver1.retrieve(query)

{"module": "VectorStoreManager", "timestamp": "2026-01-11T13:27:00.341682Z", "level": "info", "event": "Loading existing FAISS vectorstore from ./faiss_db"}
{"module": "AutonomousReportGenerator", "timestamp": "2026-01-11T13:27:00.346186Z", "level": "info", "event": "Simple similarity retriever initialized."}
Successfully invoked model amazon.titan-embed-text-v2:0. ResponseMetadata: {'RequestId': 'a63b446b-9454-4909-a442-6f5c1026f271', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 11 Jan 2026 13:27:01 GMT', 'content-type': 'application/json', 'content-length': '43303', 'connection': 'keep-alive', 'x-amzn-requestid': 'a63b446b-9454-4909-a442-6f5c1026f271', 'x-amzn-bedrock-invocation-latency': '75', 'x-amzn-bedrock-input-token-count': '17'}, 'RetryAttempts': 0}


[Document(id='Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc', metadata={'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Design Document – Internal Ml Research Copilot', 'author': 'ChatGPT Canvas', 'source': 'C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\temp_uploads\\Design Document – Internal Ml Research Copilot.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'file_name': 'Design Document – Internal Ml Research Copilot.pdf', 'file_type': '.pdf', 'chunk_id': 0, 'total_chunks': 4, 'chunk_uid': 'Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc'}, page_content='Design Document: Internal ML Research Copilot\nAuthor: AI Platform Team\nDate: 2026-01-08\nStatus: Draft\n1. Overview\nThe  Internal  ML  Research  Copilot is  a  retrieval-augmented,  multi-agent  AI  system  designed  to  help\nengineers,  researchers,  and  technical  leaders  answer  complex,  organization-specific  quest

In [12]:
# Same query through the MMR variant, for comparison with the similarity
# baseline; the returned Document list is displayed as the cell output.
retreiver2.retrieve(query)

{"module": "VectorStoreManager", "timestamp": "2026-01-11T13:27:04.899337Z", "level": "info", "event": "Loading existing FAISS vectorstore from ./faiss_db"}
{"module": "AutonomousReportGenerator", "timestamp": "2026-01-11T13:27:04.907984Z", "level": "info", "event": "MMR retriever initialized."}
Successfully invoked model amazon.titan-embed-text-v2:0. ResponseMetadata: {'RequestId': '8fcb60ba-9e57-4744-b807-a92988c40545', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 11 Jan 2026 13:27:05 GMT', 'content-type': 'application/json', 'content-length': '43303', 'connection': 'keep-alive', 'x-amzn-requestid': '8fcb60ba-9e57-4744-b807-a92988c40545', 'x-amzn-bedrock-invocation-latency': '85', 'x-amzn-bedrock-input-token-count': '17'}, 'RetryAttempts': 0}


[Document(id='Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc', metadata={'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Design Document – Internal Ml Research Copilot', 'author': 'ChatGPT Canvas', 'source': 'C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\temp_uploads\\Design Document – Internal Ml Research Copilot.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'file_name': 'Design Document – Internal Ml Research Copilot.pdf', 'file_type': '.pdf', 'chunk_id': 0, 'total_chunks': 4, 'chunk_uid': 'Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc'}, page_content='Design Document: Internal ML Research Copilot\nAuthor: AI Platform Team\nDate: 2026-01-08\nStatus: Draft\n1. Overview\nThe  Internal  ML  Research  Copilot is  a  retrieval-augmented,  multi-agent  AI  system  designed  to  help\nengineers,  researchers,  and  technical  leaders  answer  complex,  organization-specific  quest

In [15]:
# Rebind `retreiver3` to a threshold retriever with an explicit
# score_threshold (the earlier instance was created without one), then run
# the same query and display the returned Documents.
# NOTE(review): how 0.5 is interpreted (similarity vs. distance, inclusive or
# not) is defined inside SimpleRAGRetrieverWithThreshold — confirm the cutoff.
retreiver3 = SimpleRAGRetrieverWithThreshold(top_k=2, score_threshold=0.5)
retreiver3.retrieve(query)

{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:28:19.237181Z", "level": "info", "event": "Configuration loaded successfully"}
{"path": "C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\src\\config\\configuration.yaml", "keys": ["chroma", "retriever", "embedding_model", "llm"], "timestamp": "2026-01-11T13:28:19.241136Z", "level": "info", "event": "Configuration loaded successfully"}
{"timestamp": "2026-01-11T13:28:19.241136Z", "level": "info", "event": "GROQ_API_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:28:19.241136Z", "level": "info", "event": "AWS_SECRET_ACCESS_KEY loaded successfully from environment"}
{"timestamp": "2026-01-11T13:28:19.249680Z", "level": "info", "event": "AWS_ACCESS_KEY_ID loaded successfully from environment"}
{"timestamp": "2026-01-11T13:28:19.250642Z", "level": "info", "event":

[Document(id='Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc', metadata={'producer': 'WeasyPrint 65.1', 'creator': 'ChatGPT', 'creationdate': '', 'title': 'Design Document – Internal Ml Research Copilot', 'author': 'ChatGPT Canvas', 'source': 'C:\\Users\\birok\\Python\\LLMOPs\\research-report-generation\\temp_uploads\\Design Document – Internal Ml Research Copilot.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'file_name': 'Design Document – Internal Ml Research Copilot.pdf', 'file_type': '.pdf', 'chunk_id': 0, 'total_chunks': 4, 'chunk_uid': 'Design Document – Internal Ml Research Copilot.pdf::p0::c0::c0280f71f1cc'}, page_content='Design Document: Internal ML Research Copilot\nAuthor: AI Platform Team\nDate: 2026-01-08\nStatus: Draft\n1. Overview\nThe  Internal  ML  Research  Copilot is  a  retrieval-augmented,  multi-agent  AI  system  designed  to  help\nengineers,  researchers,  and  technical  leaders  answer  complex,  organization-specific  quest