# RAG Pipeline Experiments

This notebook documents the experimental process for building the NTT Data Agent System's RAG pipeline.

**Experiments covered:**
1. Chunking Strategies (Markdown vs Recursive)
2. Vector Store Ingestion with Gemini Embeddings
3. Agent Tool Development with Year Filtering
4. Streaming Response Testing

> **Note:** The final production code has been extracted to `src/` modules. This notebook serves as documentation of the R&D process.

In [1]:
import sys
import os

# Treat the parent of the current working directory as the project root and
# make it importable (added once, even if the cell is re-run).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path += [project_root]

print(f"Project root added to path: {project_root}")

Project root added to path: c:\Users\ahmet\OneDrive\Masaüstü\ntt_case\ntt_clean_start


In [3]:
# Import chunking utilities - directly from libraries (experiment setup)
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import tiktoken
# import matplotlib.pyplot as plt

# Pull variables from a local .env file into the process environment first,
# so the lookups below can see them.
load_dotenv()

# Configuration (will be moved to src/utils/config.py after experiments)
# -- values that can be overridden through the environment
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")  # None when the variable is not set
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME = os.environ.get(
    "QDRANT_COLLECTION_NAME",
    "ntt_semantic_search_v1",
)
# -- fixed experiment settings
EMBEDDING_MODEL = "models/embedding-001"
LLM_MODEL = "gemini-2.5-flash"
RAG_K = 5  # number of documents requested per similarity search

print("Modules imported successfully.")
print(f"Config: QDRANT_URL={QDRANT_URL}, COLLECTION={QDRANT_COLLECTION_NAME}")

Modules imported successfully.
Config: QDRANT_URL=http://localhost:6333, COLLECTION=ntt_semantic_search_v2


In [4]:
# Recursively collect every *.md file below <project_root>/data/processed,
# keeping the order in which os.walk yields them.
processed_dir = os.path.join(project_root, "data", "processed")
md_files = [
    os.path.join(root, file)
    for root, _subdirs, files in os.walk(processed_dir)
    for file in files
    if file.endswith(".md")
]

print(f"Found {len(md_files)} markdown files:")
for md_path in md_files:
    print(f"- {os.path.basename(md_path)}")

Found 10 markdown files:
- sr_2019_p.md
- sr_2020.md
- sr_2020_cb_p.md
- sr_2021.md
- sr_2020_cb_v.md
- sr_2022.md
- sr2023.md
- sr_2023_cb_v.md
- sr2024.md
- sr_2024_cb_v.md


In [None]:
"""
Experiment 1: Markdown Header Splitter

This approach splits text based on markdown headers (#, ##, ###).
Pros: Preserves document structure and semantic boundaries.
Cons: Chunk sizes can be highly variable.

Conclusion: Good for structured documents, but needs secondary splitting for large sections.
"""
# The histogram needs matplotlib, which is not imported in the setup cell.
# Import it here and degrade gracefully: without it the token statistics are
# still printed and only the plot is skipped (instead of a NameError being
# reported as a file-processing error for every file).
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

print(f"{'='*20} MARKDOWN CHUNKER ANALYSIS {'='*20}")

# Direct implementation using LangChain
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
enc = tiktoken.get_encoding("cl100k_base")

# Defined up front so the follow-up inspection cell can test `chunks` even
# when no file could be processed.
chunks = []

for file_path in md_files[:2]:  # Limit to first 2 files for demo
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        chunks = markdown_splitter.split_text(content)
        tokens = [len(enc.encode(chunk.page_content)) for chunk in chunks]

        print(f"\nFile: {os.path.basename(file_path)}")
        print(f"Total Chunks: {len(chunks)}")

        if not chunks:
            # Average/min/max and the histogram are undefined without chunks;
            # skipping avoids a ZeroDivisionError on the average.
            print("No chunks produced; skipping statistics.")
            continue

        print(f"Avg Tokens: {sum(tokens)/len(tokens):.0f}, Min: {min(tokens)}, Max: {max(tokens)}")
        print(f"Sample Chunk Metadata: {chunks[0].metadata}")

        if plt is None:
            print("matplotlib is not installed; skipping histogram.")
            continue

        # Token-count distribution, with the 1000-token target marked.
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.hist(tokens, bins=30, color='skyblue', edgecolor='black')
        ax.set_title(f'Markdown Header Splitter - {os.path.basename(file_path)}')
        ax.set_xlabel('Token Count')
        ax.set_ylabel('Frequency')
        ax.axvline(x=1000, color='red', linestyle='--', label='Target: 1000 tokens')
        ax.legend()
        plt.show()
        plt.close(fig)  # release the figure; one is created per file

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {file_path}: {e}")

In [None]:
# Show the metadata and the first 300 characters of one chunk from the most
# recent markdown split. Index 4 is preferred; shorter lists fall back to
# their last element.
if len(chunks) > 0:
    print("Sample Chunk (index 4 or last available):")
    sample_idx = min(4, len(chunks) - 1)
    sample_chunk = chunks[sample_idx]
    print(f"Metadata: {sample_chunk.metadata}")
    print(f"Content Preview: {sample_chunk.page_content[:300]}...")

# Experiment 1 (continued): Recursive Character Splitter with Token Awareness

In [5]:
print(f"{'='*20} RECURSIVE CHUNKER ANALYSIS {'='*20}")

from langchain_core.documents import Document
from datetime import datetime

# Chunking config - derived from this experiment (sizes are in tokens)
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
MIN_CHUNK_SIZE = 300  # chunks with fewer tokens than this are discarded

# Token-aware recursive splitter, built directly with LangChain
recursive_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)

enc = tiktoken.get_encoding("cl100k_base")
all_documents = []

for file_path in md_files:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Metadata is derived from the path: .../<year folder>/<file name>
        path_parts = os.path.normpath(file_path).split(os.sep)
        year_folder, source = path_parts[-2], path_parts[-1]
        if "ntt_" in year_folder:
            year = year_folder.replace("ntt_", "")
        else:
            year = "unknown"
        # Stored as int when purely numeric, otherwise kept as the raw string.
        year_value = int(year) if year.isdigit() else year

        # Last-modified time of the file, as an ISO-8601 string
        update_date = datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat()

        # Split text
        raw_chunks = recursive_splitter.split_text(content)

        # Keep only chunks that reach the minimum token count
        valid_chunks = []
        for chunk_text in raw_chunks:
            token_count = len(enc.encode(chunk_text))
            if token_count < MIN_CHUNK_SIZE:
                continue
            valid_chunks.append(
                Document(
                    page_content=chunk_text,
                    metadata={
                        "source": source,
                        "year": year_value,
                        # position among the kept chunks, not among raw_chunks
                        "chunk_index": len(valid_chunks),
                        "token_count": token_count,
                        "update_date": update_date,
                    },
                )
            )

        all_documents += valid_chunks
        print(f"Processed {source} (Year: {year}): {len(valid_chunks)} chunks (filtered from {len(raw_chunks)})")

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {file_path}: {e}")

print(f"\nTotal documents prepared: {len(all_documents)}")

Processed sr_2019_p.md (Year: 2019): 98 chunks (filtered from 259)
Processed sr_2020.md (Year: 2020): 80 chunks (filtered from 447)
Processed sr_2020_cb_p.md (Year: 2020): 9 chunks (filtered from 11)
Processed sr_2021.md (Year: 2021): 86 chunks (filtered from 426)
Processed sr_2020_cb_v.md (Year: 2022): 18 chunks (filtered from 18)
Processed sr_2022.md (Year: 2022): 90 chunks (filtered from 311)
Processed sr2023.md (Year: 2023): 108 chunks (filtered from 602)
Processed sr_2023_cb_v.md (Year: 2023): 16 chunks (filtered from 27)
Processed sr2024.md (Year: 2024): 111 chunks (filtered from 475)
Processed sr_2024_cb_v.md (Year: 2024): 34 chunks (filtered from 44)

Total documents prepared: 650


# Experiment 2: Hybrid Indexing with Gemini Embeddings, FastEmbed BM25 & Qdrant (Cosine Similarity)

This step indexes the chunks generated above into Qdrant using **Gemini** dense embeddings together with **FastEmbed** (BM25) sparse vectors.
We explicitly configure the dense vectors to use **Cosine Similarity** and ensure metadata (like `year`) is preserved for filtering.


In [12]:
print(f"{'='*20} INDEXING LIKE SRC/VECTORSTORE/INDEXER.PY (GEMINI DENSE) {'='*20}")

import uuid
from qdrant_client import QdrantClient
from qdrant_client.http import models
from fastembed import SparseTextEmbedding
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# 1. Setup Clients & Models
# Connect to real Qdrant instance
print(f"Connecting to Qdrant at {QDRANT_URL}...")
client = QdrantClient(url=QDRANT_URL)
COLLECTION_NAME = "ntt_hybrid_experiment"

# Dense Embedding (Gemini)
print(f"Initializing Gemini Embeddings with model: {EMBEDDING_MODEL}")
# User requested task_type="SEMANTIC_SIMILARITY"
dense_model = GoogleGenerativeAIEmbeddings(
    model=EMBEDDING_MODEL,
    google_api_key=GOOGLE_API_KEY,
    task_type="semantic_similarity"
)

# Get dimension dynamically by embedding a throwaway string
dummy_vector = dense_model.embed_query("test")
vector_size = len(dummy_vector)
print(f"Detected vector size: {vector_size}")

# Sparse Embedding (FastEmbed - BM25)
sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")

# 2. Create Collection (Hybrid: Dense + Sparse)
# NOTE: destructive - an existing collection with this name is dropped so the
# experiment always starts from an empty index.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "dense": models.VectorParams(
            size=vector_size, # Dynamic size
            distance=models.Distance.COSINE
        )
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
    }
)
print(f"Collection '{COLLECTION_NAME}' created with Hybrid Config (Gemini Dense).")

# 3. Create Payload Index
# The year is stored twice (see the payload below): at the top level, used by
# the hybrid verification query in this cell, and under "metadata", which is
# the key the agent tool cells filter on ("metadata.year"). Index both.
for year_field in ("year", "metadata.year"):
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=year_field,
        field_schema=models.PayloadSchemaType.INTEGER
    )
print("Payload index for 'year' created.")

# 4. Index Documents
if 'all_documents' in locals() and all_documents:
    print(f"Indexing {len(all_documents)} documents...")

    batch_size = 10
    total_docs = len(all_documents)
    total_batches = (total_docs + batch_size - 1) // batch_size

    for i in range(0, total_docs, batch_size):
        batch_docs = all_documents[i : i + batch_size]
        texts = [doc.page_content for doc in batch_docs]
        metadatas = [doc.metadata for doc in batch_docs]

        # Generate Embeddings
        # Dense (Gemini)
        dense_embeddings = dense_model.embed_documents(texts)

        # Sparse (FastEmbed)
        sparse_embeddings = list(sparse_model.embed(texts))

        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense": dense, # Gemini returns list[float] directly
                    "sparse": models.SparseVector(
                        indices=sparse.indices.tolist(),
                        values=sparse.values.tolist()
                    )
                },
                payload={
                    "content": text, # Mapped to 'content' as requested
                    # Nested copy for LangChain-style consumers, which read
                    # Document.metadata and "metadata.*" filters from this key.
                    "metadata": meta,
                    # Flat copy kept for direct Qdrant queries on e.g. "year".
                    **meta
                }
            )
            for text, meta, dense, sparse in zip(texts, metadatas, dense_embeddings, sparse_embeddings)
        ]

        # Upsert Batch
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points
        )
        print(f"Indexed batch {i//batch_size + 1}/{total_batches}")

    print("Indexing complete.")
else:
    # Make the skipped step visible instead of silently searching an empty collection.
    print("No documents to index: run the chunking cell first to build `all_documents`.")

# 5. Verify Search (Hybrid + Filter)
print("\nTesting Hybrid Search with Year Filter (2023)...")
query_text = "sustainability goals"

# Embed Query
query_dense = dense_model.embed_query(query_text)
query_sparse = list(sparse_model.embed([query_text]))[0]

# The same year restriction is applied to both prefetch branches.
year_filter = models.Filter(
    must=[models.FieldCondition(key="year", match=models.MatchValue(value=2023))]
)

results = client.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=query_dense,
            using="dense",
            filter=year_filter,
            limit=2
        ),
        models.Prefetch(
            query=models.SparseVector(indices=query_sparse.indices.tolist(), values=query_sparse.values.tolist()),
            using="sparse",
            filter=year_filter,
            limit=2
        ),
    ],
    # Merge the dense and sparse candidates with Reciprocal Rank Fusion
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=2
)

for point in results.points:
    payload = point.payload or {}
    # Tolerate points without a "content" field instead of failing on None[:100].
    preview = (payload.get('content') or '')[:100]
    print(f"- [{payload.get('year')}] {preview}...")

Connecting to Qdrant at http://localhost:6333...
Initializing Gemini Embeddings with model: models/embedding-001
Detected vector size: 768
Collection 'ntt_hybrid_experiment' created with Hybrid Config (Gemini Dense).
Payload index for 'year' created.
Indexing 650 documents...
Indexed batch 1/65
Indexed batch 2/65
Indexed batch 3/65
Indexed batch 4/65
Indexed batch 5/65
Indexed batch 6/65
Indexed batch 7/65
Indexed batch 8/65
Indexed batch 9/65
Indexed batch 10/65
Indexed batch 11/65
Indexed batch 12/65
Indexed batch 13/65
Indexed batch 14/65
Indexed batch 15/65
Indexed batch 16/65
Indexed batch 17/65
Indexed batch 18/65
Indexed batch 19/65
Indexed batch 20/65
Indexed batch 21/65
Indexed batch 22/65
Indexed batch 23/65
Indexed batch 24/65
Indexed batch 25/65
Indexed batch 26/65
Indexed batch 27/65
Indexed batch 28/65
Indexed batch 29/65
Indexed batch 30/65
Indexed batch 31/65
Indexed batch 32/65
Indexed batch 33/65
Indexed batch 34/65
Indexed batch 35/65
Indexed batch 36/65
Indexed batc

# Agent Tool with Year Filtering (Initial Version)

In [None]:
print(f"{'='*20} AGENT TOOL EXPERIMENT (v1 - Single Year) {'='*20}")

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.agents import create_agent
from langchain_core.tools import tool
# Requires the `langchain-qdrant` package; without it this import raises
# ModuleNotFoundError and the rest of the cell does not run.
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from pydantic import BaseModel, Field
from qdrant_client.http import models
from typing import Optional

# Initialize embeddings directly.
# Use the same task_type as the indexing cell so query vectors are produced
# the same way as the stored document vectors.
embeddings = GoogleGenerativeAIEmbeddings(
    model=EMBEDDING_MODEL,
    google_api_key=GOOGLE_API_KEY,
    task_type="semantic_similarity"
)

# Initialize Qdrant client directly
client = QdrantClient(url=QDRANT_URL)
# Use the same collection name as defined in the indexing cell
COLLECTION_NAME = "ntt_hybrid_experiment"

# The indexing cell creates the collection with *named* vectors ("dense" /
# "sparse") and stores the chunk text under the "content" payload key.
# LangChain's defaults (unnamed vector, "page_content") do not match that
# layout, so both are passed explicitly.
# Document.metadata and "metadata.*" filters are read from the nested
# "metadata" payload key (LangChain's default); make sure the indexed payload
# provides it.
vector_store = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
    vector_name="dense",
    content_payload_key="content",
)
print(f"Connected to Qdrant collection: {COLLECTION_NAME}")

# Initial schema - single year (later improved to List[int])
class SearchInputV1(BaseModel):
    # Free-text query handed to the vector search.
    query: str = Field(description="The semantic search query.")
    # Year as a string; only purely numeric values are used as a filter.
    year: Optional[str] = Field(
        default=None,
        description="Single year to filter by.",
    )


@tool(args_schema=SearchInputV1)
def search_knowledge_base_v1(query: str, year: Optional[str] = None) -> str:
    """Initial version - single year filter. See v2 for improved version."""
    # NOTE: the docstring above is left untouched on purpose - it doubles as
    # the tool description.
    try:
        # Build the optional year restriction first, then search once.
        year_filter = None
        if year and year.isdigit():
            year_condition = models.FieldCondition(
                key="metadata.year",
                match=models.MatchValue(value=int(year)),
            )
            year_filter = models.Filter(must=[year_condition])

        results = vector_store.similarity_search(query=query, k=3, filter=year_filter)
        if not results:
            return "No documents found."

        # One "Source/Content" entry per hit; content is cut to 500 characters.
        return "".join(
            f"Source: {doc.metadata.get('source')} (Year: {doc.metadata.get('year')})\n"
            f"Content: {doc.page_content[:500]}...\n\n"
            for doc in results
        )
    except Exception as e:
        # Errors are returned as text rather than raised, so the caller
        # always receives a string.
        return f"Error: {str(e)}"

print("Tool v1 defined. Issue: Cannot handle year ranges like '2021-2024'.")



ModuleNotFoundError: No module named 'langchain_qdrant'

## Experiment 4: Streaming Agent with Year List Filter

**Improvement:** Changed `year: str` to `years: List[int]` to support range queries.

This is the approach used in production (`src/agent/tools/rag_tool.py`).

In [None]:
print(f"{'='*20} AGENT TOOL EXPERIMENT (v2 - Year List + Streaming) {'='*20}")

from typing import List, Optional

# Improved schema - List[int] for year ranges
class SearchInputV2(BaseModel):
    # Free-text query handed to the vector search.
    query: str = Field(description="The semantic search query to find relevant information.")
    # Explicit list of years; None or an empty list means no year restriction.
    years: Optional[List[int]] = Field(
        default=None,
        description="List of specific years to filter by. If a range is given (e.g., '2021-2023'), expand it to a list: [2021, 2022, 2023]."
    )


@tool(args_schema=SearchInputV2)
def search_knowledge_base_v2(query: str, years: Optional[List[int]] = None) -> str:
    """
    Production-ready search tool.
    Searches NTT Data sustainability reports with optional year filtering.
    """
    # NOTE: the docstring above is left untouched on purpose - it doubles as
    # the tool description.
    try:
        # MatchAny gives OR semantics over the requested years.
        if years:
            search_filter = models.Filter(
                must=[
                    models.FieldCondition(
                        key="metadata.year",
                        match=models.MatchAny(any=years),
                    )
                ]
            )
        else:
            search_filter = None

        results = vector_store.similarity_search(
            query=query,
            k=RAG_K,
            filter=search_filter,
        )

        # Debug trace of what was retrieved
        print(f"[DEBUG] Query: '{query}', Years: {years}, Results: {len(results)}")
        for doc in results:
            print(f"  - Year: {doc.metadata.get('year')}, Source: {doc.metadata.get('source')}")

        if not results:
            years_str = ", ".join(map(str, years)) if years else "all years"
            return f"No documents found for: {years_str}."

        # One section per hit: source line, full content, dashed separator.
        separator = "-" * 20 + "\n"
        sections = []
        for doc in results:
            source = doc.metadata.get('source', 'Unknown')
            doc_year = doc.metadata.get('year', 'Unknown')
            sections.append(
                f"Source: {source} (Year: {doc_year})\n"
                f"Content: {doc.page_content}\n"
                + separator
            )
        return "".join(sections)

    except Exception as e:
        # Log the failure and return it as text rather than raising.
        print(f"[ERROR] {str(e)}")
        return f"Search error: {str(e)}"

# System prompt - derived from this experiment
SYSTEM_PROMPT = (
    "You are a helpful assistant for NTT Data. You have access to a tool that retrieves context from sustainability reports. "
    "Use the tool to help answer user queries. "
    "CRITICAL INSTRUCTION FOR DATES: "
    "If the user asks for a range of years (e.g., 'between 2021 and 2023' or 'from 2020 to 2022'), "
    "you MUST calculate all intermediate years and pass them as a list to the tool (e.g., years=[2021, 2022, 2023])."
)

# Chat model with temperature 0
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, google_api_key=GOOGLE_API_KEY, temperature=0)

# Agent wired to the v2 search tool only
agent_tools = [search_knowledge_base_v2]
agent = create_agent(llm, agent_tools, system_prompt=SYSTEM_PROMPT)
print(f"Agent created with model: {LLM_MODEL}")

# Streaming test: a Turkish question covering the year range 2021-2024
print(f"\n{'='*20} STREAMING TEST {'='*20}")
user_input = "2021-2024 arasında NTT Data'nın karbon nötr hedefleri nelerdir?"
print(f"User: {user_input}\n")

agent_request = {"messages": [{"role": "user", "content": user_input}]}
# Each streamed event carries a "messages" list; print its last entry.
for event in agent.stream(agent_request, stream_mode="values"):
    latest_message = event["messages"][-1]
    latest_message.pretty_print()