# Data Connector

## Phase 1: The Ingestion Loop (Data Preparation)

In [7]:
import os
import re
import hashlib
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

#Langchain imports
from  langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import PineconeSparseVectorStore
from pinecone import Pinecone, ServerlessSpec

In [8]:
# ─────────────────────────────────────────────
# 1. CONFIGURATION
# ─────────────────────────────────────────────

load_dotenv()  #load from .env file

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

#Ingestion settings

DATA_DIR = "./data"
PINECONE_INDEX_NAME = "multi-agents"
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSIONS = 3072

CHUNK_SIZE = 700

CHUNK_OVERLAP = 70


# ─────────────────────────────────────────────
# 2. DATA LOADING (PDF Connector)
# ─────────────────────────────────────────────

def load_pdfs(data_dir: str) -> list:
    """
    Load all PDF files from the specified directory.
    Uses PyPDFLoader to extract text from each page.
    """
    print("Step 1: Loading PDF document...")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        print(f"  Created empty '{data_dir}' folder. Add your PDFs there and re-run")
        return []
    
    loader = DirectoryLoader(
        data_dir,
        glob = "**/*.pdf",
        loader_cls = PyPDFLoader,
        show_progress = True
    )

    documents = loader.load()
    print(f' Loaded{len(documents)} pages from PDFs')
    return documents


# ─────────────────────────────────────────────
# 3. DOCUMENT CLEANING
# ─────────────────────────────────────────────

def clean_documents(documents: list) -> list:
    """
    Clean loaded documents by removing noise:
    - Excessive whitespace
    - Page numbers
    - Common headers/footers
    - Empty pages
    """

    print(" Step2: Cleaning documents...")
    cleaned = []
    removed_count = 0

    for doc in documents:
        text = doc.page_content

        # Remove page numbers
        text = re.sub(r'\n\s*Page\s*\d+\s*(of\s*\d+)?\s*\n', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'\n\s*-\s*\d+\s*-\s*\n', '\n', text)

        # Remove excessive newlines (3+ → 2) 
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Remove excessive spaces
        text = re.sub(r' {3,}', ' ', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        # Skip nearly empty pages (less than 50 chars of actual content)
        if len(text) < 50:
            removed_count += 1
            continue

        doc.page_content = text
        cleaned.append(doc)

    print(f" ✅Cleaned {len(cleaned)} pages ({removed_count}) empty pages removed")
    return cleaned

In [9]:
from pinecone.db_control.models.index_description import ServerlessSpec
# ─────────────────────────────────────────────
# 4. CHUNKING (Recursive Character Splitting)
# ─────────────────────────────────────────────

def chunk_documents(documents: list) -> list:
    """
    Split documents into smaller chunks using Recursive Character Splitting.
    
    Strategy:
    - Chunk size: 700 tokens 
    - Overlap: 70 tokens (10%) so context isn't lost at edges
    - Splits on paragraphs first, then sentences, then words
    """

    print(" Step 3: Chunking documents...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = CHUNK_SIZE,
        chunk_overlap = CHUNK_OVERLAP,
        length_function = len,
        separators = [
            "\n\n",   # Double newline (paragraphs) — preferred split
            "\n",     # Single newline
            ". ",     # Sentences
            ", ",     # Clauses
            " ",      # Words
            ""        # character (last resort) 
        ],
        is_separator_regex = False,
    )

    chunks = text_splitter.split_documents(documents)
  
    # Add metadata to each chunk for traceability
    for i, chunk in enumerate(chunks):
        chunk.metadata["chunk_id"] = i
        chunk.metadata["chunk_total"] = len(chunks)

        #keep source files name clean
        if "source" in chunk.metadata:
            chunk.metadata["source"] = Path(chunk.metadata["source"]).name

    print(f" ✅Created {len(chunks)} chunks from {len(documents)} pages")

    # Show chunk size statistics
    sizes = [len(c.page_content) for c in chunks]
    print(f"Chunk sizes — Min: {min(sizes)}, Max: {max(sizes)}, Avg: {sum(sizes)//len(sizes)}")

    return chunks


# ─────────────────────────────────────────────
# 5. EMBEDDING & VECTOR STORAGE (Pinecone)
# ─────────────────────────────────────────────


def create_pinecone_index(pc: Pinecone) -> None:

    """
    Create the Pinecone index if it doesn't already exist.
    """
    existing_indexes = [idx.name for idx in pc.list_indexes()]
    
    if PINECONE_INDEX_NAME not in existing_indexes:
        print(f" Creating Pinecone index '{PINECONE_INDEX_NAME}'...")
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            dimension=EMBEDDING_DIMENSIONS,
            metric="cosine",
            spec={
                "serverless": {
                    "cloud": "aws",
                    "region": "us-east-1"
                }
            }
        )
        print(f" ✅ Pinecone index '{PINECONE_INDEX_NAME}' created!")
    else:
        print(f" ✅ Pinecone index '{PINECONE_INDEX_NAME}' already exists.")


def embed_and_store(chunks: list) -> PineconeVectorStore:
    """
    Embed all chunks using OpenAI and store them in Pinecone.
    
    This is the most expensive step (API calls for each chunk).
    """
    print("Step 4: Embedding chunks and storing in Pinecone...")

    #Initialize the embedding model
    embeddings = OpenAIEmbeddings(
        model=EMBEDDING_MODEL,
        openai_api_key=OPENAI_API_KEY,
    )

    #Initialize the Pinecone client
    pc = Pinecone(api_key=PINECONE_API_KEY)

    #Create the index if it doesn't exist
    create_pinecone_index(pc)

    print(f" Embedding {len(chunks)} chunks...")
    
    # Connect to the vector store
    vectorstore = PineconeVectorStore(
        index_name=PINECONE_INDEX_NAME,
        embedding=embeddings,
        pinecone_api_key=PINECONE_API_KEY
    )
    
    # Generate a unique ID for each chunk based on its text
    ids = [hashlib.md5(c.page_content.encode('utf-8')).hexdigest() for c in chunks]
    
    # Add documents using their unique IDs to prevent duplicates
    vectorstore.add_documents(documents=chunks, ids=ids)

    print(f" All {len(chunks)} chunks embedded and stored in Pinecone.")
    return vectorstore


# ─────────────────────────────────────────────
# 6. VERIFICATION (Test Query)
# ─────────────────────────────────────────────

def verify_ingestion(vectorstore: PineconeVectorStore) -> None:
    """
    Run a test similarity search to confirm everything works.
    """

    print("\n Step 5: Verification — Running test query...")
    
    test_query = "What is the main topic of these documents?"
    results = vectorstore.similarity_search(test_query, k=3)

    print(f"\n✅Test query returned {len(results)} results!\n")

    for i, result in enumerate(results, 1):
        source = result.metadata.get("source_file", "Unknown")
        preview = result.page_content[:150].replace("\n", " ")
        print(f" Result{i}. from{source}:")
        print(f".  \"{preview}...\"")
        print()

In [10]:

# ─────────────────────────────────────────────
# 7. MAIN PIPELINE
# ─────────────────────────────────────────────

def run_ingestion_pipeline():
    """
    Execute the full ingestion pipeline:
    Load → Clean → Chunk → Embed → Store → Verify
    """
    print("=" * 80)
    print("MULTI-AGENTS— INGESTION PIPELINE")
    print("=" * 80)
    print()

    # VALIDATE API KEYS
    if not OPENAI_API_KEY:
        raise ValueError("❌OPENAI_API_KEY is not set. Add it to your .env file.")
    if not PINECONE_API_KEY:
        raise ValueError("❌PINECONE_API_KEY is not set. Add it to your .env file.")
    
    print(f"Config:")
    print(f"   Data directory:  {DATA_DIR}")
    print(f"   Embedding model: {EMBEDDING_MODEL}")
    print(f"   Chunk size:      {CHUNK_SIZE} chars, {CHUNK_OVERLAP} overlap")
    print(f"   Pinecone index:  {PINECONE_INDEX_NAME}")
    print()

    #Step 1: Load PDFs
    documents = load_pdfs(DATA_DIR)
    if not documents:
        print("❌No documents found. Add PDFs to the 'data/' folder.")
        return
    
    #Step 2: Clean
    documents =  clean_documents(documents)

    #Step 3: Chunk
    chunks = chunk_documents(documents)

    #step 4 & 5: Embed and store
    vectorstore = embed_and_store(chunks)

    #step 6: Verify
    verify_ingestion(vectorstore)

    print("=" * 80)
    print(" ✅ INGESTION COMPLETE! Your data is ready for Phase 2.")
    print("=" * 80)


if __name__ == "__main__":
    run_ingestion_pipeline()

MULTI-AGENTS— INGESTION PIPELINE

Config:
   Data directory:  ./data
   Embedding model: text-embedding-3-large
   Chunk size:      700 chars, 70 overlap
   Pinecone index:  multi-agents

Step 1: Loading PDF document...


100%|██████████| 7/7 [00:04<00:00,  1.63it/s]


 Loaded103 pages from PDFs
 Step2: Cleaning documents...
 ✅Cleaned 101 pages (2) empty pages removed
 Step 3: Chunking documents...
 ✅Created 300 chunks from 101 pages
Chunk sizes — Min: 27, Max: 700, Avg: 567
Step 4: Embedding chunks and storing in Pinecone...
 Creating Pinecone index 'multi-agents'...
 ✅ Pinecone index 'multi-agents' created!
 Embedding 300 chunks...
 All 300 chunks embedded and stored in Pinecone.

 Step 5: Verification — Running test query...

✅Test query returned 3 results!

 Result1. fromUnknown:
.  "met. Technical writer Jason GrossoDocuments features, creates user guides, and maintains project documentation. I. Introduction Current situation: "Gr..."

 Result2. fromUnknown:
.  "8GDPR (General Data Protection Regulation): This is a regulation in EU law that protects the privacy and personal data of EU citizens. It gives indivi..."

 Result3. fromUnknown:
.  "Functional Specifiactions | Team 6 Table of Contents 1. Document handling 1.1 Document information Docume

## Phase 2: The Inference Loop (The Chatbot)

In [11]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


# ─────────────────────────────────────────────
# 1. QUERY TRANSFORMATION
# ─────────────────────────────────────────────

# Initialize the LLM for query transformation
llm = ChatOpenAI(
    model_name="gpt-4o", 
    temperature=0. # Deterministic output for consistent rewrites
)

# Query transform prompt
query_transform_prompt =  ChatPromptTemplate.from_messages([
    ("system", """You are a query transformation assistant. Your job is to rewrite 
the user's question to be more specific and search-friendly for a vector database search.

Rules: 
- Expand abbreviations and acronyms if possible
- Add relevant context words
- Make the query self-contained (don't assume context)
- Keep it concise but specific
- Return ONLY the rewritten query, nothing else"""),
    ("human", "Output query: {query}\n\nRewritten query: ")
])
    
query_transform_chain = query_transform_prompt | llm

def transform_query(user_query: str) -> str:
    """
    Rewrite the user's query to be more search-friendly.
    """
    result = query_transform_chain.invoke({"query": user_query})
    transformed = result.content.strip()
    return transformed


# ─────────────────────────────────────────────
# 2. RETRIEVAL(HYBRID SEARCH)
# ─────────────────────────────────────────────


# ── Connect to your existing Pinecone index ──
def connect_to_vectorstore() -> PineconeVectorStore:
    """
    Connect to the existing Pinecone index created in Phase 1.
    No re-embedding needed — we're just connecting.
    """

    embeddings = OpenAIEmbeddings(
        model=EMBEDDING_MODEL,
        openai_api_key=OPENAI_API_KEY,
    )
    vectorstore = PineconeVectorStore(
        index_name=PINECONE_INDEX_NAME,
        embedding=embeddings,
        pinecone_api_key=PINECONE_API_KEY,
    )

    print(f" ✅ Connected to Pinecone index '{PINECONE_INDEX_NAME}'")
    return vectorstore

# ── Retrieval function ──
# def retrieve_documents(vectorstore: PineconeVectorStore, query: str, k: int = 20): 
#     """
#     Retrieve the top-k most relevant chunks using similarity search.
#     We fetch 20 results here (not 5) because the Reranker in Step 3 
#     will narrow it down to the top 5.
#     """

#     print(f"\n Step 2: Retrieving top {k} relevant documents")
#     retrieved_docs = vectorstore.similarity_search(query, k=k)

#     print(f" ✅ Retrieved {len(retrieved_docs)} candidate chunks")
#     for i, doc in enumerate(retrieved_docs[:5], 1): #Preview top 5
#         source = doc.metadata.get("Source", "Unknown")
#         preview = doc.page_content[:80].replace("\n", " ")
#         print(f" [{i}] (score: {source}: \"{preview}...\"")
    
#     return retrieved_docs


# ─────────────────────────────────────────────
# 3. RERANKING (THE MAIN SOURCE)
# ─────────────────────────────────────────────

from langchain_cohere import CohereRerank
from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever

COHERE_API_KEY = os.getenv("COHERE_API_KEY")

def rerank_documents(vectorstore, query: str, top_n: int = 5):
    """
    Use Cohere Reranker to re-score and select the top-n most relevant chunks.  
    Flow: 20 candidates → Cross-Encoder scoring → Top 5 winners
    """

    print(f"\n Step 3: Reranking to find top {top_n} results...")

    # Create base retriever (fetches 20 candidates)
    base_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

    # Create the ranker
    reranker = CohereRerank(
        cohere_api_key = COHERE_API_KEY,
        model = "rerank-v3.5",
        top_n = top_n
    )

    # Wrap retriever with reranker
    compression_retriever = ContextualCompressionRetriever(
        base_compressor = reranker,
        base_retriever = base_retriever
    )

    # Get reranked results
    reranked_docs = compression_retriever.invoke(query)

    print(f"  ✅ Reranked to top {len(reranked_docs)} chunks")
    for i, doc in enumerate(reranked_docs, 1):
        source = doc.metadata.get("source", "Unknown")
        score = doc.metadata.get("relevance_score", "N/A")
        preview = doc.page_content[:80].replace("\n", " ")
        print(f"  [{i}] (relevance: {score}) {source}: \"{preview}...\"")
    
    return reranked_docs


# ─────────────────────────────────────────────
# 4. CONTEXT ASSEMBLY & GENERATION
# ─────────────────────────────────────────────

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

rag_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a corporate assistant for the Multi-Agents project. 
    Your job is to answer questions ONLY using the provided context documents.

    STRICT RULES:
    1. Answer ONLY using the provided context. Do NOT use prior knowledge.
    2. If the answer is not in the context, say: "I do not have enough information to answer this based on the available documents."
    3. Always cite which document your answer comes from (e.g., "Based on [document name]...").
    4. Be concise, professional, and precise.
    5. If the context contains conflicting information, mention both sources and the discrepancy.
    
    CONTEXT DOCUMENTS:
    {context}
    """),
        ("human", "{question}")
])

# The generation LLM
generation_llm = ChatOpenAI(
    model = "gpt-4o",
    openai_api_key = OPENAI_API_KEY,
    temperature = 0.1 # low temperature for factual accuracy
)

rag_chain = rag_prompt | generation_llm | StrOutputParser()


def format_context(docs) -> str:
    """
    Format the retrieved documents into a clean context string
    with source attribution.
    """

    context_parts = []
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get("source", "Unknown")
        page = doc.metadata.get("page", "N/A")
        context_parts.append(
            f"[Document {i} | Source: {source} | Page: {page}]\n{doc.page_content}"
        )
    return "\n\n---\n\n".join(context_parts)

def generate_answer(docs, question: str) -> str:
    """
    Generate the final answer using the reranked documents and the user's question.
    """

    print(f"\n Step 4: Generating answer...")

    context = format_context(docs)
    
    answer = rag_chain.invoke({
        "context": context,
        "question": question
    })
    
    print(f"  ✅ Answer generated!\n")
    return answer

In [20]:
def ask(question: str) -> str:
    """
    The full RAG inference pipeline:
    Query Transform → Retrieve → Rerank → Generate
    """
    print("=" * 80)
    print(f"❓ USER QUESTION: {question}")
    print("=" * 80)

    # Step 0: Connect to vector store
    vectorstore = connect_to_vectorstore()

    # Step 1: Transform the query
    transformed_query = transform_query(question)

    # Step 2: Retrieve candidates (top 20)
    # results = retrieve_documents(vectorstore, transformed_query, k=20)

    #  Step 2 & 3:Retrieve & Rerank to top 5
    top_docs = rerank_documents(vectorstore, transformed_query, top_n=5)

    # Step 4: Generate answer
    answer = generate_answer(top_docs, question)

    print("=" * 80)
    print("ANSWER:")
    print("=" * 80)
    print(answer)
    print("=" * 80)
    
    return answer

# ── Test ──
ask("According to the document what is the current version and who is the author of locindoor")


❓ USER QUESTION: According to the document what is the current version and who is the author of locindoor
 ✅ Connected to Pinecone index 'multi-agents'

 Step 3: Reranking to find top 5 results...
  ✅ Reranked to top 5 chunks
  [1] (relevance: 0.17046688) locindoor.pdf: "Functional Specification of LOC-INDOOR (Indoor Localisation) Contents   Line-Bas..."
  [2] (relevance: 0.023529684) locindoor.pdf: "Postconditions: User completes all planned destinations System logs complete jou..."
  [3] (relevance: 0.022322435) locindoor.pdf: "iBeacon: Apple's protocol for Bluetooth Low Energy (BLE) proximity sensing that ..."
  [4] (relevance: 0.02211875) locindoor.pdf: "Navigation 10.2 UI Components 10.2.1 Information Displays Distance remaining Est..."
  [5] (relevance: 0.02064795) locindoor.pdf: "Floor detection 9.1.5 Map Rendering 3D Map Engine Purpose: Displays venue maps a..."

 Step 4: Generating answer...
  ✅ Answer generated!

ANSWER:
Based on [Document 1 | Source: locindoor.pdf | Page: 0.

'Based on [Document 1 | Source: locindoor.pdf | Page: 0.0], the current version of the document is 1.1, and the author is TSANGUE VIVIEN BISTREL.'