In [None]:
# ============================================
# Module 10: Retrieval-Augmented Generation & Vector Search
# Lab 1 – Build a RAG Pipeline with Multiple PDFs
# ============================================
# Author: Dr. Dasha Trofimova
# Course: M.Sc. Applied Data Science & AI
# --------------------------------------------
# Learning Goals:
# - Understand the architecture of Retrieval-Augmented Generation (RAG)
# - Load and preprocess multiple PDFs into text chunks
# - Create embeddings and store them in a vector database (Chroma)
# - Retrieve contextually relevant chunks for question answering
# --------------------------------------------
# Lab Objectives:
# 1. Load PDFs using PyPDFLoader and chunk text
# 2. Generate embeddings with OpenAI or Hugging Face models
# 3. Store vectors persistently in Chroma
# 4. Build a retriever and connect it to a local LLM (TinyLlama)
# 5. Run interactive QA grounded in the PDF content
# ============================================
!pip install langchain langchain-community langchain-chroma langchain-huggingface \
    transformers accelerate bitsandbytes sentence-transformers PyPDF2 python-dotenv


In [None]:
import os
from google.colab import files

from transformers import pipeline
from langchain_huggingface import ChatHuggingFace
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

print("Imports loaded.")



In [None]:
from transformers import pipeline
# HuggingFacePipeline is maintained in langchain-huggingface (installed above);
# the langchain_community copy is deprecated.
from langchain_huggingface import HuggingFacePipeline

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Local text-generation pipeline for the TinyLlama chat checkpoint.
hf_gen_pipeline = pipeline(
    task="text-generation",
    model=model_name,
    tokenizer=model_name,
    torch_dtype="auto",
    device_map="auto",
    max_new_tokens=200,
    # Greedy decoding, stated explicitly. `temperature` is only consulted when
    # sampling is enabled, so the previous `temperature=0.1` had no effect other
    # than a warning; this keeps the same outputs without the warning.
    do_sample=False,
    # Return only the newly generated text rather than prompt + completion, so
    # the printed answer does not repeat the entire retrieved context.
    return_full_text=False,
)

# Wrap the transformers pipeline so it looks like an LLM to LangChain
llm = HuggingFacePipeline(pipeline=hf_gen_pipeline)

print(f"LLM ready: {model_name}")


In [None]:
# Sentence-embedding model (MiniLM); the resulting object is handed to the
# Chroma vector store when the index is built.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Embedding model ready (MiniLM).")


In [None]:
!pip install pypdf

def load_pdfs_colab():
    """
    Prompt for one or more PDF uploads in Colab and load them with PyPDFLoader.

    Files whose name does not end in ".pdf" (case-insensitive) are skipped.
    A PDF whose loader cannot be created or whose content cannot be parsed is
    reported and skipped, so a single bad file does not discard the others.

    Returns:
        list: the LangChain Document objects returned by ``PyPDFLoader.load()``
        (each page is one Document) for every file that loaded successfully,
        concatenated in the order the upload mapping yields the filenames.
    """
    from langchain_community.document_loaders import PyPDFLoader  # keep import here so reload works after installs
    from google.colab import files

    print("Please upload one or more PDF files.")
    uploaded = files.upload()

    all_pages = []

    for filename in uploaded:
        if not filename.lower().endswith(".pdf"):
            print(f"Skipping non-PDF file: {filename}")
            continue

        print(f"Loading {filename} ...")
        try:
            # Building the loader can raise on its own (e.g. an invalid path),
            # so it is created inside the try together with the parse.
            pages = PyPDFLoader(filename).load()
        except Exception as e:
            # Best-effort: report the failure and move on to the next file.
            print(f"Error loading {filename}: {e}")
            continue

        print(f"  -> Loaded {len(pages)} pages.")
        all_pages.extend(pages)

    print(f"\nTotal pages loaded across all PDFs: {len(all_pages)}")
    return all_pages


# Open the upload dialog and collect the pages of every uploaded PDF.
pages = load_pdfs_colab()

# Stop here if nothing was loaded: the later cells need at least one page.
if not pages:
    raise ValueError("No PDF pages loaded. Please rerun cell and upload at least one PDF.")



In [None]:
# Chunking parameters: chunk_size is tuned for the context window,
# chunk_overlap for continuity between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pages_split = text_splitter.split_documents(pages)

print("Created", len(pages_split), "chunks.")


In [None]:
import hashlib
from collections import Counter

persist_directory = "./chroma_store"
collection_name = "rag_docs"

# make sure directory exists
os.makedirs(persist_directory, exist_ok=True)


def _stable_chunk_ids(docs):
    """
    Derive one deterministic id per chunk from its source, page and text.

    Chunks that are identical in all three fields receive an occurrence suffix
    ("-0", "-1", ...) so that ids stay unique within a single batch.
    """
    occurrences = Counter()
    ids = []
    for doc in docs:
        fingerprint = "\x1f".join(
            (
                str(doc.metadata.get("source", "")),
                str(doc.metadata.get("page", "")),
                doc.page_content,
            )
        )
        digest = hashlib.sha256(fingerprint.encode("utf-8", errors="replace")).hexdigest()
        ids.append(f"{digest}-{occurrences[digest]}")
        occurrences[digest] += 1
    return ids


# Without explicit ids the store generates random ones, so re-running this cell
# against the same persist_directory would add a second copy of every chunk and
# the top-k retrieval could return duplicates. Stable ids make the write an
# upsert: the same chunks overwrite themselves instead of piling up.
chunk_ids = _stable_chunk_ids(pages_split)

vectorstore = Chroma.from_documents(
    documents=pages_split,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
    collection_name=collection_name,
)

print("✅ Chroma vector DB is ready.")


In [20]:
# Similarity search over the vector store; each query yields the top 2 chunks.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=2))


In [24]:
def retrieve_context(query: str):
    """
    Look up the documents most relevant to ``query``.

    Delegates to the module-level ``retriever`` and returns whatever its
    ``invoke`` call produces (the top-k LangChain Documents).
    """
    return retriever.invoke(query)

def build_prompt(user_question: str, docs):
    """
    Assemble the single text prompt that is sent to the LLM.

    Every document becomes a labelled block of the form
    "[Chunk N | source: ... | page ...]" followed by its text, numbered from 1.
    Missing "source" / "page" metadata fall back to "unknown" / "N/A". Blocks
    are separated by a "---" rule and placed between the fixed instructions
    and the user's question.

    Args:
        user_question: the question as typed by the user.
        docs: iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (text).

    Returns:
        str: the complete prompt, ending with "FINAL ANSWER:\\n".
    """
    labelled_chunks = []
    for number, doc in enumerate(docs, start=1):
        meta = doc.metadata
        label = f"[Chunk {number} | source: {meta.get('source', 'unknown')} | page {meta.get('page', 'N/A')}]"
        labelled_chunks.append(f"{label}\n{doc.page_content}")

    context_text = "\n\n---\n\n".join(labelled_chunks)

    # Fixed instructions first, then the retrieved context, then the question.
    segments = [
        "You are an assistant. You should primarily answer using the provided context.\n",
        "If the question is basic general knowledge or simple math, you MAY answer it directly.\n",
        "Otherwise, if the answer is not in the context, say:\n",
        "\"I don't see that in the provided documents.\"\n",
        "When you use the documents, cite chunks like [Chunk 2].\n\n",
        "CONTEXT:\n",
        f"{context_text}\n\n",
        "QUESTION:\n",
        f"{user_question}\n\n",
        "FINAL ANSWER:\n",
    ]
    return "".join(segments)


In [None]:
def rag_qa_loop():
    """
    Run an interactive question-answering loop over the indexed PDFs.

    For every question the loop retrieves chunks with ``retrieve_context``,
    builds the prompt with ``build_prompt``, calls ``llm.invoke`` and prints
    the answer.

    The loop ends when the user types "exit" or "quit" (any letter case,
    surrounding whitespace ignored) or interrupts the input prompt
    (Ctrl-C / end of input). Blank questions are ignored.
    """
    print("\n=== RAG QA (multi-PDF, open source model, no agents) ===")
    print("Type your question. Type 'exit' to stop.")

    while True:
        try:
            user_query = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave cleanly instead of dumping a traceback into the notebook.
            print("Goodbye.")
            break

        if user_query.lower() in ("exit", "quit"):
            print("Goodbye.")
            break

        if not user_query:
            # Nothing to search for; ask again without querying the retriever.
            continue

        # 1. retrieve top-k chunks
        docs = retrieve_context(user_query)
        if not docs:
            print("\nNo relevant context found in your PDFs.")
            continue

        # 2. build prompt for the LLM
        prompt = build_prompt(user_query, docs)

        # 3. run the model
        answer = llm.invoke(prompt)

        # Text-generation backends may return prompt + completion; show only
        # the completion so the context is not printed back to the user.
        if isinstance(answer, str):
            if answer.startswith(prompt):
                answer = answer[len(prompt):]
            answer = answer.strip()

        # 4. display
        print("\n=== ANSWER ===")
        print(answer)

# Start the interactive session; it keeps prompting until the user types 'exit' or 'quit'.
rag_qa_loop()
