In [7]:

# Imports
# =========================================================
import html
import os
import re

from IPython.display import display, Markdown
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM



### Splitting Documents into Chunks

1. When we read an entire PDF, we often get very large blocks of text — sometimes thousands of words per page. But LLMs (and embedding models) have context limits — they can’t process all that at once.

2. That’s why we split long documents into smaller, overlapping pieces, called chunks.

3. Each chunk will later become:
   - an embedding (vector representation)
   - a retrieval unit (the part that gets fetched when the user asks a question)

In [8]:

# Load PDF
# The default below is the original author's local file. Set the RAG_PDF_PATH
# environment variable to point the notebook at a PDF on another machine
# without editing this cell.
PDF_PATH = os.environ.get(
    "RAG_PDF_PATH",
    "/Users/swatichandna/SynologyDrive/GitHub/NLP/Module 10/ToyData/test.pdf",
)
loader = PyPDFLoader(PDF_PATH)
raw_docs = loader.load()
print(f"Loaded {len(raw_docs)} pages")



# Chunk documents
# Target about 800 characters per chunk with a 120-character overlap between
# neighbours. Separators are listed from coarsest (blank line) to finest (space).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=120,
    chunk_size=800,
)
chunks = splitter.split_documents(documents=raw_docs)
print(f"Created {len(chunks)} chunks")


Loaded 598 pages
Created 1134 chunks


### Create Embeddings for Each Chunk

Each chunk of text from the previous step must be converted into a vector — a list of numbers that captures its meaning.
This process is called embedding.

Why?
1. Machines can’t understand text, but they can compare numbers.
2. Each vector encodes the semantic meaning of a chunk.
3. Similar meanings → vectors are close together in multi-dimensional space.
4. So when a user asks a question, the query is also embedded, and we find the chunks whose vectors are most similar to it (via cosine similarity or L2 distance).

In [None]:

# Create embeddings + FAISS retriever
# 1) sentence-transformers model used to embed both chunks and queries
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
# 2) build the FAISS vector store from the chunks using that embedding model
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
# 3) retriever that asks the store for the 3 closest chunks per query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))
print("FAISS retriever ready")



# Helper to extract clean text
def get_context_text(query):
    """Retrieve the chunks most relevant to ``query`` and join them into one string.

    Args:
        query: Question text passed to the module-level ``retriever``.

    Returns:
        The retrieved texts separated by blank lines, or the string
        ``"No relevant context found."`` when the retriever returns nothing.
    """
    results = retriever.invoke(query)
    if not results:
        return "No relevant context found."
    # Decide per result whether it exposes ``page_content``; checking only the
    # first element would raise AttributeError (or stringify whole documents)
    # when the list is not homogeneous.
    texts = [
        doc.page_content if hasattr(doc, "page_content") else str(doc)
        for doc in results
    ]
    return "\n\n".join(texts)


In [None]:

# Hugging Face LLM setup
model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# Greedy (deterministic) decoding. The sampling controls temperature / top_p /
# top_k were removed: they only apply when do_sample=True, so combining them
# with do_sample=False had no effect on the output and is reported by
# transformers as an inconsistent generation config. To sample instead, set
# do_sample=True and add those arguments back.
hf_gen = pipeline(
    task="text-generation",  # generate a continuation of the prompt text
    model=model,  # the already-loaded causal language model
    tokenizer=tokenizer,  # converts between text and token IDs
    max_new_tokens=200,  # upper bound on the number of generated tokens
    do_sample=False,  # always pick the most likely next token (no randomness)
    repetition_penalty=1.1,  # values > 1 discourage repeating earlier tokens
    truncation=True,  # truncate input that exceeds the model's maximum length
    # Reuse EOS as the padding token id (presumably because the tokenizer
    # defines no dedicated pad token — TODO confirm).
    pad_token_id=tokenizer.eos_token_id,
)



In [None]:
# Build prompt and RAG chain

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=hf_gen)
print("Hugging Face model ready")

# Templated prompt with two required variables: {context} and {question}.
# At runtime, LangChain formats this template by substituting the retrieved
# context text and the user's question.
prompt = ChatPromptTemplate.from_template(
    """
Use the following context to answer the user's question.

Context:
{context}

Question:
{question}

Answer clearly and concisely.
"""
)

# Build RAG chain: input question -> {context, question} -> prompt -> LLM -> str
rag_chain = (
    {
        # Call get_context_text on the incoming question to fetch context text.
        "context": lambda user_question: get_context_text(user_question),
        # Forward the original input unchanged under the "question" key.
        "question": RunnablePassthrough(),
    }
    # The prompt receives a dict whose keys match the template variables and
    # returns the formatted prompt.
    | prompt
    # Send the formatted prompt to the Hugging Face model via the LangChain wrapper.
    | llm
    # Parse the model output into a plain string.
    | StrOutputParser()
)
print("RAG chain ready")



In [None]:

# =========================================================
#  Visualization utilities
# =========================================================
def highlight_chunks(chunks, color="yellow", max_chars=400):
    """Render retrieved chunks as colour-highlighted HTML blocks for Markdown display.

    Args:
        chunks: Iterable of retrieved items. An item's ``page_content`` is used
            when it has one, otherwise ``str(item)``.
        color: CSS colour used as the background of each block.
        max_chars: Maximum number of characters of each chunk to show.

    Returns:
        One ``<div>`` per chunk, joined with newlines (an empty string when
        ``chunks`` is empty).
    """
    blocks = []
    for i, c in enumerate(chunks, 1):
        text = c.page_content if hasattr(c, "page_content") else str(c)
        # Collapse all whitespace runs (newlines, tabs) into single spaces.
        text = re.sub(r"\s+", " ", text.strip())
        # Append an ellipsis only when the text was actually cut short.
        suffix = "..." if len(text) > max_chars else ""
        # Truncate first, then escape, so an HTML entity is never cut in half.
        # Escaping keeps chunk text containing <, > or & from being interpreted
        # as markup when the block is rendered.
        snippet = html.escape(text[:max_chars], quote=False)
        blocks.append(f"<div style='background-color:{color}; padding:8px; margin:6px 0;'>"
                      f"<b>Chunk {i}</b>: {snippet}{suffix}</div>")
    return "\n".join(blocks)

def visualize_retrieval(question):
    """Display the retrieved chunks, the filled prompt, and the model's answer.

    The sections are rendered one after another as Markdown output. Retrieval
    is run here for the chunk display and again through ``get_context_text``
    for the prompt preview; ``rag_chain`` is then invoked on the raw question
    to produce the answer.

    Args:
        question: The user question to retrieve context for and answer.

    Returns:
        None. All results are shown via ``display``.
    """
    docs = retriever.invoke(question)
    context_text = get_context_text(question)

    display(Markdown(f"## **Question:** {question}"))
    display(Markdown("### **Top Retrieved Chunks:**"))
    display(Markdown(highlight_chunks(docs, color="#FFF6A4")))

    # Show prompt content (truncated to the first 800 characters)
    filled_prompt = prompt.invoke({"context": context_text, "question": question})
    display(Markdown("###  **Prompt Sent to Model:**"))
    display(Markdown(f"```text\n{filled_prompt.to_string()[:800]}\n```"))

    # Generate answer
    answer = rag_chain.invoke(question)
    display(Markdown("### **Model Answer:**"))
    # Prefix every line with "> " so a multi-paragraph answer stays inside the
    # blockquote; quoting only the first line ends the quote at the first
    # blank line.
    quoted_answer = "\n".join(f"> {line}" for line in (answer.splitlines() or [""]))
    display(Markdown(quoted_answer))

# =========================================================
# Run the demo
# =========================================================
# Runs the full pipeline once on a sample question and renders
# the retrieved chunks, the prompt, and the answer below.
visualize_retrieval(question="What is a Large Language Model?")
