## Day 1 - Introduction to RAG

In [None]:
# Python Environment - Day 1 Task 1
# Installs the notebook's dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, and faiss-cpu, streamlit and chromadb are
# not imported by any cell visible in this notebook — confirm they are needed.
%pip install transformers sentence-transformers  faiss-cpu gradio streamlit chromadb

In [None]:
# Choose a Dataset - Day 1 Task 2 - 3 - 4
## Dataset: ./data/cat-facts.txt

In [None]:
# Test Environment - Day 1 Task 5
# Smoke test: build a GPT-2 text-generation pipeline and print one short
# completion to confirm that transformers is installed and can load a model.
from transformers import pipeline
# NOTE(review): `generator` is rebound to a FLAN-T5 pipeline in the Day 3 cell.
generator = pipeline('text-generation', model='gpt2')
print(generator("Hello, my name is", max_length=10))

## Day 2 - Core Components of a RAG Pipeline (Data, Embeddings, and Retrieval)

### Chunking implementation (On Cat Data)

In [None]:
# Chunking implementation - Day 2 Task 1
def chunk_text(text, max_length=500):
    """Split ``text`` into sentence-aligned chunks.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``.
    Consecutive sentences are packed greedily into the current chunk while the
    running length (each sentence plus one separating space) stays within
    ``max_length``.

    Args:
        text: The raw document text.
        max_length: Character budget per chunk. A single sentence longer than
            this budget is kept whole as its own chunk rather than being cut
            mid-sentence, so such a chunk can exceed the budget.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings in document
        order. Empty or whitespace-only input yields an empty list.
    """
    import re

    # Split right after sentence-ending punctuation, consuming the whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if not sentence:
            # Empty input produces a single empty "sentence"; skip it so no
            # blank chunk is emitted.
            continue
        if len(current_chunk) + len(sentence) + 1 <= max_length:
            current_chunk += sentence + " "
        else:
            # Only flush a chunk that actually holds text: when the very first
            # sentence is already over budget there is nothing to flush yet.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

In [None]:
# Read the whole cat-facts corpus into memory as a single string.
with open("./data/cat-facts.txt", "r", encoding="utf-8") as f:
    text = f.read()

# First chunk (index 0) — a quick sanity check of the chunker's output.
print(chunk_text(text=text, max_length=500)[0])

### Embedding the chunks

In [None]:
# Embedding the Chunks - Day 2  Task 2
from sentence_transformers import SentenceTransformer

# Split the corpus (`text`, read in an earlier cell) into chunks using the
# same 500-character budget as the preview cell.
chunks = chunk_text(text, max_length=500)
# This model is reused later to embed queries, so chunk and query vectors are
# produced by the same encoder.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode every chunk; presumably one embedding row per chunk, in the same
# order as `chunks` — the retrieval code relies on that alignment.
chunk_embeddings = embedding_model.encode(chunks)

### Vector index storage

In [None]:
# Store in a Simple Vector index - Day 2 Task 3
import numpy as np

# The "index" is just a NumPy array built from the chunk embeddings; retrieval
# is a brute-force cosine-similarity scan over its rows.
vectors = np.array(chunk_embeddings)
# Keep the chunk texts in the same order as the rows of `vectors`, so that a
# row index found during retrieval maps straight back to its text.
chunks_list = chunks 


### Test the retrieval with a query ⭐

In [None]:
# Test the retrieval on a Simple Query - Day 2 Task 4
def retrieve(query, vectors, chunks_list, model):
    """Return the chunk most similar to ``query`` together with its score.

    The query is embedded with ``model.encode`` and compared against every row
    of ``vectors`` using cosine similarity.

    Args:
        query: The question text.
        vectors: 2-D array of chunk embeddings (norms are taken along axis 1).
        chunks_list: Chunk texts, indexed like the rows of ``vectors``.
        model: Object exposing ``encode(list_of_texts)``.

    Returns:
        A ``(chunk_text, similarity)`` tuple for the best-scoring chunk.
    """
    query_embedding = model.encode([query])[0]
    # The small epsilon keeps the division finite if a norm is zero.
    norm_product = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_embedding)
    similarities = np.dot(vectors, query_embedding) / (norm_product + 1e-9)
    best = int(np.argmax(similarities))
    return chunks_list[best], similarities[best]

In [None]:
# Inspect the result - Day 2 Task 5
from sentence_transformers import SentenceTransformer

# NOTE(review): this reloads the same model name that the embedding cell has
# already bound to `embedding_model`; the reload looks redundant.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


Query = "What is a cat lover called"
# The bare last expression displays the (chunk, score) tuple from retrieve().
retrieve(query=Query, vectors=vectors, chunks_list=chunks_list, model=embedding_model)

### Save embeddings

In [None]:
# Save your work - Day 2 Task 6
import numpy as np
import json

# Persist the artifacts under ./data/ — the same paths the loading cells read
# from — so that a fresh "Restart & Run All" loads exactly what was just saved.
np.save('./data/embeddings.npy', vectors) # chunk_embeddings


# Save the chunk texts in the same order as the rows of `vectors`.
with open("./data/chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks_list, f)


### Load the embeddings ⭐

In [None]:
# Load the embeddings - Day 2 Task 7
import numpy as np
import json

# Rebinds `vectors` and `chunks_list` from the files on disk, replacing the
# in-memory values computed by the earlier cells.
vectors = np.load("./data/embeddings.npy")

# Chunk texts; expected to be in the same order as the rows of `vectors`.
with open('./data/chunks.json', "r") as f:
    chunks_list = json.load(f)

In [None]:
# Sanity check of the loaded artifacts: the first 10 embedding rows, a blank
# separator, then the first 10 chunk texts.
print(vectors[:10])
print("\n")
print(chunks_list[:10])

## Day 3: Building Your First RAG System (End-to-End QA)

### Generate an Answer (Pipeline)

In [None]:
# Integrate Retrieval and Generation (PIPELINE VERSION) - Day 3 Task 1

from transformers import pipeline
from sentence_transformers import SentenceTransformer


# Query embedder: the same model name that was used to embed the chunks.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the model and tokenizer for generation (this might download weights the first time)

# Rebinds `generator` (the GPT-2 smoke-test pipeline from Day 1) to FLAN-T5.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

def answer_query(query, top_k=3):
    """Answer ``query`` from the ``top_k`` most similar chunks (pipeline version).

    Reads the module-level ``embedding_model``, ``vectors``, ``chunks_list``
    and ``generator``.

    Args:
        query: The user's question.
        top_k: How many of the best-scoring chunks to place in the context.

    Returns:
        The ``generated_text`` of the first sequence returned by ``generator``.
    """
    # Embed the question with the same model that produced `vectors`.
    query_vec = embedding_model.encode([query])[0]
    denominator = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec) + 1e-9
    similarities = np.dot(vectors, query_vec) / denominator

    # argsort is ascending: keep the last `top_k` indices, then flip them so
    # the best match comes first.
    best_first = similarities.argsort()[-top_k:][::-1]
    context = " ".join(chunks_list[idx] for idx in best_first)

    instructions = "Answer the question using ONLY the context below and Explain in detail. If the answer is not in the context, say 'I do not know.'\n\n"
    prompt = instructions + f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    generated = generator(prompt, max_length=200, num_return_sequences=1)
    return generated[0]['generated_text']

In [None]:
# Test with known Question - Day 3 Task 2
# The bare expression displays the string returned by answer_query().
answer_query("What is the name of heaviest cat ever?")

### Generate an Answer (AutoModelForSeq2SeqLM) ⭐

In [None]:
# Integrate Retrieval and Generation (AutoModelForSeq2SeqLM VERSION) - Day 3 Task 1

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Same checkpoint as the pipeline version, loaded here as an explicit model and
# tokenizer pair so that generate_answer() can call them directly.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Same with generator pipeline
def generate_answer(prompt):
    """Generate text for ``prompt`` with the module-level FLAN-T5 model.

    Args:
        prompt: The complete prompt string to feed to the model.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    generated_ids = model.generate(**encoded, max_length=500)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
# Test with known Question - Day 3 Task 2
def answer_query(query):
    """Answer ``query`` from the single best-matching chunk (seq2seq version).

    This definition replaces the earlier pipeline-based ``answer_query``; it
    reads the module-level ``vectors``, ``chunks_list`` and
    ``embedding_model``, and delegates generation to ``generate_answer``.

    Args:
        query: The user's question.

    Returns:
        The answer string produced by ``generate_answer``.
    """
    # retrieve() returns a (chunk_text, score) tuple. Unpack it so that only
    # the chunk text is placed in the prompt, not the tuple's repr with the
    # similarity score attached.
    context, score = retrieve(query, vectors, chunks_list, embedding_model)
    # Refine prompt if needed - Day 3 Task 3
    prompt =  (f"""
                You are a QA assistant.

                Rules:
                - Use the context as the ONLY source of factual information.
                - You may paraphrase and combine details into your own sentences.
                - Do NOT add new facts that are not supported by the context.
                - If the context does not contain the answer, say exactly: "I do not know."

                Task:
                Answer the question in your own words.

                Context:
                {context}

                Question: {query}

                Answer:""") 
    # Logging - Day 3 Task 4
    print(f"Context: {context}")
    print(f"Score: {score:.3f}")
    answer = generate_answer(prompt)
    return answer

In [None]:
# Ad-hoc check with a tersely phrased question; the bare expression displays
# the answer returned by the single-chunk answer_query() defined above.
answer_query("Lightiest cat ever?")

## Day 4: Building an Interactive RAG Application (UI Integration)

### Gradio UI

In [None]:
# Day 4 Task 1-2-3-4-5
import gradio as gr

def rag_system(query):
    """Gradio callback: forward the question to ``answer_query`` (Day 3).

    Args:
        query: Text entered in the UI.

    Returns:
        Whatever ``answer_query`` returns for that text.
    """
    return answer_query(query)

# Single text-in / text-out UI wired to rag_system().
iface = gr.Interface(fn=rag_system, inputs="text", outputs="text", title="RAG QA System", description="Ask a question and get an answer from documents.")
# Start the local Gradio server for this interface.
iface.launch()

## Day 5: Adding Conversational Memory to RAG Assistant

#### Necessary Functions

In [1]:
# Load vectors
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Saved vectors: chunk embeddings persisted in Day 2.
vectors = np.load("./data/embeddings.npy")
# Embedding model: same model name as the one used to embed the chunks, so
# query vectors are comparable with the saved chunk vectors.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Saved chunks: texts expected to be in the same order as the rows of `vectors`.
with open('./data/chunks.json', "r") as f:
    chunks_list = json.load(f)
# Retrieval function
def retrieve(query, vectors, chunks_list, model):
    """Find the single chunk whose embedding is closest to the query.

    Closeness is cosine similarity between the embedded query and each row of
    ``vectors``.

    Args:
        query: The question text.
        vectors: 2-D array of chunk embeddings (norms are taken along axis 1).
        chunks_list: Chunk texts, indexed like the rows of ``vectors``.
        model: Object exposing ``encode(list_of_texts)``.

    Returns:
        ``(chunk_text, similarity)`` for the highest-scoring chunk.
    """
    embedded_query = model.encode([query])[0]
    dot_products = np.dot(vectors, embedded_query)
    row_norms = np.linalg.norm(vectors, axis=1)
    query_norm = np.linalg.norm(embedded_query)
    # Epsilon guards against division by zero for all-zero vectors.
    cosine = dot_products / (row_norms * query_norm + 1e-9)
    winner = int(np.argmax(cosine))
    return chunks_list[winner], cosine[winner]

# Checkpoint used to generate answers (same one as in Day 3).
model_name = "google/flan-t5-base"
# Seq2seq model loaded from Hugging Face; used by generate_answer().
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Matching tokenizer, used to encode prompts and decode generated ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# generate answer fucntion - same with generator pipeline
def generate_answer(prompt):
    """Run the module-level FLAN-T5 ``model`` on ``prompt`` and return its text.

    Args:
        prompt: The complete prompt string.

    Returns:
        The first generated sequence decoded to a string, without special tokens.
    """
    model_inputs = tokenizer(prompt, return_tensors='pt')
    output_ids = model.generate(**model_inputs, max_length=500)
    best_sequence = output_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm


### Gradio UI (with history ⭐)

In [24]:
# System instructions placed at the top of every prompt. Adjacent string
# literals are concatenated verbatim, so each sentence carries its own
# trailing separator.
NEW_SYSTEM_PROMPT = (
    "Answer the user's query using ONLY the CONTEXT and CHAT HISTORY below. "
    "Use CHAT HISTORY to resolve references like (e.g., it , they , them etc.). "
    "If the answer is not in the context or CHAT HISTORY, say 'I do not know'."
)

def build_prompt(context, history, question, max_turns):
    """Assemble the full prompt sent to the generator.

    Args:
        context: Retrieved text to ground the answer in.
        history: List of ``(user_text, assistant_text)`` pairs, oldest first;
            may be empty or ``None``.
        question: The current user question.
        max_turns: Maximum number of most-recent history pairs to include.
            Zero or a negative value includes no history.

    Returns:
        A string with the system instructions followed by the ``[CONTEXT]``,
        ``[CHAT HISTORY]`` and ``[CURRENT QUESTION]`` sections.
    """
    # Keep only the last N turns to avoid prompt bloat. `history[-0:]` would
    # return the whole list, so a non-positive max_turns is handled explicitly.
    recent = history[-max_turns:] if history and max_turns > 0 else []

    # Use the same "User:" / "Assistant:" labels as the current-question
    # section so the model sees one consistent transcript format.
    history_block = "".join(f"User: {q}\n Assistant: {a}\n" for q, a in recent)

    return (
        f"{NEW_SYSTEM_PROMPT}\n\n"
        f"[CONTEXT]\n{context}\n\n"
        f"[CHAT HISTORY]\n{history_block if history_block else '(none)'}\n\n"
        f"[CURRENT QUESTION]\nUser: {question}\n Assistant:"
    )

In [25]:
import gradio as gr
import re

# Module-level scratchpad holding the most recent prompt and retrieval query;
# written by answer_query_with_history() and read by the debug UI.
LAST_DEBUG = {"prompt": "", "retrieval_query": ""}

def make_retrieval_query(question: str, history: list[tuple[str, str]]) -> str:
    """Build the text used for retrieval from the question and chat history.

    With no history the question is returned unchanged. Otherwise the previous
    user question is prepended, so a follow-up such as "and how heavy is it?"
    still carries the topic of the turn before it.

    Args:
        question: The current user question.
        history: ``(user_text, assistant_text)`` pairs, oldest first.

    Returns:
        The question itself, or ``"<previous question>\\nFollow-up: <question>"``.
    """
    if history:
        previous_question, _previous_answer = history[-1]
        return f"{previous_question}\nFollow-up: {question}"
    return question

def answer_query_with_history(question, history):
    """Answer ``question`` using retrieval plus the recent chat history.

    Reads the module-level ``vectors``, ``chunks_list`` and
    ``embedding_model``, and records the prompt and retrieval query in
    ``LAST_DEBUG`` for the debug UI.

    Args:
        question: The current user question.
        history: ``(user_text, assistant_text)`` pairs, oldest first.

    Returns:
        The answer string produced by ``generate_answer``.
    """
    retrieval_query = make_retrieval_query(question, history)
    # Retrieve with the history-aware query (not the bare question) so that
    # follow-ups keep their topic and the debug panel shows what was actually
    # searched. retrieve() returns (chunk_text, score); only the text belongs
    # in the prompt.
    context, _score = retrieve(
        query=retrieval_query,
        vectors=vectors,
        chunks_list=chunks_list,
        model=embedding_model,
    )
    prompt = build_prompt(context=context, history=history, question=question, max_turns=3)

    # store for UI debugging
    LAST_DEBUG["prompt"] = prompt
    LAST_DEBUG["retrieval_query"] = retrieval_query

    out = generate_answer(prompt)
    return out


def _content_to_text(content):
    '''
    Gradio 6+ uses OpenAI-style structured content blocks.
    Older versions often use plain strings.
    '''
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # list of blocks, e.g., [{"type":"text", "text":"hi"}]
        parts = []
        for block in content:
            if isinstance(block, dict) and block.get("type") == "text":
                parts.append(block.get("text", ""))
            elif isinstance(block, str):
                parts.append(block)
        return "".join(parts)
    if isinstance(content, dict) and "text" in content:
        return content ["text"]
    return str(content)


def normalize_gradio_history(history):
    """Convert Gradio chat history into ``(user_text, assistant_text)`` pairs.

    Two layouts are recognised, decided by looking at the first element:
    - pair style: ``[[user, bot], ...]``
    - messages style: ``[{"role": "user", "content": ...},
      {"role": "assistant", "content": ...}, ...]``

    In messages style a pair is produced each time an assistant message
    follows a user message; a user message that is replaced by a newer one
    before any assistant reply is dropped.

    Returns:
        A list of ``(user_text, assistant_text)`` tuples, or an empty list for
        empty or unrecognised input.
    """
    if not history or not isinstance(history, list):
        return []

    first = history[0]

    # Pair format
    if isinstance(first, (list, tuple)) and len(first) == 2:
        return [
            (_content_to_text(user_part), _content_to_text(bot_part))
            for user_part, bot_part in history
        ]

    # Messages format
    if isinstance(first, dict) and "role" in first:
        turns = []
        awaiting_reply = None  # text of the latest user message not yet answered
        for message in history:
            speaker = message.get("role")
            body = _content_to_text(message.get("content"))
            if speaker == "user":
                awaiting_reply = body
            elif speaker == "assistant" and awaiting_reply is not None:
                turns.append((awaiting_reply, body))
                awaiting_reply = None
        return turns

    # Unrecognised layout
    return []

def chatbot_fn(message, history):
    """ChatInterface callback: answer ``message`` in light of prior turns.

    The history supplied by Gradio is first normalised to
    ``(user_text, assistant_text)`` pairs, then passed with the message to
    ``answer_query_with_history``.
    """
    return answer_query_with_history(message, normalize_gradio_history(history))



In [None]:
# Minimal chat UI: Gradio manages the transcript and calls chatbot_fn() with
# the new message and the history so far.
demo = gr.ChatInterface(
    fn=chatbot_fn,
    title="Day 5 RAG + Chat History Test",
)

# Start the local Gradio server for this interface.
demo.launch()

In [None]:
# Chat UI with a collapsible debug panel showing the retrieval query and the
# final prompt. Rebinds `demo`, which the previous cell bound to a ChatInterface.
with gr.Blocks() as demo:
    gr.Markdown("## Day 5 RAG + Chat History (with Debug)")
    # NOTE(review): respond() appends {"role", "content"} dicts, so this
    # assumes the installed Gradio's Chatbot uses the messages format — confirm.
    chat = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask something…")
    clear = gr.Button("Clear")

    with gr.Accordion("Debug (what the model sees)", open=False):
        dbg_retrieval = gr.Textbox(label="Retrieval Query", lines=2)
        dbg_prompt = gr.Textbox(label="Final Prompt", lines=18)

    def respond(message, history):
        """Handle one submitted message.

        Returns four values, matching the outputs wired to msg.submit: the
        extended history, an empty string for the input box, and the
        retrieval query and prompt recorded in LAST_DEBUG.
        """
        # Convert whatever layout Gradio passed in into (user, assistant) pairs.
        history_pairs = normalize_gradio_history(history)  # your helper: -> list[(user, assistant)]
        answer = answer_query_with_history(message, history_pairs)

        # Build a new list rather than mutating the one Gradio passed in.
        history = history or []
        history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": answer},
        ]

        return history, "", LAST_DEBUG.get("retrieval_query",""), LAST_DEBUG.get("prompt","")

    # Submitting the textbox runs respond(); Clear resets all four components.
    msg.submit(respond, [msg, chat], [chat, msg, dbg_retrieval, dbg_prompt])
    clear.click(lambda: ([], "", "", ""), None, [chat, msg, dbg_retrieval, dbg_prompt])

demo.launch()

## Day 6 - Deploying the RAG system (Local to Cloud)