<a href="https://colab.research.google.com/github/AXB2024/RAG-Pipline-Project/blob/main/RAG_PIPELINE_FINAL_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter; -q keeps the install log short).
# NOTE(review): versions are unpinned — pin them once a known-good set is confirmed.
%pip install -q llama-index llama-index-embeddings-huggingface transformers accelerate sentence-transformers faiss-cpu llama-cpp-python unstructured PyMuPDF

In [None]:
import os
import fitz  # PyMuPDF
import time
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama


In [None]:
# Fetch the quantized Mistral weights into /content, keeping the file name from the URL.
# -c resumes a partial download and skips the transfer when the file is already complete,
# so re-running this cell does not pull the whole model again.
!wget -c -P /content https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf

In [None]:
# STEP 1: Mount / Create Document Folder
# -p makes the cell safe to re-run: no error if the folder already exists.
!mkdir -p documents

In [None]:
# Disabled on purpose: uncomment to upload files from your machine through the
# Colab file picker. Kept as comments (not a string literal) so the cell
# produces no output when run.
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Disabled on purpose: uncomment (together with the upload cell that defines
# `uploaded`) to move every uploaded file into the documents/ folder.
# import shutil
#
# for filename in uploaded.keys():
#     shutil.move(filename, f'documents/{filename}')

In [None]:
# STEP 2: Extract Text from PDFs
def extract_text_from_pdfs(folder="/content/documents"):
    """Read every PDF directly inside ``folder`` and return its plain text.

    Args:
        folder: Directory to scan (not recursive). Defaults to the Colab path
            used by the rest of this notebook.

    Returns:
        dict mapping each PDF's file name to the concatenated text of all of
        its pages. Entries are inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``folder`` does not exist (from ``os.listdir``).
    """
    docs = {}
    # os.listdir returns names in arbitrary order; sorting keeps the chunk
    # order (and therefore downstream results) stable between runs.
    for fname in sorted(os.listdir(folder)):
        # Compare the extension case-insensitively so "SCAN.PDF" is not skipped.
        if not fname.lower().endswith(".pdf"):
            continue
        with fitz.open(os.path.join(folder, fname)) as doc:
            # join() avoids rebuilding the string once per page.
            docs[fname] = "".join(page.get_text() for page in doc)
    return docs

In [None]:
# STEP 3: RAG Components
# One evaluation question per sample PDF, keyed by the PDF's file name.
queries = {
    "appraisal.pdf": "What is the estimated home value?",
    "sample_bank_statement.pdf": "How much was the last transaction?",
    "payslip_sample_image.pdf": "What is the total net salary for this month?",
    "sample_contract.pdf": "What are the penalties for late payments?",
    "LenderFeesWorksheetNew.pdf": "What is the total estimated monthly payment?",
}

In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document

def embed_documents(docs, embedder, chunk_size=300, chunk_overlap=30, use_semantic=True):
    """Split documents into passages and embed every passage.

    Args:
        docs: Mapping of document name -> full document text.
        embedder: Object exposing ``encode(list_of_str, convert_to_tensor=True)``
            whose result supports ``.cpu().numpy()`` (the notebook passes a
            SentenceTransformer).
        chunk_size: Fixed chunking: passage length in characters. Semantic
            chunking: forwarded to the splitter as-is.
        chunk_overlap: Fixed chunking: number of characters shared by
            consecutive passages (``0 <= chunk_overlap < chunk_size``).
            Semantic chunking: forwarded to the splitter as-is.
        use_semantic: True -> split with SemanticSplitterNodeParser;
            False -> fixed-width character windows.

    Returns:
        ``(passages, doc_map, embeddings)`` where ``passages`` is a list of
        str, ``doc_map[i]`` is the name of the document ``passages[i]`` came
        from, and ``embeddings`` is ``embedder.encode(...).cpu().numpy()``.

    Raises:
        ValueError: fixed chunking with ``chunk_size <= 0`` or a
            ``chunk_overlap`` outside ``[0, chunk_size)``.
    """
    if use_semantic:
        print(f"\nüîß Semantic Chunking | Size: {chunk_size} | Overlap: {chunk_overlap}")

        # The splitter always uses this MiniLM model to find breakpoints,
        # independent of the `embedder` used for the final passage vectors.
        embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
        # NOTE(review): chunk_size / chunk_overlap are passed through unchanged;
        # they do not look like SemanticSplitterNodeParser options (its tuning
        # knobs are buffer_size / breakpoint_percentile_threshold) — confirm
        # they actually influence the split before comparing chunk settings.
        splitter = SemanticSplitterNodeParser(
            embed_model=embed_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        raw_documents = [Document(text=content, metadata={"name": name}) for name, content in docs.items()]
        nodes = splitter.get_nodes_from_documents(raw_documents)

        passages = [node.text for node in nodes]
        doc_map = [node.metadata["name"] for node in nodes]
    else:
        # Fallback: fixed-width character windows that honour chunk_overlap.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} for chunk_size={chunk_size}"
            )
        passages = []
        doc_map = []
        for name, text in docs.items():
            for chunk in _fixed_chunks(text, chunk_size, chunk_overlap):
                passages.append(chunk)
                doc_map.append(name)

    embeddings = embedder.encode(passages, convert_to_tensor=True).cpu().numpy()
    if use_semantic:
        print(f"‚úÖ Total Chunks Created: {len(passages)}")
    return passages, doc_map, embeddings


def _fixed_chunks(text, chunk_size, chunk_overlap):
    """Yield ``chunk_size``-character windows of ``text`` that overlap by ``chunk_overlap``."""
    step = chunk_size - chunk_overlap
    start = 0
    while start < len(text):
        yield text[start:start + chunk_size]
        # Stop once a window reaches the end; a further window would only
        # repeat text that the previous one already covers.
        if start + chunk_size >= len(text):
            break
        start += step


In [None]:
import numpy as np

def search(query, embedder, passages, embeddings):
    """Return the single passage whose embedding is closest (L2) to the query.

    Args:
        query: Question text to embed and look up.
        embedder: Object whose ``encode([query])`` returns one vector per input.
        passages: Sequence of passage strings, aligned row-for-row with
            ``embeddings``.
        embeddings: 2-D array of passage vectors (one row per passage).

    Returns:
        The passage at the index FAISS reports as the nearest neighbour.

    Raises:
        ValueError: if there are no passages to search or ``embeddings`` is
            not a 2-D array.
    """
    if len(passages) == 0:
        raise ValueError("search() needs at least one passage; no text chunks were provided.")

    # float32 and C-contiguous is the layout the FAISS index works with.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    if vectors.ndim != 2:
        raise ValueError(f"embeddings must be a 2-D array, got shape {vectors.shape}")

    query_vec = np.asarray(embedder.encode([query])[0], dtype="float32").reshape(1, -1)

    # The index is rebuilt on every call; fine for a handful of queries.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    _, nearest = index.search(query_vec, 1)  # distances are not needed
    return passages[nearest[0][0]]

In [None]:
def load_model(name, model_type):
    """Load a text generator for one of the supported backends.

    Args:
        name: Hugging Face model id (``"transformers"``) or path to a GGUF
            file (``"llama-cpp"``).
        model_type: ``"transformers"`` or ``"llama-cpp"``.

    Returns:
        * ``"transformers"``: a function ``prompt -> str`` that returns only
          the newly generated text (up to 128 new tokens, sampled).
        * ``"llama-cpp"``: the ``Llama`` instance itself.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    if model_type == "transformers":
        tokenizer = AutoTokenizer.from_pretrained(name)
        # Half precision only when a GPU is available.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForCausalLM.from_pretrained(name, device_map="auto", torch_dtype=dtype)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

        def generate(prompt):
            # return_full_text=False: by default the pipeline prepends the
            # prompt (context + question) to generated_text, which would end
            # up recorded as part of the "answer".
            outputs = pipe(prompt, max_new_tokens=128, do_sample=True, return_full_text=False)
            return outputs[0]["generated_text"]

        return generate

    if model_type == "llama-cpp":
        return Llama(model_path=name, n_ctx=2048, n_threads=4)

    # Previously fell through and returned None, failing later at call time.
    raise ValueError(f"Unsupported model_type {model_type!r}; expected 'transformers' or 'llama-cpp'.")

In [None]:
def generate_answer(model, query, context, model_type, max_tokens=128):
    """Ask ``model`` to answer ``query`` using ``context``.

    Args:
        model: For ``"llama-cpp"``, a callable returning a completion dict
            with ``["choices"][0]["text"]``; otherwise a callable
            ``prompt -> str``.
        query: The question to answer.
        context: Retrieved passage placed in the prompt.
        model_type: ``"llama-cpp"`` selects the completion-dict handling; any
            other value treats ``model`` as a plain ``prompt -> str`` callable.
        max_tokens: Generation cap passed to the llama-cpp model so it is not
            limited by the library's own (much smaller) default; the default
            of 128 mirrors the cap used for the transformers pipeline. Ignored
            for other model types.

    Returns:
        The model's answer text (whitespace-stripped for llama-cpp).
    """
    prompt = f"Answer this question based on the context:\nContext: {context}\nQuestion: {query}"
    if model_type == "llama-cpp":
        completion = model(prompt, max_tokens=max_tokens)
        return completion["choices"][0]["text"].strip()
    return model(prompt)

In [None]:
# STEP 4: Run RAG
import pandas as pd
results = []  # one row per (model, chunk config, query), shared by every run_rag call


def run_rag(model_name, model_type, embedder_name="all-MiniLM-L6-v2", chunk_size=300, chunk_overlap=30):
    """Run every query in ``queries`` through retrieval + generation and log the outcome.

    Extracts the PDFs, chunks and embeds them, loads the requested model, then
    for each ``(document, query)`` pair in the global ``queries`` retrieves the
    closest passage, generates an answer, prints a short report and appends a
    row to the global ``results`` list.

    Args:
        model_name: Model id or path handed to ``load_model``.
        model_type: Backend selector handed to ``load_model`` / ``generate_answer``.
        embedder_name: SentenceTransformer model used for passages and queries.
        chunk_size: Forwarded to ``embed_documents``.
        chunk_overlap: Forwarded to ``embed_documents``.

    Returns:
        None. Rows are appended to the module-level ``results`` list.

    Raises:
        ValueError: if no passages could be produced (e.g. no PDFs present),
            raised before the model is loaded.
    """
    print(f"\nüîç Running RAG with model: {model_name}")
    embedder = SentenceTransformer(embedder_name)
    documents = extract_text_from_pdfs()

    passages, doc_map, embeddings = embed_documents(
        documents,
        embedder,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        use_semantic=True
    )
    # Fail early with a clear message instead of loading a multi-GB model and
    # then crashing inside the vector search.
    if not passages:
        raise ValueError(
            "No text chunks were produced; add PDFs with extractable text to the "
            "documents folder before calling run_rag()."
        )

    model = load_model(model_name, model_type)

    # NOTE(review): retrieval searches the chunks of ALL documents; `doc` is
    # only a label for the query and `doc_map` is not consulted, so the
    # retrieved passage may come from a different PDF — confirm this is intended.
    for doc, query in queries.items():
        print(f"\nüìÑ Document: {doc}")
        print(f"‚ùì Query: {query}")
        # perf_counter is the monotonic clock meant for measuring intervals.
        start = time.perf_counter()
        relevant = search(query, embedder, passages, embeddings)
        answer = generate_answer(model, query, relevant, model_type).strip()
        elapsed = round(time.perf_counter() - start, 2)
        print(f"üìå Retrieved: {relevant[:80]}...")
        print(f"üí¨ Answer: {answer}")
        print(f"‚ö° Speed: {elapsed}s")

        # Append result to global list
        results.append({
            "Model": model_name,
            "Chunk Size": chunk_size,
            "Chunk Overlap": chunk_overlap,
            "Document": doc,
            "Query": query,
            "Retrieved Context": relevant[:80],
            "Answer": answer,
            "Time (s)": elapsed
        })


In [None]:
# Start from an empty results list so re-running the sweep does not duplicate rows.
results = []

# phi-2 sweep over (chunk_size, chunk_overlap):
# small / no overlap, medium / small overlap, large / large overlap.
for size, overlap in [(100, 0), (300, 30), (500, 100)]:
    run_rag("microsoft/phi-2", "transformers", chunk_size=size, chunk_overlap=overlap)


In [None]:
# TinyLlama sweep over the same (chunk_size, chunk_overlap) settings.
for size, overlap in [(100, 0), (300, 30), (500, 100)]:
    run_rag("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "transformers", chunk_size=size, chunk_overlap=overlap)

In [None]:
# Mistral (GGUF via llama-cpp) sweep over the same (chunk_size, chunk_overlap) settings.
for size, overlap in [(100, 0), (300, 30), (500, 100)]:
    run_rag("/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf", "llama-cpp", chunk_size=size, chunk_overlap=overlap)

In [None]:
# Disabled on purpose: configuration for a single combined sweep over every
# model and chunk setting. Kept as comments (not a string literal) so the cell
# produces no output when run.
#
# # All model setups
# model_configs = [
#     {"name": "microsoft/phi-2", "type": "transformers"},
#     {"name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "type": "transformers"},
#     {"name": "/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf", "type": "llama-cpp"},
# ]
#
# # Chunk sizes and overlaps to test
# chunk_configs = [
#     {"chunk_size": 100, "chunk_overlap": 0},
#     {"chunk_size": 300, "chunk_overlap": 30},
#     {"chunk_size": 500, "chunk_overlap": 100},
# ]

In [None]:
# Disabled on purpose: runs every model_config x chunk_config combination in
# one go (needs `model_configs` and `chunk_configs` to be defined first).
# Kept as comments (not a string literal) so the cell produces no output.
#
# results = []
#
# for model_config in model_configs:
#     for chunk_config in chunk_configs:
#         run_rag(
#             model_name=model_config["name"],
#             model_type=model_config["type"],
#             chunk_size=chunk_config["chunk_size"],
#             chunk_overlap=chunk_config["chunk_overlap"]
#         )

In [None]:
# Gather every recorded run into a single table: one row per entry in `results`.
df_results = pd.DataFrame(data=results)
# Bare last expression so the notebook renders the table.
df_results

In [None]:
# Colab-only: hand the results table to google.colab's InteractiveSheet.
# NOTE(review): presumably this opens the frame as an editable sheet widget —
# confirm against the google.colab.sheets documentation.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=df_results)

In [None]:
# Save to CSV in the current working directory
# (index=False leaves the DataFrame's row index out of the file)
df_results.to_csv("rag_results.csv", index=False)

# Download to your local machine
# (uses the google.colab helper, so this step only works inside Colab)
from google.colab import files
files.download("rag_results.csv")