In [None]:
!pip install transformers accelerate sentence-transformers faiss-cpu llama-cpp-python unstructured PyMuPDF

In [None]:
import os
import fitz  # PyMuPDF
import time
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama


In [None]:
# STEP 1: Mount / Create Document Folder
# Make sure the folder the PDFs are read from exists; a no-op when it is already there.
os.makedirs(name="/content/documents", exist_ok=True)

In [None]:
# STEP 2: Extract Text from PDFs
def extract_text_from_pdfs(folder="/content/documents"):
    """Read every PDF file in ``folder`` and return its text.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        dict mapping each PDF file name to the concatenated text of all its
        pages, in sorted file-name order.

    Raises:
        FileNotFoundError: if ``folder`` does not exist (from ``os.listdir``).
    """
    docs = {}
    # Sort so that the document (and therefore passage) order is reproducible;
    # os.listdir returns entries in arbitrary order.
    for fname in sorted(os.listdir(folder)):
        path = os.path.join(folder, fname)
        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a .pdf suffix instead of handing them to the PDF parser.
        if not fname.lower().endswith(".pdf") or not os.path.isfile(path):
            continue
        with fitz.open(path) as doc:
            docs[fname] = "".join(page.get_text() for page in doc)
    return docs

In [None]:
# STEP 3: RAG Components
# Question to ask for each PDF, keyed by file name; run_rag iterates over these pairs.
queries = dict([
    ("appraisal.pdf", "What is the estimated home value?"),
    ("sample_bank_statement.pdf", "How much was the last transaction?"),
    ("payslip_sample_image.pdf", "What is the total net salary for this month?"),
])


In [None]:
def embed_documents(docs, embedder, chunk_size=300):
    """Split each document into fixed-size character chunks and embed them.

    Args:
        docs: dict mapping document name to its full text.
        embedder: object whose ``encode(passages, convert_to_tensor=True)``
            returns a tensor-like value supporting ``.cpu().numpy()``.
        chunk_size: number of characters per chunk (default 300).

    Returns:
        ``(passages, doc_map, embeddings)`` where ``passages[i]`` is a text
        chunk, ``doc_map[i]`` is the name of the document it came from, and
        ``embeddings`` is whatever ``embedder.encode(...).cpu().numpy()``
        produced for ``passages``.

    Raises:
        ValueError: if ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    passages = []
    doc_map = []
    for name, text in docs.items():
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            # Whitespace-only chunks carry no information but would still be
            # embedded and could be returned as the "best" context.
            if not chunk.strip():
                continue
            passages.append(chunk)
            doc_map.append(name)
    embeddings = embedder.encode(passages, convert_to_tensor=True).cpu().numpy()
    return passages, doc_map, embeddings

In [None]:
import numpy as np

def search(query, embedder, passages, embeddings):
    """Return the passage whose embedding is closest (L2) to the query.

    Args:
        query: question text to embed.
        embedder: object whose ``encode([query])`` returns one vector per input.
        passages: sequence of text chunks.
        embeddings: 2-D array-like with one row per entry of ``passages``.

    Returns:
        The single nearest passage from ``passages``.

    Raises:
        ValueError: if there are no passages, or ``embeddings`` is not a 2-D
            array with exactly one row per passage.
    """
    if len(passages) == 0:
        raise ValueError("Cannot search: no passages were provided.")

    # FAISS requires C-contiguous float32 input.
    matrix = np.ascontiguousarray(embeddings, dtype="float32")
    if matrix.ndim != 2 or matrix.shape[0] != len(passages):
        raise ValueError(
            f"embeddings must be 2-D with one row per passage; "
            f"got shape {matrix.shape} for {len(passages)} passages."
        )

    query_vec = np.asarray(embedder.encode([query])[0], dtype="float32").reshape(1, -1)

    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    _, nearest = index.search(query_vec, 1)
    return passages[int(nearest[0][0])]


In [None]:
def load_model(name, model_type):
    """Load a text-generation backend.

    Args:
        name: Hugging Face model id (for ``"transformers"``) or path to a
            model file (for ``"llama-cpp"``).
        model_type: ``"transformers"`` or ``"llama-cpp"``.

    Returns:
        For ``"transformers"``: a callable ``prompt -> generated text``.
        For ``"llama-cpp"``: the ``Llama`` instance itself.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    if model_type == "transformers":
        tokenizer = AutoTokenizer.from_pretrained(name)
        # Half precision only when a GPU is available; float32 otherwise.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForCausalLM.from_pretrained(name, device_map="auto", torch_dtype=dtype)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

        def generate(prompt):
            # return_full_text=False: return only the completion; by default the
            # pipeline prepends the prompt to the generated text.
            outputs = pipe(prompt, max_new_tokens=128, do_sample=True, return_full_text=False)
            return outputs[0]["generated_text"]

        return generate

    if model_type == "llama-cpp":
        return Llama(model_path=name, n_ctx=2048, n_threads=4)

    # Previously an unknown type fell through and returned None, which only
    # failed later when the "model" was called.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'transformers' or 'llama-cpp'."
    )

In [None]:
def generate_answer(model, query, context, model_type, max_tokens=128):
    """Build the RAG prompt and run it through ``model``.

    Args:
        model: for ``"llama-cpp"``, a callable accepting ``(prompt, max_tokens=...)``
            and returning a dict with ``["choices"][0]["text"]``; otherwise a
            callable ``prompt -> text``.
        query: the user question.
        context: retrieved passage to ground the answer.
        model_type: ``"llama-cpp"`` selects the llama-cpp calling convention;
            any other value calls ``model(prompt)`` directly.
        max_tokens: completion token limit passed to the llama-cpp model
            (default 128). Not used for other model types.

    Returns:
        The generated text (whitespace-stripped for llama-cpp).
    """
    prompt = f"Answer this question based on the context:\nContext: {context}\nQuestion: {query}"
    if model_type == "llama-cpp":
        # Pass the limit explicitly: llama-cpp-python's documented default is
        # only 16 completion tokens, which cuts answers short.
        completion = model(prompt, max_tokens=max_tokens)
        return completion["choices"][0]["text"].strip()
    return model(prompt)

In [None]:
# STEP 4: Run RAG
def run_rag(model_name, model_type, embedder_name="all-MiniLM-L6-v2"):
    """Run every configured query through retrieval + generation and print results.

    For each ``(document, question)`` pair in the module-level ``queries`` dict,
    retrieves the closest passage *from that document* and asks the model to
    answer from it, printing the retrieved snippet, the answer and the elapsed
    time.

    Args:
        model_name: model id or path forwarded to ``load_model``.
        model_type: backend selector forwarded to ``load_model`` and
            ``generate_answer``.
        embedder_name: SentenceTransformer model used for both passages and
            queries.
    """
    print(f"\n🔍 Running RAG with model: {model_name}")
    embedder = SentenceTransformer(embedder_name)
    documents = extract_text_from_pdfs()
    passages, doc_map, embeddings = embed_documents(documents, embedder)
    model = load_model(model_name, model_type)

    for doc, query in queries.items():
        print(f"\n📄 Document: {doc}")
        print(f"❓ Query: {query}")

        # Each question targets one specific document, so restrict retrieval to
        # that document's chunks instead of searching the whole corpus.
        rows = [i for i, owner in enumerate(doc_map) if owner == doc]
        if not rows:
            # Missing file, or a PDF with no extractable text (e.g. a scan).
            print(f"⚠️ No extractable text found for {doc}; skipping.")
            continue
        doc_passages = [passages[i] for i in rows]
        doc_embeddings = embeddings[rows]

        start = time.perf_counter()
        relevant = search(query, embedder, doc_passages, doc_embeddings)
        answer = generate_answer(model, query, relevant, model_type)
        end = time.perf_counter()
        print(f"📌 Retrieved: {relevant[:80]}...")
        print(f"💬 Answer: {answer.strip()}")
        print(f"⚡ Speed: {round(end - start, 2)}s")

In [None]:
# 🔁 Phi-2 (loaded through Hugging Face transformers)
run_rag(model_name="microsoft/phi-2", model_type="transformers")

In [None]:
# STEP 5: Run All Models — Phi-2, TinyLlama, Mistral (GGUF)
# 🔁 TinyLlama (loaded through Hugging Face transformers)
run_rag(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", model_type="transformers")

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {"/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf"}


In [None]:
# 🔁 Mistral (GGUF file, loaded through llama-cpp)
run_rag(model_name="/content/mistral-7b-instruct-v0.2.Q4_K_M.gguf", model_type="llama-cpp")