In [None]:
# Install everything needed for:
# - embeddings (sentence-transformers)
# - vector search (faiss)
# - reading PDFs (pypdf)
# - running local LLM (transformers)
!pip -q install transformers sentence-transformers faiss-cpu pypdf pydantic accelerate bitsandbytes sentencepiece


In [None]:
# Core utilities
import os
import json
import re
import numpy as np
from typing import List, Dict, Tuple


In [None]:
# Make sure the documents folder is present before anything reads from it
os.makedirs("documents", exist_ok=True)

# OPTIONAL but recommended:
# Fallback policy texts so the RAG pipeline has something to index before any upload.
# Swap them for real PDFs/TXT files later.
sample_docs = {
    "payments_policy.txt": """
If a transfer fails but money is deducted, the customer must provide the transaction ID.
Support will investigate within 24 hours and update the customer.
If transaction ID is missing, request it before escalation.
""",
    "refund_policy.txt": """
Refunds are processed within 5 business days after verification.
Duplicate charges require an order ID and payment reference.
If the user was charged twice, create a refund ticket and attach the order ID.
""",
    "security_policy.txt": """
If an account is suspected to be hacked or fraud is reported:
- Escalate to the security team immediately.
- Mark the case as high priority.
- Require human confirmation before any account action.
"""
}

# Seed the folder only when it is empty; a folder that already has files is left alone
existing_files = os.listdir("documents")
if existing_files:
    print("ℹ️ documents folder already has files:", existing_files)
else:
    for fname, content in sample_docs.items():
        target_path = os.path.join("documents", fname)
        with open(target_path, "w", encoding="utf-8") as f:
            f.write(content.strip())
    print("✅ Sample documents created in /documents")


ℹ️ documents folder already has files: ['security_policy.txt', 'payments_policy.txt', 'refund_policy.txt']


In [None]:
from pypdf import PdfReader

def load_documents(folder: str = "documents") -> Tuple[List[str], List[str]]:
    """
    Loads .txt and .pdf files found directly inside `folder` (not recursive).

    Files are visited in sorted filename order so the returned lists -- and any
    chunk ids / vector index built from them -- are identical on every run;
    os.listdir() on its own returns entries in arbitrary order.

    Args:
      folder: directory to scan.

    Returns:
      texts: list of document/page text (one entry per non-empty .txt file or PDF page)
      sources: parallel list of source identifiers (filename or filename#pageX)

    IMPORTANT:
    - PDF extract_text() sometimes returns None. We ignore empty/None pages.
    - Entries that are not regular files (e.g. sub-folders) are skipped.
    """
    texts = []
    sources = []

    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)

        # A directory named like "notes.txt" would otherwise crash open()
        if not os.path.isfile(path):
            continue

        name_lower = file.lower()

        if name_lower.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                content = f.read().strip()
            if content:
                texts.append(content)
                sources.append(file)

        elif name_lower.endswith(".pdf"):
            reader = PdfReader(path)
            # Page numbers in the source id are 1-based
            for page_number, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    texts.append(page_text.strip())
                    sources.append(f"{file}#page{page_number}")

    return texts, sources

# Read everything in the documents folder and report what was found
documents, sources = load_documents("documents")
example_source = sources[0] if sources else "NO SOURCES"
print("✅ Loaded docs/pages:", len(documents))
print("✅ Example source:", example_source)


✅ Loaded docs/pages: 3
✅ Example source: security_policy.txt


In [None]:
def chunk_text(text: str, chunk_size_words: int = 180, overlap_words: int = 40) -> List[str]:
    """
    Split `text` into overlapping windows of whitespace-separated words.

    Why chunking:
    - Vector search works better with smaller chunks.
    - Overlap helps avoid losing context across boundaries.

    Args:
      text: raw document text; words are obtained with str.split().
      chunk_size_words: maximum number of words per chunk (must be > 0).
      overlap_words: number of words shared by consecutive chunks
        (must satisfy 0 <= overlap_words < chunk_size_words).

    Returns:
      List of chunks, each a single-space-joined run of words. Empty for
      empty/whitespace-only text. The final chunk ends at the last word; no
      extra chunk is emitted for a tail already fully covered by the overlap.

    Raises:
      ValueError: if the size/overlap combination would not advance through
        the text (previously this caused an infinite loop).
    """
    if chunk_size_words <= 0:
        raise ValueError("chunk_size_words must be a positive integer")
    if not 0 <= overlap_words < chunk_size_words:
        raise ValueError("overlap_words must satisfy 0 <= overlap_words < chunk_size_words")

    words = text.split()
    step = chunk_size_words - overlap_words  # how far each window advances
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size_words
        chunks.append(" ".join(words[start:end]))
        # This window already reached the last word: anything further would
        # only repeat words that are inside this chunk's overlap region.
        if end >= len(words):
            break

    return chunks

# Flatten every document into chunks; chunk_sources[i] names the file/page chunks[i] came from
chunks = []
chunk_sources = []

for doc, src in zip(documents, sources):
    doc_chunks = chunk_text(doc)
    chunks += doc_chunks
    chunk_sources += [src for _ in doc_chunks]

sample_preview = chunks[0][:300] if chunks else "NO CHUNKS"
print("✅ Total chunks:", len(chunks))
print("✅ Sample chunk:\n", sample_preview)

# Guard: fail here rather than later in the embedding/index cells (common error)
if not chunks:
    raise ValueError("No chunks created. Upload .txt/.pdf into /documents or check PDF extraction.")


✅ Total chunks: 3
✅ Sample chunk:
 If an account is suspected to be hacked or fraud is reported: - Escalate to the security team immediately. - Mark the case as high priority. - Require human confirmation before any account action.


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model: fast + good enough for portfolio RAG
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk as a unit-length vector, so an inner product between two
# embeddings equals their cosine similarity
embeddings = embedder.encode(
    chunks,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

embeddings_shape = embeddings.shape
print("✅ Embeddings shape:", embeddings_shape)


✅ Embeddings shape: (3, 384)


In [None]:
import faiss

# Build the FAISS index used for nearest-neighbour lookup.
# IndexFlatIP scores by inner product, which suits the normalized chunk embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

total_vectors = index.ntotal
print("✅ FAISS index ready. Total vectors:", total_vectors)


✅ FAISS index ready. Total vectors: 3


In [None]:
def retrieve(query: str, k: int = 3) -> List[Dict]:
    """
    Converts query to embedding, searches FAISS, returns top-k chunks with sources.

    Args:
      query: natural-language search text.
      k: maximum number of chunks to return. Capped at the number of vectors
         in the index; k <= 0 returns an empty list.

    Returns:
      List of dicts, one per hit, in the order returned by the index search:
        "text":   the chunk text
        "source": the chunk's source identifier
        "score":  the similarity score reported by the index, as a float
    """
    # FAISS pads missing neighbours with index -1 when k exceeds the number of
    # stored vectors; chunks[-1] would then silently return the LAST chunk as a
    # bogus hit. Cap k and skip any negative index defensively.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, idxs = index.search(q_emb, k)

    results = []
    for score, idx in zip(scores[0], idxs[0]):
        idx = int(idx)
        if idx < 0:
            continue
        results.append({
            "text": chunks[idx],
            "source": chunk_sources[idx],
            "score": float(score)
        })
    return results

# Smoke test: run one query through the retriever and display the hits
test_retrieval = retrieve(query="transfer failed money deducted", k=3)
test_retrieval


[{'text': 'If a transfer fails but money is deducted, the customer must provide the transaction ID. Support will investigate within 24 hours and update the customer. If transaction ID is missing, request it before escalation.',
  'source': 'payments_policy.txt',
  'score': 0.6410048007965088},
 {'text': 'Refunds are processed within 5 business days after verification. Duplicate charges require an order ID and payment reference. If the user was charged twice, create a refund ticket and attach the order ID.',
  'source': 'refund_policy.txt',
  'score': 0.33896416425704956},
 {'text': 'If an account is suspected to be hacked or fraud is reported: - Escalate to the security team immediately. - Mark the case as high priority. - Require human confirmation before any account action.',
  'source': 'security_policy.txt',
  'score': 0.2704086899757385}]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# Instruction model that can follow "ONLY use context" style prompts
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 4-bit quantization is requested through quantization_config; passing
# load_in_4bit directly to from_pretrained() is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)

# Greedy decoding (do_sample=False) gives stable answers. No temperature is
# passed because it only applies when sampling is enabled.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    max_new_tokens=300
)

print("✅ LLM loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [None]:
def build_rag_prompt(question: str, retrieved_chunks: List[Dict]) -> str:
    """
    Assemble the grounded-answer prompt sent to the LLM.

    Each retrieved chunk is rendered as "[Source: <source>] <text>" and the
    chunks are separated by blank lines. The instructions tell the model to
    answer ONLY from that context and to reply with the fixed
    "Information not found..." sentence otherwise.
    """
    context_blocks = []
    for item in retrieved_chunks:
        context_blocks.append(f"[Source: {item['source']}] {item['text']}")
    context = "\n\n".join(context_blocks)

    prompt_lines = [
        "You are a helpful assistant for enterprise support.",
        "",
        "You MUST answer using ONLY the Context below.",
        '- If the answer is not in the Context, reply exactly: "Information not found in provided documents."',
        "- Do not invent policies.",
        "- Keep the answer short and actionable.",
        "",
        "Context:",
        context,
        "",
        "Question:",
        f"{question}",
        "",
        "Answer:",
    ]
    return "\n".join(prompt_lines)


In [None]:
def rag_answer(question: str, k: int = 3) -> Dict:
    """
    Full RAG pipeline:
    1) Retrieve top-k relevant chunks
    2) Build prompt with context
    3) Generate answer
    4) Return answer + sources + a simple confidence score

    Confidence here is heuristic:
    - based on retrieval scores (not perfect but good for portfolio)

    Args:
      question: user question in natural language.
      k: number of chunks to retrieve for the context.

    Returns:
      Dict with keys "question", "answer", "sources" (unique, in retrieval
      order), "retrieval_scores" and "confidence" (float in [0, 1]).
    """
    retrieved = retrieve(question, k=k)

    prompt = build_rag_prompt(question, retrieved)

    # Ask the pipeline for the completion only. Stripping the prompt afterwards
    # with str.replace() removes EVERY occurrence of the prompt text and leaves
    # the prompt in the answer whenever the echoed text differs from it.
    out = llm(prompt, return_full_text=False)[0]["generated_text"]
    if out.startswith(prompt):
        # Fallback in case the full text was returned anyway
        out = out[len(prompt):]
    answer = out.strip()

    scores = [r["score"] for r in retrieved]

    # Simple confidence from top retrieval score: (s + 1) / 2 maps a score in
    # [-1, 1] onto [0, 1]; the result is clamped to that range.
    top_score = max(scores) if scores else 0.0
    confidence = float(max(0.0, min(1.0, (top_score + 1) / 2)))  # rough scaling

    return {
        "question": question,
        "answer": answer,
        "sources": list(dict.fromkeys(r["source"] for r in retrieved)),  # unique, keep order
        "retrieval_scores": scores,
        "confidence": confidence
    }


In [None]:
# Demo questions; the last one is expected to be unanswerable from the docs
questions = [
    "What should I do if my transfer failed but money was deducted?",
    "How long do refunds take?",
    "What should happen if fraud is reported?",
    "Do you support Apple Pay?"  # likely not in docs
]

separator = "-" * 90
for q in questions:
    resp = rag_answer(q, k=3)
    # Pretty-printed response followed by a divider line
    print(json.dumps(resp, indent=2), separator, sep="\n")


In [None]:
# Evaluation cases consumed by evaluate_rag(). Each item has:
#   "q"                 - the question sent through rag_answer()
#   "must_contain"      - phrases that must all appear in the answer
#                         (compared case-insensitively), OR
#   "must_be_not_found" - True when the answer must contain the fixed
#                         "Information not found in provided documents" sentence
EVAL_SET = [
    {
        "q": "What should I do if my transfer failed but money was deducted?",
        "must_contain": ["transaction ID", "investigate", "24 hours"]
    },
    {
        "q": "How long do refunds take?",
        "must_contain": ["5 business days"]
    },
    {
        "q": "What do you do when fraud is reported?",
        "must_contain": ["security", "high priority", "human"]
    },
    {
        # Out-of-scope question: the model is expected to refuse
        "q": "Do you support Apple Pay?",
        "must_be_not_found": True
    }
]

def evaluate_rag(eval_set):
    """
    Run every evaluation case through rag_answer() and score the answers.

    Args:
      eval_set: list of dicts, each with "q" plus either "must_contain"
        (list of phrases that must all appear in the answer, case-insensitive)
        or a truthy "must_be_not_found" (the answer must contain the fixed
        "Information not found in provided documents" sentence).

    Returns:
      report: {"tests": int, "passed": int, "pass_rate": float}; pass_rate is
        0.0 for an empty eval_set instead of raising ZeroDivisionError.
      details: one dict per case with "question", "answer", "sources", "ok".

    Raises:
      KeyError: if a case has neither "must_be_not_found" nor "must_contain".
    """
    not_found_marker = "information not found in provided documents"
    passed = 0
    details = []

    for item in eval_set:
        resp = rag_answer(item["q"], k=3)
        ans_low = resp["answer"].lower()

        if item.get("must_be_not_found"):
            ok = not_found_marker in ans_low
        else:
            # Every required phrase must be present
            ok = all(phrase.lower() in ans_low for phrase in item["must_contain"])

        passed += int(ok)
        details.append({
            "question": item["q"],
            "answer": resp["answer"],
            "sources": resp["sources"],
            "ok": ok
        })

    total = len(eval_set)
    report = {
        "tests": total,
        "passed": passed,
        # Guard the division so an empty eval set yields a report, not a crash
        "pass_rate": passed / total if total else 0.0
    }
    return report, details

# Score the pipeline on EVAL_SET, then print the summary and each case
report, details = evaluate_rag(EVAL_SET)
print(report)

for d in details:
    print("\nOK:", d["ok"])
    for label, key in (("Q:", "question"), ("Sources:", "sources"), ("A:", "answer")):
        print(label, d[key])


In [None]:
# Persist the summary and per-case details together in one JSON file
results_payload = {"report": report, "details": details}
with open("rag_eval_results.json", "w", encoding="utf-8") as f:
    json.dump(results_payload, f, ensure_ascii=False, indent=2)

print("✅ Saved rag_eval_results.json (download it from Colab files)")
