<a href="https://colab.research.google.com/github/AkratiSachan23/RAG-System/blob/main/RAG(Q%26A).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieval‑Augmented Generation (RAG) Q&A for PS‑4
**National‑Security Focused**  
This notebook will:
1. Install dependencies  
2. Load your PDFs from `./data/`  
3. Extract and clean text  
4. Chunk into passages  
5. Build a FAISS index  
6. Run an interactive Q&A loop with FLAN‑T5  

---



In [None]:
!pip install faiss-cpu sentence-transformers transformers PyMuPDF

In [3]:
# Imports
import os
import fitz                             # PyMuPDF
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [4]:
# Configuration
PDF_FOLDER = "data"                    # place your PDFs here
EMBED_MODEL = "all-MiniLM-L6-v2"
GEN_MODEL   = "google/flan-t5-base"
INDEX_PATH  = "faiss.index"
CHUNK_SIZE  = 500
CHUNK_OVERLAP = 50
TOP_K = 3


In [5]:
# PDF Loader
def load_pdfs(pdf_folder: str):
    docs = {}
    for fname in sorted(os.listdir(pdf_folder)):
        if not fname.lower().endswith(".pdf"): continue
        path = os.path.join(pdf_folder, fname)
        txt_pages = []
        with fitz.open(path) as pdf:
            for page in pdf:
                txt_pages.append(page.get_text())
        docs[fname] = "\n".join(txt_pages)
        print(f"Loaded {fname} ({len(txt_pages)} pages)")
    return docs

docs = load_pdfs(PDF_FOLDER)


Loaded Alert and Advisories _ NCIIPC.pdf (162 pages)
Loaded CERT-In Vulnerability Notes.pdf (2 pages)
Loaded CIWP-2025-0002.pdf (53 pages)
Loaded Comprehensive_Cyber_Security_Audit_Policy_Guidelines.pdf (69 pages)
Loaded National_Cyber_Security_Policy-2013.pdf (15 pages)
Loaded Roles_Responsibilities-CISO.pdf (2 pages)
Loaded SOP-PPP.pdf (25 pages)


In [6]:
# Chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)

chunks = []
for doc_id, text in docs.items():
    parts = splitter.split_text(text)
    for i, chunk in enumerate(parts):
        chunks.append({
            "doc_id": doc_id,
            "chunk_id": i,
            "text": chunk
        })
print(f"Total chunks: {len(chunks)}")


Total chunks: 1200


In [None]:
# Embed & Index
embedder = SentenceTransformer(EMBED_MODEL)

# encode all chunks
texts = [c["text"] for c in chunks]
embeddings = embedder.encode(texts, show_progress_bar=True)
dim = embeddings.shape[1]

# create FAISS index
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings, dtype="float32"))
faiss.write_index(index, INDEX_PATH)
print(f"FAISS index built ({index.ntotal} vectors) and saved to {INDEX_PATH}")


In [8]:
# Helper Classes
class Retriever:
    def __init__(self, index_path=INDEX_PATH):
        self.index = faiss.read_index(index_path)
        self.embedder = embedder

    def retrieve(self, query: str, top_k=TOP_K):
        q_emb = self.embedder.encode([query])
        D, I = self.index.search(np.array(q_emb, dtype="float32"), top_k)
        return [chunks[i] for i in I[0]]

class Generator:
    def __init__(self, model_name=GEN_MODEL):
        self.pipe = pipeline("text2text-generation", model=model_name)

    def answer(self, question: str, contexts: list):
        combined = "\n\n".join(
            f"[{c['doc_id']}#{c['chunk_id']}]: {c['text']}"
            for c in contexts
        )
        prompt = (
            "Use the following document snippets to answer the question and cite each fact:\n\n"
            f"{combined}\n\nQuestion: {question}\nAnswer:"
        )
        out = self.pipe(prompt, max_length=200)[0]["generated_text"]
        return out


In [None]:
# Run Q&A Loop
ret = Retriever()
gen = Generator()

print("RAG Q&A ready! Type your question, or 'exit' to quit.\n")
while True:
    q = input("Q> ").strip()
    if q.lower() in ("exit", "quit"):
        break
    ctx = ret.retrieve(q)
    ans = gen.answer(q, ctx)
    print("\nA:", ans, "\n" + "-"*60 + "\n")


#Output :
Device set to use cpu <br>
RAG Q&A ready! Type your question, or 'exit' to quit.<br>

##Q> What are current Cyber Security Threats <br>
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)<br>

A: identity theft, phishing, social engineering, hactivism, cyber terrorism, compound threats
------------------------------------------------------------

## Q> exit