<a href="https://colab.research.google.com/github/Aamina0/Project-1/blob/main/GIKI_RAG_Chatbot_Colab_Full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GIKI RAG Chatbot — Colab Notebook (Full)

This notebook contains a full Retrieval-Augmented Generation (RAG) pipeline:

- Document ingestion (PDF/DOCX/TXT)
- Chunking
- Multilingual embeddings (intfloat/multilingual-e5-base)
- FAISS vector store with persistence
- Generation with Flan-T5 (paraphrasing prompt)
- English/Urdu toggle using Helsinki-NLP Marian models
- Streamlit UI with pyngrok launcher for Colab
- Conversation export to PDF

**Run cells top-to-bottom in Colab.**

In [1]:
# Install dependencies (run in Colab)
!pip -q install --upgrade pip
!pip -q install sentence-transformers faiss-cpu transformers accelerate pymupdf python-docx pypdf reportlab streamlit pyngrok>=5.0.0 langdetect

In [2]:
# ✅ 2) Mount Google Drive and set project working directory
from google.colab import drive
drive.mount('/content/drive')

PROJECT_DIR = '/content/drive/MyDrive/Project3'
import os
os.makedirs(PROJECT_DIR, exist_ok=True)
%cd "$PROJECT_DIR"
print('Working directory:', PROJECT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Project3
Working directory: /content/drive/MyDrive/Project3


In [3]:

# Imports & config
import os, io, pickle, time, textwrap
from pathlib import Path

import faiss
import torch
import numpy as np

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, MarianMTModel, MarianTokenizer

import fitz  # pymupdf
import docx
from pypdf import PdfReader

from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

try:
    from langdetect import detect as lang_detect
except Exception:
    lang_detect = None

# Paths
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
INDEX_DIR = BASE_DIR / "faiss_store"
APP_DIR = BASE_DIR / "app"
DATA_DIR.mkdir(exist_ok=True)
INDEX_DIR.mkdir(exist_ok=True)
APP_DIR.mkdir(exist_ok=True)

INDEX_FILE = INDEX_DIR / "index.faiss"
META_FILE = INDEX_DIR / "metadata.pkl"

# Models
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"
GEN_MODEL_NAME = "google/flan-t5-large"

device = 0 if torch.cuda.is_available() else -1
print("Device:", "cuda" if device==0 else "cpu")


Device: cpu


In [4]:
# Optimized document ingestion utilities
def extract_text_from_pdf_pymupdf(pdf_path: str) -> str:
    text = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text.append(page.get_text())
    return "\n".join(text)

def extract_text_from_pdf_pypdf(pdf_path: str) -> str:
    text = []
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            text.append(page.extract_text() or "")
    return "\n".join(text)

def extract_text_from_docx(docx_path: str) -> str:
    d = docx.Document(docx_path)
    return "\n".join([p.text for p in d.paragraphs])

def extract_text_from_txt(txt_path: str) -> str:
    return Path(txt_path).read_text(encoding="utf-8", errors="ignore")

def load_documents(filepaths, max_pages=50):
    texts = []
    for p in filepaths:
        p = str(p)
        if p.lower().endswith(".pdf"):
            try:
                # Extract only first max_pages to prevent excessive processing
                txt = []
                with fitz.open(p) as doc:
                    for i, page in enumerate(doc):
                        if i >= max_pages:
                            break
                        txt.append(page.get_text())
                txt = "\n".join(txt)
                if not txt.strip():
                    txt = extract_text_from_pdf_pypdf(p)
            except Exception:
                txt = extract_text_from_pdf_pypdf(p)
        elif p.lower().endswith(".docx"):
            txt = extract_text_from_docx(p)
        elif p.lower().endswith(".txt"):
            txt = extract_text_from_txt(p)
        else:
            print("Skipping unsupported file:", p)
            continue
        if txt.strip():
            texts.append(txt)
    return "\n".join(texts)

In [5]:
# Optimized chunking utility (by characters instead of words for faster processing)
def chunk_text(text: str, chunk_size_chars=2000, overlap_chars=200):
    chunks = []
    for i in range(0, len(text), max(1, chunk_size_chars - overlap_chars)):
        chunk = text[i:i+chunk_size_chars]
        if chunk.strip():
            chunks.append(chunk)
    return chunks

# Optimized FAISS building with progress indication
def build_or_load_faiss(chunks):
    if INDEX_FILE.exists() and META_FILE.exists():
        print("[FAISS] Loading existing index and metadata...")
        index = faiss.read_index(str(INDEX_FILE))
        with open(META_FILE, "rb") as f:
            metadata = pickle.load(f)
        return index, metadata

    print("[FAISS] Building new index...")

    # Process in smaller batches to avoid memory issues
    batch_size = 16  # Reduced batch size
    all_embs = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1}")
        embs = embedding_model.encode(batch, batch_size=8, show_progress_bar=False,
                                     convert_to_numpy=True, normalize_embeddings=True)
        all_embs.append(embs)

    embs = np.vstack(all_embs)
    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embs)
    metadata = {i: {"text": chunks[i]} for i in range(len(chunks))}

    faiss.write_index(index, str(INDEX_FILE))
    with open(META_FILE, "wb") as f:
        pickle.dump(metadata, f)

    print("[FAISS] Saved index and metadata.")
    return index, metadata

In [6]:

# Embedding model and FAISS persistence
print("Loading embedding model (this may take time)...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=('cuda' if device==0 else 'cpu'))

def build_or_load_faiss(chunks):
    if INDEX_FILE.exists() and META_FILE.exists():
        print("[FAISS] Loading existing index and metadata...")
        index = faiss.read_index(str(INDEX_FILE))
        with open(META_FILE, "rb") as f:
            metadata = pickle.load(f)
        return index, metadata
    print("[FAISS] Building new index...")
    embs = embedding_model.encode(chunks, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embs)
    metadata = {i: {"text": chunks[i]} for i in range(len(chunks))}
    faiss.write_index(index, str(INDEX_FILE))
    with open(META_FILE, "wb") as f:
        pickle.dump(metadata, f)
    print("[FAISS] Saved index and metadata.")
    return index, metadata

def load_faiss_only():
    if INDEX_FILE.exists() and META_FILE.exists():
        index = faiss.read_index(str(INDEX_FILE))
        with open(META_FILE, "rb") as f:
            metadata = pickle.load(f)
        return index, metadata
    raise FileNotFoundError("No FAISS index found. Build it first.")


Loading embedding model (this may take time)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Generation model (Flan-T5) and paraphrase prompt
print("Loading generation model (this may take time)...")
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME, device_map="auto")
generator = pipeline("text2text-generation", model=gen_model, tokenizer=tokenizer, max_new_tokens=300, temperature=0.3, top_p=0.95)

PARAPHRASE_PROMPT = """You are an assistant for GIKI students.
Answer the question using ONLY the provided context. Do NOT copy-paste; paraphrase and summarize in your own words.
If the answer cannot be found in the context, say you don't know.

Question: {question}

Context:
{context}

Answer:"""

def generate_answer(question: str, retrieved_text: str) -> str:
    prompt = PARAPHRASE_PROMPT.format(question=question, context=retrieved_text)
    out = generator(prompt)[0]["generated_text"].strip()
    return out

Loading generation model (this may take time)...


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [8]:

# Translation helpers using MarianMT
_mt_cache = {}
def _load_translator(src_tgt):
    if src_tgt in _mt_cache:
        return _mt_cache[src_tgt]
    model_name = {"ur-en": "Helsinki-NLP/opus-mt-ur-en", "en-ur": "Helsinki-NLP/opus-mt-en-ur"}[src_tgt]
    tok = MarianTokenizer.from_pretrained(model_name)
    mod = MarianMTModel.from_pretrained(model_name)
    _mt_cache[src_tgt] = (tok, mod)
    return tok, mod

def translate(text: str, src_tgt: str) -> str:
    tok, mod = _load_translator(src_tgt)
    batch = tok([text], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        gen = mod.generate(**batch, max_new_tokens=400)
    return tok.batch_decode(gen, skip_special_tokens=True)[0]

def detect_lang(text: str) -> str:
    if lang_detect:
        try:
            code = lang_detect(text)
            return "ur" if code.startswith("ur") else "en"
        except Exception:
            pass
    for ch in text:
        if "\u0600" <= ch <= "\u06FF":
            return "ur"
    return "en"


In [9]:

# Retrieval + answer function
index = None
metadata = None

def retrieve_context(query_en: str, top_k=4):
    q_emb = embedding_model.encode([query_en], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, top_k)
    chunks = []
    for idx in I[0]:
        if int(idx) in metadata:
            chunks.append(metadata[int(idx)]["text"])
    return "\n\n".join(chunks)

def answer_question(query: str, lang='en', top_k=4):
    # translate if needed, retrieve, generate, translate back
    if lang == 'ur':
        query_en = translate(query, "ur-en")
    else:
        query_en = query
    ctx = retrieve_context(query_en, top_k=top_k)
    ans_en = generate_answer(query_en, ctx)
    if lang == 'ur':
        return translate(ans_en, "en-ur"), ctx
    return ans_en, ctx


In [10]:

# Evaluation & export
def cosine_sim(a,b):
    a = np.array(a) / (np.linalg.norm(a)+1e-8)
    b = np.array(b) / (np.linalg.norm(b)+1e-8)
    return float(np.dot(a,b))

def evaluate_answer(answer: str, context: str):
    if not answer.strip() or not context.strip():
        return 0.0
    emb_a = embedding_model.encode([answer], convert_to_numpy=True, normalize_embeddings=True)[0]
    emb_c = embedding_model.encode([context], convert_to_numpy=True, normalize_embeddings=True)[0]
    return cosine_sim(emb_a, emb_c)

def save_conversation_to_pdf(lines, filename="conversation.pdf"):
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    x, y = 40, height-40
    wrap = 95
    for line in lines:
        for l in textwrap.wrap(line, wrap):
            if y < 60:
                c.showPage()
                y = height-40
            c.drawString(x, y, l)
            y -= 14
    c.save()
    return filename


In [11]:
# Build or load FAISS index from files in data/
from glob import glob
def discover_files(extensions=(".pdf", ".docx", ".txt"), limit=3):  # Reduced limit
    found = []
    for ext in extensions:
        found.extend(glob(str(DATA_DIR / f"*{ext}")))
    found = sorted(found)[:limit]
    return found

def build_index_from_data():
    global index, metadata
    files = discover_files()
    print("Files:", files)

    # Limit processing to first 20 pages of PDFs
    text = load_documents(files, max_pages=20)
    chunks = chunk_text(text, 2000, 200)  # Use character-based chunking

    # Show progress
    print(f"Processing {len(chunks)} chunks...")
    index, metadata = build_or_load_faiss(chunks)
    print("Index ready. Chunks:", len(metadata))

In [12]:
# Create the optimized Streamlit app file
app_content = '''
import streamlit as st
from pathlib import Path
import faiss, pickle, textwrap
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, MarianMTModel, MarianTokenizer
import fitz, docx
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import numpy as np
import tempfile
import os
import time

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
INDEX_DIR = BASE_DIR / "faiss_store"
INDEX_FILE = INDEX_DIR / "index.faiss"
META_FILE  = INDEX_DIR / "metadata.pkl"
DATA_DIR.mkdir(exist_ok=True)
INDEX_DIR.mkdir(exist_ok=True)

EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base"
GEN_MODEL_NAME = "google/flan-t5-large"

device = 0 if torch.cuda.is_available() else -1

@st.cache_resource
def get_embedding_model():
    return SentenceTransformer(EMBEDDING_MODEL_NAME, device=('cuda' if device==0 else 'cpu'))

@st.cache_resource
def get_generator():
    tok = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
    mod = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
    return pipeline(
        "text2text-generation",
        model=mod,
        tokenizer=tok,
        max_new_tokens=300,
        temperature=0.3,
        top_p=0.95
    )

_mt_cache = {}
def get_translator(src_tgt):
    if src_tgt in _mt_cache:
        return _mt_cache[src_tgt]
    model_name = {"ur-en": "Helsinki-NLP/opus-mt-ur-en", "en-ur": "Helsinki-NLP/opus-mt-en-ur"}[src_tgt]
    tok = MarianTokenizer.from_pretrained(model_name)
    mod = MarianMTModel.from_pretrained(model_name)
    _mt_cache[src_tgt] = (tok, mod)
    return tok, mod

def translate(text, src_tgt):
    tok, mod = get_translator(src_tgt)
    batch = tok([text], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        gen = mod.generate(**batch, max_new_tokens=400)
    return tok.batch_decode(gen, skip_special_tokens=True)[0]

# Optimized chunking function
def chunk_text_local(text: str, chunk_size_chars=2000, overlap_chars=200):
    chunks = []
    for i in range(0, len(text), max(1, chunk_size_chars - overlap_chars)):
        chunk = text[i:i+chunk_size_chars]
        if chunk.strip():
            chunks.append(chunk)
    return chunks

def load_faiss_local():
    if INDEX_FILE.exists() and META_FILE.exists():
        idx = faiss.read_index(str(INDEX_FILE))
        with open(META_FILE, "rb") as f:
            meta = pickle.load(f)
        return idx, meta
    return None, None

def save_faiss_local(index, metadata):
    INDEX_DIR.mkdir(exist_ok=True)
    faiss.write_index(index, str(INDEX_FILE))
    with open(META_FILE, "wb") as f:
        pickle.dump(metadata, f)

PARAPHRASE_PROMPT = """You are a helpful assistant for GIKI students.
Answer the user question using ONLY the provided context. Do NOT copy-paste sentences.
Paraphrase and summarize in your own words. If the answer is not in the context, say you don't know.

Question: {question}

Context:
{context}

Answer:"""

def generate_answer_local(question: str, context: str, generator):
    prompt = PARAPHRASE_PROMPT.format(question=question, context=context)
    return generator(prompt)[0]["generated_text"].strip()

def detect_lang_local(text: str) -> str:
    try:
        from langdetect import detect
        code = detect(text)
        return "ur" if code.startswith("ur") else "en"
    except:
        for ch in text:
            if "\u0600" <= ch <= "\u06FF":
                return "ur"
        return "en"

def retrieve_context_local(query_en: str, index, metadata, embedder, top_k=4):
    q_emb = embedder.encode([query_en], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, top_k)
    chunks = []
    for idx in I[0]:
        if int(idx) in metadata:
            chunks.append(metadata[int(idx)]["text"])
    return "\\n\\n".join(chunks)

def answer_question_local(query: str, lang, index, metadata, embedder, generator, top_k=4):
    if lang == 'ur':
        query_en = translate(query, "ur-en")
    else:
        query_en = query
    ctx = retrieve_context_local(query_en, index, metadata, embedder, top_k=top_k)
    ans_en = generate_answer_local(query_en, ctx, generator)
    if lang == 'ur':
        return translate(ans_en, "en-ur"), ctx
    return ans_en, ctx

st.set_page_config(page_title="GIKI RAG Chatbot", page_icon="🎓", layout="wide")
st.title("🎓 GIKI Prospectus Q&A (RAG)")
st.caption("Upload up to 5 docs (PDF/DOCX/TXT), build index once, then chat. English/Urdu supported.")

embedder = get_embedding_model()
generator = get_generator()

with st.sidebar:
    st.header("Index Manager")
    uploaded = st.file_uploader("Upload up to 5 documents", type=["pdf","docx","txt"], accept_multiple_files=True)

    # Add file size limit (5MB per file)
    max_size_mb = 5
    if uploaded:
        valid_files = []
        for f in uploaded[:5]:
            if f.size > max_size_mb * 1024 * 1024:
                st.warning(f"File {f.name} is too large ({f.size//(1024*1024)}MB). Max size is {max_size_mb}MB.")
            else:
                valid_files.append(f)

        if valid_files and st.button("🔧 Build FAISS Index"):
            progress_bar = st.progress(0)
            status_text = st.empty()

            texts = []
            for i, f in enumerate(valid_files):
                status_text.text(f"Processing {f.name} ({i+1}/{len(valid_files)})")
                # Create a temporary file
                with tempfile.NamedTemporaryFile(delete=False, suffix=f.name) as tmp_file:
                    tmp_file.write(f.read())
                    tmp_path = tmp_file.name

                if f.name.lower().endswith(".pdf"):
                    txt = []
                    # Limit PDF processing to first 20 pages
                    with fitz.open(tmp_path) as doc:
                        for page_num, page in enumerate(doc):
                            if page_num >= 20:  # Limit to first 20 pages
                                break
                            txt.append(page.get_text())
                    texts.append("\\n".join(txt))
                elif f.name.lower().endswith(".docx"):
                    d = docx.Document(tmp_path)
                    texts.append("\\n".join([p.text for p in d.paragraphs]))
                else:
                    with open(tmp_path, encoding="utf-8", errors="ignore") as f_txt:
                        texts.append(f_txt.read())

                # Clean up temporary file
                os.unlink(tmp_path)
                progress_bar.progress((i+1)/len(valid_files))

            full_text = "\\n".join(texts)
            status_text.text("Chunking text...")
            chunks = chunk_text_local(full_text, 2000, 200)

            status_text.text("Creating embeddings...")
            # Process in smaller batches with progress
            batch_size = 8
            all_embs = []

            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i+batch_size]
                embs = embedder.encode(batch, batch_size=4, convert_to_numpy=True,
                                      normalize_embeddings=True, show_progress_bar=False)
                all_embs.append(embs)
                progress_bar.progress(0.5 + 0.5 * (i / len(chunks)))

            embs = np.vstack(all_embs)
            dim = embs.shape[1]
            index = faiss.IndexFlatIP(dim)
            index.add(embs)
            metadata = {i: {"text": chunks[i]} for i in range(len(chunks))}

            status_text.text("Saving index...")
            save_faiss_local(index, metadata)
            progress_bar.progress(1.0)
            status_text.text("")
            st.success(f"Index built with {len(chunks)} chunks!")

index, metadata = load_faiss_local()

if index is None:
    st.warning("Please upload documents and build the index first.")
    st.stop()

st.success(f"Index loaded with {len(metadata)} chunks!")

# Initialize session state
if "messages" not in st.session_state:
    st.session_state.messages = []

# Language selection
lang = st.radio("Language", ["English", "Urdu"], horizontal=True, index=0)
lang_code = "en" if lang == "English" else "ur"

# Display chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("Ask a question about GIKI..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            answer, context = answer_question_local(prompt, lang_code, index, metadata, embedder, generator)
            st.markdown(answer)
            with st.expander("View context used"):
                st.text(context)
    st.session_state.messages.append({"role": "assistant", "content": answer})

# Export conversation
if st.session_state.messages:
    if st.button("💾 Export Conversation to PDF"):
        lines = []
        for msg in st.session_state.messages:
            role = "User" if msg["role"] == "user" else "Assistant"
            lines.append(f"{role}: {msg['content']}")

        filename = "conversation.pdf"
        c = canvas.Canvas(filename, pagesize=letter)
        width, height = letter
        x, y = 40, height-40
        wrap = 95
        for line in lines:
            for l in textwrap.wrap(line, wrap):
                if y < 60:
                    c.showPage()
                    y = height-40
                c.drawString(x, y, l)
                y -= 14
        c.save()

        with open(filename, "rb") as f:
            st.download_button("Download PDF", f, file_name=filename, mime="application/pdf")
'''

# Write the app file
app_file = APP_DIR / "app.py"
with open(app_file, "w", encoding="utf-8") as f:
    f.write(app_content)

print("Streamlit app created at:", app_file)

Streamlit app created at: /content/drive/MyDrive/Project3/app/app.py


In [13]:
  !pip install --upgrade pyngrok



In [14]:
  !pip install --upgrade streamlit



In [15]:
!pip install streamlit pyngrok




In [None]:
# Launch Streamlit in Colab and open public URL via pyngrok
import threading, subprocess, time, signal
from pyngrok import ngrok, conf
from pathlib import Path

# Kill any existing ngrok processes
ngrok.kill()

# Set ngrok authtoken if available
try:
    from google.colab import userdata
    ngrok.set_auth_token(userdata.get('NGROK_AUTH_TOKEN'))
    print("Ngrok authtoken set from Colab secrets")
except:
    print("No ngrok authtoken found in Colab secrets. Using free version.")

# Function to run Streamlit
def run_streamlit():
    import os
    os.chdir(APP_DIR)
    cmd = ["streamlit", "run", "app.py", "--server.port", "8501", "--server.address", "0.0.0.0", "--server.headless", "true"]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    print("Streamlit process ended")
    if stdout:
        print("STDOUT:", stdout.decode())
    if stderr:
        print("STDERR:", stderr.decode())

# Start Streamlit in a separate thread
thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

# Wait for Streamlit to start
time.sleep(8)

# Set up ngrok tunnel
public_url = ngrok.connect(8501, bind_tls=True)
print("Public URL:", public_url.public_url)
print("If runtime restarts, rerun this cell to get a new URL. Persist the FAISS index to avoid rebuilding.")

# Keep the process alive
try:
    while True:
        time.sleep(10)
except KeyboardInterrupt:
    print("Shutting down...")
    ngrok.kill()

Ngrok authtoken set from Colab secrets




Public URL: https://4af6ca68a9ca.ngrok-free.app
If runtime restarts, rerun this cell to get a new URL. Persist the FAISS index to avoid rebuilding.


In [None]:
!cat /root/.streamlit/logs/*.log
