# 📚 GIKI Prospectus Q&A Chatbot (RAG)
Bilingual chatbot (English + Urdu) using FAISS + MiniLM embeddings + Flan-T5-Small (direct generation for stability).

In [None]:

!pip -q install pymupdf python-docx faiss-cpu sentence-transformers transformers gradio deep-translator


In [None]:

import os, re, fitz, docx
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from deep_translator import GoogleTranslator
import gradio as gr
import torch

# Sentence-embedding model used to vectorise both document chunks and queries.
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Tokenizer and seq2seq model used by answer_with_t5 to generate answers.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Run generation on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def translate_to_en(text):
    """Translate *text* to English, letting the translator detect the source.

    Translation is best-effort: if the translator raises for any reason,
    the input is returned unchanged so the caller can still proceed.

    Args:
        text: The string to translate.

    Returns:
        The English translation, or *text* itself when translation fails.
    """
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # Deliberately broad (best-effort fallback), but not a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return text

def translate_to_ur(text):
    """Translate English *text* to Urdu.

    Translation is best-effort: if the translator raises for any reason,
    the input is returned unchanged so the caller can still proceed.

    Args:
        text: The English string to translate.

    Returns:
        The Urdu translation, or *text* itself when translation fails.
    """
    try:
        return GoogleTranslator(source='en', target='ur').translate(text)
    except Exception:
        # Deliberately broad (best-effort fallback), but not a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return text

def answer_with_t5(prompt):
    """Generate an answer string for *prompt* with the module-level T5 model.

    The prompt is tokenized (truncated to at most 512 tokens), moved to the
    configured device, and up to 80 new tokens are generated. The first
    generated sequence is decoded with special tokens removed.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    generated = model.generate(**encoded, max_new_tokens=80)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:

def extract_text_from_file(file):
    """Return the plain text of a PDF, DOCX or TXT file.

    Args:
        file: Either an object exposing the file path as ``.name`` (as the
            upload widget provides) or a plain path string.

    Returns:
        The extracted text. PDF page texts are concatenated without a
        separator, DOCX paragraphs are joined with newlines, and TXT files
        are read as UTF-8. Unsupported extensions yield an empty string.
    """
    path = file if isinstance(file, str) else file.name
    # Match extensions case-insensitively so e.g. "REPORT.PDF" is handled.
    lowered = path.lower()
    if lowered.endswith('.pdf'):
        # The context manager closes the document once the text is read.
        with fitz.open(path) as doc:
            return "".join(page.get_text() for page in doc)
    if lowered.endswith('.docx'):
        d = docx.Document(path)
        return "\n".join(p.text for p in d.paragraphs)
    if lowered.endswith('.txt'):
        with open(path, encoding='utf-8') as fh:
            return fh.read()
    return ""

def chunk_text(text, max_tokens=400):
    """Split *text* into chunks of whole sentences.

    Sentences are detected by a ``.``, ``!`` or ``?`` followed by one or
    more spaces and are packed greedily into chunks.

    Args:
        text: The text to split.
        max_tokens: Maximum number of whitespace-separated words per chunk.
            A single sentence longer than this is kept whole in its own
            chunk rather than being cut.

    Returns:
        A list of non-empty chunk strings; empty or whitespace-only input
        yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current, current_len = [], 0
    for s in sentences:
        n_words = len(s.split())
        if n_words == 0:
            # Blank fragments carry no content and must not create chunks.
            continue
        # Flush only when something has been collected, so an oversized
        # first sentence never produces an empty chunk.
        if current and current_len + n_words > max_tokens:
            chunks.append(" ".join(current).strip())
            current, current_len = [], 0
        current.append(s)
        current_len += n_words
    if current:
        chunks.append(" ".join(current).strip())
    return chunks


In [None]:

def build_faiss_index(docs):
    """Extract, chunk and embed the given documents into a FAISS L2 index.

    Args:
        docs: Iterable of uploaded file objects exposing their path as
            ``.name``.

    Returns:
        A tuple ``(index, texts, meta)`` where ``texts[i]`` is the chunk
        stored at index position ``i`` and ``meta[i]`` is the ``.name`` of
        the file it came from.

    Raises:
        ValueError: If no non-blank text could be extracted from *docs*.
    """
    texts, meta = [], []
    for file in docs or []:
        t = extract_text_from_file(file)
        # Blank chunks add nothing to retrieval, so they are not indexed.
        ch = [c for c in chunk_text(t) if c.strip()]
        texts.extend(ch)
        meta.extend([file.name] * len(ch))
    if not texts:
        # Without any chunk there is no embedding matrix to size the index
        # from, so fail with a clear message.
        raise ValueError("No extractable text found in the uploaded documents.")
    embeds = embedder.encode(texts, convert_to_numpy=True)
    dim = embeds.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeds)
    return index, texts, meta


In [None]:

def rag_answer(query, index, texts, meta, top_k=2, lang='en'):
    """Answer *query* from the indexed chunks using retrieval + generation.

    Args:
        query: The user's question.
        index: FAISS index whose vectors correspond position-wise to *texts*.
        texts: Chunk strings stored in *index*.
        meta: Source label for each entry of *texts*.
        top_k: Maximum number of chunks to retrieve.
        lang: ``'ur'`` to translate the query to English before retrieval
            and the answer back to Urdu; any other value works in English.

    Returns:
        A tuple ``(answer, sources)`` where *sources* lists the ``meta``
        entries of the retrieved chunks.
    """
    urdu = lang == 'ur'
    # The query is only translated for Urdu, so "query changed" and
    # "lang is Urdu" are the same condition.
    q_en = translate_to_en(query) if urdu else query

    # Never ask FAISS for more neighbours than it stores: missing results
    # come back as -1, which would silently index the last chunk.
    k = min(top_k, index.ntotal, len(texts))
    hits = []
    if k > 0:
        q_emb = embedder.encode([q_en], convert_to_numpy=True)
        _, I = index.search(q_emb, k)
        hits = [int(i) for i in I[0] if 0 <= i < len(texts)]

    if hits:
        # Only the first 300 characters of each chunk go into the prompt.
        context = " ".join([texts[i][:300] for i in hits])
        prompt = f"Context: {context}\n\nQuestion: {q_en}\nAnswer:"
        try:
            ans = answer_with_t5(prompt)
        except Exception as e:
            ans = f"⚠️ Model error: {e}"
    else:
        ans = "No relevant content found in the uploaded documents."

    if urdu:
        ans = translate_to_ur(ans)
    return ans, [meta[i] for i in hits]


In [None]:

# Module-level state shared by the UI callbacks; index stays None until
# upload_files has built one.
docs, texts, meta, index = [], [], [], None

def upload_files(files):
    """Index the uploaded files and return a status message for the UI.

    The module-level ``docs``, ``texts``, ``meta`` and ``index`` are replaced
    only after the new index has been built successfully, so a failed upload
    leaves the previous state intact.

    Args:
        files: The uploaded file objects, or ``None``/empty when nothing was
            uploaded.

    Returns:
        A human-readable status string (success summary or error message).
    """
    global docs, texts, meta, index
    if not files:
        return "No documents uploaded."
    try:
        new_index, new_texts, new_meta = build_faiss_index(files)
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the callback crash.
        return f"⚠️ Failed to process documents: {e}"
    docs, index, texts, meta = files, new_index, new_texts, new_meta
    return f"Uploaded {len(docs)} documents, processed into {len(texts)} chunks."

def chatbot(query, lang_choice):
    """UI callback: answer *query* and report which documents were used.

    Args:
        query: The question typed by the user.
        lang_choice: ``'Urdu'`` selects Urdu handling; anything else is
            treated as English.

    Returns:
        A tuple ``(answer, sources)`` of two strings. *sources* is a
        comma-separated list of the distinct source file names; it is a
        string because the sources widget is a ``gr.Label``, which accepts a
        string, number or dict but not a list.
    """
    if not query or not query.strip():
        return "Please enter a question.", ""
    if index is None:
        return "Please upload documents first.", ""
    ans, sources = rag_answer(query, index, texts, meta, lang='ur' if lang_choice == 'Urdu' else 'en')
    # De-duplicate while keeping retrieval order; show file names only,
    # not the full upload paths.
    names = dict.fromkeys(os.path.basename(str(s)) for s in sources)
    return ans, ", ".join(names)

# Gradio UI: document upload with a status box, a question box with an
# answer-language selector, and answer/sources outputs.
with gr.Blocks() as demo:
    gr.Markdown("## 📘 GIKI Prospectus RAG Chatbot (English + Urdu)")
    with gr.Row():
        uploader = gr.File(file_types=['.pdf','.docx','.txt'], file_count="multiple", label="Upload Documents")
    status = gr.Textbox(label="Status")
    # Call upload_files on every upload and show its message in the status box.
    uploader.upload(upload_files, uploader, status)
    with gr.Row():
        query = gr.Textbox(label="Your Question (English or Urdu)")
        lang = gr.Radio(["English","Urdu"], value="English", label="Answer Language")
    btn = gr.Button("Ask")
    output = gr.Textbox(label="Answer")
    sources = gr.Label(label="Sources")
    # Clicking "Ask" runs chatbot(query, lang) and fills the answer and sources widgets.
    btn.click(fn=chatbot, inputs=[query, lang], outputs=[output, sources])
# NOTE(review): share=True requests a public Gradio share link in addition to
# the local server — confirm that public exposure is intended.
demo.launch(share=True)
