In [None]:
!pip install gradio



In [None]:
# Install required libraries
!pip install pymupdf faiss-cpu sentence-transformers transformers

import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from google.colab import files  # for runtime upload

# -----------------------------
# 1. PDF Upload & Extraction
# -----------------------------
def upload_pdf():
    """Prompt for a PDF upload in Colab and return its extracted text.

    Opens the Colab file picker, takes the first uploaded file, and
    concatenates the plain text of every page in document order.

    Returns:
        str: The text of all pages joined together.

    Raises:
        ValueError: If the picker was dismissed without uploading a file.
    """
    uploaded = files.upload()  # this opens a file picker
    if not uploaded:
        # Without this guard the lookup below fails with a bare IndexError.
        raise ValueError("No file was uploaded; please select a PDF.")

    # Only the first uploaded file is processed; any others are ignored.
    pdf_path = next(iter(uploaded))
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string reallocation on large documents.
        return "".join(page.get_text("text") for page in doc)

# 2. Split text into chunks
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Splitting stops as soon as a chunk reaches the end of the text, so no
    trailing chunk consisting solely of already-covered overlap is emitted.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        list[str]: The chunks in order; empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            # This chunk already covers the tail; a further window would
            # only repeat characters that are part of this chunk.
            break
        start += step
    return chunks

# Load the sentence-embedding model once at module level; it is shared by
# build_faiss_index (to embed the chunks) and semantic_search (to embed the
# query), so both sides use the same model.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# 3. Build FAISS Index
def build_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.

    Returns:
        tuple: ``(index, embeddings)`` where ``index`` is the populated
        ``faiss.IndexFlatL2`` and ``embeddings`` is the array returned by
        the module-level ``embedder`` for *chunks*.

    Raises:
        ValueError: If *chunks* is empty, since there is nothing to index
            and the embedding dimension cannot be determined.
    """
    chunks = list(chunks)
    if not chunks:
        # Fail early with a clear message instead of an obscure shape error
        # when reading the embedding dimension below.
        raise ValueError("Cannot build a FAISS index from an empty chunk list")

    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    # The vectors are explicitly converted to float32 before being added.
    index.add(np.array(embeddings, dtype="float32"))
    return index, embeddings

# 4. Search Function
def semantic_search(query, chunks, index, top_k=3):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        query: The question or search string to embed.
        chunks: The text chunks, in the same order they were added to
            *index*.
        index: A FAISS index built over the embeddings of *chunks*.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks in the order FAISS returns them;
        empty when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads the
    # result with -1, which would silently index the *last* chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_vec = embedder.encode([query])
    _, indices = index.search(np.array(query_vec, dtype="float32"), k)
    # Drop padding (-1) or out-of-range ids in case the index holds fewer
    # or different vectors than `chunks`.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# 5. Load LLM (small model for demo)
# Created once at module level and called by answer_question to generate
# text from the assembled prompt.
qa_pipeline = pipeline("text-generation", model="distilgpt2")

def answer_question(query, chunks):
    """Generate an answer to *query* using *chunks* as context.

    The chunks are joined with newlines into a "Context / Question / Answer"
    prompt that is passed to the module-level ``qa_pipeline``.

    Args:
        query: The user's question.
        chunks: Iterable of context strings (typically the retrieved chunks).

    Returns:
        str: Only the newly generated continuation, with the echoed prompt
        removed and surrounding whitespace stripped.
    """
    context = "\n".join(chunks)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    response = qa_pipeline(prompt, max_new_tokens=300, do_sample=True)
    generated = response[0]["generated_text"]
    # The pipeline output begins with the prompt itself; strip it so the
    # caller gets the answer rather than the whole context echoed back.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()

# -----------------------------
# Example Usage
# -----------------------------
# End-to-end demo of the pipeline defined above. The steps are
# order-dependent: each one consumes the module-level result of the previous.

# Step 1: Upload the PDF at runtime
# (blocks on the interactive file picker opened by upload_pdf)
text = upload_pdf()

# Step 2: Split into chunks
# (uses the defaults: chunk_size=500, overlap=50)
chunks = split_text_into_chunks(text)

# Step 3: Build FAISS Index
index, embeddings = build_faiss_index(chunks)

# Step 4: Ask a question
# NOTE(review): the query is hard-coded; retrieval always returns the nearest
# chunks even when the uploaded document does not cover the topic.
query = "What is quantum computing?"
retrieved_chunks = semantic_search(query, chunks, index)

# Step 5: Generate grounded answer
answer = answer_question(query, retrieved_chunks)

print("\nQ:", query)
print("\nA:", answer)


Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, faiss-cpu
Successfully installed faiss-cpu-1.12.0 pymupdf-1.26.4


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


Saving Financial Accounting (6).pdf to Financial Accounting (6).pdf


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Q: What is quantum computing?

A: Context:
final entry or 
principal book of accounts. It is a book where all 
transactions either debited or credited are stored. 
                            A ledger  is a record of a business’s 
financial transactions. It summarises all the revenue 
and expenses of the business, plus the debts owed 
and assets owned. 
 
4.Subsidiary Books  
Subsidiary Books are the books that record the 
transactions which are similar in nature in an orderly 
manner. They are also known as special journals or 
Daybooks
d 
tax collection entities. 
Examples  
              Financial Acccounting 
2.Journal  
          An accounting journal is a detailed account of all the 
financial transactions of a business. It's also known as the 
book of original entry as it's the first place where 
transactions are recorded. 
                  Journal is a book of accounts in which all day to 
day business transactions are recorded in a chronological 
order i.e. in the order of t