In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# PDFs stored under MyDrive (see PDF_FOLDER below) can be read like local files.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Install the libraries used below: pypdf (PDF text extraction),
# sentence-transformers (embeddings), qdrant-client (vector store) and
# transformers (answer generation).
# NOTE(review): no versions are pinned here, and streamlit / accelerate are not
# imported by any visible cell — confirm they are still needed.
!pip install pypdf sentence-transformers qdrant-client transformers streamlit accelerate





In [9]:
# Folder on the mounted Drive that the indexing cell scans for ".pdf" files.
PDF_FOLDER = "/content/drive/MyDrive/ncert_biology"


In [10]:
from pypdf import PdfReader

def extract_text_with_page(pdf_path):
    """Read a PDF and return the text of every page that has any.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <page text>}``
        dicts in document order. Pages whose extracted text is empty or
        ``None`` are left out, so page numbers may have gaps.
    """
    extracted = []
    for number, pdf_page in enumerate(PdfReader(pdf_path).pages, start=1):
        content = pdf_page.extract_text()
        if not content:
            # Nothing extractable on this page (e.g. image-only); skip it.
            continue
        extracted.append({"page": number, "text": content})
    return extracted


In [11]:
def chunk_text(text, chunk_size=500, overlap=0):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: Text to split. Runs of whitespace are collapsed, because the
            words are re-joined with single spaces.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of trailing words of each chunk repeated at the start
            of the next one. Must satisfy ``0 <= overlap < chunk_size``. The
            default of 0 yields back-to-back, non-overlapping chunks.

    Returns:
        A list of chunk strings in order; empty when ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap!r}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks


In [None]:
# Remove whatever versions of these three packages are present and reinstall
# fixed, pinned releases of each.
# NOTE(review): the install cell at the top installs the same packages
# unpinned; consider putting the pins there so only one install step is needed.
!pip uninstall -y transformers sentence-transformers accelerate
!pip install transformers==4.38.2 sentence-transformers==2.6.1 accelerate==0.27.2


Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Found existing installation: sentence-transformers 2.6.1
Uninstalling sentence-transformers-2.6.1:
  Successfully uninstalled sentence-transformers-2.6.1
Found existing installation: accelerate 0.27.2
Uninstalling accelerate-0.27.2:
  Successfully uninstalled accelerate-0.27.2
Collecting transformers==4.38.2
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting sentence-transformers==2.6.1
  Using cached sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting accelerate==0.27.2
  Using cached accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Using cached transformers-4.38.2-py3-none-any.whl (8.5 MB)
Using cached sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
Using cached accelerate-0.27.2-py3-none-any.whl (279 kB)
Installing collected packages: accelerate, transformers, sentence-transformers
Successfully i

In [12]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance encodes the PDF chunks when the
# index is built and the user's question at query time.
# NOTE(review): the Qdrant collection is created with size=384 — confirm that
# matches this model's output dimension if the model name is ever changed.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# ":memory:" asks qdrant-client for an in-process, in-memory store, so the
# index has to be rebuilt by the indexing cell after every kernel restart.
client = QdrantClient(":memory:")
# Delete-then-create so the collection is guaranteed to be empty at this point.
client.delete_collection(collection_name="ncert_biology")
# Vectors are 384-dimensional and compared by cosine distance.
client.create_collection(
    collection_name="ncert_biology",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)


True

In [14]:
import os
from qdrant_client.models import PointStruct # Added import

# Build the vector index: each PDF in PDF_FOLDER is read page by page, every
# page is split into word chunks, each chunk is embedded, and all resulting
# points are upserted into the "ncert_biology" collection in one call.
points = []
idx = 0  # running point id, unique across all PDFs

# os.listdir returns entries in arbitrary order; sorting makes the id assigned
# to each chunk reproducible from run to run.
for file in sorted(os.listdir(PDF_FOLDER)):
    # Case-insensitive match so files named "*.PDF" are indexed as well.
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_FOLDER, file)
    pages = extract_text_with_page(pdf_path)

    for p in pages:
        chunks = chunk_text(p["text"])
        if not chunks:
            # Page text was whitespace only: nothing to embed.
            continue

        # One batched encode call per page instead of one call per chunk.
        vectors = embedding_model.encode(chunks)

        for chunk, vector in zip(chunks, vectors):
            points.append(
                PointStruct(
                    id=idx,
                    vector=vector.tolist(),
                    # The payload is what retrieve_context() reads back to
                    # build the context and the source citations.
                    payload={
                        "pdf": file,
                        "page": p["page"],
                        "text": chunk
                    }
                )
            )
            idx += 1

if points:
    client.upsert(collection_name="ncert_biology", points=points)
    print("✅ PDFs indexed into Qdrant")
else:
    # Do not claim success when the folder had no PDFs or no extractable text.
    print(f"No extractable PDF text found in {PDF_FOLDER}; nothing was indexed.")

✅ PDFs indexed into Qdrant


In [15]:
def retrieve_context(query, top_k=5):
    """Return the text and citations of the indexed chunks closest to ``query``.

    The query is embedded with ``embedding_model`` and used for a similarity
    lookup in the "ncert_biology" collection of ``client``.

    Args:
        query: Question text to embed.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        A ``(context, sources)`` tuple. ``context`` is the retrieved chunk
        texts, each followed by a blank line, in the order the hits were
        returned. ``sources`` is a parallel list of ``"<pdf> | Page <n>"``
        strings (duplicates are possible).
    """
    query_vector = embedding_model.encode(query).tolist()

    if hasattr(client, "query_points"):
        # Current qdrant-client API; the hits are wrapped in a response object.
        hits = client.query_points(
            collection_name="ncert_biology",
            query=query_vector,
            limit=top_k,
            with_payload=True,
        ).points
    else:
        # Older clients only offer search(); its vector parameter is named
        # `query_vector` (passing it as `vector=` is rejected).
        hits = client.search(
            collection_name="ncert_biology",
            query_vector=query_vector,
            limit=top_k,
        )

    context = "".join(hit.payload["text"] + "\n\n" for hit in hits)
    sources = [f"{hit.payload['pdf']} | Page {hit.payload['page']}" for hit in hits]
    return context, sources


In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and sequence-to-sequence model that generate_answer() uses to turn
# the retrieved context plus the question into an answer. Both are loaded from
# the same checkpoint name so they match each other.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")


In [17]:
import traceback

def generate_answer(question, max_context_chars=1500, max_new_tokens=150):
    """Answer ``question`` from the indexed PDFs (retrieve, then generate).

    Args:
        question: The user's question. ``None``, non-string values and
            blank strings are treated as "no question asked".
        max_context_chars: Number of leading characters of the retrieved
            context that are placed in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        An ``(answer, sources)`` tuple. ``sources`` is the citation list from
        ``retrieve_context``; it is empty when no question was asked, nothing
        was retrieved, or an error occurred. On error, ``answer`` holds the
        error message followed by the formatted traceback instead of raising.
    """
    # Checked before the try block so that None / non-string input gets the
    # friendly prompt rather than an uncaught AttributeError from .strip().
    if not isinstance(question, str) or not question.strip():
        return "Please ask a question.", []

    try:
        context, sources = retrieve_context(question)

        if not context.strip():
            return "No relevant content found in PDFs.", []

        # The context is cut by characters here; the tokenizer call below
        # additionally truncates the whole prompt to 512 tokens.
        prompt = (
            "Answer the question using the context below.\n\n"
            f"Context:\n{context[:max_context_chars]}\n\n"
            f"Question:\n{question}\n\n"
            "Answer:"
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        # do_sample=False: deterministic (non-sampling) decoding.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer, sources
    except Exception as exc:
        # NOTE(review): the full traceback is returned as the answer text, and
        # ask() displays it in the UI that is launched with a public share
        # link. Consider logging it server-side and returning only the short
        # message once debugging is finished.
        error_message = f"An error occurred: {str(exc)}"
        traceback_str = traceback.format_exc()
        return f"{error_message}\n\nTraceback:\n{traceback_str}", []

In [18]:
# Install pyngrok.
# NOTE(review): no visible cell imports pyngrok — the app below is exposed via
# Gradio's own share link — and gradio itself is imported below without being
# installed by any visible cell. Confirm which install is actually needed.
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [18]:
import gradio as gr

def ask(question):
    """Gradio callback: answer ``question`` and list the cited sources.

    Args:
        question: Text entered by the user.

    Returns:
        An ``(answer, sources_text)`` tuple. ``sources_text`` holds one
        citation per line, with duplicates removed.
    """
    answer, sources = generate_answer(question)
    # dict.fromkeys removes duplicates while keeping the retrieval order;
    # a set would list the citations in arbitrary order.
    unique_sources = dict.fromkeys(sources)
    return answer, "\n".join(unique_sources)

# Single text box in; two text boxes out, matching the (answer, sources) pair
# returned by ask().
iface = gr.Interface(
    fn=ask,
    inputs="text",
    outputs=["text", "text"],
    # Title restored from mis-encoded bytes (book emoji and en dash).
    title="📘 NCERT Biology – AI PDF Search (RAG)"
)

# share=True also serves the app on a temporary public gradio.live URL, so
# anyone with the link can send questions to this notebook.
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a2bd2d20ed05786675.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


