In [5]:
# Step 1: Install necessary libraries
!pip install -q sentence-transformers faiss-cpu pandas beautifulsoup4

# Step 2: Upload required files
from google.colab import files
print("Upload 'DatasetQA.csv', 'chunk_data.pkl', and 'ncert_index'")
uploaded = files.upload()

# Step 3: Import dependencies
import os, pickle, re
import faiss
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Step 4: Utility functions
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text().strip()

def extract_relevant_paragraph(text, user_query):
    sentences = re.split(r'(?<=[.?!])\s+', text)

    # Prioritize sentences with "is", "are", or definition-like structure
    def_like = [s for s in sentences if re.search(r"\bis\b|\bare\b|\brefers to\b|\bdefined as\b", s.lower()) and user_query.lower().split()[0] in s.lower()]
    if def_like:
        return " ".join(def_like[:3])

    # Fallback to keyword match
    matches = [s for s in sentences if user_query.lower() in s.lower()]
    if matches:
        return " ".join(matches[:3])

    return " ".join(sentences[:3])  # fallback

def guess_subject_from_filename(fname):
    subjects = ["Physics", "Chemistry", "Biology", "History", "Geography", "Polity", "Economics", "Science"]
    for subject in subjects:
        if subject.lower() in fname.lower():
            return subject
    return "Unknown"

# Step 5: Core QA system function
def answer_query(user_query):
    print("Encoding query...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_emb = model.encode([user_query])

    print("Loading FAISS index and chunks...")
    index = faiss.read_index("ncert_index")
    with open("chunk_data.pkl", "rb") as f:
        chunks = pickle.load(f)

    print("Searching for best match...")
    D, I = index.search(query_emb, k=5)
    best_match = chunks[I[0][0]]

    full_text = best_match.get("text", "").strip()
    answer = extract_relevant_paragraph(full_text, user_query)

    metadata = {
        "Subject": best_match.get("subject", "Unknown"),
        "Chapter": best_match.get("chapter", "Unknown"),
        "Topic": best_match.get("topic", "Unknown"),
        "Page Number": best_match.get("page", "Unknown"),
        "Source File": best_match.get("source_file", "Unknown")
    }

    if metadata["Subject"] == "Unknown":
        metadata["Subject"] = guess_subject_from_filename(metadata["Source File"])

    image_path = best_match.get("image_path", None)

    print("Loading or generating question embeddings...")
    if not os.path.exists("question_embeddings.pkl"):
        df = pd.read_csv("DatasetQA.csv")
        questions = df['question_text'].astype(str).tolist()
        q_embs = model.encode(questions)
        with open("question_embeddings.pkl", "wb") as f:
            pickle.dump((questions, q_embs), f)
    else:
        with open("question_embeddings.pkl", "rb") as f:
            questions, q_embs = pickle.load(f)

    print("Finding similar questions...")
    sim_scores = np.dot(q_embs, query_emb[0]) / (np.linalg.norm(q_embs, axis=1) * np.linalg.norm(query_emb[0]))
    top_idxs = np.argsort(sim_scores)[-3:][::-1]
    similar_qs = list(dict.fromkeys([clean_html(questions[i]) for i in top_idxs]))

    return {
        "Answer": answer,
        "Metadata": metadata,
        "Similar Questions": similar_qs,
        "Image Preview": image_path
    }


Upload 'DatasetQA.csv', 'chunk_data.pkl', and 'ncert_index'


In [7]:
# Ask user for input
user_query = input("Enter your question: ").strip()
result = answer_query(user_query)
# Display result
print("\nAnswer:")
print(result['Answer'])

print("\nMetadata:")
for k, v in result["Metadata"].items():
    print(f"{k}: {v}")

print("\nSimilar Questions:")
for q in result["Similar Questions"]:
    print(f"- {q}")

if result["Image Preview"]:
    print("\nImage Preview Path:", result["Image Preview"])
else:
    print("\nNo image associated.")

Enter your question: define haloalkanes
Encoding query...
Loading FAISS index and chunks...
Searching for best match...
Loading or generating question embeddings...
Finding similar questions...

Answer:
160
Chemistry
Haloalkanes and haloarenes may be classified as follows:
These may be classified as mono, di, or polyhalogen (tri-,tetra-, etc.)
compounds depending on whether they contain one, two or more halogen
atoms in their structures. For example,
Monohalocompounds may further be classified according to the
hybridisation of the carbon atom to which the halogen is bonded, as
discussed below. This class includes
(a) Alkyl halides or haloalkanes (R—X)
In alkyl halides, the halogen atom is bonded to an alkyl group (R).

Metadata:
Subject: Unknown
Chapter: Haloalkanes and Haloarenes
Topic: 160
Page Number: 2
Source File: Haloalkanes and Haloarenes.pdf

Similar Questions:
- Which of the following is NOT a halogen gas?

No image associated.
