In [None]:
import os
import pdfplumber
import faiss
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import login

# Read the Hugging Face token from the HF_TOKEN environment variable.
# NOTE(review): Colab secrets are normally read through google.colab.userdata,
# not os.environ -- confirm HF_TOKEN is actually exported to the environment.
hf_token = os.environ.get('HF_TOKEN')

# Authenticate only when a token is available. Calling login() with None falls
# back to an interactive prompt, which would stall a non-interactive Streamlit
# run; the checkpoints loaded below are presumably downloadable anonymously.
if hf_token:
    login(hf_token)

# Load models once at module level: the sentence embedder used for retrieval
# and the text2text-generation pipeline used to produce answers.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
qa_model = pipeline("text2text-generation", model="google/flan-t5-base")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Value handed unchanged to ``pdfplumber.open`` (the calls in
            this notebook pass a filesystem path).

    Returns:
        str: The per-page text joined by ``"\\n"``. A page for which
        ``extract_text()`` yields nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. a scanned image); `or ""` keeps such pages from raising.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    return "\n".join(page_texts)

def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Args:
        text: String to split. Words are found with ``str.split()``, so any
            run of whitespace (spaces, tabs, newlines) acts as one separator.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        list[str]: Chunks in document order, each with its words re-joined by
        single spaces. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step makes range() empty, which would silently discard
        # the whole document; fail loudly instead (0 already raised).
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def embed_chunks(chunks):
    """Encode text chunks with the module-level ``embedder``.

    Args:
        chunks: The chunk strings to encode, passed unchanged to
            ``embedder.encode``.

    Returns:
        Whatever ``embedder.encode`` returns for ``chunks``; the rest of this
        file feeds it to ``cosine_similarity`` as the chunk embedding matrix.
    """
    chunk_vectors = embedder.encode(chunks)
    return chunk_vectors

def get_top_chunks(query, chunks, chunk_embeddings, top_k=3):
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: Question text; encoded with the module-level ``embedder``.
        chunks: Chunk strings, index-aligned with ``chunk_embeddings``.
        chunk_embeddings: Embeddings of ``chunks`` (one row per chunk), as
            produced by ``embed_chunks``.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks ordered by descending cosine
        similarity to the query. Empty when ``top_k`` is not positive or
        there are no chunks.
    """
    # Guard first: `[-0:]` slices the WHOLE array, so top_k=0 used to return
    # every chunk, and an empty embedding matrix cannot be scored at all.
    if top_k <= 0 or len(chunks) == 0:
        return []
    query_embedding = embedder.encode([query])
    # Row 0 holds the similarities of the single query against every chunk.
    sims = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # argsort is ascending: take the last top_k indices, then reverse them.
    top_indices = sims.argsort()[-top_k:][::-1]
    return [chunks[i] for i in top_indices]

def answer_question(context, question):
    """Generate an answer to ``question`` from ``context`` with ``qa_model``.

    Args:
        context: Text inserted after ``Context:`` in the prompt.
        question: Text inserted after ``Question:`` in the prompt.

    Returns:
        The ``'generated_text'`` field of the first result returned by the
        module-level ``qa_model`` pipeline.

    NOTE(review): the prompt is sent without any truncation argument; a
    recorded run of this notebook logged a tokenizer warning (847 > 512
    tokens), so long contexts may exceed the model's nominal input length.
    """
    prompt = (
        "Answer the question based on the context:\n"
        f"Context: {context}\n"
        f"Question: {question}"
    )
    generations = qa_model(prompt, max_length=200)
    return generations[0]['generated_text']

# Streamlit UI: upload a PDF, ask a question, show the generated answer.
st.title("🧠 Free Document Q&A Assistant")
pdf_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Ask a question about the document:")

# Run the pipeline only once both a file and a question have been provided.
if pdf_file and question:
    with st.spinner("Processing..."):
        # Pass the uploaded file object straight to the extractor
        # (pdfplumber.open accepts a file-like object). This avoids writing a
        # fixed "temp.pdf" into the working directory, which was shared by
        # every session and never cleaned up.
        raw_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(raw_text)
        if chunks:
            chunk_embeddings = embed_chunks(chunks)
            top_chunks = get_top_chunks(question, chunks, chunk_embeddings)
            context = " ".join(top_chunks)
            answer = answer_question(context, question)
        else:
            # No words were extracted (e.g. a scanned PDF): there is nothing
            # to embed or retrieve, so skip the models entirely.
            answer = None

    if answer is None:
        st.warning("No extractable text was found in this PDF.")
    else:
        st.subheader("📄 Answer:")
        st.write(answer)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Device set to use cpu


In [None]:
from google.colab import files

# Ask the user to upload a PDF through Colab's file-upload helper. The result
# is kept in `uploaded`; a later cell reads its keys as the uploaded file names.
uploaded = files.upload()


Saving The Indian Constitution is the supreme law of India and governs the country’s political framework.pdf to The Indian Constitution is the supreme law of India and governs the country’s political framework.pdf


In [None]:
# Pick the first uploaded file; the keys of `uploaded` are used as file names.
uploaded_names = list(uploaded.keys())
pdf_path = uploaded_names[0]

# Run that file through the PDF text extractor defined earlier in the notebook.
pdf_text = extract_text_from_pdf(pdf_path)

# Sanity check: show only the first 1000 characters of the extracted text.
preview = pdf_text[:1000]
print(preview)



The Indian Constitution is the supreme law of India and governs the country’s political
framework. It was adopted by the Constituent Assembly of India on 26th November 1949
and came into effect on 26th January 1950. This day is celebrated as Republic Day in India.
Key Facts about the Indian Constitution:
1. Length: It is the longest written constitution in the world, containing around 450
articles, 12 schedules, and 5 parts. Over time, it has been amended several times to
adapt to changing needs.
2. Structure:
○ Preamble: The preamble serves as an introductory statement and outlines the
goals and ideals of the Constitution. It declares India to be a Sovereign,
Socialist, Secular, Democratic Republic.
○ Parts: The Constitution is divided into 22 parts.
○ Schedules: There are 12 schedules that provide details on various aspects such
as the distribution of powers, provisions related to states, and the Union
Territories.
3. Fundamental Rights (Part III): The Constitution guarantees basic r

In [None]:
# Question to ask about the uploaded document.
question = "What is the importance of the Preamble in the Indian Constitution?"

# Split the extracted text into word-based chunks (chunk_text defaults to 500 words each).
chunks = chunk_text(pdf_text)

# Encode every chunk with the sentence embedder.
chunk_embeddings = embed_chunks(chunks)

# Retrieve the chunks most similar to the question (get_top_chunks defaults to the top 3).
top_chunks = get_top_chunks(question, chunks, chunk_embeddings)

# Concatenate the retrieved chunks into a single context string for the prompt.
# NOTE(review): with the defaults above the context can reach ~1500 words; the
# recorded run of this cell logged a tokenizer warning (847 > 512 tokens).
context = " ".join(top_chunks)

# Generate the answer from the context with answer_question.
answer = answer_question(context, question)

# Display the answer
print("Answer:", answer)


Token indices sequence length is longer than the specified maximum sequence length for this model (847 > 512). Running this sequence through the model will result in indexing errors


Answer: serves as an introductory statement and outlines the goals and ideals of the Constitution
