In [None]:
# Install dependencies
!pip install transformers torch pymupdf

# --- Imports ---
import fitz  # PyMuPDF for reading PDFs
import re
from transformers import pipeline
from google.colab import files

# --- Upload PDF ---
# Prompt the user, then open the Colab file-upload dialog. The returned
# object is iterated later in this script to obtain the uploaded file name(s).
# NOTE(review): presumably a dict mapping file name -> file bytes — confirm
# against the google.colab documentation.
print("📄 Upload your PDF file:")
uploaded_pdf = files.upload()

# --- Clean extracted text ---
def clean_text(text):
    """Normalize whitespace in text extracted from a PDF.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into a
    single space, and leading/trailing whitespace is removed.
    """
    # Splitting on whitespace discards empty fields at both ends, so
    # re-joining with single spaces collapses and trims in one step.
    return ' '.join(text.split())

# --- Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    """Return the whitespace-normalized text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order and passed through
        ``clean_text``.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails on some page.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        full_text = "".join(page.get_text() for page in doc)
    return clean_text(full_text)

# --- Chunk the text (in 500-word blocks for better context) ---
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whitespace-separated tokens; each chunk is returned as a single
    space-joined string. The final chunk may hold fewer words than the rest.
    Empty or whitespace-only text yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# --- Load QA Model (more accurate Roberta model) ---
# The same checkpoint supplies both the model weights and the tokenizer,
# so name it once and reuse it for both arguments.
_QA_CHECKPOINT = "deepset/roberta-base-squad2"
qa_pipeline = pipeline(
    "question-answering",
    model=_QA_CHECKPOINT,
    tokenizer=_QA_CHECKPOINT,
)

# --- Find top 2 answers across all chunks ---
def ask_question_from_chunks(chunks, question, top_k=2):
    """Run the QA model over every chunk and return the best-scoring answers.

    Args:
        chunks: Iterable of text passages; each one is used as the context
            for a separate call to ``qa_pipeline``.
        question: The question to ask against each passage.
        top_k: Maximum number of answers to return.

    Returns:
        A list of the result dicts returned by ``qa_pipeline``, ordered by
        descending ``'score'`` and truncated to ``top_k``. When no chunk
        produced a result, a single placeholder dict with an ``'answer'``
        message and a ``'score'`` of 0.0 is returned instead.
    """
    import logging  # local import keeps this function self-contained

    logger = logging.getLogger(__name__)
    results = []
    for index, chunk in enumerate(chunks):
        # Chunks with fewer than 50 non-padding characters are skipped
        # without calling the model.
        if len(chunk.strip()) < 50:
            continue
        try:
            results.append(qa_pipeline(question=question, context=chunk))
        except Exception as exc:
            # Best effort: one failing chunk must not abort the whole search.
            # Catching ``Exception`` rather than using a bare ``except`` lets
            # KeyboardInterrupt/SystemExit propagate, so a long-running
            # inference loop can still be interrupted by the user.
            logger.warning("Skipping chunk %d: %s", index, exc)

    if not results:
        return [{"answer": "No confident answer found.", "score": 0.0}]

    # Highest-confidence answers first.
    results.sort(key=lambda item: item['score'], reverse=True)
    return results[:top_k]

# --- Extract and prepare PDF content ---
# Combine the text of every uploaded file so questions can be answered from
# any of them (each extracted text is already whitespace-normalized, so a
# single space is enough to keep word boundaries between files).
pdf_text = " ".join(extract_text_from_pdf(file_name) for file_name in uploaded_pdf)

# --- Create chunks from cleaned PDF text ---
chunks = chunk_text(pdf_text, 500)
print(f"\n✅ PDF loaded and split into {len(chunks)} chunk(s). Ready to answer questions.")
if not chunks:
    # Without any extracted text, every question would fall through to the
    # "no answer" placeholder; tell the user why up front.
    print("⚠️ No text could be extracted from the uploaded PDF, so no answers can be found.")

# --- Ask questions loop ---
# Words that end the session; "quit" is accepted as well because the prompt
# itself uses that word and users naturally type it.
EXIT_COMMANDS = {"exit", "quit"}

while True:
    # Strip surrounding whitespace so inputs like "exit " are still recognized.
    question = input("\n❓ Ask a question about the PDF (or type 'exit' to quit): ").strip()
    if question.lower() in EXIT_COMMANDS:
        print("👋 Exiting. Thanks for using the QA system.")
        break
    if not question:
        # Nothing to ask; prompt again instead of sending an empty
        # question to the model.
        continue

    top_answers = ask_question_from_chunks(chunks, question)
    for i, ans in enumerate(top_answers, 1):
        print(f"\n🔹 Answer {i}: {ans['answer']}\n   🎯 Confidence: {ans['score']:.2f}")


📄 Upload your PDF file:


Saving case study.pdf to case study (1).pdf


Device set to use cpu



✅ PDF loaded and split into 5 chunk(s). Ready to answer questions.

❓ Ask a question about the PDF (or type 'exit' to quit): what's this pdf about?

🔹 Answer 1: the nature of ethics
   🎯 Confidence: 0.18

🔹 Answer 2: legal frameworks and policies mentioned in the slides that are relevant to professional practices
   🎯 Confidence: 0.04

❓ Ask a question about the PDF (or type 'exit' to quit): what's the main idea of it?

🔹 Answer 1: to avoid causing damage or negative consequences
   🎯 Confidence: 0.64

🔹 Answer 2: what is to be produced/delivered
   🎯 Confidence: 0.56

❓ Ask a question about the PDF (or type 'exit' to quit): quit
