In [1]:
# Install into the running kernel's environment (%pip rather than a subshell !pip).
# PyPDF2 is pinned to the version this notebook was last run with; pin
# transformers/torch as well once their working versions are known.
%pip install transformers torch PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
# Colab-only upload step. In the recorded run this saved chapter1.pdf into the
# runtime's working directory; the QA cell later reads it from
# /content/chapter1.pdf. `uploaded` itself is not referenced by the cells
# visible in this notebook.
from google.colab import files

uploaded = files.upload()

Saving chapter1.pdf to chapter1.pdf


In [4]:
from transformers import pipeline
from PyPDF2 import PdfReader


def extract_clean_text(pdf_path):
    """Return the filtered text of a PDF as one space-separated string.

    Every page of ``pdf_path`` is read through ``PdfReader``; pages whose
    ``extract_text()`` result is empty or ``None`` are skipped.  Each line of
    the remaining pages is stripped and then dropped when it

    * is shorter than 40 characters,
    * contains the substring "figure" (case-insensitive), or
    * contains the copyright sign.

    Args:
        pdf_path: Path (or whatever ``PdfReader`` accepts) of the PDF to read.

    Returns:
        The surviving lines, each followed by a single space, concatenated in
        document order.  A non-empty result therefore ends with a space; a
        document with no surviving lines yields ``""``.
    """

    def _keep(candidate):
        # A line survives only if it passes all three filters.
        return (
            len(candidate) >= 40
            and "figure" not in candidate.lower()
            and "©" not in candidate
        )

    kept_lines = []
    for pdf_page in PdfReader(pdf_path).pages:
        page_text = pdf_page.extract_text()
        if page_text:
            stripped_lines = (raw.strip() for raw in page_text.split("\n"))
            kept_lines.extend(entry for entry in stripped_lines if _keep(entry))

    # Each kept line carries its own trailing space, matching the original output.
    return "".join(entry + " " for entry in kept_lines)


def chunk_text(text, chunk_size=350):
    """Lazily split ``text`` into consecutive chunks of whitespace-separated words.

    Args:
        text: String to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (default 350).

    Yields:
        Strings of up to ``chunk_size`` words joined by single spaces, in
        order and without overlap.  Only the final chunk may be shorter.
        Empty or whitespace-only text yields nothing.
    """
    tokens = text.split()
    window_starts = range(0, len(tokens), chunk_size)
    yield from (
        " ".join(tokens[start:start + chunk_size]) for start in window_starts
    )


# Tunable settings for the interactive quiz loop.
PDF_PATH = "/content/chapter1.pdf"  # Colab path of the uploaded PDF
MIN_SCORE = 0.15                    # below this best score the answer is reported as not found
MIN_ANSWER_WORDS = 4                # shorter answers are reported as "not explicitly stated"

# Load QA model
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad"
)

# Load PDF text and split it once: the chunks do not depend on the question,
# so there is no need to re-chunk the document for every query.
pdf_text = extract_clean_text(PDF_PATH)
pdf_chunks = list(chunk_text(pdf_text))

print("\nStudy Buddy (PDF Quizzer)")
print("Type 'exit' to quit\n")


while True:
    question = input("Ask a question: ").strip()

    if question.lower() == "exit":
        print("Goodbye!")
        break

    if not question:
        # The question-answering pipeline rejects an empty question with a
        # ValueError, which would end the session; prompt again instead.
        print("Please type a question, or 'exit' to quit.")
        continue

    # Ask the model about every chunk and keep the highest-scoring answer.
    best_answer = ""
    best_score = 0

    for chunk in pdf_chunks:
        result = qa_pipeline(
            question=question,
            context=chunk
        )

        if result["score"] > best_score:
            best_score = result["score"]
            best_answer = result["answer"]

    if best_score < MIN_SCORE:
        print("Answer: Not found in the document.")
    elif len(best_answer.split()) < MIN_ANSWER_WORDS:
        print("Answer: Definition not explicitly stated in the document.")
    else:
        print("Answer:", best_answer)

    print("-*" * 25)


Device set to use cpu



Study Buddy (PDF Quizzer)
Type 'exit' to quit

Ask a question: What are Generative AI techniques used for?
Answer: when working with pictures or visual data
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
Ask a question: What does autoregressive refer to?
Answer: Definition not explicitly stated in the document.
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
Ask a question: What does the discriminator differentiate between?
Answer: source and produced data
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
Ask a question: What is Latent Dirichlet Allocation?
Answer: a generative probabilistic model
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
Ask a question: exit
Goodbye!
