In [1]:
import fitz  # PyMuPDF for PDF extraction
from transformers import pipeline
from multiprocessing import Pool

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages, in page order, joined with newlines.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def chunk_text(text, max_tokens=512):
    """Split text into consecutive chunks of at most ``max_tokens`` words.

    Note that despite the parameter name, the unit is whitespace-separated
    words (``str.split``), not tokenizer tokens.

    Args:
        text: The text to split. All runs of whitespace are collapsed to a
            single space in the returned chunks.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no chunks at all, so reject both up front.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")

    words = text.split()
    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

# Lazily created question-generation pipeline, reused by later calls in the
# same process (each multiprocessing worker ends up with its own instance).
_QG_PIPELINE = None


def _get_qg_pipeline():
    """Return the question-generation pipeline, building it on first use."""
    global _QG_PIPELINE
    if _QG_PIPELINE is None:
        _QG_PIPELINE = pipeline("text2text-generation", model="valhalla/t5-small-qg-hl")
    return _QG_PIPELINE


def generate_questions(text):
    """Generate questions for ``text`` using a small T5 question-generation model.

    The pipeline is constructed once per process and reused, instead of
    reloading the model and tokenizer on every call.

    Args:
        text: The passage to generate questions from.

    Returns:
        The output of the ``text2text-generation`` pipeline for ``text``,
        returned unchanged.
    """
    return _get_qg_pipeline()(text)

def process_chunk(chunk):
    """Worker entry point: produce the generated questions for one text chunk."""
    generated = generate_questions(chunk)
    return generated

if __name__ == "__main__":
    # Ask for the document location; surrounding whitespace is dropped.
    pdf_path = input("Enter the path to your PDF file: ").strip()

    # Pull the text out of the PDF and show its first 500 characters.
    pdf_text = extract_text_from_pdf(pdf_path)
    preview = pdf_text[:500]
    print("\nExtracted Text (Preview):\n", preview, "...")

    # Break the text into chunks and report how many were produced.
    chunks = chunk_text(pdf_text)
    chunk_count = len(chunks)
    print(f"\nSplit into {chunk_count} chunks.")

    # Fan the chunks out over a pool of four worker processes.
    print("\nGenerating Questions...\n")
    with Pool(processes=4) as workers:
        results = workers.map(process_chunk, chunks)

    # Report the questions chunk by chunk, numbering chunks from 1.
    print("\nGenerated Questions:\n")
    for number, generated in enumerate(results, start=1):
        print(f"Chunk {number}:")
        for item in generated:
            print("-", item["generated_text"])

  from .autonotebook import tqdm as notebook_tqdm


Enter the path to your PDF file:  /home/anjana/Project/generator/question_papers/note.pdf



Extracted Text (Preview):
 
1. Functions
1.1
Introduction to Functions
Every C program must have a main function to indicate where the program has to begin its
execution. If a program is written only using a single main function, the program becomes too
large and complex and as a result, the task of debugging, testing and maintaining becomes difﬁcult.
C functions can be classiﬁed into two categories:
• Library Functions: Not required to be writtenby the programmer. e.g. printf(), scanf(),
strlen() etc.
• User-deﬁned funct ...

Split into 6 chunks.

Generating Questions...



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using the default legacy behaviour of the <class 'transformers.model


Generated Questions:

Chunk 1:
- What is the name of the function that is used to describe a program?
Chunk 2:
- What is the name of the function declaration?
Chunk 3:
- What is the name of the example for a function with no arguments and no return value?
Chunk 4:
- What is the name of the variable that is declared inside a function?
Chunk 5:
- What is the name of the declaration of external variable?
Chunk 6:
- What is the name of the register variable?
