<a href="https://colab.research.google.com/github/Anasuya11/DS-Tasks/blob/main/Generating_MCA_Questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
import random
from PyPDF2 import PdfReader

# Load the `en_core_web_sm` spaCy pipeline once at module level; it is the
# parser that generate_mca_from_paragraph() calls on every paragraph.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are joined with a newline so that the last line of one page and the
    first line of the next do not fuse into a single line when the result is
    later split on ``"\\n"``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, separated by newlines.
        An empty string if the document has no pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # Extract while the file is still open: the reader is backed by this
        # handle. `or ""` keeps the join safe in case a page yields no text.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

def generate_mca_from_paragraph(paragraph):
    """Build one multiple-correct-answer (MCA) question from a paragraph.

    The paragraph is parsed with the module-level spaCy pipeline ``nlp``. One
    root verb is picked at random; its first subject and first object child
    form the question stem and the first correct answer. When the paragraph
    contains another noun or proper noun, one is picked at random as a second
    correct answer. The remaining slots are filled with distinct named
    entities so that exactly four unique options are returned.

    Args:
        paragraph: Text to turn into a question.

    Returns:
        A ``(question, all_choices, correct_choices)`` tuple, where
        ``all_choices`` is a shuffled list of four distinct option strings
        that contains every entry of ``correct_choices``. ``None`` is returned
        when the paragraph has no root verb, the chosen verb lacks a subject
        or an object, or there are not enough distinct named entities to fill
        the remaining options.
    """
    total_choices = 4
    doc = nlp(paragraph)

    # Extract main verbs, which often carry the main action or idea in a sentence
    main_verbs = [token for token in doc if token.pos_ == "VERB" and token.dep_ == "ROOT"]
    if not main_verbs:
        return None

    main_verb = random.choice(main_verbs)
    subj = [child for child in main_verb.children if "subj" in child.dep_]
    obj = [child for child in main_verb.children if "obj" in child.dep_]
    if not subj or not obj:
        return None

    question = f"What did {subj[0].text} {main_verb.lemma_} in the context?"
    correct_choices = [obj[0].text]

    # Add another relevant entity/keyword from the sentence as a correct choice
    relevant_terms = [
        token.text
        for token in doc
        if token.pos_ in ('NOUN', 'PROPN') and token.text not in correct_choices
    ]
    if relevant_terms:
        correct_choices.append(random.choice(relevant_terms))

    # De-duplicate entity texts (order-preserving) so the same option cannot
    # appear twice when an entity is mentioned more than once.
    distractors = list(dict.fromkeys(
        ent.text for ent in doc.ents if ent.text not in correct_choices
    ))

    # The number of distractors required depends on how many correct choices
    # were found (1 or 2). Checking against a fixed "2" would let
    # random.sample raise ValueError when only one correct choice exists.
    distractors_needed = total_choices - len(correct_choices)
    if len(distractors) < distractors_needed:
        return None

    all_choices = correct_choices + random.sample(distractors, distractors_needed)
    random.shuffle(all_choices)

    return question, all_choices, correct_choices


def extract_questions_from_text(text):
    """Generate questions from each non-empty line of *text*.

    The text is split on newline characters; every non-empty line is passed to
    ``generate_mca_from_paragraph`` and the truthy results are collected in
    their original order.

    Args:
        text: The raw text to split into lines.

    Returns:
        A list of the question tuples produced, possibly empty.
    """
    # Lazily evaluate one candidate per non-empty line, in document order.
    candidates = (
        generate_mca_from_paragraph(line)
        for line in text.split("\n")
        if line
    )
    # Lines that could not be turned into a question yield None; drop them.
    return [candidate for candidate in candidates if candidate]

# Hard-coded paths (under /content) of the chapter PDFs to generate questions from.
pdf_files = ["/content/internship-assignment-nlp/Dataset/chapter-2.pdf", "/content/internship-assignment-nlp/Dataset/chapter-3.pdf", "/content/internship-assignment-nlp/Dataset/chapter-4.pdf"]

# Collect the question tuples produced from every PDF into one flat list.
all_questions = []
for pdf_file in pdf_files:
    text = extract_text_from_pdf(pdf_file)
    questions = extract_questions_from_text(text)
    all_questions.extend(questions)

# Print each question with its numbered options, followed by the option
# numbers of the correct answers.
for idx, (q, options, correct) in enumerate(all_questions, 1):
    print(f"Q{idx}: {q}")
    for i, option in enumerate(options, 1):
        print(f"{i}. {option}")
    # list.index gives the 0-based position of the first match; +1 converts it
    # to the 1-based numbering used when printing the options above.
    correct_indices = [str(options.index(c) + 1) for c in correct]
    print(f"Correct Answers: {', '.join(correct_indices)}\n")
