<a href="https://colab.research.google.com/github/FaraazArsath/Akaike-Assignment/blob/main/nlp_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Creation of Objective Questions with Multiple Correct Answers using generative text models**

In [1]:
!pip install PyMuPDF
!pip install transformers
!pip install sentencepiece

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.5 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.5 PyMuPDFb-1.23.5
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
import fitz  # PyMuPDF
import spacy
import random
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the paraphrasing model and tokenizer
paraphrase_model_name = "t5-large"
paraphrase_model = T5ForConditionalGeneration.from_pretrained(paraphrase_model_name)
paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")


def read_entire_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""

    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()

    return text


def tokenize_sentences(text):
    doc = nlp(text)
    return [sent.text for sent in doc.sents]


def extract_keywords(text):
    doc = nlp(text)
    keywords = [token.text for token in doc if token.is_alpha]
    return list(set(keywords))


def map_sentences_to_keywords(sentences, keywords):
    keyword_sentence_mapping = {}
    for keyword in keywords:
        keyword_sentence_mapping[keyword] = [sent for sent in sentences if keyword.lower() in sent.lower()]
    return keyword_sentence_mapping


def paraphrase_question(question):
    input_ids = paraphrase_tokenizer(question, return_tensors="pt")["input_ids"]
    paraphrased_ids = paraphrase_model.generate(input_ids, num_beams=4, num_return_sequences=1)
    paraphrased_question = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
    return paraphrased_question


def get_mca_questions(context: str):
    sentences = tokenize_sentences(context)
    keywords = extract_keywords(context)
    keyword_sentence_mapping = map_sentences_to_keywords(sentences, keywords)
    mca_questions = []

    for i in range(3):
        keyword = random.choice(keywords)
        sentence = random.choice(keyword_sentence_mapping[keyword])

        # Paraphrase the sentence
        paraphrased_sentence = paraphrase_question(sentence)

        words = [token.text for token in nlp(paraphrased_sentence) if token.is_alpha]
        options = random.sample(words, 4)  # Randomly select 4 words for options

        correct_options = random.sample(["(a)", "(b)", "(c)", "(d)"], 2)

        options = random.sample(options, 4)  # Randomly shuffle options
        options = [option.capitalize() for option in options]  # Capitalize options
        formatted_question = f"Q{i+1}: {paraphrased_sentence.strip()}?\n"  # Add a question mark
        formatted_question += f"(a) {options[0]}\n(b) {options[1]}\n(c) {options[2]}\n(d) {options[3]}\n"
        formatted_question += f"Correct Options: {' & '.join(correct_options)}"  # Indicate two correct options
        mca_questions.append(formatted_question)

    return mca_questions


pdf_path = "chapter-2.pdf"
context = read_entire_pdf(pdf_path)  # Define context here
mca_questions = get_mca_questions(context)

# Print the generated questions with two lines between each set
for i, question in enumerate(mca_questions, start=1):
    print(question)
    if i < len(mca_questions):
        print("\n")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Q1: he was himself cross-examined by the British Parliament in 1772.?
(a) Cross
(b) The
(c) British
(d) Himself
Correct Options: (a) & (b)


Q2: . His office – the Collectorate – became the new centre of power and?
(a) Power
(b) Became
(c) Of
(d) Centre
Correct Options: (d) & (c)


Q3: . The fine qualities of cotton and silk produced in India had a big market in?
(a) Qualities
(b) In
(c) Cotton
(d) Produced
Correct Options: (c) & (b)
