In [7]:
# Core libraries
!pip install spacy nltk transformers torch

# Install the spaCy English model
!python -m spacy download en_core_web_sm

# Download NLTK data
!python -m nltk.downloader punkt wordnet omw

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to /root/nltk_data...


In [8]:
import random
import spacy
import nltk
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Ensure required NLTK data is available
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# ——— Setup —————————————————————————————
nlp = spacy.load("en_core_web_sm")
ner = pipeline("ner", aggregation_strategy="simple")
model_name = "fares7elsadek/t5-base-finetuned-question-generation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_qa(context: str, answer: str = "[MASK]", max_len=64):
    input_text = f"context: {context} answer: {answer} </s>"
    inputs = tokenizer([input_text], return_tensors="pt", truncation=True, padding=True)
    output = qg_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_len,
        num_beams=5,
        early_stopping=True
    )
    text = tokenizer.decode(output[0], skip_special_tokens=True)

    if "question:" in text and "answer:" in text:
        q, a = text.split("answer:")
        return q.replace("question:", "").strip(), a.strip()
    if "?" in text:
        q, a = text.split("?", 1)
        return (q + "?").strip(), a.strip()
    return None, None

def get_distractors_wordnet(word):
    key = word.replace(" ", "_")
    synsets = wordnet.synsets(key, pos='n')
    if not synsets:
        return []
    hyper = synsets[0].hypernyms()
    if not hyper:
        return []
    hypos = hyper[0].hyponyms()
    distros = [lemma.name().replace("_", " ") for h in hypos for lemma in h.lemmas()]
    return list(set(distros) - {word})[:3]

def generate_mcqs(text: str, num_questions=5):
    doc = nlp(text)
    sentences = [s.text.strip() for s in doc.sents if len(s.text.split()) >= 6]
    mcqs = []

    for sent in random.sample(sentences, min(num_questions, len(sentences))):
        print("➡️ Processing:", sent)
        ents = [e['word'] for e in ner(sent) if e['entity_group'] in ("PER","LOC","ORG","MISC")]
        print("   Entities:", ents)
        if not ents:
            continue

        answer_ent = ents[0]
        question, answer = generate_qa(sent, answer=answer_ent)
        print("   Generated Q/A:", question, "/", answer)
        if not question or not answer:
            continue

        distractors = get_distractors_wordnet(answer)
        print("   Distractors:", distractors)
        if len(distractors) < 3:
            continue

        options = [answer] + random.sample(distractors, 3)
        random.shuffle(options)
        correct = chr(65 + options.index(answer))
        mcqs.append({
            "question": question,
            "options": options,
            "answer": correct
        })

    return mcqs

# ——— Test Run ——————————————————————————————
if __name__ == "__main__":
    sample_text = """
    BERT is a transformer-based model developed by Google. It is widely used for natural language processing tasks such as question answering and text classification.
    The Eiffel Tower was constructed in 1889 for the World's Fair in Paris.
    """
    mcqs = generate_mcqs(sample_text, num_questions=3)
    print("\n🎓 Generated MCQs:")
    for i, q in enumerate(mcqs, 1):
        print(f"Q{i}. {q['question']}")
        for idx, opt in enumerate(q['options'], start=65):
            print(f"   {chr(idx)}) {opt}")
        print(f"Answer: {q['answer']}\n{'-'*40}")


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


➡️ Processing: BERT is a transformer-based model developed by Google.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


   Entities: ['BERT', 'Google']
   Generated Q/A: What is a transformer-based model developed by Google? / BERT
   Distractors: []
➡️ Processing: It is widely used for natural language processing tasks such as question answering and text classification.
   Entities: []
➡️ Processing: The Eiffel Tower was constructed in 1889 for the World's Fair in Paris.
   Entities: ['Eiffel Tower', "World ' s Fair", 'Paris']
   Generated Q/A: What was built for the World's Fair in Paris? / Eiffel Tower
   Distractors: []

🎓 Generated MCQs:
