In [None]:
! pip install spacy nltk==3.5.0 transformers torch

In [None]:
! python3 -m spacy download en_core_web_lg

In [224]:
input_file = "input/universe.txt"
with open(input_file,encoding="Utf-8") as f:
    text = f.read()

In [225]:
import spacy 
import re 
nlp = spacy.load('en_core_web_lg')

def preprocess(text):
    text = text.replace("\n"," ")
    text = re.sub(r'\s+', ' ', text).strip()
    doc = nlp(text)
    sents = []
    for sent in doc.sents:
        noun_count = sum(1 for token in sent if token.pos_ == 'NOUN')
        if noun_count >= 2:
            sents.append(sent.text)
    return sents

In [226]:
sentences = preprocess(text)
sentences

['The universe is vast and filled with mysteries that continue to captivate scientists and astronomers alike.',
 'From the depths of space to the farthest reaches of distant galaxies, the cosmos holds countless wonders waiting to be explored.',
 'One of the fundamental concepts in astrophysics is the Big Bang theory, which posits that the universe originated from a singular, infinitely dense point nearly 13.8 billion years ago.',
 'Over time, the universe expanded and cooled, giving rise to the formation of galaxies, stars, and planets.',
 'Galaxies are immense systems containing billions or even trillions of stars, as well as various types of interstellar matter such as gas, dust, and dark matter.',
 'The Milky Way, our home galaxy, is a spiral galaxy containing hundreds of billions of stars, including our own Sun.',
 'Stars are the celestial objects that shine brightly in the night sky, fueled by nuclear fusion reactions occurring in their cores.',
 'They come in a variety of sizes, 

In [227]:
import random
def select_random_sents(sentences,max_choice=5):
    random_sents = random.sample(sentences,max_choice)
    return random_sents

In [228]:
chosen_sents = select_random_sents(sentences)
chosen_sents

['Cosmology, the scientific study of the origin, evolution, and eventual fate of the universe, seeks to answer some of the most profound questions about our existence.',
 'The study of black holes, mysterious regions of spacetime where gravity is so strong that nothing, not even light, can escape, is another area of active research in astrophysics.',
 'Planets orbit stars and come in different types, including terrestrial planets like Earth, gas giants like Jupiter, and icy worlds like Neptune.',
 'Missions to the Moon, Mars, and beyond have expanded our understanding of the universe and laid the groundwork for future exploration and colonization of other worlds.',
 'Scientists study the conditions on other planets and moons in our solar system, as well as exoplanets orbiting distant stars, in the hope of finding signs of life elsewhere in the universe.']

In [229]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_distractors_wordnet(syn, word):
    distractors = []
    word = word.lower().replace(" ", "_")
    orig_word = word
    hypernyms = syn.hypernyms()
    if not hypernyms:
        return distractors
    for item in hypernyms[0].hyponyms():
        name = item.lemmas()[0].name()
        if name == orig_word:
            continue
        name = " ".join(w.capitalize() for w in name.replace("_", " ").split())
        if name and name not in distractors:
            distractors.append(name)
    return distractors

[nltk_data] Downloading package wordnet to /home/eps/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [230]:
ner_dict = {}
doc = nlp(text)
print("Named Entities:")
for ent in doc.ents:
    if not ent.label_ in ner_dict:
        ner_dict[ent.label_] = []
    ner_dict[ent.label_].append(ent.text)
ner_dict

Named Entities:


{'CARDINAL': ['One', 'hundreds of billions', 'eight'],
 'MONEY': ['nearly 13.8 billion', 'billions or even trillions'],
 'DATE': ['years ago'],
 'ORG': ['Sun', 'Neptune', 'Sun'],
 'PERSON': ['Sun'],
 'LOC': ['Earth', 'Jupiter', 'Earth', 'Mars', 'Earth']}

In [231]:
for sent in sentences[:3]:
    doc = nlp(sent)
   
    ner_nouns = [ent.text
             for ent in doc.ents 
             if ent.label_ and nlp(ent.text)[0].pos_ == "NOUN"]
    if ner_nouns:
        answer_word = ner_nouns[0]
        print(answer_word)
        related = []

        for key, values in ner_dict.items():
            if answer_word in values:
                related = [v for v in values if v != answer_word]
        print(related)  
    else: 
       
        nouns = [token.lemma_.lower() for token in doc if token.pos_ == "NOUN"]
        noun_freq = Counter(nouns)
        answer_word = noun_freq.most_common(1)[0][0]
        print(answer_word)
        synset = wn.synsets(answer_word, 'n')
        if synset:
            distractors = get_distractors_wordnet(synset[0], answer_word)
        print(distractors)

universe
['Asterism', 'Black Body', 'Body', 'Carpet', 'Celestial Body', 'Cocoon', 'Consolidation', 'Constellation', 'Covering', 'Extraterrestrial Object', 'Mechanism', 'Nest', 'Plant Part', 'Radiator', 'Rock', 'Sample', 'Tangle']
depth
['Area', 'Coverage', 'Frontage', 'Length', 'Limit', 'Scope']
years ago
[]


In [232]:
from collections import Counter

def extract_keywords(sent):
    doc = nlp(sent)
    ner_nouns = [ent.text for ent in doc.ents if ent.label_ and nlp(ent.text)[0].pos_ == "NOUN"]
    if ner_nouns:
        print(ner_nouns)
        return ner_nouns[0]
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    noun_freq = Counter(nouns)
    for noun, _ in noun_freq.most_common():
        if wn.synsets(noun, pos='n'):
            return noun

    return None  # If no valid noun with synset is found


In [233]:
# def extract_distractors(word):
#     doc = nlp(word)
#     lemma = doc[0].lemma_.lower()
#     distractors = []
#     for key, values in ner_dict.items():
#         if lemma in [nlp(v)[0].lemma_.lower() for v in values]:
#             related_distractors = [v for v in values if nlp(v)[0].lemma_.lower() != lemma]
#             # Supplement with WordNet if needed
#             if len(related_distractors) < 3:
#                 synsets = wn.synsets(lemma, pos='n')
#                 if synsets:
#                     wn_distractors = get_distractors_wordnet(synsets[0], lemma)
#                     distractors = related_distractors + wn_distractors
#                     return distractors[:3]
#             return related_distractors[:3]

#     synsets = wn.synsets(lemma, pos='n')
#     if synsets:
#         distractors = get_distractors_wordnet(synsets[0], lemma)

#     return distractors[:3] if distractors else []


In [234]:
def extract_distractors(word):
    
    for key, values in ner_dict.items():
        if word in values:
            related_distractors = [v for v in values if v != word]
            if len(related_distractors) < 3:
                synset = wn.synsets(word[0], 'n')
                if synset:
                    distractors = get_distractors_wordnet(synset[0], word)
                    distractors = related_distractors+distractors
                    return distractors[:3]
    
    synset = wn.synsets(word, 'n')
    if synset:
        distractors = get_distractors_wordnet(synset[0], word)
    return distractors[:3]

In [235]:
def generate_questions(sentences):
    questions = {}
    i = 0
    for sent in sentences:
        keyword = extract_keywords(sent)
        if keyword:
            distractors = extract_distractors(keyword)

            question_sent = sent.replace(keyword, "_______")
            questions[i] = {
                'question': question_sent,
                'answer': keyword,
                'options': distractors
            }
            
            i += 1
    return questions


In [236]:
questions = generate_questions(sentences[:3])
questions

['years ago']


{0: {'question': 'The _______ is vast and filled with mysteries that continue to captivate scientists and astronomers alike.',
  'answer': 'universe',
  'options': ['Asterism', 'Black Body', 'Body']},
 1: {'question': 'From the _______ of space to the farthest reaches of distant galaxies, the cosmos holds countless wonders waiting to be explored.',
  'answer': 'depths',
  'options': ['Area', 'Coverage', 'Depth']},
 2: {'question': 'One of the fundamental concepts in astrophysics is the Big Bang theory, which posits that the universe originated from a singular, infinitely dense point nearly 13.8 billion _______.',
  'answer': 'years ago',
  'options': ['Alkali Metal', 'Alkaline Earth', 'Aluminum']}}

In [None]:
import random
import string

def format_mcqs(questions):
    formatted = {}

    for qid, qdata in questions.items():
        all_options = qdata['options'] + [qdata['answer']]
        random.shuffle(all_options)

        # Map options to letters A, B, C, ...
        option_labels = list(string.ascii_uppercase)
        options = {option_labels[i]: opt.lower() for i, opt in enumerate(all_options)}

        # Find which letter is the correct answer
        correct_letter = next(k for k, v in options.items() if v == qdata['answer'].lower())

        formatted[qid] = {
            'question': qdata['question'],
            'options': options,
            'answer': correct_letter
        }

    return formatted


In [238]:
questions

{0: {'question': 'The _______ is vast and filled with mysteries that continue to captivate scientists and astronomers alike.',
  'answer': 'universe',
  'options': ['Asterism', 'Black Body', 'Body']},
 1: {'question': 'From the _______ of space to the farthest reaches of distant galaxies, the cosmos holds countless wonders waiting to be explored.',
  'answer': 'depths',
  'options': ['Area', 'Coverage', 'Depth']},
 2: {'question': 'One of the fundamental concepts in astrophysics is the Big Bang theory, which posits that the universe originated from a singular, infinitely dense point nearly 13.8 billion _______.',
  'answer': 'years ago',
  'options': ['Alkali Metal', 'Alkaline Earth', 'Aluminum']}}

In [249]:
format_mcqs(questions)

{0: {'question': 'The _______ is vast and filled with mysteries that continue to captivate scientists and astronomers alike.',
  'options': {'A': 'asterism',
   'B': 'body',
   'C': 'universe',
   'D': 'black body'},
  'answer': 'C'},
 1: {'question': 'From the _______ of space to the farthest reaches of distant galaxies, the cosmos holds countless wonders waiting to be explored.',
  'options': {'A': 'area', 'B': 'depth', 'C': 'depths', 'D': 'coverage'},
  'answer': 'C'},
 2: {'question': 'One of the fundamental concepts in astrophysics is the Big Bang theory, which posits that the universe originated from a singular, infinitely dense point nearly 13.8 billion _______.',
  'options': {'A': 'aluminum',
   'B': 'years ago',
   'C': 'alkali metal',
   'D': 'alkaline earth'},
  'answer': 'B'}}

In [256]:
def text_to_questions_pipeline(text_file):
    with open(text_file, encoding="utf-8") as f:
        text = f.read()
    
    text = text.replace("\n", " ").strip()
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents if sum(1 for token in sent if token.pos_ == 'NOUN') >= 2]
    
    chosen_sents = random.sample(sentences, min(5, len(sentences)))

    questions = {}
    for i, sent in enumerate(chosen_sents):
        keyword = extract_keywords(sent)
        if keyword:
            distractors = extract_distractors(keyword)
            question_sent = sent.replace(keyword, "_______")
            questions[i+1] = {
                'question': question_sent,
                'answer': keyword,
                'options': [keyword] + distractors[:3] 
            }
    # return questions
    return format_mcqs(questions)

In [257]:
text_to_questions_pipeline('input/universe.txt')

['billions or even trillions']
['Earth']
{'question': 'Galaxies are immense systems containing _______ of stars, as well as various types of interstellar matter such as gas, dust, and dark matter.', 'options': {'A': 'Actinomyces', 'B': 'billions or even trillions', 'C': 'nearly 13.8 billion', 'D': 'billions or even trillions', 'E': 'Actinomycete'}, 'answer': 'B'}
{'question': 'The life cycle of a star depends on its mass, with massive _______ undergoing supernova explosions at the end of their lives, while smaller _______ like our Sun eventually evolve into white dwarfs.', 'options': {'A': 'Planetesimal', 'B': 'Minor Planet', 'C': 'Planet', 'D': 'stars', 'E': 'stars'}, 'answer': 'D'}
{'question': 'Black _______ come in various sizes, from stellar-mass black _______ formed from the collapse of massive stars to supermassive black _______ that lurk at the centers of galaxies.', 'options': {'A': 'holes', 'B': 'holes', 'C': 'Chasm', 'D': 'Breach', 'E': 'Crack'}, 'answer': 'A'}
{'question': 

{1: {'question': 'Galaxies are immense systems containing _______ of stars, as well as various types of interstellar matter such as gas, dust, and dark matter.',
  'options': {'A': 'Actinomyces',
   'B': 'billions or even trillions',
   'C': 'nearly 13.8 billion',
   'D': 'billions or even trillions',
   'E': 'Actinomycete'},
  'answer': 'B'},
 2: {'question': 'The life cycle of a star depends on its mass, with massive _______ undergoing supernova explosions at the end of their lives, while smaller _______ like our Sun eventually evolve into white dwarfs.',
  'options': {'A': 'Planetesimal',
   'B': 'Minor Planet',
   'C': 'Planet',
   'D': 'stars',
   'E': 'stars'},
  'answer': 'D'},
 3: {'question': 'Black _______ come in various sizes, from stellar-mass black _______ formed from the collapse of massive stars to supermassive black _______ that lurk at the centers of galaxies.',
  'options': {'A': 'holes',
   'B': 'holes',
   'C': 'Chasm',
   'D': 'Breach',
   'E': 'Crack'},
  'answer