In [1]:
from qdrant_client import QdrantClient

In [2]:
client = QdrantClient(host = 'localhost', port = 6333)

In [3]:
collection_info = client.get_collection(collection_name = 'course_83c94b40-9dc0-4253-b83c-b82218156493')

In [4]:
print(collection_info)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=148 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None

In [5]:
result = client.scroll(collection_name = 'course_83c94b40-9dc0-4253-b83c-b82218156493')

In [6]:
print(result)

([Record(id=276760000, payload={'text': ' Welcome to the course on Cloud Computing. Today we will have our fast lecture. So, as you might have seen the broad overview of the course. So, in this particular series of lectures, we will try to give an overall picture of what Cloud Computing is and what are its major components and what are the recent trends and at the end maybe what are the different type of research opportunities or these trends of future trends in the Cloud Computing. So, before going to the details of Cloud Computing, we will try t', 'lecture_id': '1e239ed3-109f-496d-b5de-f77e96d353a0', 'course_id': '83c94b40-9dc0-4253-b83c-b82218156493'}, vector=None, shard_key=None, order_value=None), Record(id=276760001, payload={'text': 'o have a quick overview of course, and the basic paradigm of computing. Now, if you look at that as defined by ACM computing, curricula in 2005 as the defined computing, it is a general way we can define computing to mean as a mean to solve any goal

In [7]:
import json
import re
import random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [8]:
def preprocess_text(text: str) -> str:
    """Basic preprocessing: strip extra spaces."""
    return text.strip()

In [9]:
def extract_sentences(text: str) -> list:
    """Split the lesson material into sentences using NLTK."""
    sentences = sent_tokenize(text)
    return sentences

In [10]:
def compute_tfidf(sentences: list):
    """
    Compute the TF-IDF matrix for the list of sentences.
    Using ngram_range=(1, 2) captures both unigrams and bigrams.
    """
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names, vectorizer

In [11]:
def generate_question_from_sentence(sentence: str, keyword: str) -> str:
    """
    Replace the first occurrence of the keyword in the sentence with a blank.
    This creates the question stem.
    """
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    question_sentence = pattern.sub("______", sentence, count=1)
    return question_sentence

In [12]:
def generate_mcqs(text: str, num_questions: int = 5) -> list:
    """
    Generates MCQs by:
    1. Preprocessing and splitting the document into sentences.
    2. Computing TF-IDF (with n-grams) to extract keywords.
    3. For each sentence, selecting the highest weighted word as the keyword,
       replacing it with a blank, and generating a question.
    4. Picking distractors from the overall top keywords.
    """
    text = preprocess_text(text)
    sentences = extract_sentences(text)
    
    # Compute TF-IDF for sentences
    tfidf_matrix, feature_names, _ = compute_tfidf(sentences)
    
    # For distractor pool: Sum TF-IDF scores over the whole document
    tfidf_sum = np.squeeze(np.asarray(tfidf_matrix.sum(axis=0)))
    keywords_scores = list(zip(feature_names, tfidf_sum))
    # Sort keywords by overall importance
    keywords_scores.sort(key=lambda x: x[1], reverse=True)
    # Select top keywords as the candidate pool
    top_keywords = [kw for kw, score in keywords_scores[:10]]
    
    mcqs = []
    for i, sentence in enumerate(sentences):
        row = tfidf_matrix[i].toarray().flatten()
        if row.max() == 0:
            continue  # Skip if the sentence has no useful keywords.
        max_index = row.argmax()
        correct_keyword = feature_names[max_index]
        # Ensure the keyword appears in the sentence
        if correct_keyword.lower() not in sentence.lower():
            continue
        
        # Create the question by replacing the keyword with a blank.
        question_text = generate_question_from_sentence(sentence, correct_keyword)
        correct_answer = correct_keyword
        
        # Generate distractors: choose three other keywords from the pool
        distractor_pool = [kw for kw in top_keywords if kw.lower() != correct_keyword.lower()]
        if len(distractor_pool) < 3:
            continue  # Skip if not enough distractors.
        distractors = random.sample(distractor_pool, 3)
        
        # Combine correct answer with distractors and shuffle options.
        options = [correct_answer] + distractors
        random.shuffle(options)
        option_labels = ['A', 'B', 'C', 'D']
        options_dict = {label: opt for label, opt in zip(option_labels, options)}
        correct_label = [label for label, opt in options_dict.items() if opt == correct_answer][0]
        
        mcq = {
            "question": question_text,
            "options": options_dict,
            "correct_answer": correct_label,
            "explanation": f"The blank was filled by the keyword '{correct_answer}', extracted using TF-IDF."
        }
        mcqs.append(mcq)
        if len(mcqs) >= num_questions:
            break

    return mcqs

In [13]:
records, _ = client.scroll(collection_name="course_83c94b40-9dc0-4253-b83c-b82218156493", limit=5)


In [14]:
lesson_text = " ".join([record.payload.get("text", "") for record in records])
if not lesson_text.strip():
    lesson_text = "No lesson text found."

In [16]:
# Generate MCQs from the lesson text.
generated_mcqs = generate_mcqs(lesson_text, num_questions=3)
print("Generated MCQs:")
print(json.dumps(generated_mcqs))

Generated MCQs:
[{"question": "Today we will have our ______ lecture.", "options": {"A": "service", "B": "cloud computing", "C": "fast", "D": "terms"}, "correct_answer": "C", "explanation": "The blank was filled by the keyword 'fast', extracted using TF-IDF."}, {"question": "So, as you might have seen the ______ of the course.", "options": {"A": "broad overview", "B": "computing", "C": "overview", "D": "broad"}, "correct_answer": "A", "explanation": "The blank was filled by the keyword 'broad overview', extracted using TF-IDF."}, {"question": "So, in this particular series of lectures, we will try to give an overall picture of what Cloud Computing is and what are its major components and what are the recent ______ and at the end maybe what are the different type of research opportunities or these trends of future trends in the Cloud Computing.", "options": {"A": "trends", "B": "computing", "C": "course", "D": "broad"}, "correct_answer": "A", "explanation": "The blank was filled by the 

In [17]:
def transform_quiz_format(mcqs):
    formatted = []
    for mcq in mcqs:
        # Wrap the correct answer in parentheses
        transformed = {
            "options": mcq["options"],
            "question": mcq["question"],
            "explanation": mcq["explanation"],
            "correct_answer": f"({mcq['correct_answer']})"
        }
        formatted.append(transformed)
    return formatted

# Assuming generated_mcqs is the output from your quiz generation function:
formatted_quiz = transform_quiz_format(generated_mcqs)
print(json.dumps(formatted_quiz, indent=2))


[
  {
    "options": {
      "A": "service",
      "B": "cloud computing",
      "C": "fast",
      "D": "terms"
    },
    "question": "Today we will have our ______ lecture.",
    "explanation": "The blank was filled by the keyword 'fast', extracted using TF-IDF.",
    "correct_answer": "(C)"
  },
  {
    "options": {
      "A": "broad overview",
      "B": "computing",
      "C": "overview",
      "D": "broad"
    },
    "question": "So, as you might have seen the ______ of the course.",
    "explanation": "The blank was filled by the keyword 'broad overview', extracted using TF-IDF.",
    "correct_answer": "(A)"
  },
  {
    "options": {
      "A": "trends",
      "B": "computing",
      "C": "course",
      "D": "broad"
    },
    "question": "So, in this particular series of lectures, we will try to give an overall picture of what Cloud Computing is and what are its major components and what are the recent ______ and at the end maybe what are the different type of research opport