In [1]:
import markovify
import json
import logging
import random
import re

import nltk
import qdrant_client
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nltk.corpus import stopwords

In [3]:
QDRANT_URL = "http://localhost:6333"
QDRANT_COLLECTION = "course_83c94b40-9dc0-4253-b83c-b82218156493"

In [4]:
q_client = qdrant_client.QdrantClient(QDRANT_URL)


In [5]:
embedding_model = SentenceTransformer("all-mpnet-base-v2")


In [6]:
def retrieve_transcript_from_qdrant(query: str, top_k: int = 5) -> str:
    """
    Retrieves transcript content from Qdrant based on the input query.
    It uses the embedding model to encode the query and fetches the top_k most relevant texts.
    """
    # Encode the query to get its vector representation
    query_embedding = embedding_model.encode(query).tolist()
    
    # Search Qdrant for relevant transcript points
    search_results = q_client.search(
        collection_name=QDRANT_COLLECTION,
        query_vector=query_embedding,
        limit=top_k
    )
    
    # Extract and concatenate transcript texts from the search results
    transcript_texts = [hit.payload.get("text", "") for hit in search_results]
    return " ".join(transcript_texts)

In [7]:
def build_markov_model(text: str):
    """
    Build a Markov model using the provided text.
    The model can generate new sentences in the style of the input corpus.
    """
    return markovify.Text(text)


In [8]:

def generate_markov_sentence(model) -> str:
    """
    Generates a sentence from the Markov model.
    It tries up to 100 times to produce a valid sentence.
    """
    sentence = model.make_sentence(tries=100)
    return sentence

In [9]:
def generate_fill_in_blank_question(sentence: str):
    """
    Converts a generated sentence into a fill-in-the-blank quiz question.
    It tokenizes the sentence, selects a candidate word (filtering out short words and stopwords),
    replaces it with a blank, and creates distractor options.
    """
    words = sentence.split()
    # Filter out short words and common stopwords
    filtered_words = [w for w in words if len(w) > 3 and w.lower() not in stopwords.words('english')]
    if not filtered_words:
        return None

    # Randomly select a word to remove (as the answer)
    word_to_blank = random.choice(filtered_words)
    
    # Create the question text by replacing the selected word with a blank
    pattern = re.compile(re.escape(word_to_blank), re.IGNORECASE)
    question_text = pattern.sub("______", sentence, count=1)
    
    # Create distractors: select other words from the filtered list
    distractors = list(set(filtered_words) - {word_to_blank})
    if len(distractors) >= 3:
        distractors = random.sample(distractors, 3)
    else:
        # Pad with "N/A" if not enough distractors
        distractors += ["N/A"] * (3 - len(distractors))
    
    # Combine the correct answer with distractors and shuffle options
    options = [word_to_blank] + distractors
    random.shuffle(options)
    
    # Map the options to letters A, B, C, D
    letters = ["A", "B", "C", "D"]
    option_dict = {letter: option for letter, option in zip(letters, options)}
    correct_letter = [letter for letter, option in option_dict.items() if option == word_to_blank][0]
    
    return {
        "question": question_text,
        "options": option_dict,
        "correct_answer": correct_letter,
        "explanation": f"The correct word was '{word_to_blank}'."
    }


In [10]:
def generate_markov_quiz_from_qdrant(query: str, num_questions: int = 5):
    """
    Retrieves transcript content from Qdrant using the given query,
    builds a Markov model on the retrieved text, and generates quiz questions.
    """
    # Retrieve transcript text from Qdrant
    transcript_text = retrieve_transcript_from_qdrant(query)
    if not transcript_text.strip():
        print("No transcript found in Qdrant for query:", query)
        return []
    
    # Build the Markov model on the retrieved text
    model = build_markov_model(transcript_text)
    
    # Generate quiz questions from the Markov model
    questions = []
    attempts = 0
    while len(questions) < num_questions and attempts < num_questions * 10:
        sentence = generate_markov_sentence(model)
        if sentence:
            question = generate_fill_in_blank_question(sentence)
            if question:
                questions.append(question)
        attempts += 1
    return questions

In [11]:
# Main Execution
# ---------------------------
if __name__ == "__main__":
    # Example: use "Cloud Computing" as the query to retrieve related transcript data.
    sample_query = "Cloud Computing"
    quiz = generate_markov_quiz_from_qdrant(sample_query, num_questions=5)
    print(json.dumps(quiz, indent=4))

  search_results = q_client.search(


[
    {
        "question": "So, in this particular ______ of lectures, we will have our fast lecture.",
        "options": {
            "A": "lectures,",
            "B": "particular",
            "C": "series",
            "D": "fast"
        },
        "correct_answer": "C",
        "explanation": "The correct word was 'series'."
    },
    {
        "question": "So, in this particular series of ______ we will try t Welcome to the course on Cloud Computing.",
        "options": {
            "A": "course",
            "B": "series",
            "C": "Welcome",
            "D": "lectures,"
        },
        "correct_answer": "D",
        "explanation": "The correct word was 'lectures,'."
    },
    {
        "question": "So, ______ computing is a shared pool of configurable computing resources.",
        "options": {
            "A": "cloud",
            "B": "computing",
            "C": "pool",
            "D": "resources."
        },
        "correct_answer": "A",
        "expla