In [3]:
!pip install git+https://github.com/openai/whisper.git
!pip install pytube
import whisper
from pytube import YouTube
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import random

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_kq1wxy6
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_kq1wxy6
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [5]:
def download_audio(video_url):
    yt = YouTube(video_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    output_file = audio_stream.download(filename_prefix='audio_')
    return output_file

# Function to transcribe audio from downloaded file
def transcribe_audio_whisper(audio_file):
    model = whisper.load_model("base")
    result = model.transcribe(audio_file, fp16=False)
    return result["text"]

# Function to generate questions
def generate_questions(text):
    tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
    model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")

    input_text = "generate questions: " + text
    inputs = tokenizer.encode(input_text, return_tensors='pt')

    output_sequences = model.generate(
        input_ids=inputs,
        max_length=512,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=5,
        early_stopping=True
    )

    questions = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output_sequences]
    return questions

# Function to generate MCQ
def generate_mcq(transcribed_text, question):
    sentences = transcribed_text.split('.')
    correct_answer = random.choice(sentences).strip()

    incorrect_answers = random.sample(sentences, 3)
    incorrect_answers = [ans.strip() for ans in incorrect_answers if ans.strip() != correct_answer][:3]

    while len(incorrect_answers) < 3:
        incorrect_answers.append("This is a dummy incorrect answer.")

    options = incorrect_answers + [correct_answer]
    random.shuffle(options)

    return {
        "question": question,
        "options": options,
        "correct_answer": correct_answer
    }

# Main function to process video URL and generate MCQs
def generate_transcript_and_mcqs(video_url):
    audio_file = download_audio(video_url)
    transcribed_text = transcribe_audio_whisper(audio_file)

    questions = generate_questions(transcribed_text)
    mcqs = [generate_mcq(transcribed_text, question) for question in questions]

    os.remove(audio_file)

    return mcqs

# Function to display MCQs and get user answers
def display_mcqs(mcqs):
    for i, mcq in enumerate(mcqs):
        print(f"Q{i+1}: {mcq['question']}")
        for idx, option in enumerate(mcq['options']):
            print(f"  {chr(65 + idx)}. {option}")

        user_answer = input("Your answer (A, B, C, D): ").strip().upper()
        correct_option = chr(65 + mcq['options'].index(mcq['correct_answer']))

        if user_answer == correct_option:
            print("Correct!")
        else:
            print(f"Incorrect! The correct answer is {correct_option}. {mcq['correct_answer']}")
        print("\n")

# Give the URL code
video_url = input("Enter YouTube URL: ")  # Taking YouTube URL as input from the user
mcqs = generate_transcript_and_mcqs(video_url)
display_mcqs(mcqs)


Enter YouTube URL: https://youtu.be/BqqfQnyjmgg?si=MagfG4_eQgSEEYnT


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (787 > 512). Running this sequence through the model will result in indexing errors


Q1: What is the purpose of transfer learning?
  A. Earth was pre-trained this way using the English Wikipedia and had 11,000 published books
  B. This is because pre-training models are usually trained on large amounts of data, but provides a model with statistical understanding of the language used during pre-training
  C. Now let's say you want to train a model B for a different task
  D. For instance, when we defined it in the Bert model earlier, we removed the ad-let classified mass quotes and replaced it with a classifier with two outputs, since our task at two labels
Your answer (A, B, C, D): B
Incorrect! The correct answer is D. For instance, when we defined it in the Bert model earlier, we removed the ad-let classified mass quotes and replaced it with a classifier with two outputs, since our task at two labels


Q2: How is transfer learning applied to a model?
  A. Training on the right it's fine training a pre-training model
  B. Models are frequently trained on image net, but