In [1]:
#%pip install pytube youtube-transcript-api openai
#%pip install yt-dlp
#%pip install openai==0.28
#update openai
#%pip install --upgrade openai
#%pip install tiktoken
import tiktoken
import os
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
import openai
from math import ceil
import yt_dlp
import json 
import re
import csv

In [2]:
# Set your OpenAI API key
client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def get_video_info(youtube_url):
    ydl_opts = {
        'skip_download': True,
        'quiet': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=False)
        return info

def get_video_transcript(youtube_url):
    ydl_opts = {
        'writesubtitles': True,
        'skip_download': True,
        'outtmpl': 'tmp/transcript',
        'subtitlesformat': 'vtt'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    
    with open('tmp/transcript.en.vtt', 'r', encoding='utf-8') as f:
        transcript = f.read()
    
    return ' '.join(line.strip() for line in transcript.split('\n') 
                    if line.strip() and not line.strip().isdigit() 
                        and '-->' not in line)

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def convert_to_json(content):
    # Replace single quotes with double quotes
    content = content.replace("'", '"')
    # Split the content into individual question blocks
    question_blocks = re.split(r'### Question \d+:', content)
    # Remove the ```python and ``` markers
    cleaned_blocks = [block.replace("```python", "").replace("```", "") for block in question_blocks]
    # Extract JSON objects using regex
    json_objects = [re.findall(r'{.*?}', block, re.DOTALL) for block in cleaned_blocks]
    # Flatten the list of lists
    json_objects = [item for sublist in json_objects for item in sublist]
    # Parse JSON objects
    quiz = [json.loads(obj) for obj in json_objects]
    return quiz

def generate_quiz(transcript, num_questions):
    
    # Define max tokens for input (leaving room for output)
    MAX_TOKENS = 3000
    
    # Split transcript into chunks
    chunks = []
    current_chunk = ""
    for sentence in transcript.split('. '):
        if num_tokens_from_string(current_chunk + sentence) > MAX_TOKENS:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += sentence + '. '
    if current_chunk:
        chunks.append(current_chunk)
    
    # Generate questions for each chunk
    all_questions = []
    questions_per_chunk = max(1, num_questions // len(chunks))
    
    for i, chunk in enumerate(chunks):
        prompt = f"""Based on the following part of a video transcript, create a multiple-choice quiz with {questions_per_chunk} questions. 
        Format each question as a python dictionary with 'question', 'options' (a list of 4 choices), and 'correct_answer' keys. Structure example: 
       {{'question': 'What is the capital of France?', 'options': ['Paris', 'London', 'Berlin', 'Madrid'], 'correct_answer': 'Paris'}}\n\n
        Transcript part {i+1}/{len(chunks)}: {chunk}"""

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that creates quizzes based on video content."},
                {"role": "user", "content": prompt}
            ]
        )
        chunk_questions = convert_to_json(response.choices[0].message.content)
        # print(response.choices[0].message.content)
        # print(chunk_questions)
        all_questions.extend(chunk_questions)
    
    # If we have more questions than needed, randomly select the required number
    if len(all_questions) > num_questions:
        import random
        all_questions = random.sample(all_questions, num_questions)
    
    return all_questions


def create_quiz_from_youtube(youtube_url):
    try:
        info = get_video_info(youtube_url)
        duration_seconds = info['duration']
        
        num_questions = ceil(duration_seconds / 60)  # 1 question per 60 seconds
        transcript = get_video_transcript(youtube_url)
        if not transcript:
            return None

        quiz = generate_quiz(transcript, num_questions)

        # return quiz as a JSON string
        return quiz

    except Exception as e:
        print(f"Error creating quiz: {e}")
        return None
    
def store_quiz_in_csv(youtube_url, quiz, transcript, info):
    # Extract video information
    title = info['title']
    duration = info['duration']
    
    # Prepare data for CSV
    questions = []
    correct_answers = []
    for question in quiz:
        questions.append(question['question'])
        correct_answers.append(question['correct_answer'])
    
    # Create a CSV row
    row = {
        'title': title,
        'link': youtube_url,
        'duration': duration,
        'transcript': transcript,
        'questions': json.dumps(questions),
        'correct_answers': json.dumps(correct_answers)
    }
    
    # Write to CSV file
    csv_file = 'quiz_data.csv'
    fieldnames = ['title', 'link', 'duration', 'transcript', 'questions', 'correct_answers']
    
    try:
        with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            
            # Write header if the file is empty
            if file.tell() == 0:
                writer.writeheader()
            
            writer.writerow(row)
    except Exception as e:
        print(f"Error writing to CSV file: {e}")


In [4]:
# Example usage
youtube_url = "https://www.youtube.com/watch?v=SSo_EIwHSd4"  # Replace with your YouTube video URL
quiz = create_quiz_from_youtube(youtube_url)
quiz



[youtube] Extracting URL: https://www.youtube.com/watch?v=SSo_EIwHSd4
[youtube] SSo_EIwHSd4: Downloading webpage
[youtube] SSo_EIwHSd4: Downloading tv client config
[youtube] SSo_EIwHSd4: Downloading player 9c6dfc4a
[youtube] SSo_EIwHSd4: Downloading tv player API JSON
[youtube] SSo_EIwHSd4: Downloading ios player API JSON
[youtube] SSo_EIwHSd4: Downloading m3u8 information
[info] SSo_EIwHSd4: Downloading subtitles: en




[info] SSo_EIwHSd4: Downloading 1 format(s): 18
Deleting existing file tmp\transcript.en.vtt
[info] Writing video subtitles to: tmp\transcript.en.vtt
[download] Destination: tmp\transcript.en.vtt
[download] 100% of    7.56KiB in 00:00:00 at 100.16KiB/s


[{'question': 'What was the original purpose of blockchain technology?',
  'options': ['Timestamping digital documents',
   'Creating digital currencies',
   'Managing medical records',
   'Collecting taxes'],
  'correct_answer': 'Timestamping digital documents'},
 {'question': 'Who adapted the blockchain technology in 2009 to create Bitcoin?',
  'options': ['Satoshi Nakamoto',
   'Vitalik Buterin',
   'Charlie Lee',
   'David Chaum'],
  'correct_answer': 'Satoshi Nakamoto'},
 {'question': 'What technique makes it difficult to change data recorded inside a blockchain?',
  'options': ['Hashing', 'Encryption', 'Compression', 'Serialization'],
  'correct_answer': 'Hashing'},
 {'question': 'What is the unique property of a hash in a block?',
  'options': ['It is like a fingerprint and uniquely identifies the block',
   'It is easily changeable',
   'It is encrypted and secure',
   'It is shared among multiple blocks'],
  'correct_answer': 'It is like a fingerprint and uniquely identifies t

In [8]:
# function to store quiz in csv with columns: title, link, duration, questions, options, correct_answers.
# if title already exists in csv, skip
    
def store_quiz_in_csv(youtube_url):
    # Create quiz from YouTube video
    quiz = create_quiz_from_youtube(youtube_url)
    
    # Extract video information
    info = get_video_info(youtube_url)
    title = info['title']
    duration = info['duration']
    

    # Prepare data for CSV
    questions = []
    options = []
    correct_answers = []
    for question in quiz:
        questions.append(question['question'])
        options.append(json.dumps(question['options']))
        correct_answers.append(question['correct_answer'])
    
    # Create a CSV row
    row = {
        'title': title,
        'link': youtube_url,
        'duration': duration,
        'questions': json.dumps(questions),
        'options': json.dumps(options),
        'correct_answers': json.dumps(correct_answers)
    }
    
    # Write to CSV file
    csv_file = 'quiz_data.csv'
    fieldnames = ['title', 'link', 'duration', 'questions', 'options', 'correct_answers']
    
    try:
        with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            
            # Write header if the file is empty
            if file.tell() == 0:
                writer.writeheader()
            
            writer.writerow(row)
    except Exception as e:
        print(f"Error writing to CSV file: {e}")


In [None]:
store_quiz_in_csv("https://www.youtube.com/watch?v=SSo_EIwHSd4")
store_quiz_in_csv("https://www.youtube.com/watch?v=3xGLc-zz9cA")
store_quiz_in_csv("https://www.youtube.com/watch?v=ReRJzrCwsaw")
store_quiz_in_csv("https://www.youtube.com/watch?v=17QRFlml4pA&t=1s")
store_quiz_in_csv("https://www.youtube.com/watch?v=ll5c4pLMDdA")
# store_quiz_in_csv("https://www.youtube.com/watch?v=SQyg9pyJ1Ac")
# store_quiz_in_csv("https://www.youtube.com/watch?v=vTEjf7MYAWA")
# store_quiz_in_csv("https://www.youtube.com/watch?v=t1brCcgi174")



[youtube] Extracting URL: https://www.youtube.com/watch?v=SSo_EIwHSd4
[youtube] SSo_EIwHSd4: Downloading webpage
[youtube] SSo_EIwHSd4: Downloading tv client config
[youtube] SSo_EIwHSd4: Downloading player 9c6dfc4a
[youtube] SSo_EIwHSd4: Downloading tv player API JSON
[youtube] SSo_EIwHSd4: Downloading ios player API JSON
[youtube] SSo_EIwHSd4: Downloading m3u8 information
[info] SSo_EIwHSd4: Downloading subtitles: en




[info] SSo_EIwHSd4: Downloading 1 format(s): 18
Deleting existing file tmp\transcript.en.vtt
[info] Writing video subtitles to: tmp\transcript.en.vtt
[download] Destination: tmp\transcript.en.vtt
[download] 100% of    7.56KiB in 00:00:00 at 94.12KiB/s




[youtube] Extracting URL: https://www.youtube.com/watch?v=3xGLc-zz9cA
[youtube] 3xGLc-zz9cA: Downloading webpage
[youtube] 3xGLc-zz9cA: Downloading tv client config
[youtube] 3xGLc-zz9cA: Downloading player 9c6dfc4a
[youtube] 3xGLc-zz9cA: Downloading tv player API JSON
[youtube] 3xGLc-zz9cA: Downloading ios player API JSON
[youtube] 3xGLc-zz9cA: Downloading m3u8 information




[info] 3xGLc-zz9cA: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages




[youtube] Extracting URL: https://www.youtube.com/watch?v=ReRJzrCwsaw
[youtube] ReRJzrCwsaw: Downloading webpage
[youtube] ReRJzrCwsaw: Downloading tv client config
[youtube] ReRJzrCwsaw: Downloading player 9c6dfc4a
[youtube] ReRJzrCwsaw: Downloading tv player API JSON
[youtube] ReRJzrCwsaw: Downloading ios player API JSON
[youtube] ReRJzrCwsaw: Downloading m3u8 information
[info] ReRJzrCwsaw: Downloading subtitles: en-US




[info] ReRJzrCwsaw: Downloading 1 format(s): 18
Deleting existing file tmp\transcript.en-US.vtt
[info] Writing video subtitles to: tmp\transcript.en-US.vtt
[download] Destination: tmp\transcript.en-US.vtt
[download] 100% of    3.47KiB in 00:00:00 at 52.63KiB/s




[youtube] Extracting URL: https://www.youtube.com/watch?v=17QRFlml4pA&t=1s
[youtube] 17QRFlml4pA: Downloading webpage
[youtube] 17QRFlml4pA: Downloading tv client config
[youtube] 17QRFlml4pA: Downloading player 9c6dfc4a
[youtube] 17QRFlml4pA: Downloading tv player API JSON
[youtube] 17QRFlml4pA: Downloading ios player API JSON
[youtube] 17QRFlml4pA: Downloading m3u8 information




[info] 17QRFlml4pA: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages




[youtube] Extracting URL: https://www.youtube.com/watch?v=ll5c4pLMDdA
[youtube] ll5c4pLMDdA: Downloading webpage
[youtube] ll5c4pLMDdA: Downloading tv client config
[youtube] ll5c4pLMDdA: Downloading player 9c6dfc4a
[youtube] ll5c4pLMDdA: Downloading tv player API JSON
[youtube] ll5c4pLMDdA: Downloading ios player API JSON
[youtube] ll5c4pLMDdA: Downloading m3u8 information
[info] ll5c4pLMDdA: Downloading subtitles: en




[info] ll5c4pLMDdA: Downloading 1 format(s): 18
Deleting existing file tmp\transcript.en.vtt
[info] Writing video subtitles to: tmp\transcript.en.vtt
[download] Destination: tmp\transcript.en.vtt
[download] 100% of    4.18KiB in 00:00:00 at 47.07KiB/s




[youtube] Extracting URL: https://www.youtube.com/watch?v=SQyg9pyJ1Ac
[youtube] SQyg9pyJ1Ac: Downloading webpage
[youtube] SQyg9pyJ1Ac: Downloading tv client config
[youtube] SQyg9pyJ1Ac: Downloading player 9c6dfc4a
[youtube] SQyg9pyJ1Ac: Downloading tv player API JSON
[youtube] SQyg9pyJ1Ac: Downloading ios player API JSON
[youtube] SQyg9pyJ1Ac: Downloading m3u8 information




[info] SQyg9pyJ1Ac: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages




[youtube] Extracting URL: https://www.youtube.com/watch?v=vTEjf7MYAWA
[youtube] vTEjf7MYAWA: Downloading webpage
[youtube] vTEjf7MYAWA: Downloading tv client config
[youtube] vTEjf7MYAWA: Downloading player 9c6dfc4a
[youtube] vTEjf7MYAWA: Downloading tv player API JSON
[youtube] vTEjf7MYAWA: Downloading ios player API JSON
[youtube] vTEjf7MYAWA: Downloading m3u8 information




[info] vTEjf7MYAWA: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages




[youtube] Extracting URL: https://www.youtube.com/watch?v=t1brCcgi174
[youtube] t1brCcgi174: Downloading webpage
[youtube] t1brCcgi174: Downloading tv client config
[youtube] t1brCcgi174: Downloading player 9c6dfc4a
[youtube] t1brCcgi174: Downloading tv player API JSON
[youtube] t1brCcgi174: Downloading ios player API JSON
[youtube] t1brCcgi174: Downloading m3u8 information




[info] t1brCcgi174: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages




In [11]:
store_quiz_in_csv("https://www.youtube.com/watch?v=kf28zqP_F2s")



[youtube] Extracting URL: https://www.youtube.com/watch?v=kf28zqP_F2s
[youtube] kf28zqP_F2s: Downloading webpage
[youtube] kf28zqP_F2s: Downloading tv client config
[youtube] kf28zqP_F2s: Downloading player 9c6dfc4a
[youtube] kf28zqP_F2s: Downloading tv player API JSON
[youtube] kf28zqP_F2s: Downloading ios player API JSON
[youtube] kf28zqP_F2s: Downloading m3u8 information
[info] kf28zqP_F2s: Downloading subtitles: en-US




[info] kf28zqP_F2s: Downloading 1 format(s): 18
Deleting existing file tmp\transcript.en-US.vtt
[info] Writing video subtitles to: tmp\transcript.en-US.vtt
[download] Destination: tmp\transcript.en-US.vtt
[download] 100% of    5.61KiB in 00:00:00 at 49.63KiB/s




In [12]:
store_quiz_in_csv("https://www.youtube.com/watch?v=rYQgy8QDEBI")



[youtube] Extracting URL: https://www.youtube.com/watch?v=rYQgy8QDEBI
[youtube] rYQgy8QDEBI: Downloading webpage
[youtube] rYQgy8QDEBI: Downloading tv client config
[youtube] rYQgy8QDEBI: Downloading player 9c6dfc4a
[youtube] rYQgy8QDEBI: Downloading tv player API JSON
[youtube] rYQgy8QDEBI: Downloading ios player API JSON
[youtube] rYQgy8QDEBI: Downloading m3u8 information




[info] rYQgy8QDEBI: Downloading 1 format(s): 18
[info] There are no subtitles for the requested languages


