# Veritasium - Categorized Data Extraction

## Downloading & Importing Libraries

In [None]:
# Install the third-party packages that the import cell below relies on
!pip install openai langchain langchain_community langchain_openai yt-dlp ffmpeg-python transformers torchaudio



In [None]:
import os
import gc
import requests
import openai
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

import json
from google.colab import files
from google.colab import userdata
from google.colab import runtime

import re
import string
import nltk

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import yt_dlp
import ffmpeg
import subprocess


In [None]:
# Read the credentials stored under these names in the Colab user data
OPENAI_API_KEY = userdata.get('Ironhack-GPT')
GCC_API_KEY = userdata.get('BH-GCC')
HF_TOKEN = userdata.get('HF')

# Expose the OpenAI key and the HF token as environment variables
os.environ.update({
    'OPENAI_API_KEY': OPENAI_API_KEY,
    'HF_TOKEN': HF_TOKEN,
})

## Veritasium videos pre-processing

### Getting Veritasium videos via YT-API and extracting metadata

In [None]:
# YouTube channel ID for https://www.youtube.com/@veritasium
CHANNEL_ID = 'UCHnyfMqiRRG1u-2MsSQLbXA'

In [None]:
# Helper function to preprocess text (kept deliberately simple since GPT handles raw text
# input quite effectively): decode HTML entities, lowercase, collapse excessive whitespace
def preprocess_text(text):
    """Normalize a title or description string.

    Args:
        text: Raw text. ``None`` or an empty string yields ``''``.

    Returns:
        str: The text with HTML entities decoded (e.g. ``&#39;`` becomes ``'``),
        lowercased, and with every run of whitespace collapsed to a single space.
    """
    import html  # stdlib; imported locally so this helper is self-contained

    if not text:
        return ''

    # Decode entities first: the API returns titles such as "aren&#39;t"
    text = html.unescape(text)

    # Lowercase the text
    text = text.lower()

    # Removing excessive whitespace (also strips leading/trailing whitespace)
    text = ' '.join(text.split())

    return text

In [None]:
def get_video_metadata(api_key, channel_id):
    """Collect basic metadata for the videos the YouTube search endpoint lists for a channel.

    Requests result pages of up to 50 items each and follows ``nextPageToken``
    until the API stops returning one.

    Args:
        api_key: API key sent as the ``key`` query parameter.
        channel_id: ID of the channel whose videos are listed.

    Returns:
        list[dict]: One dict per video with the keys ``videoId``, ``title``,
        ``description``, ``published_at`` and ``url``. Title and description
        are passed through ``preprocess_text``.

    Note:
        If a response contains no ``items`` key, a warning is printed and the
        videos collected so far are returned. Network errors, timeouts and
        invalid JSON raised by ``requests`` propagate to the caller.
    """
    base_url = 'https://www.googleapis.com/youtube/v3/search'
    videos = []
    next_page_token = ''

    while True:
        # Let requests build and URL-encode the query string instead of
        # interpolating the key and IDs into the URL by hand.
        params = {
            'key': api_key,
            'channelId': channel_id,
            'part': 'snippet,id',
            'type': 'video',
            'maxResults': 50,
        }
        if next_page_token:
            params['pageToken'] = next_page_token

        # A timeout prevents the cell from hanging forever on a stalled connection
        response = requests.get(base_url, params=params, timeout=30)
        data = response.json()

        # Without 'items' there is nothing to collect; report why instead of
        # silently returning a truncated list.
        if 'items' not in data:
            error = data.get('error') if isinstance(data, dict) else None
            details = error.get('message', error) if isinstance(error, dict) else error
            print(
                f"Warning: stopping after {len(videos)} videos - response has no 'items' "
                f"(HTTP {response.status_code}): {details or 'no error details'}"
            )
            break

        items = data['items']

        # Print the keys of the first item to see available fields
        # (guarded so an empty result page does not raise IndexError)
        if not videos and items:
            print(f"Available keys in data['items'][0]: {list(items[0].keys())}")
            print(f"Available keys in data['items'][0]['snippet']: {list(items[0]['snippet'].keys())}")

        # Retrieve and store video metadata
        for item in items:
            if 'videoId' not in item['id']:
                continue

            snippet = item['snippet']
            video_id = item['id']['videoId']

            videos.append({
                'videoId': video_id,
                'title': preprocess_text(snippet['title']),
                'description': preprocess_text(snippet['description']),
                'published_at': snippet['publishedAt'],
                'url': f"https://www.youtube.com/watch?v={video_id}"
            })

        next_page_token = data.get('nextPageToken', '')
        if not next_page_token:
            break

    return videos


# Fetch the channel's video metadata using the API key and channel ID configured above
videos = get_video_metadata(api_key=GCC_API_KEY, channel_id=CHANNEL_ID)

Available keys in data['items'][0]: ['kind', 'etag', 'id', 'snippet']
Available keys in data['items'][0]['snippet']: ['publishedAt', 'channelId', 'title', 'description', 'thumbnails', 'channelTitle', 'liveBroadcastContent', 'publishTime']


In [None]:
len(videos)  # Number of videos retrieved; used to check whether any are missing

382

In [None]:
videos[:5]  # Peek at the first five metadata dicts

[{'videoId': 'scliyWrN7mk',
  'title': 'how bikes *actually* work',
  'description': 'why are bicycles stable? the most common answer is gyroscopic effects, but this is not right.',
  'published_at': '2024-04-17T16:20:50Z',
  'url': 'https://www.youtube.com/watch?v=scliyWrN7mk'},
 {'videoId': 'P_MSlxczd94',
  'title': 'indestructible coating?!',
  'description': 'the coating is a polymer sold under the brand name line-x. more about how it works in the full video here: ...',
  'published_at': '2022-12-31T16:32:25Z',
  'url': 'https://www.youtube.com/watch?v=P_MSlxczd94'},
 {'videoId': 'n8WxkqMRgS4',
  'title': 'falling ladders - why does this happen?',
  'description': 'what happens when a chain ladder lands on a table? great video and concept by andy ruina. let me know if you want me to ...',
  'published_at': '2023-01-12T17:24:24Z',
  'url': 'https://www.youtube.com/watch?v=n8WxkqMRgS4'},
 {'videoId': 'Ko02mUWhItA',
  'title': 'raindrops aren&#39;t shaped like raindrops',
  'descripti

In [None]:
# Persist the collected video metadata as a JSON file
def save_video_metadata(videos, filename="0-videos_metadata.json"):
    """Write ``videos`` to ``filename`` as JSON indented by four spaces.

    Args:
        videos: JSON-serializable object to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w') as handle:
        json.dump(videos, fp=handle, indent=4)

# Write the metadata to disk; the file name is spelled out so it visibly matches the download below
save_video_metadata(videos, filename='0-videos_metadata.json')

# Download the JSON file that was just written
files.download('0-videos_metadata.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Categorization with GPT

In [None]:
# Labels the model is asked to choose from
predefined_categories = [
    "Physics", "Mathematics", "Engineering", "Biology", "Chemistry",
    "Space", "Technology", "General Science",
    "Miscellaneous Educational Content", "Other",
]

# Hand-written (title, description, category) samples shown to the model as few-shot examples
few_shot_examples = [
    {"title": title, "description": description, "category": category}
    for title, description, category in [
        (
            "The Wonders of Quantum Physics",
            "Exploring the mysterious world of quantum physics.",
            "Physics",
        ),
        (
            "DIY Science Experiment",
            "A simple experiment to demonstrate chemical reactions at home.",
            "Chemistry",
        ),
        (
            "The Future of AI",
            "Discussing the advancements and future prospects of artificial intelligence.",
            "Technology",
        ),
    ]
]

# Render each example as a three-line Title/Description/Category block, one after another
few_shot_text = "\n".join(
    "Title: {title}\nDescription: {description}\nCategory: {category}".format(**shot)
    for shot in few_shot_examples
)

# The category list and examples are filled in now; the doubled braces leave {title} and
# {description} as placeholders to be filled in per video later.
dynamic_prompt_template = f"""
You are an AI that categorizes YouTube videos based on their titles and descriptions.
Choose the most appropriate category from the following list:
{', '.join(predefined_categories)}

Here are some examples:
{few_shot_text}

Now, categorize the following video:

Title: {{title}}
Description: {{description}}

Respond with only the most appropriate category for this video, without any additional text.
"""

# Create a LangChain prompt; {title} and {description} are filled in per video
dynamic_prompt = PromptTemplate(template=dynamic_prompt_template, input_variables=["title", "description"])

# Initialize the OpenAI Chat model with a specific name.
# temperature=0 makes sampling as deterministic as the API allows, so re-running the
# notebook assigns (nearly) the same category to each video instead of varying per run.
dynamic_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)

def dynamic_categorize_videos(videos):
    """Group videos by the category the chat model assigns to each of them.

    For every video the prompt template is filled with its title and
    description, sent to ``dynamic_llm``, and the reply is matched against
    ``predefined_categories``.

    Args:
        videos: Iterable of dicts that each have a ``'title'`` and a
            ``'description'`` key.

    Returns:
        dict[str, list[dict]]: Category name mapped to the videos assigned to
        it, in the order the categories were first seen. Replies that match no
        predefined category are collected under ``"Other"``.
    """
    categories = {}

    for video in videos:
        prompt = dynamic_prompt.format(title=video['title'], description=video['description'])

        # invoke() takes the prompt string directly and returns a message
        # whose .content holds the reply text.
        response = dynamic_llm.invoke(prompt)
        category = _resolve_category(response.content, predefined_categories)

        categories.setdefault(category, []).append(video)

    return categories


def _resolve_category(raw_category, valid_categories):
    """Map a raw model reply onto one of ``valid_categories``, else return "Other"."""
    cleaned = str(raw_category).strip()

    # The few-shot examples in the prompt use "Category: X" lines, so the
    # model may echo that prefix back.
    if cleaned.lower().startswith("category:"):
        cleaned = cleaned[len("category:"):]

    # Ignore surrounding quotes, a trailing period and letter case
    cleaned = cleaned.strip().strip("\"'.").strip().casefold()

    for candidate in valid_categories:
        if candidate.casefold() == cleaned:
            return candidate

    # Handle unexpected categories
    return "Other"

# Run the categorization over every collected video
categorized_videos = dynamic_categorize_videos(videos)

# List the video titles under a heading for each category
for category in categorized_videos:
    vids = categorized_videos[category]
    print(f"\nCategory: {category}")
    for vid in vids:
        print(f" - {vid['title']}")



Category: Physics
 - how bikes *actually* work
 - falling ladders - why does this happen?
 - raindrops aren&#39;t shaped like raindrops
 - world&#39;s roundest object
 - microwaving grapes makes plasma
 - this phone trick is impossible
 - i call this the &#39;no, you don&#39;t&#39; law
 - does pressure melt ice?
 - world&#39;s strongest magnet!
 - how a slinky falls in slow motion #shorts
 - atomic theory
 - backspin basketball flies off dam
 - how does a boomerang work?
 - why does the earth spin?
 - can you solve this shadow illusion?
 - anti-gravity wheel explained
 - ice spikes explained
 - misconceptions about temperature
 - heisenberg&#39;s uncertainty principle explained
 - supercooled water - explained!
 - can you go the speed of light?
 - empty space is not empty
 - how special relativity makes magnets work
 - 4 revolutionary riddles
 - 5 fun physics phenomena
 - what is a force?
 - explained: 5 fun physics phenomena
 - option b - acceleration of a bungy jump
 - how does a sa

In [None]:
categorized_videos  # Append a key such as ['Chemistry'] to display a single category

{'Physics': [{'videoId': 'scliyWrN7mk',
   'title': 'how bikes *actually* work',
   'description': 'why are bicycles stable? the most common answer is gyroscopic effects, but this is not right.',
   'published_at': '2024-04-17T16:20:50Z',
   'url': 'https://www.youtube.com/watch?v=scliyWrN7mk'},
  {'videoId': 'n8WxkqMRgS4',
   'title': 'falling ladders - why does this happen?',
   'description': 'what happens when a chain ladder lands on a table? great video and concept by andy ruina. let me know if you want me to ...',
   'published_at': '2023-01-12T17:24:24Z',
   'url': 'https://www.youtube.com/watch?v=n8WxkqMRgS4'},
  {'videoId': 'Ko02mUWhItA',
   'title': 'raindrops aren&#39;t shaped like raindrops',
   'description': 'water breaks up into droplets - this is what it would be like to fall with raindrops.',
   'published_at': '2023-02-03T19:24:54Z',
   'url': 'https://www.youtube.com/watch?v=Ko02mUWhItA'},
  {'videoId': 'uDeQB9OkoXY',
   'title': 'world&#39;s roundest object',
   'de

In [None]:
# Persist the category -> videos mapping as a JSON file
def save_categorized_videos_metadata(categorized_videos, filename="1-categorized_videos_metadata.json"):
    """Write ``categorized_videos`` to ``filename`` as JSON indented by four spaces.

    Args:
        categorized_videos: JSON-serializable object to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w') as json_file:
        json.dump(categorized_videos, fp=json_file, indent=4)

# Write the categorized metadata to disk; the file name is spelled out so it visibly matches the download below
save_categorized_videos_metadata(categorized_videos, filename='1-categorized_videos_metadata.json')

# Download the JSON file that was just written
files.download('1-categorized_videos_metadata.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>