In [1]:
!pip install youtube_transcript_api transformers

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-0.6.3


In [2]:
import re
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import torch
from typing import Optional, List
import time

In [3]:
# Block 3: Helper Functions
def extract_video_id(youtube_url: str) -> Optional[str]:
    """
    Extract YouTube video ID from URL

    Supported URL shapes:
        youtube.com/watch?v=ID           (v may appear anywhere in the query)
        youtu.be/ID
        youtube.com/embed/ID, /v/ID, /shorts/ID, /live/ID

    Args:
        youtube_url (str): YouTube video URL
    Returns:
        str: Video ID if found, None otherwise
    """
    if not youtube_url:
        return None

    patterns = [
        # Lazily skip any leading "key=value&" pairs so that
        # watch?feature=share&v=ID works and the first v= wins.
        r'youtube\.com/watch\?(?:[^#\s&]+&)*?v=([\w-]+)',
        # The dot is escaped so only the literal host "youtu.be" matches.
        r'youtu\.be/([\w-]+)',
        r'youtube\.com/(?:embed|v|shorts|live)/([\w-]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, youtube_url)
        if match:
            return match.group(1)
    return None

def get_transcript(video_id: str) -> str:
    """
    Download a video's transcript and flatten it into one string.

    Any exception raised while fetching or joining is caught and reported
    through the return value instead of being propagated.

    Args:
        video_id (str): YouTube video ID
    Returns:
        str: The transcript's text entries joined by single spaces, or a
             message starting with "Error fetching transcript: " on failure
    """
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        # Each entry is indexed by its 'text' key; other keys are ignored.
        pieces = [entry['text'] for entry in entries]
        return ' '.join(pieces)
    except Exception as exc:
        return f"Error fetching transcript: {exc}"

In [4]:
# Block 4: Summarization Function
def create_summarizer():
    """
    Build the "summarization" pipeline for facebook/bart-large-cnn.

    The pipeline is placed on device 0 when torch reports CUDA as available,
    and on device -1 otherwise.

    Returns:
        The object returned by transformers.pipeline(...)
    """
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=device_index,
    )

def chunk_text(text: str, chunk_size: int = 1024) -> List[str]:
    """
    Split text into chunks of whole words

    Words are never split. Each chunk is the words joined by single spaces
    and is at most chunk_size characters long, except when a single word is
    itself longer than chunk_size, in which case that word forms a chunk of
    its own. No empty chunks are produced.

    Args:
        text (str): Input text (split on any whitespace)
        chunk_size (int): Maximum chunk length in characters
    Returns:
        List[str]: Chunks in original word order; [] for empty/blank text
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(' '.join(current_chunk))

    for word in text.split():
        # Joining adds one separating space unless the chunk is still empty.
        separator = 1 if current_chunk else 0
        projected_length = current_length + separator + len(word)

        # Only flush a non-empty chunk: an oversized first word must not
        # cause an empty string to be emitted.
        if current_chunk and projected_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = projected_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def summarize_text(text: str, max_length: int = 150, min_length: int = 50) -> str:
    """
    Summarize text using BART model

    The text is split with chunk_text(); every chunk containing at least
    min_length words is summarized separately and the partial summaries are
    joined with single spaces. Chunks with fewer words are skipped.

    Args:
        text (str): Input text
        max_length (int): Maximum summary length (passed to the pipeline)
        min_length (int): Minimum summary length (passed to the pipeline);
                          also the minimum word count for a chunk to be summarized
    Returns:
        str: Generated summary; "" for empty/blank input; the stripped input
             itself when no chunk is long enough to summarize; or a message
             starting with "Error in summarization: " on failure
    """
    # Nothing to summarize: return before loading the (large) model.
    if not text or not text.strip():
        return ""

    try:
        chunks = [
            chunk for chunk in chunk_text(text)
            if len(chunk.split()) >= min_length
        ]
        if not chunks:
            # Input is too short to be summarized; hand it back rather than
            # silently returning an empty summary.
            return text.strip()

        # Create the pipeline only once there is real work for it.
        summarizer = create_summarizer()
        summaries = []

        # The pipeline runs locally, so no delay between chunks is needed.
        for chunk in chunks:
            summary = summarizer(chunk,
                               max_length=max_length,
                               min_length=min_length,
                               do_sample=False)
            summaries.append(summary[0]['summary_text'])

        return ' '.join(summaries)

    except Exception as e:
        return f"Error in summarization: {str(e)}"

In [5]:
# Block 5: Main Processing Function
def process_video(youtube_url: str, max_length: int = 150, min_length: int = 50) -> str:
    """
    Process YouTube video and generate summary

    Runs three steps, printing a progress line before each one: extract the
    video ID, fetch the transcript, summarize the transcript.

    Args:
        youtube_url (str): YouTube video URL
        max_length (int): Maximum summary length
        min_length (int): Minimum summary length
    Returns:
        str: Video summary or error message
    """
    print("Extracting video ID...")
    video_id = extract_video_id(youtube_url)
    if not video_id:
        return "Invalid YouTube URL"

    print("Fetching transcript...")
    transcript = get_transcript(video_id)
    # get_transcript() reports failures as a string with this exact prefix.
    # Matching the full prefix (not just "Error") keeps a genuine transcript
    # that happens to begin with "Error..." from being treated as a failure.
    if transcript.startswith("Error fetching transcript:"):
        return transcript

    print("Generating summary...")
    return summarize_text(transcript, max_length, min_length)

In [7]:
# Block 6: Example Usage (Run this block to test)
def main():
    """
    Ask for a YouTube URL, summarize the video and print the result.

    Pressing Enter without typing anything uses the example URL. The elapsed
    wall-clock time of the processing step is printed before the summary.
    """
    default_url = "https://www.youtube.com/watch?v=gs8qfL9PNac"
    # The argument of input() is the prompt shown to the user, not a default
    # value, so the example URL is applied explicitly when the answer is blank.
    youtube_url = input(f"Enter a YouTube URL [{default_url}]: ").strip() or default_url

    print("\nProcessing video...")
    start_time = time.time()

    summary = process_video(youtube_url)

    processing_time = time.time() - start_time
    print(f"\nProcessing completed in {processing_time:.2f} seconds")
    print("\nSummary:")
    print(summary)

# Run this line to execute the example
if __name__ == "__main__":
    main()


https://www.youtube.com/watch?v=gs8qfL9PNachttps://www.youtube.com/watch?v=gs8qfL9PNac

Processing video...
Extracting video ID...
Fetching transcript...
Generating summary...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu



Processing completed in 356.59 seconds

Summary:
2000 people are about to battle in a series of trials for a spot in my ten episode Prime Video series called Beast Games. The first challenge of these trials is gonna test your strength. Whichever group is the last to lift that 10,000 pound boulder across that finish line is eliminated. "This is for $5 million! We don't want to be the first out" "I'm going to be so for real, I don't think I'm cut out for this" "We have to go backwards. No! That's yellow. Oh, no" Every single one of you will receive $2,000 just for showing up. With a total of 2000 contestants competing, that means we're giving away $4 million throughout Beast Games. Some people chose to go home early, but at least they got to walk away with $1,000. Your chance to win $5 million completely relies on this case either being safe or eliminated. Your goal is to memorize where the safe briefcases are and claim one before someone else does. Think fast and run faster because of 