# ✨ **YouTube Transcript API** ✨

Install the following package: https://pypi.org/project/youtube-transcript-api/

In [2]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3


In [3]:
from transformers import pipeline
from youtube_transcript_api import YouTubeTranscriptApi
import re
from tqdm.notebook import tqdm
import textwrap

**HuggingFace**: https://huggingface.co/sshleifer/distilbart-cnn-12-6

**ModelName**: sshleifer/distilbart-cnn-12-6

In [4]:
class YouTubeSummarizer:
  def __init__(self, model_name="sshleifer/distilbart-cnn-12-6", max_chunk_size=1000):

      self.summarizer = pipeline('summarization', model=model_name)
      self.max_chunk_size = max_chunk_size

  def extract_video_id(self, youtube_url):

      # Handle different URL formats (standard, shortened, embedded, etc.)
      patterns = [
          r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',  # Standard and embedded URLs
          r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})',  # Shortened URLs
      ]

      for pattern in patterns:
          match = re.search(pattern, youtube_url)
          if match:
              return match.group(1)

      raise ValueError(f"Could not extract video ID from URL: {youtube_url}")

  def get_transcript(self, video_id):
      try:
          transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
          transcript = ' '.join([item['text'] for item in transcript_list])
          return transcript
      except Exception as e:
          raise Exception(f"Error retrieving transcript: {str(e)}")

  def split_into_chunks(self, text):
      # Split by sentences first (simple approach)
      sentences = re.split(r'(?<=[.!?])\s+', text)
      chunks = []
      current_chunk = ""

      for sentence in sentences:
          # If adding this sentence exceeds the chunk size, start a new chunk
          if len(current_chunk) + len(sentence) > self.max_chunk_size:
              # If the current chunk is not empty, add it to chunks
              if current_chunk:
                  chunks.append(current_chunk.strip())
                  current_chunk = sentence
              else:
                  # If a single sentence is longer than max_chunk_size, force split it
                  chunks.append(sentence[:self.max_chunk_size])
                  current_chunk = sentence[self.max_chunk_size:]
          else:
              current_chunk += " " + sentence

      # Add the last chunk if not empty
      if current_chunk:
          chunks.append(current_chunk.strip())

      return chunks

  def summarize_text(self, text, min_length=30, max_length=150):

      # Split text into chunks that the model can handle
      chunks = self.split_into_chunks(text)

      # Summarize each chunk
      summaries = []
      for chunk in tqdm(chunks, desc="Summarizing chunks"):
          # Skip chunks that are too short to summarize meaningfully
          if len(chunk) < 100:
              summaries.append(chunk)
              continue

          summary = self.summarizer(chunk,
                                    max_length=max_length,
                                    min_length=min_length,
                                    do_sample=False)
          summaries.append(summary[0]['summary_text'])

      # Combine the summaries
      full_summary = ' '.join(summaries)

      # For very long texts with many chunks, we might want to summarize again
      if len(chunks) > 3:
          # Re-summarize if the combined summary is still long
          if len(full_summary) > self.max_chunk_size:
              chunks = self.split_into_chunks(full_summary)
              second_level_summaries = []

              for chunk in tqdm(chunks, desc="Creating final summary"):
                  summary = self.summarizer(chunk,
                                          max_length=max_length,
                                          min_length=min_length,
                                          do_sample=False)
                  second_level_summaries.append(summary[0]['summary_text'])

              full_summary = ' '.join(second_level_summaries)

      return full_summary

  def summarize_youtube_video(self, youtube_url, min_length=30, max_length=150):

      # Extract the video ID
      video_id = self.extract_video_id(youtube_url)

      # Get the transcript
      transcript = self.get_transcript(video_id)

      # Summarize the transcript
      summary = self.summarize_text(transcript, min_length, max_length)

      return {
          'video_id': video_id,
          'transcript_length': len(transcript),
          'summary_length': len(summary),
          'compression_ratio': len(summary) / len(transcript) if len(transcript) > 0 else 0,
          'summary': summary
      }

In [6]:
# Initialize our summarizer
youtube_summarizer = YouTubeSummarizer()

Device set to use cuda:0


In [7]:
# Test with a video
video_url = "https://www.youtube.com/watch?v=KbYu85euDvY"

In [9]:
result = youtube_summarizer.summarize_youtube_video(video_url)
result

Summarizing chunks:   0%|          | 0/2 [00:00<?, ?it/s]

{'video_id': 'KbYu85euDvY',
 'transcript_length': 5022,
 'summary_length': 723,
 'compression_ratio': 0.14396654719235363,
 'summary': ' OpenAI launched its advanced AI image generator last week . Users quickly began using it They rendered image after image . They turned everything from memes to selfies into copies of the Japanese studios work . This spurred a global outage of chat GPT over the weekend and forced its top boss to say "Please chill This is insane"  Chat GPT\'s freefor-all Giblly cuteness may come at a cost Tech giants train their models without disclosing details . this data to train its AI models . Your photo could be misused and manipulated It could be manipulated and sold for targeted ads If the data is stolen it could even end up on the dark web And if this sounds scary that\'s because it is None of these risks are unprecedented .'}

In [10]:
# Display the results nicely
print(f"Video ID: {result['video_id']}")
print(f"Transcript length: {result['transcript_length']} characters")
print(f"Summary length: {result['summary_length']} characters")
print(f"Compression ratio: {result['compression_ratio']:.2%}")
print("\nSUMMARY:")

print("=" * 80)

# Wrap the text for nice display
wrapped_summary = textwrap.fill(result['summary'], width=80)
print(wrapped_summary)

Video ID: KbYu85euDvY
Transcript length: 5022 characters
Summary length: 723 characters
Compression ratio: 14.40%

SUMMARY:
 OpenAI launched its advanced AI image generator last week . Users quickly began
using it They rendered image after image . They turned everything from memes to
selfies into copies of the Japanese studios work . This spurred a global outage
of chat GPT over the weekend and forced its top boss to say "Please chill This
is insane"  Chat GPT's freefor-all Giblly cuteness may come at a cost Tech
giants train their models without disclosing details . this data to train its AI
models . Your photo could be misused and manipulated It could be manipulated and
sold for targeted ads If the data is stolen it could even end up on the dark web
And if this sounds scary that's because it is None of these risks are
unprecedented .
