<a href="https://colab.research.google.com/github/DivyaShreeK-dev/sdc/blob/main/youtube_video_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries for Hugging Face model
!pip install -q \
    transformers \
    langchain==0.1.8 \
    langchain-core==0.1.33 \
    langchain-community==0.0.26 \
    youtube-transcript-api==0.4.0 \
    faiss-cpu==1.7.2 \
    tiktoken==0.1.8


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement tiktoken==0.1.8 (from versions: 0.1.1, 0.1.2, 0.2.0, 0.3.0, 0.3.1, 0.3.2, 0.3.3, 0.4.0, 0.5.0, 0.5.1, 0.5.2, 0.6.0, 0.7.0, 0.8.0, 0.9.0)[0m[31m
[0m[31mERROR: No matching distribution found for tiktoken==0.1.8[0m[31m
[0m

In [2]:
# Install required libraries for Hugging Face model and Langchain without the specific tiktoken version
!pip install -q \
    transformers \
    langchain==0.1.8 \
    langchain-core==0.1.33 \
    langchain-community==0.0.26 \
    youtube-transcript-api==0.4.0 \
    faiss-cpu==1.7.2 \
    tiktoken \
    numpy==1.26.4 \
    packaging==24.2 \
    google-cloud-bigquery==3.31.0 \
    thinc==8.3.6


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Cannot install langchain-core==0.1.33 and packaging==24.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [3]:
!pip install langchain faiss-cpu youtube-transcript-api sentence-transformers transformers ipywidgets


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nv

In [4]:
import ipywidgets as widgets
from IPython.display import display
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.embeddings.base import Embeddings
import re

# 1. Custom wrapper for SentenceTransformer to work with LangChain
class LocalEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode([text])[0]

# 2. Get transcript from YouTube video
def get_transcript(video_url):
    video_id = re.search(r"(?<=v=)[^&#]+", video_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id.group(0)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = " ".join([entry["text"] for entry in transcript])
    return text

# 3. Split long text into smaller chunks
def split_text_into_docs(text, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

# 4. Create FAISS vector store with local embeddings
def create_vectorstore(docs):
    embedding = LocalEmbedding()
    db = FAISS.from_documents(docs, embedding)
    return db

# 5. Retrieve relevant chunks and summarize
def summarize_retrieved_docs(vectorstore, query="Summarize the video"):
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    combined_text = " ".join([doc.page_content for doc in docs])

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)[0]["summary_text"]
    return summary

# 6. Complete pipeline
def summarize_youtube_video(video_url):
    print("📥 Getting transcript...")
    text = get_transcript(video_url)

    print("✂️ Splitting into chunks...")
    docs = split_text_into_docs(text)

    print("📚 Building vectorstore...")
    vectorstore = create_vectorstore(docs)

    print("🧠 Summarizing...")
    summary = summarize_retrieved_docs(vectorstore)

    return summary

# 7. Interactive Widgets for URL Input
def on_button_click(b):
    video_url = url_textbox.value
    if not video_url:
        print("Please provide a valid URL.")
        return
    print("⌛ Processing the video...")
    summary = summarize_youtube_video(video_url)
    print("\n📋 Summary:\n", summary)

# Create a URL input field
url_textbox = widgets.Text(
    description='YouTube URL:',
    placeholder='Enter YouTube video URL',
    layout=widgets.Layout(width='50%')
)

# Create a button to trigger the summarization
summarize_button = widgets.Button(description="Summarize Video", layout=widgets.Layout(width='20%'))

# Bind button click to the function
summarize_button.on_click(on_button_click)

# Display the input field and button
display(url_textbox, summarize_button)


ModuleNotFoundError: Module langchain_community.vectorstores not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [5]:
import ipywidgets as widgets
from IPython.display import display
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.embeddings.base import Embeddings
import re

# 1. Custom wrapper for SentenceTransformer to work with LangChain
class LocalEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode([text])[0]

# 2. Get transcript from YouTube video
def get_transcript(video_url):
    video_id = re.search(r"(?<=v=)[^&#]+", video_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id.group(0)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = " ".join([entry["text"] for entry in transcript])
    return text

# 3. Split long text into smaller chunks
def split_text_into_docs(text, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

# 4. Create FAISS vector store with local embeddings
def create_vectorstore(docs):
    embedding = LocalEmbedding()
    db = FAISS.from_documents(docs, embedding)
    return db

# 5. Retrieve relevant chunks and summarize
def summarize_retrieved_docs(vectorstore, query="Summarize the video"):
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    combined_text = " ".join([doc.page_content for doc in docs])

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)[0]["summary_text"]
    return summary

# 6. Complete pipeline
def summarize_youtube_video(video_url):
    print("📥 Getting transcript...")
    text = get_transcript(video_url)

    print("✂️ Splitting into chunks...")
    docs = split_text_into_docs(text)

    print("📚 Building vectorstore...")
    vectorstore = create_vectorstore(docs)

    print("🧠 Summarizing...")
    summary = summarize_retrieved_docs(vectorstore)

    return summary

# 7. Interactive Widgets for URL Input
def on_button_click(b):
    video_url = url_textbox.value
    if not video_url:
        print("Please provide a valid URL.")
        return
    print("⌛ Processing the video...")
    summary = summarize_youtube_video(video_url)
    print("\n📋 Summary:\n", summary)

# Create a URL input field
url_textbox = widgets.Text(
    description='YouTube URL:',
    placeholder='Enter YouTube video URL',
    layout=widgets.Layout(width='50%')
)

# Create a button to trigger the summarization
summarize_button = widgets.Button(description="Summarize Video", layout=widgets.Layout(width='20%'))

# Bind button click to the function
summarize_button.on_click(on_button_click)

# Display the input field and button
display(url_textbox, summarize_button)


ModuleNotFoundError: Module langchain_community.vectorstores not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [6]:
!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 k

In [7]:
!pip show langchain-community


Name: langchain-community
Version: 0.3.21
Summary: Community contributed LangChain integrations.
Home-page: 
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, dataclasses-json, httpx-sse, langchain, langchain-core, langsmith, numpy, pydantic-settings, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [8]:
!pip install -q \
    langchain==0.1.8 \
    langchain-core==0.1.26 \
    langchain-community==0.0.26 \
    openai==0.27.0 \
    youtube-transcript-api==0.4.0 \
    faiss-cpu==1.7.2 \
    tiktoken==0.9.0 \
    numpy==1.26.4 \
    packaging==23.2 \
    google-cloud-bigquery==3.31.0 \
    thinc==8.3.6


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[31mERROR: Cannot install langchain-community==0.0.26, langchain-core==0.1.26 and langchain==0.1.8 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [9]:
!pip install langchain faiss-cpu youtube-transcript-api sentence-transformers transformers ipywidgets




In [11]:
import ipywidgets as widgets
from IPython.display import display
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.embeddings.base import Embeddings
import re

# 1. Custom wrapper for SentenceTransformer to work with LangChain
class LocalEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode([text])[0]

# 2. Get transcript from YouTube video
def get_transcript(video_url):
    video_id = re.search(r"(?<=v=)[^&#]+", video_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id.group(0)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = " ".join([entry["text"] for entry in transcript])
    return text

# 3. Split long text into smaller chunks
def split_text_into_docs(text, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

# 4. Create FAISS vector store with local embeddings
def create_vectorstore(docs):
    embedding = LocalEmbedding()
    db = FAISS.from_documents(docs, embedding)
    return db

# 5. Retrieve relevant chunks and summarize
def summarize_retrieved_docs(vectorstore, query="Summarize the video"):
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    combined_text = " ".join([doc.page_content for doc in docs])

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)[0]["summary_text"]
    return summary

# 6. Complete pipeline
def summarize_youtube_video(video_url):
    print("📥 Getting transcript...")
    text = get_transcript(video_url)

    print("✂️ Splitting into chunks...")
    docs = split_text_into_docs(text)

    print("📚 Building vectorstore...")
    vectorstore = create_vectorstore(docs)

    print("🧠 Summarizing...")
    summary = summarize_retrieved_docs(vectorstore)

    return summary

# 7. Interactive Widgets for URL Input
def on_button_click(b):
    video_url = url_textbox.value
    if not video_url:
        print("Please provide a valid URL.")
        return
    print("⌛ Processing the video...")
    summary = summarize_youtube_video(video_url)
    print("\n📋 Summary:\n", summary)

# Create a URL input field
url_textbox = widgets.Text(
    description='YouTube URL:',
    placeholder='Enter YouTube video URL',
    layout=widgets.Layout(width='50%')
)

# Create a button to trigger the summarization
summarize_button = widgets.Button(description="Summarize Video", layout=widgets.Layout(width='20%'))

# Bind button click to the function
summarize_button.on_click(on_button_click)

# Display the input field and button
display(url_textbox, summarize_button)


Text(value='', description='YouTube URL:', layout=Layout(width='50%'), placeholder='Enter YouTube video URL')

Button(description='Summarize Video', layout=Layout(width='20%'), style=ButtonStyle())

⌛ Processing the video...
📥 Getting transcript...
✂️ Splitting into chunks...
📚 Building vectorstore...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🧠 Summarizing...


  docs = retriever.get_relevant_documents(query)


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 300, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)



📋 Summary:
 Stay tuned for the next episode of "Artificial Intelligence" on CNN.com. Follow us on Twitter @ArtificialIntelligence and @CNNArtificial on Facebook and YouTube. For the latest from "Artic Intelligence," visit CNN.COM/ArticIntelligence.
