
# Importing Libraries



In [3]:
import re # Regular expressions for extracting YouTube video ID from the URL
import numpy as np # For numerical operations, particularly with arrays

#Installing Gradio for deployment of the app

In [4]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Col

#Installing the YouTube Transcript API for fetching transcripts from YouTube

In [5]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2


In [6]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

#Natural Language ToolKit for text processing

In [7]:
import nltk

#Downloading PUNKT For Sentence Splitting

In [8]:
# 'punkt' is the classic sentence-tokenizer data; recent NLTK releases load
# 'punkt_tab' instead when nltk.sent_tokenize is called, so fetch both to work
# with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#Installing Transformers for the BART model

In [9]:
!pip install transformers



# For TF-IDF text summarization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Function to extract YouTube video ID from a given URL

In [11]:
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    The ID is recognised when it directly follows ``v=`` (watch URLs) or a
    ``/`` (short, embed and shorts URLs) and is exactly 11 characters from
    ``[0-9A-Za-z_-]``.

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string or
        contains no such ID.
    """
    if not isinstance(url, str):
        # re.search would raise TypeError on None / non-text input.
        return None

    # The trailing negative lookahead requires the ID to END after 11
    # characters. Without it, any longer path segment (e.g. a 24-character
    # channel ID or "/subscriptions") would yield its first 11 characters as
    # a bogus video ID.
    video_id_match = re.search(
        r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url
    )
    return video_id_match.group(1) if video_id_match else None



# Main function to summarize YouTube video content

In [12]:
from transformers import BartTokenizer, BartForConditionalGeneration
_BART_CHECKPOINT = 'facebook/bart-large-cnn'
_bart_cache = {}  # filled lazily by _load_bart(); holds the (tokenizer, model) pair


def _load_bart():
    """Return the BART (tokenizer, model) pair, loading it only on first use."""
    if 'pair' not in _bart_cache:
        tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
        model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)
        # Store only after both loads succeed so a failed load is retried next call.
        _bart_cache['pair'] = (tokenizer, model)
    return _bart_cache['pair']


def summarize_youtube_video(link):
    """Summarize the English transcript of a YouTube video with BART.

    Args:
        link: URL of the YouTube video.

    Returns:
        A string: the generated summary on success, otherwise a human-readable
        error message (invalid link, transcripts disabled, transcript fetch
        failure, empty transcript, or any unexpected error). This function
        never raises; every failure is reported through the returned text.

    Notes:
        * The transcript is truncated to the first 1024 tokens before it is
          passed to the model, so only that part of a long transcript is
          summarized.
        * The tokenizer and model are loaded once and reused across calls
          instead of being reloaded on every request.
        * The former TF-IDF sentence-ranking stage is not performed: its
          result was never part of the returned summary, yet a failure in it
          aborted the whole request.
    """
    try:
        video_id = extract_video_id(link)  # Extract the video ID from the link
        if not video_id:  # No valid video ID was found
            return "Invalid YouTube link. Please ensure it's a valid video URL."

        try:  # Attempt to retrieve the English subtitles
            sub = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        except TranscriptsDisabled:
            return "Transcripts are disabled for this video."
        except Exception as e:
            return f"Could not retrieve subtitles: {e}"

        # Concatenate all subtitle lines into a single text block
        subtitle = " ".join(x['text'] for x in sub)
        if not subtitle.strip():
            # Nothing to feed the model; report it instead of generating from empty input.
            return "The transcript for this video is empty, so there is nothing to summarize."

        tokenizer, model = _load_bart()

        # Encode the transcript, cutting it off at 1024 tokens
        input_tensor = tokenizer.encode(subtitle, return_tensors="pt", max_length=1024, truncation=True)
        # NOTE(review): min_length=300 is requested even for very short transcripts;
        # confirm this is intended.
        outputs_tensor = model.generate(
            input_tensor,
            max_length=500,
            min_length=300,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        # Decode the generated tokens back to a readable summary
        return tokenizer.decode(outputs_tensor[0], skip_special_tokens=True)

    except Exception as e:
        # Deliberate catch-all: the caller displays the returned text, so report
        # the failure as a message rather than raising.
        return f"An unexpected error occurred: {str(e)}"



In [13]:
# Gradio app with Blocks layout for enhanced style
import gradio as gr
# Page-level styling and the HTML banner shown at the top of the app.
PAGE_CSS = ".container { background-color: #f8fafc; }"
PAGE_HEADER_HTML = """
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #4A90E2; font-size: 2.5em;">YouTube Video Summarizer 📹</h1>
            <p style="font-size: 1.25em; color: #333;">
                Enter a YouTube video link below to get a concise summary of the video's content.
                <br><b>Quickly capture the essence of any video with AI!</b>
            </p>
        </div>
    """

# Build the UI: banner, then a row holding the link box and the button,
# then a read-only box that receives the summary.
with gr.Blocks(css=PAGE_CSS) as demo:
    gr.Markdown(PAGE_HEADER_HTML)

    with gr.Row():
        # Left column: the YouTube URL entry field
        with gr.Column(scale=1):
            link_input = gr.Textbox(
                label="Enter YouTube Link",
                placeholder="https://www.youtube.com/watch?v=your_video_id",
                lines=1,
                show_label=False,
                container=False,
            )

        # Right column: the button that starts summarization
        with gr.Column(scale=1):
            submit_button = gr.Button("Summarize Video")

    # Non-editable text area where the summary (or an error message) is shown
    summary_output = gr.Textbox(
        label="Video Summary",
        placeholder="The summarized content will appear here...",
        lines=10,
        interactive=False,
        container=True,
    )

    # Clicking the button feeds the link to the summarizer and shows its result
    submit_button.click(
        fn=summarize_youtube_video,
        inputs=link_input,
        outputs=summary_output,
    )


#Launching App On Gradio

In [14]:
# Start serving the `demo` app. When run in Colab, Gradio automatically enables
# sharing and prints a temporary public URL (see the output below).
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://35060711fb90c828be.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


