<a href="https://colab.research.google.com/github/Adeel-CS/howtodonwloadYoutubeVideo/blob/main/WatchToWork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytube openai-whisper moviepy
!pip install pytube openai-whisper transformers




In [None]:
import os
import re
import subprocess
import sys

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Ensure required packages are installed
def install_required_packages():
    required_packages = ["pytube", "whisper", "moviepy"]
    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            install_package(package)

install_required_packages()

# Now import the required packages
from pytube import YouTube
import whisper

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url)

# Function to download audio from YouTube
def download_audio(video_url, download_path):
    try:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        audio_file = audio_stream.download(output_path=download_path)
        print("Audio download complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file, language="hi"):
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_file, language=language)
        print("Transcription complete!")
        formatted_text = result["text"].replace(". ", ".\n")
        return formatted_text
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    sanitized_url = sanitize_filename(video_url)

    download_path = "./audio"
    os.makedirs(download_path, exist_ok=True)

    audio_file = download_audio(video_url, download_path)

    if audio_file:
        # Transcribe in Hindi
        transcription_hi = transcribe_audio(audio_file, language="hi")

        if transcription_hi:
            transcription_file_hi = f"transcription_{sanitized_url}_hi.txt"
            with open(transcription_file_hi, "w") as f:
                f.write(transcription_hi)
            print(f"Hindi transcription saved to {transcription_file_hi}")

        # Transcribe in Urdu
        transcription_ur = transcribe_audio(audio_file, language="ur")

        if transcription_ur:
            transcription_file_ur = f"transcription_{sanitized_url}_ur.txt"
            with open(transcription_file_ur, "w") as f:
                f.write(transcription_ur)
            print(f"Urdu transcription saved to {transcription_file_ur}")


Enter the video URL: https://youtu.be/uTSGHBsvplg?si=0fHP70mf1VYMFPBH
Audio download complete!


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 167MiB/s]


Transcription complete!
Hindi transcription saved to transcription_https_youtu_be_uTSGHBsvplg_si_0fHP70mf1VYMFPBH_hi.txt
Transcription complete!
Urdu transcription saved to transcription_https_youtu_be_uTSGHBsvplg_si_0fHP70mf1VYMFPBH_ur.txt


SyntaxError: invalid syntax (<ipython-input-4-62fa0ddf8b60>, line 2)

**Now this is where things get messy**

In [None]:
import os
import re
import subprocess
import sys
from pytube import YouTube
import whisper
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Ensure required packages are installed
def install_required_packages():
    required_packages = ["pytube", "whisper", "transformers"]
    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            install_package(package)

install_required_packages()

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url.split("/")[-1])

# Function to download audio from YouTube
def download_audio(video_url, download_path):
    try:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        audio_file = audio_stream.download(output_path=download_path)
        print("Audio download complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file, language="hi"):
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_file, language=language)
        print("Transcription complete!")
        formatted_text = result["text"].replace(". ", ".\n")
        return formatted_text
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to summarize text using BART model
def summarize_text(text):
    try:
        model_name = "facebook/bart-large-cnn"
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

        input_ids = tokenizer(text, return_tensors="pt").input_ids
        summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print("Summarization complete!")
        return summary
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None

# Function to translate text to English
def translate_to_english(text):
    try:
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
        result = translator(text)
        translated_text = result[0]['translation_text']
        print("Translation to English complete!")
        return translated_text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return None

if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    sanitized_url = sanitize_filename(video_url)

    # Define paths for saving audio and transcripts
    download_path = "./audio"
    transcripts_path = "./transcripts"
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(transcripts_path, exist_ok=True)

    audio_file = download_audio(video_url, download_path)

    if audio_file:
        # Transcribe in Hindi
        transcription_hi = transcribe_audio(audio_file, language="hi")
        if transcription_hi:
            transcription_file_hi = os.path.join(transcripts_path, f"{sanitized_url}_hi.txt")
            with open(transcription_file_hi, "w") as f:
                f.write(transcription_hi)
            print(f"Hindi transcription saved to {transcription_file_hi}")

            # Summarize Hindi transcription
            summary_hi = summarize_text(transcription_hi)
            if summary_hi:
                summary_file_hi = os.path.join(transcripts_path, f"{sanitized_url}_hi_summary.txt")
                with open(summary_file_hi, "w") as f:
                    f.write(summary_hi)
                print(f"Hindi summary saved to {summary_file_hi}")

        # Transcribe in Urdu
        transcription_ur = transcribe_audio(audio_file, language="ur")
        if transcription_ur:
            transcription_file_ur = os.path.join(transcripts_path, f"{sanitized_url}_ur.txt")
            with open(transcription_file_ur, "w") as f:
                f.write(transcription_ur)
            print(f"Urdu transcription saved to {transcription_file_ur}")

            # Summarize Urdu transcription
            summary_ur = summarize_text(transcription_ur)
            if summary_ur:
                summary_file_ur = os.path.join(transcripts_path, f"{sanitized_url}_ur_summary.txt")
                with open(summary_file_ur, "w") as f:
                    f.write(summary_ur)
                print(f"Urdu summary saved to {summary_file_ur}")

        # Transcribe in English (or translate to English if not in English)
        transcription_en = transcribe_audio(audio_file, language="en")
        if transcription_en:
            transcription_file_en = os.path.join(transcripts_path, f"{sanitized_url}_en.txt")
            with open(transcription_file_en, "w") as f:
                f.write(transcription_en)
            print(f"English transcription saved to {transcription_file_en}")

            # Summarize English transcription
            summary_en = summarize_text(transcription_en)
            if summary_en:
                summary_file_en = os.path.join(transcripts_path, f"{sanitized_url}_en_summary.txt")
                with open(summary_file_en, "w") as f:
                    f.write(summary_en)
                print(f"English summary saved to {summary_file_en}")
        else:
            # Translate Hindi or Urdu transcription to English
            transcription_hi = transcribe_audio(audio_file, language="hi")
            if transcription_hi:
                transcription_en = translate_to_english(transcription_hi)
            else:
                transcription_ur = transcribe_audio(audio_file, language="ur")
                if transcription_ur:
                    transcription_en = translate_to_english(transcription_ur)

            if transcription_en:
                transcription_file_en = os.path.join(transcripts_path, f"{sanitized_url}_translated_en.txt")
                with open(transcription_file_en, "w") as f:
                    f.write(transcription_en)
                print(f"Translated English transcription saved to {transcription_file_en}")

                # Summarize translated English transcription
                summary_en = summarize_text(transcription_en)
                if summary_en:
                    summary_file_en = os.path.join(transcripts_path, f"{sanitized_url}_translated_en_summary.txt")
                    with open(summary_file_en, "w") as f:
                        f.write(summary_en)
                    print(f"Translated English summary saved to {summary_file_en}")


Enter the video URL: https://youtu.be/uTSGHBsvplg?si=0fHP70mf1VYMFPBH
Audio download complete!
Transcription complete!
Hindi transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_hi.txt
An error occurred during summarization: index out of range in self
Transcription complete!
Urdu transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_ur.txt
An error occurred during summarization: index out of range in self
Transcription complete!
English transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_en.txt
An error occurred during summarization: index out of range in self


**Now More messy
**

In [None]:
import os
import re
import subprocess
import sys
from pytube import YouTube
import whisper
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Ensure required packages are installed
def install_required_packages():
    required_packages = ["pytube", "whisper", "transformers"]
    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            install_package(package)

install_required_packages()

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url.split("/")[-1])

# Function to download audio from YouTube
def download_audio(video_url, download_path):
    try:
        yt = YouTube(video_url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        audio_file = audio_stream.download(output_path=download_path)
        print("Audio download complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file, language="hi"):
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_file, language=language)
        print("Transcription complete!")
        formatted_text = result["text"].replace(". ", ".\n")
        return formatted_text
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to summarize text using BART model
def summarize_text(text):
    try:
        model_name = "facebook/bart-large-cnn"
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

        input_ids = tokenizer(text, return_tensors="pt").input_ids
        summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print("Summarization complete!")
        return summary
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None

# Function to translate text to English
def translate_to_english(text):
    try:
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
        result = translator(text)
        translated_text = result[0]['translation_text']
        print("Translation to English complete!")
        return translated_text
    except Exception as e:
        print(f"An error occurred during translation: {e}")
        return None

if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    sanitized_url = sanitize_filename(video_url)

    # Define paths for saving audio and transcripts
    download_path = "./audio"
    transcripts_path = "./transcripts"
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(transcripts_path, exist_ok=True)

    audio_file = download_audio(video_url, download_path)

    if audio_file:
        # Transcribe in Hindi
        transcription_hi = transcribe_audio(audio_file, language="hi")
        if transcription_hi:
            # Save Hindi transcription in native script
            transcription_file_hi = os.path.join(transcripts_path, f"{sanitized_url}_hi.txt")
            with open(transcription_file_hi, "w") as f:
                f.write(transcription_hi)
            print(f"Hindi transcription saved to {transcription_file_hi}")

            # Summarize Hindi transcription
            summary_hi = summarize_text(transcription_hi)
            if summary_hi:
                summary_file_hi = os.path.join(transcripts_path, f"{sanitized_url}_hi_summary.txt")
                with open(summary_file_hi, "w") as f:
                    f.write(summary_hi)
                print(f"Hindi summary saved to {summary_file_hi}")

        # Transcribe in Urdu
        transcription_ur = transcribe_audio(audio_file, language="ur")
        if transcription_ur:
            # Save Urdu transcription
            transcription_file_ur = os.path.join(transcripts_path, f"{sanitized_url}_ur.txt")
            with open(transcription_file_ur, "w") as f:
                f.write(transcription_ur)
            print(f"Urdu transcription saved to {transcription_file_ur}")

            # Summarize Urdu transcription
            summary_ur = summarize_text(transcription_ur)
            if summary_ur:
                summary_file_ur = os.path.join(transcripts_path, f"{sanitized_url}_ur_summary.txt")
                with open(summary_file_ur, "w") as f:
                    f.write(summary_ur)
                print(f"Urdu summary saved to {summary_file_ur}")

        # Transcribe in English (or translate to English if not in English)
        transcription_en = transcribe_audio(audio_file, language="en")
        if transcription_en:
            # Save English transcription
            transcription_file_en = os.path.join(transcripts_path, f"{sanitized_url}_en.txt")
            with open(transcription_file_en, "w") as f:
                f.write(transcription_en)
            print(f"English transcription saved to {transcription_file_en}")

            # Summarize English transcription
            summary_en = summarize_text(transcription_en)
            if summary_en:
                summary_file_en = os.path.join(transcripts_path, f"{sanitized_url}_en_summary.txt")
                with open(summary_file_en, "w") as f:
                    f.write(summary_en)
                print(f"English summary saved to {summary_file_en}")
        else:
            # Translate Hindi or Urdu transcription to English
            transcription_hi = transcribe_audio(audio_file, language="hi")
            if transcription_hi:
                transcription_en = translate_to_english(transcription_hi)
            else:
                transcription_ur = transcribe_audio(audio_file, language="ur")
                if transcription_ur:
                    transcription_en = translate_to_english(transcription_ur)

            if transcription_en:
                transcription_file_en = os.path.join(transcripts_path, f"{sanitized_url}_translated_en.txt")
                with open(transcription_file_en, "w") as f:
                    f.write(transcription_en)
                print(f"Translated English transcription saved to {transcription_file_en}")

                # Summarize translated English transcription
                summary_en = summarize_text(transcription_en)
                if summary_en:
                    summary_file_en = os.path.join(transcripts_path, f"{sanitized_url}_translated_en_summary.txt")
                    with open(summary_file_en, "w") as f:
                        f.write(summary_en)
                    print(f"Translated English summary saved to {summary_file_en}")


Enter the video URL: https://youtu.be/uTSGHBsvplg?si=0fHP70mf1VYMFPBH
Audio download complete!
Transcription complete!
Hindi transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_hi.txt
An error occurred during summarization: index out of range in self
Transcription complete!
Urdu transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_ur.txt
An error occurred during summarization: index out of range in self
Transcription complete!
English transcription saved to ./transcripts/uTSGHBsvplg_si_0fHP70mf1VYMFPBH_en.txt
An error occurred during summarization: index out of range in self


THis one is the seperate one


In [None]:
!pip install pytube


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [None]:
import os
import re
import subprocess
import sys
from pytube import YouTube
import moviepy.editor as mp
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from keybert import KeyBERT
import requests

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Install required packages if not already installed
required_packages = ["pytube", "moviepy", "transformers", "keybert", "requests"]
for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        install_package(package)

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url)

# Function to download video from YouTube
def download_video(video_url, download_path):
    try:
        yt = YouTube(video_url)
        stream = yt.streams.get_highest_resolution()
        video_file = stream.download(output_path=download_path)
        print("Download complete!")
        return video_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to extract audio from video
def extract_audio(video_file, audio_file):
    try:
        video = mp.VideoFileClip(video_file)
        video.audio.write_audiofile(audio_file)
        print("Audio extraction complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred during audio extraction: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file):
    try:
        url = "https://api-inference.huggingface.co/models/asapp/denoiseASR"
        headers = {"Authorization": "Bearer api_DmPySgMTSxQJgFkgilHjLjUlQQRpUzAqIC"}
        files = {"audio_file": open(audio_file, "rb")}
        response = requests.post(url, headers=headers, files=files)

        if response.status_code == 200:
            result = response.json()
            transcription = result["transcription"]
            print("Transcription complete!")
            formatted_text = transcription.replace(". ", ".\n")
            return formatted_text
        else:
            print(f"Transcription request failed with status code {response.status_code}")
            print(response.text)  # Print response content for further debugging
            return None
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to extract key topics using KeyBERT
def extract_key_topics(text, num_topics=10):
    try:
        kw_model = KeyBERT()
        key_topics = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_topics)
        topics = [keyword for keyword, _ in key_topics]
        print("Key topics extraction complete!")
        return topics
    except Exception as e:
        print(f"An error occurred during key topics extraction: {e}")
        return None

# Function to summarize text and generate short answers using transformers pipeline and BERT
def summarize_and_generate_answers(text, max_chunk_length=1024):
    try:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
        tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

        # Split text into chunks for summarization and question answering
        sentences = text.split('. ')
        current_chunk = []
        chunks = []

        for sentence in sentences:
            if len(current_chunk) + len(sentence.split()) <= max_chunk_length:
                current_chunk.append(sentence)
            else:
                chunks.append('. '.join(current_chunk) + '.')
                current_chunk = [sentence]

        if current_chunk:
            chunks.append('. '.join(current_chunk) + '.')

        summary = []
        answers = []

        for chunk in chunks:
            # Summarization
            chunk_summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
            summary.append(chunk_summary[0]['summary_text'])

            # Question answering
            qa_inputs = tokenizer(chunk, return_tensors="pt")
            with torch.no_grad():
                qa_outputs = model(**qa_inputs)
            answer_start = torch.argmax(qa_outputs.start_logits)
            answer_end = torch.argmax(qa_outputs.end_logits) + 1
            answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(qa_inputs["input_ids"][0][answer_start:answer_end]))
            answers.append(answer)

        combined_summary = ' '.join(summary)
        print("Summarization and answer generation complete!")
        return combined_summary, answers

    except Exception as e:
        print(f"An error occurred during summarization or QA: {e}")
        return None, None

# Main function
if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    sanitized_url = sanitize_filename(video_url)

    download_path = "./downloads"
    audio_path = "./audio"
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(audio_path, exist_ok=True)

    video_file = download_video(video_url, download_path)

    if video_file:
        audio_file = os.path.join(audio_path, f"audio_{sanitized_url}.mp3")
        audio_file = extract_audio(video_file, audio_file)

        if audio_file:
            transcription = transcribe_audio(audio_file)

            if transcription:
                transcription_file = f"transcription_{sanitized_url}.txt"
                with open(transcription_file, "w") as f:
                    f.write(transcription)
                print(f"Transcription saved to {transcription_file}")

                key_topics = extract_key_topics(transcription)

                if key_topics:
                    key_topics_file = f"key_topics_{sanitized_url}.txt"
                    with open(key_topics_file, "w") as f:
                        f.write("\n".join(key_topics))
                    print(f"Key topics saved to {key_topics_file}")

                    summary, answers = summarize_and_generate_answers(transcription)

                    if summary:
                        summary_file = f"summary_{sanitized_url}.txt"
                        with open(summary_file, "w") as f:
                            f.write(summary)
                        print(f"Summary saved to {summary_file}")

                        # Save answers to a file
                        answers_file = f"answers_{sanitized_url}.txt"
                        with open(answers_file, "w") as f:
                            for i, answer in enumerate(answers):
                                f.write(f"Question {i+1}: {chunks[i]}\nAnswer: {answer}\n\n")
                        print(f"Answers saved to {answers_file}")


Enter the video URL: https://youtu.be/uTSGHBsvplg?si=0fHP70mf1VYMFPBH
Download complete!
MoviePy - Writing audio in ./audio/audio_https_youtu_be_uTSGHBsvplg_si_0fHP70mf1VYMFPBH.mp3




MoviePy - Done.
Audio extraction complete!
Transcription request failed with status code 400
{"error":"Authorization header is correct, but the token seems invalid"}


In [None]:
!pip install keybert


Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Usi

sdfsdg

In [10]:
import os
import re
from pytube import YouTube
import whisper
import moviepy.editor as mp
from transformers import pipeline
from keybert import KeyBERT

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Install required packages if not already installed
required_packages = ["pytube", "whisper", "moviepy", "transformers", "keybert", "huggingface_hub"]
for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        install_package(package)

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url)

# Function to download video from YouTube
def download_video(video_url, download_path):
    try:
        yt = YouTube(video_url)
        stream = yt.streams.get_highest_resolution()
        video_file = stream.download(output_path=download_path)
        print("Download complete!")
        return video_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to extract audio from video
def extract_audio(video_file, audio_file):
    try:
        video = mp.VideoFileClip(video_file)
        video.audio.write_audiofile(audio_file)
        print("Audio extraction complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred during audio extraction: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file):
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_file)
        print("Transcription complete!")
        formatted_text = result["text"].replace(". ", ".\n")
        return formatted_text
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to extract key topics using KeyBERT
def extract_key_topics(text, num_topics=10):
    try:
        kw_model = KeyBERT()
        key_topics = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_topics)
        topics = [keyword for keyword, _ in key_topics]
        print("Key topics extraction complete!")
        return topics
    except Exception as e:
        print(f"An error occurred during key topics extraction: {e}")
        return None

# Function to summarize text using transformers pipeline
def summarize_text(text, max_chunk_length=1024):
    try:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        sentences = text.split('. ')
        current_chunk = []
        chunks = []

        for sentence in sentences:
            if len(current_chunk) + len(sentence.split()) <= max_chunk_length:
                current_chunk.append(sentence)
            else:
                chunks.append('. '.join(current_chunk) + '.')
                current_chunk = [sentence]

        if current_chunk:
            chunks.append('. '.join(current_chunk) + '.')

        summary = []
        for chunk in chunks:
            chunk_length = len(chunk.split())
            max_new_tokens = min(512, chunk_length + 100)
            min_length = min(150, chunk_length // 2)
            chunk_summary = summarizer(chunk, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)
            summary.append(chunk_summary[0]['summary_text'])

        combined_summary = ' '.join(summary)
        print("Summarization complete!")
        return combined_summary
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None


#To remove the warning Your min_length=150 must be inferior than your max_length=142. but we will be getting smaller summary need to adjust this thing
# def summarize_text(text, max_chunk_length=1024):
#     try:
#         summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

#         sentences = text.split('. ')
#         current_chunk = []
#         chunks = []

#         for sentence in sentences:
#             if len(current_chunk) + len(sentence.split()) <= max_chunk_length:
#                 current_chunk.append(sentence)
#             else:
#                 chunks.append('. '.join(current_chunk) + '.')
#                 current_chunk = [sentence]

#         if current_chunk:
#             chunks.append('. '.join(current_chunk) + '.')

#         summary = []
#         for chunk in chunks:
#             chunk_length = len(chunk.split())
#             max_new_tokens = min(150, chunk_length + 20)  # Slightly increase max_new_tokens to handle edge cases
#             min_length = min(30, chunk_length // 2)
#             chunk_summary = summarizer(chunk, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)
#             summary.append(chunk_summary[0]['summary_text'])

#         combined_summary = ' '.join(summary)
#         print("Summarization complete!")
#         return combined_summary
#     except Exception as e:
#         print(f"An error occurred during summarization: {e}")
#         return None


ModuleNotFoundError: No module named 'pytube'

In [9]:

# Main function
if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    sanitized_url = sanitize_filename(video_url)

    download_path = "./downloads"
    audio_path = "./audio"
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(audio_path, exist_ok=True)

    video_file = download_video(video_url, download_path)

    if video_file:
        audio_file = os.path.join(audio_path, f"audio_{sanitized_url}.mp3")
        audio_file = extract_audio(video_file, audio_file)

        if audio_file:
            transcription = transcribe_audio(audio_file)

            if transcription:
                transcription_file = f"transcription_{sanitized_url}.txt"
                with open(transcription_file, "w") as f:
                    f.write(transcription)
                print(f"Transcription saved to {transcription_file}")

                key_topics = extract_key_topics(transcription)

                if key_topics:
                    key_topics_file = f"key_topics_{sanitized_url}.txt"
                    with open(key_topics_file, "w") as f:
                        f.write("\n".join(key_topics))
                    print(f"Key topics saved to {key_topics_file}")

                summary = summarize_text(transcription)

                if summary:
                    summary_file = f"summary_{sanitized_url}.txt"
                    with open(summary_file, "w") as f:
                        f.write(summary)
                    print(f"Summary saved to {summary_file}")


Enter the video URL: https://youtu.be/uTSGHBsvplg?si=uCghSuReTSbQcWdK


NameError: name 'sanitize_filename' is not defined

In [13]:
import os
import re
import subprocess
import sys
import torch
from transformers import pipeline
import requests
from pytube import YouTube

# Function to check and install packages
def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Function to check and install required packages
def install_packages():
    required_packages = ["pytube", "moviepy", "transformers", "keybert", "requests"]
    for package in required_packages:
        try:
            __import__(package)
        except ImportError:
            install_package(package)

# Function to process video after installing required packages
def process_video_after_installation(video_url):
    try:
        # Ensure required packages are installed
        install_packages()

        # Import remaining modules after installation
        from keybert import KeyBERT
        import whisper
        import moviepy.editor as mp

        # Sanitize video URL for safe filename
        sanitized_url = sanitize_filename(video_url)

        # Directories for downloads and audio
        download_path = "./downloads"
        audio_path = "./audio"
        os.makedirs(download_path, exist_ok=True)
        os.makedirs(audio_path, exist_ok=True)

        # Download video from YouTube
        video_file = download_video(video_url, download_path)
        if not video_file:
            print("Video download failed. Exiting.")
            return

        # Extract audio from video
        audio_file = os.path.join(audio_path, f"audio_{sanitized_url}.mp3")
        audio_file = extract_audio(video_file, audio_file)
        if not audio_file:
            print("Audio extraction failed. Exiting.")
            return

        # Transcribe audio
        transcription = transcribe_audio(audio_file)
        if not transcription:
            print("Transcription failed. Exiting.")
            return

        # Save transcription to file
        transcription_file = f"transcription_{sanitized_url}.txt"
        with open(transcription_file, "w") as f:
            f.write(transcription)
        print(f"Transcription saved to {transcription_file}")

        # Extract key topics from transcription
        key_topics = extract_key_topics(transcription)
        if not key_topics:
            print("Key topics extraction failed. Exiting.")
            return

        # Save key topics to file
        key_topics_file = f"key_topics_{sanitized_url}.txt"
        with open(key_topics_file, "w") as f:
            f.write("\n".join(key_topics))
        print(f"Key topics saved to {key_topics_file}")

        # Summarize transcription
        summary = summarize_text(transcription)
        if not summary:
            print("Summarization failed. Exiting.")
            return

        # Save summary to file
        summary_file = f"summary_{sanitized_url}.txt"
        with open(summary_file, "w") as f:
            f.write(summary)
        print(f"Summary saved to {summary_file}")

    except Exception as e:
        print(f"An error occurred during video processing: {e}")

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    return re.sub(r'\W+', '_', url)

# Function to download video from YouTube
def download_video(video_url, download_path):
    try:
        yt = YouTube(video_url)
        stream = yt.streams.get_highest_resolution()
        video_file = stream.download(output_path=download_path)
        print("Download complete!")
        return video_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to extract audio from video
def extract_audio(video_file, audio_file):
    try:
        video = mp.VideoFileClip(video_file)
        video.audio.write_audiofile(audio_file)
        print("Audio extraction complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred during audio extraction: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file):
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_file)
        print("Transcription complete!")
        formatted_text = result["text"].replace(". ", ".\n")
        return formatted_text
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to extract key topics using KeyBERT
def extract_key_topics(text, num_topics=10):
    try:
        kw_model = KeyBERT()
        key_topics = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_topics)
        topics = [keyword for keyword, _ in key_topics]
        print("Key topics extraction complete!")
        return topics
    except Exception as e:
        print(f"An error occurred during key topics extraction: {e}")
        return None

# Function to summarize text using transformers pipeline
def summarize_text(text, max_chunk_length=1024):
    try:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        sentences = text.split('. ')
        current_chunk = []
        chunks = []

        for sentence in sentences:
            if len(current_chunk) + len(sentence.split()) <= max_chunk_length:
                current_chunk.append(sentence)
            else:
                chunks.append('. '.join(current_chunk) + '.')
                current_chunk = [sentence]

        if current_chunk:
            chunks.append('. '.join(current_chunk) + '.')

        summary = []
        for chunk in chunks:
            chunk_length = len(chunk.split())
            max_new_tokens = min(150, chunk_length + 20)  # Slightly increase max_new_tokens to handle edge cases
            min_length = min(30, chunk_length // 2)
            chunk_summary = summarizer(chunk, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)
            summary.append(chunk_summary[0]['summary_text'])

        combined_summary = ' '.join(summary)
        print("Summarization complete!")
        return combined_summary
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None

# Main function (example usage)
if __name__ == "__main__":
    video_url = input("Enter the video URL: ")
    process_video_after_installation(video_url)


ModuleNotFoundError: No module named 'pytube'

In [1]:
! pip install git+https://github.com/huggingface/transformers -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
from transformers import pipeline

In [4]:
whisper = pipeline('automatic-speech-recognition',model= 'openai/whisper-medium',device=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [15]:
!pip install pytube


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [19]:
from pytube import YouTube
import moviepy.editor as mp
import os

def download_youtube_audio(video_url, output_dir, output_name):
    try:
        yt = YouTube(video_url)
        stream = yt.streams.filter(only_audio=True).first()
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        audio_file = stream.download(output_path=output_dir, filename=output_name)
        print("Download complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage:
video_url = "https://youtu.be/uTSGHBsvplg?si=uCghSuReTSbQcWdK"
output_dir = "./audio"
output_name = "testing"
audio_file = download_youtube_audio(video_url, output_dir, output_name)

if audio_file:
    print(f"Audio saved to: {audio_file}")
else:
    print("Audio download failed.")


Download complete!
Audio saved to: /content/./audio/testing


In [20]:
from IPython.display import Audio, display

# Specify the path to your audio file
audio_path = '/content/audio/testing.mp3'

# Display the audio file
display(Audio(audio_path, autoplay=True))


ValueError: rate must be specified when data is a numpy array or list of audio samples.