<a href="https://colab.research.google.com/github/Adeel-CS/howtodonwloadYoutubeVideo/blob/main/Copy_of_FYP_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

finalizing at this point

In [None]:
import os
import re
from pytube import YouTube
import whisper
import moviepy.editor as mp
from transformers import pipeline
from keybert import KeyBERT

# Function to check and install packages
def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this code.

    Args:
        package: Requirement string handed to ``pip install``.

    Returns:
        None. Success or failure is reported on stdout; a failing pip run
        is not re-raised.
    """
    # Imported here because the file's import section does not provide
    # ``subprocess`` or ``sys``; without them this function raised NameError.
    import subprocess
    import sys

    try:
        # ``sys.executable -m pip`` targets the environment of the running
        # interpreter rather than whichever ``pip`` happens to be on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"Installed {package} successfully!")
    except subprocess.CalledProcessError:
        print(f"Error installing {package}. Please check your internet connection or try again later.")

# Install required packages if not already installed
# NOTE: this check only helps when it runs before the third-party imports at
# the top of this cell; those imports fail first if a package is missing.
required_packages = ["pytube", "whisper", "moviepy", "transformers", "keybert", "huggingface_hub"]

# Importable module name -> name to hand to pip, for packages where the two
# differ. The ``whisper`` module used here is distributed as "openai-whisper";
# installing the PyPI project called "whisper" would fetch a different package.
_PIP_NAMES = {"whisper": "openai-whisper"}

for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        install_package(_PIP_NAMES.get(package, package))

# Function to sanitize URLs for filenames
def sanitize_filename(url):
    """Return ``url`` with every run of non-word characters collapsed to one ``_``.

    Word characters (as matched by ``\\w``) are kept unchanged.
    """
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub('_', url)

# Function to download video from YouTube
def download_video(video_url, download_path):
    """Download the highest-resolution stream of ``video_url`` into ``download_path``.

    Args:
        video_url: URL passed to ``YouTube``.
        download_path: Directory passed to the stream's ``download`` call.

    Returns:
        The value returned by the stream's ``download`` call, or ``None`` if
        any exception was raised along the way (the error is printed).
    """
    try:
        best_stream = YouTube(video_url).streams.get_highest_resolution()
        saved_path = best_stream.download(output_path=download_path)
        print("Download complete!")
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return saved_path

# Function to extract audio from video
def extract_audio(video_file, audio_file):
    """Write the audio track of ``video_file`` to ``audio_file``.

    Args:
        video_file: Path of the video to read.
        audio_file: Path of the audio file to create.

    Returns:
        ``audio_file`` on success, or ``None`` if the video has no audio
        track or any exception was raised (the error is printed).
    """
    try:
        video = mp.VideoFileClip(video_file)
        try:
            # ``audio`` is None for clips without a sound track; report that
            # explicitly instead of failing with an opaque AttributeError.
            if video.audio is None:
                print("An error occurred during audio extraction: the video has no audio track.")
                return None
            video.audio.write_audiofile(audio_file)
        finally:
            # Always release the clip's underlying readers, even on failure.
            video.close()
        print("Audio extraction complete!")
        return audio_file
    except Exception as e:
        print(f"An error occurred during audio extraction: {e}")
        return None

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file):
    """Transcribe ``audio_file`` with the Whisper "base" model.

    Returns:
        The transcription text with every ``". "`` replaced by ``".\\n"``,
        or ``None`` if any exception was raised (the error is printed).
    """
    try:
        transcription = whisper.load_model("base").transcribe(audio_file)
        print("Transcription complete!")
        # Break the text after each ". " so sentences land on separate lines.
        return transcription["text"].replace(". ", ".\n")
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        return None

# Function to extract key topics using KeyBERT
def extract_key_topics(text, num_topics=10):
    """Extract up to ``num_topics`` one- or two-word key phrases from ``text``.

    Returns:
        A list with the first element of each pair returned by KeyBERT's
        ``extract_keywords``, or ``None`` if any exception was raised (the
        error is printed).
    """
    try:
        ranked_pairs = KeyBERT().extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=num_topics,
        )
        topics = []
        # Each result is a two-item pair; only the first item is kept.
        for phrase, _ in ranked_pairs:
            topics.append(phrase)
        print("Key topics extraction complete!")
        return topics
    except Exception as e:
        print(f"An error occurred during key topics extraction: {e}")
        return None

# Helper: group the sentences of a text into word-limited chunks
def _chunk_by_word_budget(text, max_words):
    """Split ``text`` into chunks holding at most ``max_words`` words each.

    Sentences are kept whole where possible; a single sentence longer than
    the budget is cut into budget-sized pieces.
    """
    chunks = []
    current_words = []
    # Split after sentence-ending punctuation followed by ANY whitespace, so
    # both ". " and the ".\n" separators written by transcribe_audio count.
    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        words = sentence.split()
        for start in range(0, len(words), max_words):
            piece = words[start:start + max_words]
            # Flush only a non-empty chunk, so no empty chunk is ever emitted.
            if current_words and len(current_words) + len(piece) > max_words:
                chunks.append(' '.join(current_words))
                current_words = []
            current_words.extend(piece)
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks


# Function to summarize text using transformers pipeline
def summarize_text(text, max_chunk_length=1024):
    """Summarize ``text`` chunk by chunk with ``facebook/bart-large-cnn``.

    Args:
        text: Text to summarize.
        max_chunk_length: Maximum number of whitespace-separated words per
            chunk handed to the summarizer.

    Returns:
        The chunk summaries joined by single spaces; ``""`` when ``text`` is
        empty or blank; ``None`` if any exception was raised (the error is
        printed).
    """
    # Nothing to summarize: skip loading the model altogether.
    if not text or not text.strip():
        return ""

    try:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        summary = []
        for chunk in _chunk_by_word_budget(text, max_chunk_length):
            chunk_length = len(chunk.split())
            # Give both bounds explicitly as max_length/min_length. Passing
            # only max_new_tokens left the pipeline's default max_length in
            # place, which triggered the "min_length must be inferior than
            # your max_length" warning. min_length is always below max_length
            # here: chunk_length // 2 < chunk_length + 100 and 150 < 512.
            max_length = min(512, chunk_length + 100)
            min_length = min(150, chunk_length // 2)
            chunk_summary = summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                # Word counts only approximate token counts; let the tokenizer
                # cut any chunk that still exceeds the model's input limit.
                truncation=True,
            )
            summary.append(chunk_summary[0]['summary_text'])

        combined_summary = ' '.join(summary)
        print("Summarization complete!")
        return combined_summary
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None


# Alternative version that avoids the warning "Your min_length=150 must be inferior than your max_length=142." by using smaller length limits. The trade-off is shorter summaries, so these limits still need tuning.
# def summarize_text(text, max_chunk_length=1024):
#     try:
#         summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

#         sentences = text.split('. ')
#         current_chunk = []
#         chunks = []

#         for sentence in sentences:
#             if len(current_chunk) + len(sentence.split()) <= max_chunk_length:
#                 current_chunk.append(sentence)
#             else:
#                 chunks.append('. '.join(current_chunk) + '.')
#                 current_chunk = [sentence]

#         if current_chunk:
#             chunks.append('. '.join(current_chunk) + '.')

#         summary = []
#         for chunk in chunks:
#             chunk_length = len(chunk.split())
#             max_new_tokens = min(150, chunk_length + 20)  # Slightly increase max_new_tokens to handle edge cases
#             min_length = min(30, chunk_length // 2)
#             chunk_summary = summarizer(chunk, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)
#             summary.append(chunk_summary[0]['summary_text'])

#         combined_summary = ' '.join(summary)
#         print("Summarization complete!")
#         return combined_summary
#     except Exception as e:
#         print(f"An error occurred during summarization: {e}")
#         return None


In [None]:

# Main function
# Pipeline: download the video, extract its audio, transcribe it, then save
# the transcription, the key topics and the summary as text files in the
# current directory. Each stage runs only if the previous one returned a
# usable result.
if __name__ == "__main__":
    # Strip stray whitespace from a pasted URL so it reaches neither the
    # downloader nor the generated file names.
    video_url = input("Enter the video URL: ").strip()
    sanitized_url = sanitize_filename(video_url)

    download_path = "./downloads"
    audio_path = "./audio"
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(audio_path, exist_ok=True)

    video_file = download_video(video_url, download_path)

    if video_file:
        audio_file = os.path.join(audio_path, f"audio_{sanitized_url}.mp3")
        audio_file = extract_audio(video_file, audio_file)

        if audio_file:
            transcription = transcribe_audio(audio_file)

            if transcription:
                # Write every output as UTF-8: the platform default encoding
                # may be unable to represent non-ASCII characters in the text.
                transcription_file = f"transcription_{sanitized_url}.txt"
                with open(transcription_file, "w", encoding="utf-8") as f:
                    f.write(transcription)
                print(f"Transcription saved to {transcription_file}")

                key_topics = extract_key_topics(transcription)

                if key_topics:
                    key_topics_file = f"key_topics_{sanitized_url}.txt"
                    with open(key_topics_file, "w", encoding="utf-8") as f:
                        f.write("\n".join(key_topics))
                    print(f"Key topics saved to {key_topics_file}")

                # The summary is attempted even when no key topics were found.
                summary = summarize_text(transcription)

                if summary:
                    summary_file = f"summary_{sanitized_url}.txt"
                    with open(summary_file, "w", encoding="utf-8") as f:
                        f.write(summary)
                    print(f"Summary saved to {summary_file}")


Enter the video URL: https://www.youtube.com/watch?v=5sLYAQS9sWQ
Download complete!
MoviePy - Writing audio in ./audio/audio_https_www_youtube_com_watch_v_5sLYAQS9sWQ.mp3




MoviePy - Done.
Audio extraction complete!
Transcription complete!
Transcription saved to transcription_https_www_youtube_com_watch_v_5sLYAQS9sWQ.txt
Key topics extraction complete!
Key topics saved to key_topics_https_www_youtube_com_watch_v_5sLYAQS9sWQ.txt


Your min_length=150 must be inferior than your max_length=142.


Summarization complete!
Summary saved to summary_https_www_youtube_com_watch_v_5sLYAQS9sWQ.txt
