In [69]:
!pip install --upgrade -Uqq WhisperSpeech pydub webdataset openai youtube-transcript-api transformers torch soundfile

In [70]:
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, NoTranscriptAvailable
from google.colab import userdata
import google.colab

In [71]:
def get_youtube_transcript(url):
   # Extract the video ID from the URL considering different formats
    if "youtube.com" in url:
        video_id = url.split("v=")[1].split("&")[0]
    elif "youtu.be" in url:
        video_id = url.split("/")[-1]
    else:
        raise ValueError("Invalid YouTube URL. Please provide a valid YouTube video URL.")

    try:
        # Attempt to get the manually created transcript first
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
    except (TranscriptsDisabled, NoTranscriptFound, NoTranscriptAvailable):
        # If manual transcript is not available, get the auto-generated one
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'], preserve_formatting=True)

    # Convert transcript to a string
    transcript_str = '\n'.join([item['text'] for item in transcript])
    return transcript_str

In [72]:
def summarize_text_gpt(transcript):
    system_msg = 'You are a model that receives a transcription of a YouTube video. Your task is to correct any words that may be incorrect based on the context, and transform it into a well-structured summary of the entire video. Your summary should highlight important details and provide additional context when necessary. Aim to be detailed, particularly when addressing non-trivial aspects of the content. The summary should be organized into multiple paragraphs.'
    client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
    token_limit = 256

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": transcript},
            ],
            max_tokens=token_limit,
        )
        summary = response.choices[0].message.content

        # Split the summary into sentences
        sentences = summary.split('. ')
        return sentences
    except Exception as e:
        raise RuntimeError(f"OpenAI API error: {e}")

In [15]:
def summarize_text(transcript):
  from transformers import pipeline
  summarizer = pipeline("summarization", model="google/long-t5-local-base")
  summary = summarizer(transcript, min_length=30, max_length=len(transcript), do_sample=False)
  return summary[0]['summary_text']

In [73]:
def summarize_yt_video(url):
    try:
        transcript = get_youtube_transcript(url)
        sentences = summarize_text_gpt(transcript)
        return sentences
    except Exception as e:
        return f"Error processing YouTube URL: {e}"

In [74]:
# Example URL for testing
test_url = "https://www.youtube.com/watch?v=r6sa_fWQB_4"

# Test the process_youtube_url function
try:
    summary = summarize_yt_video(test_url)
    print("Summary:", summary)
except Exception as e:
    print("Error during testing:", e)

Summary: ["In this video, Justin explores the significant milestone of the world’s population reaching eight billion and presents three pivotal questions regarding humanity's past, present, and future", 'The journey begins by contextualizing the enormity of the number eight billion with a thought experiment on how long it would take to meet each individual if one were to spend a minute with every person, illustrating the overwhelming scale of humanity present today', "Justin notes that while there are currently about eight billion people living, historical estimations suggest that almost 117 billion people have ever lived on Earth, highlighting that today's population represents only 6.8% of all humans who have existed.\n\nTransitioning to the future, Justin discusses the inherent difficulties in predicting how many people will exist", "He approaches this complexity with mathematical models that project future population growth based on factors such as the annual birth rate and the dur

In [75]:
output_summary = summarize_yt_video(test_url)

In [76]:
import torch
import soundfile as sf
from whisperspeech.pipeline import Pipeline

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model', device="cuda")

count = 0

for paragraph in output_summary:
  file_name = f"output_{count}.wav"
  count += 1
  pipe.generate_to_file(file_name, paragraph)

In [77]:
from pydub import AudioSegment

def merge_audio_files(file_list, output_filename, delay_ms=1000):
    # Start with the first audio file
    combined_audio = AudioSegment.from_wav(file_list[0])

    # Define the silence period (delay)
    silence = AudioSegment.silent(duration=delay_ms)

    # Loop through the rest of the files and append them with the delay
    for file in file_list[1:]:
        next_audio = AudioSegment.from_wav(file)
        combined_audio += silence + next_audio

    # Export the combined audio to the output file
    combined_audio.export(output_filename, format="wav")

# List of audio files to merge
audio_files = []
for i in range(count):
    audio_files.append(f"output_{i}.wav")

# Name of the final merged file
output_file = "merged_output.wav"

# Merge the files with a delay of 0.5 second
merge_audio_files(audio_files, output_file, delay_ms=500)

In [78]:
# Play the audio
from IPython.display import Audio
Audio("merged_output.wav")