In [1]:
%pip install yt-dlp pydub moviepy whisper gradio

import math
import os

import gradio as gr
import whisper
import yt_dlp as youtube_dl
from moviepy.editor import VideoFileClip
from pydub import AudioSegment

def download_video_and_extract_audio(video_url):
    """Download a video with yt-dlp and save its audio track as an MP3 file.

    The downloaded video file is deleted once the audio has been extracted
    (or extraction has failed).

    Args:
        video_url: URL of the video to download.

    Returns:
        Path of the MP3 file that was written.

    Raises:
        ValueError: If the downloaded video has no audio track.
    """
    ydl_opts = {
        'format': 'best',
        'outtmpl': 'downloaded_video.%(ext)s',
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        # Ask yt-dlp for the real output name: the extension depends on the
        # format that was selected and is not guaranteed to be ".mp4".
        video_path = ydl.prepare_filename(info)

    audio_path = os.path.splitext(video_path)[0] + '.mp3'

    # Extract the audio using moviepy
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Clean up even when extraction fails; close before removing so the
        # file handle is released first.
        video.close()
        os.remove(video_path)

    print(f"Audio has been saved to {audio_path}")
    return audio_path

def transcribe_audio_with_timestamps(audio_path):
    """Transcribe an audio file with the Whisper "base" model.

    Each transcribed segment is printed together with its start and end
    timestamps.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The "segments" entry of the Whisper transcription result.
    """
    transcription = whisper.load_model("base").transcribe(audio_path, verbose=True)
    timed_segments = transcription["segments"]

    # Echo every segment with its timing information
    for piece in timed_segments:
        print(f"Start: {piece['start']}, End: {piece['end']}, Text: {piece['text']}")

    return timed_segments

def extract_audio_segments(audio_path, segments):
    """Cut the audio file into one clip per transcript segment.

    Args:
        audio_path: Path of the audio file to load.
        segments: Iterable of dicts with "start" and "end" (in seconds)
            and "text" keys.

    Returns:
        A list of dicts with the segment's "start", "end" and "text" plus
        an "audio" entry holding the matching slice of the recording.
    """
    recording = AudioSegment.from_file(audio_path)
    ms_per_second = 1000  # the recording is sliced in milliseconds

    def _clip(entry):
        begin, finish = entry['start'], entry['end']
        piece = recording[begin * ms_per_second:finish * ms_per_second]
        return {
            "start": begin,
            "end": finish,
            "text": entry['text'],
            "audio": piece,
        }

    return [_clip(entry) for entry in segments]

def chunk_segments(segments, max_length=15.0):
    """Turn transcript segments into numbered chunks of bounded duration.

    Segments no longer than ``max_length`` become a single chunk with their
    text untouched. Longer segments are divided into the smallest number of
    equal-duration chunks that each fit within ``max_length``; their words
    are distributed across those chunks in order, as evenly as possible
    (an approximation, since no word-level timing is available here).

    Args:
        segments: Iterable of dicts with "start" and "end" (in seconds) and
            "text" keys. Any other keys (e.g. "audio") are ignored.
        max_length: Maximum chunk duration in seconds. Must be positive.

    Returns:
        A list of dicts with "chunk_id" (1-based, sequential),
        "chunk_length", "text", "start_time" and "end_time".

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be positive")

    chunked_segments = []

    for segment in segments:
        start_time = segment["start"]
        end_time = segment["end"]
        text = segment["text"]
        duration = end_time - start_time

        if duration <= max_length:
            pieces = [(start_time, end_time, text)]
        else:
            # Ceiling division: a duration that is an exact multiple of
            # max_length must not produce an extra chunk.
            num_chunks = math.ceil(duration / max_length)
            chunk_duration = duration / num_chunks
            words = text.split()
            pieces = []
            for i in range(num_chunks):
                chunk_start_time = start_time + i * chunk_duration
                if i == num_chunks - 1:
                    # Pin the last boundary so rounding never drifts past
                    # the end of the segment.
                    chunk_end_time = end_time
                else:
                    chunk_end_time = min(start_time + (i + 1) * chunk_duration, end_time)
                first_word = len(words) * i // num_chunks
                last_word = len(words) * (i + 1) // num_chunks
                chunk_text = " ".join(words[first_word:last_word])
                pieces.append((chunk_start_time, chunk_end_time, chunk_text))

        for chunk_start_time, chunk_end_time, chunk_text in pieces:
            chunked_segments.append({
                "chunk_id": len(chunked_segments) + 1,
                "chunk_length": chunk_end_time - chunk_start_time,
                "text": chunk_text,
                "start_time": chunk_start_time,
                "end_time": chunk_end_time,
            })

    return chunked_segments

def process_video(youtube_url):
    """Run the full pipeline for one video URL.

    Downloads the video and extracts its audio, transcribes the audio,
    cuts the audio into per-segment clips and finally chunks the segments.

    Args:
        youtube_url: URL of the video to process.

    Returns:
        The list of chunk dicts produced by ``chunk_segments``.
    """
    mp3_path = download_video_and_extract_audio(youtube_url)
    transcript = transcribe_audio_with_timestamps(mp3_path)
    clips = extract_audio_segments(mp3_path, transcript)
    return chunk_segments(clips)

# Web UI: a text box for the video URL in, the chunk list rendered as JSON out.
iface = gr.Interface(
    title="YouTube Video Semantic Chunking",
    description="Enter a YouTube video URL to extract and transcribe audio into semantic chunks.",
    fn=process_video,
    inputs="text",
    outputs="json",
)

# Start serving the interface.
iface.launch()

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[youtube] Extracting URL: https://www.youtube.com/watch?v=zdmEzqpAG70
[youtube] zdmEzqpAG70: Downloading webpage
[youtube] zdmEzqpAG70: Downloading ios player API JSON
[youtube] zdmEzqpAG70: Downloading mweb player API JSON
[youtube] zdmEzqpAG70: Downloading m3u8 information
[info] zdmEzqpAG70: Downloading 1 format(s): 18
[download] Destination: downloaded_video.mp4
[download] 100% of   58.82MiB in 00:00:10 at 5.38MiB/s     
Audio has been saved to downloaded_video.mp3


  checkpoint = torch.load(fp, map_location=device)


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:23.080]  Hi everyone. So, let us start with lecture one of this course where we will be talking
[00:23.080 --> 00:28.320]  about a brief and maybe a bit selective partial history of deep learning.
[00:28.320 --> 00:34.080]  So, we talk about deep learning. So, most of this material, the early material that is there,
[00:34.080 --> 00:40.160]  at least there in these slides, is taken by from this article on deep learning and neural networks
[00:40.160 --> 00:46.400]  and overview by Shmi Duber. There might be some errors in my accounting of the history and if they
[00:46.400 --> 00:51.360]  are, then I apologize for them and also feel free to contact me if you think certain portions need to
[00:51.360 --> 00:56.400]  be corrected or there are more things which have happened and you would like to me to add them.
[00:56.720 --> 01:04.080]  So, I first 