<a href="https://colab.research.google.com/github/DtotheS/video-subtitles/blob/main/src/video_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch to the project folder on Drive; later cells use paths relative to the
# working directory (for example "./data").
%cd /content/drive/MyDrive/video-subtitles
# Show the working directory to confirm the change took effect.
%pwd

/content/drive/MyDrive/video-subtitles


'/content/drive/MyDrive/video-subtitles'

In [3]:
# Install the third-party packages this notebook relies on into the kernel's
# environment (%pip targets the running kernel).
# NOTE(review): pytube is installed but not imported in the visible cells
# (downloads go through yt-dlp) - confirm whether it is still needed.
%pip install pytube
# Whisper is installed straight from the GitHub repository, i.e. unpinned.
%pip install git+https://github.com/openai/whisper.git
%pip install pysrt
%pip install transformers
%pip install ffmpeg-python
%pip install yt-dlp
%pip install moviepy

import os
import yt_dlp as youtube_dl
from moviepy.editor import VideoFileClip
import whisper
from transformers import pipeline
import pysrt
import subprocess

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-d2z8vikx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-d2z8vikx
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-

  if event.key is 'enter':



In [4]:
import os
import yt_dlp as youtube_dl
from moviepy.editor import VideoFileClip
import whisper
from transformers import pipeline
import pysrt
import subprocess

# Step 1: Download video using yt-dlp and rename it to 'video.mp4'
def download_youtube_video(url, output_path):
    """Download a video with yt-dlp and store it as ``<output_path>/video.mp4``.

    yt-dlp is given the fixed output template ``<output_path>/video.mp4``.
    Depending on the formats it selects, the finished file may land at that
    exact path or at the template with a container extension appended (the
    recorded run produced ``video.mp4.webm``). Instead of assuming one
    specific name, the file actually written is looked up and then moved to
    the canonical ``video.mp4`` path.

    Args:
        url: Address of the video to download.
        output_path: Directory in which the video is stored.

    Returns:
        The path ``<output_path>/video.mp4``.

    Raises:
        FileNotFoundError: If no downloaded file can be located afterwards.
    """
    print(f"Downloading video from {url}...")
    final_video_file = f"{output_path}/video.mp4"
    ydl_opts = {
        'format': 'bestvideo+bestaudio/best',  # Best video+audio, else best single file
        'outtmpl': final_video_file,  # Save with a simple name 'video.mp4'
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)

    # Prefer the path yt-dlp itself reports for the finished download; fall
    # back to the names it is known to derive from the template.
    reported = [
        entry.get('filepath')
        for entry in ((info or {}).get('requested_downloads') or [])
    ]
    derived = [f"{final_video_file}.{ext}" for ext in ('webm', 'mkv', 'mp4')]
    candidates = [path for path in reported if path] + derived + [final_video_file]
    video_file = next((path for path in candidates if os.path.exists(path)), None)
    if video_file is None:
        raise FileNotFoundError(
            f"Could not locate the downloaded video in {output_path}"
        )
    print(f"Downloaded video to {video_file}")

    # NOTE: this only changes the file name, not the container format; the
    # downstream tools detect the real format from the file contents.
    if os.path.abspath(video_file) != os.path.abspath(final_video_file):
        os.replace(video_file, final_video_file)
        print(f"Renamed video to {final_video_file}")
    return final_video_file

# Step 2: Extract audio from the video
def extract_audio_from_video(video_path, output_directory):
    """Write the audio track of a video to ``<output_directory>/audio.wav``.

    Args:
        video_path: Path of the video file to read.
        output_directory: Directory that receives ``audio.wav``.

    Returns:
        The path of the written WAV file, or ``None`` when the video file does
        not exist or carries no audio track.
    """
    print(f"Extracting audio from {video_path}...")
    if not os.path.exists(video_path):
        print(f"Error: Video file not found at {video_path}")
        return None

    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        # A clip without an audio track exposes ``audio`` as None; report it
        # the same way as a missing file instead of crashing.
        if audio is None:
            print(f"Error: No audio track found in {video_path}")
            return None
        audio_path = os.path.join(output_directory, "audio.wav")
        audio.write_audiofile(audio_path)
    finally:
        # Always release the clip's underlying reader resources.
        video.close()

    print(f"Audio saved to {audio_path}")
    return audio_path

# Step 3: Transcribe and translate audio using Whisper
def transcribe_and_translate(
    audio_path,
    whisper_model="base",
    source_language="ko",
    translation_model="Helsinki-NLP/opus-mt-ko-en",
    batch_size=16,
):
    """Transcribe an audio file with Whisper and translate every segment.

    Args:
        audio_path: Path of the audio file to transcribe.
        whisper_model: Name of the Whisper model to load.
        source_language: Language code passed to Whisper's ``transcribe``.
        translation_model: Hugging Face model used by the translation pipeline.
        batch_size: Number of segments handed to the translation pipeline per
            batch.

    Returns:
        A list of dicts with the keys ``index`` (1-based), ``start``, ``end``
        (both copied from the Whisper segment) and ``text`` (the translated
        segment text). Empty when Whisper returns no segments.
    """
    print("Loading Whisper model...")
    model = whisper.load_model(whisper_model)

    print("Transcribing audio...")
    result = model.transcribe(audio_path, language=source_language)
    segments = result['segments']

    print("Translating transcription to English...")
    if not segments:
        # Nothing to translate; skip loading the translation model entirely.
        return []

    translator = pipeline("translation", model=translation_model)
    # Translate all segments in one batched call instead of invoking the
    # pipeline once per segment.
    texts = [segment["text"] for segment in segments]
    translations = translator(texts, batch_size=batch_size)

    subtitles = []
    for i, (segment, translated) in enumerate(zip(segments, translations), start=1):
        # Tolerate pipelines that wrap each result in a one-element list.
        if isinstance(translated, list):
            translated = translated[0]
        subtitles.append({
            "index": i,
            "start": segment["start"],
            "end": segment["end"],
            "text": translated["translation_text"],
        })
    return subtitles

# Step 4: Create an SRT file with subtitles
def create_srt_file(subtitles, srt_path):
    """Write subtitle entries to an SRT file using pysrt.

    Args:
        subtitles: Iterable of dicts with the keys ``index``, ``start``,
            ``end`` (times in seconds) and ``text``.
        srt_path: Destination path of the SRT file (saved as UTF-8).
    """
    def to_srt_time(seconds):
        # Convert to whole milliseconds up front: multiplying float seconds
        # later leaves values such as 2299.9999999999995, which would be
        # truncated to 2,299 instead of 2,300 when formatted.
        return pysrt.SubRipTime(milliseconds=int(round(seconds * 1000)))

    print(f"Creating SRT file at {srt_path}...")
    subs = pysrt.SubRipFile()
    for sub in subtitles:
        subs.append(
            pysrt.SubRipItem(
                index=sub["index"],
                start=to_srt_time(sub["start"]),
                end=to_srt_time(sub["end"]),
                text=sub["text"],
            )
        )
    subs.save(srt_path, encoding="utf-8")
    print(f"SRT file saved to {srt_path}")

# Step 5: Embed subtitles into the video
def embed_subtitles(video_path, srt_path, output_video_path):
    """Burn an SRT file into a video by running ffmpeg's ``subtitles`` filter.

    Args:
        video_path: Input video file.
        srt_path: SRT subtitle file to render onto the video.
        output_video_path: Destination of the subtitled video; an existing
            file at this path is overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    print("Embedding subtitles into video...")
    # The subtitle path is embedded in a filter expression, so characters that
    # are special to ffmpeg's filter syntax must be escaped.
    subtitle_arg = _escape_ffmpeg_filter_value(str(srt_path))
    command = [
        "ffmpeg",
        "-y",  # overwrite the output so re-running the cell does not stall or abort
        "-i", video_path,
        "-vf", f"subtitles={subtitle_arg}:force_style='FontName=Arial,FontSize=16'",
        output_video_path
    ]
    subprocess.run(command, check=True)
    print(f"Video with subtitles saved to {output_video_path}")


def _escape_ffmpeg_filter_value(value):
    """Escape a string for use as an option value inside an ffmpeg filtergraph."""
    # Level 1: characters special within a single filter option value.
    for ch in ("\\", "'", ":"):
        value = value.replace(ch, "\\" + ch)
    # Level 2: characters special to the filtergraph parser itself.
    for ch in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(ch, "\\" + ch)
    return value

# Main function to process the YouTube video
def process_youtube_video(url, output_dir):
    """Run the whole pipeline: download, extract audio, subtitle, and embed.

    All artefacts are written into ``output_dir``, which is created when it
    does not exist yet.

    Args:
        url: Address of the video to process.
        output_dir: Directory receiving the video, audio, SRT and final video.

    Returns:
        Path of the subtitled video, or ``None`` when the download cannot be
        found or audio extraction fails.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Download, then make sure the file really is where we were told.
    source_video = download_youtube_video(url, output_dir)
    if not os.path.exists(source_video):
        print(f"Error: Video file not found: {source_video}")
        return None

    # Pull the audio track out; the helper signals failure with None.
    wav_file = extract_audio_from_video(source_video, output_dir)
    if wav_file is None:
        print("Error: Failed to extract audio.")
        return None

    # Transcribe + translate, and persist the result as an SRT file.
    srt_file = os.path.join(output_dir, "subtitles.srt")
    create_srt_file(transcribe_and_translate(wav_file), srt_file)

    # Render the subtitles onto the video.
    subtitled_video = os.path.join(output_dir, "video_with_subtitles.mp4")
    embed_subtitles(source_video, srt_file, subtitled_video)

    print(f"Process completed! Final video saved at: {subtitled_video}")
    return subtitled_video

# Example usage
# Entry point: runs the full pipeline for one example video when this cell is
# executed.
if __name__ == "__main__":
    youtube_url = "https://www.youtube.com/watch?v=sLs04yn42KA"  # Replace with your YouTube URL
    # Every file the pipeline produces is written under this directory,
    # resolved relative to the current working directory.
    output_directory = "./data"

    process_youtube_video(youtube_url, output_directory)


Downloading video from https://www.youtube.com/watch?v=sLs04yn42KA...
[youtube] Extracting URL: https://www.youtube.com/watch?v=sLs04yn42KA
[youtube] sLs04yn42KA: Downloading webpage
[youtube] sLs04yn42KA: Downloading tv player API JSON
[youtube] sLs04yn42KA: Downloading ios player API JSON
[youtube] sLs04yn42KA: Downloading player 37364e28
[youtube] sLs04yn42KA: Downloading m3u8 information
[info] sLs04yn42KA: Downloading 1 format(s): 399+251
[download] Destination: ./data/video.mp4.f399.mp4
[download] 100% of  335.96MiB in 00:00:08 at 39.67MiB/s  
[download] Destination: ./data/video.mp4.f251.webm
[download] 100% of   39.98MiB in 00:00:01 at 37.84MiB/s  
[Merger] Merging formats into "./data/video.mp4.webm"
Deleting original file ./data/video.mp4.f251.webm (pass -k to keep)
Deleting original file ./data/video.mp4.f399.mp4 (pass -k to keep)
Downloaded video to ./data/video.mp4.webm
Renamed video to ./data/video.mp4
Extracting audio from ./data/video.mp4...
MoviePy - Writing audio in .



MoviePy - Done.
Audio saved to ./data/audio.wav
Loading Whisper model...


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 36.6MiB/s]
  checkpoint = torch.load(fp, map_location=device)



Transcribing audio...
Translating transcription to English...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Creating SRT file at ./data/subtitles.srt...
SRT file saved to ./data/subtitles.srt
Embedding subtitles into video...
Video with subtitles saved to ./data/video_with_subtitles.mp4
Process completed! Final video saved at: ./data/video_with_subtitles.mp4
