In [None]:
# Install necessary libraries (if not already installed)
!pip install openai-whisper moviepy google-generativeai nltk tiktoken ffmpeg-python gradio

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloa

In [None]:
import os
import subprocess
import timeit
from datetime import datetime, timedelta

import gradio as gr
import torch
import whisper

# Choose the compute device once: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the "small" Whisper checkpoint a single time and move it to that device;
# every transcription request reuses this module-level model.
model = whisper.load_model("small")
model = model.to(device)

# Supported languages and subtitle formats
# Maps the language code handed to the transcriber to a human-readable name.
# "auto" is the app's own sentinel meaning "do not force a language".
languages = dict(
    [
        ("auto", "Auto Detect"),
        ("en", "English"),
        ("es", "Spanish"),
        ("fr", "French"),
        ("de", "German"),
        ("it", "Italian"),
        ("pt", "Portuguese"),
        ("ru", "Russian"),
        ("zh", "Chinese"),
        ("ja", "Japanese"),
        ("id", "Indonesian"),
        ("ko", "Korean"),
        ("th", "Thai"),
        ("vi", "Vietnamese"),
        # Add more languages as needed
    ]
)

# Maps each subtitle format key to its file extension (the key with a leading dot).
subtitle_formats = {fmt: f".{fmt}" for fmt in ("vtt", "srt")}

# Function to transcribe audio
def transcribe(file, lang):
    """Transcribe ``file`` with the module-level Whisper ``model``.

    Args:
        file: Media input forwarded unchanged to ``model.transcribe``.
        lang: Language code, or ``"auto"`` to leave the language unspecified.

    Returns:
        Whatever ``model.transcribe`` returns for the given input.
    """
    # The UI's "auto" choice is translated to language=None for the model call.
    language = None if lang == "auto" else lang
    return model.transcribe(file, language=language)

# Function to generate subtitles efficiently
def generate_subtitles(result, format="vtt"):
    """Render transcription segments as subtitle text.

    Args:
        result: Mapping with a ``"segments"`` list; each segment provides
            ``"start"`` and ``"end"`` (seconds) and ``"text"``.
        format: ``"vtt"`` or ``"srt"``.

    Returns:
        The subtitle document as a string. Cues are numbered from 1 and
        separated by a blank line. For ``"vtt"`` the document starts with the
        ``WEBVTT`` header line that the WebVTT format requires.

    Raises:
        ValueError: If ``format`` is unsupported and there is at least one
            segment to format (raised by ``format_timestamp``).
    """
    cues = [
        f"{index}\n"
        f"{format_timestamp(segment['start'], format)} --> "
        f"{format_timestamp(segment['end'], format)}\n"
        f"{segment['text'].strip()}\n"
        for index, segment in enumerate(result["segments"], start=1)
    ]
    body = "\n".join(cues)
    if format == "vtt":
        # A WebVTT file must begin with the "WEBVTT" signature followed by a blank line.
        return "WEBVTT\n\n" + body
    return body

# Helper function to format timestamps efficiently
def format_timestamp(seconds, format="vtt"):
    """Format a time offset in seconds as ``HH:MM:SS<sep>mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (may be fractional).
        format: ``"vtt"`` (millisecond separator ``.``) or ``"srt"`` (separator ``,``).

    Returns:
        The formatted timestamp string.

    Raises:
        ValueError: If ``format`` is neither ``"vtt"`` nor ``"srt"``.
    """
    if format == "vtt":
        separator = "."
    elif format == "srt":
        separator = ","
    else:
        raise ValueError(f"Unsupported subtitle format: {format!r}")

    # Work in whole milliseconds so the fractional part is kept and rounding
    # carries correctly into seconds/minutes/hours (e.g. 59.9996 s -> 00:01:00.000).
    total_ms = int(round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}{separator}{milliseconds:03d}"

# Function to add subtitles to video using ffmpeg
def add_subtitles_to_video(video_path, subtitles_path, output_path):
    """Burn a subtitle file into a video with ffmpeg's ``subtitles`` filter.

    The audio stream is copied unchanged; the video is re-encoded by ffmpeg.
    This function is best-effort: it never raises, it reports problems on
    stdout and signals the outcome through its return value.

    Args:
        video_path: Path of the input video.
        subtitles_path: Path of the subtitle file. It is interpolated into the
            filter expression as-is, so paths containing filter metacharacters
            (e.g. ``:`` or ``'``) are not supported.
        output_path: Path of the video to write; an existing file is overwritten.

    Returns:
        ``True`` if ffmpeg ran and exited with status 0, ``False`` otherwise.
    """
    command = [
        "ffmpeg",
        "-y",  # overwrite output_path instead of prompting when it already exists
        "-loglevel", "error",  # keep the captured stderr limited to real errors
        "-i", video_path,
        "-vf", f"subtitles={subtitles_path}",
        "-c:a", "copy",
        output_path,
    ]
    try:
        completed = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # ffmpeg must never wait for interactive input
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
    except Exception as e:
        # e.g. ffmpeg is not installed, or an argument is not a valid path
        print(f"Error adding subtitles: {str(e)}")
        return False

    if completed.returncode != 0:
        details = (completed.stderr or "").strip()
        print(
            f"Error adding subtitles: ffmpeg exited with code {completed.returncode}"
            + (f"\n{details}" if details else "")
        )
        return False
    return True

# Main function with error handling
def process(file, lang, format):
    """Transcribe an upload, write a subtitle file and, for videos, burn the subtitles in.

    Args:
        file: The uploaded media: either a path string or an object exposing
            the path through a ``.name`` attribute. ``None`` when nothing was
            uploaded.
        lang: Language code forwarded to ``transcribe``.
        format: Subtitle format key; also used as the extension of the
            ``subtitles.<format>`` file written to the working directory.

    Returns:
        A ``(text, video_path)`` tuple. ``text`` is the transcription, or a
        message starting with ``"Processing failed:"`` on error. ``video_path``
        is ``"output_video.mp4"`` when the input was an ``.mp4``/``.avi`` file
        and the subtitled video was produced, otherwise ``None``.
    """
    if file is None:
        return "Processing failed: no file was uploaded.", None

    try:
        # Accept both plain path strings and file-like wrappers carrying `.name`.
        media_path = str(getattr(file, "name", file))

        start_time = timeit.default_timer()
        result = transcribe(media_path, lang)
        elapsed = timeit.default_timer() - start_time
        print(f"Transcription took {elapsed} seconds")

        subtitles_path = f"subtitles.{format}"
        subtitles_content = generate_subtitles(result, format)
        # Explicit UTF-8: transcripts may contain non-Latin scripts, and the
        # platform default encoding is not guaranteed to represent them.
        with open(subtitles_path, "w", encoding="utf-8") as f:
            f.write(subtitles_content)

        # Only these containers are treated as video; the check ignores case
        # so that e.g. "CLIP.MP4" is recognised too.
        if not media_path.lower().endswith((".mp4", ".avi")):
            return result["text"], None

        output_video = "output_video.mp4"
        # Drop any result of a previous run so a failed ffmpeg invocation
        # cannot make an outdated video look like the fresh result.
        if os.path.exists(output_video):
            os.remove(output_video)
        add_subtitles_to_video(media_path, subtitles_path, output_video)

        if not os.path.isfile(output_video):
            # Subtitle burn-in failed; still return the transcription.
            return result["text"], None
        return result["text"], output_video
    except Exception as e:
        return f"Processing failed: {str(e)}", None

# Gradio interface with file size limit (enforced through launch(max_file_size=...))
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Video Subtitling App")
    with gr.Row():
        file_input = gr.File(label="Upload Video or Audio File")
        # Show the human-readable language name while passing the code to `process`.
        lang_dropdown = gr.Dropdown(
            choices=[(label, code) for code, label in languages.items()],
            value="auto",
            label="Language",
        )
        format_dropdown = gr.Dropdown(
            choices=list(subtitle_formats.keys()),
            value="vtt",
            label="Subtitle Format",
        )
    with gr.Row():
        submit_button = gr.Button("Process")
    with gr.Row():
        transcription_output = gr.Textbox(label="Transcription")
        video_output = gr.File(label="Subtitled Video")

    submit_button.click(
        fn=process,
        inputs=[file_input, lang_dropdown, format_dropdown],
        outputs=[transcription_output, video_output],
    )

# debug must be a real boolean; uploads larger than 100 MB are rejected.
demo.launch(debug=True, max_file_size="100mb")

100%|████████████████████████████████████████| 461M/461M [00:04<00:00, 111MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d320da80ea8e0cf2be.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Transcription took 161.82087523300004 seconds
