# <center> Subtitle Generator </center>

# Install Dependencies

In [None]:
# Install the GPU requirement set with the %pip magic.
%pip install -r requirements-gpu.txt
# CPU-only alternative: comment out the line above and uncomment the one below.
#%pip install -r requirements-cpu.txt

In [None]:
# sudo apt update && sudo apt install ffmpeg -y  # Ubuntu/Debian
# brew install ffmpeg  # macOS (Homebrew)
# Windows: download FFmpeg from https://ffmpeg.org/download.html and add its bin folder to PATH

# Extract Audio from Video

In [None]:
import subprocess

# Input video and the WAV file its audio track is written to.
# The video path is a Windows path written in its WSL-compatible (/mnt/<drive>/...) form.
video_path = "/mnt/d/DBHH/s1/1.mp4"
audio_path = "audio.wav"

# Extract audio using FFmpeg: "-map a" keeps only the audio stream(s) and
# "-y" overwrites an existing output file without prompting.
# check=True raises CalledProcessError when FFmpeg exits with a non-zero status,
# so the success message below is only printed when extraction really worked.
subprocess.run(
    ["ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a", audio_path, "-y"],
    check=True,
)

print("Audio extracted successfully!")


# Transcribe & Translate Speech 

In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
from datetime import timedelta

# Load the Whisper checkpoint once and place it on the GPU when one is available.
model_name = "openai/whisper-large-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the extracted audio, resampled to 16 kHz (the rate passed to the processor below).
audio_path = "audio.wav"
audio, sr = librosa.load(audio_path, sr=16000)

# Segment the audio into fixed 30-second chunks; each chunk becomes one subtitle segment.
chunk_duration = 30  # in seconds
total_duration = len(audio) / sr  # in seconds
num_chunks = int(np.ceil(len(audio) / (chunk_duration * sr)))
transcription_segments = []

# Decoder prompt that forces the "translate" task. It does not depend on the
# chunk, so it is built once instead of on every loop iteration.
# NOTE(review): `language` sets the prompt's language token; confirm "en" is the
# intended value when the source audio is not English.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="translate")

for i in range(num_chunks):
    start = i * chunk_duration  # Start time in seconds
    end = min((i + 1) * chunk_duration, total_duration)  # End time, clipped to the audio length

    # Extract audio chunk
    start_sample = int(start * sr)
    end_sample = int(end * sr)
    audio_chunk = audio[start_sample:end_sample]

    # Convert chunk to model input features on the same device as the model
    inputs = processor(audio_chunk, sampling_rate=sr, return_tensors="pt").input_features.to(device)

    # Generate the English text for this chunk (no gradients needed for inference)
    with torch.no_grad():
        predicted_ids = model.generate(inputs, forced_decoder_ids=forced_decoder_ids, max_length=448)

    transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Store the text together with the chunk boundaries, which serve as its timestamps
    transcription_segments.append({
        "start": start,
        "end": end,
        "text": transcript
    })

# Print detected segments
print("Detected Speech Segments:\n")
for seg in transcription_segments:
    print(f"[{timedelta(seconds=int(seg['start']))} --> {timedelta(seconds=int(seg['end']))}]: {seg['text']}")

# Save as an SRT file
srt_path = "subtitles.srt"

def format_timestamp(seconds):
    """Format a time offset as an SRT timestamp, ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        A string with zero-padded two-digit hours, minutes and seconds and
        three-digit milliseconds. Hours keep counting past 24 rather than
        rolling over into days.
    """
    # Work in integer milliseconds so the fractional part is kept and
    # floating-point noise cannot produce a value like "59,1000".
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# Write one numbered SRT cue per segment that actually contains text.
with open(srt_path, "w", encoding="utf-8") as srt_file:
    cue_number = 0
    for segment in transcription_segments:
        # Strip surrounding whitespace from the decoded text and skip segments
        # that end up empty, so no blank cues are written.
        text = segment["text"].strip()
        if not text:
            continue
        # Number cues consecutively even when some segments were skipped.
        cue_number += 1
        start_time = format_timestamp(segment["start"])
        end_time = format_timestamp(segment["end"])
        srt_file.write(f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n")

print(f"\nSubtitles saved to {srt_path}")


# Burn Subtitles into Video (Optional)

In [None]:
% ffmpeg -y -i /mnt/d/DBHH/s1/1.mp4 -vf subtitles=subtitles.srt -c:a copy output.mp4