In [5]:
import whisper
from pathlib import Path
def format_time(seconds):
    """Convert an offset in seconds to an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero because SRT timestamps
            cannot be negative.

    Returns:
        str: Zero-padded timestamp, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Round ONCE to whole milliseconds, then split with integer arithmetic.
    # Truncating the fractional part separately (int((s - int(s)) * 1000))
    # loses a millisecond to float error: 1.001 -> ",000", 2.3 -> ",299".
    # Working from a single integer also lets rounding carry correctly
    # (59.9996 s -> 00:01:00,000 rather than an out-of-range field).
    total_millis = max(0, int(round(seconds * 1000)))
    total_secs, millis = divmod(total_millis, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hrs, mins = divmod(total_mins, 60)
    return f"{hrs:02d}:{mins:02d}:{secs:02d},{millis:03d}"

def transcribe_video(video_path: Path, model_size="base"):
    """Transcribe a media file with Whisper and save the result as an SRT file.

    The subtitles are written to ``outputs/<video stem>_<model_size>.srt``,
    relative to the current working directory. The ``outputs`` directory is
    created if it does not exist yet.

    Args:
        video_path: Path to the media file to transcribe.
        model_size: Model name passed to ``whisper.load_model``
            (e.g. tiny, base, small, medium, or large).

    Returns:
        None. The location of the written SRT file is printed.
    """
    srt_output = f"outputs/{video_path.stem}_{model_size}.srt"

    # Make sure the destination directory exists BEFORE the expensive work:
    # otherwise a missing "outputs/" folder only surfaces as an error in
    # open() after the model load and the full transcription have finished.
    Path(srt_output).parent.mkdir(parents=True, exist_ok=True)

    # Load the Whisper model (you can choose tiny, base, small, medium, or large)
    model = whisper.load_model(model_size)

    # Transcribe the video file
    result = model.transcribe(video_path.as_posix())

    # Extract segments (each segment has start, end, and text)
    segments = result.get("segments", [])

    # Write out an SRT file with timings: a 1-based cue index, the time range,
    # the cue text, then a blank line separating it from the next cue.
    with open(srt_output, "w", encoding="utf-8") as srt_file:
        for i, segment in enumerate(segments):
            start = segment["start"]
            end = segment["end"]
            text = segment["text"].strip()
            srt_file.write(f"{i + 1}\n")
            srt_file.write(f"{format_time(start)} --> {format_time(end)}\n")
            srt_file.write(f"{text}\n\n")
    print(f"Subtitle file saved to: {srt_output}")



In [6]:
# Transcribe one recording with the "large" model; the subtitles are written to
# outputs/Record_2025_02_15_09_14_05_937_large.srt (relative to the working dir).
# NOTE(review): the input is an absolute, machine-specific Windows path — adjust
# it for your environment. The captured output below shows a ~2.88G download,
# presumably the model weights being fetched on first use.
transcribe_video(Path(r"E:\Coding\sample_video_data\Record_2025_02_15_09_14_05_937.mp4"), model_size='large')

100%|█████████████████████████████████████| 2.88G/2.88G [01:33<00:00, 33.1MiB/s]


: 