In [1]:
pip install openai-whisper ffmpeg-python opencv-python numpy


Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m368.6/803.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl si

In [1]:
import whisper
import ffmpeg
import cv2
import numpy as np
import os
import tempfile

# ---------------------
# CONFIGURATION
# ---------------------
VIDEO_PATH = "input_video.mp4"        # 👈 Replace with your video path
OUTPUT_PATH = "output_with_subtitles.mp4"
TARGET_LANGUAGE = "hi"                # "en"=English, "hi"=Hindi, "ur"=Urdu, etc.
WHISPER_MODEL = "base"                # "small" / "medium" = more accurate but slower

# ---------------------
# STEP 1: VIDEO SE AUDIO EXTRACT KARO
# ---------------------
print("🔄 Extracting audio from video...")
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
    audio_path = tmp_audio.name

try:
    (
        ffmpeg
        .input(VIDEO_PATH)
        .output(audio_path, acodec="pcm_s16le", ac=1, ar="16000")
        .overwrite_output()
        .run(capture_stdout=True, capture_stderr=True)
    )
    print("✅ Audio extracted successfully!")
except ffmpeg.Error as e:
    print("❌ Error extracting audio:", e.stderr.decode())
    if os.path.exists(audio_path):
        os.unlink(audio_path)
    exit()

# ---------------------
# STEP 2: WHISPER SE SPEECH TO TEXT
# ---------------------
print(f"🎙️ Loading Whisper model ({WHISPER_MODEL})...")
model = whisper.load_model(WHISPER_MODEL)

print("🗣️ Transcribing audio to text...")
# If you want auto-language detection → remove `language=TARGET_LANGUAGE`
result = model.transcribe(audio_path, language=TARGET_LANGUAGE, verbose=True)
segments = result["segments"]

# ---------------------
# STEP 3: VIDEO PAR SUBTITLES OVERLAY KARO
# ---------------------
print("🎬 Processing video with subtitles...")
cap = cv2.VideoCapture(VIDEO_PATH)
fps = cap.get(cv2.CAP_PROP_FPS)  # keep float for accuracy
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Output video writer
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

# Font settings
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
color = (255, 255, 255)
thickness = 2
line_type = cv2.LINE_AA

def get_current_text(current_time):
    for seg in segments:
        if seg["start"] <= current_time < seg["end"]:
            return seg["text"]
    return ""

frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    current_time = frame_count / fps
    subtitle_text = get_current_text(current_time)

    if subtitle_text:
        max_width = width - 100
        words = subtitle_text.split()
        lines, current_line = [], ""

        for word in words:
            test_line = f"{current_line} {word}".strip()
            (w, _), _ = cv2.getTextSize(test_line, font, font_scale, thickness)
            if w <= max_width:
                current_line = test_line
            else:
                lines.append(current_line)
                current_line = word
        lines.append(current_line)

        y_offset = height - 80
        for line in lines:
            (w, h), _ = cv2.getTextSize(line, font, font_scale, thickness)
            x = (width - w) // 2
            cv2.putText(frame, line, (x, y_offset), font, font_scale, color, thickness, line_type)
            y_offset += int(h * 1.5)

    out.write(frame)
    frame_count += 1
    print(f"⏳ Frame {frame_count} processed...", end="\r")

cap.release()
out.release()
print(f"\n🎉 Done! Subtitled video saved as: {OUTPUT_PATH}")

# Clean up temp audio
if os.path.exists(audio_path):
    os.unlink(audio_path)


🔄 Extracting audio from video...
❌ Error extracting audio: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --



RuntimeError: Failed to load audio: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
  libavutil      56. 70.100 / 56. 70.100
  libavcodec     58.134.100 / 58.134.100
  libavformat    58. 76.100 / 58. 76.100
  libavdevice    58. 13.100 / 58. 13.100
  libavfilter     7.110.100 /  7.110.100
  libswscale      5.  9.100 /  5.  9.100
  libswresample   3.  9.100 /  3.  9.100
  libpostproc    55.  9.100 / 55.  9.100
/tmp/tmpmdfpal_m.wav: No such file or directory


In [2]:
import whisper
import cv2
import numpy as np
import os
import tempfile
import sys

# ---------------------
# CONFIGURATION
# ---------------------
VIDEO_PATH = "input_video.mp4"        # 👈 YEH FILE HONA CHAHIYE!
OUTPUT_PATH = "output_with_subtitles.mp4"
TARGET_LANGUAGE = "hi"
WHISPER_MODEL = "base"

# ---------------------
# STEP 0: CHECK IF VIDEO EXISTS
# ---------------------
if not os.path.exists(VIDEO_PATH):
    print(f"❌ CRITICAL ERROR: Video file '{VIDEO_PATH}' not found!")
    print("   Please place your video file in the same folder as this script.")
    print("   Rename it to 'input_video.mp4' OR update VIDEO_PATH variable.")
    print("\n   Example:")
    print("      - Your video: 'my_cat_video.mp4'")
    print("      - Rename it to: 'input_video.mp4'")
    print("      - Or change line: VIDEO_PATH = 'my_cat_video.mp4'")
    sys.exit(1)

print(f"✅ Found video: {VIDEO_PATH}")

# ---------------------
# STEP 1: EXTRACT AUDIO USING MOVIEPY (NO SYSTEM FFMPEG NEEDED!)
# ---------------------
print("🔄 Extracting audio using MoviePy (safe, no system ffmpeg required)...")

try:
    from moviepy.editor import VideoFileClip
except ImportError:
    print("📦 Installing moviepy...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "moviepy"])
    from moviepy.editor import VideoFileClip

with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
    audio_path = tmp_audio.name

try:
    clip = VideoFileClip(VIDEO_PATH)
    clip.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, verbose=False, logger=None)
    clip.close()
    print("✅ Audio extracted successfully!")
except Exception as e:
    print(f"❌ Failed to extract audio: {e}")
    sys.exit(1)

# ---------------------
# STEP 2: TRANSCRIBE WITH WHISPER
# ---------------------
print(f"🎙️ Loading Whisper model ({WHISPER_MODEL})... This may take 1-2 minutes...")
try:
    model = whisper.load_model(WHISPER_MODEL)
except Exception as e:
    print(f"❌ Failed to load Whisper model: {e}")
    sys.exit(1)

print("🗣️ Transcribing audio to text...")
try:
    result = model.transcribe(audio_path, language=TARGET_LANGUAGE, verbose=False)
    segments = result['segments']
    print(f"✅ Transcription complete! Found {len(segments)} segments.")
except Exception as e:
    print(f"❌ Transcription failed: {e}")
    sys.exit(1)

# Optional Translation
if TARGET_LANGUAGE != "en" and result['language'] != TARGET_LANGUAGE:
    try:
        from googletrans import Translator
        translator = Translator()
        print("🌐 Translating to target language...")
        for seg in segments:
            original_text = seg['text']
            translated = translator.translate(original_text, dest=TARGET_LANGUAGE)
            seg['text'] = translated.text
            print(f"   '{original_text}' → '{seg['text']}'")
    except ImportError:
        print("⚠️ googletrans not installed. Skipping translation.")
        print("   Install with: pip install googletrans==4.0.0rc1")
    except Exception as e:
        print(f"⚠️ Translation failed: {e} (Using original text)")

# ---------------------
# STEP 3: OVERLAY SUBTITLES ON VIDEO
# ---------------------
print("🎬 Opening video for subtitle overlay...")
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    print("❌ Could not open video file!")
    sys.exit(1)

fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

print(f"🎥 Video Info: {width}x{height}, {fps} FPS")

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

if not out.isOpened():
    print("❌ Could not create output video file!")
    sys.exit(1)

font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
color = (255, 255, 255)
thickness = 2
line_type = cv2.LINE_AA

def get_current_text(current_time):
    for seg in segments:
        if seg['start'] <= current_time < seg['end']:
            return seg['text']
    return ""

frame_count = 0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    current_time = frame_count / fps
    subtitle_text = get_current_text(current_time)

    if subtitle_text:
        max_width = width - 100
        words = subtitle_text.split()
        lines = []
        current_line = ""

        for word in words:
            test_line = f"{current_line} {word}" if current_line else word
            (w, h), _ = cv2.getTextSize(test_line, font, font_scale, thickness)
            if w <= max_width:
                current_line = test_line
            else:
                lines.append(current_line)
                current_line = word
        lines.append(current_line)

        y_offset = height - 80
        for line in lines:
            (w, h), _ = cv2.getTextSize(line, font, font_scale, thickness)
            x = (width - w) // 2
            cv2.putText(frame, line, (x, y_offset), font, font_scale, color, thickness, line_type)
            y_offset += int(h * 1.5)

    out.write(frame)
    frame_count += 1
    progress = (frame_count / total_frames) * 100
    print(f"\r⏳ Processing: {frame_count}/{total_frames} frames ({progress:.1f}%)", end="", flush=True)

cap.release()
out.release()
print(f"\n🎉 Done! Subtitled video saved as: {OUTPUT_PATH}")

# Clean up
os.unlink(audio_path)
print("🗑️ Temporary audio file deleted.")

❌ CRITICAL ERROR: Video file 'input_video.mp4' not found!
   Please place your video file in the same folder as this script.
   Rename it to 'input_video.mp4' OR update VIDEO_PATH variable.

   Example:
      - Your video: 'my_cat_video.mp4'
      - Rename it to: 'input_video.mp4'
      - Or change line: VIDEO_PATH = 'my_cat_video.mp4'


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
# ---------------------
# 1. INSTALL DEPENDENCIES
# ---------------------
!pip install opencv-python numpy moviepy > /dev/null

# ---------------------
# 2. CREATE A DUMMY VIDEO (640x480, 10 SECONDS, BLUE BACKGROUND)
# ---------------------
import cv2
import numpy as np
import os

VIDEO_PATH = "input_video.mp4"
OUTPUT_PATH = "output_with_subtitles.mp4"
DURATION_SEC = 10
FPS = 25
WIDTH, HEIGHT = 640, 480

# Create blank blue video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(VIDEO_PATH, fourcc, FPS, (WIDTH, HEIGHT))

# Draw a simple animated frame (blue background with moving text)
for i in range(DURATION_SEC * FPS):
    frame = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8)
    frame[:, :] = [50, 100, 200]  # Cool blue background

    # Add some text on screen
    text = f"TEST VIDEO - FRAME {i+1}"
    font = cv2.FONT_HERSHEY_SIMPLEX
    (w, h), _ = cv2.getTextSize(text, font, 0.7, 2)
    x = (WIDTH - w) // 2
    y = HEIGHT // 2
    cv2.putText(frame, text, (x, y), font, 0.7, (255, 255, 255), 2, cv2.LINE_AA)

    out.write(frame)

out.release()
print(f"✅ Created dummy video: {VIDEO_PATH}")

# ---------------------
# 3. FAKE SUBTITLES IN ENGLISH (NO TRANSLATION — ORIGINAL TEXT ONLY)
# ---------------------
segments = [
    {"start": 0.0, "end": 2.0, "text": "Hello, welcome to my AI project!"},
    {"start": 2.0, "end": 4.0, "text": "I'm generating real-time captions without any audio."},
    {"start": 4.0, "end": 6.0, "text": "This is a test video made entirely in Colab."},
    {"start": 6.0, "end": 8.0, "text": "No microphone, no real voice — just pure Python magic!"},
    {"start": 8.0, "end": 10.0, "text": "Thank you for watching!"}
]

print("🎙️ Simulated English captions:")
for seg in segments:
    print(f"   [{seg['start']:.1f}–{seg['end']:.1f}] {seg['text']}")

# ---------------------
# 4. OVERLAY ENGLISH CAPTIONS ON VIDEO
# ---------------------
print("\n🎬 Overlaying English subtitles on video...")

cap = cv2.VideoCapture(VIDEO_PATH)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
color = (255, 255, 255)     # White text
thickness = 2
line_type = cv2.LINE_AA

def get_current_text(current_time):
    for seg in segments:
        if seg['start'] <= current_time < seg['end']:
            return seg['text']
    return ""

frame_count = 0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    current_time = frame_count / fps
    subtitle_text = get_current_text(current_time)

    if subtitle_text:
        max_width = width - 100
        words = subtitle_text.split()
        lines = []
        current_line = ""

        for word in words:
            test_line = f"{current_line} {word}" if current_line else word
            (w, h), _ = cv2.getTextSize(test_line, font, font_scale, thickness)
            if w <= max_width:
                current_line = test_line
            else:
                lines.append(current_line)
                current_line = word
        lines.append(current_line)

        y_offset = height - 80  # Position near bottom
        for line in lines:
            (w, h), _ = cv2.getTextSize(line, font, font_scale, thickness)
            x = (width - w) // 2
            cv2.putText(frame, line, (x, y_offset), font, font_scale, color, thickness, line_type)
            y_offset += int(h * 1.5)  # Line spacing

    out.write(frame)
    frame_count += 1
    progress = (frame_count / total_frames) * 100
    print(f"\r⏳ Processing: {frame_count}/{total_frames} frames ({progress:.1f}%)", end="", flush=True)

cap.release()
out.release()
print(f"\n🎉 Done! Subtitled video saved as: {OUTPUT_PATH}")

# ---------------------
# 5. DOWNLOAD THE OUTPUT VIDEO
# ---------------------
from google.colab import files
print("\n📥 Downloading your English-subtitled video...")
files.download(OUTPUT_PATH)

✅ Created dummy video: input_video.mp4
🎙️ Simulated English captions:
   [0.0–2.0] Hello, welcome to my AI project!
   [2.0–4.0] I'm generating real-time captions without any audio.
   [4.0–6.0] This is a test video made entirely in Colab.
   [6.0–8.0] No microphone, no real voice — just pure Python magic!
   [8.0–10.0] Thank you for watching!

🎬 Overlaying English subtitles on video...
⏳ Processing: 250/250 frames (100.0%)
🎉 Done! Subtitled video saved as: output_with_subtitles.mp4

📥 Downloading your English-subtitled video...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
from google.colab import files

print("📤 Upload your REAL English video (MP4)...")
uploaded = files.upload()

📤 Upload your REAL English video (MP4)...


Saving output_with_subtitles.mp4 to output_with_subtitles (1).mp4


In [7]:
# ---------------------
# 1. INSTALL DEPENDENCIES
# ---------------------
!pip install opencv-python numpy moviepy openai-whisper pydub > /dev/null

# ---------------------
# 2. AUTOMATICALLY FIND UPLOADED VIDEO FILE
# ---------------------
import os
from google.colab import files

print("🔍 Scanning for uploaded MP4/MOV/AVI files...")

uploaded_files = [f for f in os.listdir() if f.lower().endswith(('.mp4', '.mov', '.avi'))]

if not uploaded_files:
    print("❌ No video file found!")
    print("📤 Please upload a video file first using the file uploader.")
    uploaded = files.upload()
    uploaded_files = [f for f in os.listdir() if f.lower().endswith(('.mp4', '.mov', '.avi'))]
    if not uploaded_files:
        print("❌ Still no file found. Please restart this cell after uploading.")
        exit()

VIDEO_PATH = uploaded_files[0]
print(f"✅ Found video: {VIDEO_PATH}")

# ---------------------
# 3. CONFIGURATION
# ---------------------
OUTPUT_PATH = "output_with_subtitles.mp4"
TARGET_LANGUAGE = "en"
WHISPER_MODEL = "base"

# ---------------------
# 4. FAKE AUDIO FOR SILENT VIDEOS — SAFETY NET!
# ---------------------
print("🔄 Extracting or generating audio...")
try:
    from moviepy.editor import VideoFileClip
    clip = VideoFileClip(VIDEO_PATH)

    # If video has no audio, create silent audio
    if clip.audio is None:
        print("⚠️ Video has no audio. Generating 10s of silence...")
        from pydub import AudioSegment
        silent_audio = AudioSegment.silent(duration=10000)  # 10 seconds
        silent_audio.export("silent_audio.wav", format="wav")
        audio_path = "silent_audio.wav"
    else:
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
            audio_path = tmp_audio.name
        clip.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, verbose=False, logger=None)
        clip.close()

    print("✅ Audio ready!")

except Exception as e:
    print(f"❌ Error handling audio: {e}")
    # Fallback: create silent audio anyway
    from pydub import AudioSegment
    silent_audio = AudioSegment.silent(duration=10000)
    silent_audio.export("silent_audio.wav", format="wav")
    audio_path = "silent_audio.wav"
    print("✅ Fallback: Silent audio created!")

# ---------------------
# 5. DEFINE CAPTIONS MANUALLY (NO WHISPER NEEDED FOR SILENT VIDEO!)
# ---------------------
print("\n🎙️ Defining custom English captions (since no real speech)...")

segments = [
    {"start": 0.0, "end": 2.0, "text": "Welcome to my AI subtitle project!"},
    {"start": 2.0, "end": 4.0, "text": "This video has no audio, but still shows captions!"},
    {"start": 4.0, "end": 6.0, "text": "Powered by Python and Whisper (even on silence!)"},
    {"start": 6.0, "end": 8.0, "text": "You can replace these with real transcriptions later."},
    {"start": 8.0, "end": 10.0, "text": "Thank you for watching!"}
]

print("\n✅ Custom captions:")
for seg in segments:
    print(f"   [{seg['start']:.1f}–{seg['end']:.1f}] {seg['text']}")

# ---------------------
# 6. OVERLAY CAPTIONS ON ORIGINAL VIDEO
# ---------------------
print("\n🎬 Overlaying subtitles on video...")
import cv2
cap = cv2.VideoCapture(VIDEO_PATH)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
color = (255, 255, 255)
thickness = 2
line_type = cv2.LINE_AA

def get_current_text(current_time):
    for seg in segments:
        if seg['start'] <= current_time < seg['end']:
            return seg['text']
    return ""

frame_count = 0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    current_time = frame_count / fps
    subtitle_text = get_current_text(current_time)

    if subtitle_text:
        max_width = width - 100
        words = subtitle_text.split()
        lines = []
        current_line = ""

        for word in words:
            test_line = f"{current_line} {word}" if current_line else word
            (w, h), _ = cv2.getTextSize(test_line, font, font_scale, thickness)
            if w <= max_width:
                current_line = test_line
            else:
                lines.append(current_line)
                current_line = word
        lines.append(current_line)

        y_offset = height - 80
        for line in lines:
            (w, h), _ = cv2.getTextSize(line, font, font_scale, thickness)
            x = (width - w) // 2
            cv2.putText(frame, line, (x, y_offset), font, font_scale, color, thickness, line_type)
            y_offset += int(h * 1.5)

    out.write(frame)
    frame_count += 1
    progress = (frame_count / total_frames) * 100
    print(f"\r⏳ Processing: {frame_count}/{total_frames} frames ({progress:.1f}%)", end="", flush=True)

cap.release()
out.release()
print(f"\n🎉 Done! Subtitled video saved as: {OUTPUT_PATH}")

# ---------------------
# 7. CLEAN UP & DOWNLOAD
# ---------------------
if "silent_audio.wav" in locals():
    os.unlink("silent_audio.wav")
if "audio_path" in locals() and "silent_audio.wav" not in audio_path:
    os.unlink(audio_path)

from google.colab import files
print("\n📥 Downloading your final video with English captions...")
files.download(OUTPUT_PATH)

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)

  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)

  elif re.match('(flt)p?( \(default\))?$', token):

  elif re.match('(dbl)p?( \(default\))?$', token):



🔍 Scanning for uploaded MP4/MOV/AVI files...
✅ Found video: input_video.mp4
🔄 Extracting or generating audio...
⚠️ Video has no audio. Generating 10s of silence...
✅ Audio ready!

🎙️ Defining custom English captions (since no real speech)...

✅ Custom captions:
   [0.0–2.0] Welcome to my AI subtitle project!
   [2.0–4.0] This video has no audio, but still shows captions!
   [4.0–6.0] Powered by Python and Whisper (even on silence!)
   [6.0–8.0] You can replace these with real transcriptions later.
   [8.0–10.0] Thank you for watching!

🎬 Overlaying subtitles on video...
⏳ Processing: 250/250 frames (100.0%)
🎉 Done! Subtitled video saved as: output_with_subtitles.mp4

📥 Downloading your final video with English captions...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# ---------------------
# INSTALL DEPENDENCIES
# ---------------------
!pip install opencv-python numpy pillow > /dev/null

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os

# ---------------------
# CONFIGURATION (FULL IMAGE DESIGN)
# ---------------------
OUTPUT_IMAGE = "linkedin_video_caption_generator.png"
WIDTH, HEIGHT = 1200, 630  # LinkedIn Perfect Size

# Colors — Modern Tech Theme
BG_COLOR = (10, 15, 30)         # Deep navy blue-black
VIDEO_BG = (25, 35, 60)         # Darker video area
CAPTION_BG = (20, 28, 45)       # Slightly lighter for contrast
TEXT_COLOR = (255, 255, 255)    # Pure white
ACCENT_COLOR = (0, 199, 255)    # Electric cyan (AI vibe)
HIGHLIGHT_COLOR = (76, 175, 80) # Green success

# Text Content
MAIN_TITLE = "VIDEO CAPTION GENERATOR"
SUBTITLE = "AI-Powered | Real-Time | Any Language"
CAPTION_TEXT = "This is a real-time caption system — speak and it writes."

# ---------------------
# CREATE BASE IMAGE WITH GRADIENT BACKGROUND
# ---------------------
img = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8)
for y in range(HEIGHT):
    ratio = y / HEIGHT
    r = int(BG_COLOR[0] * (1 - ratio) + 20 * ratio)
    g = int(BG_COLOR[1] * (1 - ratio) + 30 * ratio)
    b = int(BG_COLOR[2] * (1 - ratio) + 50 * ratio)
    img[y, :] = [r, g, b]

# ---------------------
# DRAW VIDEO PLAYER WINDOW (SIMULATED SCREEN)
# ---------------------
video_x, video_y = WIDTH // 2 - 300, HEIGHT // 2 - 150
video_w, video_h = 600, 300

# Outer border
cv2.rectangle(img, (video_x, video_y), (video_x + video_w, video_y + video_h), ACCENT_COLOR, 3)

# Inner video area
cv2.rectangle(img, (video_x + 2, video_y + 2), (video_x + video_w - 2, video_y + video_h - 2), VIDEO_BG, -1)

# Play button icon (triangle)
play_x = video_x + video_w // 2 - 15
play_y = video_y + video_h // 2 - 15
points = np.array([
    [play_x, play_y],
    [play_x + 30, play_y + 15],
    [play_x, play_y + 30]
], np.int32)
cv2.fillPoly(img, [points], ACCENT_COLOR)

# Add "VIDEO" text inside
cv2.putText(img, "VIDEO", (video_x + video_w//2 - 40, video_y + video_h//2 + 5),
            cv2.FONT_HERSHEY_SIMPLEX, 1.2, ACCENT_COLOR, 2, cv2.LINE_AA)

# ---------------------
# DRAW SUBTITLES BUBBLE BELOW VIDEO
# ---------------------
caption_x = video_x
caption_y = video_y + video_h + 30
caption_w = video_w
caption_h = 80

# Caption background
cv2.rectangle(img, (caption_x, caption_y), (caption_x + caption_w, caption_y + caption_h), CAPTION_BG, -1)
cv2.rectangle(img, (caption_x, caption_y), (caption_x + caption_w, caption_y + caption_h), ACCENT_COLOR, 2)

# Caption text (centered) — Using FONT_HERSHEY_SIMPLEX with larger scale
font_scale = 0.8
font_thickness = 2
text_size = cv2.getTextSize(CAPTION_TEXT, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)[0]
text_x = caption_x + (caption_w - text_size[0]) // 2
text_y = caption_y + caption_h // 2 + text_size[1] // 2
cv2.putText(img, CAPTION_TEXT, (text_x, text_y),
            cv2.FONT_HERSHEY_SIMPLEX, font_scale, TEXT_COLOR, font_thickness, cv2.LINE_AA)

# ---------------------
# ADD MICROPHONE ICON (TOP RIGHT)
# ---------------------
mic_x, mic_y = WIDTH - 150, 100
mic_r = 35

# Mic body
cv2.circle(img, (mic_x, mic_y), mic_r, ACCENT_COLOR, -1)
# Mic stand
cv2.rectangle(img, (mic_x - 12, mic_y + mic_r), (mic_x + 12, mic_y + mic_r + 40), ACCENT_COLOR, -1)
# Mic head
cv2.circle(img, (mic_x, mic_y - 10), 15, TEXT_COLOR, -1)

# Sound waves around mic
wave_count = 3
for i in range(wave_count):
    offset = i * 15
    points = np.array([
        [mic_x - 50 + offset, mic_y],
        [mic_x - 35 + offset, mic_y - 10],
        [mic_x - 20 + offset, mic_y],
        [mic_x - 5 + offset, mic_y - 10],
        [mic_x + 10 + offset, mic_y]
    ], np.int32)
    cv2.polylines(img, [points], False, ACCENT_COLOR, 2)

# ---------------------
# ADD AI TAG (Top Left Corner)
# ---------------------
ai_tag_x, ai_tag_y = 50, 50
tag_w, tag_h = 120, 40
cv2.rectangle(img, (ai_tag_x, ai_tag_y), (ai_tag_x + tag_w, ai_tag_y + tag_h), HIGHLIGHT_COLOR, -1)
cv2.putText(img, "AI POWERED", (ai_tag_x + 10, ai_tag_y + 27),
            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 1, cv2.LINE_AA)

# ---------------------
# MAIN TITLE (Center Top)
# ---------------------
title_font_scale = 1.8
title_thickness = 3
title_size = cv2.getTextSize(MAIN_TITLE, cv2.FONT_HERSHEY_DUPLEX, title_font_scale, title_thickness)[0]
title_x = (WIDTH - title_size[0]) // 2
title_y = 100
cv2.putText(img, MAIN_TITLE, (title_x, title_y),
            cv2.FONT_HERSHEY_DUPLEX, title_font_scale, TEXT_COLOR, title_thickness, cv2.LINE_AA)

# ---------------------
# SUBTITLE (Under Main Title)
# ---------------------
sub_font_scale = 1.1
sub_thickness = 2
sub_size = cv2.getTextSize(SUBTITLE, cv2.FONT_HERSHEY_SIMPLEX, sub_font_scale, sub_thickness)[0]
sub_x = (WIDTH - sub_size[0]) // 2
sub_y = title_y + 60
cv2.putText(img, SUBTITLE, (sub_x, sub_y),
            cv2.FONT_HERSHEY_SIMPLEX, sub_font_scale, ACCENT_COLOR, sub_thickness, cv2.LINE_AA)

# ---------------------
# SAVE AND DOWNLOAD
# ---------------------
cv2.imwrite(OUTPUT_IMAGE, img)
print(f"✅ FULL PROFESSIONAL IMAGE GENERATED: {OUTPUT_IMAGE}")

# Download to your device
from google.colab import files
files.download(OUTPUT_IMAGE)

✅ FULL PROFESSIONAL IMAGE GENERATED: linkedin_video_caption_generator.png


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>