## Generating dynamic karaoke-style subtitles for a video

In [2]:
import json

from moviepy import VideoFileClip, CompositeVideoClip, TextClip
from tabulate import tabulate

# Load the character-level timestamp dictionary from disk.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the transcript
# decode the same way on every platform (open() otherwise uses the locale default).
with open('media/eleven_labs_ts_dict.json', encoding='utf-8') as f:
    timings_dict = json.load(f)

In [3]:
def group_chars_into_words(chars, starts, ends):
    """
    Collapse per-character timing data into per-word timing data.

    A word is a maximal run of characters for which ``str.isspace()`` is
    false. Its "start" is the start time of its first character and its
    "end" is the end time of its last character; whitespace characters only
    act as separators and contribute no timing.

    Parameters:
        chars: Iterable of character strings.
        starts: Start times, indexed in parallel with ``chars``.
        ends: End times, indexed in parallel with ``chars``.

    Returns:
        list: One dict per word, in input order, shaped like
          [
            { "text": "Do", "start": 0.0, "end": 0.139 },
            { "text": "you", "start": 0.197, "end": 0.279 },
            ...
          ]
    """
    result = []
    letters = []        # characters of the word currently being assembled
    began_at = None     # start time of the first character in `letters`
    finished_at = None  # end time of the most recent character in `letters`

    for idx, char in enumerate(chars):
        if not char.isspace():
            # First character of a fresh word fixes the word's start time
            if not letters:
                began_at = starts[idx]
            letters.append(char)
            # Every character pushes the word's end time forward
            finished_at = ends[idx]
            continue

        # Whitespace with nothing buffered (leading or repeated separators): skip
        if not letters:
            continue

        # Whitespace after a buffered word: emit it and reset the buffer
        result.append({
            "text": "".join(letters),
            "start": began_at,
            "end": finished_at
        })
        letters = []
        began_at = None
        finished_at = None

    # Input that does not end in whitespace leaves one word still buffered
    if letters:
        result.append({
            "text": "".join(letters),
            "start": began_at,
            "end": finished_at
        })

    return result

In [343]:
import re


def group_words_into_sentences(words, max_words_in_sentence=3):
    """
    Chunk a flat word list into short "sentences" (subtitle lines).

    A chunk is closed right after a word whose text ends in '.', '?' or '!'
    (e.g. "goals?" or "cake."), or as soon as it holds
    ``max_words_in_sentence`` words, whichever happens first.

    Parameters:
        words (list): List of word dictionaries; each must have a "text" key.
        max_words_in_sentence (int): Maximum number of words per sentence.

    Returns:
        list: List of sentences, each a list of word dicts.
    """
    chunks = []
    pending = []

    for entry in words:
        pending.append(entry)

        # Sentence-ending punctuation is tested first; the length cap is only
        # consulted when the punctuation test fails.
        at_boundary = (
            bool(re.search(r'[.?!]\Z', entry["text"]))
            or len(pending) >= max_words_in_sentence
        )
        if at_boundary:
            chunks.append(pending)
            pending = []

    # Trailing words that never reached a boundary form the final chunk
    if pending:
        chunks.append(pending)

    return chunks

In [344]:
# Collapse the per-character timing arrays into word-level entries

words = group_chars_into_words(
    chars=timings_dict['characters'],
    starts=timings_dict['character_start_times_seconds'],
    ends=timings_dict['character_end_times_seconds'],
)

In [345]:
# Split the word list into short subtitle lines. max_words_in_sentence is left at its
# default, so a line closes at '.', '?' or '!' or once it reaches the default word cap.

sentences = group_words_into_sentences(words)

# Sanity check: show the first line's word dicts as a table (column headers are the dict keys)
print(tabulate(sentences[0], headers="keys", tablefmt="pretty"))

+--------+-------+-------+
|  text  | start |  end  |
+--------+-------+-------+
|   Do   |  0.0  | 0.139 |
|  you   | 0.197 | 0.279 |
| really | 0.372 | 0.615 |
+--------+-------+-------+


### Draw subtitles on the video using moviepy

In [341]:
import moviepy as mp


def measure_text_width(text, font, fontsize):
    """
    Measure how large `text` renders in the given font and size.

    Builds a throwaway white TextClip (no margin, no background) and reports
    its `.size`. Despite the function name, the full (width, height) pair is
    returned, not just the width.
    """
    return mp.TextClip(text=text, font=font, font_size=fontsize, color='white').size  # (w, h)


def create_line_with_word_highlight(
        word_data,
        video_w=1080,
        video_h=1920,
        font='BebasNeue-Regular',
        fontsize=60,
        base_color='white',
        highlight_bg='#A020F0BB',
        line_y_ratio=0.8
) -> list[TextClip]:
    """
    Build the subtitle clips for one line of karaoke-style text.

    Returns a flat list ``[base_line, *highlight_clips]`` (not a composite clip):
    1. ``base_line`` shows the whole line (word texts joined by single spaces)
       in `base_color` from the first word's start to the last word's end.
    2. One highlight clip per word re-renders that word on a `highlight_bg`
       background, positioned over the word and shown from word.start to word.end.

    Parameters:
        word_data: List of dicts with "text", "start" and "end" keys, in display order.
        video_w, video_h: Frame size used to centre the line horizontally and to
            place it vertically.
        font, fontsize: Font passed to every TextClip that is created.
        base_color: Text colour of the base line.
        highlight_bg: Background colour of the per-word highlight,
            e.g. a partially transparent purple such as #A020F0BB.
        line_y_ratio: Vertical placement as a fraction of video_h
            (0.8 => the line's top edge sits 80% down the frame).

    Returns:
        list: The base line clip followed by one highlight clip per word.
        An empty `word_data` yields an empty list.
    """
    # Nothing to render for an empty line (indexing word_data[0] would raise IndexError).
    if not word_data:
        return []

    # (A) Create the base line in white (or base_color).
    word_texts = [w["text"] for w in word_data]
    line_text = " ".join(word_texts)

    line_start = word_data[0]["start"]
    line_end = word_data[-1]["end"]

    base_line = mp.TextClip(
        text=line_text,
        font=font,
        font_size=fontsize,
        color=base_color,
        margin=(12, 12)
    )

    line_w, _ = base_line.size
    # Horizontally centred; x_center is the x coordinate of the line's left edge.
    x_center = (video_w - line_w) / 2
    y_pos = video_h * line_y_ratio

    base_line = (base_line
                 .with_start(line_start)
                 .with_duration(line_end - line_start)
                 .with_position((x_center, y_pos)))

    # (B) Horizontal offset of each word = rendered width of everything before it
    # (preceding words plus the trailing space). Offsets are derived from word_data
    # itself rather than from re-splitting line_text, so there is always exactly one
    # offset per word. The first word has nothing before it, so its offset is 0 and
    # no empty-string clip has to be rendered just to measure it.
    offsets = [0]
    for i in range(1, len(word_texts)):
        prefix = " ".join(word_texts[:i]) + " "
        prefix_w, _ = measure_text_width(prefix, font, fontsize)
        offsets.append(prefix_w)

    # (C) One highlight clip per word.
    highlight_clips = []
    for w, offset in zip(word_data, offsets):
        w_text = w["text"]
        w_start = w["start"]
        w_end = w["end"]

        # Both the base line and the highlight use the same (12, 12) margin, so the
        # margin-free offset measured above can be added to the line's left edge as is.
        w_x = x_center + offset
        w_y = y_pos

        width, height = measure_text_width(w_text, font, fontsize)

        highlight_clip: mp.TextClip = mp.TextClip(
            text=w_text,
            font=font,
            font_size=fontsize,
            color='white',            # text color
            bg_color=highlight_bg,     # highlight background
            method='caption',
            size=(width, height),
            transparent=True,
            margin=(12, 12)
        )

        # Reposition & timing
        highlight_clip = (
            highlight_clip
            .with_position((w_x, w_y))
            .with_start(w_start)
            .with_end(w_end)
            .with_duration(w_end - w_start)
        )

        highlight_clips.append(highlight_clip)

    return [base_line, *highlight_clips]


In [347]:
# Flat list of every subtitle clip (base lines plus per-word highlights) for the whole video
text_clips: list[TextClip] = []

for s in sentences:
    # Each call returns a list of clips for one line, so extend() keeps text_clips flat
    sample_line_clip = create_line_with_word_highlight(
        s,
        video_w=1080,
        video_h=1920,
        font='BebasNeue-Regular',
        fontsize=90,
        base_color='white',
        highlight_bg='#7710e2',
        line_y_ratio=0.8
    )
    text_clips.extend(sample_line_clip)

# NOTE(review): (1080, 1920) repeats the video_w/video_h values used above; the axis order
# expected by target_resolution depends on the installed moviepy version — confirm.
video_file = mp.VideoFileClip("media/tmp_output.mp4", target_resolution=(1080, 1920))

# Source video first, then all subtitle clips; the composite's duration is set to the video's duration
final_video = mp.CompositeVideoClip([video_file] + text_clips).with_duration(video_file.duration)

# NOTE(review): the saved output of this cell shows a BrokenPipeError raised in moviepy's
# ffplay audio previewer thread while this preview was running.
final_video.preview(fps=30)

# Print each clip's timing for inspection
for t in text_clips:
    print(f"Start: {t.start}, End: {t.end}, Duration: {t.duration}")



Exception in thread Thread-118:
Traceback (most recent call last):
  File "/Users/kozlovdmitriy/dev/persona_ai_project/persona_ai/.venv/lib/python3.9/site-packages/moviepy/audio/io/ffplay_audiopreviewer.py", line 60, in write_frames
    self.proc.stdin.write(frames_array.tobytes())
BrokenPipeError: [Errno 32] Broken pipe

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/Users/kozlovdmitriy/dev/persona_ai_project/persona_ai/.venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kozlovdmitr

Start: 0.0, End: 0.615, Duration: 0.615
Start: 0.0, End: 0.139, Duration: 0.139
Start: 0.197, End: 0.279, Duration: 0.08200000000000002
Start: 0.372, End: 0.615, Duration: 0.243
Start: 0.662, End: 1.602, Duration: 0.9400000000000001
Start: 0.662, End: 0.813, Duration: 0.1509999999999999
Start: 0.859, End: 1.149, Duration: 0.29000000000000004
Start: 1.196, End: 1.602, Duration: 0.40600000000000014
Start: 1.637, End: 2.194, Duration: 0.5569999999999999
Start: 1.637, End: 1.695, Duration: 0.05800000000000005
Start: 1.776, End: 2.067, Duration: 0.29100000000000015
Start: 2.09, End: 2.194, Duration: 0.10400000000000009
Start: 2.241, End: 3.03, Duration: 0.7889999999999997
Start: 2.241, End: 2.531, Duration: 0.29000000000000004
Start: 2.566, End: 3.03, Duration: 0.46399999999999997
Start: 3.727, End: 4.609, Duration: 0.8820000000000001
Start: 3.727, End: 3.936, Duration: 0.20900000000000007
Start: 3.982, End: 4.191, Duration: 0.20899999999999963
Start: 4.261, End: 4.609, Duration: 0.34799999