https://imagemagick.org/script/download.php

In [1]:
import torch
from pathlib import Path
from faster_whisper import WhisperModel
from playsound import playsound
import time
from colorama import Fore, Style
from moviepy.config import change_settings, get_setting

# ════════════════════════════════════════════════════════════
# Global Settings
# ════════════════════════════════════════════════════════════

# Device configuration for WhisperModel
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "int8_float16" if DEVICE == "cuda" else "int8"
MODEL_SIZE = "large-v3"

# Path to ImageMagick binary (required for MoviePy TextClip)
IMAGEMAGICK_PATH = r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
change_settings({"IMAGEMAGICK_BINARY": IMAGEMAGICK_PATH})

# Print current ImageMagick binary for verification
print(f"ImageMagick Binary: {get_setting('IMAGEMAGICK_BINARY')}")

# Initialize WhisperModel
MODEL = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)

# ════════════════════════════════════════════════════════════
# Function Definitions
# ════════════════════════════════════════════════════════════
    
def group_lyrics_by_segments(audio_path):
    """
    Groups lyrics into verses with metadata for each word.

    Args:
        segments (list): List of transcription segments with metadata.

    Returns:
        list of dict: Each dictionary represents a verse with text and word-level metadata.
    """
    segments, info = MODEL.transcribe(audio_path, word_timestamps=True)

    verses = []

    for segment in segments:
        # Combine all words in the segment to form a verse
        verse_text = " ".join([word.word for word in segment.words])

        verse_data = {
            "text": verse_text,  # Full text of the verse
            "words": [{"word": word.word, "start": word.start, "end": word.end} for word in segment.words]
        }

        verses.append(verse_data)

    return verses

def preview_verses_with_audio(verses, audio_path=None):
    """
    Displays verses with word-level timings in sync with audio playback.
    Highlights the current spoken word in bright yellow.

    Args:
        verses (list): List of grouped verses with text and word metadata.
        audio_path (str, optional): Path to the audio file for playback. Defaults to None.
    """
    # Simulate timing preview
    start_time = time.time()

    for verse in verses:
        while time.time() - start_time < verse["words"][0]["start"]:
            time.sleep(0.01)
        
        for word in verse["words"]:
            # Highlight the current word dynamically
            highlighted_text = " ".join([
                f"{Fore.YELLOW}{w['word']}{Style.RESET_ALL}" if w == word else w["word"]
                for w in verse["words"]
            ])
            
            # Clear the console and print the updated verse
            print(f"\r{highlighted_text}", end="", flush=True)
            
            # Wait until the next word's start time
            while time.time() - start_time < word["end"]:
                time.sleep(0.01)
        
        print("\n")

  from .autonotebook import tqdm as notebook_tqdm


ImageMagick Binary: C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe


In [2]:
instrumental_file_path = Path("./audio_processing/karaoke_files/input_instrumentals/")
vocal_file_path = Path("./audio_processing/karaoke_files/input_vocals/")
font_file_path = Path("./audio_processing/karaoke_files/input_fonts/")
lyrics_file_path = Path("./audio_processing/karaoke_files/input_lyrics/")
output_video_path = Path("./audio_processing/karaoke_files/output_videos/")

In [3]:
# Input paths
vocals = vocal_file_path / "droppin_seeds.mp3"
# vocals = vocal_file_path / "all_the_magic.mp3"

# Align lyrics with audio
verses = group_lyrics_by_segments(str(vocals))

verses

[{'text': ' like  adam  in  the  garden  of  eve  my  bitch  got  an  apple  bottom  and  she  swallowed  my  seeds  follow',
  'words': [{'word': ' like',
    'start': 11.959999999999996,
    'end': 12.539999999999997},
   {'word': ' adam', 'start': 12.539999999999997, 'end': 13.12},
   {'word': ' in', 'start': 13.12, 'end': 13.34},
   {'word': ' the', 'start': 13.34, 'end': 13.44},
   {'word': ' garden', 'start': 13.44, 'end': 13.84},
   {'word': ' of', 'start': 13.84, 'end': 14.02},
   {'word': ' eve', 'start': 14.02, 'end': 14.36},
   {'word': ' my', 'start': 14.36, 'end': 14.84},
   {'word': ' bitch', 'start': 14.84, 'end': 15.0},
   {'word': ' got', 'start': 15.0, 'end': 15.2},
   {'word': ' an', 'start': 15.2, 'end': 15.36},
   {'word': ' apple', 'start': 15.36, 'end': 15.7},
   {'word': ' bottom', 'start': 15.7, 'end': 16.1},
   {'word': ' and', 'start': 16.1, 'end': 16.28},
   {'word': ' she', 'start': 16.28, 'end': 16.38},
   {'word': ' swallowed', 'start': 16.38, 'end': 16.7

In [36]:
from pprint import pprint

pprint(verses)

[{'text': ' like  adam  in  the  garden  of  eve  my  bitch  got  an  apple  '
          'bottom  and  she  swallowed  my  seeds  follow',
  'words': [{'end': 12.539999999999997,
             'start': 11.959999999999996,
             'word': ' like'},
            {'end': 13.12, 'start': 12.539999999999997, 'word': ' adam'},
            {'end': 13.34, 'start': 13.12, 'word': ' in'},
            {'end': 13.44, 'start': 13.34, 'word': ' the'},
            {'end': 13.84, 'start': 13.44, 'word': ' garden'},
            {'end': 14.02, 'start': 13.84, 'word': ' of'},
            {'end': 14.36, 'start': 14.02, 'word': ' eve'},
            {'end': 14.84, 'start': 14.36, 'word': ' my'},
            {'end': 15.0, 'start': 14.84, 'word': ' bitch'},
            {'end': 15.2, 'start': 15.0, 'word': ' got'},
            {'end': 15.36, 'start': 15.2, 'word': ' an'},
            {'end': 15.7, 'start': 15.36, 'word': ' apple'},
            {'end': 16.1, 'start': 15.7, 'word': ' bottom'},
            {'e

In [None]:
preview_verses_with_audio(verses[:2])

___
___
___

In [11]:
from moviepy.editor import (
    VideoFileClip,
    ImageClip,
    ColorClip,
    TextClip,
    AudioFileClip,
    CompositeVideoClip,
)

In [12]:
TextClip.list('font')

['Agency-FB',
 'Agency-FB-Bold',
 'Algerian',
 'Arial',
 'Arial-Black',
 'Arial-Bold',
 'Arial-Bold-Italic',
 'Arial-Italic',
 'Arial-Narrow',
 'Arial-Narrow-Bold',
 'Arial-Narrow-Bold-Italic',
 'Arial-Narrow-Italic',
 'Arial-Rounded-MT-Bold',
 'Bahnschrift',
 'Baskerville-Old-Face',
 'Bauhaus-93',
 'Bell-MT',
 'Bell-MT-Bold',
 'Bell-MT-Italic',
 'Berlin-Sans-FB',
 'Berlin-Sans-FB-Bold',
 'Berlin-Sans-FB-Demi-Bold',
 'Bernard-MT-Condensed',
 'Blackadder-ITC',
 'Bodoni-MT',
 'Bodoni-MT-Black',
 'Bodoni-MT-Black-Italic',
 'Bodoni-MT-Bold',
 'Bodoni-MT-Bold-Italic',
 'Bodoni-MT-Condensed',
 'Bodoni-MT-Condensed-Bold',
 'Bodoni-MT-Condensed-Bold-Italic',
 'Bodoni-MT-Condensed-Italic',
 'Bodoni-MT-Italic',
 'Bodoni-MT-Poster-Compressed',
 'Book-Antiqua',
 'Book-Antiqua-Bold',
 'Book-Antiqua-Bold-Italic',
 'Book-Antiqua-Italic',
 'Bookman-Old-Style',
 'Bookman-Old-Style-Bold',
 'Bookman-Old-Style-Bold-Italic',
 'Bookman-Old-Style-Italic',
 'Bookshelf-Symbol-7',
 'Bradley-Hand-ITC',
 'Britann

In [33]:
from pathlib import Path

def generate_karaoke_video(
    verses, 
    audio_path, 
    resolution=(1280, 720),
    font="Arial", 
    fontsize=48, 
    static_color="white", 
    spoken_color="yellow"
):
    """
    Generates a karaoke video with synchronized lyrics and highlighted words.

    Args:
        verses (list of dict): List of verses, each with text and word metadata.
        audio_path (str): Path to the audio file.
        output_path (str): Path to save the generated video.
        resolution (tuple): Video resolution, default is (1280, 720).
    """
    vocals_duration = max(word["end"] for verse in verses for word in verse["words"])

    # ════════════════════════════════════════════════════════════
    # Audio Clip
    # ════════════════════════════════════════════════════════════
    try:
        audio_clip = AudioFileClip(audio_path)
        audio_duration = audio_clip.duration

        if audio_duration < vocals_duration:
            print(f"Warning: Audio duration ({audio_duration}s) is shorter than video duration ({vocals_duration}s).")
            bf_duration = vocals_duration
        else:
            bf_duration = audio_duration
            
    except Exception as e:
        print(f"Error loading audio file: {audio_path} - {e}")
        audio_clip = None

    # Create a static background (black)

    # ════════════════════════════════════════════════════════════
    # Background Video/Image
    # ════════════════════════════════════════════════════════════
    bg_clip = ColorClip(size=resolution, color=(0, 0, 0), duration=bf_duration)

    # ════════════════════════════════════════════════════════════
    # Verse Clips
    # ════════════════════════════════════════════════════════════
    verse_clips = []
    for verse in verses:
        # verse_start = verse["words"][0]["start"]
        # verse_end = verse["words"][-1]["end"]

        # Create a base clip for the entire verse duration
        # base_clip = TextClip(
        #     verse["text"],
        #     size=(resolution[0], 200),
        #     color=static_color,
        #     bg_color="black",
        #     fontsize=fontsize,
        #     font=font,
        #     method="caption",
        # ).set_position(("center", "center")) \
        #  .set_start(verse_start) \
        #  .set_end(verse_end)
        
        # ════════════════════════════════════════════════════════════
        # Word Highlights
        # ════════════════════════════════════════════════════════════
        # notes 
        word_clips = []
        for word in verse["words"]:
            # highlighted_text = " ".join([
            #     f"**{w['word']}**" if w == word else w["word"] for w in verse["words"]
            # ])
            
            # highlighted_text = " ".join([w["word"] for w in verse["words"]])

            highlighted_text = " ".join(
                [w['word'] if w == word else " " * len(w["word"]) for w in verse["words"]]
            )

            # Create a text clip for the current word
            word_clip = TextClip(
                highlighted_text,
                size=(resolution[0], 200),
                color=spoken_color,
                # color="transparent",
                bg_color="transparent",
                # stroke_color=spoken_color,
                # stroke_width=2,
                fontsize=fontsize,
                font=font,
                method="caption"
            ).set_position(("center", "center")) \
             .set_start(word["start"]) \
             .set_end(word["end"])
            
            word_clips.append(word_clip)

        # Add the base clip and word highlights to the list
        verse_clips.append(base_clip)
        verse_clips.extend(word_clips)

    # Combine all clips
    video = CompositeVideoClip([bg_clip] + verse_clips).set_audio(audio_clip)
        
    return video 

In [34]:
karaoke_video = output_video_path / "droppin_seeds_karaoke.mp4"
karaoke_music = instrumental_file_path / "droppin_seeds.mp3"
karaoke_font = font_file_path / "enter-the-gungeon-big.ttf"
karaoke_lyrics = verses

# karaoke_video = output_video_path / "all_the_magic_karaoke.mp4"
# karaoke_music = instrumental_file_path / "droppin_seeds.mp3"

# Generate the karaoke video
video = generate_karaoke_video(
    verses=karaoke_lyrics,
    audio_path=str(karaoke_music),
    resolution=(1280, 720),
    font="Castellar",
    fontsize=48,
    static_color="white",
    spoken_color="yellow"
)

In [9]:
print(type(video))
pprint(video.__dict__)

<class 'moviepy.video.compositing.CompositeVideoClip.CompositeVideoClip'>
{'audio': <moviepy.audio.io.AudioFileClip.AudioFileClip object at 0x000001F2B700B640>,
 'bg': <moviepy.video.VideoClip.ColorClip object at 0x000001F2B71068C0>,
 'bg_color': (0, 0, 0),
 'clips': [<moviepy.video.VideoClip.ColorClip object at 0x000001F2B700B250>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B705B2E0>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B705B700>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B705BD30>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B70981C0>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B7098610>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B7098A60>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B7098EB0>,
           <moviepy.video.VideoClip.TextClip object at 0x000001F2B7099300>,
           <moviepy.video.VideoClip.TextClip object at 0x

In [35]:
video.write_videofile(
    str(karaoke_video),
    fps=24,
    codec="libx264",
    preset="medium",
    threads=4,
    audio_codec="aac",
    logger="bar"
)

# video.write_videofile(
#     str(karaoke_video),
#     fps=24,                          # Your specified frame rate
#     codec="libx264",                 # Best for high-quality MP4 videos
#     bitrate="3000k",                 # Optimal bitrate for a balance of quality and file size
#     audio_codec="aac",               # Standard codec for audio
#     audio_bitrate="192k",            # High-quality audio bitrate
#     preset="medium",                 # Encoding speed vs compression trade-off
#     threads=4,                       # Adjust based on the number of CPU cores
#     ffmpeg_params=["-crf", "23"],    # Constant Rate Factor (CRF) for quality tuning
#     logger="bar"                     # Show a progress bar
# )

Moviepy - Building video audio_processing\karaoke_files\output_videos\droppin_seeds_karaoke.mp4.
MoviePy - Writing audio in droppin_seeds_karaokeTEMP_MPY_wvf_snd.mp4


chunk:   0%|          | 0/1326 [00:00<?, ?it/s, now=None]

                                                                      

MoviePy - Done.
Moviepy - Writing video audio_processing\karaoke_files\output_videos\droppin_seeds_karaoke.mp4



                                                                

Moviepy - Done !
Moviepy - video ready audio_processing\karaoke_files\output_videos\droppin_seeds_karaoke.mp4


___
___
___

In [48]:

def get_background(background_path, resolution, duration):
    """
    Creates a background clip for the karaoke video.

    Args:
        background_path (str, optional): Path to a video or image file for the background.
        resolution (tuple): Resolution of the video (width, height).
        duration (int): Duration of the background video.

    Returns:
        VideoClip: The background video clip.
    """
    if background_path:

        # Check if it's a video
        if background_path.endswith((".mp4", ".avi", ".mov", ".mkv")):
            return VideoFileClip(background_path).resize(resolution).subclip(0, duration)
        
        # Otherwise, assume it's an image
        elif background_path.endswith((".jpg", ".jpeg", ".png", ".bmp")):
            return ImageClip(background_path).set_duration(duration).resize(resolution)
        else:
            raise ValueError("Unsupported background file format. Use video or image files.")
    else:
        # Default black background
        return ColorClip(size=resolution, color=(0, 0, 0), duration=duration)

def generate_karaoke_video(verses, audio_path, background_path=None, resolution=(1280, 720), font="Arial", fontsize=48, static_color="white", spoken_color="yellow"):
    """
    Optimized function to generate a karaoke video with synchronized lyrics.

    Args:
        verses (list of dict): List of verses, each with text and word metadata.
        audio_path (str): Path to the audio file.
        output_path (str): Path to save the generated video.
        background_path (str, optional): Path to a video or image file for the background.
        resolution (tuple): Video resolution, default is (1280, 720).
        font (str): Font to use for the lyrics.
        fontsize (int): Font size for the lyrics.
        static_color (str): Color for static (unspoken) words.
        spoken_color (str): Color for highlighted (spoken) words.
    """
    # Calculate total video duration
    total_duration = max(word["end"] for verse in verses for word in verse["words"])

    # ════════════════════════════════════════════════════════════
    # Clip: Background Video/Image
    # ════════════════════════════════════════════════════════════
    bg_clip = get_background(
        background_path=background_path,
        resolution=resolution,
        duration=total_duration
    )

    # ════════════════════════════════════════════════════════════
    # Clip: Audio
    # ════════════════════════════════════════════════════════════
    audio_clip = AudioFileClip(audio_path)

    # ════════════════════════════════════════════════════════════
    # Clips: Verse Text
    # ════════════════════════════════════════════════════════════
    word_clips = []
    for verse in verses:
        print("Verse:", verse["text"])

        highlighted_text_parts = []
        for i, word in enumerate(verse["words"]):
            print("Word:", word["word"])
            
            # Cache the text rendering
            highlighted_text_parts.append(f"<span style='color:{spoken_color}'>{word['word']}</span>")
            remaining_text = " ".join([
                f"<span style='color:{static_color}'>{w['word']}</span>" for w in verse["words"][i+1:]
            ])
            full_text = " ".join(highlighted_text_parts) + " " + remaining_text

            try:
                word_clip = TextClip(
                    full_text,
                    fontsize=fontsize,
                    font=font,
                    stroke_color="black",
                    stroke_width=2,
                    method="caption",
                    size=(int(resolution[0] * 0.8), None)
                ).set_position(("center", "center")) \
                 .set_start(word["start"]) \
                 .set_end(word["end"])
                word_clips.append(word_clip)
            except Exception as e:
                print(f"Error generating clip for word: {word['word']} - {e}")

    # Combine all text clips over the background
    final_video = CompositeVideoClip([bg_clip] + word_clips).set_audio(audio_clip)

    return final_video

In [None]:
karaoke_video = output_video_path / "droppin_seeds_karaoke.mp4"
karaoke_music = instrumental_file_path / "droppin_seeds.mp3"
karaoke_font = font_file_path / "enter-the-gungeon-big.ttf"
karaoke_lyrics = verses

# Generate the karaoke video
video = generate_karaoke_video(
    verses=karaoke_lyrics,
    audio_path=str(karaoke_music),
    background_path=None,
    # lower quality for faster rendering
    resolution=(640, 360),
    font="Arial",
    fontsize=48,
    static_color="white",
    spoken_color="yellow"
)

pprint(video.__dict__)

___
___
___

In [None]:
final_video.write_videofile(
    str(karaoke_video),
    fps=12,                          # Reduced FPS for speed
    codec="libx264",                 # CPU-based encoding
    bitrate="500k",                  # Lower bitrate for speed
    audio_codec=None,                # Disable audio for testing
    preset="ultrafast",              # Fastest preset
    threads=8,                       # Use all CPU threads
    ffmpeg_params=["-crf", "30"],    # Higher CRF for faster compression
    logger="bar",                    # Show progress bar
)


In [None]:
final_video.write_videofile(
    str(karaoke_video),
    fps=24,
    codec="libx264",
    bitrate="2000k",  # Higher bitrate for better quality
    audio_codec="aac",
    audio_bitrate="128k",  # Standard audio quality
    preset="medium",  # Balance between encoding speed and compression
    threads=4,  # Use more threads for faster encoding
    ffmpeg_params=["-crf", "23"],  # Balanced CRF value
    logger="bar",
)

In [None]:
final_video.write_videofile(
    str(karaoke_video),
    fps=24,                          # Your specified frame rate
    codec="libx264",                 # Efficient and widely compatible
    bitrate="500k",                  # Low bitrate for smaller file size
    audio_codec="aac",               # Standard audio codec
    audio_bitrate="64k",             # Low audio quality
    preset="ultrafast",              # Fast encoding, less compression
    threads=2,                       # Use 2 threads for encoding
    ffmpeg_params=["-crf", "28"],    # Higher CRF value for more compression
    logger="bar"                     # Show a progress bar
)


In [None]:
final_video.write_videofile(
    str(karaoke_video),
    fps=24,                          # Your specified frame rate
    codec="libx264",                 # Best for high-quality MP4 videos
    bitrate="3000k",                 # Optimal bitrate for a balance of quality and file size
    audio_codec="aac",               # Standard codec for audio
    audio_bitrate="192k",            # High-quality audio bitrate
    preset="medium",                 # Encoding speed vs compression trade-off
    threads=4,                       # Adjust based on the number of CPU cores
    ffmpeg_params=["-crf", "23"],    # Constant Rate Factor (CRF) for quality tuning
    logger="bar"                     # Show a progress bar
)


In [None]:
video.write_videofile(
    str(karaoke_video),
    fps=12,                          # Reduced FPS for speed
    codec="libx264",                 # CPU-based encoding
    bitrate="500k",                  # Lower bitrate for speed
    audio_codec=None,                # Disable audio for testing
    preset="ultrafast",              # Fastest preset
    threads=8,                       # Use all CPU threads
    ffmpeg_params=["-crf", "30"],    # Higher CRF for faster compression
    logger="bar",                    # Show progress bar
)

___
___
___