In [3]:
#!pip install openai-whisper
#!pip install syllapy
#!pip install pysrt
#!pip install opencv-python
#!pip install numpy
#!pip install pillow

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
     ---------------------------------------- 0.0/800.5 kB ? eta -:--:--
     ------------------------------------- 800.5/800.5 kB 16.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numba (from openai-whisper)
  Using cached numba-0.60.0-cp310-cp310-win_amd64.whl.metadata (2.8 kB)
Collecting numpy (from openai-whisper)
  Using cached numpy-2.1.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting torch (from openai-whisper)
  Downloading torch-2.5.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting tqdm (from openai-whisper)
  Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Collecting 

In [4]:
import whisper
import syllapy
import pysrt
import cv2
import numpy as np
import subprocess
from PIL import ImageFont, ImageDraw, Image

In [5]:
def count_syllables_syllapy(word):
    """Return the syllable count that ``syllapy.count`` reports for ``word``.

    This is a thin wrapper: the value is passed through unchanged.
    """
    syllable_count = syllapy.count(word)
    return syllable_count

# Split a sentence into words and compute the syllable count of each word
def get_word_syllables(sentence):
    """Split ``sentence`` on whitespace and count the syllables of every word.

    Returns:
        A ``(words, syllables_per_word)`` pair of parallel lists: the
        whitespace-separated tokens and, for each token, the value returned
        by ``count_syllables_syllapy``.
    """
    tokens = sentence.split()
    counts = []
    for token in tokens:
        counts.append(count_syllables_syllapy(token))
    return tokens, counts

# Distribute a sentence's time span across its words, proportionally to their syllable counts
def distribute_time_over_words_and_syllables(start_time, end_time, syllables_per_word):
    """Split the interval ``[start_time, end_time]`` between consecutive words.

    Each word receives a share of the interval proportional to its syllable
    count. When the counts sum to zero (for example every word was counted
    as 0 syllables), the interval is split evenly between the words instead
    of dividing by zero.

    Args:
        start_time: Start of the sentence, in the same unit as ``end_time``.
        end_time: End of the sentence.
        syllables_per_word: Syllable count of each word, in order.

    Returns:
        A list with one ``(syllables, word_start, word_end)`` tuple per word,
        in order. Empty when ``syllables_per_word`` is empty.
    """
    syllables_per_word = list(syllables_per_word)
    if not syllables_per_word:
        return []

    total_time = end_time - start_time
    total_syllables = sum(syllables_per_word)

    if total_syllables > 0:
        weights = syllables_per_word
        time_per_unit = total_time / total_syllables
    else:
        # No syllable information at all: give every word an equal share.
        weights = [1] * len(syllables_per_word)
        time_per_unit = total_time / len(syllables_per_word)

    syllable_times = []
    current_time = start_time

    for syllables, weight in zip(syllables_per_word, weights):
        word_duration = weight * time_per_unit
        syllable_times.append((syllables, current_time, current_time + word_duration))
        current_time += word_duration

    return syllable_times



In [6]:
def create_srt_file(karaoke_times, output_srt_file):
    """Write one SRT cue per word to ``output_srt_file`` (UTF-8).

    Args:
        karaoke_times: Iterable of ``(sentence, start_time, end_time,
            word_entries)`` tuples, where ``word_entries`` is an iterable of
            ``(word, syllable_count, word_end_time)`` tuples. Times are
            passed to ``format_srt_time``.
        output_srt_file: Path of the SRT file to create or overwrite.

    Within a sentence, the first word starts at the sentence's
    ``start_time`` and every following word starts where the previous word
    ended. Cues are numbered sequentially from 1 across the whole file, so
    each cue has a unique index.
    """
    cue_number = 0
    with open(output_srt_file, 'w', encoding='utf-8') as f:
        for _sentence, start_time, _end_time, word_entries in karaoke_times:
            cue_start = start_time
            for word, _syllable_count, word_end_time in word_entries:
                cue_number += 1
                start_time_formatted = format_srt_time(cue_start)
                end_time_formatted = format_srt_time(word_end_time)

                # SRT entry format: index, time range, text, blank line
                f.write(f"{cue_number}\n")
                f.write(f"{start_time_formatted} --> {end_time_formatted}\n")
                f.write(f"{word}\n\n")

                # The next word begins where this one ends
                cue_start = word_end_time

def format_srt_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond once, up front, so that
    floating-point error cannot drop a millisecond (e.g. ``1.001`` yields
    ``00:00:01,001``) and a carry propagates correctly into seconds, minutes
    and hours. Negative inputs are clamped to ``00:00:00,000``.
    """
    total_millis = max(0, int(round(seconds * 1000)))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

In [7]:
# Transcribe the audio and generate the per-word timings
def transcribe_audio_to_karaoke_times(audio_file, model_name="tiny", device=None):
    """Transcribe ``audio_file`` with Whisper and estimate per-word end times.

    Args:
        audio_file: Path of the audio (or video) file handed to
            ``model.transcribe``.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"tiny"``.
        device: Device passed to ``whisper.load_model``. ``None`` leaves the
            choice to Whisper instead of forcing CUDA, so the function also
            works on machines without a GPU.

    Returns:
        A list with one ``(sentence, start_time, end_time, words)`` tuple per
        transcribed segment that contains at least one word. ``words`` is a
        list of ``(word, syllable_count, word_end_time)`` tuples.

    Notes:
        If a segment starts after the previous kept segment ended, its start
        is pulled back to that previous end (0 for the first segment), so the
        silent gap is spread over the segment's words.
        NOTE(review): this can show words before they are actually spoken;
        confirm that filling gaps this way is intended.
    """
    model = whisper.load_model(model_name, device=device)
    result = model.transcribe(audio_file)

    sentence_times_global = []
    previous_end_time = 0

    for segment in result['segments']:
        sentence = segment['text']
        start_time = segment['start']
        end_time = segment['end']

        words, syllables_per_word = get_word_syllables(sentence)
        if not words:
            # Nothing to display for this segment; there is no time to distribute.
            continue

        # Close the gap with the previous segment (see Notes in the docstring)
        if start_time > previous_end_time:
            start_time = previous_end_time

        syllable_times = distribute_time_over_words_and_syllables(start_time, end_time, syllables_per_word)
        word_end_times = [word_end for _, _, word_end in syllable_times]
        syllables_durations = list(zip(words, syllables_per_word, word_end_times))
        sentence_times_global.append((sentence, start_time, end_time, syllables_durations))

        previous_end_time = end_time

    return sentence_times_global

In [8]:
# Load the SRT file
def load_srt_file(srt_file):
    """Read ``srt_file`` with ``pysrt`` and return its cues as plain tuples.

    Returns:
        A list of ``(text, start, end)`` tuples in file order, where ``text``
        is the cue text with surrounding whitespace stripped and ``start`` /
        ``end`` are the cue's ``ordinal`` values divided by 1000.
    """
    return [
        (cue.text.strip(), cue.start.ordinal / 1000, cue.end.ordinal / 1000)
        for cue in pysrt.open(srt_file)
    ]

In [23]:
# Draw text on frames
def render_text_on_frame(frame, text, highlighted_words, font_path, font_size, pos=(300, 270), font_color=(255, 255, 255), highlight_color=(255, 0, 0), bg_color=(0, 0, 0), padding=7, y_offset_adjust=25, right_adjust=10, bg_opacity=255):
    """Draw ``text`` horizontally centred near the bottom of an OpenCV frame.

    The previous version had both drawing calls commented out, so it
    returned the frame unchanged. This version draws the background box and
    the words.

    Args:
        frame: BGR image as produced by OpenCV.
        text: Text to draw; it is split on whitespace into words.
        highlighted_words: Container of words to draw in ``highlight_color``;
            every word of ``text`` found in it is highlighted.
        font_path: Path of the TrueType font handed to ``ImageFont.truetype``.
        font_size: Font size handed to ``ImageFont.truetype``.
        pos: Accepted for backward compatibility; not used.
        font_color: RGB colour of non-highlighted words.
        highlight_color: RGB colour of highlighted words.
        bg_color: RGB colour of the box drawn behind the text.
        padding: Margin, in pixels, between the text and the box edges.
        y_offset_adjust: Added to the vertical position (positive moves the
            text down).
        right_adjust: Accepted for backward compatibility; not used.
        bg_opacity: Alpha (0-255) of the background box; ``0`` disables it.

    Returns:
        A new BGR frame with the text composited on top. If ``text`` has no
        words, an unmodified copy of ``frame`` is returned.
    """
    word_gap = 10  # horizontal space, in pixels, between consecutive words

    words = text.split()
    if not words:
        # Nothing to draw; still return an independent frame like the normal path.
        return frame.copy()

    # Convert the frame to an RGBA Pillow image so the overlay can carry transparency
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).convert("RGBA")

    # Fully transparent layer on which the box and the text are drawn
    overlay = Image.new("RGBA", pil_image.size, (0, 0, 0, 0))
    overlay_draw = ImageDraw.Draw(overlay)

    font = ImageFont.truetype(font_path, font_size)

    # Measure every word once; each box is (left, top, right, bottom) relative to (0, 0)
    word_boxes = [overlay_draw.textbbox((0, 0), word, font=font) for word in words]
    word_widths = [right - left for left, _, right, _ in word_boxes]
    line_top = min(top for _, top, _, _ in word_boxes)
    line_bottom = max(bottom for _, _, _, bottom in word_boxes)
    text_width = sum(word_widths) + word_gap * (len(words) - 1)
    text_height = line_bottom - line_top

    # Centre horizontally, anchor near the bottom edge
    frame_width, frame_height = pil_image.size
    x_offset = (frame_width - text_width) // 2
    y_offset = frame_height - text_height - 50 + y_offset_adjust

    # Background box behind the text
    if bg_opacity > 0:
        overlay_draw.rectangle(
            [x_offset - padding, y_offset - padding, x_offset + text_width + padding, y_offset + text_height + padding],
            fill=(*bg_color, bg_opacity),
        )

    # Draw the words left to right. Subtracting the bbox origin makes the
    # visible glyphs start exactly at (word_x, y_offset).
    word_x = x_offset
    for word, box, width in zip(words, word_boxes, word_widths):
        word_color = highlight_color if word in highlighted_words else font_color
        overlay_draw.text((word_x - box[0], y_offset - line_top), word, font=font, fill=(*word_color[:3], 255))
        word_x += width + word_gap

    # Merge the overlay into the image and convert back to an OpenCV BGR frame
    combined = Image.alpha_composite(pil_image, overlay)
    return cv2.cvtColor(np.array(combined.convert("RGB")), cv2.COLOR_RGB2BGR)



In [10]:

# Create a video with karaoke subtitles driven by the SRT file
def create_video_with_karaoke_using_srt(input_video, srt_file, output_video, font_path, font_size):
    """Burn the words of ``srt_file`` into ``input_video``, one word at a time.

    Every frame of the input is written to the output. A frame whose
    timestamp falls inside a cue's time range gets that cue's word drawn on
    it via ``render_text_on_frame``; all other frames are written unchanged.

    Args:
        input_video: Path of the source video.
        srt_file: Path of the SRT file, read with ``load_srt_file``.
        output_video: Not used by this function; the result is always written
            to ``'temp_output.mp4'``. Kept so existing callers keep working.
        font_path: Font path forwarded to ``render_text_on_frame``.
        font_size: Font size forwarded to ``render_text_on_frame``.

    Returns:
        The path of the silent intermediate video, ``'temp_output.mp4'``.

    Raises:
        IOError: If ``input_video`` cannot be opened.
    """
    word_times = load_srt_file(srt_file)
    temp_output_video = 'temp_output.mp4'  # Temp video without audio

    cap = cv2.VideoCapture(input_video)
    out = None
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open input video: {input_video}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(temp_output_video, fourcc, fps, (frame_width, frame_height))

        current_word_index = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

            # Skip every cue that has already ended BEFORE rendering, so the
            # frame on which a word changes shows the new word instead of nothing.
            while current_word_index < len(word_times) and current_time > word_times[current_word_index][2]:
                current_word_index += 1

            if current_word_index < len(word_times):
                word, start_time, end_time = word_times[current_word_index]
                if start_time <= current_time <= end_time:
                    frame = render_text_on_frame(frame, word, [word], font_path, font_size)

            # Frames after the last cue are still written, so the video is not truncated.
            out.write(frame)
    finally:
        # Release the handles even if reading or rendering fails
        cap.release()
        if out is not None:
            out.release()

    return temp_output_video  # Return temp video without audio


In [25]:

# Add the audio track to the video using FFmpeg
def combine_video_and_audio(input_video, input_audio, output_video):
    """Mux the video of ``input_video`` with the audio of ``input_audio``.

    Runs the ``ffmpeg`` executable found on ``PATH``. The video stream is
    copied without re-encoding and the audio is encoded as AAC at 192 kbit/s.
    The output stops at the end of the shorter of the two streams. An
    existing ``output_video`` is overwritten.

    Args:
        input_video: File providing the video stream (subtitles already
            burned in).
        input_audio: File providing the audio stream.
        output_video: Path of the file to write.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    ffmpeg_command = [
        'ffmpeg',
        '-y',                               # Overwrite the output instead of prompting on stdin
        '-i', input_video,                  # Input 0: video with the subtitles already burned in
        '-i', input_audio,                  # Input 1: audio source
        '-map', '0:v:0',                    # Take the video only from the first input
        '-map', '1:a:0',                    # Take the audio only from the second input
        '-c:v', 'copy',                     # Do not re-encode the video
        '-c:a', 'aac',                      # AAC audio codec
        '-b:a', '192k',                     # Audio bitrate
        '-shortest',                        # Stop when the shorter of the two streams ends
        output_video                        # Output file
    ]

    # Run FFmpeg and surface failures instead of silently ignoring them
    subprocess.run(ffmpeg_command, check=True)
