In [1]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
from torch.nn.functional import pad
from pydub import AudioSegment
import numpy as np
from gtts import gTTS

import time
import tempfile
import os
import pyttsx3
import torch
import torchaudio
import gc
import subprocess
import io

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Initialize models
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
translator_model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
translator = MarianMTModel.from_pretrained(translator_model_name)

def transcribe_audio(audio_path, target_lang="en"):
    waveform, rate = torchaudio.load(audio_path)
    if waveform.dim() > 2:
        waveform = waveform.mean(dim=0, keepdim=True)
    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    input_features = inputs.input_features
    attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None

    with torch.no_grad():
        predicted_ids = model.generate(input_features, language=target_lang, attention_mask=attention_mask)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print(transcription)
    return transcription

def translate_text(text, target_lang="en"):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = translator.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    print(f'Original text : {inputs.input_ids}')
    print(f'Translated text : {translated_text}')
    return translated_text

def text_to_speech(text, output_audio_path="translated_audio.mp3", language_code="en-US"):
    engine = pyttsx3.init()
    engine.setProperty("rate", 150)  # Adjust speed as needed for sync
    if language_code == "en-US":
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Alex")
    else:
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Thomas")
    engine.save_to_file(text, output_audio_path)
    engine.runAndWait()
    return output_audio_path

def convert_mp3_to_wav(mp3_path, wav_path="converted_audio.wav"):
    audio = AudioSegment.from_mp3(mp3_path).set_channels(1)  # Convert to mono
    audio.export(wav_path, format="wav")
    return wav_path

def add_audio_to_video(video_path, audio_path, output_path="output_video.mp4"):
    import time
    from moviepy.editor import VideoFileClip, AudioFileClip

    video_clip = VideoFileClip(video_path)
    audio_clip = AudioFileClip(audio_path)
    try:
        # Synchronize audio duration with video duration
        audio_clip = audio_clip.set_duration(video_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(output_path, codec="libx264", audio_codec="aac", remove_temp=True)
    finally:
        # Close clips to ensure resources are released
        audio_clip.close()
        video_clip.close()
        del audio_clip, video_clip
        gc.collect()
        time.sleep(1)  # Optional: Introduce a delay to ensure complete cleanup



def process_input(input_path, input_type, target_lang="en"):
    from pydub import AudioSegment
    from moviepy.editor import concatenate_audioclips, AudioFileClip

    if input_type == "video":
        video_clip = VideoFileClip(input_path)
        audio_path = "extracted_audio.wav"
        video_clip.audio.write_audiofile(audio_path)
    elif input_type == "audio":
        audio_path = input_path
    else:
        raise ValueError("Unsupported input type. Please use 'video' or 'audio'.")

    # Split audio into segments for synchronization with precise timestamps
    segment_duration = 1  # seconds (adjust as needed)
    audio = AudioSegment.from_file(audio_path)
    num_segments = len(audio) // (1000 * segment_duration) + 1
    audio_chunks = [audio[i * 1000 * segment_duration:(i + 1) * 1000 * segment_duration]
                    for i in range(num_segments)]

    translated_audio_segments = []
    for i, chunk in enumerate(audio_chunks):
        chunk_path = f"chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        original_text = transcribe_audio(chunk_path)
        translated_text = translate_text(original_text, target_lang)
        translated_chunk_path = f"translated_chunk_{i}.mp3"
        text_to_speech(translated_text, output_audio_path=translated_chunk_path)
        translated_audio_segments.append(AudioFileClip(translated_chunk_path))

    # Concatenate all translated audio segments with precise synchronization
    final_audio = concatenate_audioclips(translated_audio_segments)
    final_audio_path = "final_translated_audio.mp3"
    final_audio.write_audiofile(final_audio_path)

    # Add the final synchronized audio to the video
    if input_type == "video":
        output_path = "translated_video.mp4"
        add_audio_to_video(input_path, final_audio_path, output_path=output_path)
    else:
        output_path = final_audio_path

    return output_path


In [6]:

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
translator_model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
translator = MarianMTModel.from_pretrained(translator_model_name)

def pad_mel_features(mel_features, target_length=3000):
    # Pad mel features to match the expected length of 3000
    print(f'Length of mel_features : {mel_features.shape}')
    current_length = mel_features.shape[-1]
    if current_length < target_length:
        mel_features = pad(mel_features, (0, target_length - current_length), mode='constant', value=0)
    print(f'Length of mel_features : {mel_features.shape}')
    return mel_features

def transcribe_audio(audio_path):
    waveform, rate = torchaudio.load(audio_path)
    print(f'Rate : {rate}')
    print(f'Dims of waveform : {waveform.shape}')
    if waveform.dim() >= 2:
        waveform = waveform.mean(dim=0, keepdim=True)
    print(f'Dims of waveform : {waveform.shape}')
    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    print(f'Dims of waveform : {waveform.shape}')
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    input_features = inputs.input_features

    # Ensure input features are correctly padded to 3000
    input_features = pad_mel_features(input_features)
    print(f'Data type of inputs : {inputs.keys()}')
    print(f'Number of keys of inputs : {len(inputs)}')
    print(f'Shape of input_features : {input_features.shape}')

    # Generate transcription
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
        print(f'Shape of predicted_ids : {predicted_ids.shape}')
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print(transcription)
    return transcription

def translate_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = translator.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    print(f"Translated text: {translated_text}")
    return translated_text

def text_to_speech(text, output_audio_path="translated_audio.mp3", language_code="en-US"):
    engine = pyttsx3.init()
    engine.setProperty("rate", 150)  # Adjust speed for synchronization
    if language_code == "en-US":
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Alex")
    else:
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Thomas")
    engine.save_to_file(text, output_audio_path)
    engine.runAndWait()
    return output_audio_path
def add_audio_to_video(video_path, audio_path, output_path="output_video.mp4"):
    # Load the original video
    video = VideoFileClip(video_path)
    
    # Load the new audio
    new_audio = AudioFileClip(audio_path)
    
    # Set the new audio to the video
    video_with_new_audio = video.set_audio(new_audio)
    
    # Write the result to a file
    video_with_new_audio.write_videofile(output_path, codec="libx264", audio_codec="aac")

def process_input(file_path, input_type, target_lang="en"):
    try:
        # Step 1: Extract audio from video if the input is a video
        if input_type == "video":
            audio_path = extract_audio(file_path)
            if audio_path is None:
                print("Error: Failed to extract audio from video.")
                return None  # Exit if audio extraction fails
        else:
            audio_path = file_path  # Assume it's already an audio file

        # Step 2: Transcribe the audio
        original_transcription = transcribe_audio(audio_path)
        print(f'DataType of transcription {type(original_transcription)}')

        # Step 3: Translate the transcription to the target language
        translated_text = translate_text(original_transcription)

        # Step 4: Align the translated audio with the original timing
        aligned_audio_path = process_audio_and_align(audio_path, translated_text)
        if aligned_audio_path is None:
            print("Error: Failed to process and align audio.")
            return None  # Exit if audio processing fails

        # Step 5: Add the aligned audio to the video (if input is a video)
        if input_type == "video":
            output_path = "translated_video.mp4"
            add_audio_to_video(file_path, aligned_audio_path, output_path)
        else:
            output_path = aligned_audio_path

        return output_path

    except Exception as e:
        print(f"An error occurred during the process: {e}")
        return None



def extract_audio(video_path, output_audio_path="extracted_audio.wav"):
    try:
        video = VideoFileClip(video_path)
        audio = video.audio
        audio.write_audiofile(output_audio_path)
        return output_audio_path
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None


import speech_recognition as sr

def get_word_timings(original_audio, recognized_text):
    """
    Get the approximate start and end times for each word in the original audio.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(original_audio) as source:
        audio = recognizer.record(source)
    try:
        # Perform speech-to-text using Google Web Speech API
        transcription = recognizer.recognize_google(audio)
        print(f"Recognized Text: {transcription}")

        # Split recognized text and the translated text into words
        original_words = transcription.split()
        translated_words = recognized_text.split()

        # Calculate the word timings based on the word length in both texts
        word_durations = len(original_audio) / len(original_words)  # approximate duration per word
        timings = []
        current_time = 0

        for i, word in enumerate(original_words):
            # For each word, calculate its start and end time
            end_time = current_time + word_durations
            timings.append((current_time, end_time))
            current_time = end_time

        return timings

    except Exception as e:
        print(f"Error recognizing speech: {e}")
        return []

def process_audio_and_align(audio_path, translated_text):
    original_audio = AudioSegment.from_file(audio_path)
    audio_duration = len(original_audio) / 1000  # in seconds
    print(f'Audio duration: {audio_duration}')
    
    words = translated_text.split()
    print(f'Translated words: {words}')

    # Get the word timings from the original audio using speech recognition
    word_timings = get_word_timings(audio_path, translated_text)

    if not word_timings:
        print("Unable to get word timings, skipping alignment.")
        return None

    # Initialize the text-to-speech engine
    engine = pyttsx3.init()
    engine.setProperty("rate", 100)
    engine.setProperty("voice", "com.apple.speech.synthesis.voice.Alex")

    # Generate the entire sentence audio in one go (instead of generating per word)
    sentence_audio_buffer = io.BytesIO()
    engine.save_to_file(translated_text, sentence_audio_buffer)
    engine.runAndWait()
    
    # Load the generated audio into AudioSegment from memory
    sentence_audio_buffer.seek(0)  # Rewind the buffer to start
    full_audio = AudioSegment.from_file(sentence_audio_buffer, format="wav")
    print(f"Full sentence audio generated and loaded.")

    # Split the audio into word-level segments based on the recognized timings
    speech_segments = []
    for i, (start_time, end_time) in enumerate(word_timings):
        # Extract the corresponding word audio segment from the full audio
        word_audio = full_audio[start_time * 1000:end_time * 1000]  # Convert to milliseconds
        speech_segments.append(word_audio)
        
        # Add pause if needed based on the original audio
        if i < len(word_timings) - 1:
            next_start_time = word_timings[i + 1][0]
            pause_duration = (next_start_time - end_time) * 1000  # pause duration in milliseconds
            if pause_duration > 0:
                speech_segments.append(AudioSegment.silent(duration=pause_duration))

    try:
        # Combine all word audio segments into one aligned audio
        aligned_audio = sum(speech_segments) if all(isinstance(seg, AudioSegment) for seg in speech_segments) else None
        if aligned_audio is None:
            raise ValueError("Invalid segments in speech segments.")
        
        # Export aligned audio to a file if needed, for example:
        aligned_audio.export("aligned_audio.wav", format="wav")
        print("Aligned audio exported successfully.")
        return "aligned_audio.wav"  # Return the file path of the final output

    except Exception as e:
        print(f"Error exporting aligned audio: {e}")
        return None





In [4]:
# Use the converted WAV file in the pipeline
audio_path = convert_mp3_to_wav("1108.MP3")
translated_audio = process_input(audio_path, "audio", target_lang="en")

MoviePy - Writing audio in final_translated_audio.mp3


                                                       

MoviePy - Done.




In [7]:
translated_audio = process_input("Recording 2024-11-08 233058.mp4", "video", target_lang="en")

MoviePy - Writing audio in extracted_audio.wav


                                                        

MoviePy - Done.
Rate : 44100
Dims of waveform : torch.Size([2, 228438])
Dims of waveform : torch.Size([1, 228438])
Dims of waveform : torch.Size([82880])
Length of mel_features : torch.Size([1, 80, 518])
Length of mel_features : torch.Size([1, 80, 3000])
Data type of inputs : dict_keys(['input_features'])
Number of keys of inputs : 1
Shape of input_features : torch.Size([1, 80, 3000])




Shape of predicted_ids : torch.Size([1, 9])
 Je vis en France.
DataType of transcription <class 'str'>
Translated text: I live in France.
Audio duration: 5.18
Translated words: ['I', 'live', 'in', 'France.']
Recognized Text: Jab beta France
An error occurred during the process: Decoding failed. ffmpeg returned error code: 3199971767

Output from ffmpeg/avlib:

ffmpeg version 2024-11-03-git-df00705e00-full_build-www.gyan.dev Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 14.2.0 (Rev1, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-libsnappy --enable-zlib --enable-librist --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-libbluray --enable-libcaca --enable-sdl2 --enable-libaribb24 --enable-libaribcaption --enable-libdav1d --enable-libdavs2 --enable-