In [1]:

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
from pydub import AudioSegment
import numpy as np

import pyttsx3
import torch
import torchaudio
import gc

ModuleNotFoundError: No module named 'moviepy.editor'

In [None]:
# Initialize models
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
translator_model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
translator = MarianMTModel.from_pretrained(translator_model_name)

def transcribe_audio(audio_path, target_lang="en"):
    waveform, rate = torchaudio.load(audio_path)
    if waveform.dim() > 2:
        waveform = waveform.mean(dim=0, keepdim=True)
    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    input_features = inputs.input_features
    attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None

    with torch.no_grad():
        predicted_ids = model.generate(
            input_features, language=target_lang, attention_mask=attention_mask
        )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

def translate_text(text, target_lang="en"):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = translator.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

def text_to_speech(text, output_audio_path="translated_audio.mp3", language_code="en-US"):
    engine = pyttsx3.init()
    engine.setProperty("rate", 150)  # Adjust speed as needed for sync
    if language_code == "en-US":
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Alex")
    else:
        engine.setProperty("voice", "com.apple.speech.synthesis.voice.Thomas")
    engine.save_to_file(text, output_audio_path)
    engine.runAndWait()
    return output_audio_path

def convert_mp3_to_wav(mp3_path, wav_path="converted_audio.wav"):
    audio = AudioSegment.from_mp3(mp3_path).set_channels(1)  # Convert to mono
    audio.export(wav_path, format="wav")
    return wav_path

def add_audio_to_video(video_path, audio_path, output_path="output_video.mp4"):
    import time
    from moviepy.editor import VideoFileClip, AudioFileClip

    video_clip = VideoFileClip(video_path)
    audio_clip = AudioFileClip(audio_path)
    try:
        # Synchronize audio duration with video duration
        audio_clip = audio_clip.set_duration(video_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(output_path, codec="libx264", audio_codec="aac", remove_temp=True)
    finally:
        # Close clips to ensure resources are released
        audio_clip.close()
        video_clip.close()
        del audio_clip, video_clip
        gc.collect()
        time.sleep(1)  # Optional: Introduce a delay to ensure complete cleanup



def process_input(input_path, input_type, target_lang="en"):
    if input_type == "video":
        video_clip = VideoFileClip(input_path)
        audio_path = "extracted_audio.wav"
        video_clip.audio.write_audiofile(audio_path)
    elif input_type == "audio":
        audio_path = input_path
    else:
        raise ValueError("Unsupported input type. Please use 'video' or 'audio'.")

    # Split audio into segments for synchronization
    segment_duration = 1  # seconds (adjust as needed)
    audio = AudioSegment.from_file(audio_path)
    audio_chunks = [audio[i * 1000 * segment_duration:(i + 1) * 1000 * segment_duration]
                    for i in range(len(audio) // (1000 * segment_duration))]

    translated_audio_segments = []
    for i, chunk in enumerate(audio_chunks):
        chunk_path = f"chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        original_text = transcribe_audio(chunk_path)
        translated_text = translate_text(original_text, target_lang)
        translated_chunk_path = f"translated_chunk_{i}.mp3"
        text_to_speech(translated_text, output_audio_path=translated_chunk_path)
        translated_audio_segments.append(AudioFileClip(translated_chunk_path))

    # Concatenate all translated audio segments
    final_audio = concatenate_audioclips(translated_audio_segments)
    final_audio_path = "final_translated_audio.mp3"
    final_audio.write_audiofile(final_audio_path)

    # Add the final synchronized audio to the video
    if input_type == "video":
        output_path = "translated_video.mp4"
        add_audio_to_video(input_path, final_audio_path, output_path=output_path)
    else:
        output_path = final_audio_path

    return output_path

In [3]:
translated_audio = process_input("Recording 2024-11-08 233058.mp4", "video", target_lang="en")

MoviePy - Writing audio in extracted_audio.wav


                                                                  

MoviePy - Done.


You have passed language=en, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=en.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


MoviePy - Writing audio in final_translated_audio.mp3


                                                                  

MoviePy - Done.
Moviepy - Building video translated_video.mp4.
MoviePy - Writing audio in translated_videoTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video translated_video.mp4



                                                               

Moviepy - Done !
Moviepy - video ready translated_video.mp4


In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio

# Initialize models for Hindi transcription
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

def transcribe_hindi_audio(audio_path):
    # Load and preprocess audio
    waveform, rate = torchaudio.load(audio_path)
    if waveform.dim() > 2:
        waveform = waveform.mean(dim=0, keepdim=True)
    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze(0)
    
    # Prepare input for the model
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    input_features = inputs.input_features
    attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None

    # Generate transcription in Hindi
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features, 
            attention_mask=attention_mask, 
            forced_decoder_ids=processor.get_decoder_prompt_ids(language="hi", task="transcribe")
        )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

audio_path = "Manav Kaul's lay 'Bali aur Shambhu' by Leher, The Dramatics Society of DCAC, Delhi University. [MZlTjI4sa68].mp3"
print("Transcription:", transcribe_hindi_audio(audio_path))


RuntimeError: Couldn't find appropriate backend to handle uri Manav Kaul's lay 'Bali aur Shambhu' by Leher, The Dramatics Society of DCAC, Delhi University. [MZlTjI4sa68].mp3 and format None.