In [5]:
# NOTE(review): sentencepiece is presumably needed by the MarianTokenizer used for translation below — confirm, and pin a version for reproducibility.
%pip install sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import re

import numpy as np
import pyttsx3
import torch
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from transformers import MarianTokenizer, MarianMTModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
# Initialize models
# These module-level objects are read as globals by the functions defined below.
# Whisper checkpoint: `processor` turns waveforms into input features and decodes
# generated token ids back to text; `model` generates those token ids.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
# MarianMT checkpoint used for text translation; the checkpoint name indicates a
# fixed French -> English direction.
translator_model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
translator = MarianMTModel.from_pretrained(translator_model_name)

# Load audio safely using pydub (FFmpeg)
def load_audio_for_whisper(audio_path, target_sample_rate=16000):
    """Decode an audio file into a mono float32 waveform scaled to [-1, 1].

    Args:
        audio_path: Path to any audio file that pydub can decode.
        target_sample_rate: Sample rate in Hz to resample to (default 16000).

    Returns:
        A 1-D ``numpy.float32`` array of mono samples in the range [-1, 1].
    """
    audio = (
        AudioSegment.from_file(audio_path)
        .set_channels(1)
        .set_frame_rate(target_sample_rate)
    )
    # pydub keeps the sample width of the source file (1, 2 or 4 bytes), so the
    # full-scale value must follow it; a fixed 32768 is only right for 16-bit
    # audio and would leave 32-bit samples far outside [-1, 1].
    full_scale = float(1 << (8 * audio.sample_width - 1))
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / full_scale
    return samples

# Transcribe with Whisper
def transcribe_audio(audio_path, target_lang="en"):
    """Transcribe an audio file to text with the module-level Whisper model.

    The recording is loaded at 16 kHz and fed to Whisper in consecutive
    30-second windows. Whisper's encoder only accepts fixed 30-second inputs:
    the feature extractor's default padding fills short windows up to that
    size, while its default truncation would otherwise silently discard
    everything after the first 30 seconds of a longer recording.

    Args:
        audio_path: Path to the audio file to transcribe.
        target_lang: Forwarded to ``model.generate`` as ``language``.
            NOTE(review): Whisper interprets this as the language spoken in
            the audio, not a translation target — confirm that the default
            "en" is intended when the recording is not English.

    Returns:
        The transcription as a single string; "" if the file has no samples.
    """
    sample_rate = 16000
    window = 30 * sample_rate  # samples per Whisper input window

    waveform = load_audio_for_whisper(audio_path, target_sample_rate=sample_rate)
    if len(waveform) == 0:
        return ""

    pieces = []
    with torch.no_grad():
        for start in range(0, len(waveform), window):
            chunk = waveform[start:start + window]
            # No `padding=True` here: that pads only to the longest item in the
            # batch, leaving clips under 30 s shorter than the encoder expects.
            inputs = processor(chunk, sampling_rate=sample_rate, return_tensors="pt")
            attention_mask = inputs.attention_mask if "attention_mask" in inputs else None
            predicted_ids = model.generate(
                inputs.input_features, language=target_lang, attention_mask=attention_mask
            )
            pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

    # Decoded windows are concatenated unchanged, so a recording that fits in
    # one window yields exactly the text of a single decode.
    return "".join(pieces)

# Translate with MarianMT
def translate_text(text, target_lang="en"):
    """Translate text with the module-level MarianMT tokenizer and model.

    The text is split on sentence-ending punctuation and translated in small
    batches, so a long transcript is not fed to the model as one over-length
    sequence. Each sentence is additionally truncated to the tokenizer's
    maximum length as a safety net for unpunctuated input.

    Args:
        text: Source text to translate.
        target_lang: Accepted for interface compatibility but not used; the
            translation direction is fixed by the loaded checkpoint.

    Returns:
        The translated sentences joined by single spaces; "" for empty or
        whitespace-only input.
    """
    if not text or not text.strip():
        return ""

    # Split after ., !, ? or … followed by whitespace; punctuation stays
    # attached to its sentence.
    sentences = [s for s in re.split(r"(?<=[.!?…])\s+", text.strip()) if s]

    batch_size = 8
    translated_parts = []
    with torch.no_grad():  # inference only, no gradients needed
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            generated = translator.generate(**inputs)
            translated_parts.extend(
                tokenizer.batch_decode(generated, skip_special_tokens=True)
            )
    return " ".join(translated_parts)

# Text to Speech using pyttsx3
def _voice_matches(voice, language_code):
    """Return True if the voice's id or language tags mention language_code."""
    wanted = language_code.lower().replace("_", "-")
    candidates = [voice.id]
    candidates.extend(getattr(voice, "languages", None) or [])
    for candidate in candidates:
        # Some drivers report language tags as bytes rather than str.
        if isinstance(candidate, bytes):
            candidate = candidate.decode("utf-8", errors="ignore")
        if wanted in str(candidate).lower().replace("_", "-"):
            return True
    return False


def text_to_speech(text, output_audio_path="translated_audio.mp3", language_code="en-US"):
    """Synthesize text to an audio file with pyttsx3.

    The first installed voice whose id or language tags contain
    ``language_code`` is selected; if none matches, the engine's default voice
    is used. Matching ignores case and treats "_" and "-" alike, because voice
    ids spell locales differently across platforms (e.g. "EN-US" vs "en_US").

    Args:
        text: Text to speak.
        output_audio_path: Destination file; its parent directory is created
            if missing. NOTE(review): pyttsx3 writes whatever format the
            platform speech driver produces, so the default ".mp3" name may
            not contain MP3 data — confirm before relying on the extension.
        language_code: Locale tag used to pick a voice (default "en-US").

    Returns:
        ``output_audio_path``, unchanged.
    """
    out_dir = os.path.dirname(output_audio_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    engine = pyttsx3.init()
    engine.setProperty("rate", 150)
    for voice in engine.getProperty("voices"):
        if _voice_matches(voice, language_code):
            engine.setProperty("voice", voice.id)
            break
    engine.save_to_file(text, output_audio_path)
    engine.runAndWait()  # blocks until the queued save has been processed
    return output_audio_path

# Process input: extract audio, transcribe, translate, convert to audio
def process_input_basic(input_path, input_type="video", target_lang="en"):
    """Run the full pipeline: extract audio, transcribe, translate, synthesize.

    Args:
        input_path: Path to the input video or audio file.
        input_type: "video" to extract the audio track first, or "audio" to
            use ``input_path`` directly.
        target_lang: Forwarded to ``translate_text``.

    Returns:
        Path of the synthesized audio file ("Temp/final_translated_audio.mp3").

    Raises:
        ValueError: If ``input_type`` is neither "video" nor "audio", or if
            the video has no audio track.
    """
    # Validate before touching the filesystem or opening any media.
    if input_type not in ("video", "audio"):
        raise ValueError("Unsupported input type. Use 'video' or 'audio'.")

    temp_audio_path = "Temp/temp_extracted_audio.wav"
    final_audio_path = "Temp/final_translated_audio.mp3"
    # Both working files live in Temp/, so make sure it exists.
    os.makedirs(os.path.dirname(final_audio_path), exist_ok=True)

    extracted = False
    try:
        # Extract audio
        if input_type == "video":
            video = VideoFileClip(input_path)
            try:
                if video.audio is None:
                    raise ValueError(f"No audio track found in video: {input_path}")
                extracted = True  # from here on a temp file may exist on disk
                video.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')  # WAV PCM
            finally:
                # Releases the clip's video and audio readers even if
                # extraction fails.
                video.close()
            audio_path = temp_audio_path
        else:
            audio_path = input_path

        # Transcribe → Translate → TTS
        # NOTE(review): transcribe_audio is called with its default language
        # setting — confirm it matches the language spoken in the input.
        original_text = transcribe_audio(audio_path)
        translated_text = translate_text(original_text, target_lang)
        text_to_speech(translated_text, output_audio_path=final_audio_path)
    finally:
        # Cleanup only temp file (not input or video), also when a stage fails.
        if extracted and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    return final_audio_path


In [None]:
# Run the full pipeline on a sample video; `output_audio` receives the path that process_input_basic returns for the synthesized audio.
output_audio = process_input_basic("Temp/0518(1).mp4", input_type="video")

MoviePy - Writing audio in temp_extracted_audio.wav


                                                                    

MoviePy - Done.
