# **MeloTTS Setup** 

In [None]:
# Clone MeloTTS and switch the notebook's working directory into the checkout,
# so the editable install below (`pip install -e .`) targets that directory.
!git clone https://github.com/myshell-ai/MeloTTS.git
%cd MeloTTS

# Install the package in editable mode from the current directory (MeloTTS/)
!pip install -e .

# Download UniDic dictionary data through the `unidic` module
# (presumably required by MeloTTS for Japanese text processing - confirm)
!python -m unidic download

# **Packages Install** 

In [None]:
# PyTorch wheels are pulled from the cu118 (CUDA 11.8) index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Model loading / device handling
!pip install transformers accelerate
# Speech-to-text
!pip install openai-whisper
# Video and audio I/O plus the gTTS speech synthesis package
!pip install moviepy pydub gtts
# Audio analysis, file reading and noise reduction
!pip install librosa soundfile noisereduce
# Process memory stats and progress bars
!pip install psutil tqdm

# Verify GPU is available: report the name and total memory (in GiB) of
# device 0 when CUDA can be used.
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

# **GPU Config**

In [None]:
# gpu_config.py
"""
GPU Configuration and Optimization Settings
"""
import torch
import logging
from accelerate import Accelerator

class GPUOptimizer:
    """
    Centralized GPU optimization configuration.

    On construction the settings are resolved (from an optional ``config``
    module, otherwise from the constructor arguments) and an ``Accelerator``
    is created immediately via :meth:`setup_gpu`.

    Attributes:
        mixed_precision: Precision mode handed to ``Accelerator``.
        gradient_checkpointing: Whether :meth:`optimize_model` should try to
            enable gradient checkpointing on models that support it.
        compile_mode: Default ``mode`` passed to ``torch.compile``.
        accelerator: The ``Accelerator`` instance created by ``setup_gpu``.
        device: ``accelerator.device``.
    """

    def __init__(self, mixed_precision='fp16', gradient_checkpointing=True):
        """
        Resolve settings and initialize the accelerator.

        Args:
            mixed_precision: Fallback precision mode when no ``config``
                module is importable.
            gradient_checkpointing: Fallback gradient-checkpointing flag.
        """
        try:
            # Raises ImportError when config.py is not importable from the
            # CWD (MeloTTS/) or does not expose a `config` object.
            # NOTE(review): assumes config.get(section, key, default) - confirm.
            from config import config
            self.mixed_precision = config.get("gpu", "mixed_precision", mixed_precision)
            self.gradient_checkpointing = config.get("gpu", "gradient_checkpointing", gradient_checkpointing)
            self.compile_mode = config.get("gpu", "compile_mode", "reduce-overhead")
        except ImportError:
            logging.info("config.py not found or 'config' object missing. Using default GPUOptimizer settings.")
            self.mixed_precision = mixed_precision
            self.gradient_checkpointing = gradient_checkpointing
            self.compile_mode = "reduce-overhead"

        self.accelerator = None
        self.device = None
        self.setup_gpu()

    def setup_gpu(self):
        """
        Initialize GPU settings and accelerator.

        When CUDA is unavailable the accelerator is forced onto the CPU and
        an ``fp16`` request is downgraded to ``'no'``, because fp16 autocast
        needs a GPU and some accelerate releases raise on that combination.
        """
        cuda_available = torch.cuda.is_available()

        if not cuda_available and self.mixed_precision == 'fp16':
            logging.warning("CUDA is not available; disabling fp16 mixed precision and running in full precision.")
            self.mixed_precision = 'no'

        self.accelerator = Accelerator(
            mixed_precision=self.mixed_precision,
            gradient_accumulation_steps=1,
            cpu=not cuda_available
        )
        self.device = self.accelerator.device

        if cuda_available:
            # Let cuDNN auto-tune kernels and allow TF32 math for speed.
            torch.backends.cudnn.benchmark = True
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

            # Log every visible GPU with its total memory.
            for i in range(torch.cuda.device_count()):
                gpu_props = torch.cuda.get_device_properties(i)
                memory_gb = gpu_props.total_memory / 1024**3
                logging.info(f"GPU {i}: {gpu_props.name} ({memory_gb:.1f} GB)")

        logging.info(f"Using device: {self.device}")
        logging.info(f"Mixed precision: {self.mixed_precision}")

    def optimize_model(self, model, compile_mode=None):
        """
        Apply optimizations to a model.

        Args:
            model: Model to prepare.
            compile_mode: Optional ``torch.compile`` mode overriding
                ``self.compile_mode``.

        Returns:
            The prepared model; compiled when ``torch.compile`` exists, the
            device is CUDA and compilation did not raise.
        """
        compile_mode_to_use = compile_mode or self.compile_mode

        # Accelerator.prepare handles device placement (and wrapping).
        model = self.accelerator.prepare(model)

        # Enable gradient checkpointing if the model exposes the hook.
        if self.gradient_checkpointing and hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()
            logging.info("Gradient checkpointing enabled")

        # torch.compile only exists on PyTorch 2.0+; restrict it to CUDA.
        if hasattr(torch, 'compile') and torch.cuda.is_available() and self.device.type == 'cuda':
            logging.info(f"Applying torch.compile with mode: {compile_mode_to_use}")
            try:
                model = torch.compile(model, mode=compile_mode_to_use)
            except Exception as e:
                # Best effort: an uncompiled model is still usable.
                logging.warning(f"torch.compile failed with mode {compile_mode_to_use}: {e}. Model will not be compiled.")

        return model

    def get_memory_info(self):
        """
        Get current GPU memory usage.

        Returns:
            ``None`` when CUDA is unavailable, otherwise a dict with
            ``allocated_gb``, ``cached_gb``, ``total_gb`` and ``free_gb``
            (all in GiB) for the current CUDA device.
        """
        if not torch.cuda.is_available():
            return None

        # Query one consistent device: the allocation counters default to the
        # current device, so the total must come from that same device rather
        # than a hard-coded index 0.
        device_index = torch.cuda.current_device()
        allocated = torch.cuda.memory_allocated(device_index) / 1024**3
        cached = torch.cuda.memory_reserved(device_index) / 1024**3
        total = torch.cuda.get_device_properties(device_index).total_memory / 1024**3

        return {
            'allocated_gb': allocated,
            'cached_gb': cached,
            'total_gb': total,
            'free_gb': total - allocated  # Note: free is total - allocated, not total - cached
        }

    def clear_cache(self):
        """Clear GPU cache to free memory (no-op without CUDA)."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logging.info("GPU cache cleared")

    def log_gpu_status(self):
        """Log current GPU memory status (nothing is logged without CUDA)."""
        memory_info = self.get_memory_info()
        if memory_info:
            logging.info(f"GPU Memory - Allocated: {memory_info['allocated_gb']:.1f}GB, "
                        f"Cached: {memory_info['cached_gb']:.1f}GB, "
                        f"Free (approx): {memory_info['free_gb']:.1f}GB, "
                        f"Total: {memory_info['total_gb']:.1f}GB")

# Global GPU optimizer instance.
# Constructing it runs GPUOptimizer.setup_gpu() immediately, so the
# Accelerator is created as soon as this cell executes.
gpu_optimizer = GPUOptimizer()

# **Performance Monitor**

In [None]:
# Contents of performance_monitor.py, inlined into this notebook cell
# performance_monitor.py
"""
Performance monitoring and profiling utilities for video translation.
"""
import time
import psutil
import logging
from contextlib import contextmanager
import functools

class PerformanceMonitor:
    """
    Monitor system and GPU performance during video translation.

    Timings and process-memory deltas are stored per operation name in
    ``self.metrics``; timing the same name twice overwrites the first entry.
    """

    def __init__(self):
        # operation name -> dict(duration, memory_delta_mb, start_memory_mb, end_memory_mb)
        self.metrics = {}

    @contextmanager
    def timer(self, operation_name: str):
        """
        Context manager to time operations.

        Records wall-clock duration and the change in process RSS for the
        wrapped block, and logs GPU status before and after it. Metrics are
        recorded even when the block raises.

        Args:
            operation_name: Key under which the metrics are stored.
        """
        # perf_counter is monotonic, so a system clock adjustment cannot
        # produce a negative or inflated duration.
        start_time = time.perf_counter()
        start_memory = self._get_memory_usage()

        logging.info(f"Starting {operation_name}...")
        gpu_optimizer.log_gpu_status()  # Log GPU status at start of operation

        try:
            yield
        finally:
            duration = time.perf_counter() - start_time
            end_memory = self._get_memory_usage()

            # Compare against None explicitly: a reading is missing only when
            # it could not be taken, not when it happens to be falsy.
            if start_memory is not None and end_memory is not None:
                memory_delta = end_memory - start_memory
            else:
                memory_delta = 0

            self.metrics[operation_name] = {
                'duration': duration,
                'memory_delta_mb': memory_delta,
                'start_memory_mb': start_memory,
                'end_memory_mb': end_memory
            }

            logging.info(f"Completed {operation_name} in {duration:.2f}s (Memory: {memory_delta:+.1f}MB)")
            gpu_optimizer.log_gpu_status()  # Log GPU status at end of operation

    def _get_memory_usage(self):
        """
        Get current memory usage in MB.

        Returns:
            Resident set size of this process in MB, or ``None`` when it
            cannot be determined.
        """
        try:
            return psutil.Process().memory_info().rss / 1024 / 1024
        except Exception as e:
            # Deliberately best effort, but no longer a bare `except:` that
            # would also swallow KeyboardInterrupt/SystemExit.
            logging.debug(f"Could not read process memory usage: {e}")
            return None

    def log_gpu_status_direct(self):
        """Log current GPU memory status using gpu_optimizer."""
        gpu_optimizer.log_gpu_status()

    def get_summary(self):
        """
        Get performance summary.

        Returns:
            A multi-line report with total time, the summed RAM deltas and a
            per-operation breakdown, or a fixed message when nothing has been
            recorded yet.
        """
        if not self.metrics:
            return "No performance data collected."

        total_time = sum(m['duration'] for m in self.metrics.values())
        # This is the sum of per-operation deltas (net change), not the peak
        # system RAM usage.
        total_memory_change = sum(m['memory_delta_mb'] for m in self.metrics.values())

        rule = '=' * 50
        lines = [
            "",
            rule,
            "PERFORMANCE SUMMARY",
            rule,
            f"Total Time: {total_time:.2f}s",
            f"Net Memory Change (RAM): {total_memory_change:.1f}MB",
            "",
            "Operation Breakdown:",
        ]
        lines.extend(
            f"  {op_name}: {metrics['duration']:.2f}s (RAM Δ: {metrics['memory_delta_mb']:+.1f}MB)"
            for op_name, metrics in self.metrics.items()
        )
        return "\n".join(lines) + "\n"

# Global performance monitor.
# Its timer() context manager calls the module-level `gpu_optimizer`, so that
# object must already exist when a timed block is entered.
performance_monitor = PerformanceMonitor()

# **Main translator script I: Imports and Setup**

In [None]:
# Main translator script - Part 1: Imports and setup
import os
import sys
import logging
import tempfile
import math
import shutil
import mmap
from typing import List, Tuple, Dict, Optional
import numpy as np
import concurrent.futures

import whisper
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
from gtts import gTTS
from pydub import AudioSegment
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
from pydub.playback import play
from pydub.silence import detect_nonsilent
from pydub import effects
import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
import librosa
import soundfile as sf
import noisereduce as nr
# from accelerate import Accelerator # Accelerator is part of gpu_optimizer
from melo.api import TTS

# Set up logging.
# The root logger is normally already configured by the time this cell runs
# (any earlier bare logging.info()/warning() call auto-installs a default
# handler, and notebook front-ends may add their own). Without force=True,
# basicConfig() would then silently do nothing: INFO records would be dropped
# and the log file would never be written.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "video_translator.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_filepath)
    ],
    force=True,  # replace any handlers installed earlier (Python 3.8+)
)

logging.info("Logging system initialized.")

# MeloTTS model holder; stays None until a model is loaded elsewhere.
# (A `global` statement at module level is a no-op, so none is needed here.)
melo_tts_model = None

# Use global GPU optimizer
accelerator = gpu_optimizer.accelerator  # accelerator instance from GPUOptimizer
device = gpu_optimizer.device            # device (cuda/cpu) from GPUOptimizer

# Language mapping: target language code -> MeloTTS language and speaker name.
# Speaker IDs should be names that MeloTTS recognizes for the respective
# language. For English, 'EN-US' is a common speaker; for the others the
# language code itself (e.g. 'ES', 'FR') is used as the speaker name.
# "ja"/"ko" are the ISO 639-1 spellings of the Japanese/Korean entries and are
# accepted alongside the original "jp"/"kr" keys.
LANGUAGE_MODEL_MAP: Dict[str, Dict[str, str]] = {
    "en": {"melo_language": "EN", "speaker_id": "EN-US"},
    "es": {"melo_language": "ES", "speaker_id": "ES"},
    "fr": {"melo_language": "FR", "speaker_id": "FR"},
    "zh": {"melo_language": "ZH", "speaker_id": "ZH"},
    "jp": {"melo_language": "JP", "speaker_id": "JP"},
    "ja": {"melo_language": "JP", "speaker_id": "JP"},
    "kr": {"melo_language": "KR", "speaker_id": "KR"},
    "ko": {"melo_language": "KR", "speaker_id": "KR"},
}

# Audio mixing constants (used outside this cell).
DUCKING_GAIN_DB = -15
CROSSFADE_MS = 100

# **Main translator script II: Core functions**

In [None]:
# Main translator script - Part 2: Core functions
def create_project_structure(video_path: str, target_language: str) -> str:
    """
    Create the working directory tree for one translation job.

    The project directory is placed in the current working directory and is
    named ``<video file stem>_<target_language>_translation``. Existing
    directories are reused.

    Args:
        video_path: Path of the source video; only its file stem is used.
        target_language: Language code embedded in the directory name.

    Returns:
        Absolute path of the project directory.
    """
    print("Creating project structure...")

    video_stem, _extension = os.path.splitext(os.path.basename(video_path))
    folder_name = f"{video_stem}_{target_language}_translation"
    project_dir = os.path.join(os.getcwd(), folder_name)
    os.makedirs(project_dir, exist_ok=True)

    # One sub-folder per pipeline artefact type.
    subfolders = ('audio', 'transcripts', 'translations', 'translated_segments')
    for path in (os.path.join(project_dir, name) for name in subfolders):
        os.makedirs(path, exist_ok=True)

    print(f"Project directory: {project_dir}")
    return project_dir

def memory_mapped_audio_loader(audio_path: str) -> np.ndarray:
    """
    Load an audio file as a 1-D float32 array at 16 kHz.

    Despite the name, the file is read fully into memory with ``soundfile``
    (no ``mmap`` is involved). If that path fails for any reason the file is
    loaded with ``librosa.load`` instead.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        Mono samples resampled to 16 kHz.
    """
    logging.info(f"Loading audio with memory mapping: {audio_path}")

    try:
        with sf.SoundFile(audio_path, mode='r') as f:
            source_rate = f.samplerate
            audio_data = f.read(dtype='float32')  # float32, as used for Whisper input

        # Always downmix to mono. Previously this only happened inside the
        # resampling branch, so multi-channel audio already at 16 kHz was
        # returned as a 2-D (frames, channels) array; (frames, 1) arrays were
        # also resampled along the wrong axis.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        if source_rate != 16000:
            logging.info(f"Resampling audio from {source_rate} Hz to 16000 Hz for Whisper.")
            audio_data = librosa.resample(audio_data, orig_sr=source_rate, target_sr=16000)
        return audio_data
    except Exception as e:
        logging.warning(f"SoundFile loading failed: {e}, falling back to librosa.load for {audio_path}")
        # librosa resamples and downmixes in one call.
        audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)
        return audio_data


def extract_audio(video_path: str, output_path: str) -> str:
    """
    Write a video's audio track to ``output_path`` as 16-bit PCM.

    Args:
        video_path: Path of the source video.
        output_path: Destination audio file path.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the video has no audio track.
    """
    logging.info("Extracting audio from video...")
    print("Extracting audio from video...")
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        # 'pcm_s16le' keeps the output as uncompressed 16-bit PCM for consistency.
        video.audio.write_audiofile(output_path, codec='pcm_s16le', logger=None)
    finally:
        # Release the reader resources held by the clip, even on failure.
        video.close()
    print(f"Audio extracted to: {output_path}")
    return output_path

def transcribe_with_whisper(audio_path: str) -> dict:
    """
    Transcribe an audio file with the Whisper "medium" model.

    The model is loaded on the global ``device``, passed through
    ``gpu_optimizer.optimize_model`` and always released afterwards
    (references dropped, garbage collected, GPU cache cleared), whether or
    not transcription succeeds.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The dict returned by ``model.transcribe`` (word timestamps requested).

    Raises:
        Exception: Any error from audio loading or transcription is logged
            and re-raised.
    """
    import gc

    logging.info("Transcribing audio with Whisper...")
    print("Transcribing audio with Whisper...")

    # Load on the shared device, then apply the shared GPU optimizations.
    whisper_model = gpu_optimizer.optimize_model(
        whisper.load_model("medium", device=device)
    )

    waveform = None
    try:
        waveform = memory_mapped_audio_loader(audio_path)
        logging.info(f"Audio loaded for Whisper: shape={waveform.shape}, dtype={waveform.dtype}")
        result = whisper_model.transcribe(waveform, word_timestamps=True)
    except Exception as e:
        logging.error(f"Whisper transcription failed: {e}", exc_info=True)
        raise
    finally:
        # Drop every local reference before collecting so the memory can
        # actually be reclaimed.
        logging.info("Attempting to unload Whisper model and free memory.")
        del whisper_model, waveform
        gc.collect()
        gpu_optimizer.clear_cache()
        logging.info("Whisper model and associated data unloaded; GPU cache cleared.")

    print(f"Transcription complete. Detected language: {result['language']}")
    logging.info("Whisper transcription completed.")
    return result

In [None]:
def _save_gtts_audio(text: str, lang: str, output_filepath: str) -> None:
    """Synthesize ``text`` with gTTS and store it in the format implied by the output extension."""
    tts = gTTS(text=text, lang=lang, slow=False)
    extension = os.path.splitext(output_filepath)[1].lower().lstrip(".")
    if extension == "mp3":
        tts.save(output_filepath)
        return

    # gTTS always produces MP3 data. Writing it straight into e.g. a ".wav"
    # path leaves a mislabeled file, so transcode through a temporary MP3.
    fd, tmp_mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        tts.save(tmp_mp3_path)
        AudioSegment.from_file(tmp_mp3_path, format="mp3").export(
            output_filepath, format=extension or "wav"
        )
    finally:
        if os.path.exists(tmp_mp3_path):
            os.remove(tmp_mp3_path)


def text_to_speech(text_to_synthesize: str, target_language: str, output_filepath: str,
                   melo_tts_model, melo_lang_code: str, melo_spk_id_name: str, speed: float = 1.0):
    """
    Synthesize speech for ``text_to_synthesize`` into ``output_filepath``.

    MeloTTS is tried first when a model, language code and speaker name are
    all supplied. On any MeloTTS failure gTTS is used; if gTTS fails too, a
    10 ms silent WAV is written so downstream steps still find a file.

    Args:
        text_to_synthesize: Text to speak.
        target_language: Language code passed to gTTS (e.g. 'en', 'fr').
        output_filepath: Destination audio file.
        melo_tts_model: Loaded MeloTTS model, or a falsy value to skip MeloTTS.
        melo_lang_code: MeloTTS language code (used for logging and gating).
        melo_spk_id_name: Speaker name looked up in the model's ``spk2id`` map.
        speed: Speech speed passed to MeloTTS.

    Returns:
        None. The function never raises; failures are logged.
    """
    # Try MeloTTS first
    if melo_tts_model and melo_lang_code and melo_spk_id_name:
        try:
            spk2id = melo_tts_model.hps.data.spk2id
            if spk2id is None:
                # No speaker map to resolve the name against: fall back
                # explicitly instead of relying on a TypeError from `in None`.
                logging.warning(f"MeloTTS model for language {melo_lang_code} seems to have no spk2id map. Attempting default.")
                raise KeyError(f"Speaker '{melo_spk_id_name}' not found.")

            if melo_spk_id_name not in spk2id:
                logging.error(f"Speaker name '{melo_spk_id_name}' not found in MeloTTS model's speaker map for language {melo_lang_code}. Available: {list(spk2id.keys())}")
                raise KeyError(f"Speaker '{melo_spk_id_name}' not found.")  # Force fallback to gTTS

            numerical_speaker_id = spk2id[melo_spk_id_name]

            logging.info(f"MeloTTS: Using lang={melo_lang_code}, spk_name='{melo_spk_id_name}', numerical_id={numerical_speaker_id}")

            melo_tts_model.tts_to_file(
                text_to_synthesize,
                numerical_speaker_id,  # Pass the numerical ID
                output_filepath,
                speed=speed
            )
            logging.info(f"MeloTTS generated audio for '{text_to_synthesize[:50]}...' to {output_filepath} using lang={melo_lang_code}, spk={melo_spk_id_name}")
            return
        except Exception as e:  # Internal MeloTTS errors vary, so catch broadly
            # exc_info routes the traceback through logging (and the log file)
            # instead of printing it to stderr only.
            logging.error(f"MeloTTS generation failed for lang='{melo_lang_code}', spk='{melo_spk_id_name}' with error: {e}. Falling back to gTTS.", exc_info=True)

    # Fallback to gTTS
    try:
        logging.info(f"Using gTTS for '{text_to_synthesize[:50]}...' (lang: {target_language}) to {output_filepath}")
        _save_gtts_audio(text_to_synthesize, target_language, output_filepath)
    except Exception as e:
        logging.error(f"gTTS generation failed for '{text_to_synthesize[:50]}...' with error: {e}. Skipping segment audio.")
        # Create a very short silent file to prevent downstream errors if no audio is generated
        AudioSegment.silent(duration=10, frame_rate=22050).export(output_filepath, format="wav")

def _translate_text_internal(text: str, source_language: str, target_language: str, translation_model, translation_tokenizer) -> str:
    logging.info(f"Translating chunk of text from {source_language} to {target_language}...")
    if not text.strip(): # Handle empty input text
        logging.info("Input text for translation is empty. Returning empty string.")
        return ""
    if source_language == target_language:
        return text

    # Ensure model and tokenizer are on the correct device (handled by accelerator.prepare generally)
    # inputs = translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    # .to(device) might be redundant if model is already prepared and on device, tokenizer usually follows.
    raw_inputs = translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in raw_inputs.items()} # Explicitly move to device

    # For mBART-like models that require src_lang to be set on tokenizer
    if hasattr(translation_tokenizer, 'src_lang') and source_language:
        # mBART uses specific language codes (e.g., 'en_XX', 'fr_XX')
        # We need a mapping if `source_language` is 'en', 'fr'
        # For now, assume `source_language` is already in the correct format if mBART is used, or Helsinki model doesn't need it.
        # This might need refinement if mixing models with different lang code requirements.
        translation_tokenizer.src_lang = source_language 
    
    forced_bos_token_id = None
    if hasattr(translation_tokenizer, 'lang_code_to_id') and target_language in translation_tokenizer.lang_code_to_id:
        forced_bos_token_id = translation_tokenizer.lang_code_to_id[target_language]
    elif hasattr(translation_tokenizer, 'get_lang_id') and callable(translation_tokenizer.get_lang_id):
        try:
            forced_bos_token_id = translation_tokenizer.get_lang_id(target_language)
        except Exception as e:
            logging.warning(f"Could not get lang_id for {target_language} from tokenizer: {e}")


    with autocast(enabled=(accelerator.mixed_precision == 'fp16' or accelerator.mixed_precision == 'bf16')):
        if forced_bos_token_id is not None:
            translated_tokens = translation_model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_new_tokens=512)
        else:
            # This path is for models that don't use/need forced_bos_token_id (e.g. some Helsinki models for specific pairs)
            translated_tokens = translation_model.generate(**inputs, max_new_tokens=512)
    
    translated_text = translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translated_text


def _summarize_text_internal(text: str, summarizer_pipeline, max_length_ratio: float = 0.75, min_length_abs: int = 30) -> str:
    print("Summarizing text...")
    if not text.strip():
        return ""
    
    text_len_chars = len(text)
    # Heuristic: if text is already short, don't summarize or summarize less aggressively
    if text_len_chars < 100: # Adjust this threshold as needed
        # Summarization might not be effective or necessary for very short texts
        # For now, we try to summarize anyway, but one could return text as is.
        pass 

    # Calculate max_length based on ratio, ensuring it's not too small
    # Summarization models often work better with token counts, but char count is an approximation
    # Max length should generally be less than the input length.
    calculated_max_length = int(text_len_chars * max_length_ratio)
    # Ensure max_length is at least min_length_abs + a small margin, and less than text_len_chars
    final_max_length = max(min_length_abs + 10, calculated_max_length) 
    final_max_length = min(final_max_length, text_len_chars -1) # Ensure it's shorter than original
    
    # Ensure min_length is less than max_length and not excessively small
    final_min_length = min(min_length_abs, final_max_length - 5)
    final_min_length = max(10, final_min_length) # Absolute minimum length

    if final_min_length >= final_max_length:
        logging.warning(f"Cannot summarize: min_length ({final_min_length}) >= max_length ({final_max_length}). Returning original text.")
        return text
    
    logging.info(f"Summarizing with min_length={final_min_length}, max_length={final_max_length}")

    with autocast(enabled=(accelerator.mixed_precision == 'fp16' or accelerator.mixed_precision == 'bf16')):
        try:
            summary = summarizer_pipeline(text, max_length=final_max_length, min_length=final_min_length, do_sample=False)[0]['summary_text']
        except Exception as e:
            logging.error(f"Summarization failed: {e}. Returning original text.")
            return text
    
    return summary

# **Audio processing functions**

In [None]:
# Audio processing functions
def noise_reduction(audio: AudioSegment, reduction_amount=0.8):
    """
    Apply noise reduction to the audio segment.

    Args:
        audio: Input segment with 8-, 16- or 32-bit samples.
        reduction_amount: Proportion of noise to remove, 0.0 to 1.0.
            Out-of-range values are clamped.

    Returns:
        A new AudioSegment with the same frame rate, sample width and channel
        count. The input is returned unchanged when it is empty or uses an
        unsupported sample width.
    """
    if not (0.0 <= reduction_amount <= 1.0):
        logging.warning(f"Noise reduction_amount {reduction_amount} out of bounds [0,1]. Clamping.")
        reduction_amount = max(0.0, min(1.0, reduction_amount))

    # pydub exposes samples as SIGNED integers for every width (array types
    # 'b'/'h'/'i'), so 8-bit data must not be treated as unsigned.
    pcm_dtype = {1: np.int8, 2: np.int16, 4: np.int32}.get(audio.sample_width)
    if pcm_dtype is None:
        logging.warning(f"Noise reduction skipped: unsupported sample width of {audio.sample_width} bytes.")
        return audio
    full_scale = float(2 ** (8 * audio.sample_width - 1))

    # Normalize to floats in [-1, 1) for noisereduce.
    samples = np.array(audio.get_array_of_samples()).astype(np.float32) / full_scale
    if samples.size == 0:
        return audio

    if audio.channels > 1:
        # Samples are interleaved; reshape to (frames, channels).
        frames = samples.reshape((-1, audio.channels))
        # Shared noise reference derived from the channel average
        # (prop_decrease=0, so no reduction is applied in this pass).
        noise_clip = nr.reduce_noise(y=frames.mean(axis=1), sr=audio.frame_rate, prop_decrease=0, n_fft=2048, hop_length=512, y_noise=None, stationary=False)

        reduced_channels = [
            nr.reduce_noise(y=frames[:, ch], sr=audio.frame_rate, y_noise=noise_clip, prop_decrease=reduction_amount, stationary=True)
            for ch in range(audio.channels)
        ]
        # Re-interleave the cleaned channels.
        reduced_noise_samples = np.stack(reduced_channels, axis=-1).flatten()
    else:  # Mono
        reduced_noise_samples = nr.reduce_noise(y=samples, sr=audio.frame_rate, prop_decrease=reduction_amount)

    # Back to integer PCM. Scale in float64 and clip first so that values at
    # or beyond full scale saturate instead of wrapping around.
    scaled = np.asarray(reduced_noise_samples, dtype=np.float64) * full_scale
    pcm = np.clip(scaled, -full_scale, full_scale - 1).astype(pcm_dtype)

    return AudioSegment(
        pcm.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels
    )

def enhance_voice(audio: AudioSegment):
    """
    Enhance the voice in the audio segment.

    Processing chain: 80 Hz high-pass, dynamic range compression, then
    normalization with 3 dB of headroom.

    Args:
        audio: Segment to process.

    Returns:
        The processed segment.
    """
    # Remove low-frequency rumble. (No low-pass stage is applied.)
    rumble_free = audio.high_pass_filter(80)

    # Even out volume levels: -20 dBFS threshold, 4:1 ratio, 5 ms attack,
    # 50 ms release. These are starting points and may need tuning.
    compressed = effects.compress_dynamic_range(
        rumble_free,
        threshold=-20.0,
        ratio=4.0,
        attack=5.0,
        release=50.0,
    )

    # Normalize for consistent loudness, leaving 3 dB of headroom.
    return effects.normalize(compressed, headroom=3.0)

def advanced_time_stretch(audio: AudioSegment, target_duration: int) -> AudioSegment:
    if len(audio) == 0 or target_duration <=0:
        logging.warning(f"Cannot time stretch, invalid input audio length {len(audio)} or target duration {target_duration}.")
        return audio if len(audio) > 0 else AudioSegment.silent(duration=10)
    
    logging.info(f"Performing advanced time stretch. Input: {len(audio)/1000:.2f}s, Target: {target_duration/1000:.2f}s")
    
    # Convert pydub AudioSegment to numpy array (float32, normalized to [-1, 1])
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    if audio.sample_width == 2:
        samples /= (2**15) # For 16-bit audio
    elif audio.sample_width == 1: # 8-bit unsigned
        samples = (samples - 128) / 128.0
    # Add other sample widths if necessary

    if audio.channels == 2:
        samples_mono = samples.reshape((-1, 2)).mean(axis=1) # Librosa time_stretch needs mono
    else:
        samples_mono = samples
    
    stretch_factor = len(audio) / target_duration # rate for librosa.effects.time_stretch
    # Clamp stretch_factor to avoid extreme distortion (e.g., 0.5x to 2.0x speed)
    # rate < 1 slows down, rate > 1 speeds up.
    # So if current is 10s, target is 5s, factor is 2 (speed up by 2x)
    # if current is 5s, target is 10s, factor is 0.5 (slow down by 2x)
    stretch_factor = max(min(stretch_factor, 2.0), 0.5)
    
    logging.info(f"Adjusted stretch factor (rate for librosa): {stretch_factor:.2f}")
    
    # Librosa time_stretch
    stretched_samples_mono = librosa.effects.time_stretch(samples_mono, rate=stretch_factor)
    
    # If original was stereo, we need to create a stereo output from stretched mono
    # A simple way is to duplicate the mono channel.
    if audio.channels == 2:
        stretched_samples = np.vstack((stretched_samples_mono, stretched_samples_mono)).T.flatten()
    else:
        stretched_samples = stretched_samples_mono

    # Ensure correct duration by padding or truncating (librosa stretch is approximate)
    target_num_samples = int(target_duration / 1000 * audio.frame_rate * audio.channels)
    current_num_samples = len(stretched_samples)

    if current_num_samples > target_num_samples:
        stretched_samples = stretched_samples[:target_num_samples]
    elif current_num_samples < target_num_samples:
        padding = np.zeros(target_num_samples - current_num_samples)
        stretched_samples = np.concatenate([stretched_samples, padding])
    
    # Convert back to original integer type for AudioSegment
    if audio.sample_width == 2:
        stretched_samples = (stretched_samples * (2**15)).astype(np.int16)
    elif audio.sample_width == 1:
        stretched_samples = ((stretched_samples * 128) + 128).astype(np.uint8)
    
    stretched_audio = AudioSegment(
        stretched_samples.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels
    )
    
    return stretched_audio

# **Segment Processing and sync**

In [None]:
# In your main script, replace the existing process_segment function
def process_segment(segment_info: Tuple[int, dict], original_audio_segment_duration: int, 
                    source_language: str, target_language: str, segments_dir: str,
                    translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model):
    """Translate one transcript segment, synthesize it, and fit it to the original timing.

    Args:
        segment_info: ``(index, segment)`` pair; ``segment`` must provide
            ``'start'`` and ``'end'`` (seconds) and ``'text'``.
        original_audio_segment_duration: Currently unused. The target duration
            is always derived from the segment's own timestamps; the parameter
            is kept so existing callers keep working.
        source_language: Language code handed to the translation helper.
        target_language: Language code used for translation, for the
            ``LANGUAGE_MODEL_MAP`` lookup and for ``text_to_speech``.
        segments_dir: Directory that receives ``segment_NNNN.wav``.
        translation_model, translation_tokenizer: Forwarded to
            ``_translate_text_internal``.
        summarizer_pipeline: Forwarded to ``_summarize_text_internal`` when the
            synthesized audio is much longer than the original segment.
        melo_tts_model: Forwarded to ``text_to_speech``.

    Returns:
        ``(start_time_ms, wav_path)`` on success, or ``None`` when the segment
        has a non-positive duration, the translation is empty, or any step of
        synthesis / audio processing raises (the error is logged).
    """
    import gc

    i, segment = segment_info  # Unpack index and segment data
    start_time_ms = int(segment['start'] * 1000)
    end_time_ms = int(segment['end'] * 1000)
    original_duration_ms = end_time_ms - start_time_ms
    if original_duration_ms <= 0:  # Nothing to fit audio into
        logging.warning(f"Segment {i} has zero or negative duration. Skipping.")
        return None

    logging.info(f"Processing segment {i}: '{segment['text'][:50]}...' ({original_duration_ms}ms)")

    # Translate segment text
    translated_segment_text = _translate_text_internal(segment['text'], source_language, target_language, translation_model, translation_tokenizer)

    # Guard against None as well as blank output so a failed translation
    # skips the segment instead of raising AttributeError on .strip().
    if not translated_segment_text or not translated_segment_text.strip():
        logging.warning(f"Skipping empty translated segment for: '{segment['text']}'")
        return None

    tts_output_path = os.path.join(segments_dir, f"segment_{i:04d}.wav")

    try:
        melo_lang_info = LANGUAGE_MODEL_MAP.get(target_language)
        if melo_lang_info:
            melo_lang_code = melo_lang_info["melo_language"]
            melo_spk_id_name = melo_lang_info["speaker_id"]
        else:
            logging.warning(f"MeloTTS language/speaker info not found for {target_language}. Fallback to gTTS will occur.")
            # Best guesses when the language has no explicit mapping.
            melo_lang_code = target_language
            melo_spk_id_name = target_language

        # Generate initial TTS audio
        text_to_speech(
            text_to_synthesize=translated_segment_text,
            target_language=target_language,  # For gTTS fallback
            output_filepath=tts_output_path,
            melo_tts_model=melo_tts_model,
            melo_lang_code=melo_lang_code,  # For MeloTTS
            melo_spk_id_name=melo_spk_id_name,  # For MeloTTS
            speed=1.0  # MeloTTS base speed
        )

        translated_audio_segment = AudioSegment.from_file(tts_output_path)
        current_tts_duration = len(translated_audio_segment)

        # Summarize only when the TTS audio is both >50% longer than the
        # original segment and more than 1.5 s longer in absolute terms.
        if current_tts_duration > original_duration_ms * 1.5 and current_tts_duration - original_duration_ms > 1500:
            logging.info(f"Segment {i} TTS ({current_tts_duration}ms) is much longer than original ({original_duration_ms}ms). Summarizing text.")
            summarized_text = _summarize_text_internal(translated_segment_text, summarizer_pipeline)
            if summarized_text.strip() and len(summarized_text) < len(translated_segment_text):
                text_to_speech(
                    text_to_synthesize=summarized_text,
                    target_language=target_language,
                    output_filepath=tts_output_path,
                    melo_tts_model=melo_tts_model,
                    melo_lang_code=melo_lang_code,
                    melo_spk_id_name=melo_spk_id_name,
                    speed=1.0
                )
                translated_audio_segment = AudioSegment.from_file(tts_output_path)
                logging.info(f"Segment {i} re-synthesized with summarized text. New TTS duration: {len(translated_audio_segment)}ms")
            else:
                logging.info(f"Summarization did not significantly shorten text for segment {i}. Using original TTS.")

        # Audio processing pipeline
        # 1. Time stretch to match the original segment duration.
        stretched_audio = advanced_time_stretch(translated_audio_segment, original_duration_ms)
        del translated_audio_segment
        gc.collect()

        # 2. Noise reduction is intentionally skipped here; the stretched
        #    audio goes straight to enhancement.

        # 3. Voice enhancement (normalization, light compression, EQ).
        enhanced_audio = enhance_voice(stretched_audio)
        # Identity check: only release the intermediate when enhance_voice
        # returned a new object (comparing with != would compare contents).
        if enhanced_audio is not stretched_audio:
            del stretched_audio
            gc.collect()

        # Export the final processed segment audio to disk
        enhanced_audio.export(tts_output_path, format="wav")
        del enhanced_audio
        gc.collect()

        return (start_time_ms, tts_output_path)

    except Exception as e:
        logging.error(f"Error processing segment {i} ('{segment['text'][:30]}...'): {str(e)}", exc_info=True)
        gpu_optimizer.clear_cache()  # Clear cache on error too
        gc.collect()
        return None

def adaptive_segment_processing(segments_data: List[dict], original_audio: AudioSegment, 
                                source_language: str, target_language: str, project_dir: str,
                                translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model):
    """Run ``process_segment`` over every transcript segment, one at a time.

    Segments are handled sequentially (no parallelism) and each one is given
    its position in ``segments_data`` as its index. ``original_audio`` is
    accepted for signature compatibility but not consulted here.

    Returns:
        The truthy results of ``process_segment`` in input order; segments
        for which it returned a falsy value are left out.
    """
    segments_dir = os.path.join(project_dir, 'translated_segments')
    os.makedirs(segments_dir, exist_ok=True)

    # Materialize the (index, segment) pairs up front so the progress bar
    # knows the total number of items.
    indexed_segments = list(enumerate(segments_data))

    completed = []
    for indexed_segment in tqdm(indexed_segments, desc="Processing segments"):
        # The duration argument is a placeholder (0): process_segment derives
        # the duration from the segment's own timestamps.
        outcome = process_segment(
            indexed_segment, 0, source_language, target_language, segments_dir,
            translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model,
        )
        if outcome:
            completed.append(outcome)

    return completed

def _speech_intervals_ms(transcript: dict, total_ms: int, pad_ms: int = 0) -> list:
    """Return sorted, non-overlapping speech intervals in ms, clamped to ``[0, total_ms]``.

    Segments whose end is not after their start are ignored. Each remaining
    interval is widened by ``pad_ms`` on both sides, clamped to the audio
    bounds, and merged with any interval it touches or overlaps.
    """
    intervals = []
    for seg in transcript['segments']:
        start_ms = int(seg['start'] * 1000)
        end_ms = int(seg['end'] * 1000)
        if end_ms <= start_ms:
            continue
        start_ms = max(0, start_ms - pad_ms)
        end_ms = min(total_ms, end_ms + pad_ms)
        if start_ms < end_ms:
            intervals.append((start_ms, end_ms))
    intervals.sort()

    merged = []
    for start_ms, end_ms in intervals:
        if merged and start_ms <= merged[-1][1]:
            prev_start, prev_end = merged[-1]
            merged[-1] = (prev_start, max(prev_end, end_ms))
        else:
            merged.append((start_ms, end_ms))
    return merged


def _rebuild_with_speech_replaced(audio, intervals, replace_speech):
    """Rebuild ``audio`` with each interval swapped for ``replace_speech(slice)``.

    ``intervals`` must be sorted and non-overlapping. The result is padded
    with silence or truncated so its length equals ``len(audio)``.
    """
    rebuilt = AudioSegment.empty()
    cursor_ms = 0
    for start_ms, end_ms in intervals:
        if start_ms > cursor_ms:
            rebuilt += audio[cursor_ms:start_ms]
        rebuilt += replace_speech(audio[start_ms:end_ms])
        cursor_ms = end_ms

    # Append whatever follows the last speech interval.
    if cursor_ms < len(audio):
        rebuilt += audio[cursor_ms:]

    # Keep the timeline exactly as long as the source.
    if len(rebuilt) < len(audio):
        missing_ms = len(audio) - len(rebuilt)
        rebuilt += AudioSegment.silent(duration=missing_ms, frame_rate=audio.frame_rate).set_channels(audio.channels)
    elif len(rebuilt) > len(audio):
        rebuilt = rebuilt[:len(audio)]
    return rebuilt


def preserve_sound_effects(original_audio: AudioSegment, 
                           synced_speech: AudioSegment, 
                           transcript: dict, 
                           project_dir: str, 
                           silence_original_speech: bool = True,
                           duck_gain_for_background: Optional[float] = DUCKING_GAIN_DB) -> AudioSegment:
    """Mix the translated speech track over the original audio.

    The original track is first brought to the same channel count as the
    speech track, then its speech regions (taken from
    ``transcript['segments']``) are either silenced or attenuated, and
    finally ``synced_speech`` is overlaid on the result.

    Args:
        original_audio: Source audio track.
        synced_speech: Translated speech track to overlay.
        transcript: Dict with a ``'segments'`` list whose items carry
            ``'start'`` / ``'end'`` in seconds.
        project_dir: Project folder; intermediate WAV files are written to
            its ``audio_debug`` subdirectory.
        silence_original_speech: When true, speech regions of the original
            are replaced by silence.
        duck_gain_for_background: Gain in dB applied to speech regions
            (widened by ``CROSSFADE_MS`` on both sides) when silencing is
            off. ``None`` leaves the original untouched.

    Returns:
        The mixed ``AudioSegment``.
    """
    debug_audio_dir = os.path.join(project_dir, 'audio_debug')
    os.makedirs(debug_audio_dir, exist_ok=True)

    logging.info(f"Mixing audio. Silence original speech: {silence_original_speech}. Background duck gain: {duck_gain_for_background}dB")

    target_channels = original_audio.channels  # Default to original's channels
    if original_audio.channels == 1 and synced_speech.channels == 2:
        target_channels = 2  # If original is mono but TTS is stereo, prefer stereo

    working_original_audio = original_audio
    if working_original_audio.channels != target_channels:
        logging.info(f"Standardizing 'working_original_audio' from {working_original_audio.channels} to {target_channels} channels.")
        working_original_audio = working_original_audio.set_channels(target_channels)

    if synced_speech.channels != target_channels:
        logging.info(f"Standardizing 'synced_speech' from {synced_speech.channels} to {target_channels} channels.")
        synced_speech = synced_speech.set_channels(target_channels)

    # Save initial states for debugging
    working_original_audio.export(os.path.join(debug_audio_dir, "0_working_original_audio_standardized.wav"), format="wav")
    synced_speech.export(os.path.join(debug_audio_dir, "0_synced_french_speech_standardized.wav"), format="wav")

    total_ms = len(working_original_audio)

    if silence_original_speech:
        logging.info("Reconstructing original audio with speech segments silenced.")
        # Merged, clamped intervals keep overlapping or nested transcript
        # segments from inserting more silence than the span they replace,
        # which would shift everything after them later in time.
        speech_intervals_ms = _speech_intervals_ms(transcript, total_ms)

        def _as_silence(speech_part):
            silence = AudioSegment.silent(duration=len(speech_part), frame_rate=working_original_audio.frame_rate)
            return silence.set_channels(working_original_audio.channels)

        processed_original_audio = _rebuild_with_speech_replaced(
            working_original_audio, speech_intervals_ms, _as_silence
        )

        logging.info(f"Reconstructed original audio with silenced speech. Duration: {len(processed_original_audio)/1000}s")
        processed_original_audio.export(os.path.join(debug_audio_dir, "1_processed_original_AFTER_silencing.wav"), format="wav")

    elif duck_gain_for_background is not None:
        logging.info(f"Applying ducking with gain {duck_gain_for_background}dB to original speech segments.")
        # AudioSegment objects are immutable, so no defensive copy is needed.
        # The attenuated slices are spliced in rather than overlaid:
        # overlay() sums the signals and would leave the full-volume
        # original underneath the attenuated copy.
        speech_intervals_ms = _speech_intervals_ms(transcript, total_ms, pad_ms=CROSSFADE_MS)
        processed_original_audio = _rebuild_with_speech_replaced(
            working_original_audio,
            speech_intervals_ms,
            lambda speech_part: speech_part.apply_gain(duck_gain_for_background),
        )
        logging.info(f"Applied ducking. Duration: {len(processed_original_audio)/1000}s")
        processed_original_audio.export(os.path.join(debug_audio_dir, "1_processed_original_AFTER_ducking.wav"), format="wav")
    else:
        # Neither silencing nor ducking, just use the standardized original
        processed_original_audio = working_original_audio
        logging.info("No silencing or ducking applied to original audio.")
        processed_original_audio.export(os.path.join(debug_audio_dir, "1_processed_original_NO_OP.wav"), format="wav")

    logging.info(f"Overlaying translated speech (duration: {len(synced_speech)/1000}s) onto processed original (duration: {len(processed_original_audio)/1000}s)...")

    # Ensure channel counts match perfectly before the final overlay
    if processed_original_audio.channels != synced_speech.channels:
        logging.error(f"FINAL ATTEMPT CHANNEL MISMATCH. Processed Original: {processed_original_audio.channels}, Synced Speech: {synced_speech.channels}. This should not happen if standardization worked.")
        # Force the speech track to match the processed original.
        synced_speech = synced_speech.set_channels(processed_original_audio.channels)

    final_audio_mix = processed_original_audio.overlay(synced_speech)
    final_audio_mix.export(os.path.join(debug_audio_dir, "2_final_mixed_audio.wav"), format="wav")

    logging.info(f"Audio mixing complete. Final duration: {len(final_audio_mix)/1000}s")
    return final_audio_mix

def create_synced_audio(original_audio: AudioSegment, transcript: dict, source_language: str, target_language: str, project_dir: str,
                        translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model) -> AudioSegment:
    """Build a speech-only track with every translated segment at its original start time.

    A silent track matching the original's length, frame rate and channel
    count is created, all transcript segments are processed through
    ``adaptive_segment_processing``, and each resulting WAV is faded in/out
    and overlaid at its start offset. A segment that fails to load or
    overlay is logged and skipped.
    """
    import gc

    logging.info("Creating synced translated audio with segment fades...")

    silent_base = AudioSegment.silent(duration=len(original_audio), frame_rate=original_audio.frame_rate)
    final_synced_speech_track = silent_base.set_channels(original_audio.channels)

    # Generates one processed .wav per translated segment.
    processed_segments_info = adaptive_segment_processing(
        transcript['segments'], original_audio,
        source_language, target_language, project_dir,
        translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model
    )

    for start_time_ms, audio_filepath in processed_segments_info:
        try:
            clip = AudioSegment.from_file(audio_filepath)

            if CROSSFADE_MS > 0:
                clip_len = len(clip)
                if clip_len > CROSSFADE_MS * 2:
                    fade_ms = CROSSFADE_MS
                elif clip_len > 0:
                    # Short clip: shrink the fade so in/out never overlap.
                    fade_ms = min(CROSSFADE_MS, clip_len // 2)
                else:
                    fade_ms = 0
                if fade_ms > 0:
                    clip = clip.fade_in(fade_ms).fade_out(fade_ms)

            final_synced_speech_track = final_synced_speech_track.overlay(clip, position=start_time_ms)

            del clip
            gc.collect()
        except Exception as e:
            logging.error(f"Error overlaying segment from {audio_filepath}: {str(e)}", exc_info=True)

    return final_synced_speech_track

def create_final_video(video_path: str, final_audio_path: str, output_path: str):
    """Write a copy of the video at ``video_path`` with its audio replaced.

    Args:
        video_path: Source video file.
        final_audio_path: Audio file to use as the new soundtrack.
        output_path: Destination video file (audio encoded as AAC).

    Any exception raised while opening the clips or writing the output
    propagates to the caller; the clips are closed either way.
    """
    import gc

    logging.info("Creating final video...")
    video = None
    audio = None
    try:
        video = VideoFileClip(video_path)
        audio = AudioFileClip(final_audio_path)
        final_clip = video.set_audio(audio)
        # Specify threads for moviepy for potentially faster writing, and logger for verbosity.
        final_clip.write_videofile(output_path, audio_codec='aac', threads=4, logger='bar')
    finally:
        # Close explicitly: dropping the references alone does not
        # deterministically release the clips' underlying readers.
        for clip in (audio, video):
            if clip is not None:
                clip.close()
        gc.collect()

# **Process Video Function**

In [None]:
# In your main script, replace the existing process_video function
def process_video(video_path: str, target_language: str):
    """Run the full translation pipeline for one video and return the output path.

    Steps, each wrapped in a ``performance_monitor`` timer: audio extraction,
    Whisper transcription, loading the translation model (a Helsinki-NLP
    pair model first, mBART-50 as fallback), loading the summarization
    pipeline, loading MeloTTS (once, kept in the module-level
    ``melo_tts_model``), full-text translation, per-segment synthesis and
    sync, mixing with the original audio, and final video creation.

    Relies on module-level names defined elsewhere in the notebook, among
    them ``performance_monitor``, ``gpu_optimizer``, ``device`` and
    ``melo_tts_model``.

    Args:
        video_path: Path of the source video.
        target_language: Target language code.

    Returns:
        Path of the translated video inside the project directory.

    Raises:
        Exception: Any error from a pipeline step is logged and re-raised.
    """
    import gc

    # melo_tts_model is global, will be loaded once.
    global melo_tts_model

    performance_monitor.log_gpu_status_direct() # Use direct to avoid issues if monitor's own gpu_optimizer not ready

    with performance_monitor.timer("total_processing"):
        project_dir = create_project_structure(video_path, target_language)

        translation_model = None
        translation_tokenizer = None
        summarizer_pipeline = None

        try:
            with performance_monitor.timer("audio_extraction"):
                audio_path = os.path.join(project_dir, 'audio', 'extracted_audio.wav')
                extract_audio(video_path, audio_path)

            with performance_monitor.timer("transcription"):
                transcript_result = transcribe_with_whisper(audio_path)
                source_language = transcript_result['language']
                # Ensure source_language is a short code like 'en', 'fr'
                if source_language and len(source_language) > 2 and '-' in source_language:
                    source_language = source_language.split('-')[0]
                logging.info(f"Detected source language: {source_language}")
                print(f"Source: {source_language}, Target: {target_language}")

            transcript_path = os.path.join(project_dir, 'transcripts', 'transcript.txt')
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(transcript_result['text'])

            # --- LOAD MODELS (ONCE PER TYPE) ---
            with performance_monitor.timer("translation_model_loading"):
                # Try the language-pair specific Helsinki-NLP model first.
                specific_model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
                try:
                    logging.info(f"Attempting to load translation model: {specific_model_name}")
                    translation_tokenizer = AutoTokenizer.from_pretrained(specific_model_name)
                    translation_model = AutoModelForSeq2SeqLM.from_pretrained(specific_model_name)
                    translation_model = gpu_optimizer.optimize_model(translation_model)
                    logging.info(f"Successfully loaded and optimized {specific_model_name}")
                except Exception as e:
                    logging.warning(f"Failed to load {specific_model_name} ({e}). Falling back to mBART.")
                    # Reset to None instead of `del`: the names must stay
                    # bound so the `finally` cleanup below cannot raise
                    # UnboundLocalError if the mBART load fails as well.
                    translation_model = None
                    translation_tokenizer = None
                    gpu_optimizer.clear_cache()
                    gc.collect()

                    mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
                    logging.info(f"Loading mBART model: {mbart_model_name}")
                    # Two-letter codes are expanded to the 'xx_XX' form.
                    # NOTE(review): some mBART-50 codes use a different
                    # suffix (e.g. 'de_DE'); confirm a mapping is applied
                    # in _translate_text_internal or add one here.
                    mbart_src_lang = f"{source_language}_XX" if len(source_language)==2 else source_language
                    translation_tokenizer = AutoTokenizer.from_pretrained(mbart_model_name, src_lang=mbart_src_lang)
                    translation_model = AutoModelForSeq2SeqLM.from_pretrained(mbart_model_name)
                    translation_model = gpu_optimizer.optimize_model(translation_model)
                    logging.info(f"Successfully loaded and optimized {mbart_model_name}")

            with performance_monitor.timer("summarization_model_loading"):
                # BART for summarization
                summarizer_model_name = "facebook/bart-large-cnn"
                logging.info(f"Loading summarization pipeline with model: {summarizer_model_name}")
                # device=-1 selects the CPU for the Transformers pipeline.
                summarizer_device_arg = device.index if device.type == 'cuda' else -1
                summarizer_pipeline = pipeline("summarization", model=summarizer_model_name, device=summarizer_device_arg, framework="pt")
                logging.info("Summarization pipeline loaded.")

            with performance_monitor.timer("melo_tts_model_loading"):
                if melo_tts_model is None: # Load only if not already loaded
                    try:
                        # Use the MeloTTS language code from LANGUAGE_MODEL_MAP, defaulting to "EN".
                        initial_melo_lang = LANGUAGE_MODEL_MAP.get(target_language, {}).get("melo_language", "EN")
                        logging.info(f"Loading MeloTTS model globally (initial lang: {initial_melo_lang}, device: {device})...")
                        melo_tts_model = TTS(language=initial_melo_lang, device=str(device))
                        logging.info("MeloTTS model loaded globally.")
                    except Exception as e:
                        logging.error(f"Failed to load MeloTTS model globally: {e}. TTS will rely on gTTS.", exc_info=True)
                        melo_tts_model = None # Ensure it's None if loading failed

            # Translate the entire transcript text (for saving and potential full-text use)
            with performance_monitor.timer("full_text_translation"):
                # This full translation is for the .txt output. Segments are re-translated for TTS generation.
                full_translated_text = _translate_text_internal(transcript_result['text'], source_language, target_language, translation_model, translation_tokenizer)

            translation_path = os.path.join(project_dir, 'translations', 'translation.txt')
            with open(translation_path, 'w', encoding='utf-8') as f:
                f.write(full_translated_text)

            with performance_monitor.timer("audio_synthesis_and_sync"):
                original_audio = AudioSegment.from_wav(audio_path)
                logging.info(f"Original audio duration: {len(original_audio)/1000:.2f}s")

                # Pass loaded models to create_synced_audio
                synced_translated_speech = create_synced_audio(
                    original_audio, transcript_result,
                    source_language, target_language, project_dir,
                    translation_model, translation_tokenizer, summarizer_pipeline, melo_tts_model
                )

            with performance_monitor.timer("audio_mixing_with_effects"):
                final_mixed_audio = preserve_sound_effects(
                    original_audio,
                    synced_translated_speech,
                    transcript_result,
                    project_dir,
                    silence_original_speech=True
                )
                del original_audio, synced_translated_speech
                gc.collect()

            final_audio_path = os.path.join(project_dir, 'audio', 'final_audio.wav')
            logging.info(f"Exporting final mixed audio to: {final_audio_path}")
            final_mixed_audio.export(final_audio_path, format="wav")
            del final_mixed_audio
            gc.collect()

            with performance_monitor.timer("final_video_creation"):
                output_video_path = os.path.join(project_dir, f"translated_{os.path.basename(video_path)}")
                create_final_video(video_path, final_audio_path, output_video_path)

            logging.info(f"Translation complete. Output video: {output_video_path}")
            logging.info(performance_monitor.get_summary())
            return output_video_path

        except Exception as e:
            logging.error(f"An error occurred during video processing: {str(e)}", exc_info=True)
            raise
        finally:
            # Cleanup models regardless of success or failure
            logging.info("Cleaning up models...")
            translation_model = None
            translation_tokenizer = None
            if summarizer_pipeline is not None:
                if hasattr(summarizer_pipeline, 'model'):
                    del summarizer_pipeline.model # Try to delete inner model
                summarizer_pipeline = None
            # The global melo_tts_model is deliberately kept so later runs
            # in the same notebook session can reuse it.
            gpu_optimizer.clear_cache()
            gc.collect()
            logging.info("Cleanup complete.")

# Cell-level confirmation: printed once the definitions above have executed.
print("✅ All functions loaded successfully!")

# **Run the Translation**

In [None]:
# Run the translation
target_language = "fr"  # Change this to your desired language (en, es, fr, jp, kr, zh)
video_filename = "/kaggle/input/source-vids/fern_eng_short.mp4" # Ensure this path is correct for your Kaggle environment

# MeloTTS is loaded inside process_video, so nothing is pre-loaded here.
# globals().get() avoids a NameError when the cell that defines
# melo_tts_model has not been run yet.
if globals().get("melo_tts_model") is None:
    logging.info("MeloTTS model is not loaded yet; process_video will load it on first use.")

# Run translation; any failure is reported with a traceback instead of
# aborting the notebook cell.
try:
    output_video = process_video(video_filename, target_language)
    print("\n🎉 Translation completed!")
    print(f"📁 Output video: {output_video}")

except Exception as e:
    print(f"❌ Error during translation: {e}")
    import traceback
    traceback.print_exc()