In [1]:
import os
import gc
import json
import warnings
import asyncio
import nest_asyncio
import subprocess
import torch
import customtkinter as ctk
from tkinter import filedialog

# Library ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö AI
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import edge_tts
from moviepy.editor import AudioFileClip, concatenate_audioclips

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------------------------
# SETUP & CONFIG
# ---------------------------------------------------------
warnings.filterwarnings("ignore")
try:
    nest_asyncio.apply()
except:
    pass

# ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö Device
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"‚öôÔ∏è Running on: {DEVICE}")

‚öôÔ∏è Running on: cuda:0


In [3]:
# ---------------------------------------------------------
# HELPER FUNCTIONS (‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏°‡∏∑‡∏≠‡∏ä‡πà‡∏ß‡∏¢)
# ---------------------------------------------------------
def cleanup_gpu():
    """‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏•‡πâ‡∏≤‡∏á‡∏Ç‡∏¢‡∏∞‡πÉ‡∏ô GPU ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô VRAM ‡πÄ‡∏ï‡πá‡∏°"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()

def get_duration_ffmpeg(file_path):
    """‡∏î‡∏∂‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡∏≤‡∏ß‡πÑ‡∏ü‡∏•‡πå‡πÅ‡∏ö‡∏ö‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢"""
    try:
        if not os.path.exists(file_path):
            return None
        
        cmd = [
            "ffprobe", "-v", "error", "-show_entries", "format=duration", 
            "-of", "json", file_path
        ]
        # creationflags=0x08000000 ‡πÉ‡∏ä‡πâ‡∏ã‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ï‡πà‡∏≤‡∏á console ‡∏ö‡∏ô Windows
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        data = json.loads(result.stdout)
        return float(data['format']['duration'])
    except Exception as e:
        print(f"‚ö†Ô∏è Warning reading duration: {e}")
        return None

In [4]:
# ---------------------------------------------------------
# 1. SPEECH TO TEXT (Whisper)
# ---------------------------------------------------------
def speech_to_text_en(audio_path):
    print("------------------------------------------------")
    print("1Ô∏è‚É£ Step 1: Speech to Text (Whisper Large-V3)")
    
    temp_wav = "temp_extracted_audio.wav"
    
    try:
        # 1.1 Extract Audio using FFmpeg (‡πÄ‡∏ö‡∏≤‡∏Å‡∏ß‡πà‡∏≤ MoviePy)
        if audio_path.lower().endswith(('.mp4', '.mov', '.avi', '.mkv', '.webm')):
            print(f"   üé¨ Extracting audio from video...")
            subprocess.run([
                "ffmpeg", "-y", "-i", audio_path, 
                "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", 
                temp_wav
            ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
            target_file = temp_wav
        else:
            target_file = audio_path

        # 1.2 Load Model (Load ‡∏ï‡∏£‡∏á‡∏ô‡∏µ‡πâ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏õ‡∏£‡∏∞‡∏´‡∏¢‡∏±‡∏î RAM ‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ)
        print(f"   üéôÔ∏è Loading Whisper Model...")
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3",
            torch_dtype=dtype,
            device=DEVICE,
        )

        # 1.3 Transcribe
        print("   üìù Transcribing...")
        result = pipe(
            target_file,
            chunk_length_s=30,      # Auto Chunking
            batch_size=8,           # Parallel Processing
            return_timestamps=True,
            generate_kwargs={"language": "english"}
        )
        
        text = result["text"].strip()
        
        # 1.4 Cleanup Model Immediately (‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç!)
        del pipe
        cleanup_gpu() 
        
        return text

    except Exception as e:
        print(f"‚ùå Error STT: {e}")
        return ""
    finally:
        if os.path.exists(temp_wav):
            try: os.remove(temp_wav)
            except: pass

In [5]:
# ---------------------------------------------------------
# 2. TRANSLATION (Typhoon)
# ---------------------------------------------------------
def text_translation(long_text):
    print("------------------------------------------------")
    print("2Ô∏è‚É£ Step 2: Translation (Typhoon 4B)")
    
    if not long_text: return ""

    model_id = "scb10x/typhoon-translate-4b"
    
    try:
        # 2.1 Load Model (Load ‡πÉ‡∏´‡∏°‡πà‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ï‡∏≠‡∏ô‡∏à‡∏∞‡πÉ‡∏ä‡πâ)
        print("   üîÑ Loading Translation Model...")
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id, 
            torch_dtype=dtype, 
            device_map="auto", # ‡πÉ‡∏´‡πâ Library ‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÄ‡∏≠‡∏á
        )

        # ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏¢‡πà‡∏≠‡∏¢‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏õ‡∏•
        def _translate_chunk(chunk):
            if not chunk.strip(): return ""
            messages = [
                {"role": "system", "content": "Translate the following text into Thai."},
                {"role": "user", "content": chunk},
            ]
            input_ids = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids, max_new_tokens=512, do_sample=False, pad_token_id=tokenizer.eos_token_id
                )
            return tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)

        # 2.2 Split & Translate (Auto Chunking Logic)
        paragraphs = long_text.split('\n')
        full_translation = []
        current_chunk = ""
        MAX_TOKENS = 1200 # ‡∏•‡∏î‡∏•‡∏á‡∏ô‡∏¥‡∏î‡∏´‡∏ô‡πà‡∏≠‡∏¢‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏õ‡∏•‡∏≠‡∏î‡∏†‡∏±‡∏¢

        print(f"   üìÑ Translating {len(paragraphs)} paragraphs...")
        
        for i, para in enumerate(paragraphs):
            if not para.strip():
                if current_chunk:
                    full_translation.append(_translate_chunk(current_chunk))
                    current_chunk = ""
                full_translation.append("")
                continue

            # Check length rough estimate (1 char approx 0.3-0.5 tokens, but len is safer)
            if len(current_chunk) + len(para) > 3000: # ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì 1000 tokens
                 full_translation.append(_translate_chunk(current_chunk))
                 current_chunk = para
            else:
                 current_chunk += "\n" + para if current_chunk else para

        if current_chunk:
            full_translation.append(_translate_chunk(current_chunk))

        # 2.3 Cleanup Model (‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏°‡∏≤‡∏Å!)
        del model
        del tokenizer
        cleanup_gpu()

        return "\n".join(full_translation)

    except Exception as e:
        print(f"‚ùå Error Translation: {e}")
        cleanup_gpu()
        return ""

In [6]:
# ---------------------------------------------------------
# 3. TEXT TO SPEECH (Edge TTS)
# ---------------------------------------------------------
def text_to_speech_TH(text, output_path):
    print("------------------------------------------------")
    print("3Ô∏è‚É£ Step 3: Text to Speech (Edge TTS)")
    
    if not text: return

    VOICE = "th-TH-PremwadeeNeural"
    
    async def _gen_chunks(chunks):
        files = []
        for i, chunk in enumerate(chunks):
            if not chunk.strip(): continue
            fname = f"temp_tts_{i}.mp3"
            try:
                # rate="-10%" ‡∏û‡∏π‡∏î‡∏ä‡πâ‡∏≤‡∏•‡∏á‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢
                comm = edge_tts.Communicate(chunk, VOICE, rate="-10%")
                await comm.save(fname)
                files.append(fname)
                print(f"     Generating audio chunk {i+1}...")
            except Exception as e:
                print(f"     ‚ö†Ô∏è Failed chunk {i}: {e}")
        return files

    temp_files = []
    try:
        # ‡∏´‡∏±‡πà‡∏ô‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Å‡πà‡∏≠‡∏ô‡∏™‡πà‡∏á‡πÑ‡∏õ TTS
        chunks = []
        curr = ""
        for line in text.split('\n'):
            if len(curr) + len(line) < 1000:
                curr += line + "\n"
            else:
                chunks.append(curr)
                curr = line + "\n"
        if curr: chunks.append(curr)

        # Run Async
        loop = asyncio.get_event_loop()
        if loop.is_running():
            temp_files = loop.run_until_complete(_gen_chunks(chunks))
        else:
            temp_files = asyncio.run(_gen_chunks(chunks))

        # Merge Audio
        if temp_files:
            print("   üîó Merging audio clips...")
            clips = [AudioFileClip(f) for f in temp_files]
            final = concatenate_audioclips(clips)
            final.write_audiofile(output_path, fps=24000, verbose=False, logger=None)
            final.close()
            for c in clips: c.close()
            print(f"   ‚úÖ TTS Saved: {output_path}")
        else:
            print("   ‚ö†Ô∏è No audio generated.")

    except Exception as e:
        print(f"‚ùå Error TTS: {e}")
    finally:
        # Cleanup temp files
        for f in temp_files:
            if os.path.exists(f): 
                try: os.remove(f)
                except: pass

In [7]:
# ---------------------------------------------------------
# 4. VIDEO DUBBING (FFmpeg)
# ---------------------------------------------------------
def video_sound_editor(video_path, audio_path, output_path):
    print("------------------------------------------------")
    print("4Ô∏è‚É£ Step 4: Dubbing & Sync (FFmpeg)")
    
    # 4.1 ‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏Å‡πà‡∏≠‡∏ô (‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô Error ‡∏ó‡∏µ‡πà‡∏Ñ‡∏∏‡∏ì‡πÄ‡∏Ñ‡∏¢‡πÄ‡∏à‡∏≠)
    if not os.path.exists(audio_path):
        print("‚ùå Error: Audio file not found. Skipping dubbing.")
        return

    vid_dur = get_duration_ffmpeg(video_path)
    aud_dur = get_duration_ffmpeg(audio_path)

    if not vid_dur or not aud_dur:
        print("‚ùå Error: Cannot read duration.")
        return

    print(f"   ‚è±Ô∏è Video: {vid_dur:.2f}s | Audio: {aud_dur:.2f}s")

    # 4.2 Calculate Speed
    speed_factor = aud_dur / vid_dur
    if speed_factor < 0.5: speed_factor = 0.5
    if speed_factor > 2.0: speed_factor = 2.0 # Limit extreme speed
    
    print(f"   üîß Adjusting speed: {speed_factor:.2f}x")

    # 4.3 Execute FFmpeg
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        "-filter_complex", f"[1:a]atempo={speed_factor}[aout]",
        "-map", "0:v",
        "-map", "[aout]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-shortest",
        output_path
    ]
    
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(f"   ‚úÖ DONE! Saved to: {output_path}")
    except subprocess.CalledProcessError:
        print("‚ùå Error: FFmpeg failed.")

In [8]:
# ---------------------------------------------------------
# GUI & PIPELINE
# ---------------------------------------------------------
ctk.set_appearance_mode("System")
ctk.set_default_color_theme("blue")

def show_modern_msg(title, message):
    msg_window = ctk.CTkToplevel()
    msg_window.title(title)
    # ‡∏à‡∏±‡∏î‡∏´‡∏ô‡πâ‡∏≤‡∏ï‡πà‡∏≤‡∏á‡πÉ‡∏´‡πâ‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏•‡∏≤‡∏á‡∏à‡∏≠
    w, h = 400, 180
    x = (msg_window.winfo_screenwidth() // 2) - (w // 2)
    y = (msg_window.winfo_screenheight() // 2) - (h // 2)
    msg_window.geometry(f"{w}x{h}+{x}+{y}")
    msg_window.attributes('-topmost', True)

    label = ctk.CTkLabel(msg_window, text=message, font=("Leelawadee UI", 14), wraplength=350)
    label.pack(expand=True, padx=20, pady=20)
    ctk.CTkButton(msg_window, text="OK", command=msg_window.destroy, width=100).pack(pady=(0, 20))
    msg_window.grab_set()
    msg_window.wait_window()

def processing_pipline():
    root = ctk.CTk()
    root.withdraw()

    # 1. ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÑ‡∏ü‡∏•‡πå
    show_modern_msg("Start", "‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÑ‡∏ü‡∏•‡πå‡∏ß‡∏¥‡∏î‡∏µ‡πÇ‡∏≠‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö")
    video_path = filedialog.askopenfilename(filetypes=[("Video", "*.mp4 *.avi *.mkv *.mov *.webm")])
    if not video_path: return

    base_dir = os.path.dirname(video_path)
    audio_temp_path = os.path.join(base_dir, "temp_dubbing_audio.wav")
    
    try:
        # Step 1: STT
        eng_text = speech_to_text_en(video_path)
        if not eng_text:
            raise Exception("‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏û‡∏π‡∏î‡πÉ‡∏ô‡∏ß‡∏¥‡∏î‡∏µ‡πÇ‡∏≠ ‡∏´‡∏£‡∏∑‡∏≠ Whisper ‡∏ñ‡∏≠‡∏î‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ")

        # Step 2: Translate
        th_text = text_translation(eng_text)
        if not th_text:
            raise Exception("‡∏Å‡∏≤‡∏£‡πÅ‡∏õ‡∏•‡∏†‡∏≤‡∏©‡∏≤‡∏•‡πâ‡∏°‡πÄ‡∏´‡∏•‡∏ß")

        # Step 3: TTS
        text_to_speech_TH(th_text, audio_temp_path)
        if not os.path.exists(audio_temp_path):
             raise Exception("‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏û‡∏≤‡∏Å‡∏¢‡πå‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à")

        # Step 4: Dubbing
        show_modern_msg("Save", "‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ó‡∏µ‡πà‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå")
        output_path = filedialog.asksaveasfilename(defaultextension=".mp4", filetypes=[("MP4", "*.mp4")])
        
        if output_path:
            video_sound_editor(video_path, audio_temp_path, output_path)
            show_modern_msg("Success", "‡πÄ‡∏™‡∏£‡πá‡∏à‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢! üéâ")

    except Exception as e:
        print(f"‚ùå PIPELINE ERROR: {e}")
        show_modern_msg("Error", f"‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î:\n{str(e)}")

    # Cleanup Final
    if os.path.exists(audio_temp_path):
        try: os.remove(audio_temp_path)
        except: pass
        
    root.destroy()

In [10]:
if __name__ == "__main__":
    processing_pipline()

------------------------------------------------
1Ô∏è‚É£ Step 1: Speech to Text (Whisper Large-V3)
   üé¨ Extracting audio from video...
   üéôÔ∏è Loading Whisper Model...


Device set to use cuda:0


   üìù Transcribing...
------------------------------------------------
2Ô∏è‚É£ Step 2: Translation (Typhoon 4B)
   üîÑ Loading Translation Model...


`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:09<00:00,  4.66s/it]
The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


   üìÑ Translating 1 paragraphs...
------------------------------------------------
3Ô∏è‚É£ Step 3: Text to Speech (Edge TTS)
   ‚ö†Ô∏è No audio generated.
‚ùå PIPELINE ERROR: ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏™‡∏µ‡∏¢‡∏á‡∏û‡∏≤‡∏Å‡∏¢‡πå‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à
