# 1. Install Dependencies
- PyTorch [package](https://pytorch.org/)

In [1]:
!pip3 install torch

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch

# Report which PyTorch build the kernel picked up
torch_version = torch.__version__
print("Torch version: ", torch_version)

Torch version:  2.6.0+cpu


In [3]:
!pip install gradio transformers sentencepiece sacremoses pykakasi pypinyin korean-romanizer pythainlp --upgrade

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import gradio as gr
import sentencepiece
import sacremoses
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pykakasi  # Japanese
from korean_romanizer.romanizer import Romanizer  # For Korean romanization
from pypinyin import pinyin, Style  # Chinese
from pythainlp.transliterate import romanize as thai_romanize  # For Thai romanization

# Report the installed version of each library that exposes __version__
for label, module in (
    ("Gradio", gr),
    ("Sentencepiece", sentencepiece),
    ("Sacremoses", sacremoses),
):
    print(f"{label} version: ", module.__version__)

Gradio version:  5.29.0
Sentencepiece version:  0.2.0
Sacremoses version:  0.1.1


# 2. Inisiasi Model Penerjemah

In [5]:
# Load the NLLB-200 model.
# The checkpoint id is named once so the tokenizer and the model are always
# loaded from the same checkpoint.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Maps the language names shown in the UI to NLLB language codes
target_lang_map = {
    "English": "eng_Latn",
    "Japanese": "jpn_Jpan",
    "Korean": "kor_Hang",
    "Chinese": "zho_Hans",
    "Thai": "tha_Thai"
}

# Romanization
def romanize(text, language):
    """Return a Latin-script rendering of ``text`` for the given language.

    Args:
        text: The text to romanize.
        language: Language name. "Japanese", "Korean", "Chinese" and "Thai"
            are romanized; any other value is treated as Latin-based.

    Returns:
        The romanized string. For Korean, the placeholder
        "[Romanization failed]" is returned when the romanizer raises an
        ordinary exception. For any other language name, ``text`` is
        returned unchanged.
    """
    if language == "Japanese":
        kakasi = pykakasi.kakasi()
        # Each converted item carries its Hepburn reading under 'hepburn'.
        return ' '.join(item['hepburn'] for item in kakasi.convert(text))
    elif language == "Korean":
        try:
            return Romanizer(text).romanize()
        except Exception:
            # Best-effort fallback. Only ordinary errors are caught, so
            # KeyboardInterrupt / SystemExit still propagate.
            return "[Romanization failed]"
    elif language == "Chinese":
        # Keep the first entry of each item returned by pinyin().
        return ' '.join(item[0] for item in pinyin(text, style=Style.NORMAL))
    elif language == "Thai":
        return thai_romanize(text)
    else:
        return text  # Latin-based, no romanization needed

# Translation + romanization
def translate_indonesian_to_target(text, target_lang):
    """Translate Indonesian text to ``target_lang`` and romanize the result.

    Args:
        text: Indonesian source text.
        target_lang: A key of ``target_lang_map`` (e.g. "Japanese").

    Returns:
        A ``(translated, romanized)`` tuple of strings. Empty or
        whitespace-only input yields ``("", "")`` without running the model.

    Raises:
        KeyError: If ``text`` is non-blank and ``target_lang`` is not a key
            of ``target_lang_map``.
    """
    # Nothing to translate: skip the model call rather than returning
    # whatever it would generate for an empty prompt.
    if not text or not text.strip():
        return "", ""

    # Set source language to Indonesian
    tokenizer.src_lang = "ind_Latn"
    encoded = tokenizer(text, return_tensors="pt")

    # Generate the translation, forcing the first generated token to be the
    # target language code.
    target_token_id = tokenizer.convert_tokens_to_ids(target_lang_map[target_lang])
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=target_token_id
    )
    translated = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # romanize() returns the text unchanged for Latin-based targets
    romanized = romanize(translated, target_lang)

    return translated, romanized


# 3. Gradio Interface

In [6]:
# Derive the dropdown choices from target_lang_map so the UI and the
# translator's language-code mapping cannot drift apart.
languages = list(target_lang_map)

gr.Interface(
    fn=translate_indonesian_to_target,
    inputs=[
        gr.Textbox(label="Input (Bahasa Indonesia)"),
        # Explicit default so the callback never receives an empty selection
        gr.Dropdown(choices=languages, value=languages[0], label="Target Language")
    ],
    outputs=[
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Romanized Pronunciation")
    ],
    title="Multilingual Translator with NLLB-200",
    description="Terjemahan dari Bahasa Indonesia ke berbagai bahasa dunia dengan romanisasi untuk karakter non-Latin."
).launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


