In [None]:
!pip install -U transformers

## Local Inference on GPU
Model page: https://huggingface.co/PRAli22/arat5-arabic-dialects-translation

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/PRAli22/arat5-arabic-dialects-translation)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("PRAli22/arat5-arabic-dialects-translation")
model = AutoModelForSeq2SeqLM.from_pretrained("PRAli22/arat5-arabic-dialects-translation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [2]:

# Arabic sentences list
arabic_sentences = [
    "؟ شخبارك",          # Gulf dialect
    "كيف حالك؟",          # MSA
    "أنا مشغول اليوم",     # I'm busy today
    "وين كنت امس؟",        # Where were you yesterday? (dialect)
    "الجو حار جدا اليوم"   # The weather is very hot today
    , "شلونك"
    , "ازيك"
     , "وايد"
     , "كفو"
     , "دق علي"
     , "وش تبي"
     , " الله يصلح حالك"
     , "سنع"
     , "منقود"
     , "اونه"
]

# Translate each sentence
translations = []
for sentence in arabic_sentences:
    # Format input as expected by T5 (e.g., "translate: وش اخبارك")
    input_text = f"translate: {sentence}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    generated = model.generate(**inputs)
    translation = tokenizer.decode(generated[0], skip_special_tokens=True)
    translations.append(translation)

# Show results
for ar, ar2 in zip(arabic_sentences, translations):
    print(f"Dialect: {ar}")
    print(f"MSA: {ar2}")
    print("-" * 60)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dialect: ؟ شخبارك
MSA: الترجمة. كيف حالك ؟
------------------------------------------------------------
Dialect: كيف حالك؟
MSA: الترجمة. كيف حالك ؟
------------------------------------------------------------
Dialect: أنا مشغول اليوم
MSA: مترجم : أنا مشغول اليوم
------------------------------------------------------------
Dialect: وين كنت امس؟
MSA: مترجم. أين كنت الليلة الماضية ؟
------------------------------------------------------------
Dialect: الجو حار جدا اليوم
MSA: مترجم : الجو حار جدا اليوم
------------------------------------------------------------
Dialect: شلونك
MSA: مترجم. كيف حالك ؟
------------------------------------------------------------
Dialect: ازيك
MSA: مترجم. مرحباً
------------------------------------------------------------
Dialect: وايد
MSA: الترجمة : كثيرة جدا
------------------------------------------------------------
Dialect: كفو
MSA: الترجمة : جيدة
------------------------------------------------------------
Dialect: دق علي
MSA: الترجمة : اتصل بي
---------

In [3]:
# Step 2: Load Arabic → English model (M2M100)
en_tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
en_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [4]:

final_translations = []

for original_ar, ar2 in zip(arabic_sentences, translations):
    # Set the source language for Arabic
    en_tokenizer.src_lang = "ar"

    # Tokenize MSA sentence (ar2) and translate to English
    en_inputs = en_tokenizer(ar2, return_tensors="pt")
    output_ids = en_model.generate(**en_inputs, forced_bos_token_id=en_tokenizer.get_lang_id("en"))
    english = en_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    final_translations.append((original_ar, ar2, english))

# Nicely print the 3 steps: Dialect → MSA → English
print("Dialect → MSA → English\n" + "="*40)
for dialect, msa, en in final_translations:
    print(f"Dialect: {dialect}")
    print(f"MSA: {msa}")
    print(f"English translation: {en}")
    print("-" * 60)


Dialect → MSA → English
Dialect: ؟ شخبارك
MSA: الترجمة. كيف حالك ؟
English translation: The translation, how are you?
------------------------------------------------------------
Dialect: كيف حالك؟
MSA: الترجمة. كيف حالك ؟
English translation: The translation, how are you?
------------------------------------------------------------
Dialect: أنا مشغول اليوم
MSA: مترجم : أنا مشغول اليوم
English translation: I am busy today.
------------------------------------------------------------
Dialect: وين كنت امس؟
MSA: مترجم. أين كنت الليلة الماضية ؟
English translation: Where did you go last night?
------------------------------------------------------------
Dialect: الجو حار جدا اليوم
MSA: مترجم : الجو حار جدا اليوم
English translation: The weather is very hot today.
------------------------------------------------------------
Dialect: شلونك
MSA: مترجم. كيف حالك ؟
English translation: Translator, how are you?
------------------------------------------------------------
Dialect: ازيك
MSA: مترجم

In [2]:
import nbformat

def clean_invalid_metadata(path):
    with open(path, 'r') as f:
        nb = nbformat.read(f, as_version=4)

    for cell in nb.cells:
        if 'outputs' in cell:
            for output in cell['outputs']:
                if output.get('output_type') == 'stream' and 'metadata' in output:
                    del output['metadata']

    with open(path, 'w') as f:
        nbformat.write(nb, f)

clean_invalid_metadata('Arabic_dialects_translation.ipynb')  # replace with your filename
