In [None]:
!pip install -U transformers

## Local Inference on GPU
Model page: https://huggingface.co/PRAli22/arat5-arabic-dialects-translation

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/PRAli22/arat5-arabic-dialects-translation)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("PRAli22/arat5-arabic-dialects-translation")
model = AutoModelForSeq2SeqLM.from_pretrained("PRAli22/arat5-arabic-dialects-translation")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:

# Arabic sentences list
arabic_sentences = [
    "؟ شخبارك",          # Gulf dialect
    "كيف حالك؟",          # MSA
    "أنا مشغول اليوم",     # I'm busy today
    "وين كنت امس؟",        # Where were you yesterday? (dialect)
    "الجو حار جدا اليوم"   # The weather is very hot today
    , "شلونك"
    , "ازيك"
     , "وايد"
     , "كفو"
     , "دق علي"
     , "وش تبي"
     , " الله يصلح حالك"
     , "سنع"
     , "منقود"
     , "اونه"
]

# Translate each sentence
translations = []
for sentence in arabic_sentences:
    # Format input as expected by T5 (e.g., "translate: وش اخبارك")
    input_text = f"translate: {sentence}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    generated = model.generate(**inputs)
    translation = tokenizer.decode(generated[0], skip_special_tokens=True)
    translations.append(translation)

# Show results
for ar, ar2 in zip(arabic_sentences, translations):
    print(f"Dialect: {ar}")
    print(f"MSA: {ar2}")
    print("-" * 60)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dialect: ؟ شخبارك
MSA: الترجمة. كيف حالك ؟
------------------------------------------------------------
Dialect: كيف حالك؟
MSA: الترجمة. كيف حالك ؟
------------------------------------------------------------
Dialect: أنا مشغول اليوم
MSA: مترجم : أنا مشغول اليوم
------------------------------------------------------------
Dialect: وين كنت امس؟
MSA: مترجم. أين كنت الليلة الماضية ؟
------------------------------------------------------------
Dialect: الجو حار جدا اليوم
MSA: مترجم : الجو حار جدا اليوم
------------------------------------------------------------
Dialect: شلونك
MSA: مترجم. كيف حالك ؟
------------------------------------------------------------
Dialect: ازيك
MSA: مترجم. مرحباً
------------------------------------------------------------
Dialect: وايد
MSA: الترجمة : كثيرة جدا
------------------------------------------------------------
Dialect: كفو
MSA: الترجمة : جيدة
------------------------------------------------------------
Dialect: دق علي
MSA: الترجمة : اتصل بي
---------

In [None]:
# Step 2: Load Arabic → English model (M2M100)
en_tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
en_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [None]:

final_translations = []

for original_ar, ar2 in zip(arabic_sentences, translations):
    # Set the source language for Arabic
    en_tokenizer.src_lang = "ar"

    # Tokenize MSA sentence (ar2) and translate to English
    en_inputs = en_tokenizer(ar2, return_tensors="pt")
    output_ids = en_model.generate(**en_inputs, forced_bos_token_id=en_tokenizer.get_lang_id("en"))
    english = en_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    final_translations.append((original_ar, ar2, english))

# Nicely print the 3 steps: Dialect → MSA → English
print("Dialect → MSA → English\n" + "="*40)
for dialect, msa, en in final_translations:
    print(f"Dialect: {dialect}")
    print(f"MSA: {msa}")
    print(f"English translation: {en}")
    print("-" * 60)


In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

# Download punkt for tokenization used by nltk BLEU
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# -----------------------------------------------
#  Step 1: Input Data (Dialect + English Reference)
# -----------------------------------------------
dialect_pairs = [
    ("شلونك اليوم؟ شخبارك؟ إن شاء الله أمورك طيبة وما فيه تعب.",
     "How are you today? Hope you're doing well and not too tired."),
    ("كيف حالك اليوم؟ أتمنى أن تكون بخير وفي أحسن حال.",
     "How are you today? I hope you're well and in great condition."),
    ("أنا مشغول اليوم من الصبح، كان عندي دوام وشغل كثير.",
     "I've been busy since this morning, had work and a lot to do."),
    ("وين كنت امس؟ حاولت أتواصل معك بس ما رديت علي.",
     "Where were you yesterday? I tried to reach you but you didn’t answer."),
    ("الجو حار جدا اليوم، ما أقدر أطلع من البيت بدون تكييف.",
     "It's extremely hot today, I can't leave the house without the AC."),
    ("شلونك يا خوي؟ من زمان ما سمعنا صوتك.",
     "How are you, brother? We haven’t heard from you in a while."),
    ("ازيك يا صاحبي؟ أخبارك إيه؟ وحشتني والله.",
     "How are you, my friend? What’s up? I really missed you."),
    ("وايد زحمة اليوم في الشارع، ما قدرت أوصل بسرعة.",
     "It was really crowded on the streets today, I couldn’t arrive quickly."),
    ("كفو والله! سويت اللي ما قدر عليه غيرك.",
     "Well done! You did what no one else could."),
    ("دق علي لما توصل عشان أجيك عند الباب.",
     "Call me when you arrive so I can come meet you at the door."),
    ("وش تبي بالضبط؟ تكلم واضح عشان أفهم عليك.",
     "What exactly do you want? Speak clearly so I can understand you."),
    ("الله يصلح حالك ويهدي بالك، ما يصير كذا تزعل الناس.",
     "May God guide you and calm your mind. You can’t go around upsetting people like that."),
    ("الرجال هذا سنع، يعرف كيف يرتب أموره.",
     "This guy is reliable, he knows how to handle his stuff."),
    ("منقود هالكلمة من زمان ما سمعتها، تذكرني بأيام زمان.",
     "It’s been a long time since I heard that word 'manqood'. It reminds me of the old days."),
    ("اونه يبي يساعد، بس كل مرة يسحب علينا بدون ما يقول.",
     "He *says* he wants to help, but every time he bails on us without a word.")
]

arabic_dialect_sentences = [pair[0] for pair in dialect_pairs]
reference_english = [pair[1] for pair in dialect_pairs]



In [3]:

# -----------------------------------------------
#  Step 2: Load Models
# -----------------------------------------------

# Dialect → MSA model
dialect_tokenizer = AutoTokenizer.from_pretrained("PRAli22/arat5-arabic-dialects-translation")
dialect_model = AutoModelForSeq2SeqLM.from_pretrained("PRAli22/arat5-arabic-dialects-translation")

# MSA → English model
en_tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
en_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")
en_tokenizer.src_lang = "ar"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [4]:

# -----------------------------------------------
# Step 3: Translate and Evaluate
# -----------------------------------------------

smoothie = SmoothingFunction().method4
total_bleu = 0
translations = []

print("🔁 Translating from Dialect → MSA → English with BLEU Evaluation\n")

for idx, (dialect, reference) in enumerate(zip(arabic_dialect_sentences, reference_english), 1):
    # Step 1: Dialect → MSA
    input_text = f"translate: {dialect}"
    inputs = dialect_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    msa_output = dialect_model.generate(**inputs)
    msa_sentence = dialect_tokenizer.decode(msa_output[0], skip_special_tokens=True)

    # Step 2: MSA → English
    en_inputs = en_tokenizer(msa_sentence, return_tensors="pt")
    en_output = en_model.generate(**en_inputs, forced_bos_token_id=en_tokenizer.get_lang_id("en"))
    en_sentence = en_tokenizer.decode(en_output[0], skip_special_tokens=True)
    translations.append(en_sentence)

    # Step 3: BLEU Evaluation
    bleu = sentence_bleu([reference.split()], en_sentence.split(), smoothing_function=smoothie)
    total_bleu += bleu

    # Output results
    print(f"🟧 Example #{idx}")
    print(f"🟡 Dialect:    {dialect}")
    print(f"🟢 MSA:        {msa_sentence}")
    print(f"🔵 Predicted:  {en_sentence}")
    print(f"🔴 Reference:  {reference}")
    print(f"🎯 BLEU:       {bleu:.4f}")
    print("-" * 100)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🔁 Translating from Dialect → MSA → English with BLEU Evaluation

🟧 Example #1
🟡 Dialect:    شلونك اليوم؟ شخبارك؟ إن شاء الله أمورك طيبة وما فيه تعب.
🟢 MSA:        مترجم. كيف حالك ؟ أتمنى لك يوماً سعيداً.
🔵 Predicted:  How do you feel? I wish you a good day.
🔴 Reference:  How are you today? Hope you're doing well and not too tired.
🎯 BLEU:       0.0228
----------------------------------------------------------------------------------------------------
🟧 Example #2
🟡 Dialect:    كيف حالك اليوم؟ أتمنى أن تكون بخير وفي أحسن حال.
🟢 MSA:        مترجم. كيف حالك اليوم ؟ أتمنى لك يوماً سعيداً.
🔵 Predicted:  How do you feel today? I wish you a good day.
🔴 Reference:  How are you today? I hope you're well and in great condition.
🎯 BLEU:       0.0564
----------------------------------------------------------------------------------------------------
🟧 Example #3
🟡 Dialect:    أنا مشغول اليوم من الصبح، كان عندي دوام وشغل كثير.
🟢 MSA:        الترجمة. أنا مشغول اليوم ، ولقد قضيت وقتاً طويلاً في العمل

In [5]:


# Average BLEU Score
avg_bleu = total_bleu / len(arabic_dialect_sentences)
print(f"\n🔥 Average BLEU Score: {avg_bleu:.4f}")



🔥 Average BLEU Score: 0.1161
