In [1]:
!pip install transformers datasets sentencepiece gradio evaluate

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17

In [None]:
import re
import unicodedata
from datasets import Dataset
from transformers import MarianMTModel, MarianTokenizer, TrainingArguments, Trainer
import evaluate
import torch
from tqdm import tqdm
import gradio as gr


In [None]:
!wget https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt


--2025-05-12 21:46:22--  https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 763396 (746K) [text/plain]
Saving to: ‘ara_.txt’


2025-05-12 21:46:23 (119 MB/s) - ‘ara_.txt’ saved [763396/763396]



In [None]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿ء-ي]+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w


In [None]:
def load_dataset(path, num_examples=None):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    pairs = []
    for line in lines[:num_examples]:
        parts = line.split('\t')
        if len(parts) >= 2:
            eng = preprocess_sentence(parts[0].strip())
            ara = preprocess_sentence(parts[1].strip())
            pairs.append({"ar": ara, "en": eng})
    return pairs

data = load_dataset('ara_.txt')
print(f"Total sentence pairs: {len(data)}")

dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)


Total sentence pairs: 10742


In [None]:
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def preprocess_function(batch):
    inputs = tokenizer(batch["ar"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(batch["en"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/9667 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir="./marian-ar-en-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mriekoaaziz191[0m ([33mriekoaaziz191-helwan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,9.4524
20,0.5361
30,0.3756
40,0.3298
50,0.2773
60,0.2981
70,0.2198
80,0.1953
90,0.2422
100,0.2084




TrainOutput(global_step=3627, training_loss=0.10739145041136562, metrics={'train_runtime': 474.4619, 'train_samples_per_second': 61.124, 'train_steps_per_second': 7.644, 'total_flos': 983086481276928.0, 'train_loss': 0.10739145041136562, 'epoch': 3.0})

In [None]:
model_ar_en = MarianMTModel.from_pretrained("./marian-ar-en-finetuned")
tokenizer_ar_en = MarianTokenizer.from_pretrained("./marian-ar-en-finetuned")

model_en_ar = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
tokenizer_en_ar = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")




config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

In [None]:
bleu = evaluate.load("bleu")
test_data = dataset["test"].select(range(1000))
predictions = []
references = []

for example in tqdm(test_data):
    input_text = example["ar"]
    reference = example["en"]
    inputs = tokenizer_ar_en(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        output = model_ar_en.generate(**inputs)
    prediction = tokenizer_ar_en.decode(output[0], skip_special_tokens=True)
    predictions.append(prediction)
    references.append([reference])

score = bleu.compute(predictions=predictions, references=references)
print(f"BLEU score on test set: {score['bleu']:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

100%|██████████| 1000/1000 [19:37<00:00,  1.18s/it]

BLEU score on test set: 0.7223





In [None]:
def translate(text, direction):
    text = preprocess_sentence(text)
    if direction == "Arabic -> English":
        inputs = tokenizer_ar_en(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            translated = model_ar_en.generate(**inputs)
        return tokenizer_ar_en.decode(translated[0], skip_special_tokens=True)

    elif direction == "English -> Arabic":
        inputs = tokenizer_en_ar(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            translated = model_en_ar.generate(**inputs)
        return tokenizer_en_ar.decode(translated[0], skip_special_tokens=True)


In [None]:
translate("مرحبا", "Arabic -> English")

'<start> hi . <end>'

In [None]:
translate("I want to go back to my home", "English -> Arabic")

'<start> أريد أن أعود إلى بيتي <end>'

In [None]:
translate("أين تقع المكتبة", "Arabic -> English")

'<start> where is the library ? <end>'

In [None]:
translate("The weather is nice today", "English -> Arabic")

'<start>الطقس لطيف اليوم <nend>'

In [None]:
from google.colab import drive
drive.mount('/content/drive')


model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Nlp/marian-ar-en-finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Nlp/marian-ar-en-finetuned")


model_en_ar.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar")
tokenizer_en_ar.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar")

Mounted at /content/drive




('/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/source.spm',
 '/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/target.spm',
 '/content/drive/MyDrive/Colab Notebooks/Nlp/opus-mt-en-ar/added_tokens.json')

In [None]:
import gradio as gr

gr.Interface(
    fn=translate,
    inputs=[
        gr.Textbox(lines=2, placeholder="اكتب جملة أو Write a sentence..."),
        gr.Radio(choices=["Arabic -> English", "English -> Arabic"], label="Translation Direction")
    ],
    outputs="text",
    title="Arabic-English Translator",
    description="Translation Arabic ⇆ English"
).launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6303ec316999b5fdc7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


