# Import Toolkits

In [1]:
!pip install datasets



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset , load_dataset

# Loading the dataset

In [3]:
ds = load_dataset("akbargherbal/10K_english_to_arabic_dataset_for_FT")
ds

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 10000
    })
})

In [4]:
ds['train'][0]

{'output': 'ونرحب بالتدابير التي استطاعت فعﻻ الوكالة اتخاذهـــا حتى اﻵن بموجب ما تملكه حاليا من سلطة، غيــر أنه من الواضح أن مزيدا من التدابير اﻹضافية سيكون ضروريا كذلك لتوفير نظام موثوق به وفعال.',
 'input': 'We welcome the measures that the Agency has already been able to adopt under its existing authority, but it is clear that further, complementary measures will also be needed to achieve a credible and effective system.',
 'instruction': 'Convert the following English text into Arabic.'}

In [5]:
df = ds['train'].to_pandas()
df.head()

Unnamed: 0,output,input,instruction
0,ونرحب بالتدابير التي استطاعت فعﻻ الوكالة اتخاذ...,We welcome the measures that the Agency has al...,Convert the following English text into Arabic.
1,وفي عام 2011، اشتعلت النيران في حوالي 100 حاوي...,"In 2011, a fire ignited about 100 containers h...",Please translate the given English sentence in...
2,وقدَّمت دولة طرف أخرى تفسيرا مختلفا، وهو أنَّ ...,Another State party offered a different explan...,Change the following English phrase to Arabic.
3,114 - واسترشد عمل الشعبة بخطتيه للمراجعة السنو...,114. The work of the Division was guided by it...,Turn the English sentence below into Arabic.
4,ومع ذلك، فإن محكمة الاستئناف رفضت طلب فتح دعوى...,The Court of Appeal nonetheless dismissed the ...,Provide an Arabic translation for the followin...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   output       10000 non-null  object
 1   input        10000 non-null  object
 2   instruction  10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


In [7]:
df = df.sample(n = 6000 , random_state = 42).reset_index(drop = True)
print(df.shape)
df.head()

(6000, 3)


Unnamed: 0,output,input,instruction
0,35 - ويتصل التطور الجديد في العلاقة بين برنامج...,35. A new development in the UNDP-World Bank r...,Convert the following English text into Arabic.
1,ينصب اهتمام خطة عمل الشراكة الجديدة على الديمق...,"The NEPAD plan of action focuses on democracy,...",Provide an Arabic translation for the followin...
2,مستشارة وخبيرة صحية في مرفق مشاريع الهياكل الأ...,Consultant-health expert in Western Balkans In...,Turn the English sentence below into Arabic.
3,وثمة نهج متزايد الشيوع متبع في التنفيذ المشجع ...,Another increasingly common approach to develo...,Change the following English phrase to Arabic.
4,الوظائف حسب المكان - مجموع الموارد,"Posts by location - total resources, continued",Rephrase the following English statement in Ar...


In [8]:
df['input'][1]

'The NEPAD plan of action focuses on democracy, governance and peace and security; economic and corporate governance; infrastructure and information technology; human resource development (notably, health and education); and agriculture and market access.'

In [9]:
df['input'] = df['instruction'] + " " + df['input']
df.head()

Unnamed: 0,output,input,instruction
0,35 - ويتصل التطور الجديد في العلاقة بين برنامج...,Convert the following English text into Arabic...,Convert the following English text into Arabic.
1,ينصب اهتمام خطة عمل الشراكة الجديدة على الديمق...,Provide an Arabic translation for the followin...,Provide an Arabic translation for the followin...
2,مستشارة وخبيرة صحية في مرفق مشاريع الهياكل الأ...,Turn the English sentence below into Arabic. C...,Turn the English sentence below into Arabic.
3,وثمة نهج متزايد الشيوع متبع في التنفيذ المشجع ...,Change the following English phrase to Arabic....,Change the following English phrase to Arabic.
4,الوظائف حسب المكان - مجموع الموارد,Rephrase the following English statement in Ar...,Rephrase the following English statement in Ar...


In [10]:
df = df[['output' , 'input']]
df.head()

Unnamed: 0,output,input
0,35 - ويتصل التطور الجديد في العلاقة بين برنامج...,Convert the following English text into Arabic...
1,ينصب اهتمام خطة عمل الشراكة الجديدة على الديمق...,Provide an Arabic translation for the followin...
2,مستشارة وخبيرة صحية في مرفق مشاريع الهياكل الأ...,Turn the English sentence below into Arabic. C...
3,وثمة نهج متزايد الشيوع متبع في التنفيذ المشجع ...,Change the following English phrase to Arabic....
4,الوظائف حسب المكان - مجموع الموارد,Rephrase the following English statement in Ar...


In [11]:
df['input'][0]

'Convert the following English text into Arabic. 35. A new development in the UNDP-World Bank relationship pertains to the World Bank low-income countries under stress (LICUS) initiative.'

# Splits

In [12]:
train_data , val_data = train_test_split(df , test_size = 0.1 , random_state = 42)

#convert to HuggingFace dataset format

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [13]:
train_dataset

Dataset({
    features: ['output', 'input', '__index_level_0__'],
    num_rows: 5400
})

# Load Pre-trained Model & Tokenizer

In [14]:
from transformers import MarianMTModel , MarianTokenizer

#load tokenizer and model
model_name = 'Helsinki-NLP/opus-mt-en-ar'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

2025-07-15 21:33:21.765695: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752615201.788332     187 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752615201.795183     187 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
def tokenize_func(examples):
    model_inputs = tokenizer(
        examples['input'],
        max_length = 128,
        truncation = True,
        padding = 'max_length'
    )
    labels = tokenizer(
        examples['output'],
        max_length = 128,
        truncation = True,
        padding = "max_length"
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [16]:
train_dataset = train_dataset.map(tokenize_func , batched = True)
val_dataset = val_dataset.map(tokenize_func , batched = True)

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [17]:
#set format for pytorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [18]:
train_dataset[0]

{'input_ids': tensor([ 1747, 17958,     3,  1002,  3617,  5176,   233,  7754,     2, 17597,
           566,  6089, 11615,     3, 26150,   104,    40,  1262,     9,  6312,
             2,     0, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801,
         62801, 62801, 62801, 62801, 62

In [25]:
from transformers import Seq2SeqTrainingArguments , Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/results",
    run_name="en2ar_debug",
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    save_strategy="epoch",
    predict_with_generate=True,
    report_to="none"
)


In [26]:
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

trainer.train()

Step,Training Loss
10,0.7656
20,0.7361
30,0.7362
40,0.6167
50,0.6738
60,0.5891
70,0.5887
80,0.5853
90,0.6029
100,0.6293


TrainOutput(global_step=2700, training_loss=0.45276827856346413, metrics={'train_runtime': 308.5624, 'train_samples_per_second': 70.002, 'train_steps_per_second': 8.75, 'total_flos': 732204682444800.0, 'train_loss': 0.45276827856346413, 'epoch': 4.0})

# Save Model

In [29]:
model.save_pretrained("/kaggle/working/my_finetuned_model")
tokenizer.save_pretrained("/kaggle/working/my_finetuned_model")

('/kaggle/working/my_finetuned_model/tokenizer_config.json',
 '/kaggle/working/my_finetuned_model/special_tokens_map.json',
 '/kaggle/working/my_finetuned_model/vocab.json',
 '/kaggle/working/my_finetuned_model/source.spm',
 '/kaggle/working/my_finetuned_model/target.spm',
 '/kaggle/working/my_finetuned_model/added_tokens.json')

In [30]:
loaded_model = MarianMTModel.from_pretrained("/kaggle/working/my_finetuned_model")
loaded_tokenizer = MarianTokenizer.from_pretrained("/kaggle/working/my_finetuned_model")



# Translation System

In [37]:
def translate_text(text):
    # Tokenize input
    inputs = loaded_tokenizer(text, return_tensors="pt", max_length=128, truncation=True)

    output = loaded_model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    # Decode the translation
    translation = loaded_tokenizer.decode(output[0], skip_special_tokens=True)
    return translation

In [40]:
text = "When you meet someone for the first time, be careful about your impression"
translated_text = translate_text(text)
print(f"Translated Text : {translated_text}")

Translated Text : عندما تقابل شخصاً للمرة الأولى، كون حذراً عن إنطاقك
