# Importing Libraries

In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install torch datasets

In [1]:
import warnings
import numpy as np
import pandas as pd

import torch
import transformers

from datasets import Dataset
from datasets import load_metric

from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Importing The Model

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# Importing The Dataset

In [3]:
!gdown --id 1muNB7l4GcNvZPW34VFlnIliMR4Vk74SB

Downloading...
From: https://drive.google.com/uc?id=1muNB7l4GcNvZPW34VFlnIliMR4Vk74SB
To: /content/ArzEn-MultiGenre_cleaned_data.xlsx
100% 1.37M/1.37M [00:00<00:00, 36.6MB/s]


In [4]:
# Load dataset into pandas DataFrame
data_path = "/content/ArzEn-MultiGenre_cleaned_data.xlsx"
data = pd.read_excel(data_path)

data.head()


Unnamed: 0.1,Unnamed: 0,egyption_Text,english_Text,category,sub_category
0,0,الأمير الصغير,The little prince,Novels,the-little-prince
1,1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper,Novels,the-little-prince
2,2,الفصل الأول,Chapter 1,Novels,the-little-prince
3,3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,Once when I was six years old I saw a magnific...,Novels,the-little-prince
4,4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,It was a picture of a boa constrictor in the a...,Novels,the-little-prince


In [5]:
data.isnull().sum()

Unnamed: 0       0
egyption_Text    0
english_Text     0
category         0
sub_category     0
dtype: int64

In [5]:
data = data[['egyption_Text' , 'english_Text']]
data.head()

Unnamed: 0,egyption_Text,english_Text
0,الأمير الصغير,The little prince
1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper
2,الفصل الأول,Chapter 1
3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,Once when I was six years old I saw a magnific...
4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,It was a picture of a boa constrictor in the a...


# Data Splitting

In [6]:
train, validation_test = train_test_split(data, test_size=0.2, random_state=42)
validation, test = train_test_split(validation_test, test_size=0.5, random_state=42)

In [7]:
train_data = Dataset.from_pandas(train)
validation_data = Dataset.from_pandas(validation)
test_data = Dataset.from_pandas(test)

train_data , validation_data , test_data

(Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 13367
 }),
 Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 1671
 }),
 Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 1671
 }))

# Data Tokenziation

In [8]:
max_length = 128


def preprocess_function(examples):
    inputs = [ex for ex in examples["egyption_Text"]]
    targets = [ex for ex in examples["english_Text"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [9]:
tokenized_train = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=train_data.column_names,
)

tokenized_validation = validation_data.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_data.column_names,
)
tokenized_test = test_data.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_data.column_names,
)

Map:   0%|          | 0/13367 [00:00<?, ? examples/s]

Map:   0%|          | 0/1671 [00:00<?, ? examples/s]

Map:   0%|          | 0/1671 [00:00<?, ? examples/s]

# Setting up The GPU

In [10]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! Training on GPU...")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Training on CPU...")
    device = torch.device("cpu")

CUDA is available! Training on GPU...


# Setting up the Evaluation Metric  

In [11]:
from datasets import load_dataset, load_metric

# Load BLEU metric
metric = load_metric("bleu")

# Define function to compute BLEU score
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=decoded_labels)


  metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

# Importing the Model

In [12]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
# 12 epoch
training_args = Seq2SeqTrainingArguments(
    output_dir="./output_dir",  # specify the output directory where model checkpoints and logs will be saved
    per_device_train_batch_size=16,
    num_train_epochs=12,
    logging_dir="./logs",
    learning_rate=2e-5,
        fp16=True,

)

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [14]:
trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,2.471
1000,2.2428
1500,2.0195
2000,1.9047
2500,1.8252
3000,1.6643
3500,1.6216
4000,1.5428
4500,1.4795
5000,1.4399


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=10032, training_loss=1.519437546935378, metrics={'train_runtime': 1060.5565, 'train_samples_per_second': 151.245, 'train_steps_per_second': 9.459, 'total_flos': 1668865763966976.0, 'train_loss': 1.519437546935378, 'epoch': 12.0})

#Saving The Model


In [15]:
trainer.save_model('Helsinki-SMSM')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


#Evaluation

In [17]:
# trainer.evaluate(test_dataset=tokenized_test , max_length=max_length)

#Testing

In [18]:
from transformers import pipeline
model_path = "/content/Helsinki-SMSM"
translator = pipeline("translation", model=model_path, tokenizer="Helsinki-NLP/opus-mt-ar-en")

arabic_sentences = [
    "السلام عليكم , ازيك؟ عامل ايه؟",
    "الحتة دي مطلوبة بالامتحان"
]

translated_sentences = translator(arabic_sentences, max_length=50, return_text=True)

for original, translation in zip(arabic_sentences, translated_sentences):
    print("Arabic:", original)
    print("English:", translation)
    print()



Arabic: السلام عليكم , ازيك؟ عامل ايه؟
English: {'translation_text': 'Hello, how are you?'}

Arabic: الحتة دي مطلوبة بالامتحان
English: {'translation_text': 'This section is wanted by the exam.'}

