# Importing Libraries

In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install torch datasets
! pip install sacrebleu

In [None]:
! pip install gdown

In [1]:
import warnings
import numpy as np
import pandas as pd

import torch
import transformers

from datasets import Dataset
from datasets import load_metric

from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

2024-03-27 20:23:32.927459: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 20:23:32.927564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 20:23:33.093926: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Importing The Model

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# Importing The Dataset

In [3]:
!gdown --id 1muNB7l4GcNvZPW34VFlnIliMR4Vk74SB

Downloading...
From: https://drive.google.com/uc?id=1muNB7l4GcNvZPW34VFlnIliMR4Vk74SB
To: /kaggle/working/ArzEn-MultiGenre_cleaned_data.xlsx
100%|███████████████████████████████████████| 1.37M/1.37M [00:00<00:00, 133MB/s]


In [4]:
# Load dataset into pandas DataFrame
data_path = "/kaggle/working/ArzEn-MultiGenre_cleaned_data.xlsx"
data = pd.read_excel(data_path)

data.head()


Unnamed: 0.1,Unnamed: 0,egyption_Text,english_Text,category,sub_category
0,0,الأمير الصغير,The little prince,Novels,the-little-prince
1,1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper,Novels,the-little-prince
2,2,الفصل الأول,Chapter 1,Novels,the-little-prince
3,3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,Once when I was six years old I saw a magnific...,Novels,the-little-prince
4,4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,It was a picture of a boa constrictor in the a...,Novels,the-little-prince


In [5]:
data.isnull().sum()

Unnamed: 0       0
egyption_Text    0
english_Text     0
category         0
sub_category     0
dtype: int64

In [6]:
data = data[['egyption_Text' , 'english_Text']]
data.head()

Unnamed: 0,egyption_Text,english_Text
0,الأمير الصغير,The little prince
1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper
2,الفصل الأول,Chapter 1
3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,Once when I was six years old I saw a magnific...
4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,It was a picture of a boa constrictor in the a...


# Data Splitting

In [7]:
train, validation_test = train_test_split(data, test_size=0.2, random_state=42)
validation, test = train_test_split(validation_test, test_size=0.5, random_state=42)

In [8]:
train_data = Dataset.from_pandas(train)
validation_data = Dataset.from_pandas(validation)
test_data = Dataset.from_pandas(test)

train_data , validation_data , test_data

(Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 13367
 }),
 Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 1671
 }),
 Dataset({
     features: ['egyption_Text', 'english_Text', '__index_level_0__'],
     num_rows: 1671
 }))

# Data Tokenziation

In [9]:
max_length = 128


def preprocess_function(examples):
    inputs = [ex for ex in examples["egyption_Text"]]
    targets = [ex for ex in examples["english_Text"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [10]:
tokenized_train = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=train_data.column_names,
)

  0%|          | 0/14 [00:00<?, ?ba/s]

In [11]:
tokenized_validation = validation_data.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_data.column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:

tokenized_test = test_data.map(
    preprocess_function,
    batched=True,
    remove_columns=test_data.column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

# Setting up The GPU

In [13]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! Training on GPU...")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Training on CPU...")
    device = torch.device("cpu")

CUDA is available! Training on GPU...


# Setting up the Evaluation Metric  

In [14]:
from datasets import load_dataset, load_metric

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds: tuple) -> dict:
    """computes bleu score and other performance metrics """

    metric = load_metric("sacrebleu")
    tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {'bleu': result['score']}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result['gen_len'] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

# Importing the Model

In [15]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

training_args = Seq2SeqTrainingArguments(
output_dir="./output_dir",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    logging_dir="./logs",
    learning_rate=2e-5,
    fp16=True,
    predict_with_generate=True,
)

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
500,2.3064
1000,1.9572
1500,1.8129


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=1672, training_loss=1.9987328634307715, metrics={'train_runtime': 828.2661, 'train_samples_per_second': 129.108, 'train_steps_per_second': 2.019, 'total_flos': 1599688735064064.0, 'train_loss': 1.9987328634307715, 'epoch': 8.0})

# Saving The Model

In [21]:
trainer.save_model('SmsmAI/Finetuned Model')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


# Loading The Model

In [38]:
model = AutoModelForSeq2SeqLM.from_pretrained("SmsmAI/Helsinki Fine Tuned (Lara) ")

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

# Model Evaluation

## Evaluation on Validation 

In [24]:
trainer.evaluate()



Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]



{'eval_loss': 2.1852757930755615,
 'eval_bleu': 16.1952,
 'eval_gen_len': 13.9192,
 'eval_runtime': 104.7839,
 'eval_samples_per_second': 15.947,
 'eval_steps_per_second': 0.134,
 'epoch': 8.0}

## Evaluation on Test 

In [None]:
print(type(tokenized_test))

In [25]:
tokenized_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1671
})

In [27]:
test_dataset = Dataset.from_dict({
    "input_ids": tokenized_test["input_ids"],
    "attention_mask": tokenized_test["attention_mask"],
    "labels": tokenized_test["labels"],
})


trainer.evaluate(test_dataset)


# print("Evaluation results:", evaluation_results)

{'eval_loss': 2.1705574989318848,
 'eval_bleu': 15.9275,
 'eval_gen_len': 13.9629,
 'eval_runtime': 107.4431,
 'eval_samples_per_second': 15.552,
 'eval_steps_per_second': 0.13,
 'epoch': 8.0}

In [35]:
test.head()

Unnamed: 0,egyption_Text,english_Text
10137,‫لا ربنا معاك.,"Well, good luck to you, then!"
13896,إيه جو الصباحية ده؟,Are we newlyweds or something?
3090,اتمنى من كل قلبي ماضطرش أحارب تاني .,I hope so much I do not have to fight again.
4149,ماري جات بعدها بشوية صغيرين.,Presently Marie came back.
12434,هي المشكلة في مؤسسة الجواز .,The problem is with marriage as an institution.


# Model Testing

In [37]:
from transformers import pipeline
model_path = "/content/Finetuned Model"
translator = pipeline("translation", model=model, tokenizer="Helsinki-NLP/opus-mt-ar-en")

arabic_sentences = [
    "اتمنى من كل قلبي ماضطرش أحارب تاني",
]

translated_sentences = translator(arabic_sentences, max_length=50, return_text=True)

for original, translation in zip(arabic_sentences, translated_sentences):
    print("Arabic:", original)
    print("English:", translation)
    print()



Arabic: اتمنى من كل قلبي ماضطرش أحارب تاني
English: {'translation_text': 'I hope, with all my heart, that I do not have to fight again.'}

