In [1]:
!pip install transformers



In [2]:
!pip install sentencepiece



In [3]:
!pip install sacremoses



In [5]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import sacremoses
import sentencepiece
from tqdm import tqdm

In [6]:
data = pd.read_csv("Final_Data.csv", index_col=0)

class TranslationDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_text = self.data.iloc[idx]["English"]
        target_text = self.data.iloc[idx]["Arabic"]

        source_tokens = self.tokenizer.encode(source_text, padding="max_length", truncation=True, return_tensors="pt")

        with self.tokenizer.as_target_tokenizer():
          target_tokens = self.tokenizer.encode(target_text, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": source_tokens.squeeze(),
            "attention_mask": source_tokens.squeeze().gt(0),
            "labels": target_tokens.squeeze(),
        }

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
data

Unnamed: 0,Arabic,English
0,تفريج الكروب فى تدبير الحروب,TAFRIJ AL-KURUB FI TADBIR AL-HURUBA Muslim Man...
1,مقدمة,INTRODUCTION
2,بسم الله الرحمن الرحيم.. مؤيد الإسلام من سلطان...,"IN THE NAME OF GOD, THE MERCIFUL, THE COMPASSI..."
3,ومسعد جده العالى بإبادة أعدائه الطغاة المارقين...,And [he is] the cause of his noble sire’s happ...
4,وأشهد أن لا إله إلا الله وحده لا شريك له، شهاد...,"I declare that there is no god but God alone, ..."
...,...,...
60475,ثم وجه إليه في ذلك مرة بعد أخرى مع جماعة من ال...,"Another time, the Commander of the faithful se..."
60476,فلما تبين أمير المؤمنين ذلك منه رأى أن يقضي عل...,When the Commander of the faithful perceived t...
60477,حتى توسط الطريق بين مدينة السلام وواسط، وأظهر ...,The rebel was already half-way between Baghdad...
60478,فقدم أمير المؤمنين أخاه الموفق بالله أحمد ولي ...,This obliged the Commander of the faithful to ...


In [None]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

In [None]:
checkpoint_path = "Hunayn_at_different_epochs/model_at_epoch5"

model = MarianMTModel.from_pretrained(checkpoint_path)
tokenizer = MarianTokenizer.from_pretrained(checkpoint_path)

train_dataset = TranslationDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
start_epoch = 6
num_epochs = 10
best_loss = float('inf')

for epoch in tqdm(range(start_epoch - 1, num_epochs), desc="Epochs"):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

    model.save_pretrained(f"Hunayn_at_different_epochs/model_at_epoch{epoch + 1}")
    tokenizer.save_pretrained(f"Hunayn_at_different_epochs/model_at_epoch{epoch + 1}")
    torch.save(optimizer.state_dict(), f"Hunayn_at_different_epochs/model_at_epoch{epoch + 1}/optimizer.pt")


print("Training finished.")

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch [6/10] - Average Loss: 0.1734




Epoch [7/10] - Average Loss: 0.1648




Epoch [8/10] - Average Loss: 0.1572




Epoch [9/10] - Average Loss: 0.1503




Epoch [10/10] - Average Loss: 0.1441


Epochs: 100%|██████████| 5/5 [1:50:50<00:00, 1330.10s/it]

Training finished.





In [9]:
def translate_english_to_arabic_hunain(input_text):
    input_text = [input_text]
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    translated_ids = model.generate(input_ids, max_length=len(str(input_text))+10,num_beams=100).to(device)
    with tokenizer.as_target_tokenizer():
      translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

def split_text_into_lines(text, max_words_per_line=100):
    words = text.split()
    lines = []
    current_line = []

    for word in words:
        if len(' '.join(current_line + [word])) <= max_words_per_line:
            current_line.append(word)
        else:
            lines.append(' '.join(current_line))
            current_line = [word]

    if current_line:
        lines.append(' '.join(current_line))

    return lines


In [14]:
for i in range(10):

    checkpoint_path = f"Hunayn_at_different_epochs/model_at_epoch{i + 1}"

    model = MarianMTModel.from_pretrained(checkpoint_path)
    tokenizer = MarianTokenizer.from_pretrained(checkpoint_path)

    model = model.to(device)

    print(f"Translated text by model at epoch {i + 1}")
    # Example usage
    input_text =  "He dominates them; they seem to be discussing something."
    translated_text = translate_english_to_arabic_hunain(input_text)

    input_lines = split_text_into_lines(input_text)
    translated_lines = split_text_into_lines(translated_text)
    print("Input:")
    for line in input_lines:
        print(line)

    print("\nTranslated:")
    for line in translated_lines:
        print(u'{}'.format(line))
    print()
    print("------------------------------------------------")
    print()


Translated text by model at epoch 1
Input:
He dominates them; they seem to be discussing something.

Translated:
يهيمن عليهم، ويظهرون أنهم يناظرون في أمر ما،

------------------------------------------------

Translated text by model at epoch 2
Input:
He dominates them; they seem to be discussing something.

Translated:
يهيمن عليهم، ويظهرون أنهم يناظرون في أمر،

------------------------------------------------

Translated text by model at epoch 3
Input:
He dominates them; they seem to be discussing something.

Translated:
وهو الغالب عليهم، فكأنهم يناظرون في أمر،

------------------------------------------------

Translated text by model at epoch 4
Input:
He dominates them; they seem to be discussing something.

Translated:
هو الغالب عليهم، كأنهم يتكلّمون في شيء،

------------------------------------------------

Translated text by model at epoch 5
Input:
He dominates them; they seem to be discussing something.

Translated:
وهو الغالب عليهم، وكأنهم على ما يبدو مناظرون في شيء.

---------