In [1]:
!pip install transformers



In [2]:
!pip install sentencepiece



In [3]:
!pip install sacremoses



In [4]:
import sys
# Print the path of the Python interpreter running this kernel; useful for
# checking that it is the environment the packages above were installed into.
print(sys.executable)


/home/saitaa0b/miniconda3/envs/allam_hunayn/bin/python


In [10]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import sacremoses
import sentencepiece
from tqdm import tqdm

In [11]:
# Parallel corpus used for fine-tuning. The first CSV column becomes the row
# index; TranslationDataset reads the "English" and "Arabic" columns.
data = pd.read_csv("Final_Data.csv", index_col=0) 

class TranslationDataset(Dataset):
    """Map-style dataset yielding tokenized English -> Arabic sentence pairs.

    Each item is a dict with the three tensors a Hugging Face seq2seq model
    expects: ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        dataframe: DataFrame with an "English" (source) and an "Arabic"
            (target) text column.
        tokenizer: Hugging Face tokenizer; it is called with ``text_target=``
            for the target side and must expose ``pad_token_id``.
        max_length: Optional length that every sequence is padded/truncated
            to. ``None`` (default) keeps the previous behaviour of using the
            tokenizer's own maximum length.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize and return the sentence pair at positional index ``idx``."""
        row = self.data.iloc[idx]
        source_text = row["English"]
        target_text = row["Arabic"]

        source = self.tokenizer(
            source_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # `text_target=` is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        target = self.tokenizer(
            text_target=target_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Padded label positions are set to -100 (the ignore index of the
        # cross-entropy loss) so that the loss is computed on real target
        # tokens only instead of being dominated by padding.
        labels = target["input_ids"].squeeze(0)
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": source["input_ids"].squeeze(0),
            # Use the tokenizer's own mask: the pad id is not guaranteed to be
            # 0, so deriving the mask from `input_ids > 0` can attend to
            # padding and hide a genuine token whose id happens to be 0.
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }

In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
data

Unnamed: 0,Arabic,English
0,تفريج الكروب فى تدبير الحروب,TAFRIJ AL-KURUB FI TADBIR AL-HURUBA Muslim Man...
1,مقدمة,INTRODUCTION
2,بسم الله الرحمن الرحيم.. مؤيد الإسلام من سلطان...,"IN THE NAME OF GOD, THE MERCIFUL, THE COMPASSI..."
3,ومسعد جده العالى بإبادة أعدائه الطغاة المارقين...,And [he is] the cause of his noble sire’s happ...
4,وأشهد أن لا إله إلا الله وحده لا شريك له، شهاد...,"I declare that there is no god but God alone, ..."
...,...,...
60475,ثم وجه إليه في ذلك مرة بعد أخرى مع جماعة من ال...,"Another time, the Commander of the faithful se..."
60476,فلما تبين أمير المؤمنين ذلك منه رأى أن يقضي عل...,When the Commander of the faithful perceived t...
60477,حتى توسط الطريق بين مدينة السلام وواسط، وأظهر ...,The rebel was already half-way between Baghdad...
60478,فقدم أمير المؤمنين أخاه الموفق بالله أحمد ولي ...,This obliged the Commander of the faithful to ...


In [14]:
# Alternative checkpoint, currently disabled: "Helsinki-NLP/opus-mt-en-ar".
#tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
#model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")

# Load the pretrained English -> Arabic tokenizer and model that get fine-tuned
# below; the model is moved to the selected device.
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ar")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ar").to(device)

In [10]:
# Freeze the whole network first, then re-enable gradients only for the parts
# that should be fine-tuned.
model.requires_grad_(False)

# Unfreeze the last five decoder layers (indices -1 through -5).
for offset in range(1, 6):
    model.model.decoder.layers[-offset].requires_grad_(True)

# Unfreeze the output projection.
# NOTE(review): if lm_head shares its weight with the token embeddings, this
# also makes the embeddings trainable - confirm that this is intended.
model.lm_head.requires_grad_(True)

# Shuffled mini-batches of 8 sentence pairs over the full corpus.
train_dataset = TranslationDataset(data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# The optimizer only receives the parameters that are still trainable.
trainable_weights = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_weights, lr=2e-5)

trainable_params = sum(p.numel() for p in trainable_weights)
print(f"✅ Trainable parameters: {trainable_params:,}")


✅ Trainable parameters: 146,700,288


In [None]:
# Resume fine-tuning from a checkpoint written by the training loop.
checkpoint_path = "Big_Hunayn_at_different_epochs_deep/model_at_epoch1"

model = MarianMTModel.from_pretrained(checkpoint_path).to(device)
tokenizer = MarianTokenizer.from_pretrained(checkpoint_path)

# A freshly loaded model has every parameter trainable, so the same freezing
# scheme used for the initial run has to be applied again: freeze everything,
# then unfreeze the last five decoder layers and the output projection.
for param in model.parameters():
    param.requires_grad = False

for i in range(-1, -6, -1):
    for param in model.model.decoder.layers[i].parameters():
        param.requires_grad = True

for param in model.lm_head.parameters():
    param.requires_grad = True

# The optimizer must be rebuilt: an optimizer created earlier still points at
# the parameters of the previous `model` object, so stepping it would never
# update the model that was just loaded.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

# Restore the optimizer state that the training loop saves next to each
# checkpoint, when it is available.
try:
    optimizer.load_state_dict(
        torch.load(f"{checkpoint_path}/optimizer.pt", map_location=device)
    )
except FileNotFoundError:
    print(f"No optimizer state found in {checkpoint_path}; starting with a fresh optimizer.")

In [11]:
# Epoch bookkeeping: `start_epoch` is 1-based, so the loop variable `epoch`
# below is the 0-based index and `epoch + 1` is the number shown to the user.
start_epoch = 2
num_epochs = 10
best_loss = float('inf')

for epoch in range(start_epoch - 1, num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move exactly the tensors the model consumes onto the training device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }
        outputs = model(**model_inputs)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

    # Write model, tokenizer and optimizer state into one per-epoch directory.
    checkpoint_dir = f"Big_Hunayn_at_different_epochs_deep/model_at_epoch{epoch + 1}"
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)
    torch.save(optimizer.state_dict(), f"{checkpoint_dir}/optimizer.pt")

print("Training finished.")

Epoch 2:   0%|          | 5/7560 [00:03<1:25:24,  1.47it/s]


KeyboardInterrupt: 

In [15]:
def translate_english_to_arabic_hunain(input_text, num_beams=100, max_length=None):
    """Translate one English string to Arabic with the current global model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_text: English text to translate.
        num_beams: Beam width for beam search. The default of 100 matches the
            previous hard-coded value; smaller values are much faster.
        max_length: Maximum generated length in tokens. ``None`` (default)
            keeps the previous heuristic, which is based on the character
            count of the input rather than on its token count.

    Returns:
        The decoded translation with special tokens removed.

    Note:
        The global model is switched to evaluation mode as a side effect.
    """
    batch = [input_text]
    if max_length is None:
        # Historical heuristic, kept for backward compatibility: the length of
        # the list's string representation plus a margin of 10.
        max_length = len(str(batch)) + 10

    # Truncate over-long inputs to the tokenizer's maximum instead of feeding
    # the model more positions than it supports; the attention mask is passed
    # to `generate` together with the input ids.
    encoded = tokenizer(batch, return_tensors="pt", truncation=True).to(device)

    # Disable dropout so that the output does not depend on whether the model
    # was last left in training mode.
    model.eval()
    with torch.no_grad():
        translated_ids = model.generate(
            **encoded, max_length=max_length, num_beams=num_beams
        )

    # Decoding needs no target-tokenizer context; the deprecated
    # `as_target_tokenizer()` wrapper has been dropped.
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)

def split_text_into_lines(text, max_words_per_line=100):
    """Greedily wrap ``text`` into lines without breaking words.

    Args:
        text: Text to wrap; it is split on whitespace.
        max_words_per_line: Maximum line length measured in CHARACTERS
            (including the single spaces between words). The name is kept for
            backward compatibility even though the limit is not a word count.

    Returns:
        A list of lines. A single word longer than the limit is placed on a
        line of its own. Empty or whitespace-only text gives an empty list.
    """
    lines = []
    current_line = []
    current_length = 0  # length of ' '.join(current_line), tracked incrementally

    for word in text.split():
        # Length of the line if this word were appended to it.
        if current_line:
            candidate_length = current_length + 1 + len(word)
        else:
            candidate_length = len(word)

        if candidate_length <= max_words_per_line:
            current_line.append(word)
            current_length = candidate_length
        else:
            # Only flush a non-empty line: when the very first word is already
            # too long there is nothing to flush, and emitting '' here would
            # produce a spurious blank first line.
            if current_line:
                lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)

    if current_line:
        lines.append(' '.join(current_line))

    return lines


In [18]:
# Fixed example sentence that is translated by every checkpoint so that the
# outputs can be compared side by side.
input_text =  "Time is a precious resource that once spent, can't be regained. Use your time wisely, invest it in things that matter, and create memories that will last a lifetime."
input_lines = split_text_into_lines(input_text)

for epoch_number in range(1, 8):
    checkpoint_path = f"Big_Hunayn_at_different_epochs/model_at_epoch{epoch_number}"

    # Rebind the global model/tokenizer: the translation helper reads them.
    model = MarianMTModel.from_pretrained(checkpoint_path)
    tokenizer = MarianTokenizer.from_pretrained(checkpoint_path)
    model = model.to(device)

    print(f"Translated text by model at epoch {epoch_number}")
    translated_text = translate_english_to_arabic_hunain(input_text)
    translated_lines = split_text_into_lines(translated_text)

    print("Input:")
    for line in input_lines:
        print(line)

    print("\nTranslated:")
    for line in translated_lines:
        print(line)

    # Blank line, separator, blank line between checkpoints.
    print()
    print("------------------------------------------------")
    print()


Translated text by model at epoch 1
Input:
Time is a precious resource that once spent, can't be regained. Use your time wisely, invest it in
things that matter, and create memories that will last a lifetime.

Translated:
الوقت مورد ثمين لا يمكن استعادته. استخدم وقتك بحكمة واستثمره في الأشياء المهمة وخلق ذكريات تدوم مدى
الحياة.

------------------------------------------------

Translated text by model at epoch 2
Input:
Time is a precious resource that once spent, can't be regained. Use your time wisely, invest it in
things that matter, and create memories that will last a lifetime.

Translated:
الوقت مورد ثمين لا يمكن استعادته. استخدم وقتك بحكمة واستثمره في الأشياء المهمة وخلق ذكريات تدوم مدى
الحياة.

------------------------------------------------

Translated text by model at epoch 3
Input:
Time is a precious resource that once spent, can't be regained. Use your time wisely, invest it in
things that matter, and create memories that will last a lifetime.

Translated:
الوقت مورد ثمين 