In [None]:
# Install the third-party packages this notebook depends on.
# `--quiet` suppresses most of pip's output. No versions are pinned here.
%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score\
    loralib \
    bitsandbytes \
    peft --quiet

In [None]:
# import necessary libraries
import time
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, Trainer, DataCollatorForSeq2Seq, AutoConfig
import evaluate
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel

import pandas as pd
from getpass import getpass
from tqdm.notebook import tqdm

from datasets import Dataset


In [None]:
# Login to Hugging Face.
# The token is read with getpass so it is not echoed into the notebook output.
# NOTE(review): the token is then interpolated into a shell command line, where it may
# be visible to other processes on a shared machine — confirm this is acceptable.
hf_token = getpass("Hugging Face: ")
!huggingface-cli login --token $hf_token

## 📘 Dataset Reference

**Title:** [Enhancing English-Persian Neural Machine Translation with a Large-Scale Parallel Dataset and Relative Position Representations](https://ieeexplore.ieee.org/abstract/document/10967409)  
**Authors:** Alireza Kamyab, Negar Baghaei Nejad, [Alireza Akhavanpour](https://class.vision/teacher/%D8%B9%D9%84%DB%8C%D8%B1%D8%B6%D8%A7-%D8%A7%D8%AE%D9%88%D8%A7%D9%86-%D9%BE%D9%88%D8%B1/)  
**Affiliation:** [Shenasa AI](https://shenasa.ai), Tehran, Iran  
**Dataset on HuggingFace:** [shenasa/English-Persian-Parallel-Dataset](https://huggingface.co/datasets/shenasa/English-Persian-Parallel-Dataset)


In [None]:
# Fetch the English-Persian parallel corpus from the Hugging Face Hub and keep
# only its 'train' split for the rest of the notebook.
huggingface_dataset_name = "shenasa/English-Persian-Parallel-Dataset"
dataset = load_dataset(huggingface_dataset_name)["train"]

In [None]:
# Normalise the corpus columns to 'src_lang' / 'tgt_lang'.
# The rename is positional (first column -> source, second -> target) and goes
# through a pandas round-trip.
source_lang_col, target_lang_col = 'src_lang', 'tgt_lang'

df_train = dataset.to_pandas()
df_train.columns = [source_lang_col, target_lang_col]

dataset = Dataset.from_pandas(df_train)


In [None]:
# Keep only rows in which both the target and the source cell are Python strings
def _is_text_pair(row):
    """Return True when both language cells of `row` are `str` instances."""
    return isinstance(row[target_lang_col], str) and isinstance(row[source_lang_col], str)

dataset = dataset.filter(_is_text_pair)


In [None]:
# Preview a handful of sentence pairs (rows 0, 20, 40, 60 and 80).
# The column-name variables are re-read from the dataset itself.
column_names = dataset.column_names
source_lang_col = column_names[0]
target_lang_col = column_names[1]
for row_idx in range(0, 100, 20):
    row = dataset[row_idx]
    print(f"Source: {row[source_lang_col]}")
    print(f"Target: {row[target_lang_col]}")
    print("-" * 50)

In [None]:
# Load the Gemma-2 9B instruction-tuned checkpoint and its tokenizer
model_name = "google/gemma-2-9b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit NF4 quantisation with bfloat16 as the compute dtype
bnbConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnbConfig,
    device_map="auto",
)

In [None]:
# Print the number of trainable model parameters
def print_number_of_trainable_model_parameters(model):
    """Build a three-line report of trainable vs. total parameter counts.

    Despite the name, nothing is printed here: the report is returned as a
    string and the caller decides what to do with it.

    Args:
        model: object exposing ``num_parameters()`` and ``parameters()``; each
            parameter must provide ``numel()`` and ``requires_grad``.

    Returns:
        A string with the trainable count, the total count and the trainable
        percentage (two decimals). The percentage is 0 when the total is 0.
    """
    total = model.num_parameters()

    trainable = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable += tensor.numel()

    if total > 0:
        share = 100 * trainable / total
    else:
        share = 0

    report_lines = [
        f"Trainable model parameters: {trainable}",
        f"All model parameters: {total}",
        f"Percentage of trainable model parameters: {share:.2f}%",
    ]
    return "\n".join(report_lines)

# Show the parameter report for the freshly loaded base model.
print(print_number_of_trainable_model_parameters(model))

In [None]:
# Build a zero-shot translation prompt from one corpus row and show it
index = 200

sample = dataset[index]
source_lang = sample[source_lang_col]
target_lang = sample[target_lang_col]

prompt = (
    "\n"
    "Translate the following English text to Persian:\n"
    f"English: {source_lang}\n"
    "\n"
    "Persian translation:\n"
)
print(prompt)


In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the GPU.
# NOTE(review): the device is hard-coded to "cuda", so moving the tensors fails on a
# machine without a CUDA device.
device = torch.device("cuda")
inputs = tokenizer(prompt, return_tensors='pt').to(device)

In [None]:
# Put the model in eval mode, generate up to 100 new tokens for the prompt and
# print the decoded sequence (prompt included) without special tokens.
model = model.eval()
with torch.no_grad():
    generated_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
    )
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

In [None]:
# Zero-shot check: translate one corpus row with the base model and print it
# next to the corpus reference translation.
index = 95000

sample = dataset[index]
source_lang = sample[source_lang_col]
target_lang = sample[target_lang_col]

prompt = (
    "\n"
    "Translate the following English text to Persian:\n"
    f"English: {source_lang}\n"
    "\n"
    "Persian translation:\n"
)

device = torch.device("cuda")
inputs = tokenizer(prompt, return_tensors='pt').to(device)
with torch.no_grad():
    generated_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
        do_sample=False,
    )
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator line made of 99 dashes, printed before each section.
dash_line = '-' * 99
for section in (
    f'INPUT PROMPT:\n{prompt}',
    f'BASELINE Translation:\n{target_lang}\n',
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
):
    print(dash_line)
    print(section)

## Let's Load **WIKI PEPC** as a Benchmark!

## 📘 PEPC: Parallel English-Persian Corpus

**Title:** [PEPC: Parallel English-Persian Corpus Extracted from Wikipedia](https://iasbs.ac.ir/~ansari/nlp/pepc.html)  
**Authors:** Akbar Karimi, Ebrahim Ansari, Bahram Sadeghi Bigham  
**Affiliation:** Institute for Advanced Studies in Basic Sciences (IASBS), Zanjan, Iran  
**Dataset access:** [Download PEPC](https://iasbs.ac.ir/~ansari/nlp/pepc.html)


In [None]:
# Parallel English-Persian Corpus (PEPC): download the 1K test archive
# and extract it into the current working directory.
!wget https://iasbs.ac.ir/~ansari/nlp/files_pepc/AK_Test_1K.rar

!unrar x AK_Test_1K.rar

In [None]:
# Load the WIKI PEPC test set
def load_text(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        path: location of the text file.

    Returns:
        A list with one entry per line that still has content after
        surrounding whitespace is stripped; each entry is the stripped line.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]
    return [text for text in stripped_lines if text]


# Read the English (.en) and Persian (.fa) sides of the extracted test set.
# NOTE(review): load_text drops blank lines from each file independently, so the two
# lists stay pair-aligned only if neither file has a blank line the other lacks — confirm.
source_texts = load_text('ak-test-1k.en')
reference_texts = load_text('ak-test-1k.fa')

# Last expression of the cell: shows both list lengths as the cell output.
len(source_texts), len(reference_texts)

In [None]:
# Quick-test mode: keep only the first 100 sentence pairs, then show the sizes
source_texts, reference_texts = source_texts[:100], reference_texts[:100]
len(source_texts), len(reference_texts)

In [None]:
# Evaluate the base model on the PEPC subset using corpus-level BLEU.
# For every source sentence the model generates greedily (at most 100 new
# tokens); only the newly generated tokens are decoded and scored.
bleu = evaluate.load("bleu")

predictions = []
references = []
source_texts_list = []

MAX_LENGTH = 512

with torch.no_grad():
    for i in tqdm(range(len(source_texts)), desc="Processing translations"):
        source_text = source_texts[i]
        reference_text = reference_texts[i]

        # Same template as the training prompt in preprocess_fn. It is built
        # from explicit pieces because an indented triple-quoted f-string would
        # embed a leading newline and the cell's indentation into the prompt.
        prompt = (
            "Translate the following English text to Persian:\n"
            f"English: {source_text}\n\nPersian translation:"
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        # Number of prompt tokens, used below to cut the prompt off the output.
        input_token_length = inputs["input_ids"].shape[1]

        # Forward the whole encoding so generate() also gets the attention mask.
        full_output_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False  # deterministic
        )[0]

        output_only_ids = full_output_ids[input_token_length:]
        cleaned_output = tokenizer.decode(
            output_only_ids,
            skip_special_tokens=True
        )
        prediction_text = cleaned_output.strip()

        predictions.append(prediction_text)
        # The BLEU metric takes a list of references per prediction.
        references.append([reference_text])
        source_texts_list.append(source_text)

results = bleu.compute(predictions=predictions, references=references)

print(f"BLEU Score: {results['bleu']:.4f}")

In [None]:
# Side-by-side table of the first ten source / reference / prediction triples
first_references = references[:10]
sample_columns = {
    'Source (English)': source_texts_list[:10],
    'Reference (Persian)': [refs[0] for refs in first_references],
    'Prediction (Gemma)': predictions[:10],
}
df_samples = pd.DataFrame(sample_columns)

df_samples

In [None]:
# Collect the names of the 4-bit linear layers that LoRA should target

def find_all_linear_names(model):
    """Return the leaf names of all bitsandbytes ``Linear4bit`` submodules.

    The leaf name is the last dot-separated component of the qualified module
    name. ``lm_head`` is excluded from the result if it was collected.

    Args:
        model: object exposing ``named_modules()``.

    Returns:
        A list of unique leaf names, in no particular order.
    """
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, bnb.nn.Linear4bit)
    }
    # needed for 16 bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

# Names of the quantized linear layers of the base model; used later as LoRA target_modules.
modules = find_all_linear_names(model)

### Prepares the model for training with low-bit precision (e.g., 4-bit or 8-bit) to reduce memory usage.

In [None]:
# Prepare the quantized model for k-bit training with PEFT's helper.
# The prepared model replaces `model`, so re-running this cell prepares it again.
model = prepare_model_for_kbit_training(model)

In [None]:
# Apply LoRA to the model.
# Adapters are attached to the layer names collected in `modules`; no bias terms are trained.
# NOTE(review): lora_alpha (8) is smaller than r (16); presumably the adapter update is
# scaled by alpha / r = 0.5 — confirm this is the intended scaling.
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
peft_model = get_peft_model(model, peft_config)
# Report how many parameters are trainable once the adapters are attached.
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
#dataset = dataset.shuffle(seed=65).select(range(100000))

In [None]:
# Preprocess the dataset
def preprocess_fn(examples, max_length=128):
    """Tokenize a batch of sentence pairs into causal-LM training features.

    Every example is rendered as ``"<prompt> <target><eos>"`` and padded or
    truncated to ``max_length`` tokens. The labels are a copy of the input ids
    in which the prompt tokens and the padding tokens are set to -100, so only
    the target translation (and its end-of-sequence token) contributes to the
    loss.

    Args:
        examples: batch mapping as passed by ``Dataset.map(batched=True)``; it
            must contain the columns named by ``source_lang_col`` and
            ``target_lang_col``.
        max_length: fixed sequence length used for padding and truncation.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
    """
    prompts = [
        f"Translate the following English text to Persian:\n"
        f"English: {src}\n\nPersian translation:"
        for src in examples[source_lang_col]
    ]
    targets = examples[target_lang_col]

    # End every target with EOS so the model is trained to stop after the
    # translation instead of generating until the token limit.
    eos = tokenizer.eos_token or ""
    full_texts = [f"{p} {t}{eos}" for p, t in zip(prompts, targets)]

    model_inputs = tokenizer(
        full_texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    attention_mask = model_inputs["attention_mask"]
    labels = model_inputs["input_ids"].clone()

    # Padding positions must not contribute to the loss.
    labels[attention_mask == 0] = -100

    # Prompt lengths in tokens, computed in one batched tokenizer call.
    prompt_lengths = [
        len(ids)
        for ids in tokenizer(prompts, truncation=True, max_length=max_length)["input_ids"]
    ]
    for i, prompt_len in enumerate(prompt_lengths):
        # Index of the first real token: 0 when the tokenizer pads on the
        # right, the number of pad tokens when it pads on the left. Masking
        # from here keeps the prompt mask correct for either padding side.
        start = int(attention_mask[i].argmax())
        labels[i, start:start + prompt_len] = -100  # mask prompt from loss

    model_inputs["labels"] = labels
    return model_inputs


In [None]:
# Apply the preprocessing function to the whole dataset, one batch at a time.
# The two raw text columns are removed; what remains is what preprocess_fn returns.
tokenized_train = dataset.map(
    preprocess_fn,
    batched=True,
    remove_columns=[source_lang_col, target_lang_col]
)

In [None]:
# Hold out 20% of the tokenized examples for evaluation (fixed seed 42).
# `tokenized_train` is rebound to the 80% training part.
split_dataset = tokenized_train.train_test_split(test_size=0.2, seed=42)
tokenized_train, tokenized_eval = split_dataset["train"], split_dataset["test"]

In [None]:
# Configure the training run (the data collator itself is defined in a later cell).
# The output directory name carries the current Unix timestamp, so each run of
# this cell gets its own directory.
# NOTE(review): `eval_steps` is set but no evaluation strategy is; presumably with the
# library default no evaluation runs during training and eval_steps is ignored — confirm.
output_dir = f'./peft-english_to_persian_gemma2-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    gradient_accumulation_steps=2,
    warmup_steps =1000,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    # fp16=True,
    learning_rate=2e-5,
    logging_steps=500,
    eval_steps=1000,
    max_steps=3000,
    label_names=["labels"],
    log_level="info",
    report_to="none",
)

In [None]:
"""
Enable gradient checkpointing for the original model to save memory during training
by trading off some computational overhead. This is especially useful for large models.
"""


#peft_training_args.gradient_checkpointing = True
#peft_model.gradient_checkpointing_enable()

In [None]:
# Define the data collator that assembles tokenized examples into batches.
# NOTE(review): max_length=64 here differs from the max_length=128 used in preprocess_fn;
# since examples are already padded to 128 there, this value presumably has no effect — confirm.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=64,
    return_tensors="pt"
)

In [None]:
# Initialize the Trainer for the PEFT model with the training arguments,
# the train/eval splits and the data collator defined above.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,  # validation set
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
# Start training; whatever train() returns is kept in `train_output`.
train_output = peft_trainer.train()

In [None]:
# Save the fine-tuned PEFT model and the tokenizer into one fixed local directory
# (unlike the timestamped training output directory, this path is the same on every run).
peft_model_path="./peft-english_to_persian_gemma2"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Download a previously fine-tuned model archive from Google Drive and unpack it.
# NOTE(review): presumably the archive extracts to ./peft-english_to_persian_gemma2, the
# same directory the save cell writes to, so locally trained files may be overwritten — confirm.
!gdown https://drive.google.com/uc?id=12R898xmbgA0e41b6PGBVM0trb8wfn7lY
!unzip peft-english_to_persian_gemma2.zip

In [None]:
# Rebuild the fine-tuned model: 4-bit base checkpoint plus the saved LoRA adapter.
# The tokenizer is loaded from the adapter directory as well.
peft_model_path = "./peft-english_to_persian_gemma2"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit NF4 quantisation with bfloat16 as the compute dtype
bnbConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    quantization_config=bnbConfig,
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(base_model, peft_model_path)

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)


In [None]:
# Load the test dataset
def load_text(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        path: location of the text file.

    Returns:
        A list with one entry per line that still has content after
        surrounding whitespace is stripped; each entry is the stripped line.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]
    return [text for text in stripped_lines if text]


# Read both sides of the PEPC test set and keep the first 100 sentence pairs
source_texts = load_text('ak-test-1k.en')[:100]
reference_texts = load_text('ak-test-1k.fa')[:100]

In [None]:
# Evaluate the fine-tuned PEFT model using BLEU score.
# For every source sentence the model generates greedily (at most 100 new
# tokens); only the newly generated tokens are decoded and scored.
bleu = evaluate.load("bleu")

predictions = []
references = []
source_texts_list = []

MAX_LENGTH = 512

# Make sure dropout (the LoRA config uses lora_dropout) is inactive while scoring.
peft_model.eval()

with torch.no_grad():
    for i in tqdm(range(len(source_texts)), desc="Processing translations"):
        source_text = source_texts[i]
        reference_text = reference_texts[i]

        # Same template the adapter was trained on in preprocess_fn. It is
        # built from explicit pieces because an indented triple-quoted f-string
        # would embed a leading newline and the cell's indentation into the
        # prompt, which the model never saw during fine-tuning.
        prompt = (
            "Translate the following English text to Persian:\n"
            f"English: {source_text}\n\nPersian translation:"
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(device)

        # Number of prompt tokens, used below to cut the prompt off the output.
        input_token_length = inputs["input_ids"].shape[1]

        # Forward the whole encoding so generate() also gets the attention mask.
        full_output_ids = peft_model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False  # deterministic
        )[0]

        output_only_ids = full_output_ids[input_token_length:]
        cleaned_output = tokenizer.decode(
            output_only_ids,
            skip_special_tokens=True
        )
        prediction_text = cleaned_output.strip()

        predictions.append(prediction_text)
        # The BLEU metric takes a list of references per prediction.
        references.append([reference_text])
        source_texts_list.append(source_text)


results = bleu.compute(predictions=predictions, references=references)

print(f"Recomputed BLEU Score: {results['bleu']:.4f}")

In [None]:
# Side-by-side table of the first ten source / reference / prediction triples
first_references = references[:10]
sample_columns = {
    'Source (English)': source_texts_list[:10],
    'Reference (Persian)': [refs[0] for refs in first_references],
    'Prediction (Gemma)': predictions[:10],
}
df_samples = pd.DataFrame(sample_columns)

df_samples

In [None]:
# Function to translate English text to Persian using the fine-tuned PEFT model
def translate_english_to_persian(
    english_text: str,
    model,
    tokenizer,
    device,
    max_input_length: int = 512, # Max length for input + prompt
    max_new_tokens: int = 100   # Max tokens to generate for the translation
):
    """Translate one English text to Persian with greedy decoding.

    The text is wrapped in the same prompt template that preprocess_fn uses
    for fine-tuning, the model generates a continuation, and only the newly
    generated tokens are decoded and returned.

    Args:
        english_text: the text to translate.
        model: object exposing ``generate(...)``.
        tokenizer: callable tokenizer exposing ``decode`` and ``eos_token_id``.
        device: device the encoded prompt is moved to.
        max_input_length: truncation limit (in tokens) for the whole prompt.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded translation with surrounding whitespace stripped, or an
        empty string when ``english_text`` is empty or whitespace-only.
    """
    # Nothing to translate: skip the model call entirely.
    if not english_text or not english_text.strip():
        return ""

    # Built from explicit pieces so the prompt has no leading newline and
    # matches the training template exactly.
    prompt = (
        "Translate the following English text to Persian:\n"
        f"English: {english_text}\n\nPersian translation:"
    )

    # Tokenize the input prompt (a single sequence, so no padding is requested)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    ).to(device)

    # Number of prompt tokens, used below to cut the prompt off the output.
    input_token_length = inputs["input_ids"].shape[1]

    # Generate translation with no sampling (deterministic output).
    # The whole encoding is forwarded so generate() also gets the attention mask.
    with torch.no_grad():
        full_output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )[0] # Get the first (and only) sequence from the batch

    # Extract only the generated tokens (remove the input prompt tokens)
    output_only_ids = full_output_ids[input_token_length:]
    # Decode the generated tokens into text
    translated_text = tokenizer.decode(
        output_only_ids,
        skip_special_tokens=True
    )

    return translated_text.strip()


In [None]:
# --- Example Usage ---
# Translate one sentence with the fine-tuned model; the returned string is the cell output.
english_sentence = "Machine learning is a field of artificial intelligence."
translate_english_to_persian(english_sentence, peft_model, tokenizer, device)