- Needed pips :

In [None]:
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

In [None]:
!pip install sacremoses

In [None]:
!pip install nltk

In [None]:
!pip install peft

In [None]:
!pip install accelerate -U

- Loading the model :

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_checkpoint = "Helsinki-NLP/opus-mt-tc-big-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/916k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/478M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [12]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# from google.colab import drive
# import shutil

# drive.mount('/content/drive')

# model_checkpoint = "/content/drive/MyDrive/Colab Notebooks/model"

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- Load Dataset :

In [13]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

file_path = 'Project2Dataset20-8.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Split 10% of the data as the test set
train_val, test = train_test_split(df, test_size=0.1, random_state=42)

# Split the remaining 90% data into 20% validation and 70% training
train, validation = train_test_split(train_val, test_size=0.2222, random_state=42)

train_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

- Prepare evaluation metric :

In [14]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Ensure that you have the necessary NLTK resources
nltk.download('punkt')

# Define your candidate and reference sentences
labels = [["كيفك","يا","حلو","شو","الاخبار","بدي","قلك","شي","مهم","كتير"]]
prediction = "كيفك يا حلو شو الاخبار بدي قلك شي مهم كتير".split()

# Use a smoothing function
smoothing_function = SmoothingFunction().method1

# Calculate BLEU score with smoothing and proper weights
bleu_score = sentence_bleu(labels, prediction, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)

# Print the BLEU score
print(f"BLEU score: {bleu_score:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU score: 1.0000


- Data preprocessing :

In [15]:
#prefix = ""
max_input_length = 256
max_target_length = 256
source_lang = "En"
target_lang = "Ar"
def preprocess_function(data):
  inputs = data[source_lang]
  targets = data[target_lang]
  inputs = [str(input_text) for input_text in inputs]
  targets = [str(target_text) for target_text in targets]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
      labels = tokenizer(targets, max_length=max_target_length, truncation=True)
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True,remove_columns=['En','Ar','__index_level_0__'])

- Choosing training parameter :

In [17]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=100,
    save_total_limit=5,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True  # Enable mixed precision training
)



In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
import numpy as np
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    with tokenizer.as_target_tokenizer():
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    smoothing_function = SmoothingFunction().method1

    bleu_scores = [
        sentence_bleu([label.split()], pred.split(), weights=(0.25, 0.25, 0, 0), smoothing_function=smoothing_function)
        for pred, label in zip(decoded_preds, decoded_labels)
    ]

    avg_bleu = np.mean(bleu_scores)

    result = {"bleu": round(avg_bleu, 4)}

    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = round(np.mean(prediction_lens), 4)

    return result

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

- Training the model :

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.6685,1.527654,0.5479,13.9581
2,1.5129,1.458878,0.553,13.9038
3,1.4348,1.421125,0.555,13.8694
4,1.3771,1.3985,0.5586,13.8656
5,1.3302,1.383357,0.5604,13.8832
6,1.2362,1.37353,0.5633,13.9195
7,1.1981,1.364131,0.5657,13.9379
8,1.1902,1.358032,0.5682,13.9686
9,1.1606,1.354124,0.5688,13.9869


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]], 'forced_eos_token_id': 25897}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61246]]

- Evaluate the model on test set :

In [None]:
test_results = trainer.evaluate(tokenized_dataset['test'])
print(test_results['eval_loss'])

- Results :

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model_path = '/content/opus-mt-tc-big-en-ar-finetuned-En-to-Ar/checkpoint-17535'

# Load the fine-tuned model and tokenizer
tokenizer_finetuned = AutoTokenizer.from_pretrained(model_path)
model_finetuned = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model_finetuned.to(device)

def translate_sentence(test_df, index):
    # Get the sentence at the specified index
    row = test_df.iloc[index]
    input_text = f">>ara<< {row['En']}"

    # Translate using the original model
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs)
    translation_original = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Translate using the finetuned model
    inputs_finetuned = tokenizer_finetuned(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs_finetuned = model_finetuned.generate(**inputs_finetuned)
    translation_finetuned = tokenizer_finetuned.decode(outputs_finetuned[0], skip_special_tokens=True)

    # Print the translations and the original Arabic text
    print(f"Original Model Translation: {translation_original}")
    print(f"Finetuned Model Translation: {translation_finetuned}")
    print(f"Original Arabic Output: {row['Ar']}")

In [None]:
translate_sentence(test, index=57)

In [None]:
# from google.colab import drive
# import shutil
# drive.mount('/content/drive')

# # Define paths
# local_model_path = '/content/opus-mt-tc-big-en-ar-finetuned-En-to-Ar/checkpoint-11690'
# drive_model_path = '/content/drive/MyDrive/model2'

# # Copy the model to Google Drive
# shutil.copytree(local_model_path, drive_model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/test'