In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets 
!pip install evaluate    
!pip install sacrebleu
!pip install peft
!pip install rouge_score
# !pip3 install sentencepiece -q
# !pip3 install unbabel-comet -q

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting p

# Loading Dataset

In [2]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "T5"
os.environ["WANDB_NOTES"] = "Fine tune t5"
os.environ["WANDB_NAME"] = "araT5-Base-with-LoRA"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset, DatasetDict

# Load the dataset with the first 40,000 examples
ds = load_dataset("Helsinki-NLP/un_pc", "ar-en", split="train[:20000]")

# Split the dataset into 20,000 for training, and 20,000 for test + validation
train_test_val_split = ds.train_test_split(test_size=5000)  # Keep 20,000 for test+validation

# Further split the 20,000 examples into 10,000 for test and 10,000 for validation
test_validation_split = train_test_val_split['test'].train_test_split(test_size=0.5)

# Create a new dataset dictionary with the updated splits
ds_split = DatasetDict({
    'train': train_test_val_split['train'],  # 20,000 examples for training
    'test': test_validation_split['train'],  # 10,000 examples for testing
    'validation': test_validation_split['test'],  # 10,000 examples for validation
})

# Check the resulting dataset
ds_split

Downloading readme:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/17 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/20044478 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2500
    })
})

# Data Tokenization

In [4]:
from transformers import AutoTokenizer
import torch
model_name="UBC-NLP/AraT5v2-base-1024"

tokenizer=AutoTokenizer.from_pretrained(model_name, device_map={'':torch.cuda.current_device()})

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [5]:
source_lang="ar"
target_lang="en"
prefix="translate Arabic to English: "

def preprocess_function(examples):
    inputs =[prefix+example[source_lang] for example in examples["translation"]]
    targets=[example[target_lang] for example in examples["translation"]]
    model_inputs=tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs


tokenized_ds=ds_split.map(preprocess_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

# Preparing the model

In [6]:
def initialize_layer_norms(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
    
    for name, param in model.named_parameters():
        if "language_adaptor" in name and "norm" in name:
            if "weight" in name:  # Layer normalization scale
                print(f"Initializing {name} with mean=0 and std=1")
                init.ones_(param.data)
            elif "bias" in name:  # Layer normalization bias
                print(f"Initializing {name} with mean=0")
                init.zeros_(param.data)

In [7]:
from peft import PeftModel, prepare_model_for_kbit_training, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

model=AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map={'':torch.cuda.current_device()})
initialize_layer_norms(model)

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [8]:
# Freeze the original parameters
model=prepare_model_for_kbit_training(model)

peft_config=LoraConfig(
    # the task to train for (sequence-to-sequence language modeling in this case)
    task_type=TaskType.SEQ_2_SEQ_LM,
    # the dimension of the low-rank matrices
    r=5,
    # the scaling factor for the low-rank matrices
    lora_alpha=32,
    # the dropout probability of the LoRA layers
    lora_dropout=0.06,
    target_modules=["k","q","v","o"],
)

peft_model=get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 1,105,920 || all params: 368,614,656 || trainable%: 0.3000


In [9]:
print(model.config)

T5Config {
  "_name_or_path": "UBC-NLP/AraT5v2-base-1024",
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 110208
}



In [10]:
peft_model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(110208, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(110208, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.06, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=5, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=5, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

# BLEU and ROUGE scores

In [11]:
import evaluate
import numpy as np
import sacrebleu

metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")

# Define a function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Define a function to compute metrics
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Compute BLEU score
    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result_bleu = {"bleu": result_bleu["score"]}
#     references=decoded_labels
#     result_bleu = sacrebleu.corpus_bleu(translations=decoded_preds,[references])  
#     result_bleu = {"bleu": round(bleu.score, 2)}
    
    # Compute ROUGE score
    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    result_rouge = {"rouge": result_rouge["rougeL"]}
    
    # Combine BLEU and ROUGE results
    result = {**result_bleu, **result_rouge}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    # Save the result to a file after each evaluation
    with open('metrics.txt', 'a') as f:
        f.write(f"BLEU: {result['bleu']}, ROUGE: {result['rouge']}, Gen Len: {result['gen_len']}\n")
    
    return result


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Training

In [12]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id=-100

# padding the sentence of the entire datasets
data_collator=DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
print(data_collator)

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='UBC-NLP/AraT5v2-base-1024', vocab_size=110100, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_

In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, get_linear_schedule_with_warmup
import torch
import gc
import torch.nn.init as init
import os

training_args=Seq2SeqTrainingArguments(
    
    output_dir=os.getenv("WANDB_NAME"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir=os.getenv("WANDB_NAME")+"/logs",
    logging_strategy="epoch",
    logging_steps=500,
    load_best_model_at_end=True,
    learning_rate=2e-4,
    per_device_train_batch_size= 2,
    per_device_eval_batch_size= 2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs= 7,
    predict_with_generate=True,
    fp16=True,
#     push_to_hub=True,
    report_to="wandb",
    run_name=os.getenv("WANDB_NAME"),
)

# Create Trainer instance
trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

peft_model.config.use_cache=False
torch.cuda.empty_cache()
gc.collect()
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mmoudjaramina2001[0m ([33mFinalProject_[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu,Rouge,Gen Len
1,4.2677,2.711971,9.3006,0.4268,13.9736
2,3.3333,2.423926,11.2944,0.4621,14.186
3,2.9966,2.266584,12.0064,0.4826,14.0772
4,2.8431,2.186536,12.9448,0.4942,14.24
5,2.7572,2.113259,13.3088,0.4977,14.3268
6,2.6223,2.055182,13.7195,0.5114,14.2744
7,2.5578,2.029679,13.9153,0.5134,14.254




TrainOutput(global_step=52500, training_loss=3.0539958333333335, metrics={'train_runtime': 18007.4437, 'train_samples_per_second': 5.831, 'train_steps_per_second': 2.915, 'total_flos': 8654386925371392.0, 'train_loss': 3.0539958333333335, 'epoch': 7.0})

In [14]:
# Read the final BLEU score from the saved metrics file
with open('metrics.txt', 'r') as f:
    lines = f.readlines()
    print(lines)
# Print the last line which contains the final BLEU score
if lines:
    final_metrics = lines[-1]
    print("Final Evaluation Metrics:", final_metrics)
else:
    print("No evaluation metrics found.")


['BLEU: 9.3006, ROUGE: 0.4268, Gen Len: 13.9736\n', 'BLEU: 11.2944, ROUGE: 0.4621, Gen Len: 14.186\n', 'BLEU: 12.0064, ROUGE: 0.4826, Gen Len: 14.0772\n', 'BLEU: 12.9448, ROUGE: 0.4942, Gen Len: 14.24\n', 'BLEU: 13.3088, ROUGE: 0.4977, Gen Len: 14.3268\n', 'BLEU: 13.7195, ROUGE: 0.5114, Gen Len: 14.2744\n', 'BLEU: 13.9153, ROUGE: 0.5134, Gen Len: 14.254\n']
Final Evaluation Metrics: BLEU: 13.9153, ROUGE: 0.5134, Gen Len: 14.254



In [15]:
eval_results=trainer.evaluate()  
# peft_model.eval()



In [16]:
import math

eval_results=trainer.evaluate() 
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.61


# Inference

In [17]:
# from transformers import pipeline

# text=" المملكة العربية السعودية"
# translator=pipeline("translation",model=os.getenv("WANDB_NAME"))
# translator(text)
peft_model.config.use_cache=True
context=tokenizer(["يعتبر مالك بن نبي أن الفكرة هي اللبنة الأساسية في بناء أي حضارة. الفكرة تُشكّل الروح التي تحرك المجتمع نحو التغيير. الأفكار الكبيرة، مثل العدالة، الحرية، والإبداع، هي التي تستطيع توجيه الشعوب نحو النهضة إذا ما تم فهمها وتطبيقها بالشكل الصحيح. من دون فكرة واضحة ومتماسكة"], return_tensors="pt")
output=peft_model.generate(**context)

tokenizer.decode(output[0], skip_special_tokens=True)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

# push the model to Huggingface

In [None]:
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(os.getenv("WANDB_NAME"))