In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets 
!pip install evaluate       
!pip install peft
!pip install rouge_score
!pip3 install sentencepiece -q
# !pip3 install unbabel-comet -q
!pip install sacrebleu

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.2
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00

# Loading Dataset

In [3]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "T5"
os.environ["WANDB_NOTES"] = "Fine tune nllb"
os.environ["WANDB_NAME"] = "finetune-AraT5-on-opus100-Ar2En-with-Qlora"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import get_dataset_config_names

configs=get_dataset_config_names("opus100")
print(configs)

Downloading readme:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

['af-en', 'am-en', 'an-en', 'ar-de', 'ar-en', 'ar-fr', 'ar-nl', 'ar-ru', 'ar-zh', 'as-en', 'az-en', 'be-en', 'bg-en', 'bn-en', 'br-en', 'bs-en', 'ca-en', 'cs-en', 'cy-en', 'da-en', 'de-en', 'de-fr', 'de-nl', 'de-ru', 'de-zh', 'dz-en', 'el-en', 'en-eo', 'en-es', 'en-et', 'en-eu', 'en-fa', 'en-fi', 'en-fr', 'en-fy', 'en-ga', 'en-gd', 'en-gl', 'en-gu', 'en-ha', 'en-he', 'en-hi', 'en-hr', 'en-hu', 'en-hy', 'en-id', 'en-ig', 'en-is', 'en-it', 'en-ja', 'en-ka', 'en-kk', 'en-km', 'en-kn', 'en-ko', 'en-ku', 'en-ky', 'en-li', 'en-lt', 'en-lv', 'en-mg', 'en-mk', 'en-ml', 'en-mn', 'en-mr', 'en-ms', 'en-mt', 'en-my', 'en-nb', 'en-ne', 'en-nl', 'en-nn', 'en-no', 'en-oc', 'en-or', 'en-pa', 'en-pl', 'en-ps', 'en-pt', 'en-ro', 'en-ru', 'en-rw', 'en-se', 'en-sh', 'en-si', 'en-sk', 'en-sl', 'en-sq', 'en-sr', 'en-sv', 'en-ta', 'en-te', 'en-tg', 'en-th', 'en-tk', 'en-tr', 'en-tt', 'en-ug', 'en-uk', 'en-ur', 'en-uz', 'en-vi', 'en-wa', 'en-xh', 'en-yi', 'en-yo', 'en-zh', 'en-zu', 'fr-nl', 'fr-ru', 'fr-zh', 

In [5]:
from datasets import load_dataset

dataset=load_dataset("opus100", "ar-en")
dataset

Downloading data:   0%|          | 0.00/214k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/99.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/979k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

# Data Tokenization

In [6]:
from transformers import AutoTokenizer
import torch
model_name="UBC-NLP/AraT5v2-base-1024"

tokenizer=AutoTokenizer.from_pretrained(model_name, load_in_4bit=True, device_map={'':torch.cuda.current_device()})

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
# use a sample of around 2000 instead of the complete dataset as training dataset
train_dataset=dataset['train'].shuffle(seed=42).select(range(7000))

# as evaluation dataset
eval_dataset=dataset['validation']


def preprocess_func(data):
    inputs=[ex['ar'] for ex in data['translation']]
    targets=[ex['en'] for ex in data['translation']]
    
    # tokenize each row of inputs and outputs
    model_inputs=tokenizer(inputs, truncation=True)
    labels=tokenizer(targets, truncation=True)
    
    model_inputs["labels"]=labels["input_ids"]
    return model_inputs


# We tokenize the entire dataset  

train_dataset=train_dataset.map(preprocess_func, batched=True)
eval_dataset=eval_dataset.map(preprocess_func, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
def initialize_layer_norms(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
    
    for name, param in model.named_parameters():
        if "language_adaptor" in name and "norm" in name:
            if "weight" in name:  # Layer normalization scale
                print(f"Initializing {name} with mean=0 and std=1")
                init.ones_(param.data)
            elif "bias" in name:  # Layer normalization bias
                print(f"Initializing {name} with mean=0")
                init.zeros_(param.data)

In [9]:
from peft import PeftModel, prepare_model_for_kbit_training, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
model=AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=nf4_config, device_map={'':torch.cuda.current_device()})
initialize_layer_norms(model)

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [10]:
# Freeze the original parameters
model=prepare_model_for_kbit_training(model)

peft_config=LoraConfig(
    # the task to train for (sequence-to-sequence language modeling in this case)
    task_type=TaskType.SEQ_2_SEQ_LM,
    # the dimension of the low-rank matrices
    r=5,
    # the scaling factor for the low-rank matrices
    lora_alpha=32,
    # the dropout probability of the LoRA layers
    lora_dropout=0.06,
    target_modules=["k","q","v","o"],
)

peft_model=get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 1,105,920 || all params: 368,614,656 || trainable%: 0.3000


In [11]:
print(model.config)

T5Config {
  "_name_or_path": "UBC-NLP/AraT5v2-base-1024",
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"

In [12]:
peft_model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(110208, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(110208, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.06, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=5, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=5, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
    

# BLEU score

In [13]:
import evaluate
import numpy as np
import sacrebleu

metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")

# Define a function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Define a function to compute metrics
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Compute BLEU score
    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result_bleu = {"bleu": result_bleu["score"]}
 
    
    # Compute ROUGE score
    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    result_rouge = {"rouge": result_rouge["rougeL"]}
    
    # Combine BLEU and ROUGE results
    result = {**result_bleu, **result_rouge}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    # Save the result to a file after each evaluation
    with open('metrics.txt', 'a') as f:
        f.write(f"BLEU: {result['bleu']}, ROUGE: {result['rouge']}, Gen Len: {result['gen_len']}\n")
    
    return result


2024-07-29 01:28:52.258278: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 01:28:52.258417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 01:28:52.396319: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Training

In [14]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id=-100

# padding the sentence of the entire datasets
data_collator=DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
print(data_collator)

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='UBC-NLP/AraT5v2-base-1024', vocab_size=110100, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_

In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, get_linear_schedule_with_warmup
import torch
import gc
import torch.nn.init as init
import os

training_args=Seq2SeqTrainingArguments(
    
    output_dir=os.getenv("WANDB_NAME"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir=os.getenv("WANDB_NAME")+"/logs",
    logging_strategy="epoch",
    logging_steps=500,
    load_best_model_at_end=True,
    learning_rate=2e-5, #was 2e-5
    per_device_train_batch_size= 5, #was 5
    per_device_eval_batch_size= 5, #was 5
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=37,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to="wandb",
    run_name=os.getenv("WANDB_NAME"),
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
)

optimizer = torch.optim.AdamW(peft_model.parameters(), lr=2e-5)
num_training_steps = len(train_dataset) * training_args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# Create Trainer instance
trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)
)

peft_model.config.use_cache=False
torch.cuda.empty_cache()
gc.collect()
try:
    trainer.train()
except RuntimeError as e:
    if 'out of memory' in str(e):
        print("Out of memory error occurred, trying to reduce batch size")
        torch.cuda.empty_cache()
        gc.collect()
        training_args.per_device_train_batch_size = 4
        training_args.per_device_eval_batch_size = 4
        trainer.args = training_args
        trainer.train()
    else:
        raise e
        
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmoudjaramina2001[0m ([33mFinalProject_[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Bleu,Rouge,Gen Len
1,20.549,7.236779,0.3335,0.0206,10.776
2,7.7709,4.969448,1.6117,0.1507,10.0965
3,6.5851,4.634832,3.2084,0.2373,10.784
4,5.919,4.041731,3.1119,0.2224,10.217
5,5.2552,3.912013,3.36,0.2266,10.2515
6,5.0023,3.806243,3.5542,0.2266,10.519
7,4.8417,3.704408,4.1173,0.2399,10.639
8,4.6791,3.623544,4.291,0.2447,10.8245
9,4.5914,3.582485,4.502,0.2562,10.9615
10,4.5138,3.533047,4.7634,0.2586,11.254




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [17]:
# Read the final BLEU score from the saved metrics file
with open('metrics.txt', 'r') as f:
    lines = f.readlines()
    print(lines)
# Print the last line which contains the final BLEU score
if lines:
    final_metrics = lines[-1]
    print("Final Evaluation Metrics:", final_metrics)
else:
    print("No evaluation metrics found.")


['BLEU: 0.3335, ROUGE: 0.0206, Gen Len: 10.776\n', 'BLEU: 1.6117, ROUGE: 0.1507, Gen Len: 10.0965\n', 'BLEU: 3.2084, ROUGE: 0.2373, Gen Len: 10.784\n', 'BLEU: 3.1119, ROUGE: 0.2224, Gen Len: 10.217\n', 'BLEU: 3.36, ROUGE: 0.2266, Gen Len: 10.2515\n', 'BLEU: 3.5542, ROUGE: 0.2266, Gen Len: 10.519\n', 'BLEU: 4.1173, ROUGE: 0.2399, Gen Len: 10.639\n', 'BLEU: 4.291, ROUGE: 0.2447, Gen Len: 10.8245\n', 'BLEU: 4.502, ROUGE: 0.2562, Gen Len: 10.9615\n', 'BLEU: 4.7634, ROUGE: 0.2586, Gen Len: 11.254\n', 'BLEU: 5.1723, ROUGE: 0.2678, Gen Len: 11.1965\n', 'BLEU: 5.2771, ROUGE: 0.2713, Gen Len: 11.1465\n', 'BLEU: 5.6881, ROUGE: 0.2816, Gen Len: 11.065\n', 'BLEU: 5.8538, ROUGE: 0.2867, Gen Len: 11.1105\n', 'BLEU: 5.8824, ROUGE: 0.2887, Gen Len: 11.496\n', 'BLEU: 5.8878, ROUGE: 0.2875, Gen Len: 11.2355\n', 'BLEU: 6.2879, ROUGE: 0.2936, Gen Len: 11.3215\n', 'BLEU: 6.2941, ROUGE: 0.2961, Gen Len: 11.176\n', 'BLEU: 6.5319, ROUGE: 0.3009, Gen Len: 11.5075\n', 'BLEU: 6.7714, ROUGE: 0.3048, Gen Len: 11.4

In [18]:
import math

eval_results=trainer.evaluate() 
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")



KeyboardInterrupt: 

# Inference

In [19]:
# from transformers import pipeline

# text=" المملكة العربية السعودية"
# translator=pipeline("translation",model=os.getenv("WANDB_NAME"))
# translator(text)
peft_model.config.use_cache=True
context=tokenizer(["المملكة العربية السعودية"], return_tensors="pt")
output=peft_model.generate(**context)

tokenizer.decode(output[0], skip_special_tokens=True)



'The Saudi Kingdom'

In [20]:
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(os.getenv("WANDB_NAME"))

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yasmineee/finetune-AraT5-on-opus100-Ar2En-with-Qlora/commit/593b0ef3fc9caba35e6f861713789e639214468e', commit_message='finetune-AraT5-on-opus100-Ar2En-with-Qlora', commit_description='', oid='593b0ef3fc9caba35e6f861713789e639214468e', pr_url=None, pr_revision=None, pr_num=None)