In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets 
!pip install evaluate    
!pip install sacrebleu
!pip install peft
!pip install rouge_score

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m350.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting 

# Loading Dataset

In [2]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["WANDB_API_KEY"]=user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "Graduation_notebooks"
os.environ["WANDB_NOTES"] = "Fine tune t5"
os.environ["WANDB_NAME"] = "araT5-Base-with-DoRA"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset, DatasetDict

# Load the dataset with the first 20,000 examples
ds = load_dataset("Helsinki-NLP/un_pc", "ar-en", split="train[:20000]")

train_test_val_split = ds.train_test_split(test_size=5000, seed=42)  
# Further split the 5,000 examples into 2,500 for test and 2,500 for validation
test_validation_split = train_test_val_split['test'].train_test_split(test_size=0.5, seed=42)

# Create a new dataset dictionary with the updated splits
ds_split = DatasetDict({
    'train': train_test_val_split['train'],  # 15,000 examples for training
    'test': test_validation_split['train'],  # 2,500 examples for testing
    'validation': test_validation_split['test'],  # 2,500 examples for validation
})

# Check the resulting dataset
ds_split

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/17 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/20044478 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2500
    })
})

# Data Tokenization

In [5]:
from transformers import AutoTokenizer
import torch
model_name="UBC-NLP/AraT5v2-base-1024"

tokenizer=AutoTokenizer.from_pretrained(model_name, device_map={'':torch.cuda.current_device()})

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [6]:
source_lang="ar"
target_lang="en"
prefix="translate Arabic to English: "

def preprocess_function(examples):
    inputs =[prefix+example[source_lang] for example in examples["translation"]]
    targets=[example[target_lang] for example in examples["translation"]]
    model_inputs=tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs


tokenized_ds=ds_split.map(preprocess_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

# Preparing the model

In [8]:
from peft import PeftModel, prepare_model_for_kbit_training, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

model=AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map={'':torch.cuda.current_device()})

config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [9]:
# Freeze the original parameters
model=prepare_model_for_kbit_training(model)

peft_config=LoraConfig(
    # the task to train for (sequence-to-sequence language modeling in this case)
    task_type=TaskType.SEQ_2_SEQ_LM,
    # the dimension of the low-rank matrices
    r=5,
    # the scaling factor for the low-rank matrices
    lora_alpha=32,
    # the dropout probability of the LoRA layers
    lora_dropout=0.06,
    use_dora= True,
    target_modules=["k","q","v","o"],
)

peft_model=get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 1,216,512 || all params: 368,725,248 || trainable%: 0.3299


In [10]:
print(model.config)

T5Config {
  "_name_or_path": "UBC-NLP/AraT5v2-base-1024",
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 110208
}



In [11]:
peft_model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(110208, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(110208, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.06, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=5, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=5, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

# evaluation scores

In [12]:
import evaluate
import numpy as np
import sacrebleu

metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")

# Define a function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Define a function to compute metrics
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Compute BLEU score
    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result_bleu = {"bleu": result_bleu["score"]}
    
    # Compute ROUGE score
    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    result_rouge = {"rouge": result_rouge["rougeL"]}
    
    # Combine BLEU and ROUGE results
    result = {**result_bleu, **result_rouge}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    # Save the result to a file after each evaluation
    with open('metrics.txt', 'a') as f:
        f.write(f"BLEU: {result['bleu']}, ROUGE: {result['rouge']}, Gen Len: {result['gen_len']}\n")
    
    return result


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Training

In [13]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id=-100

# padding the sentence of the entire datasets
data_collator=DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [14]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, get_linear_schedule_with_warmup
import torch
import gc
import torch.nn.init as init
import os

training_args=Seq2SeqTrainingArguments(
    
    output_dir=os.getenv("WANDB_NAME"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir=os.getenv("WANDB_NAME")+"/logs",
    logging_strategy="epoch",
    logging_steps=500,
    load_best_model_at_end=True,
    learning_rate=2e-4,
    per_device_train_batch_size= 2,
    per_device_eval_batch_size= 2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs= 5,
    predict_with_generate=True,
    fp16=True,
    #Mixed precision training typically
    #maintains accuracy by only performing part of the computations in FP16, while keeping critical values like model weights in full precision (FP32)
#     push_to_hub=True,
    report_to="wandb",
    run_name=os.getenv("WANDB_NAME"),
)

# Create Trainer instance
trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

peft_model.config.use_cache=False
torch.cuda.empty_cache()
gc.collect()
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mmoudjaramina2001[0m ([33mFinalProject_[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu,Rouge,Gen Len
1,4.2609,2.612922,9.8191,0.4309,14.0316
2,3.2235,2.314075,11.3505,0.4801,13.944
3,2.9623,2.188471,12.2927,0.4951,14.0504
4,2.7918,2.104445,12.7617,0.5053,14.0332
5,2.7009,2.082434,13.0059,0.5123,14.0744




TrainOutput(global_step=37500, training_loss=3.1879013541666668, metrics={'train_runtime': 19281.8998, 'train_samples_per_second': 3.89, 'train_steps_per_second': 1.945, 'total_flos': 6193692685320192.0, 'train_loss': 3.1879013541666668, 'epoch': 5.0})

In [16]:
import math

eval_results=trainer.evaluate() 
print(eval_results)
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")



{'eval_loss': 2.0824337005615234, 'eval_bleu': 13.0059, 'eval_rouge': 0.5123, 'eval_gen_len': 14.0744, 'eval_runtime': 1248.982, 'eval_samples_per_second': 2.002, 'eval_steps_per_second': 1.001, 'epoch': 5.0}
Perplexity: 8.02


# Inference

In [17]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["كان يوماً طويلاً"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)


I had a long day, a long day, a long day,


In [18]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["سأفكر في الأمر"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

, I will understand the matter. I will be thinking in the matter.


In [19]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["ما هو رأيك؟"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

The views of your view are the view of your view.


In [20]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["هل هذا المكان مزدحم دائمًا؟"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)


this place is a دائم الازدحام؟


In [21]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["قررت أن أغير وظيفتي لأبحث عن فرص جديدة"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

to change my job to find new فرص for new فرص


In [22]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["أريد أن أتعلم لغة جديدة لتوسيع فرصي المهنية"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

I want to learn new language to expand professional courses and to expand professional courses


In [23]:
from transformers import pipeline

peft_model = peft_model.to("cuda")  # Move model to GPU
context = tokenizer(["الطريق إلى النجاح مليء بالتحديات"], return_tensors="pt").to("cuda")  # Move input to GPU
output = peft_model.generate(**context)  # Generate
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

to success is filled with challenges


# push the model to Huggingface

In [25]:
!python -c 'from huggingface_hub.hf_api import HfFolder; HfFolder.save_token("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")'

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
tokenizer.push_to_hub(os.getenv("WANDB_NAME"))
trainer.push_to_hub(os.getenv("WANDB_NAME"))

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yasmineee/araT5-Base-with-DoRA/commit/76ede3558eeda28e25fb51cbc61c921f1ce94901', commit_message='araT5-Base-with-DoRA', commit_description='', oid='76ede3558eeda28e25fb51cbc61c921f1ce94901', pr_url=None, pr_revision=None, pr_num=None)