In [1]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# deepl, chatgpt translations

## Setup

In [2]:
import wandb
from huggingface_hub import HfApi, HfFolder
import transformers

try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)


# Reproducibility

seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
transformers.set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\.netrc


## Modeling

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, GenerationConfig
from peft import PeftModel, prepare_model_for_kbit_training

In [4]:
# Instantiating The Model
train_lora = True

model_name = "facebook/nllb-200-distilled-600M"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
generation_config = GenerationConfig.from_pretrained("AbdulmohsenA/Faseeh")

In [6]:
if train_lora:
    
    q_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4")
    
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=q_config)
    model = prepare_model_for_kbit_training(model)
    
    model_name = "Abdulmohsena/Faseeh_LoRA"
    
    model = PeftModel.from_pretrained(model, model_name, is_trainable=True)
    
    model.print_trainable_parameters()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.04 GiB is allocated by PyTorch, and 25.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
# Sanity Check
dummy = "And the Egyptian Foreign Minister assured the visitors that security is always a top priority."

model = model.to('cuda')
encoded_ar = tokenizer(dummy, return_tensors="pt").to('cuda')
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


'زازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازازا'

In [9]:
dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs-Downsampled")

dataset = dataset['train']

README.md:   0%|          | 0.00/775 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/107338 [00:00<?, ? examples/s]

In [10]:
preprocess_function = lambda examples: tokenizer(
        examples['source'], text_target=examples['target'], max_length=256, truncation=True, padding=True, return_tensors='pt')

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.20)

Filter:   0%|          | 0/107338 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, return_tensors='pt')

Map:   0%|          | 0/90682 [00:00<?, ? examples/s]

In [13]:
import numpy as np
from evaluate import load
import transformers
from functools import partial

#https://huggingface.co/spaces/evaluate-metric/comet
def postprocess_text(preds, labels):
    # Strip whitespace and normalize
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels


# All metrics to be tested
def bertscore(predictions, references):
    # BertSCORE for semantic translation. Read: https://arxiv.org/pdf/1904.09675
    metric = load("bertscore")
    result = metric.compute(predictions=predictions, references=references, lang="ar")
    result = {"bertscore-f1": np.mean(result['f1'])}
    
    return result

def comet(predictions, references):
    metric = load("comet")
    result = metric.compute(predictions=predictions, references=references)
    print(result['scores'])
    result = {"comet-score": np.mean(result['scores'])}
    
    return result


metric = load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # Replace label -100 with the padding token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode predictions and labels into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Postprocess text to remove unnecessary spaces
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Compute BERTScore (with batch processing)
    bertscore_results = metric.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        lang="ar",  # Adjust for your target language
        device='cuda' if torch.cuda.is_available() else 'cpu'  # Ensure GPU usage
    )
    
    # Get the average generation length
    prediction_lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    
    # Prepare final result
    result = {
        "precision": round(np.mean(bertscore_results['precision']), 4),
        "recall": round(np.mean(bertscore_results['recall']), 4),
        "f1": round(np.mean(bertscore_results['f1']), 4),
        "gen_len": round(np.mean(prediction_lengths), 4)
    }
    
    return result

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [14]:
compute_metrics((tokenizer(["test"])['input_ids'], tokenizer(["اختبار"])['input_ids']))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

{'bertscore-f1': 0.7834, 'gen_len': 1.0}

## Training

In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
train_batch_size = 16
eval_steps = 1000
torch.cuda.empty_cache()

# https://huggingface.co/docs/transformers/v4.44.2/performance
training_args = Seq2SeqTrainingArguments(
    
    output_dir=f"{model_name}",
    save_total_limit=1,
    load_best_model_at_end=True,

    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant":False},
#     torch_compile=False,
    
    logging_strategy="steps",
    logging_steps=eval_steps // 2,
    
    save_strategy='steps',
    save_steps=eval_steps,
    
    eval_strategy='steps',
    eval_steps = eval_steps,
    metric_for_best_model="f1",
    greater_is_better=True,
    
    weight_decay=0.01,
    warmup_steps=eval_steps,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    
    num_train_epochs=2,
    
    predict_with_generate=True,
    fp16=True,
    
    push_to_hub=True,
    report_to='wandb',
    
    dataloader_num_workers=4,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

model.config.use_cache = False
model.gradient_checkpointing_enable()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [16]:
from datetime import datetime
wandb.init(project="Faseeh",name=f"Run @ {datetime.now()}")
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240922_202524-ci4z0hju[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mRun @ 2024-09-22 20:25:24.871033[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh/runs/ci4z0hju[0m
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Bertscore-f1,Gen Len
1000,0.0986,0.068712,0.9699,24.8302


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:       eval/bertscore-f1 ▁
[34m[1mwandb[0m:            eval/gen_len ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁
[34m[1mwandb[0m:       train/global_step ▁▁▁
[34m[1mwandb[0m:         train/grad_norm ▁
[34m[1mwandb[0m:     train/learning_rate ▁
[34m[1mwandb[0m:              train/loss ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:        eval/bertscore-f1 0.9699
[34m[1mwandb[0m:             eval/gen_len 24.8302
[34m[1mwandb[0m:        

In [17]:
# import torch.profiler as profiler
# with profiler.profile(activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA]) as prof:
    

# print(prof.key_averages().table(sort_by="cuda_time_total"))