In [1]:
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install -U transformers torch wandb evaluate huggingface_hub datasets bert_score evaluate numpy peft accelerate bitsandbytes torchvision

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-vv0rwy4p
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-vv0rwy4p
  Resolved https://github.com/huggingface/transformers to commit 78b2929c0554b79e0489b451ce4ece14d265ead2
  Installing build dependencies ... [?25l- \ | / done
[?25h  Getting requirements to build wheel ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- \ done
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Created wheel for transformers: filename=transformers-4.45.0.dev0-py3-none-any.whl size=9786808 sha256=43833c48a33a417f1faba8c6552e0ac2fa5f75297c755856ce9d9341ecba3ce5
  Sto

In [2]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# deepl, chatgpt translations

## Setup

In [3]:
import wandb
from huggingface_hub import HfApi, HfFolder
import transformers

try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)


# Reproducibility

seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
transformers.set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Modeling

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq
from transformers import BitsAndBytesConfig

In [5]:
# # # # Configure any model from HF HUB
# assert input("YOU WILL REMOVE THE HUB MODEL FOR THIS, TYPE 'OK' TO PROCEED: ").upper() == 'OK'
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model_name = "facebook/m2m100_1.2B"
# #model_name= "Helsinki-NLP/opus-mt-en-ar"
# model_name= "facebook/nllb-200-distilled-600M"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# generation_config = GenerationConfig(
#     temperature=0.5,
#     do_sample=True,
#     max_length=256,
#     forced_bos_token_id = 256011, # Arabic

#     pad_token_id=tokenizer.pad_token_id,
#     bos_token_id= 256011,
#     decoder_start_token_id= 2,
#     eos_token_id= tokenizer.eos_token_id,
    
# #     num_beams = 4,
# #     early_stopping=True,
# #     top_k=50,
    
# #     renormalize_logits=True,
    
# #     # Testing Config
# #       repetition_penalty=0.5,
# #     num_return_sequences=4, # Number of sentences to generate
# #     return_dict_in_generate=True, # Returns the complete generation data from within the model.
# #     output_scores=True, # Score of each token.
# )

# tokenizer.src_lang="eng_Latn"
# tokenizer.tgt_lang="arb_Arab"

# model.push_to_hub("Abdulmohsena/Faseeh_LoRA")
# tokenizer.push_to_hub("Abdulmohsena/Faseeh_LoRA")
# generation_config.push_to_hub("Abdulmohsena/Faseeh_LoRA")

In [6]:
# Instantiating The Model
model_name = "Abdulmohsena/Faseeh"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
generation_config = GenerationConfig.from_pretrained(model_name)

# https://huggingface.co/docs/transformers/en/main_classes/text_generation

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.1k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

In [7]:
# # Compressing
# from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, LoraRuntimeConfig
# from torch.profiler import profile, record_function, ProfilerActivity

# ## Quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config)

# model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model) # prepares the whole model for kbit training

# for param in model.parameters():
#     param.requires_grad = False  # freeze the model - train adapters later
#     if param.ndim == 1:
#         # cast the small parameters (e.g. layernorm) to fp32 for stability
#         param.data = param.data.to(torch.float32)
    
# ## Low Rank Adaptation
# lora_config = LoraConfig(
# #     init_lora_weights="olora",
#     use_dora=True,
#     runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True),
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     inference_mode=False, 
#     r=16, 
#     lora_alpha=16, 
#     lora_dropout=0.05,
#     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
# )


# # model.enable_input_require_grads()
# model = get_peft_model(model, lora_config)


# # # Only train decoder weights, not encoder
# for param in model.get_base_model().model.encoder.parameters():
#     param.requires_grad = False

# model.print_trainable_parameters()

# # Pruning, not valid because we need a sparse util
# # for name, module in model.named_modules():
# #     if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
# #         prune.l1_unstructured(module, name='weight', amount=0.4)
# #         prune.remove(module, 'weight')

# # # https://huggingface.co/docs/optimum/en/concept_guides/quantization
# # # https://huggingface.co/docs/peft/en/index
# # # https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py

In [8]:
# Sanity Check
dummy = "And the Egyptian Foreign Minister assured the visitors that security is always a top priority."

model = model.to('cuda')
encoded_ar = tokenizer(dummy, return_tensors="pt").to('cuda')
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


'وأكد وزير خارجية مصر الزائرين على أن الأمن دائما أولى.'

In [9]:
dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs")

dataset = dataset['train']

README.md:   0%|          | 0.00/775 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/107338 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.filter(lambda x: (len(x['source']) < 256) & (len(x['target']) < 256))

Filter:   0%|          | 0/107338 [00:00<?, ? examples/s]

In [11]:
preprocess_function = lambda examples: tokenizer(
        examples['source'], text_target=examples['target'], max_length=256, truncation=True, padding=True, return_tensors='pt')

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.20)

Map:   0%|          | 0/90682 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, return_tensors='pt')

In [13]:
import numpy as np
from evaluate import load
import transformers
from functools import partial

#https://huggingface.co/spaces/evaluate-metric/comet

# All metrics to be tested
def bertscore(predictions, references):
    # BertSCORE for semantic translation. Read: https://arxiv.org/pdf/1904.09675
    metric = load("bertscore")
    result = metric.compute(predictions=predictions, references=references, lang="ar")
    result = {"bertscore-f1": np.mean(result['f1'])}
    
    return result

def comet(predictions, references):
    metric = load("comet")
    result = metric.compute(predictions=predictions, references=references)
    print(result['scores'])
    result = {"comet-score": np.mean(result['scores'])}
    
    return result


metric = load("bertscore")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds): 
    
    preds, labels = eval_preds
    
    # Replace unknown labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode tokens into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text for cleaniness
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Get metric scores
    result = bertscore(predictions=decoded_preds, references=decoded_labels, )

    # Get avg gen length
    prediction_lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lengths)

    result = {k: round(v, 4) for k, v in result.items()} # Round to 4 figures

    return result

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [14]:
compute_metrics((tokenizer(["test"])['input_ids'], tokenizer(["اختبار"])['input_ids']))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

{'bertscore-f1': 0.7834, 'gen_len': 1.0}

## Training

In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
train_batch_size = 2
torch.cuda.empty_cache()

# https://huggingface.co/docs/transformers/v4.44.2/performance
training_args = Seq2SeqTrainingArguments(
    ########### TEST
    max_steps=1000,
    ###########
    output_dir=f"{model_name}_LoRA",
    save_total_limit=1,
    load_best_model_at_end=True,

    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    gradient_accumulation_steps=16 // train_batch_size,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant":False},
#     torch_compile=False,
    
    logging_strategy="steps",
    logging_steps=1000,
    
    save_strategy='steps',
    save_steps=1000,
    
    eval_strategy='steps',
    eval_steps = 1000,
    metric_for_best_model="bertscore-f1",
    greater_is_better=True,
    
    weight_decay=0.01,
    warmup_steps=1_000,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    
    num_train_epochs=2,
    
    predict_with_generate=True,
    fp16=True,
    
    push_to_hub=True,
    report_to='wandb'
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

model.config.use_cache = False

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [16]:
from datetime import datetime
wandb.init(project="Faseeh",name=f"Run @ {datetime.now()}")
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240922_202524-ci4z0hju[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mRun @ 2024-09-22 20:25:24.871033[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh/runs/ci4z0hju[0m
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Bertscore-f1,Gen Len
1000,0.0986,0.068712,0.9699,24.8302


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:       eval/bertscore-f1 ▁
[34m[1mwandb[0m:            eval/gen_len ▁
[34m[1mwandb[0m:               eval/loss ▁
[34m[1mwandb[0m:            eval/runtime ▁
[34m[1mwandb[0m: eval/samples_per_second ▁
[34m[1mwandb[0m:   eval/steps_per_second ▁
[34m[1mwandb[0m:             train/epoch ▁▁▁
[34m[1mwandb[0m:       train/global_step ▁▁▁
[34m[1mwandb[0m:         train/grad_norm ▁
[34m[1mwandb[0m:     train/learning_rate ▁
[34m[1mwandb[0m:              train/loss ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:        eval/bertscore-f1 0.9699
[34m[1mwandb[0m:             eval/gen_len 24.8302
[34m[1mwandb[0m:        

In [17]:
# import torch.profiler as profiler
# with profiler.profile(activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA]) as prof:
    

# print(prof.key_averages().table(sort_by="cuda_time_total"))