In [1]:
import wandb
import torch
from accelerate import Accelerator
import os
import numpy as np
from huggingface_hub import HfApi, HfFolder
import transformers
from tqdm import tqdm
from peft import PeftModel, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import load_dataset
from evaluate import load
from transformers import get_scheduler
from datetime import datetime

In [2]:
try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)

# Reproducibility
seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
transformers.set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\.netrc


In [3]:

model_name = "AbdulmohsenA/Faseeh"
lora_name = "AbdulmohsenA/Faseeh_LoRA"

quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                                quantization_config=quantization_config,
                                                torch_dtype=torch.float16,
                                                low_cpu_mem_usage=True)
prepare_model_for_kbit_training(model)

model = PeftModel.from_pretrained(model, lora_name, is_trainable=True)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 619,792,384 || trainable%: 0.7613


In [4]:
model_name = "Abdulmohsena/Faseeh"

tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
generation_config = GenerationConfig.from_pretrained(model_name)

In [5]:
dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs")

dataset = dataset['train']

In [6]:
preprocess_function = lambda examples: tokenizer(
        examples['source'], text_target=examples['target'], max_length=256, truncation=True, padding=True, return_tensors='pt')

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['source', 'target'])
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.20)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, return_tensors='pt')

In [7]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

metric = load("bertscore")

def compute_metrics(preds, labels):
    labels = torch.where(labels != -100, labels, torch.tensor(tokenizer.pad_token_id).to(labels.device))

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Postprocess text to remove unnecessary spaces
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bertscore_results = metric.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        lang="ar",
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    
    # Get the average generation length
    prediction_lengths = [(pred != tokenizer.pad_token_id).sum().item() for pred in preds]
    
    # Prepare final result
    result = {
        "precision": round(np.mean(bertscore_results['precision']), 4),
        "recall": round(np.mean(bertscore_results['recall']), 4),
        "f1": round(np.mean(bertscore_results['f1']), 4),
        "gen_len": round(np.mean(prediction_lengths), 4)
    }
    
    return result

In [16]:
accelerator = Accelerator(mixed_precision='fp16')

model = model.to(accelerator.device)  # Send the model to device
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Define learning rate scheduler
num_update_steps_per_epoch = len(tokenized_dataset["train"]) // (24)
num_training_steps = num_update_steps_per_epoch * 2

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=50,
    num_training_steps=num_training_steps
)

train_dataloader = torch.utils.data.DataLoader(tokenized_dataset["train"], batch_size=24, shuffle=True, collate_fn=data_collator)
eval_dataloader = torch.utils.data.DataLoader(tokenized_dataset["test"], batch_size=2, collate_fn=data_collator)

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

model.gradient_checkpointing_disable()
model.gradient_checkpointing_kwargs = {"use_reentrant": False}

ValueError: AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `mixed_precision='fp16'` to `Accelerator()`.

In [9]:
def train_model(model, train_dataloader, optimizer, lr_scheduler, accelerator):
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_dataloader)

    for step, batch in enumerate(progress_bar):
        outputs = model(**batch)
        loss = outputs.loss

        accelerator.backward(loss)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        train_loss += loss.item()

        progress_bar.set_postfix({"loss": train_loss / (step + 1)})

    return train_loss / len(train_dataloader)

def evaluate_model(model, eval_dataloader, accelerator):
    model.eval()

    eval_f1 = 0
    eval_loss = 0
    progress_bar = tqdm(eval_dataloader)
    
    for step, batch in enumerate(progress_bar):
        with torch.no_grad():
            outputs = model(**batch)

            # Convert logits to tokens
            predictions = torch.argmax(outputs.logits, dim=2)
            labels = batch['labels']

            predictions = accelerator.gather(predictions)
            labels = accelerator.gather(labels)

            metrics = compute_metrics(predictions, labels)
            
        eval_loss += outputs.loss.item()
        eval_f1 += metrics['f1']

        progress_bar.set_postfix({"loss": eval_loss / (step + 1)})

    return {"eval_loss": eval_loss / len(eval_dataloader),
                     "f1": eval_f1 / len(eval_dataloader)}

In [10]:
train_dataloader = torch.utils.data.DataLoader(tokenized_dataset["train"].select(range(5)), batch_size=1, shuffle=True, collate_fn=data_collator)
eval_dataloader = torch.utils.data.DataLoader(tokenized_dataset["test"].select(range(5)), batch_size=2, collate_fn=data_collator)

In [11]:
wandb.init(project="Faseeh", name=F"Faseeh @ {datetime.now()}")
torch.cuda.empty_cache()
for epoch in range(2):
    train_loss = train_model(model, train_dataloader, optimizer, lr_scheduler, accelerator)
    eval_metrics = evaluate_model(model, eval_dataloader, accelerator)
    
    eval_metrics['train_loss'] = train_loss
    wandb.log(eval_metrics, step=epoch)

100%|██████████| 5/5 [00:03<00:00,  1.56it/s, loss=0.0481]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 3/3 [00:08<00:00,  2.73s/it, loss=0.0263]
100%|██████████| 5/5 [00:02<00:00,  1.76it/s, loss=0.0383]
100%|██████████| 3/3 [00:04<00:00,  1.43s/it, loss=0.0262]


In [12]:
model = accelerator.unwrap_model(model)
# Save model artifact
model_artifact = wandb.Artifact("model", type="model")

accelerator.save_model(model, "../experiments/models/")
model_artifact.add_file("../experiments/models/model.safetensors")

# wandb.log_artifact(model_artifact)
model.push_to_hub("Abdulmohsena/Faseeh_LoRA", token=True, max_shard_size="5GB", safe_serialization=True)
wandb.finish()



VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval_loss,█▁
f1,▁▁
train_loss,█▁

0,1
eval_loss,0.02616
f1,0.97957
train_loss,0.03833


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Abdulmohsena/Faseeh_LoRA/commit/edc22b26c2743456afc9c947f43c7e03460caa45', commit_message='Upload model', commit_description='', oid='edc22b26c2743456afc9c947f43c7e03460caa45', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Abdulmohsena/Faseeh_LoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='Abdulmohsena/Faseeh_LoRA'), pr_revision=None, pr_num=None)