In [1]:
#!pip install -q -U transformers datasets accelerate peft trl bitsandbytes wandb

In [11]:
import os

# Select the GPU for this process: devices are ordered by PCI bus id and only
# device index 1 is exposed. This cell runs before the cell that imports torch.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

In [12]:
import gc
# Run a garbage-collection pass. Because this call is the cell's last
# expression, the integer returned by gc.collect() is shown as the cell output.
gc.collect()

2412

In [13]:
import wandb

# Authenticate with Weights & Biases so the training run can be logged
# (the trainer is configured with report_to="wandb").
# Credentials must come from the environment (e.g. the WANDB_API_KEY variable)
# or from the interactive prompt. Never paste an API key into the notebook:
# any key that has been committed must be treated as compromised and revoked.
wandb.login()



True

In [15]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
# Language pair used to build the dataset and model names:
# a human-readable name plus the short language code for each side.
source_lang = "Spanish"
source_lang_iso = "spa"

target_lang = "Wayuu"
target_lang_iso = "guc"  # or pbb

Set Model Names, Load Dataset and Tokenizer

In [16]:
# Checkpoint names: the hub model that gets fine-tuned, and the name given to
# the fine-tuned result (derived from the configured language codes).
base_model = "NousResearch/Llama-2-7b-chat-hf"
new_model = f"llama-2-7b-chat-translate-{source_lang_iso}-{target_lang_iso}"

# Training split of the instruction dataset for the configured language pair.
dataset = load_dataset(
    f"Broomva/instruct-{source_lang_iso}-{target_lang_iso}",
    split="train",
)

# Fast tokenizer of the base model; the EOS token is reused as the padding
# token and padding is applied on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Define Quantization and LoRA Settings, Load the Base Model

In [17]:
# 4-bit quantization settings, passed to from_pretrained when the base model
# is loaded further down in this cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_quant_type="nf4",             # use the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # compute in float16
)

# LoRA adapter hyper-parameters for a causal-language-modelling task;
# handed to the SFTTrainer as peft_config.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Load the base model with the quantization settings above and prepare it for
# k-bit training in one step. The preparation casts the layernorm to fp32,
# makes the output embedding layer require grads, and adds the upcasting of
# the lm head to fp32.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
    )
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Create the Trainer instance

In [19]:
from transformers import EarlyStoppingCallback

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_arguments = TrainingArguments(
    # --- output, logging and publishing ---
    output_dir=f"./results/{new_model}",
    report_to="wandb",
    push_to_hub=True,
    logging_steps=1,
    # --- run length ---
    # NOTE(review): both num_train_epochs and max_steps are set; per the
    # TrainingArguments documentation a positive max_steps takes precedence,
    # which would cap the run at 50 steps -- confirm this is intended.
    num_train_epochs=3,
    max_steps=50,
    # --- batching and precision ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=True,
    # --- optimiser and learning-rate schedule ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    # --- evaluation and checkpointing (same 25-step cadence for both) ---
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    load_best_model_at_end=True,
)

# Hold out 10% of the examples for evaluation. Evaluating on the very same
# data the model is trained on makes the validation loss, the early-stopping
# callback and load_best_model_at_end select checkpoints by training-set fit
# instead of generalisation. The split is seeded so re-running the cell gives
# the same partition; `dataset` itself is left untouched.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    # Stop as soon as one evaluation fails to improve on the best one so far.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Train model
# Runs the fine-tuning loop of the `trainer` built in the previous cell.
trainer.train()

# Save trained model
# Written to the local path named by `new_model`; the merge step later in the
# notebook loads the LoRA weights from that same path with
# PeftModel.from_pretrained.
trainer.model.save_pretrained(new_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
# Empty VRAM
import gc

# Drop the notebook-level references to the training objects. Names are removed
# with pop(..., None) instead of `del` so the cell also works when a name was
# never defined (e.g. `pipe`, which only exists if an inference pipeline was
# created) and can be re-run safely; a failing `del` would abort the cell
# before the remaining objects are released.
for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreachable objects, then hand the memory cached by PyTorch's
# CUDA allocator back to the device; garbage collection alone does not do that.
gc.collect()
torch.cuda.empty_cache()
gc.collect()

In [11]:
# Extra garbage-collection pass after the cleanup cell; the returned integer is
# displayed as the cell output.
gc.collect()

0

Merge Fine-Tuned Weights with the Original Model

In [12]:
# Reload the base model in FP16, attach the LoRA weights saved under
# `new_model`, and merge them into the base weights so that `model` ends up as
# a plain model without the adapter wrapper.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        return_dict=True,
    ),
    new_model,
).merge_and_unload()

# Reload tokenizer to save it
# Loaded exactly as in the training setup (use_fast=True). trust_remote_code is
# deliberately not enabled: this tokenizer loads without it earlier in the
# notebook, and enabling it would allow code shipped in the hub repository to
# be executed locally.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Push new model and tokenizer to the hub
# Both uploads target the repository named by `new_model`. The tokenizer push
# stays the cell's last expression, so its return value is what the cell
# displays.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Broomva/llama-2-7b-chat-spa-guc/commit/d56aac8a11c4091cc9471b0180a8908f18363950', commit_message='Upload tokenizer', commit_description='', oid='d56aac8a11c4091cc9471b0180a8908f18363950', pr_url=None, pr_revision=None, pr_num=None)