In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# Read the training and validation splits from local JSONL files.
dataset = load_dataset(
    "json",
    data_files={
        "train": "/home/rox/datasets/singlish/singlish_training_dataset2.jsonl",
        "validation": "/home/rox/datasets/singlish/singlish_validation_dataset2.jsonl",
    },
)

# Local path handed to `from_pretrained` for the tokenizer (and the model).
model_name = "/home/rox/llama-singlish/"

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer that matches the checkpoint stored at `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some tokenizers ship without a padding token; reuse EOS so padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize the dataset
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of texts into fixed-length causal-LM training features.

    Every sequence is padded or truncated to ``max_length`` so that all rows
    have the same length and can be stacked into a batch regardless of which
    ``Dataset.map`` batch they came from. A ``labels`` entry is added so the
    model can compute a language-modeling loss.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to
            encode (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: Number of tokens each sequence is padded/truncated to.
            NOTE(review): 512 is a placeholder default -- tune it to the
            dataset's length distribution and the available VRAM.

    Returns:
        The tokenizer output with an extra ``"labels"`` list per example.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # Labels mirror the input ids; padded positions become -100, the index the
    # cross-entropy loss ignores. The attention mask decides what is padding
    # because the pad token may share its id with EOS, so ids alone cannot.
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Tokenize every split; `batched=True` hands the function lists of examples.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load the base weights in 8-bit via bitsandbytes to reduce memory usage.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

# NOTE(review): the output captured below this cell shows a ValueError about
# calling `.to` on a bitsandbytes model. Nothing in this cell calls `.to`
# itself, so the call originates inside the libraries -- check that the
# installed transformers / accelerate / bitsandbytes versions are compatible.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
)

# PEFT's documented preprocessing step for training on a quantized model:
# it freezes the base weights, upcasts the remaining half-precision
# parameters to float32 and, by default, enables gradient checkpointing.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings.
peft_config = LoraConfig(
    r=4,  # Low rank keeps the number of trainable parameters small
    lora_alpha=16,  # Scaling factor applied to the adapter update
    # NOTE(review): confirm these module names exist in the loaded model.
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none",
    task_type="CAUSAL_LM",  # Wrap the model with the causal-LM PEFT class
)

# Attach the adapters; only their weights remain trainable.
peft_model = get_peft_model(model, peft_config)

# Build the Trainer around the LoRA-wrapped model and the tokenized splits.
trainer = Trainer(
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    args=TrainingArguments(
        # Where checkpoints and logs are written.
        output_dir="./output",
        logging_dir="./logs",
        # Schedule and regularisation.
        num_train_epochs=5,
        warmup_steps=300,
        weight_decay=0.01,
        # 2 sequences per device x 4 accumulation steps = 8 per optimizer step.
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        # Mixed-precision training.
        fp16=True,
        # Data loading: two worker subprocesses, and dataset columns are
        # passed through to the collator instead of being pruned.
        dataloader_num_workers=2,
        remove_unused_columns=False,
    ),
)

# Run the fine-tuning loop.
trainer.train()

# Hand cached, unused GPU memory back to the driver once training is done.
torch.cuda.empty_cache()


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.


ValueError: `.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.