In [3]:
# Install necessary libraries and modules
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('json', data_files={
    'train': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/training/singlish_training_dataset1_part1.jsonl',
    'validation': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/validation/singlish_validation_dataset1_part1.jsonl'
})

# Set the device to CPU
device = torch.device("cpu")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/home/rox/llama-singlish/")

# Set the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="longest", truncation=True)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load the model with 8-bit quantization on CPU
model_name = "/home/rox/llama-singlish/"

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_compute_dtype=torch.float32
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="cpu",  # Explicitly load the model on the CPU
    torch_dtype=torch.float32  # Use float32 for CPU compatibility
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure the PEFT model
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)
peft_model = get_peft_model(model=model, peft_config=peft_config)

# Trainer configuration
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./output",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        remove_unused_columns=False,
        fp16=False,  # Disable fp16 since we're using 8-bit quantization
        dataloader_num_workers=2,
        torch_compile=False,  # Disable TorchScript compilation for CPU
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Start training
trainer.train()

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [2]:
# Install necessary libraries and modules
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
import torch
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('json', data_files={
    'train': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/training/singlish_training_dataset1_part1.jsonl',
    'validation': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/validation/singlish_validation_dataset1_part1.jsonl'
})

# Set the device to CPU to ensure the model is loaded into RAM
device = torch.device("cpu")

# Clear CUDA cache (optional since we're using CPU)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/home/rox/llama-singlish/")

# Set the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="longest", truncation=True)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Configure 8-bit quantization with CPU offloading
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    load_in_8bit_fp32_cpu_offload=True  # Offload some layers to the CPU to reduce memory usage
)

# Load the model with 8-bit quantization on CPU
model_name = "/home/rox/llama-singlish/"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # Automatically map layers to CPU if GPU is insufficient
    torch_dtype=torch.float32  # Use float32 for CPU compatibility
)

# Configure the PEFT model
peft_config = LoraConfig(
    r=8,  # Rank of the low-rank adaptation
    lora_alpha=32,  # Scaling factor for the low-rank adaptation
    target_modules=["q_proj", "v_proj"],  # Make sure these are valid for LLaMA 3.1
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none"
)
peft_model = get_peft_model(model=model, peft_config=peft_config).to(device)

# Trainer configuration
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./output",
        num_train_epochs=10,
        per_device_train_batch_size=4,  # Adjust batch size as needed
        per_device_eval_batch_size=4,  # Adjust batch size as needed
        gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        remove_unused_columns=False,
        fp16=False,  # Disable fp16 since we're using the CPU
        dataloader_num_workers=2,  # Number of subprocesses to use for data loading
        torch_compile=False,  # Disable TorchScript compilation for CPU
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Start training
trainer.train()


Unused kwargs: ['load_in_8bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 