In [2]:
# Install necessary libraries and modules
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
import torch
from datasets import load_dataset
 
# Load the dataset
dataset = load_dataset('json', data_files={
    'train': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/training/singlish_training_dataset1_part1.jsonl',
    'validation': '/home/rox/singlish-chatbot/datasets/singlish/huggingface/validation/singlish_validation_dataset1_part1.jsonl'
})
 
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clear CUDA cache
if (device == "cuda"):
    torch.cuda.empty_cache()
 
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/home/rox/llama-singlish/")
 
# Set the padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
 
# Tokenize the dataset
def tokenize_function(examples):
    # Use "longest" to dynamically pad to the longest sequence in each batch
    return tokenizer(examples["text"], padding="longest", truncation=True)
 
# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
# Load the model
model_name = "/home/rox/llama-singlish/"
 
# Configure quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True  # Use 16-bit quantization
)
 
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config)
 
# Configure the PEFT model
peft_config = LoraConfig(
    r=8,  # Rank of the low-rank adaptation
    lora_alpha=32,  # Scaling factor for the low-rank adaptation
    target_modules=["q_proj", "v_proj"],  # Make sure these are valid for LLaMA 3.1
    lora_dropout=0.1,  # Dropout rate for LoRA
    bias="none"
)
peft_model = get_peft_model(model=model, peft_config=peft_config).to(device)
 
# Trainer configuration
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./output",
        num_train_epochs=10,
        per_device_train_batch_size=4,  # Reduce batch size
        per_device_eval_batch_size=4,  # Reduce batch size
        gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        remove_unused_columns=False,
        fp16=True,  # Use mixed precision training
        dataloader_num_workers=2,  # Number of subprocesses to use for data loading
        torch_compile=True,  # Enable TorchScript compilation
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)
 
# Start training
trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/tmp/tmp2yc3ii58/main.c:5:10: fatal error: Python.h: No such file or directory
    5 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.
/tmp/tmpulwhuu6v/main.c:5:10: fatal error: Python.h: No such file or directory
    5 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.


BackendCompilerFailed: backend='inductor' raised:
CalledProcessError: Command '['/bin/gcc', '/tmp/tmpulwhuu6v/main.c', '-O3', '-shared', '-fPIC', '-o', '/tmp/tmpulwhuu6v/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/usr/local/lib/python3.11/dist-packages/triton/backends/nvidia/lib', '-L/usr/lib/wsl/lib', '-I/usr/local/lib/python3.11/dist-packages/triton/backends/nvidia/include', '-I/tmp/tmpulwhuu6v', '-I/usr/include/python3.11']' returned non-zero exit status 1.

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
