In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from datasets import DatasetDict


2026-01-03 13:03:56.553326: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767445436.751714      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767445436.805185      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767445437.290493      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767445437.290529      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767445437.290532      23 computation_placer.cc:177] computation placer alr

In [3]:
from datasets import DatasetDict

In [4]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# New instruction dataset
# guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model
new_model = "llama-2-7b-chat-guanaco_fined_turned"

In [5]:
from datasets import load_dataset

raw_dataset = load_dataset(
    "json",
    data_files="/kaggle/input/legal-datasets/QAPairs (3).jsonl"
)


Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
def to_llama2_format(example):
    user_msg = None
    assistant_msg = None

    for msg in example["messages"]:
        if msg["role"] == "user":
            user_msg = msg["content"].strip()
        elif msg["role"] == "assistant":
            assistant_msg = msg["content"].strip()

    if user_msg is None or assistant_msg is None:
        return {"text": ""}  # safe fallback

    return {
        "text": f"<s>[INST] {user_msg} [/INST] {assistant_msg} </s>"
    }


In [7]:
processed_train = raw_dataset["train"].map(
    to_llama2_format,
    remove_columns=raw_dataset["train"].column_names
)

dataset = DatasetDict({
    "train": processed_train
})


Map:   0%|          | 0/2573 [00:00<?, ? examples/s]

In [8]:
# dataset = dataset["train"].shuffle(seed=42).select(range(10))
# dataset = DatasetDict({"train": dataset})

In [9]:
print(dataset)
print(dataset["train"][0]["text"])


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2573
    })
})
<s>[INST] What is the title of the document? [/INST] The Constitution of Nepal </s>


In [10]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,                       #data will be loaded in 4-bit format
    bnb_4bit_quant_type="nf4",               #specifying the quantization type
    bnb_4bit_compute_dtype=compute_dtype,    #float16 .&. NormalFloat 4-bit
    bnb_4bit_use_double_quant=False,         #double quantization will not be used.
)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,                    # "NousResearch/Llama-2-7b-chat-hf"
    quantization_config=quant_config,  # Apply 4-bit quantization
                # Load entire model to GPU 0
    device_map={"": 0},
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,

)

#  use CPU-offloading: device_map="auto" with some layers on CPU

model.config.use_cache = False    # Disables KV cache for training
model.config.pretraining_tp = 1   # Sets tensor parallelism to 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token   # Set padding token = end-of-sequence
tokenizer.padding_side = "right"           # Pad on the right side

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [13]:
#  LoRA Configuration
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [14]:
# Tokenize your dataset before passing to SFTTrainer
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,      # Cut longer sequences to 512 tokens
        padding="max_length", # Pad shorter sequences to 512 tokens
        max_length=512        # Fixed length for all sequences
    )

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/2573 [00:00<?, ? examples/s]

In [15]:
from trl import SFTConfig

In [16]:
# Option 2: Use BFloat16 instead of FP16 (recommended for modern GPUs)
training_params = SFTConfig(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,  # Disable FP16
    bf16=torch.cuda.is_bf16_supported(),  # Enable BF16 if supported
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    packing=False,
    dataset_text_field="text",
)

In [17]:
trainer = SFTTrainer(
    model=model,                    # Your quantized model
    train_dataset=tokenized_datasets['train'],  # Pre-tokenized dataset
    peft_config=peft_params,        # LoRA configuration
    args=training_params,           # Training hyperparameters
)

Truncating train dataset:   0%|          | 0/2573 [00:00<?, ? examples/s]

In [18]:
# import os
# os.environ["ACCELERATE_DISABLE_MEMORY_CLEANUP"] = "1"

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss
25,2.3793
50,0.4629
75,0.3347
100,0.2837
125,0.2875
150,0.293
175,0.2897
200,0.2868
225,0.2698
250,0.2719




TrainOutput(global_step=322, training_loss=0.4588400695634925, metrics={'train_runtime': 13568.6049, 'train_samples_per_second': 0.19, 'train_steps_per_second': 0.024, 'total_flos': 5.249135824129229e+16, 'train_loss': 0.4588400695634925, 'entropy': 0.13438818298957564, 'num_tokens': 1317376.0, 'mean_token_accuracy': 0.9674212742935527, 'epoch': 1.0})

In [19]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('llama-2-7b-chat-guanaco_fined_turned/tokenizer_config.json',
 'llama-2-7b-chat-guanaco_fined_turned/special_tokens_map.json',
 'llama-2-7b-chat-guanaco_fined_turned/tokenizer.model',
 'llama-2-7b-chat-guanaco_fined_turned/added_tokens.json',
 'llama-2-7b-chat-guanaco_fined_turned/tokenizer.json')