In [None]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from datasets import DatasetDict


In [None]:
# Hugging Face Hub id of the checkpoint that is quantized and fine-tuned below.
base_model = "NousResearch/Llama-2-7b-chat-hf"

# Earlier instruction dataset, kept for reference only; the notebook loads a local JSONL file instead.
# guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Name chosen for the fine-tuned model. No training cell in this notebook reads it;
# the testing section unzips and loads a directory with this same name.
new_model = "llama-2-7b-chat-guanaco_fined_turned"

In [None]:
from datasets import load_dataset

# Read the local JSON Lines file of Q&A records; the rows are later accessed through the "train" split.
raw_dataset = load_dataset("json", data_files="/content/QAPairs (3).jsonl")


In [None]:
def to_llama2_format(example):
    """Convert one chat-style record into a single Llama-2 prompt string.

    Args:
        example: Mapping with a "messages" list; each message is a mapping
            with "role" and "content" keys.

    Returns:
        ``{"text": "<s>[INST] user [/INST] assistant </s>"}`` built from the
        LAST usable "user" and LAST usable "assistant" message (messages with
        other roles are ignored). Returns ``{"text": ""}`` when either side is
        missing, not a string, or blank, so callers can filter those rows out.
    """
    user_msg = None
    assistant_msg = None

    # A missing or null "messages" field is treated as an empty conversation.
    for msg in example.get("messages") or []:
        content = msg.get("content")
        if not isinstance(content, str):
            # Null / non-text content cannot be stripped or formatted; skip it
            # instead of raising in the middle of a dataset-wide map.
            continue

        role = msg.get("role")
        if role == "user":
            user_msg = content.strip()
        elif role == "assistant":
            assistant_msg = content.strip()

    # Blank strings are as useless as missing turns: they would produce a
    # degenerate "[INST]  [/INST]" training sample.
    if not user_msg or not assistant_msg:
        return {"text": ""}  # sentinel for "no usable pair"

    return {
        "text": f"<s>[INST] {user_msg} [/INST] {assistant_msg} </s>"
    }


In [None]:
# Turn every raw record into one Llama-2 prompt string (dropping the original
# columns), then discard the rows for which to_llama2_format returned its
# empty-string fallback so that blank examples never reach the trainer.
processed_train = (
    raw_dataset["train"]
    .map(
        to_llama2_format,
        remove_columns=raw_dataset["train"].column_names,
    )
    .filter(lambda row: len(row["text"]) > 0)
)

# Wrap the single split so later cells can address it as dataset["train"].
dataset = DatasetDict({
    "train": processed_train
})


In [None]:
# Sanity check: dataset summary followed by the first formatted training prompt.
print(dataset, dataset["train"][0]["text"], sep="\n")


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2573
    })
})
<s>[INST] What is the title of the document? [/INST] The Constitution of Nepal </s>


In [None]:
# Use the same precision for 4-bit matmuls that the model-loading and training
# cells select (bfloat16 when the GPU supports it, float16 otherwise), instead
# of hard-coding float16 and mixing two half-precision formats.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the base weights in 4-bit
    bnb_4bit_quant_type="nf4",             # NormalFloat 4-bit quantization
    bnb_4bit_compute_dtype=compute_dtype,  # precision used for the actual computation
    bnb_4bit_use_double_quant=False,       # double quantization is not used
)

In [None]:
# Load the base checkpoint ("NousResearch/Llama-2-7b-chat-hf") with the 4-bit
# quantization config, placing the whole model on GPU 0. Weights that are not
# quantized use bfloat16 when available and float16 otherwise.
# Alternative for small GPUs: device_map="auto", which may offload some layers to CPU.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
    torch_dtype=torch.float16 if not torch.cuda.is_bf16_supported() else torch.bfloat16,
)

# Training-time model settings.
model.config.pretraining_tp = 1   # tensor-parallel degree of 1
model.config.use_cache = False    # no KV cache while training

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# The Llama tokenizer is implemented inside transformers itself, so the default
# trust_remote_code=False is sufficient and no code from the Hub repo is executed.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token   # reuse end-of-sequence as the padding token
tokenizer.padding_side = "right"            # pad on the right side

In [None]:
# LoRA adapter settings for causal-language-model fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,               # adapter rank
    lora_alpha=16,      # LoRA scaling factor
    lora_dropout=0.1,   # dropout applied on the adapter path
    bias="none",        # bias terms are not trained
)

In [None]:
# Tokenize the formatted prompts before handing the dataset to SFTTrainer.
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts without padding.

    Args:
        examples: Batch mapping whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output for the batch, with each sequence truncated to at
        most 512 tokens. Sequences keep their natural length: padding every
        row to 512 tokens up front wastes compute on short Q&A pairs and makes
        length-based batching pointless, so padding is left to the trainer's
        collator, which pads each batch only to its longest member.

    NOTE(review): each text already begins with a literal "<s>"; the tokenizer
    may prepend its own BOS as well. The inference cell builds prompts the same
    way, so train and test stay consistent -- confirm before changing either.
    """
    return tokenizer(
        examples["text"],
        truncation=True,   # cut sequences longer than max_length
        max_length=512,    # upper bound on tokens per example
    )

# Apply tokenization to every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/2573 [00:00<?, ? examples/s]

In [None]:
from trl import SFTConfig

In [None]:
# Training hyperparameters. Mixed precision uses bfloat16 when the GPU reports
# support for it; float16 mixed precision is always off.
training_params = SFTConfig(
    # --- run length and batching ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    packing=False,
    # --- optimizer and learning-rate schedule ---
    # NOTE(review): warmup_ratio is combined with a "constant" schedule;
    # confirm whether warmup is actually applied with this scheduler type.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- numeric precision ---
    bf16=torch.cuda.is_bf16_supported(),
    fp16=False,
    # --- checkpoints and logging ---
    output_dir="./results",
    save_steps=25,
    logging_steps=25,
    report_to="tensorboard",
    # --- dataset ---
    dataset_text_field="text",
)

In [None]:
# Assemble the supervised fine-tuning trainer from the pieces built above:
# the quantized model, the LoRA settings, the hyperparameters and the
# already-tokenized training split.
trainer = SFTTrainer(
    model=model,
    args=training_params,
    peft_config=peft_params,
    train_dataset=tokenized_datasets['train'],
)

Truncating train dataset:   0%|          | 0/2573 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the trainer assembled in the previous cell.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


# Loading and Testing


In [None]:
!pip install -U \
  transformers \
  accelerate \
  peft \
  bitsandbytes \
  sentencepiece


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


In [None]:
# Turn TF32 off for both CUDA matmuls and cuDNN.
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = False


In [None]:
!unzip /content/llama-2-7b-chat-guanaco_fined_turned.zip

Archive:  /content/llama-2-7b-chat-guanaco_fined_turned.zip
replace llama-2-7b-chat-guanaco_fined_turned/adapter_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Hub id of the original checkpoint the adapter is applied to.
BASE_MODEL = "NousResearch/Llama-2-7b-chat-hf"
# Directory holding the fine-tuned adapter; the tokenizer is loaded from it as well.
FT_MODEL_PATH = "/content/llama-2-7b-chat-guanaco_fined_turned"


In [None]:
# Load the tokenizer stored alongside the fine-tuned adapter. The Llama
# tokenizer is implemented inside transformers itself, so the default
# trust_remote_code=False is sufficient and no external code is executed.
tokenizer = AutoTokenizer.from_pretrained(FT_MODEL_PATH)

tokenizer.pad_token = tokenizer.eos_token   # reuse end-of-sequence as the padding token
tokenizer.padding_side = "right"            # pad on the right side


In [None]:
# Load the unquantized base checkpoint in float16 and let the library decide
# device placement. Note that this rebinds `base_model`, which held the model
# id string in the training section, to the loaded model object.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
# Attach the fine-tuned LoRA adapter stored in FT_MODEL_PATH to the base model.
model = PeftModel.from_pretrained(
    base_model,
    FT_MODEL_PATH
)

# Switch to evaluation mode. The trailing semicolon stops the notebook from
# printing the full module tree as the cell output.
model.eval();


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj)

In [None]:
# Fold the LoRA weights into the base layers and drop the adapter wrappers,
# leaving a plain causal-LM model.
model = model.merge_and_unload()
# Evaluation mode; the trailing semicolon suppresses the long module repr output.
model.eval();


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,),

In [None]:
def generate(prompt, max_new_tokens=200, include_prompt=False):
    """Answer a single question with the fine-tuned model using greedy decoding.

    Args:
        prompt: The user question; it is wrapped in the same
            "<s>[INST] ... [/INST]" template used for the training texts.
        max_new_tokens: Maximum number of tokens to generate.
        include_prompt: When True, return the decoded prompt followed by the
            answer (the previous behaviour). By default only the newly
            generated answer is returned, so callers that print "A: ..." do
            not repeat the "[INST] question [/INST]" prefix.

    Returns:
        The decoded text with special tokens removed.

    NOTE(review): the template starts with a literal "<s>" and the tokenizer
    may add its own BOS as well; this mirrors how the training texts were
    built, so keep the two in sync if either is changed.
    """
    text = f"<s>[INST] {prompt} [/INST]"
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding: sampling-only knobs (temperature, top_p) have no
            # effect when do_sample=False, so they are not passed.
            do_sample=False,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    sequence = outputs[0]
    if include_prompt:
        return tokenizer.decode(sequence, skip_special_tokens=True)

    # The output starts with the prompt tokens; keep only what came after them.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True).strip()


In [None]:
# Smoke test: ask one citizenship question and print the model's reply.
print(generate("Who may acquire the citizenship of Nepal by descent?"))


[INST] Who may acquire the citizenship of Nepal by descent? [/INST] A person who is born in Nepal to a parent or parents who are not citizens of Nepal at the time of his or her birth


In [None]:
# Evaluation prompts about constitutional remedies, writs and commissions.
questions = [
    "What remedies are available if a fundamental right is violated?",
    "Can a citizen directly approach the Supreme Court?",
    "What writs are recognized under the Constitution of Nepal?",
    "Does the Constitution provide habeas corpus?",
    "What is the role of constitutional commissions?",
]


# Print each question with the model's answer, separated by a 50-dash rule.
for q in questions:
    print("Q:", q)
    print("A:", generate(q))
    print("-" * 50)


Q: What remedies are available if a fundamental right is violated?
A: [INST] What remedies are available if a fundamental right is violated? [/INST] The Supreme Court shall have the power to issue appropriate writs, orders or directions in accordance with law.
--------------------------------------------------
Q: Can a citizen directly approach the Supreme Court?
A: [INST] Can a citizen directly approach the Supreme Court? [/INST] Yes, a citizen can directly approach the Supreme Court.
--------------------------------------------------
Q: What writs are recognized under the Constitution of Nepal?
A: [INST] What writs are recognized under the Constitution of Nepal? [/INST] The writs recognized under this Part shall be as provided for in the Federal law.
--------------------------------------------------
Q: Does the Constitution provide habeas corpus?
A: [INST] Does the Constitution provide habeas corpus? [/INST] Yes, the Constitution provides for the right to be free from arrest and det