In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer
from peft import PeftModel

In [2]:
def format_prompt(sample):
    return f"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample["instruction"]}

### Input:
{sample["input"]}

### Response:
{sample["output"]}
"""

In [3]:
# Dataset
training_data = load_dataset("json", data_files="test_dataset/train.json", split="train")

In [4]:
# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "test_model5" #You can give it your own name

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="finetuned-llama-7b-chat-hf-med",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    tokenizer=llama_tokenizer,
    max_seq_length=1024,
    packing=True,
    formatting_func=format_prompt,
    args=train_params
)

# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(refined_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,0.5791
20,0.3521
30,0.1787
40,0.1073
50,0.0668




In [None]:
peft_model = PeftModel.from_pretrained(base_model, refined_model)

In [7]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load finetuned LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    '/home/isr-lab2/Documents/LLMTest/finetuned-llama-7b-chat-hf-med/checkpoint-55',
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained('/home/isr-lab2/Documents/LLMTest/finetuned-llama-7b-chat-hf-med/checkpoint-55')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from datasets import load_dataset
from random import randrange

# Load dataset from the hub
dataset = load_dataset("json", data_files="test_dataset/train.json", split="train")
sample = dataset[randrange(len(dataset))]

prompt = f"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample["instruction"]}

### Input:
{sample["input"]}

### Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = model.generate(input_ids=input_ids, max_new_tokens=512, do_sample=True, top_p=0.6,temperature=0.9)

print(f"Instruction:\n{sample['instruction']}\n")
print(f"Input:\n{sample['input']}\n")
print(f"Generated Response:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}\n")
print(f"Ground Truth:\n{sample['output']}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Instruction:
Generate anwer in XML format using tags: task, action, direction, object, location

Input:
I'm in the garden. Plant flowers.

Generated Response:
<task><action><actionType>PLANT</actionType><object>flowers</object></action></task>


Ground Truth:
<task><action><actionType>PLANT</actionType><object>flowers</object></action></task>


In [10]:
prompt = f"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample["instruction"]}

### Input:
`i'm in the kitchen. Take chocolate from fridge and wash dishes.

### Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = model.generate(input_ids=input_ids, max_new_tokens=512, do_sample=True, top_p=0.6,temperature=0.9)

print(f"Instruction:\n{sample['instruction']}\n")
print(f"Input:\n{sample['input']}\n")
print(f"Generated Response:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}\n")
print(f"Ground Truth:\n{sample['output']}")

Instruction:
Generate anwer in XML format using tags: task, action, direction, object, location

Input:
I'm in the garden. Plant flowers.

Generated Response:
<task><action><actionType>TAKE</actionType><object>chocolate</object></action><action><actionType>WASH</actionType><object>dishes</object></action></task>


Ground Truth:
<task><action><actionType>PLANT</actionType><object>flowers</object></action></task>
