In [None]:
!pip install transformers datasets peft accelerate bitsandbytes safetensors sentencepiece --upgrade

"daryl149/llama-2-7b-chat-hf"
p3.8xlarge
max_length: 128

- vanilla: 10.22s

- tf32 instead of fp32: 10.38s
```python
torch.backends.cuda.matmul.allow_tf32 = True
```

- half-precision: 10.87s (save memory)
```python
torch_dtype=torch.bfloat16,
```

- load in 8-bit: 33.92s
```python
load_in_8bit=True,
```

In [None]:
import os, sys, time
import torch
import datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaTokenizer,
    LlamaForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Print and return the number of trainable parameter elements in ``model``.

    Iterates ``model.named_parameters()`` once, summing ``numel()`` over all
    parameters and, separately, over those whose ``requires_grad`` is set.

    Args:
        model: any object exposing ``named_parameters()``.

    Returns:
        The element count of the parameters that require gradients.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable model parameters: {trainable_model_params}\n all model parameters: {all_model_params} ")
    return trainable_model_params

# 1) Model Loading

[Fine-tuning a GPT — LoRA](https://dataman-ai.medium.com/fine-tune-a-gpt-lora-e9b72ad4ad3)



In [None]:
model_id = "NousResearch/Llama-2-7b-hf"
# model_id = "daryl149/llama-2-7b-chat-hf"
max_length = 256

### device set up
# "auto" is the single-process default; under DDP it is replaced below by a
# mapping that pins the whole model to this process's local GPU.
device_map = "auto"

batch_size = 64
micro_batch_size = 16
gradient_accumulation_steps = batch_size // micro_batch_size
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    # Keep at least one accumulation step even when world_size exceeds the
    # single-process accumulation count (integer division would give 0).
    gradient_accumulation_steps = max(1, gradient_accumulation_steps // world_size)

# 4-bit loading: "nf4" selects the NormalFloat4 quantization type and double
# quantization additionally quantizes the quantization constants.
# NOTE(review): compute dtype is bfloat16 while the model is loaded with
# torch_dtype=float16 below - confirm the mix is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = LlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    # Use the mapping computed above so the DDP branch actually takes effect.
    device_map=device_map
)

# value different than 1 will activate the more accurate but slower computation
model.config.pretraining_tp = 1

# Single process with several GPUs: flag the model as model-parallel.
if not ddp and torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

# Trainable-parameter count before any adapter is attached (used for comparison later).
ori_p = print_number_of_trainable_model_parameters(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

trainable model parameters: 262410240
 all model parameters: 3500412928 




In [None]:
### tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Pad on the right-hand side of each sequence ...
tokenizer.padding_side = "right"
# ... and reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

```python
# Manual preparation of a quantized model for adapter training.
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

# reduce number of stored activations
model.gradient_checkpointing_enable()
model.enable_input_require_grads()


# `torch.nn` is spelled out because the notebook only imports `torch`.
class CastOutputToFloat(torch.nn.Sequential):
    """Wrap a module so that its output is always returned as float32."""

    def forward(self, x):
        return super().forward(x).to(torch.float32)


model.lm_head = CastOutputToFloat(model.lm_head)
```

In [None]:
# prepare_model_for_kbit_training is similar to the manual snippet shown in the block above
model = prepare_model_for_kbit_training(model)

# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, peft_config)

# Compare the trainable-parameter count after wrapping with the count taken at load time.
peft_p = print_number_of_trainable_model_parameters(model)
trainable_pct = round(peft_p / ori_p * 100, 2)
print(f"# Trainable Parameter \nBefore: {ori_p} \nAfter: {peft_p} \nPercentage: {trainable_pct}")

trainable model parameters: 4194304
 all model parameters: 3504607232 
# Trainable Parameter 
Before: 262410240 
After: 4194304 
Percentage: 1.6


# 2) Data Loading

```python

# recommended way for llama
# (`system_message` must be defined by the caller before running this snippet)
train_dataset = datasets.load_dataset('json', data_files='/content/train.jsonl', split="train")
valid_dataset = datasets.load_dataset('json', data_files='/content/test.jsonl', split="train")


def _to_llama2_text(examples):
    """Render a batch of prompt/response pairs in the Llama-2 chat layout."""
    # The system message is wrapped in <<SYS>> ... <</SYS>> inside the [INST] block.
    system_block = f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n'
    return {
        'text': [
            system_block + prompt + ' [/INST] ' + response
            for prompt, response in zip(examples['prompt'], examples['response'])
        ]
    }


# Preprocess datasets
train_dataset_mapped = train_dataset.map(_to_llama2_text, batched=True)
```



In [None]:
### generate prompt based on template ###
# Templates are built with implicit string concatenation so that the source
# indentation does not leak into the prompt text (backslash continuations
# inside a string literal keep the leading spaces of the following line).
# NOTE(review): the templates start with a literal "<s>"; confirm the
# tokenizer does not additionally prepend its own BOS token.
prompt_template = {
    "prompt_input": (
        "<s>[INST] <<SYS>> You are a helpful assistant. <</SYS>> "
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request."
        "\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: [/INST]\n"
    ),

    "prompt_no_input": (
        "<s>[INST] <<SYS>> You are a helpful assistant. <</SYS>> "
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        "\n\n### Instruction:\n{instruction}\n\n### Response: [/INST]\n"
    ),

    "response_split": "### Response:"
}

def generate_prompt(instruction, input=None, label=None, prompt_template=prompt_template):
    """Render one example into the prompt text.

    Args:
        instruction: the task description inserted under "### Instruction:".
        input: optional context; when truthy the "prompt_input" template is
            used, otherwise "prompt_no_input".
        label: optional response text; when truthy it is appended directly
            after the rendered prompt.
        prompt_template: mapping providing the "prompt_input" and
            "prompt_no_input" format strings.

    Returns:
        The rendered prompt, followed by ``label`` when one is given.
    """
    if input:
        res = prompt_template["prompt_input"].format(
            instruction=instruction, input=input)
    else:
        res = prompt_template["prompt_no_input"].format(
            instruction=instruction)
    if label:
        res = f"{res}{label}"
    return res

In [None]:
def tokenize(tokenizer, prompt, max_length=max_length, add_eos_token=False):
    """Tokenize ``prompt`` and attach a ``labels`` copy of the input ids.

    Args:
        tokenizer: callable tokenizer; invoked with truncation enabled, no
            padding and ``return_tensors=None``.
        prompt: text to tokenize.
        max_length: truncation limit passed to the tokenizer.
        add_eos_token: when true, append ``tokenizer.eos_token_id`` (with an
            attention-mask entry of 1) if the sequence does not already end
            with it and is still shorter than ``max_length``. The default
            ``False`` leaves the tokenizer output untouched.

    Returns:
        The tokenizer result with an extra ``"labels"`` entry holding a copy
        of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None)

    ids = result["input_ids"]
    if (
        add_eos_token
        and len(ids) < max_length
        and (not ids or ids[-1] != tokenizer.eos_token_id)
    ):
        ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Copy so that later label masking cannot alter the input ids.
    result["labels"] = result["input_ids"].copy()
    return result

def generate_and_tokenize_prompt(data_point):
    """Turn one dataset record into a tokenized example with the prompt masked.

    Reads the "instruction", "context" and "response" fields of
    ``data_point``, tokenizes the prompt with and without the response, and
    replaces the leading labels (as many as the prompt-only token count)
    with -100.

    Returns:
        The tokenized full prompt with its "labels" entry rewritten.
    """
    instruction = data_point["instruction"]
    context = data_point["context"]

    # Prompt plus response: this is what the model is trained on.
    tokenized_full_prompt = tokenize(
        tokenizer, generate_prompt(instruction, context, data_point["response"])
    )

    # Prompt alone: only its token count is needed.
    prompt_only = tokenize(tokenizer, generate_prompt(instruction, context))
    n_prompt_tokens = len(prompt_only["input_ids"])

    answer_labels = tokenized_full_prompt["labels"][n_prompt_tokens:]
    tokenized_full_prompt["labels"] = [-100] * n_prompt_tokens + answer_labels
    return tokenized_full_prompt

```python
class DataCollatorForSeq2Seq:
    """Minimal collator that pads tokenized examples to a common length.

    ``input_ids`` are padded with the tokenizer's ``pad_token_id``,
    ``attention_mask`` with 0 and ``labels`` with -100.
    """

    def __init__(self, tokenizer, pad_to_multiple_of=None):
        """Store the tokenizer and the optional length multiple to pad to."""
        self.tokenizer = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, examples):
        """Collate ``examples`` into a dict of equally sized tensors.

        Args:
            examples: sequence of dicts with "input_ids", "attention_mask"
                and "labels" lists.

        Returns:
            Dict with "input_ids", "attention_mask" and "labels" tensors,
            each of shape (len(examples), padded_length).

        Raises:
            ValueError: if ``examples`` is empty.
        """
        if not examples:
            raise ValueError("cannot collate an empty batch")

        # Longest input in the batch, rounded up to the requested multiple.
        # Computing the final length up front avoids a second padding pass.
        target_len = max(len(example["input_ids"]) for example in examples)
        if self.pad_to_multiple_of is not None:
            multiple = self.pad_to_multiple_of
            target_len = -(-target_len // multiple) * multiple

        def _pad(seq, fill):
            # Right-pad a copy of `seq` with `fill` up to target_len.
            return list(seq) + [fill] * (target_len - len(seq))

        pad_id = self.tokenizer.pad_token_id
        return {
            "input_ids": torch.tensor([_pad(ex["input_ids"], pad_id) for ex in examples]),
            "attention_mask": torch.tensor([_pad(ex["attention_mask"], 0) for ex in examples]),
            "labels": torch.tensor([_pad(ex["labels"], -100) for ex in examples]),
        }

```

```python
class DataCollatorForLanguageModeling:
    """Minimal collator that randomly corrupts input ids for masked-LM training.

    No padding is performed, so all examples in a batch must already have
    the same length.
    """

    def __init__(self, tokenizer, mlm_probability=0.15):
        """Store the tokenizer and the per-token corruption probability."""
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability

    def __call__(self, examples):
        """Collate ``examples`` into tensors, corrupting a share of the input ids.

        The caller's lists are left untouched: corruption is applied to copies.

        Args:
            examples: sequence of dicts with "input_ids", "attention_mask"
                and "labels" lists.

        Returns:
            Dict with "input_ids", "attention_mask" and "labels" tensors.
        """
        import random  # local import: the notebook does not import it globally

        # Copy the id lists so the masking below does not modify `examples`.
        input_ids = [list(example["input_ids"]) for example in examples]
        attention_mask = [example["attention_mask"] for example in examples]
        labels = [example["labels"] for example in examples]

        # Mask tokens for masked language modeling
        for row in input_ids:
            for j in range(len(row)):
                if random.random() < self.mlm_probability:
                    # 80% of the time, replace with [MASK] token
                    if random.random() < 0.8:
                        row[j] = self.tokenizer.mask_token_id
                    # 10% of the time, replace with a random token
                    elif random.random() < 0.5:
                        row[j] = random.randint(0, len(self.tokenizer) - 1)
                    # 10% of the time, keep the original token

        # Convert the lists to PyTorch tensors
        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.tensor(labels),
        }
```

In [None]:
# Load Dolly-15k, tokenize both splits into prompt-masked examples and
# sanity-check the collator on one validation example.

max_length = 256
dataset = datasets.load_dataset(
    "databricks/databricks-dolly-15k", split='train'
)

# Pads each batch to a common length, rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)

# Alternative: use the end-of-sequence token as the padding token and set mlm=False.
# This will use the inputs as labels shifted to the right by one element
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="pt",
# )

# print(f"dataset size: {len(dataset)}")
dataset = dataset.train_test_split(test_size=1000, shuffle=True, seed=42)
cols = ["instruction", "context", "response", "category"]

# The per-split shuffles are seeded as well so a re-run reproduces the same order.
train_data = dataset["train"].shuffle(seed=42).map(generate_and_tokenize_prompt, remove_columns=cols,)
# tokenize() truncates at max_length, so this drops every example that reached the limit.
train_data = train_data.filter(lambda rec: len(rec["input_ids"]) < max_length)
val_data = dataset["test"].shuffle(seed=42).map(generate_and_tokenize_prompt, remove_columns=cols,)
val_data = val_data.filter(lambda rec: len(rec["input_ids"]) < max_length)

# test collator
val_batch = data_collator(list(iter(val_data)))
# Inspect example 100, or the last one if filtering left fewer examples.
n = min(100, len(val_data) - 1)
for k, v in val_batch.items():
    print(k, v[n])

print('\nDecoding input_ids\n', tokenizer.decode(val_batch['input_ids'][n]))
# Drop only the -100 ignore markers; a `> 0` test would also discard token id 0.
label_ids = [x for x in val_batch['labels'][n].tolist() if x != -100]
print('\nDecoding labels\n', tokenizer.decode(label_ids))

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/14011 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

input_ids tensor([    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         1678, 14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,
         2009, 29889,   268,    13,    13,  2277, 29937,  2799,  4080, 29901,
           13,  5618,   338,  7783,  5614,   309,   327,    13,    13,  2277,
        29937, 13291, 29901,    13, 11277, 29871, 29941, 29953, 29945, 10061,
          309,   327,   338,   385, 23116, 21082, 20255,  4682,   363,  7783,
        29871, 29941, 29953, 29945,  8324,   322,  5786, 29892,   607,  3160,
          967,  3234,  2068,  9460,   310, 11104, 29892,  1316,   408, 10803,
        29892, 11388, 29892,  9206,  5228, 29892,   322,  4451,  6914, 29889,
          450, 13465,   310, 10061,   309,   327,   411, 29871, 29941, 29953,
        29945, 29892,  9326,   373,  4779, 29892, 29871, 29896, 29953, 29892,
        29871, 29906, 29900, 29906, 29941, 29892,   338, 12919,  3625,   304,
          263,  2319,  1353,   310,  3896,   558,  463

# 3) Model Training

In [None]:
# args = TrainingArguments(
#     output_dir="./llama-7b-int4-dolly",
#     num_train_epochs=20,
#     max_steps=2000,
#     fp16=False,
#     tf32=False,
#     optim="paged_adamw_8bit",
#     learning_rate=2e-4,
#     lr_scheduler_type="constant",
#     per_device_train_batch_size=micro_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     gradient_checkpointing=True,
#     ddp_find_unused_parameters=False if ddp else None,
#     group_by_length=False,
#     logging_steps=10,
#     save_strategy="epoch",
#     save_total_limit=3,
#     # report_to="wandb",
#     # run_name="llma_run_00",
#     disable_tqdm=False,
# )

# trainer = Trainer(
#     model=model,
#     train_dataset=train_data,
#     eval_dataset=val_data,
#     args=args,
#     data_collator=data_collator,
# )

# Hyper-parameters of the run, kept in their own object so they can be inspected.
training_args = TrainingArguments(
    output_dir="./llama-7b-int4-dolly",
    max_steps=1000,
    warmup_steps=10,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=data_collator,
)

# silence the warnings. re-enable for inference!
model.config.use_cache = False

trainer.train()

# Save the trained model under the directory the generation section loads from.
model.save_pretrained("llama-7b-int4-dolly_1")

Step,Training Loss
1,0.9991
2,0.8241
3,1.2736
4,0.7018
5,1.1079
6,1.4621
7,1.889
8,2.1955
9,1.0744
10,2.1239


# 4) Generation

```python
# standard prompt for llama
# replace the request below with something relevant to your task
# (`system_message` must be defined beforehand)
prompt = (
    f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n"
    "Write a function that reverses a string. [/INST]"
)
```

```python
def build_llama2_prompt(messages):
    """Render a list of chat messages into a single Llama-2 prompt string.

    Args:
        messages: list of dicts with "role" and "content" keys. A "system"
            message is wrapped in <<SYS>> markers only when it is the first
            entry; "user" content is stripped and inserted as is; every
            other message is rendered as a completed assistant turn that
            closes the current [INST] block and opens the next one.

    Returns:
        The prompt, starting with "<s>[INST] " and ending with " [/INST]".
    """
    startPrompt = "<s>[INST] "
    endPrompt = " [/INST]"
    conversation = []
    for index, message in enumerate(messages):
        role = message["role"]
        # Messages are dicts, so the text must be read by key, not attribute.
        content = message["content"]
        if role == "system" and index == 0:
            conversation.append(f"<<SYS>>\n{content}\n<</SYS>>\n\n")
        elif role == "user":
            conversation.append(content.strip())
        else:
            conversation.append(f" [/INST] {content}</s><s>[INST] ")

    return startPrompt + "".join(conversation) + endPrompt

# Conversation so far: a single system message describing the assistant persona.
messages = [
  {
    "role": "system",
    "content": '''You are a friendly and knowledgeable vacation planning assistant named Clara. \
    Your goal is to have natural conversations with users to help them plan their perfect vacation. '''}
]

# Append the user's question and render the whole conversation into one prompt.
instruction = "What are some cool ideas to do in the summer?"
messages.append({"role": "user", "content": instruction})
prompt = build_llama2_prompt(messages)
# NOTE(review): `llm` is not defined anywhere in this notebook; presumably a
# deployed inference endpoint/predictor - confirm before running.
chat = llm.predict({"inputs":prompt})
# Slice off the first len(prompt) characters so only the text after the prompt is printed.
print(chat[0]["generated_text"][len(prompt):])
```

In [None]:
# model path and weight
model_id = "NousResearch/Llama-2-7b-hf"
peft_path = "llama-7b-int4-dolly_1"

# loading model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # The cache was only disabled for training; re-enable it for inference.
    use_cache=True,
    device_map="auto"
)
# Attach the adapter weights saved by the training section.
model = PeftModel.from_pretrained(
    model,
    peft_path,
    torch_dtype=torch.float16,
)
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

# generation config
# NOTE(review): do_sample is not set, so temperature/top_p/top_k presumably
# have no effect on the beam search - confirm whether sampling was intended.
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4, # beam search
)

# NOTE(review): the prompt already starts with "<s>"; confirm the tokenizer
# does not prepend a second BOS token.
prompt = "<s>[INST] <<SYS>> You are a helpful assistant. <</SYS>> Write me a poem about Singapore. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt")
# The tokenizer returns CPU tensors; move them to the device the model runs on.
input_ids = inputs.input_ids.to(model.device)
attention_mask = inputs.attention_mask.to(model.device)

with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=64,
    )
print('\nAnswer: ', tokenizer.decode(generation_output.sequences[0]))