# Note
- [Training script](https://www.kaggle.com/code/shelterw/training-llama3-8b-4-bit-qlora-sft)

# Import

In [1]:
cd /kaggle/input

/kaggle/input


In [2]:
!pip install -q -U bitsandbytes --no-index --find-links ./package-file
!pip install -q -U transformers --no-index --find-links ./llm-detect-pip
!pip install -q -U tokenizers --no-index --find-links ./llm-detect-pip
!pip install -q -U peft --no-index --find-links ./llm-detect-pip
print('finished!')

finished!


In [3]:
!nvidia-smi

Mon Jul 29 07:46:51 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

In [4]:
import time
import torch
import sklearn
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.cuda.amp import autocast
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorForSeq2Seq
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType 
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from transformers import set_seed
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_flash_sdp(True)
assert torch.cuda.device_count() == 2, "Sorry - multi-GPU required!"

2024-07-29 07:47:03.006258: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 07:47:03.006371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 07:47:03.149860: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Path (relative to /kaggle/input) of the 4-bit base checkpoint passed to `Llama3ForSFT.from_pretrained`.
MODEL_NAME = './llama-3-8b-instruct-bnb-4bit/pytorch/default/1/llama-3-8b-Instruct-bnb-4bit'
# Directory used both for `AutoTokenizer.from_pretrained` and as the PEFT adapter `model_id`.
WEIGHTS_PATH = './train10000-bs8-ep1/train10000_bs8_ep1/'
# Token budget checked in `tokenize`; examples whose prompt + responses exceed it fall back to last-turn truncation.
MAX_LENGTH = 2400
# Default number of samples per forward pass in `inference`.
BATCH_SIZE = 2
# NOTE(review): not referenced by the cells visible here, which use `device0` / `device1` instead.
DEVICE = torch.device("cuda")    

# Prepare Data 

In [6]:
test = pd.read_csv('./lmsys-chatbot-arena/test.csv')
def tokenize(example, tokenizer, max_length=None):
    """Turn one pairwise-comparison row into causal-LM inputs.

    The sequence layout is::

        BOS <prompt>: ... <response_a>: ... <response_b>: ... question  SLOT EOS

    where SLOT is the placeholder token id 128250. ``labels`` is -100 for
    every position before SLOT and carries SLOT and EOS unchanged.

    Args:
        example: Mapping with string fields ``prompt``, ``response_a`` and
            ``response_b``; each holds the text of a list literal of turns
            in which ``null`` may appear in place of a string.
        tokenizer: Callable returning ``{"input_ids": [...]}`` and exposing
            ``bos_token_id`` / ``eos_token_id``.
        max_length: Token budget for prompt + both responses. When it is
            exceeded, only the last turn of each field is kept and cut to
            400 / 1000 / 1000 tokens (these caps are fixed and do not scale
            with ``max_length``). Defaults to the module-level ``MAX_LENGTH``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` (all ones) and ``labels``.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    def encode(text):
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    def parse_turns(raw):
        # SECURITY NOTE(review): eval() executes whatever expression the CSV
        # cell contains. It is kept so the decoded strings stay identical to
        # what this notebook has always tokenized; only use on trusted data.
        return eval(raw, {"null": ""})

    def last_turn(turns):
        # An empty conversation contributes no text instead of raising IndexError.
        return turns[-1] if turns else ""

    # Parse each field once; the truncation branch below reuses the result.
    prompt_turns = parse_turns(example['prompt'])
    a_turns = parse_turns(example['response_a'])
    b_turns = parse_turns(example['response_b'])

    prompt = encode('<prompt>: ' + " ".join(prompt_turns))
    response_a = encode('\n\n<response_a>: ' + " ".join(a_turns))
    response_b = encode('\n\n<response_b>: ' + " ".join(b_turns))
    if len(prompt) + len(response_a) + len(response_b) > max_length:
        # Too long: keep only the final turn of each field, hard-capped.
        prompt = encode('<prompt>: ' + last_turn(prompt_turns))[:400]
        response_a = encode('\n\n<response_a>: ' + last_turn(a_turns))[:1000]
        response_b = encode('\n\n<response_b>: ' + last_turn(b_turns))[:1000]
    extra_prompt = encode('\n\n---------\nWhich is the better response for the prompt ? a or b or tie ?\n\nAnswer: ')
    # Placeholder token marking the position whose prediction is read out.
    label_token_id = [128250]

    context = [tokenizer.bos_token_id] + prompt + response_a + response_b + extra_prompt
    answer = label_token_id + [tokenizer.eos_token_id]
    input_ids = context + answer
    return {
        "input_ids": input_ids,
        "attention_mask": len(input_ids) * [1],
        # Only the answer slot (and EOS) are unmasked.
        "labels": [-100] * len(context) + answer,
    }

# Tokenize

In [7]:
%%time
tokenizer = AutoTokenizer.from_pretrained(WEIGHTS_PATH)
# Vocabulary id of the first token produced for each answer word, in the
# order a / b / tie; the model reads its logits at exactly these ids.
LABEL_IDS = [
    tokenizer(answer_word, add_special_tokens=False)["input_ids"][0]
    for answer_word in ('a', 'b', 'tie')
]


def load_data(df, tokenizer):
    """Wrap ``df`` in a ``Dataset`` and append the fields produced by ``tokenize``.

    The original columns are kept (no ``remove_columns``), so ``id`` remains
    available when the submission is assembled.
    """
    return Dataset.from_pandas(df).map(
        tokenize,
        fn_kwargs={'tokenizer': tokenizer},
    )


test_ds = load_data(test, tokenizer)
test_ds

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

CPU times: user 658 ms, sys: 59.6 ms, total: 718 ms
Wall time: 846 ms


Dataset({
    features: ['id', 'prompt', 'response_a', 'response_b', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3
})

In [8]:
data = test_ds.to_pandas()
data["max_len"] = data["input_ids"].apply(len)
data[:3]

Unnamed: 0,id,prompt,response_a,response_b,input_ids,attention_mask,labels,max_len
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran...","[128000, 8085, 15091, 27916, 358, 617, 2380, 8...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, -100, -100, -100, -100, -100, -10...",82
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ...","[128000, 8085, 15091, 27916, 1472, 527, 264, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, -100, -100, -100, -100, -100, -10...",469
2,1233961,"[""How to initialize the classification head wh...","[""When you want to initialize the classificati...","[""To initialize the classification head when p...","[128000, 8085, 15091, 27916, 2650, 311, 9656, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, -100, -100, -100, -100, -100, -10...",1593


In [9]:
data['input_ids'][0]

array([128000,   8085,  15091,  27916,    358,    617,   2380,  85138,
         3432,     11,    358,  30912,    459,  19087,  13985,     13,
         2650,   1690,  85138,    656,    358,    617,     30,    271,
           27,   2376,   4404,  27916,   1472,    617,   1403,  85138,
         3432,     13,    271,     27,   2376,    890,  27916,   1472,
         2103,    617,   2380,  85138,     13,  60638,    459,  19087,
        13985,   1587,    539,   7958,    279,   1396,    315,  85138,
          499,    617,   3432,     13,    271,  29547,  23956,    374,
          279,   2731,   2077,    369,    279,  10137,    949,    264,
          477,    293,    477,  18623,  24688,  16533,     25,    220,
       128250, 128009], dtype=int32)

In [10]:
print(tokenizer.decode(data["input_ids"][0]))

<|begin_of_text|><prompt>: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

<response_a>: You have two oranges today.

<response_b>: You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.

---------
Which is the better response for the prompt? a or b or tie?

Answer: <|reserved_special_token_245|><|eot_id|>


# Load model 
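We load one copy of the base model on each GPU (`cuda:0` and `cuda:1`) so that the two halves of the test set can be scored in parallel.  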

In [11]:
class Llama3ForSFT(LlamaPreTrainedModel):
    """Llama backbone plus LM head that reads out only the answer slot.

    When ``labels`` are supplied, ``forward`` does not return per-token
    vocabulary logits. It finds every position whose *next-token* label is
    the placeholder id ``answer_slot_token_id`` and returns, for each such
    position, the logits of the ids listed in the module-level ``LABEL_IDS``.
    The result has one row per matched position and ``len(LABEL_IDS)`` columns.

    No loss is computed: ``loss`` in the returned output is always ``None``.
    """

    _tied_weights_keys = ["lm_head.weight"]

    # Placeholder token id that marks the answer position inside `labels`.
    answer_slot_token_id = 128250

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids= None,
        attention_mask= None,
        position_ids = None,
        past_key_values= None,
        inputs_embeds= None,
        labels= None,
        use_cache= None,
        output_attentions= None,
        output_hidden_states = None,
        return_dict= None,
        cache_position = None,
    ):
        """Run the backbone and return answer-slot logits.

        Args:
            labels: Optional tensor aligned with ``input_ids``. Positions equal
                to ``answer_slot_token_id`` select where logits are read; all
                other values are ignored.
            (remaining arguments are forwarded unchanged to ``LlamaModel``)

        Returns:
            ``CausalLMOutputWithPast`` with ``loss=None`` and ``logits`` set to
            the selected ``(num_slots, len(LABEL_IDS))`` tensor when ``labels``
            is given, or the full float32 vocabulary logits when it is not.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            # `nn.functional` is reachable through the already-imported `nn`;
            # the bare name `F` is not defined in this notebook.
            logits = [
                nn.functional.linear(hidden_states, lm_head_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        if labels is None:
            # Without labels there is no answer slot to locate, so hand back
            # the ordinary per-token logits instead of failing.
            return CausalLMOutputWithPast(
                loss=None,
                logits=logits,
            )

        # Shift so that the logits at position n-1 line up with label n.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten batch and sequence dimensions.
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Enable model parallelism
        shift_labels = shift_labels.to(shift_logits.device)

        fake_label_tokens_ids = torch.tensor([self.answer_slot_token_id], device=shift_labels.device)
        label_tokens_ids = torch.tensor(LABEL_IDS, device=shift_labels.device)
        # Rows: positions predicting the placeholder. Columns: candidate answer ids.
        slot_mask = torch.isin(shift_labels, fake_label_tokens_ids)
        true_logits = shift_logits[slot_mask][:, label_tokens_ids]

        return CausalLMOutputWithPast(
            loss=None,
            logits=true_logits,
        )

In [12]:
# One device handle per GPU; each GPU receives its own full copy of the base model.
device0, device1 = (torch.device(f'cuda:{gpu_index}') for gpu_index in (0, 1))

# Copy living on GPU 0
base_model_0 = Llama3ForSFT.from_pretrained(MODEL_NAME, use_cache=False, device_map=str(device0))
# Copy living on GPU 1
base_model_1 = Llama3ForSFT.from_pretrained(MODEL_NAME, use_cache=False, device_map=str(device1))

# Load weights 

In [13]:
# Attach the adapter stored at WEIGHTS_PATH to each base model, keep every
# wrapped model on its own GPU, and switch both to eval mode.
model_0, model_1 = (
    PeftModel.from_pretrained(base_model, model_id=WEIGHTS_PATH).to(target_device).eval()
    for base_model, target_device in ((base_model_0, device0), (base_model_1, device1))
)
model_1

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Llama3ForSFT(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=False
            

In [14]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Jul 29 07:48:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P0             26W /   70W |    5755MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

# Inference


In [15]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):
    """Score every row of ``df`` and attach the three class probabilities.

    Rows are processed in consecutive batches of ``batch_size``. Each batch is
    padded to its longest sequence with the module-level ``tokenizer``, run
    through ``model`` on ``device``, and the returned logits are soft-maxed
    over their last dimension.

    Args:
        df: DataFrame with list-like ``input_ids``, ``attention_mask`` and
            ``labels`` columns. It is modified in place.
        model: Callable model whose output exposes ``.logits`` with one row
            per sample and three columns (a, b, tie).
        device: Torch device the batch tensors are moved to.
        batch_size: Number of rows per forward pass.
        max_length: Currently unused; kept so existing callers keep working.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns added.

    Raises:
        ValueError: If the model does not return exactly one logit row per
            sample in a batch.
    """
    # Value used to fill padded label positions. The model only reacts to the
    # answer-slot id, so any other value works; -100 avoids depending on
    # `tokenizer.pad_token_id`, which may be None.
    ignore_index = -100
    # Labels must be padded on the same side as `input_ids`, otherwise the
    # answer slot would point at the wrong position.
    pad_left = getattr(tokenizer, "padding_side", "right") == "left"

    a_win, b_win, tie = [], [], []

    model.eval()
    for start_idx in range(0, len(df), batch_size):
        # iloc clamps the upper bound, so the last batch may be shorter.
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch["input_ids"].to_list(),
                "attention_mask": batch["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        seq_len = input_ids.shape[1]
        pad_labels = []
        for label in batch["labels"].to_list():
            label = list(label)
            filler = [ignore_index] * (seq_len - len(label))
            pad_labels.append(filler + label if pad_left else label + filler)
        labels = torch.tensor(pad_labels).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        proba = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
        if proba.shape[0] != len(batch):
            raise ValueError(
                f"expected {len(batch)} rows of answer logits, got {proba.shape[0]}"
            )
        a_win.extend(proba[:, 0].tolist())
        b_win.extend(proba[:, 1].tolist())
        tie.extend(proba[:, 2].tolist())
    df['winner_model_a'] = a_win
    df['winner_model_b'] = b_win
    df['winner_tie'] = tie
    return df

In [16]:
# Wall-clock timer for the whole scoring pass.
st = time.time()

# Order samples from longest to shortest, then deal the rows out alternately
# so each GPU gets every other row of the length-sorted frame.
data = data.sort_values("max_len", ascending=False)
sub_1, sub_2 = (data.iloc[offset::2].copy() for offset in (0, 1))

# One worker thread per GPU; each thread scores its own share with its own model.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(
        inference,
        (sub_1, sub_2),
        (model_0, model_1),
        (device0, device1),
    )

# Stack the two scored shares (GPU 0's rows first) and pull out the probabilities.
result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

elapsed time: 3.655832052230835


In [17]:
cd /kaggle/working/

/kaggle/working


In [18]:
# Copy each probability column of `proba` back into `result_df`, then write
# the id plus the three probabilities to submission.csv.
for col_idx, col_name in enumerate(("winner_model_a", "winner_model_b", "winner_tie")):
    result_df.loc[:, col_name] = proba[:, col_idx]
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
2,1233961,0.273839,0.531979,0.194181
0,136060,0.124172,0.608815,0.267013
1,211333,0.241968,0.558215,0.199817
