In [2]:
import torch
import sklearn
import numpy as np
import pandas as pd
import time
from typing import Optional, Union

from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
import datasets
from datasets import Dataset
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from dataclasses import dataclass
from torch.cuda.amp import autocast
from threading import Thread

import gc
import os
import io
import time
import json
import random
import pickle
import zipfile
import datetime
import matplotlib.pyplot as plt
from IPython.display import display
from collections import Counter
from collections import defaultdict
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss
import tokenizers

import random
def seed_everything(seed=None):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    benchmarking disabled.

    :param seed: int or None. When ``None`` or outside the uint32 range, a
        random seed within that range is drawn instead.
    :return: the seed that was actually applied.
    """
    uint32_info = np.iinfo(np.uint32)
    lowest, highest = uint32_info.min, uint32_info.max

    # Fall back to a freshly drawn seed when none was given or it cannot be
    # represented as a uint32.
    usable = (seed is not None) and (lowest <= seed <= highest)
    if not usable:
        seed = random.randint(lowest, highest)

    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    return seed


# Seed all RNGs with a fixed value before anything else runs; the call
# returns the seed it applied, which the notebook displays as the cell output.
seed_everything(42)

42

In [17]:
@dataclass
class DataCollatorForClassification:
    """Pad a list of tokenized features into one batch of PyTorch tensors.

    Padding is delegated to ``tokenizer.pad``. Afterwards ``input_ids`` and,
    when present, ``token_type_ids`` and ``attention_mask`` are reshaped so
    that a 3-D tensor has its axis 1 squeezed and a 1-D tensor gains a leading
    batch axis. No labels are attached to the batch.
    """

    # Tokenizer whose ``pad`` method builds the batch.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` and return the padded, shape-normalised batch."""
        padded = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )

        # ``input_ids`` is always expected; the other two fields are optional.
        for field in ('input_ids', 'token_type_ids', 'attention_mask'):
            if field != 'input_ids' and field not in padded:
                continue
            # Drop a singleton middle axis, e.g. (batch, 1, seq) -> (batch, seq).
            if padded[field].dim() == 3:
                padded[field] = padded[field].squeeze(1)
            # Promote a bare sequence to a batch of one.
            if padded[field].dim() == 1:
                padded[field] = padded[field].unsqueeze(0)

        return padded
    
def preprocess(example):
    """Build the prompt / answer A / answer B text for one row and tokenize it.

    Reads ``example['prompt']``, ``example['response_a']`` and
    ``example['response_b']``, joins them under their section headers, and
    tokenizes the result as a single-element list with the module-level
    ``tokenizer``, truncating and padding to ``MAX_LENGTH``.

    Returns whatever the tokenizer returns for that one-element list.
    """
    prompt_section = " # Prompt" + "\n" + example['prompt']
    answer_a_section = "# Answer A" + "\n" + example['response_a']
    answer_b_section = "# Answer B" + "\n" + example['response_b']
    # Sections are separated by one blank line.
    full_text = prompt_section + "\n\n" + answer_a_section + "\n\n" + answer_b_section

    return tokenizer(
        [full_text],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )

In [3]:
# Load the evaluation frame (presumably a validation split, judging by the
# file name). The commented-out variants below subsample it for quick runs.
test = pd.read_csv('dataset/random_valid.csv')#.sample(5).reset_index(drop = True)
#test = test.loc[:100,:].reset_index(drop=True)
#sample_sub = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

In [19]:
#concatenate strings in list
def process(input_str):
    """Flatten a serialized list of strings such as '["a","b"]' into one string.

    The surrounding square brackets are stripped, the remainder is split on
    the '","' separator, stray double quotes are trimmed from each piece, and
    the pieces are joined with single spaces. Escape sequences inside the
    text are left exactly as they are.

    Args:
        input_str: the raw cell value. Non-string values (for example NaN or
            None produced by ``pd.read_csv`` for empty cells) are accepted.

    Returns:
        The flattened text, or the literal string 'None' when the value is
        not a string or is shorter than 10 characters.
    """
    # Missing cells arrive as float NaN / None; calling len() on them would
    # raise TypeError, so map them to the same placeholder as short inputs.
    if not isinstance(input_str, str) or len(input_str) < 10:
        return 'None'

    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return ' '.join(sentences)


# Flatten the three text columns in place with process().
# NOTE(review): this overwrites the raw columns, so re-running the cell
# applies process() a second time to already-flattened text.
test.loc[:, 'prompt'] = test['prompt'].apply(process)
test.loc[:, 'response_a'] = test['response_a'].apply(process)
test.loc[:, 'response_b'] = test['response_b'].apply(process)

In [20]:
test

Unnamed: 0.1,Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,label
0,0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,0
1,1,1256092,claude-2.1,vicuna-13b,Write a python code that calculates sum of 5 a...,Here is the python code that calculates the su...,Here is a Python code that calculates the sum ...,0,0,1,2
2,2,3258431,stablelm-tuned-alpha-7b,vicuna-13b,The following is a command that the user is ab...,,,1,0,0,0
3,3,4186011,palm-2,gpt-4-1106-preview,how can I write a function in Rust that revers...,```rust\nfn reverse_string(s: &str) -> String ...,"In Rust, you can reverse a string by using the...",0,1,0,1
4,4,5717448,gpt-3.5-turbo-1106,claude-2.1,Write a haiku about underwear,"Underwear snug fit,\nSoft fabric against the s...",Here is a haiku about underwear:\n\nCovering d...,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2869,2869,4287311513,mistral-7b-instruct,llama-2-70b-chat,"Write me a animal hybrid fusion prompt , descr...",Imagine a world where animals have fused with ...,Sure! Here's a prompt for a animal hybrid fusi...,0,1,0,1
2870,2870,4289792977,codellama-34b-instruct,gpt-4-0613,i need to add a file upload field to a model i...,"To add a file upload field to a model in Odoo,...","In Odoo, you can add a file upload field to a ...",1,0,0,0
2871,2871,4291367819,zephyr-7b-beta,wizardlm-13b,"Continue the text: \""Ukraine (\/ju\u02d0\u02c8...","To the south, Ukraine shares a maritime border...",and the Sea of Azov to the south and southeast...,0,1,0,1
2872,2872,4292709507,llama-2-70b-chat,gpt-3.5-turbo-0613,"Answer the following statements with \""Agree\""...","Sure, here are my answers:\n\n1. Disagree\n2. ...",1. Agree\n2. Agree\n3. Agree\n4. Disagree\n5. ...,0,1,0,1


In [21]:
from tqdm import tqdm
def inference(model, test_dataloader):
    """Run ``model`` over every batch and collect class probabilities.

    Args:
        model: callable that accepts the batch tensors as keyword arguments
            and returns an object exposing ``.logits``.
        test_dataloader: iterable of dict-like batches of tensors.

    Returns:
        A list with one float32 NumPy array per batch, holding the softmax of
        the logits over the last axis.

    Tensors are moved to the module-level ``device``. The caller's batch
    object is not modified.
    """
    test_predictions = []
    for batch in tqdm(test_dataloader):
        # Build a fresh mapping instead of overwriting the batch in place.
        inputs = {key: batch[key].to(device) for key in batch.keys()}
        with torch.no_grad():
            logits = model(**inputs).logits
            # Upcast before softmax: half-precision logits would otherwise
            # yield half-precision probabilities, which is too coarse for a
            # log-loss computed downstream.
            probabilities = torch.softmax(logits.float(), dim=-1)
        test_predictions.append(probabilities.cpu().numpy())
    return test_predictions

In [22]:
# Target device for the model and for every batch moved inside inference().
device = torch.device("cuda:0")

In [23]:
# Base weights that the classifier is loaded from.
base_model = 'meta-llama/llama-3-transformers-8b-hf-v1'
# Checkpoint directory used for both the tokenizer and the PEFT adapter.
model_path = "output/warm-breeze-102/checkpoint-1200"
# Token budget per example; preprocess() truncates and pads to this length.
MAX_LENGTH = 1500

In [24]:
# The tokenizer comes from the fine-tuned checkpoint directory, not from the
# base model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 4-bit NF4 quantization with float16 compute and double quantization off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)
# Three-way sequence classifier on top of the quantized base weights.
# NOTE(review): the load log says `score.weight` is newly initialized here, so
# usable predictions depend on the adapter checkpoint restoring the trained
# classification head — confirm the checkpoint actually contains it.
base_model_0 = LlamaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map=device,
    # NOTE(review): the load log reports that this argument is ignored for
    # non-Auto classes.
    trust_remote_code=True)
# Keep the model's pad token and embedding table in step with the tokenizer.
base_model_0.config.pad_token_id = tokenizer.pad_token_id
base_model_0.resize_token_embeddings(len(tokenizer))
new_model = model_path
# Attach the PEFT adapter stored in the checkpoint directory.
model0 = PeftModel.from_pretrained(base_model_0, new_model).to(device)
#model0 = model0.merge_and_unload()
model0.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/llama-3-transformers-8b-hf-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128257, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
          

In [None]:
# Wrap the frame in a Hugging Face dataset and tokenize row by row; every
# original column is dropped, leaving only the tokenizer outputs.
dataset = datasets.Dataset.from_pandas(test)
#['id', 'prompt', 'response_a', 'response_b']
tokenized_dataset = dataset.map(preprocess, remove_columns=test.columns.tolist())

# preprocess() already pads each row to MAX_LENGTH, so the collator mainly
# converts to tensors and normalises shapes. Order is preserved
# (shuffle=False) so predictions line up with the rows of `test`.
data_collator = DataCollatorForClassification(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

# One probability array per batch.
sub_pred = inference(model = model0, test_dataloader = test_dataloader)

Map:   0%|          | 0/2874 [00:00<?, ? examples/s]

  0%|          | 0/2874 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  1%|          | 27/2874 [00:20<35:36,  1.33it/s]

In [None]:
# Stack the per-batch probability arrays row-wise into a single matrix and
# bind it to `prediction`, the name the log-loss cell reads.
prediction = np.vstack(sub_pred)

In [4]:
from sklearn.metrics import log_loss

In [8]:
# Replace `prediction` with probabilities read back from disk.
# NOTE(review): no visible cell writes inference_on_test.csv — confirm where
# the file comes from and that its rows are in the same order as `test`.
prediction = np.array(pd.read_csv("inference_on_test.csv"))

In [9]:
# Multi-class log loss of the predicted probabilities against the `label`
# column of the evaluation frame.
log_loss(test.label, prediction)



1.041973668626307

In [None]:
# NOTE(review): unexecuted cell holding only a float literal; it has no
# effect. Presumably a score pasted from another run — confirm or delete.
3.176938522630114

In [None]:
# Peek at the first 12 ground-truth labels.
test['label'][:12]