In [1]:
import argparse
from typing import Optional, Union

import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from dataclasses import dataclass

import datasets
from datasets import Dataset
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import log_loss

from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

from peft import (
    get_peft_config,
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig,
    TaskType,
)
import os

import random
def seed_everything(seed=None):
    '''
    固定seed
    :param seed: int, 随机种子
    '''
    max_seed_value = np.iinfo(np.uint32).max
    min_seed_value = np.iinfo(np.uint32).min

    if (seed is None) or not (min_seed_value <= seed <= max_seed_value):
        seed = random.randint(np.iinfo(np.uint32).min, np.iinfo(np.uint32).max)
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    return seed


seed_everything(42)

42

In [2]:
from torch.utils.data import Dataset
class InstructionDataSet(Dataset):
    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        super(InstructionDataSet, self).__init__()
        #self.data = data.sample(len(data), random_state=0).reset_index(drop=True)
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # self.A_token = self.tokenizer.encode(text='A', add_special_tokens=False, truncation=True, )
        # self.B_token = self.tokenizer.encode(text='B', add_special_tokens=False, truncation=True, )
        # self.C_token = self.tokenizer.encode(text='C', add_special_tokens=False, truncation=True, )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        now_data = self.data.loc[index]
        idx = now_data['id']
        templete_part1 = "<start_of_turn>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n\n"
        templete_part1_input_ids = self.tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']
        
        templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<end_of_turn>\n"
        templete_part2_input_ids = self.tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids'][1:]
        #print(f"templete_part2 is {templete_part2_input_ids}")
        templete_part3 = "<start_of_turn>model\n"
        templete_part3_input_ids = self.tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids'][1:]
        prompt_response = now_data['prompt_response']
        #print(f"id is {now_data['id']}")
        #print(prompt_response)
        prompt_response_ids = self.tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                          max_length=self.max_source_length, padding=False)['input_ids'][1:]
        
        input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids
        input_text = self.tokenizer.decode(input_ids, skip_special_tokens=False)
        #print(f"input is {self.tokenizer.decode(input_ids)}")
        return {
            "input_ids": input_text,
            "id": idx
        }

In [3]:
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
# @dataclass
# class DataCollatorForInstruction:
#     tokenizer: PreTrainedTokenizerBase
#     model: Optional[Any] = None
#     padding: Union[bool, str, PaddingStrategy] = True
#     max_length: Optional[int] = None
#     pad_to_multiple_of: Optional[int] = None
#     label_pad_token_id: int = -100
#     return_tensors: str = "pt"

#     def __call__(self, features, return_tensors=None):
#         if return_tensors is None:
#             return_tensors = self.return_tensors
#         labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
#         print(f"features is {features}")
#         # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
#         # same length to return tensors.

# #         features = self.tokenizer(
# #             features,
# #             padding='longest',
# #             max_length=MAX_LENGTH,
# #             pad_to_multiple_of=self.pad_to_multiple_of,
# #             return_tensors=return_tensors,
# #             truncation=True
# #         )

#         # prepare decoder_input_ids
#         if (
#                 labels is not None
#                 and self.model is not None
#                 and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
#         ):
#             decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
#             features["decoder_input_ids"] = decoder_input_ids
#         # breakpoint() # [(len(features[i]['input_ids']),len(features[i]['labels'])) for i in range(4)]
#         return features


def collate_fn(batch):
    batch = {k: [item[k] for item in batch] for k in ('input_ids','id')}
    #print(batch)
    batch_input = tokenizer(
        batch['input_ids'],
        padding='longest',
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=MAX_LENGTH + 50
    )
    return batch_input, batch['id']

In [4]:
from utils import load_split_data
data_path = "dataset/train.csv"
prompt_type = 2
MAX_INPUT = 2300
if_train = True
df_train , df_valid = load_split_data(data_path, prompt_type, MAX_INPUT, if_train, True)
test = df_valid

100%|██████████| 71514/71514 [00:22<00:00, 3209.00it/s]


In [42]:
from tqdm import tqdm
def inference(model, test_dataloader):
    test_predictions = []
    for batch in tqdm(test_dataloader):
        batch_input, idx = batch
        for k in batch_input.keys():
            batch_input[k] = batch_input[k].to(device)
        with torch.no_grad():
            #batch_input, idx = batch['input_ids'], batch['id']
            #batch_input = {"input_ids": batch_input}
            #logits = outputs.logits.cpu().detach().numpy()
            response = model.generate(**batch_input, max_new_tokens=1, return_dict_in_generate=True, output_scores=True)
            #batch_input['input_ids'].shape[-1] + 1
            #print(f"score {response.scores}")
            #print(f"score is {response.scores[0]}")
            #print(f"response is {response}")
            #redict = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
            score = response.scores[0]
            A_prob, B_prob, C_prob = score[:,A_TOKEN_IDS], score[:,B_TOKEN_IDS], score[:,C_TOKEN_IDS]
            print(f"A_prob shape is {A_prob.shape}")
            logits = torch.Tensor([[A_prob,B_prob,C_prob]])
            logits = torch.softmax(logits, dim=-1).cpu().numpy()
            print(f"logits is {logits.shape}")
            node_result = [[idx[i],logits[i]] for i in range(batch_size)]

        test_predictions.append(node_result)
    return test_predictions

In [6]:
device = torch.device("cuda:0")

In [18]:
base_model = 'google/gemma-2-9b-it'
model_path = "output/peachy-sun-253/checkpoint-100"
MAX_LENGTH = 2300

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_path, truncation_side = 'left')
config = AutoConfig.from_pretrained(base_model, trust_remote_code=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
base_model_0 = AutoModelForCausalLM.from_pretrained(base_model,
                                                 config=config,
                                                 quantization_config=bnb_config,
                                                 torch_dtype=torch.float16,
                                                 device_map="auto",
                                                 trust_remote_code=True)
# base_model_0.config.pad_token_id = tokenizer.pad_token_id
# base_model_0.resize_token_embeddings(len(tokenizer))
new_model = model_path
model0 = PeftModel.from_pretrained(base_model_0, new_model).to(device)
#model0 = model0.merge_and_unload()
model0.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (

In [20]:
A_TOKEN_IDS = tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids'][1:]
B_TOKEN_IDS = tokenizer('B',add_special_tokens=True, truncation=True, max_length=1024)['input_ids'][1:]
C_TOKEN_IDS = tokenizer('C',add_special_tokens=True, truncation=True, max_length=1024)['input_ids'][1:]


In [21]:
A_TOKEN_IDS,B_TOKEN_IDS,C_TOKEN_IDS

([235280], [235305], [235288])

In [11]:
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# tokenized_dataset = InstructionDataSet(test, tokenizer, MAX_LENGTH, 1)

# token_ids = tokenized_dataset[0]['input_ids']
# tokens = tokenizer.convert_ids_to_tokens(token_ids)

# for token, token_id in zip(tokens, token_ids):
#     print(f"Token: {token}, ID: {token_id}")

In [22]:
batch_size = 1
tokenized_dataset = InstructionDataSet(test, tokenizer, MAX_LENGTH, 1)

test_dataloader = torch.utils.data.DataLoader(tokenized_dataset, batch_size = batch_size ,collate_fn=collate_fn)

In [43]:
sub_pred = inference(model = model0, test_dataloader = test_dataloader)

  0%|          | 2/2874 [00:00<09:29,  5.04it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 4/2874 [00:00<07:56,  6.03it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 6/2874 [00:01<07:28,  6.40it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 8/2874 [00:01<07:12,  6.63it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 9/2874 [00:01<07:56,  6.01it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 11/2874 [00:01<09:22,  5.09it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  0%|          | 13/2874 [00:02<08:03,  5.92it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  1%|          | 15/2874 [00:02<08:13,  5.80it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)
A_prob shape is torch.Size([1, 1])
logits is (1, 3)


  1%|          | 16/2874 [00:02<08:32,  5.58it/s]

A_prob shape is torch.Size([1, 1])
logits is (1, 3)





KeyboardInterrupt: 

In [29]:
# 提取数据
processed_data = []
for item in sub_pred:
    item = item[0]
    id = item[0].item()  # 获取id
    array_values = item[1].tolist()  # 获取array并转换为列表
    processed_data.append([id] + array_values)

In [30]:
new_columns = ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
df = pd.DataFrame(processed_data, columns=new_columns)

In [31]:
df = df.groupby('id').mean().reset_index()

In [32]:
df

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,30192,0.964621,0.011230,0.024149
1,1256092,0.961766,0.011373,0.026860
2,3258431,0.972880,0.010640,0.016480
3,4186011,0.043130,0.839631,0.117239
4,5717448,0.863039,0.061549,0.075412
...,...,...,...,...
2869,4287311513,0.257485,0.545095,0.197420
2870,4289792977,0.260140,0.222510,0.517350
2871,4291367819,0.079588,0.767002,0.153410
2872,4292709507,0.028645,0.255313,0.716042


In [33]:
str2num = {'A':0, "B":1, "C":2}
test['label_number'] = test.label.map(str2num)
from sklearn.metrics import log_loss
prediction = np.array(df[new_columns[1:]])
test = test.drop_duplicates(subset = ['id']).reset_index(drop = True)
log_loss(test.label_number, prediction)



1.0588871269259368