In [6]:
import argparse
from typing import Optional, Union

import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from dataclasses import dataclass

import datasets
from datasets import Dataset
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import log_loss

from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

from peft import (
    get_peft_config,
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig,
    TaskType,
)
import os

import random
from random import randint
def seed_everything(seed=None):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and switches cuDNN to deterministic, non-benchmark mode.

    :param seed: int or None. When ``None`` or outside the uint32 range, a
        seed is drawn at random from that range instead.
    :return: the seed that was actually applied.
    """
    uint32_limits = np.iinfo(np.uint32)
    lowest, highest = uint32_limits.min, uint32_limits.max

    seed_is_usable = seed is not None and lowest <= seed <= highest
    if not seed_is_usable:
        seed = random.randint(lowest, highest)

    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    return seed


seed_everything(42)

from utils import load_split_data, load_json

In [7]:
from torch.utils.data import Dataset
class InstructionDataSet(Dataset):
    """Map-style dataset that renders each row of ``data`` into the comparison prompt.

    Every item is assembled at token level as
    ``header + prompt_response + options + model-turn marker`` and then decoded
    back to text, so the value stored under ``"input_ids"`` is a *string*
    (the collate function re-tokenizes it per batch).

    :param data: DataFrame with at least the columns ``id`` and ``prompt_response``.
    :param tokenizer: callable tokenizer that returns a mapping with ``input_ids``
        and provides ``decode``.
    :param max_source_length: ``max_length`` used when tokenizing ``prompt_response``.
    :param max_target_length: stored on the instance; not used when building items.
    """

    _HEADER = "<start_of_turn>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n\n"
    _OPTIONS = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<end_of_turn>\n"
    _MODEL_TURN = "<start_of_turn>model\n"

    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # The template pieces are constant, so tokenize them once here instead
        # of on every __getitem__ call.
        self._prefix_ids = self._encode(self._HEADER)
        # [1:] drops the first token of each later piece so that only the header
        # keeps it (presumably the BOS token added by add_special_tokens=True).
        self._suffix_ids = self._encode(self._OPTIONS)[1:] + self._encode(self._MODEL_TURN)[1:]

    def _encode(self, text, **tokenizer_kwargs):
        """Tokenize ``text`` with special tokens and no padding; return the id list."""
        return self.tokenizer(text=text, add_special_tokens=True, padding=False,
                              **tokenizer_kwargs)['input_ids']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids": <prompt text>, "id": <row id>}`` for the row at position ``index``."""
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        row = self.data.iloc[index]
        body_ids = self._encode(row['prompt_response'], truncation=True,
                                max_length=self.max_source_length)[1:]
        input_ids = self._prefix_ids + body_ids + self._suffix_ids
        return {
            "input_ids": self.tokenizer.decode(input_ids, skip_special_tokens=False),
            "id": row['id'],
        }

from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

def collate_fn(batch):
    """Tokenize a list of dataset items into one padded batch.

    Each item is a dict with the prompt text under ``'input_ids'`` and the
    sample id under ``'id'``. Uses the module-level ``tokenizer`` and
    ``MAX_LENGTH``.

    :return: ``(encoded_batch, ids)`` where ``ids`` keeps the batch order.
    """
    # NOTE(review): the texts were decoded with special tokens kept and are
    # tokenized here with add_special_tokens=True again - confirm this matches
    # the formatting used at training time.
    prompt_texts = [sample['input_ids'] for sample in batch]
    sample_ids = [sample['id'] for sample in batch]
    encoded = tokenizer(
        prompt_texts,
        padding='longest',
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
        max_length=MAX_LENGTH + 50
    )
    return encoded, sample_ids

In [2]:
from utils import load_split_data
# Data-loading configuration for the inference run. The meaning of each
# positional argument is defined by utils.load_split_data, which is not shown here.
data_path = "dataset/1M/35k_in_1M.json"
prompt_type = 3
MAX_INPUT = 1900
if_train = False
split = False
if_drop_duplicate = False
keep = 'last'
# Pass `keep` rather than repeating the 'last' literal, so that editing the
# configuration above actually changes the call.
df_train, df_valid = load_split_data(data_path, prompt_type, MAX_INPUT, if_train, split, False, if_drop_duplicate, keep)
# `test` is an alias of df_train (same object), not a copy.
test = df_train

100%|██████████| 37885/37885 [00:16<00:00, 2277.73it/s]


In [9]:
# Reload the raw JSON behind data_path for ad-hoc inspection.
tmp = pd.read_json(data_path)

34348    [Step 1. You are a senior JAVA CRUD engineer a...
Name: prompt, dtype: object

In [12]:
# Count the raw rows carrying this id.
len(tmp.loc[tmp.id == '4af73ffd64'].prompt)

1

In [13]:
# Character length of every rendered prompt/response string.
test['length'] = test['prompt_response'].map(len)

In [14]:
# Longest samples first; the index is renumbered 0..n-1 after sorting.
test = (
    test
    .sort_values('length', ascending=False)
    .reset_index(drop=True)
)
test

Unnamed: 0,id,prompt_response,length
0,4af73ffd64,"#Prompt\nThe code is incomplete, please comple...",13398
1,519567d2a4,#Prompt\nGiven two datasets with attributes ['...,13106
2,26dc950ef0,#Prompt\nExtract information as it is from tex...,12989
3,5abb109a25,#Prompt\nMake a Roblox Script. If you are pres...,11427
4,d56ad2e1d2,"#Prompt\nGiven two datasets with attributes ""m...",11248
...,...,...,...
34935,689b1a79fb,"#Prompt\nWrite out the numbers from 1 to 10, b...",213
34936,342742f88d,#Prompt\nWho was the drummer of Pearl Jam betw...,213
34937,657f7b3f2a,#Prompt\nwhat is heavier: 1 kilo of iron or 2 ...,209
34938,6b6ff2dd15,#Prompt\nWhat's the sum of all integer values ...,195


In [15]:
from tqdm import tqdm
def inference(model, test_dataloader, option_token_ids=None, target_device=None):
    """Score the answer options for every sample using the first generated token.

    For each batch one new token is generated with ``output_scores=True``; the
    scores of the option tokens are gathered and turned into probabilities with
    a softmax over those options only.

    :param model: object exposing ``generate(**inputs, ...)`` whose result has
        a ``scores`` sequence (first entry = scores of the first new token).
    :param test_dataloader: iterable of ``(batch_input, ids)`` pairs, where
        ``batch_input`` maps names to tensors.
    :param option_token_ids: sequence of token-id lists, one per option.
        Defaults to the module-level ``(A_TOKEN_IDS, B_TOKEN_IDS, C_TOKEN_IDS)``.
    :param target_device: device the inputs are moved to. Defaults to the
        module-level ``device``.
    :return: list of ``[id, probabilities]`` pairs in dataloader order, where
        ``probabilities`` is a float32 NumPy vector with one entry per option
        token (three entries if each option is a single token).
    """
    # Resolve the notebook globals lazily so existing two-argument calls keep working.
    if option_token_ids is None:
        option_token_ids = (A_TOKEN_IDS, B_TOKEN_IDS, C_TOKEN_IDS)
    if target_device is None:
        target_device = device

    test_predictions = []
    for batch_input, sample_ids in tqdm(test_dataloader):
        model_inputs = {name: tensor.to(target_device) for name, tensor in batch_input.items()}
        with torch.no_grad():
            response = model.generate(**model_inputs, max_new_tokens=1,
                                      return_dict_in_generate=True, output_scores=True)
            first_token_scores = response.scores[0]
            option_scores = torch.cat(
                [first_token_scores[:, token_ids] for token_ids in option_token_ids], dim=-1
            )
            # Softmax in float32: the model may emit half-precision scores, which
            # costs precision and is not supported by every CPU build.
            probabilities = torch.softmax(option_scores.float(), dim=-1).cpu().numpy()
        test_predictions.extend(
            [sample_id, probabilities[row]] for row, sample_id in enumerate(sample_ids)
        )
    return test_predictions

In [16]:
# Inference configuration: target device, base checkpoint name, and the
# directory the tokenizer and adapter are loaded from.
device = torch.device("cuda:0")
base_model = 'google/gemma-2-9b-it'
model_path = "output/morning-waterfall-460/checkpoint-5200_888"
# Token budget for the prompt/response text; the collate function allows
# MAX_LENGTH + 50 tokens for the full templated prompt.
MAX_LENGTH = 1900

In [17]:
# Tokenizer comes from the fine-tuned checkpoint directory and truncates from
# the left, i.e. over-long inputs lose their beginning rather than their end.
tokenizer = AutoTokenizer.from_pretrained(model_path, truncation_side = 'left')
config = AutoConfig.from_pretrained(base_model, trust_remote_code=True)

# 4-bit NF4 quantization with double quantization; matmuls computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
# Quantized base model; device placement is delegated to device_map="auto".
base_model_0 = AutoModelForCausalLM.from_pretrained(base_model,
                                                 config=config,
                                                 quantization_config=bnb_config,
                                                 torch_dtype=torch.float16,
                                                 device_map="auto",
                                                 trust_remote_code=True)
# base_model_0.config.pad_token_id = tokenizer.pad_token_id
# base_model_0.resize_token_embeddings(len(tokenizer))
new_model = model_path
# Attach the adapter stored in model_path on top of the quantized base model.
# NOTE(review): .to(device) follows device_map="auto"; confirm that forcing
# everything onto the single `device` is intended on multi-GPU hosts.
model0 = PeftModel.from_pretrained(base_model_0, new_model).to(device)
#model0 = model0.merge_and_unload()
# Evaluation mode; the cell's last expression also displays the model structure.
model0.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropo

In [18]:
# Token ids of the three answer letters. [1:] drops the first id produced with
# add_special_tokens=True (presumably BOS - verify for this tokenizer); each
# result stays a list, so a multi-token letter would yield more than one id.
A_TOKEN_IDS, B_TOKEN_IDS, C_TOKEN_IDS = (
    tokenizer(letter, add_special_tokens=True, truncation=True, max_length=1024)['input_ids'][1:]
    for letter in ('A', 'B', 'C')
)


In [19]:
# Wrap the length-sorted frame in the prompt dataset and batch it in order
# (no shuffling is requested), tokenizing each batch in collate_fn.
batch_size = 4
tokenized_dataset = InstructionDataSet(
    data=test,
    tokenizer=tokenizer,
    max_source_length=MAX_LENGTH,
    max_target_length=1,
)

test_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
)

In [None]:
# Run the full inference pass; sub_pred is a list of [id, probabilities] pairs.
sub_pred = inference(model = model0, test_dataloader = test_dataloader)

 97%|█████████▋| 8450/8735 [1:15:54<00:58,  4.88it/s]

In [15]:
# Flatten the predictions into rows of [id, p_a, p_b, p_tie].
# `inference` emits one flat [id, probabilities] pair per sample whatever the
# batch size is, so a single code path is correct for batch_size == 1 as well
# (the former special case unwrapped item[0] one level too deep).
# Ids that are plain Python values (e.g. str) have no .item(); they are kept as is.
processed_data = [
    [sample_id.item() if hasattr(sample_id, "item") else sample_id] + np.asarray(probs).tolist()
    for sample_id, probs in sub_pred
]

new_columns = ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
df = pd.DataFrame(processed_data, columns=new_columns)
# Average the probabilities of rows sharing an id; groupby returns them sorted by id.
df = df.groupby('id').mean().reset_index()

prediction = np.array(df[new_columns[1:]])
# Bring `test` to one row per id in the same id order as `df` / `prediction`.
test = test.drop_duplicates(subset = ['id']).reset_index(drop = True)
test = test.sort_values(by = ['id']).reset_index(drop = True)

In [16]:
# Reload the raw records so the predictions can be joined back onto them.
data = pd.read_json(data_path)

In [21]:
# Left-join the per-id mean probabilities onto the raw records.
final = data.merge(df, how = 'left', on = 'id')
# Sanity check: the join must keep the row count, and the prediction table must
# have as many rows as the raw data.
assert len(final) == len(data) == len(df)
final

Unnamed: 0,id,prompt,model_a,model_b,response_a,response_b,response_a_categories,response_b_categories,set_prompt_response,winner_model_a,winner_model_b,winner_tie
0,1890,[She felt something and likes me but loves oth...,stablelm-tuned-alpha-7b,vicuna-13b,[It sounds like you and the other person have ...,[It's difficult to say whether it was a smart ...,Good,Good,[It sounds like you and the other person have ...,0.145897,0.463661,0.390442
1,1891,[She felt something and likes me but loves oth...,stablelm-tuned-alpha-7b,koala-13b,[It sounds like you and the other person have ...,[It's understandable that you would feel angry...,Good,Good,[It sounds like you and the other person have ...,0.133975,0.274899,0.591126
2,2238,[write a single dot.],alpaca-13b,vicuna-13b,[.],[.],Good,Good,"[write a single dot., .]",0.026940,0.023406,0.949655
3,2946,[what is your thoughts on sex?],stablelm-tuned-alpha-7b,fastchat-t5-3b,"[As an AI language model, I do not have person...","[As an AI language model, I don't have persona...",Good,Good,"[what is your thoughts on sex?, As an AI langu...",0.609865,0.127835,0.262300
4,2022,[Write a single # character],koala-13b,claude-1,[\#],[#],Good,Good,"[#, Write a single # character, \#]",0.076143,0.117933,0.805924
...,...,...,...,...,...,...,...,...,...,...,...,...
2514,2831,[Explain what CHOAM is from the Dune book seri...,RWKV-4-Raven-14B,vicuna-13b,[CHOAM is a powerful corporation in the Dune u...,"[In the Dune book series by NAME_1, CHOAM is a...",Good,Good,[Explain what CHOAM is from the Dune book seri...,0.053634,0.775920,0.170447
2515,3033,[How do I determine the angular momentum of th...,vicuna-13b,claude-instant-1,[The angular momentum of the Sun is determined...,[The angular momentum of an object is calculat...,Good,Good,[The angular momentum of an object is calculat...,0.252608,0.423038,0.324354
2516,3118,"[we are the 6 May 2023, add 3 weeks]",koala-13b,claude-1,"[Sure, I'm sorry but I am not able to calculat...","[3 weeks from May 6, 2023 is May 27, 2023.]",Good,Good,"[3 weeks from May 6, 2023 is May 27, 2023., Su...",0.008541,0.927358,0.064101
2517,2918,"[* You are a product owner, and you will like ...",vicuna-7b,alpaca-13b,[Here's an example of a high-level product bac...,[• Support the Branch Relationship Managers (B...,Good,Good,[Here's an example of a high-level product bac...,0.550335,0.150453,0.299212


In [26]:
# Show which input file the predictions belong to.
# NOTE(review): the recorded output differs from the path assigned in the
# data-loading cell, which suggests the cells were executed out of order.
data_path

'dataset/1M/3k_high_quality_method_2.json'

In [27]:
# Persist the raw records together with the predicted probabilities.
# `index=False` was removed: pandas supports it only for certain orients
# ('split'/'table', plus 'records'/'values' in newer releases). With the default
# orient it raises ValueError on older pandas and has no effect otherwise, so the
# written file is unchanged wherever the original call succeeded.
final.to_json("dataset/persudo_label/3k_high_quality_method_2_prediction.json")

# Merge predictions with the ex70k labels (合并)

In [None]:
# Load the saved model predictions (CSV) and the ex70k dataset (JSON).
p = pd.read_csv("dataset/prediction.csv")
ex = pd.read_json("dataset/ex70k.json")

In [100]:
# Prefix the predicted-probability columns with "p_" so they do not clash with
# the label columns of `ex`, then place both frames side by side.
# NOTE(review): concat(axis=1) aligns on the row index, so this presumes `p` and
# `ex` list the same samples in the same order - confirm.
p = p.rename(columns={
    column: f"p_{column}"
    for column in ('winner_model_a', 'winner_model_b', 'winner_tie')
})
final = pd.concat([ex, p], axis=1).drop(columns=['id'])

In [101]:
def get_p_label(row):
    """Return the index of the largest predicted probability.

    0 = model A, 1 = model B, 2 = tie; on equal values the first index wins.
    """
    probabilities = (row.p_winner_model_a, row.p_winner_model_b, row.p_winner_tie)
    highest = max(probabilities)
    return probabilities.index(highest)

# Predicted class index per row (0 = model A, 1 = model B, 2 = tie).
final['p_label'] = final.apply(get_p_label, axis = 1)

def get_label(row):
    """Return the index of the last winner column whose value equals 1.

    0 = winner_model_a, 1 = winner_model_b, 2 = winner_tie. Raises IndexError
    when none of the three columns equals 1.
    """
    matches = []
    for position, column in enumerate(('winner_model_a', 'winner_model_b', 'winner_tie')):
        if row[column] == 1:
            matches.append(position)
    return matches[-1]

# Ground-truth class index per row, taken from the winner_* columns.
final['label'] = final.apply(get_label, axis = 1)

In [102]:
# Agreement subset: rows whose predicted class equals the ground-truth class,
# kept only if at least one predicted probability reaches threshold1.
threshold1 = 0.9
filter_same = final[final['p_label'] == final['label']].reset_index(drop=True)
filter_list = (
    filter_same[['p_winner_model_a', 'p_winner_model_b', 'p_winner_tie']]
    .ge(threshold1)
    .any(axis=1)
)
filter_same = filter_same[filter_list].reset_index(drop=True)

In [103]:
# Keep rows whose `difference` value is at least 1, plus every ground-truth tie.
filter_list = filter_same['difference'].ge(1) | filter_same['winner_tie'].eq(1)
filter_same = filter_same[filter_list].reset_index(drop=True)

In [104]:
# Disagreement subset: rows whose predicted class differs from the ground-truth
# class, kept only if at least one predicted probability reaches threshold2.
threshold2 = 0.6
filter_dif = final[final['p_label'] != final['label']].reset_index(drop=True)
filter_list = (
    filter_dif[['p_winner_model_a', 'p_winner_model_b', 'p_winner_tie']]
    .ge(threshold2)
    .any(axis=1)
)
filter_dif = filter_dif[filter_list].reset_index(drop=True)

In [105]:
# Keep rows whose `difference` value is at least 1, plus every ground-truth tie.
filter_list = filter_dif['difference'].ge(1) | filter_dif['winner_tie'].eq(1)
filter_dif = filter_dif[filter_list].reset_index(drop=True)


In [106]:
# Inspect the filtered disagreement subset.
filter_dif

Unnamed: 0,prompt,model_a,model_b,difference,winner_model_a,winner_model_b,winner_tie,response_a,response_b,p_winner_model_a,p_winner_model_b,p_winner_tie,p_label,label
0,"[ Given a sentence in the English language, tr...",llama-2-70b-chat,vicuna-33b,1.25,1,0,0,[I don't think that's an appropriate or respec...,"[वे भी क्रमिक त्वचा, निविधता स्वरूप अँवलोकन र ...",0.212914,0.104654,0.682431,2,0
1,"[ Given a sentence in the English language, tr...",starchat,alpaca-7b,2.00,1,0,0,"[In the Nepali language, the sentence would be...",[ये लाग्दे खोयेचे पीचोगियु र घोख्ने बजारोबोन ग...,0.255210,0.093090,0.651701,2,0
2,[!2 / 2write a screenplay in which trump and b...,gpt-4,wizardlm-7b,3.50,1,0,0,"[I'm sorry, but I cannot write a screenplay wi...",[Title: The Naughty Nazis\nFADE IN:\nEXT. PARI...,0.062963,0.847231,0.089806,1,0
3,[!I have lots of corrupted photos on my comput...,llama-2-70b-chat,bard,2.00,1,0,0,[I cannot provide a Python script to scan a fo...,"[I'm sorry, I can't help you with that. I'm no...",0.228464,0.089468,0.682068,2,0
4,[** Waking up late in the mornings may cause l...,mpt-30b-chat,wizardlm-13b,1.75,0,1,0,[Confidence: 90%\n\nAnswer:\n\n1. prison\n\nTh...,[Which pocket is most likely to contain a plan...,0.689259,0.095697,0.215044,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261,[write me a script to scrape the internet for ...,mpt-30b-chat,wizardlm-70b,0.00,0,0,1,[To scrape the internet for companies/organiza...,"[As an AI language model, I am not able to pro...",0.733839,0.096259,0.169902,0,2
3262,[write me an example code in C++ for rendering...,falcon-40b-instruct,wizardlm-7b,0.00,0,0,1,[Here's an example code for rendering a triang...,"[Unfortunately, as an AI language model, I can...",0.710349,0.074235,0.215417,0,2
3263,[write message reply for this message: \n\n Yo...,mpt-30b-chat,gpt-4,0.00,0,0,1,[Thank you for your help in getting a response...,"[Dear [User],\n\nThank you for your message an...",0.656775,0.175268,0.167957,0,2
3264,[you didn't list smartcare.com. can you provid...,llama-2-13b-chat,gpt-3.5-turbo,0.00,0,0,1,"[Of course! As a helpful and honest assistant,...",[I apologize for the oversight. Smartcare (sma...,0.121752,0.718776,0.159472,1,2


In [107]:
# Inspect the filtered agreement subset.
filter_same

Unnamed: 0,prompt,model_a,model_b,difference,winner_model_a,winner_model_b,winner_tie,response_a,response_b,p_winner_model_a,p_winner_model_b,p_winner_tie,p_label,label
0,[## How to write a C++ program to solve the eq...,gpt-4,vicuna-33b,3.50,1,0,0,"[However, this code only calculates a single s...",[Msg#: 1\nBranches: 1\n-----------------------...,0.901630,0.020033,0.078337,0,0
1,[## question\nAfter Jessica makes up with Bill...,ultralm-13b,wizardlm-7b,3.25,1,0,0,"[Mary Camden meets Carlos Rivera, a man she ha...",[There is no mention of Mary Camden marrying a...,0.926649,0.015279,0.058072,0,0
2,[(3 + 4i) * (5 - 6i) + 7i - 8=],ultralm-13b,gpt-3.5-turbo,4.00,0,1,0,[Certainly! I'd first like to simplify the giv...,"[To perform the calculation, we need to follow...",0.023027,0.912023,0.064950,1,1
3,[(4 x 8) ÷ 10 + 2=],vicuna-33b,wizardlm-7b,3.50,1,0,0,"[To calculate this expression, perform the ope...",[The answer is 2.2. \nTo solve this expression...,0.909157,0.022312,0.068531,0,0
4,"[(Q).\n""Kevork Ajemian"", given a list of categ...",falcon-40b-instruct,llama-2-70b-chat,3.00,0,1,0,"[""Athlete""> \nCan you please give me the answe...",[(A). Village],0.023844,0.904970,0.071185,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3381,[구글스프래드시드로 관심있는 분야의 유튜브 동영상을 검색하는 것을 만들고 싶어.\n...,falcon-40b-instruct,gpt-3.5-turbo,3.50,0,1,0,[Are you looking for YouTube videos related to...,[If you want to create a Google Spreadsheet th...,0.015519,0.927956,0.056525,1,1
3382,[너가 오랜 백엔드 경험이 있는 리액트 전문가로써 나와 대화하길 원해. 너의 목표는...,ultralm-65b,mpt-30b-chat,3.75,0,1,0,[로켓스테더링(rockestering)과 유지도(Upholding)을 위해 만들어진...,[React was created to improve the user interfa...,0.023282,0.922121,0.054597,1,1
3383,[딥러닝을 활용한 양자컴퓨팅 데이터 분석과 관련된 연구 주제로는 뭐가 있을까?\n\...,vicuna-33b,falcon-40b-instruct,3.25,1,0,0,[Here are some study subjects related to deep ...,[What specific topics or research topics are r...,0.917127,0.012752,0.070121,0,0
3384,[아래 책 내용을 요약해줘 - 저자 : 유발 하라리 - 제목 : 사피엔스 -...,falcon-40b-instruct,gpt-3.5-turbo,3.25,0,1,0,"[Yes, I can help you with that. Can you please...",[제목: 사피엔스 (Sapiens)\n저자: 유발 하라리 (Yuval Noah Ha...,0.020359,0.903353,0.076289,1,1


In [109]:
# Give every kept row a fresh random integer id.
# The ids are drawn without replacement from one shared pool, so they are unique
# within each frame and across the two frames (randint(...) + i could collide).
_n_dif, _n_same = len(filter_dif), len(filter_same)
_fresh_ids = random.sample(range(100, 1_000_000 + _n_dif + _n_same), _n_dif + _n_same)
filter_dif['id'] = _fresh_ids[:_n_dif]
filter_same['id'] = _fresh_ids[_n_dif:]

In [110]:
# Write both filtered subsets with the columns needed downstream; the threshold
# (as a percentage) is encoded in each file name.
# `index=False` was removed: pandas supports it only for certain orients
# ('split'/'table', plus 'records'/'values' in newer releases). With the default
# orient it raises ValueError on older pandas and has no effect otherwise, so the
# written files are unchanged wherever the original calls succeeded.
save_columns = ['prompt', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'response_a', 'response_b', 'id']
filter_dif[save_columns].to_json(f'dataset/70k_dif_thr{int(threshold2 * 100)}.json')
filter_same[save_columns].to_json(f'dataset/70k_same_thr{int(threshold1 * 100)}.json')

# Check for overlap with the valid set (检查是否与valid重复)

In [111]:
# Check: reload the validation split and the two saved filtered sets from disk.

valid = pd.read_json("dataset/non_overlap/valid.json")
filter_dif = pd.read_json("dataset/70k_dif_thr60.json")
filter_same = pd.read_json("dataset/70k_same_thr90.json")

In [114]:
def get_set_prompt_response(data):
    """Add a ``set_prompt_response`` column and return the same frame.

    For every row the ``prompt``, ``response_a`` and ``response_b`` values are
    concatenated and the result is turned into a set. ``data`` is modified in
    place.
    """
    data['set_prompt_response'] = [
        set(row.prompt + row.response_a + row.response_b)
        for row in data.itertuples()
    ]
    return data

In [115]:
# Attach the per-row set of prompt/response texts to all three frames.
valid = get_set_prompt_response(valid)
filter_dif = get_set_prompt_response(filter_dif)
filter_same = get_set_prompt_response(filter_same)

In [116]:
#valid和任何都不重合
assert len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in filter_dif.set_prompt_response.values]) == 0
assert len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in filter_same.set_prompt_response.values]) == 0

In [117]:
# Number of valid rows whose text set also occurs in filter_dif (expected: 0).
len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in filter_dif.set_prompt_response.values])

0