# 完全不一样

In [3]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from random import random, randint
from utils import load_json, load_split_data

import random
def seed_everything(seed=None):
    """
    Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic, non-benchmarking mode.

    :param seed: int or None. When None, or outside the uint32 range, a
        random seed inside that range is drawn instead.
    :return: int, the seed that was actually applied.
    """
    # Imported locally: the function writes to os.environ, but `os` is not
    # among the file-level imports, so the original raised NameError.
    import os

    max_seed_value = np.iinfo(np.uint32).max
    min_seed_value = np.iinfo(np.uint32).min

    if (seed is None) or not (min_seed_value <= seed <= max_seed_value):
        seed = random.randint(min_seed_value, max_seed_value)

    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects child processes started after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    return seed

In [2]:
# Load the training CSV and the external 33k arena CSV; each frame is passed
# through load_json (imported from utils) right after reading.
data = load_json(pd.read_csv('dataset/train.csv'))

ex_33 = load_json(pd.read_csv('dataset/lmsys-chatbot_arena_conversations-33k.csv'))

In [3]:
# 1. Find the rows of train that do not overlap with the 33k set.
# 2. Split the non-overlapping rows further.
#
# Each row gets a key made of the distinct elements of
# prompt + response_a + response_b. The key is a frozenset rather than a set:
# later cells call drop_duplicates() and isin() on this column, and both rely
# on hashing, which plain sets do not support.
data['set_prompt_response'] = [
    frozenset(row.prompt + row.response_a + row.response_b)
    for row in data.itertuples()
]

In [4]:
# Same row key as for `data`: the distinct elements of
# prompt + response_a + response_b. Stored as a frozenset so the column is
# hashable and can be used for hash-based lookups.
ex_33['set_prompt_response'] = [
    frozenset(row.prompt + row.response_a + row.response_b)
    for row in ex_33.itertuples()
]

In [5]:
# Positions of the rows in `data` whose key also occurs in ex_33.
# The keys of ex_33 are collected once into a Python set so each lookup is
# O(1); testing `key in ndarray` scans the whole array for every row.
# frozenset() makes the keys hashable whether the column holds sets or
# frozensets.
_ex_33_keys = {frozenset(key) for key in ex_33.set_prompt_response}
idx = [
    pos
    for pos, key in enumerate(data.set_prompt_response)
    if frozenset(key) in _ex_33_keys
]

In [6]:
# `idx` holds positions (it comes from enumerate), so the rows are selected by
# position. The original used them as index labels, which is only right
# while `data` carries a default 0..n-1 index.
_overlap_mask = np.zeros(len(data), dtype=bool)
_overlap_mask[idx] = True
same = data.iloc[idx, :].reset_index(drop=True)
not_same = data.loc[~_overlap_mask, :].reset_index(drop=True)

In [8]:
# Sanity check: the overlapping and non-overlapping rows together account for
# every row of `data`.
assert len(not_same) + len(same) == len(data)

In [9]:
# Distinct keys among the non-overlapping rows.
_distinct_keys = not_same['set_prompt_response'].drop_duplicates()
unique_sets = _distinct_keys.reset_index(drop=True)
# Shuffle the distinct keys with a fixed seed.
unique_sets = unique_sets.sample(frac=1, random_state=42)
unique_sets = unique_sets.reset_index(drop=True)
# Despite its name, `midpoint` is the 10% cut: the first tenth of the
# shuffled keys goes to set1, the rest to set2.
midpoint = len(unique_sets) // 10
set1, set2 = unique_sets.iloc[:midpoint], unique_sets.iloc[midpoint:]

In [10]:
# Pull the rows belonging to each half of the key split out of `not_same`.
_row_keys = not_same['set_prompt_response']
_in_set1 = _row_keys.isin(set1)
_in_set2 = _row_keys.isin(set2)
valid = not_same[_in_set1].reset_index(drop=True)
train_subset = not_same[_in_set2].reset_index(drop=True)
# Every non-overlapping row must land in exactly one of the two parts.
assert len(valid) + len(train_subset) == len(not_same)
assert len(valid) + len(train_subset) + len(same) == len(data)

In [11]:
# Training rows with the validation rows excluded.
train_exclude_valid = pd.concat([train_subset, same]).reset_index(drop=True)
# Leakage check: no validation key may also occur among the training keys.
# The original computed this overlap count with a quadratic scan and then
# discarded the result; it is now asserted, using O(1) set lookups.
_train_keys = {frozenset(key) for key in train_exclude_valid.set_prompt_response}
assert not any(frozenset(key) in _train_keys for key in valid.set_prompt_response)
assert len(valid) + len(train_exclude_valid) == len(data)

In [12]:
# Training subset plus the whole external 33k set, renumbered from 0.
train_33k = pd.concat([train_subset, ex_33], ignore_index=True)

In [18]:
# The helper key column is no longer needed; drop it from every split.
train_subset, valid, train_exclude_valid, train_33k = (
    frame.drop(columns=['set_prompt_response'])
    for frame in (train_subset, valid, train_exclude_valid, train_33k)
)

In [19]:
# Write the four splits with the default orient ("columns").
# `index=False` has been removed: that orient always writes the index, and
# depending on the pandas version the argument is either rejected with a
# ValueError or silently ignored, so dropping it keeps the files unchanged.
train_subset.to_json("dataset/non_overlap/train_subset.json")
valid.to_json("dataset/non_overlap/valid.json")
train_exclude_valid.to_json("dataset/non_overlap/train_exclude_valid.json")
train_33k.to_json("dataset/non_overlap/train_33k.json")

In [20]:
# Check: read the four saved splits back from disk.
train_subset, valid, train_exclude_valid, train_33k = (
    pd.read_json(path)
    for path in (
        "dataset/non_overlap/train_subset.json",
        "dataset/non_overlap/valid.json",
        "dataset/non_overlap/train_exclude_valid.json",
        "dataset/non_overlap/train_33k.json",
    )
)

In [21]:
def get_set_prompt_response(data):
    """
    Add a 'set_prompt_response' column holding one set per row.

    For each row, ``prompt + response_a + response_b`` is evaluated and the
    result is turned into a set (presumably the three fields are lists of
    strings; confirm against the loader).

    :param data: DataFrame with prompt, response_a and response_b columns.
        It is modified in place.
    :return: the same DataFrame object, with the new column added.
    """
    data['set_prompt_response'] = [
        set(row.prompt + row.response_a + row.response_b)
        for row in data.itertuples()
    ]
    return data

In [22]:
# Rebuild the key column on each reloaded split.
train_subset, valid, train_exclude_valid, train_33k = map(
    get_set_prompt_response,
    (train_subset, valid, train_exclude_valid, train_33k),
)

In [23]:
# valid must not overlap with any of the training splits.
# The validation keys are hashed once and each split is tested with
# isdisjoint, replacing three quadratic `key in ndarray` scans.
_valid_keys = {frozenset(key) for key in valid.set_prompt_response}
for _frame in (train_subset, train_exclude_valid, train_33k):
    assert _valid_keys.isdisjoint(
        frozenset(key) for key in _frame.set_prompt_response
    )

In [26]:
# Peek at the element at position 1 of the first row's prompt.
train_33k.prompt.values[0][1]

'OK, does pineapple belong on a pizza? Relax and give me fun answer.'

# 1M 与 train_33k和valid重复的

In [4]:
def get_set_prompt_response(data):
    """
    Attach a per-row key column named 'set_prompt_response'.

    The key of a row is ``set(prompt + response_a + response_b)``
    (presumably each field is a list of strings; confirm against the loader).

    :param data: DataFrame with prompt, response_a and response_b columns.
        The column is written onto this object.
    :return: the DataFrame that was passed in.
    """
    def _row_key(row):
        # Distinct elements of the three concatenated fields.
        return set(row.prompt + row.response_a + row.response_b)

    keys = list(map(_row_key, data.itertuples()))
    data['set_prompt_response'] = keys
    return data

In [24]:
# Check: reload the saved splits and the external candidate data.
train_subset = pd.read_json("dataset/non_overlap/train_subset.json")
valid = pd.read_json("dataset/non_overlap/valid.json")
train_exclude_valid = pd.read_json("dataset/non_overlap/train_exclude_valid.json")
train_33k = pd.read_json("dataset/non_overlap/train_33k.json")
# The original first read "dataset/1M/35k_in_1M.json" into ex_data and then
# overwrote it on the very next line; that unused read has been removed.
ex_data = pd.read_parquet("dataset/1M/filtered_df.parquet")

In [31]:
def drop_dup_hate(data):
    """
    Keep only rows that compare two different models and whose two responses
    are both categorised as 'Good'.

    The original also built a 'set_prompt_response' column and dropped it
    again without using it; that dead work is gone. As before, the result
    never carries that column and the input frame is left untouched.
    NOTE(review): despite the name, no de-duplication happens here.

    :param data: DataFrame with model_a, model_b, response_a_categories and
        response_b_categories columns.
    :return: a new filtered DataFrame with a fresh 0..n-1 index.
    """
    distinct_models = data.model_a != data.model_b
    both_good = (data.response_a_categories == 'Good') & (
        data.response_b_categories == 'Good'
    )
    kept = data.loc[distinct_models & both_good].reset_index(drop=True)
    # errors='ignore' keeps the old guarantee that the helper column is absent
    # even when the caller's frame already had one.
    return kept.drop(columns=['set_prompt_response'], errors='ignore')

In [32]:
# Filter the external data down to rows with two different models and two
# 'Good' responses.
ex_data = drop_dup_hate(ex_data)

In [34]:
# Rebuild the key column on every frame that takes part in the overlap checks.
train_subset, valid, train_exclude_valid, train_33k, ex_data = map(
    get_set_prompt_response,
    (train_subset, valid, train_exclude_valid, train_33k, ex_data),
)

In [39]:
# Number of external rows whose key also occurs in train_subset.
# Uses a hashed key set instead of scanning the whole array for every row.
_train_subset_keys = {frozenset(key) for key in train_subset.set_prompt_response}
sum(frozenset(key) in _train_subset_keys for key in ex_data.set_prompt_response)

1

In [35]:
# Positions of the external rows whose key also occurs in valid.
# Uses a hashed key set instead of scanning the whole array for every row.
_valid_keys = {frozenset(key) for key in valid.set_prompt_response}
in_valid = [
    pos
    for pos, key in enumerate(ex_data.set_prompt_response)
    if frozenset(key) in _valid_keys
]

In [36]:
# How many external rows collide with the validation split.
len(in_valid)

0

In [37]:
# Remove the external rows that collide with the validation split.
_not_in_valid = ~ex_data.index.isin(in_valid)
ex_data = ex_data.loc[_not_in_valid].reset_index(drop=True)

In [38]:
# `index=False` has been removed: the default orient ("columns") always writes
# the index, and depending on the pandas version the argument is either
# rejected with a ValueError or silently ignored.
# NOTE(review): ex_data still carries the helper 'set_prompt_response' column
# at this point, so it is written to the file as well; confirm that is wanted.
ex_data.to_json("dataset/1M/3k_high_quality_method_2.json")

In [42]:
# Display the filtered external frame.
ex_data

Unnamed: 0,id,prompt,model_a,model_b,response_a,response_b,response_a_categories,response_b_categories,set_prompt_response
0,1890,[She felt something and likes me but loves oth...,stablelm-tuned-alpha-7b,vicuna-13b,[It sounds like you and the other person have ...,[It's difficult to say whether it was a smart ...,Good,Good,{It sounds like you and the other person have ...
1,1891,[She felt something and likes me but loves oth...,stablelm-tuned-alpha-7b,koala-13b,[It sounds like you and the other person have ...,[It's understandable that you would feel angry...,Good,Good,{It sounds like you and the other person have ...
2,2238,[write a single dot.],alpaca-13b,vicuna-13b,[.],[.],Good,Good,"{write a single dot., .}"
3,2946,[what is your thoughts on sex?],stablelm-tuned-alpha-7b,fastchat-t5-3b,"[As an AI language model, I do not have person...","[As an AI language model, I don't have persona...",Good,Good,"{what is your thoughts on sex?, As an AI langu..."
4,2022,[Write a single # character],koala-13b,claude-1,[\#],[#],Good,Good,"{#, Write a single # character, \#}"
...,...,...,...,...,...,...,...,...,...
2514,2831,[Explain what CHOAM is from the Dune book seri...,RWKV-4-Raven-14B,vicuna-13b,[CHOAM is a powerful corporation in the Dune u...,"[In the Dune book series by NAME_1, CHOAM is a...",Good,Good,{Explain what CHOAM is from the Dune book seri...
2515,3033,[How do I determine the angular momentum of th...,vicuna-13b,claude-instant-1,[The angular momentum of the Sun is determined...,[The angular momentum of an object is calculat...,Good,Good,{The angular momentum of an object is calculat...
2516,3118,"[we are the 6 May 2023, add 3 weeks]",koala-13b,claude-1,"[Sure, I'm sorry but I am not able to calculat...","[3 weeks from May 6, 2023 is May 27, 2023.]",Good,Good,"{3 weeks from May 6, 2023 is May 27, 2023., Su..."
2517,2918,"[* You are a product owner, and you will like ...",vicuna-7b,alpaca-13b,[Here's an example of a high-level product bac...,[• Support the Branch Relationship Managers (B...,Good,Good,{Here's an example of a high-level product bac...


In [41]:
# Number of external rows whose key also occurs in train_exclude_valid.
# Uses a hashed key set instead of scanning the whole array for every row.
_train_exclude_valid_keys = {
    frozenset(key) for key in train_exclude_valid.set_prompt_response
}
sum(
    frozenset(key) in _train_exclude_valid_keys
    for key in ex_data.set_prompt_response
)

54

# 1M 伪标签

In [23]:
# Load the model predictions used for pseudo-labelling.
pre = pd.read_csv('dataset/1M/15k_preds.csv')

In [25]:
# Load the external rows that the predictions are later merged with on 'id'.
ex_data = pd.read_json("dataset/1M/15k_in_1M_only_english.json")

In [26]:
# The predictions file and the data file must have the same number of rows.
assert len(pre) == len(ex_data)

In [27]:
# Display the prediction frame.
pre

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,00004a7ca3,0.219152,0.568438,0.212410
1,0005fc6670,0.762098,0.092453,0.145449
2,000c92b58d,0.950595,0.011966,0.037439
3,000e2d2951,0.197311,0.617332,0.185357
4,0012aa0a79,0.445131,0.213576,0.341293
...,...,...,...,...
15008,fff80913e5,0.414370,0.247432,0.338199
15009,fff8645269,0.871195,0.014529,0.114276
15010,fff9e88318,0.370582,0.138476,0.490942
15011,fffa956330,0.686391,0.040582,0.273027


In [50]:
# Keep only the rows in which the two compared models differ.
_models_differ = ex_data.model_a != ex_data.model_b
ex_data = ex_data.loc[_models_differ].reset_index(drop=True)

In [57]:
# Attach the predicted probabilities to the rows by id; ids missing on either
# side are dropped by the inner join.
final = pd.merge(ex_data, pre, how='inner', on=['id'])

In [102]:
# Keep the rows where at least one of the three predicted probabilities
# reaches the confidence threshold.
# NOTE(review): `filter` shadows the builtin; the name is kept because later
# cells refer to it.
threshold1 = 0.7
_prob_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']
filter_list = final[_prob_cols].ge(threshold1).any(axis=1)
filter = final.loc[filter_list, :].reset_index(drop=True)
filter

Unnamed: 0,id,prompt,model_a,model_b,response_a,response_b,set_prompt_response,winner_model_a,winner_model_b,winner_tie
0,ccf8a9653a,[1. The Most Intelligent Prince\n\nA king want...,fastchat-t5-3b,vicuna-33b,"[As the most intelligent prince, I think my ha...",[My hat is black.\n\nReasoning: Since I can se...,[1. The Most Intelligent Prince\n\nA king want...,0.052349,0.213616,0.734035
1,a593212996,[A chat between a curious user and an artifici...,vicuna-7b,mpt-7b-chat,[Product Name: IQOO Neo 6 5G\n\nList Price: ₹3...,"[Sure, I'd be happy to help you with that. Can...",[A chat between a curious user and an artifici...,0.946193,0.005203,0.048604
2,ac3403b4bb,[According to the following sentiment elements...,guanaco-33b,vicuna-33b,[Here are the sentiment elements:\n\n'never ag...,"[[('Starcraft', 'awesome', 'drinks style_optio...","[[('Starcraft', 'awesome', 'drinks style_optio...",0.012262,0.873199,0.114539
3,38b40468a1,[Act as a customer service rep and respond to ...,vicuna-13b,llama-13b,[Hello! I'd be happy to help you with accessin...,[I can access your account information and see...,[Act as a customer service rep and respond to ...,0.810281,0.061513,0.128205
4,ad06742cff,[Answer the following question using ONLY the ...,chatglm-6b,llama-13b,"[Yes, there is a strong inference that at leas...","[I'm sorry, I didn't catch that.]","[I'm sorry, I didn't catch that., Yes, there i...",0.879896,0.013361,0.106743
...,...,...,...,...,...,...,...,...,...,...
4560,48a9ff6e35,[write me an HTML page which contains a button...,vicuna-33b,guanaco-33b,[Here's a simple HTML page that contains a but...,"[Sure, here is an example HTML code for a butt...",[Here's a simple HTML page that contains a but...,0.887580,0.036067,0.076353
4561,3cecab6f4a,"[write python program , using simple-salesforc...",vicuna-13b,llama-13b,[Here's a sample Python program using the simp...,[I was able to find this article that might he...,"[write python program , using simple-salesforc...",0.755829,0.053068,0.191103
4562,6fc3fca875,[write the first 3 paragraph of a novel about ...,llama-2-7b-chat,llama-2-13b-chat,"[Ah, the birth of cool jazz! *adjusts sunglass...",[Sure thing! Here's my attempt at writing the ...,[write the first 3 paragraph of a novel about ...,0.100774,0.733078,0.166148
4563,6d93e7deee,"[“NAME_1 is on trial, she (supposed) to have k...",koala-13b,alpaca-13b,[This is a completely baseless and false state...,[NAME_1 is not on trial and has not been charg...,[This is a completely baseless and false state...,0.169872,0.114941,0.715187


In [103]:
def get_label(row):
    """
    Return the letter of the highest-scoring outcome in a row.

    :param row: mapping-like object (e.g. a DataFrame row) indexable by
        'winner_model_a', 'winner_model_b' and 'winner_tie'.
    :return: 'A', 'B' or 'C' for winner_model_a, winner_model_b or winner_tie
        respectively. When scores tie, the first maximum in that order wins.
    """
    options = ['winner_model_a', 'winner_model_b', 'winner_tie']
    scores = [row[option] for option in options]
    # list.index returns the first position of the maximum.
    return 'ABC'[scores.index(max(scores))]

In [104]:
# Label each row with its highest-scoring outcome, then zero the three
# probability columns so they can be rewritten as one-hot flags.
filter['label'] = filter.apply(get_label, axis=1)
for _col in ("winner_model_a", "winner_model_b", "winner_tie"):
    filter[_col] = 0

In [105]:
# Set the one-hot flag that matches each row's label.
for _tag, _col in (
    ('A', "winner_model_a"),
    ('B', "winner_model_b"),
    ('C', "winner_tie"),
):
    filter.loc[filter.label == _tag, _col] = 1

In [106]:
# The temporary label column is no longer needed once the flags are set.
filter = filter.drop(columns = ['label'])

In [108]:
# `index=False` has been removed: the default orient ("columns") always writes
# the index, and depending on the pandas version the argument is either
# rejected with a ValueError or silently ignored.
filter.to_json('dataset/1M/15k_in_1M_only_english_thr70_5k.json')

In [None]:
# Spot-check one pseudo-labelled row: print the first element of its prompt.
i = 2846
print(filter.loc[i,'prompt'][0])

In [None]:
# First element of response_a for the same row `i`.
print(filter.loc[i,'response_a'][0])

In [None]:
# First element of response_b for the same row `i`.
print(filter.loc[i,'response_b'][0])