# 完全不一样

In [1]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from random import random, randint
from utils import load_json, load_split_data

import random
def seed_everything(seed=None):
    """Seed the Python, NumPy and PyTorch RNGs and return the seed used.

    :param seed: int or None. When it is None or lies outside the uint32
        range, a random in-range seed is drawn instead.
    :return: int, the seed that was actually applied
    """
    # Imported locally so the function does not depend on file-level state:
    # `os` is not imported at the top of the file, and the name `random` is
    # bound there twice (first to the function, then to the module).
    import os
    import random

    uint32 = np.iinfo(np.uint32)
    if seed is None or not (uint32.min <= seed <= uint32.max):
        seed = random.randint(uint32.min, uint32.max)

    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    return seed

In [None]:
# Load the training CSV and pass it through the project's `load_json` helper
# (presumably this decodes JSON-encoded text columns into lists — TODO confirm
# against utils.load_json).
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

# Same treatment for the external 33k Chatbot Arena conversations file.
ex_33 = pd.read_csv('dataset/lmsys-chatbot_arena_conversations-33k.csv')
ex_33 = load_json(ex_33)

In [None]:
# Plan for this section:
#   1. Find the rows of train that do not also appear in the 33k file.
#   2. Split only that non-overlapping part into train / valid.
#
# Each row is keyed by the collection of its prompt and response items with
# order and repetition ignored. The key is a frozenset rather than a set:
# it still compares equal to an equivalent set, but it is hashable, which the
# later drop_duplicates() / isin() calls on this column rely on.
set_prompt_response = [
    frozenset(row.prompt + row.response_a + row.response_b)
    for row in data.itertuples()
]
data['set_prompt_response'] = set_prompt_response

In [None]:
# Key every 33k row by the set of its prompt and response items.
set_prompt_response = [
    set(rec.prompt + rec.response_a + rec.response_b)
    for rec in ex_33.itertuples()
]
ex_33['set_prompt_response'] = set_prompt_response

In [None]:
# Positions of the train rows whose key also occurs in the 33k file.
# The 33k keys are collected once into a hash set (normalised to frozenset so
# they are hashable); each lookup is then a hash probe instead of a scan of
# the whole 33k column for every train row.
ex_33_keys = {frozenset(key) for key in ex_33.set_prompt_response}
idx = [
    pos
    for pos, key in enumerate(data.set_prompt_response)
    if frozenset(key) in ex_33_keys
]

In [None]:
# Partition train into the rows shared with the 33k file and the rest.
in_33k = data.index.isin(idx)
same = data.loc[idx, :].reset_index(drop=True)
not_same = data.loc[~in_33k, :].reset_index(drop=True)

In [None]:
# The two partitions together must account for every train row.
assert len(not_same) + len(same) == len(data)

In [None]:
# Deduplicate the keys of the non-overlapping rows, then shuffle them
# reproducibly (fixed random_state).
unique_sets = (
    not_same['set_prompt_response']
    .drop_duplicates()
    .reset_index(drop=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Despite its name, `midpoint` marks the first tenth of the shuffled keys.
midpoint = len(unique_sets) // 10
set1, set2 = unique_sets.iloc[:midpoint], unique_sets.iloc[midpoint:]

In [None]:
# Pull the rows that belong to each key group back out of the original frame.
key_column = not_same['set_prompt_response']
valid = not_same[key_column.isin(set1)].reset_index(drop=True)
train_subset = not_same[key_column.isin(set2)].reset_index(drop=True)
# The two parts must cover not_same exactly, and with `same` cover all of data.
n_split = len(valid) + len(train_subset)
assert n_split == len(not_same)
assert n_split + len(same) == len(data)

In [None]:
# Train with the valid rows removed: the non-overlapping training part plus
# the rows shared with the 33k file.
train_exclude_valid = pd.concat([train_subset, same]).reset_index(drop=True)
# The overlap between valid and this frame used to be counted and then thrown
# away; it is now checked. Keys are normalised to frozenset so they can be
# looked up in a hash set.
train_keys = {frozenset(key) for key in train_exclude_valid.set_prompt_response}
assert not any(frozenset(key) in train_keys for key in valid.set_prompt_response)
assert len(valid) + len(train_exclude_valid) == len(data)

In [None]:
# Training pool for the 33k variant: the non-overlapping training part
# followed by the whole 33k file, re-indexed from zero.
train_33k = pd.concat([train_subset, ex_33]).reset_index(drop=True)

In [None]:
# The helper key column is not part of the saved data; strip it everywhere.
train_subset, valid, train_exclude_valid, train_33k = (
    frame.drop(columns=['set_prompt_response'])
    for frame in (train_subset, valid, train_exclude_valid, train_33k)
)

In [None]:
# Persist the four splits as JSON.
# NOTE(review): `index=False` is passed together with the default `orient`;
# how pandas treats that combination has changed between releases — confirm
# against the installed version.
train_subset.to_json("dataset/non_overlap/train_subset.json", index = False)
valid.to_json("dataset/non_overlap/valid.json", index = False)
train_exclude_valid.to_json("dataset/non_overlap/train_exclude_valid.json", index = False)
train_33k.to_json("dataset/non_overlap/train_33k.json", index = False)

In [None]:
# Sanity check: read the four splits back from disk.
train_subset, valid, train_exclude_valid, train_33k = (
    pd.read_json(path)
    for path in (
        "dataset/non_overlap/train_subset.json",
        "dataset/non_overlap/valid.json",
        "dataset/non_overlap/train_exclude_valid.json",
        "dataset/non_overlap/train_33k.json",
    )
)

In [None]:
def get_set_prompt_response(data):
    """Add a 'set_prompt_response' column to *data* and return *data*.

    For every row the column holds the set of items in
    prompt + response_a + response_b. The frame is modified in place.
    """
    data['set_prompt_response'] = [
        set(row.prompt + row.response_a + row.response_b)
        for row in data.itertuples()
    ]
    return data

In [None]:
# Rebuild the key column on each reloaded split.
train_subset, valid, train_exclude_valid, train_33k = map(
    get_set_prompt_response,
    (train_subset, valid, train_exclude_valid, train_33k),
)

In [None]:
# valid must not share a key with any of the training variants.
# The valid keys are gathered once into a hash set (as frozensets, so they are
# hashable); each training frame is then checked with one disjointness test
# instead of scanning a whole column for every valid row.
valid_keys = {frozenset(key) for key in valid.set_prompt_response}
for frame in (train_subset, train_exclude_valid, train_33k):
    assert valid_keys.isdisjoint(frozenset(key) for key in frame.set_prompt_response)

In [None]:
# Peek at the second element of the first row's prompt.
train_33k.prompt.values[0][1]

# 1M 与 train_33k和valid重复的

In [2]:
def get_set_prompt_response(data):
    """Add a 'set_prompt_response' key column to *data* and return *data*.

    For every row the key is the collection of items in
    prompt + response_a + response_b, with order and repetition ignored.
    The frame is modified in place.

    The key is a frozenset rather than a set: it compares equal to the
    equivalent set, but it is hashable, so hash-based pandas operations on
    the column (such as drop_duplicates) are well-defined.
    """
    data['set_prompt_response'] = [
        frozenset(row.prompt + row.response_a + row.response_b)
        for row in data.itertuples()
    ]
    return data

In [3]:
# Sanity check: reload the splits written earlier, plus the external data.
train_subset = pd.read_json("dataset/non_overlap/train_subset.json")
valid = pd.read_json("dataset/non_overlap/valid.json")
train_exclude_valid = pd.read_json("dataset/non_overlap/train_exclude_valid.json")
train_33k = pd.read_json("dataset/non_overlap/train_33k.json")
# NOTE(review): the JSON load below is overwritten immediately by the parquet
# load that follows, so it has no lasting effect — confirm which source is
# intended.
ex_data = pd.read_json("dataset/1M/35k_in_1M.json")
ex_data = pd.read_parquet("dataset/1M/filtered_df.parquet")

In [None]:
def drop_dup_hate(data):
    """Keep rows that compare two different models and are rated 'Good' twice.

    A row survives when model_a differs from model_b and both
    response_a_categories and response_b_categories equal 'Good'.

    NOTE(review): the name suggests de-duplication, but the original code
    only built a per-row key column and dropped it again without reading it.
    That dead work is removed here; no de-duplication is added, so the
    returned rows are unchanged.

    :param data: DataFrame with model_a, model_b, response_a_categories and
        response_b_categories columns
    :return: new DataFrame with a fresh 0..n-1 index and no
        'set_prompt_response' column
    """
    data = data.loc[data.model_a != data.model_b].reset_index(drop=True)
    both_good = (data.response_a_categories == 'Good') & (data.response_b_categories == 'Good')
    data = data[both_good].reset_index(drop=True)
    # The original always returned a frame without this column, even when
    # the input already carried one.
    return data.drop(columns=['set_prompt_response'], errors='ignore')

In [None]:
# Apply the model / category filter to the external data.
ex_data = drop_dup_hate(ex_data)

In [5]:
# Rebuild the key column on every frame that takes part in the comparison.
train_subset, valid, train_exclude_valid, train_33k, ex_data = [
    get_set_prompt_response(frame)
    for frame in (train_subset, valid, train_exclude_valid, train_33k, ex_data)
]

In [9]:
# Keep only the first row for each key in train_33k.
is_repeat = train_33k.duplicated(subset=['set_prompt_response'])
train_33k = train_33k.loc[~is_repeat].reset_index(drop=True)

In [None]:
# Number of external rows whose key also occurs in train_subset.
# The train_subset keys are gathered once into a hash set (as frozensets, so
# they are hashable) rather than scanning the whole column for every row.
train_subset_keys = {frozenset(key) for key in train_subset.set_prompt_response}
sum(frozenset(key) in train_subset_keys for key in ex_data.set_prompt_response)

In [None]:
# Positions of the external rows whose key also occurs in valid.
# The valid keys are gathered once into a hash set (as frozensets, so they
# are hashable) rather than scanning the whole column for every row.
valid_keys = {frozenset(key) for key in valid.set_prompt_response}
in_valid = [
    pos
    for pos, key in enumerate(ex_data.set_prompt_response)
    if frozenset(key) in valid_keys
]

In [None]:
# How many external rows collide with valid.
len(in_valid)

In [None]:
# Remove the external rows that collide with valid.
keep_rows = ~ex_data.index.isin(in_valid)
ex_data = ex_data.loc[keep_rows].reset_index(drop=True)

In [None]:
# Save the filtered external data. The 'set_prompt_response' helper column is
# still present on ex_data at this point, so it is written out as well.
# NOTE(review): `index=False` with the default `orient` is handled differently
# across pandas releases — confirm against the installed version.
ex_data.to_json("dataset/1M/3k_high_quality_method_2.json", index = False)

In [None]:
# Display the filtered external data.
ex_data

In [None]:
# Number of external rows whose key also occurs in train_exclude_valid.
# The reference keys are gathered once into a hash set (as frozensets, so
# they are hashable) rather than scanning the whole column for every row.
train_exclude_valid_keys = {
    frozenset(key) for key in train_exclude_valid.set_prompt_response
}
sum(frozenset(key) in train_exclude_valid_keys for key in ex_data.set_prompt_response)

# 1M 伪标签

In [13]:
# Load the prediction file ("persudo_label" is the directory name as spelled
# on disk). Presumably it holds per-class probabilities in winner_model_a /
# winner_model_b / winner_tie — the later cells read those columns.
final = pd.read_json("dataset/persudo_label/35k_in_1M_prediction.json")

In [14]:
# Work on a copy: the later cells overwrite the winner_* columns of `filter`,
# and a plain alias would overwrite them on `final` as well.
# NOTE: the name shadows the builtin `filter`; it is kept because the
# following cells refer to it.
filter = final.copy()

In [11]:
# Keep only the rows where at least one class reaches the threshold.
threshold1 = 0.7
filter_list = final.winner_model_a >= threshold1
for prob_col in ('winner_model_b', 'winner_tie'):
    filter_list = filter_list | (final[prob_col] >= threshold1)
filter = final.loc[filter_list, :].reset_index(drop=True)
filter

Unnamed: 0,id,prompt,model_a,model_b,response_a,response_b,response_a_categories,response_b_categories,set_prompt_response,winner_model_a,winner_model_b,winner_tie
0,2238,[write a single dot.],alpaca-13b,vicuna-13b,[.],[.],Good,Good,"[write a single dot., .]",0.026940,0.023406,0.949655
1,2022,[Write a single # character],koala-13b,claude-1,[\#],[#],Good,Good,"[#, Write a single # character, \#]",0.076143,0.117933,0.805924
2,2220,[Create a list of 3 startup ideas in enterpris...,fastchat-t5-3b,oasst-pythia-12b,"[1. ""Blockchain for Supply Chain Management""\n...",[Sure! Here are three startup ideas in enterpr...,Good,Good,"[1. ""Blockchain for Supply Chain Management""\n...",0.003541,0.907954,0.088506
3,2230,[How do social media platforms influence the w...,fastchat-t5-3b,koala-13b,[Social media platforms have had a significant...,[Social media platforms can influence the way ...,Good,Good,[Social media platforms have had a significant...,0.030739,0.884389,0.084872
4,3000,[who is the incumbent president of usa],oasst-pythia-12b,koala-13b,"[As of February 2023, NAME_1 is the 68th and c...",[The current President of the United States is...,Good,Good,"[As of February 2023, NAME_1 is the 68th and c...",0.115674,0.141727,0.742598
...,...,...,...,...,...,...,...,...,...,...,...,...
985,2911,[My drawer is 16 x 6. A handle to install is ...,koala-13b,claude-instant-1,[To install a handle that is 5 inches wide in ...,"[Okay, let's solve this step-by-step:\n* The d...",Good,Good,[To install a handle that is 5 inches wide in ...,0.042402,0.800058,0.157541
986,2745,"[Hey, how are you?]",alpaca-13b,koala-13b,"[Hey there! I'm doing great, thank you for ask...","[Hello! As an AI language model, I don't have ...",Good,Good,"[Hey there! I'm doing great, thank you for ask...",0.707198,0.167974,0.124828
987,2831,[Explain what CHOAM is from the Dune book seri...,RWKV-4-Raven-14B,vicuna-13b,[CHOAM is a powerful corporation in the Dune u...,"[In the Dune book series by NAME_1, CHOAM is a...",Good,Good,[Explain what CHOAM is from the Dune book seri...,0.053634,0.775920,0.170447
988,3118,"[we are the 6 May 2023, add 3 weeks]",koala-13b,claude-1,"[Sure, I'm sorry but I am not able to calculat...","[3 weeks from May 6, 2023 is May 27, 2023.]",Good,Good,"[3 weeks from May 6, 2023 is May 27, 2023., Su...",0.008541,0.927358,0.064101


In [16]:
def get_label(row):
    """Return 'A', 'B' or 'C' for the highest-scoring outcome in *row*.

    :param row: object indexable by 'winner_model_a', 'winner_model_b' and
        'winner_tie' (for example a DataFrame row or a dict)
    :return: 'A' when winner_model_a is the largest value, 'B' for
        winner_model_b, 'C' for winner_tie
    """
    options = ('winner_model_a', 'winner_model_b', 'winner_tie')
    scores = [row[option] for option in options]
    # list.index returns the first position of the maximum, so equal maxima
    # resolve in A, B, C order.
    return 'ABC'[scores.index(max(scores))]

In [17]:
# Label each row with its arg-max class, then clear the three score columns
# so they can be refilled as one-hot values.
filter['label'] = filter.apply(get_label, axis=1)
for prob_col in ("winner_model_a", "winner_model_b", "winner_tie"):
    filter[prob_col] = 0

In [18]:
# Set the column that matches each row's label to 1.
for tag, prob_col in (
    ('A', "winner_model_a"),
    ('B', "winner_model_b"),
    ('C', "winner_tie"),
):
    filter.loc[filter.label == tag, prob_col] = 1

In [19]:
# The helper label column is no longer needed once the winner_* columns are set.
filter = filter.drop(columns = ['label'])

In [20]:
# Display the relabelled frame.
filter

Unnamed: 0,id,prompt,model_a,model_b,response_a,response_b,categories_a,categories_b,set_prompt_response,winner_model_a,winner_model_b,winner_tie
0,aa7a6008f4,[How would a French native say «Pain is inevit...,alpaca-13b,RWKV-4-Raven-14B,"[En français, on dirait: ""La douleur est inévi...","[The French native would say ""La douleur est i...",Good,Good,"[The French native would say ""La douleur est i...",0,0,1
1,aa7a60ed20,[How would a French native say «Pain is inevit...,alpaca-13b,dolly-v2-12b,"[En français, on dirait: ""La douleur est inévi...","[«Le pain est une leçon de tempérament, la sou...",Good,Good,"[En français, on dirait: ""La douleur est inévi...",0,0,1
2,aa7a61d0d5,[How would a French native say «Pain is inevit...,alpaca-13b,oasst-pythia-12b,"[En français, on dirait: ""La douleur est inévi...","[A French native might say ""La douleur est iné...",Good,Good,"[A French native might say ""La douleur est iné...",0,1,0
3,aa7a674ddf,[How would a French native say «Pain is inevit...,alpaca-13b,oasst-pythia-12b,"[En français, on dirait: ""La douleur est inévi...","[En francais, NAME_1 dit ""La douleur est inévi...",Good,Good,"[En francais, NAME_1 dit ""La douleur est inévi...",0,0,1
4,aa7a62e17f,[How would a French native say «Pain is inevit...,alpaca-13b,RWKV-4-Raven-14B,"[En français, on dirait: ""La douleur est inévi...","[NAME_1, un Français natif direait: «La douleu...",Good,Good,"[NAME_1, un Français natif direait: «La douleu...",0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34512,e35824aab4,[Write me a Python function computing the fibo...,RWKV-4-Raven-14B,palm-2,"[Sure, here's a Python function that computes ...","[```python\ndef fibonacci(n):\n """"""Computes t...",Good,Good,[Write me a Python function computing the fibo...,0,1,0
34513,c02b756584,[The sum of the distances the sun travels arou...,fastchat-t5-3b,RWKV-4-Raven-14B,[To find the distance between the sun and Eart...,[The formula to find the distance between the ...,Good,Good,[The sum of the distances the sun travels arou...,0,0,1
34514,133161eb0d,"[I bought 16 stamps, including 250 won and 300...",oasst-pythia-12b,vicuna-33b,[We can solve this problem by using the formul...,[Let's use variables to represent the number o...,Good,Good,"[My name is Vicuna, and I'm a language model d...",0,0,1
34515,952ae3cd46,"[Переведи на русский в молодежном стиле ""By us...",chatglm-6b,oasst-pythia-12b,"[By accessing this service, users are required...","[By using this service, users are required to ...",Good,Good,"[By using this service, users are required to ...",0,0,1


In [21]:
# Save the relabelled predictions.
# NOTE(review): `index=False` with the default `orient` is handled differently
# across pandas releases — confirm against the installed version.
filter.to_json('dataset/persudo_label/35k_in_1M_prediction_thr0_35k.json', index = False)

In [None]:
# Spot-check one row by index label: print the first element of its prompt.
i = 2846
print(filter.loc[i,'prompt'][0])

In [None]:
# First element of response_a for the row selected by `i`.
print(filter.loc[i,'response_a'][0])

In [None]:
# First element of response_b for the row selected by `i`.
print(filter.loc[i,'response_b'][0])

# MT bench 与 train 33k valid 重复

In [2]:
def get_set_prompt_response(data):
    """Attach a per-row key column named 'set_prompt_response' to *data*.

    The key of a row is the set of items in
    prompt + response_a + response_b. *data* is modified in place and also
    returned.
    """
    def row_key(row):
        return set(row.prompt + row.response_a + row.response_b)

    data['set_prompt_response'] = list(map(row_key, data.itertuples()))
    return data

In [3]:
# Sanity check: reload the saved splits, then the MT-bench human file.
train_subset, valid, train_exclude_valid, train_33k = (
    pd.read_json(path)
    for path in (
        "dataset/non_overlap/train_subset.json",
        "dataset/non_overlap/valid.json",
        "dataset/non_overlap/train_exclude_valid.json",
        "dataset/non_overlap/train_33k.json",
    )
)
ex_data = pd.read_json("dataset/mt_bentch_human.json")

In [4]:
# Rebuild the key column on every frame that takes part in the comparison.
train_subset, valid, train_exclude_valid, train_33k, ex_data = map(
    get_set_prompt_response,
    (train_subset, valid, train_exclude_valid, train_33k, ex_data),
)

In [10]:
# Positions of the MT-bench rows whose key also occurs in valid.
# The valid keys are gathered once into a hash set (as frozensets, so they
# are hashable) rather than scanning the whole column for every row.
valid_keys = {frozenset(key) for key in valid.set_prompt_response}
in_valid = [
    pos
    for pos, key in enumerate(ex_data.set_prompt_response)
    if frozenset(key) in valid_keys
]

In [11]:
# How many MT-bench rows collide with valid.
len(in_valid)

0