In [1]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from random import random, randint
from utils import load_json, load_split_data

In [None]:
class args:
    """Plain namespace of settings; attributes are read as ``args.<name>``."""
    # Path of the CSV that is loaded into `df_train` with pd.read_csv.
    train_data = './dataset/demo_train.csv'
    # Used as `max_length` in the tokenizer calls inside `preprocess`.
    MAX_INPUT = 1024

In [None]:
df_train = pd.read_csv(args.train_data).reset_index(drop = True)
#df_valid = pd.read_csv(args.valid_data).reset_index(drop = True)

In [None]:
# Map each answer letter 'A'..'E' to its zero-based position.
option_to_index = dict(zip('ABCDE', range(len('ABCDE'))))

In [None]:
def get_label(row):
    """Return the class index of the winner column that is set to 1.

    Index 0 is ``winner_model_a``, 1 is ``winner_model_b`` and 2 is
    ``winner_tie``. All three columns are read; when more than one equals 1
    the highest index is returned.

    Raises:
        IndexError: if none of the three columns equals 1.
    """
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    hits = []
    for position, column in enumerate(winner_columns):
        if row[column] == 1:
            hits.append(position)
    # Last match wins, mirroring a "take the final element" selection.
    return hits[-1]

In [None]:
# Derive the integer class label for every row from its winner_* columns.
df_train['label'] = df_train.apply(get_label, axis=1)

In [None]:
def preprocess(example):
    """Tokenize one example as two (prompt, prompt + response) text pairs.

    The first element of each pair is ``"[CLS] " + prompt``; the second is the
    prompt followed by ``response_a`` or ``response_b``, with the literal
    ``####`` / ``[SEP]`` markers written into the text. The pairs are passed to
    the module-level ``tokenizer`` with ``add_special_tokens=False`` and
    truncated to ``args.MAX_INPUT``.

    Returns the tokenizer output with the example's ``label`` stored under
    the ``'label'`` key.

    NOTE(review): a later cell defines another ``preprocess``; when cells run
    top to bottom that definition replaces this one.
    """
    prompt = example['prompt']
    cls_prompt = "[CLS] " + prompt
    # The same first segment is paired with each of the two responses.
    first_sentence = [cls_prompt, cls_prompt]
    second_sentences = []
    for option in ('response_a', 'response_b'):
        second_sentences.append(" #### " + prompt + " [SEP] " + example[option] + " [SEP]")
    tokenized_example = tokenizer(
        first_sentence,
        second_sentences,
        truncation='longest_first',
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    tokenized_example['label'] = example['label']
    return tokenized_example

In [None]:
def preprocess(example):
    """Tokenize one example as a single string holding both dialogues.

    The text is ``" #### " + prompt + " [SEP] " + response + " [SEP]"`` built
    once for ``response_a`` and once for ``response_b`` and concatenated. It is
    passed to the module-level ``tokenizer`` as a one-element list with
    ``add_special_tokens=False`` and truncated to ``args.MAX_INPUT``.

    Returns the tokenizer output with the example's ``label`` stored under
    the ``'label'`` key.

    NOTE(review): because the input is a one-element list, the returned fields
    are presumably batched (one inner list per element) - confirm that is the
    shape the downstream code expects.
    """
    prompt = example['prompt']
    dialogue_a = " #### " + prompt + " [SEP] " + example['response_a'] + " [SEP]"
    dialogue_b = " #### " + prompt + " [SEP] " + example['response_b'] + " [SEP]"
    sentences = [dialogue_a + dialogue_b]
    tokenized_example = tokenizer(
        sentences,
        truncation=True,
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    tokenized_example['label'] = example['label']
    return tokenized_example

In [None]:
dataset = datasets.Dataset.from_pandas(df_train)
MODEL = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
dataset

In [None]:
tokenized_dataset = dataset.map(preprocess, remove_columns=['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b'])# 

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['input_ids'][0]

In [None]:
df_train.loc[40]

In [None]:
# Write the leading slice of df_train to demo_train.csv. `.loc` slicing is label-based
# and includes the end label, so label 1000 is part of this slice.
# to_csv is called without index=False, so the row index is written as an extra column.
df_train.loc[:1000,].reset_index(drop = True).to_csv('demo_train.csv')

In [None]:
# Validation slice: labels 1001..1200. `.loc` slices include both endpoints, so the
# slice must start at 1001 to keep label 1000 (already written to demo_train.csv by
# the `.loc[:1000]` export) out of the validation file.
df_train.loc[1001:1200,].reset_index(drop = True).to_csv('demo_valid.csv')

In [None]:
import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice, AutoModelForSequenceClassification, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from sklearn.metrics import log_loss
import torch.nn as nn
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
    

In [None]:
MODEL = 'meta-llama/llama-3-transformers-8b-hf-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.add_special_tokens({"pad_token":"<pad>"})

In [None]:
tokenizer("<pad>")['input_ids'][0]

In [None]:
128256 in tokenizer("<pad>")['input_ids']

In [None]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
# 4-bit NF4 quantization settings for loading the classifier.
# NOTE(review): the compute dtype here is float16 while the model below is requested
# with torch_dtype=torch.bfloat16 (the earlier load cell uses bfloat16 for both) -
# confirm the mismatch is intentional.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
# Load MODEL as a 3-label sequence classifier with the quantization config above.
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")
# Point the model config at the tokenizer's pad token id and resize the embedding
# table to the tokenizer's current length.
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# config = AutoConfig.from_pretrained(MODEL)
# config.hidden_dropout_prob = args.dropout_rate
# config.attention_probs_dropout_prob = args.dropout_rate
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,  # For sequence classification
#     inference_mode=False,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     bias = 'none',
#     target_modules=["q_proj","k_proj","v_proj"]  # Target specific modules
# )
# model = get_peft_model(model, peft_config)

In [None]:
for key in model.state_dict():
        print(f"{key}, {model.state_dict()[key].shape}, {model.state_dict()[key].dtype}")

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
[i.dtype for i in model.parameters()]

In [None]:
from utils import load_split_data

In [None]:
df_train, df_valid = load_split_data('dataset/1k_mt_bench_human_judgments.json', 1, 3000, True, False)

In [None]:
df_train, df_valid = load_split_data('dataset/lmsys-chatbot_arena_conversations-33k.csv', 2, 3000, True, False)

In [None]:
df_valid

In [None]:
df_train

In [None]:
idx = 1
prompt_response = df_train.loc[idx,'prompt_response']
label = df_train.loc[idx,'label']

In [None]:
print(prompt_response)
print("")
print(label)

In [None]:
prompt_response

In [None]:
tokenizer.decode([1,
 32006,
 887,
 526])

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer.encode('<|system|>\nYou')

In [None]:
tokenizer.encode('Apple\nBa')

In [None]:
tokenizer.decode([396,
 18571,
 415,
 13,
 4548,
 7420,])

In [None]:
tokenizer.decode([29933])

In [None]:
print(templete_part1 + prompt_response + templete_part2 + templete_part3 + label)

In [None]:
# Build one training sequence piece by piece: system/user preamble, the dialogue
# text, the answer options, the assistant header, the label, and a final EOS id.
# Part 1 keeps every id the tokenizer returns; each later piece drops its first id
# with [1:] - presumably the BOS token added by add_special_tokens=True; TODO confirm
# for the tokenizer that is loaded when this cell runs.
templete_part1 = "<|system|>\nYou are a helpful assistant good at judging conversations.<|end|>\n<|user|>\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids'][1:]

templete_part3 = "<|assistant|>\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids'][1:]

# Only the dialogue text is truncated (max_length=3000); the template pieces are not.
prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids'][1:]


# The label is encoded without special tokens and followed by an explicit EOS id.
label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
# Decode the assembled ids back to text as a visual check.
print(tokenizer.decode(input_ids))

In [None]:
text = "Apple"
prompt_response = templete_part1 + text + templete_part2 + templete_part3 + label + tokenizer.eos_token
print(prompt_response)

In [None]:
MODEL = 'microsoft/LLM-Research/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [None]:
tokenizer(prompt_response)

In [None]:
tokenizer.eos_token_id

In [None]:
tokenizer.eos_token

In [None]:
tokenizer.bos_token

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
tokenizer.pad_token_id

In [None]:
tokenizer.pad_token

In [None]:
tokenizer('<|user|>',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
AutoModelForCausalLM.from_pretrained(MODEL)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [None]:
print(text)

In [None]:
templete_part1 = "<|im_start|>system\nYou are a helpful assistant good at judging conversations.<|im_end|>\n<|im_start|>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|im_end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids']
#print(f"templete_part2 is {templete_part2_input_ids}")
templete_part3 = "<|im_start|>assistant\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids']

prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids']


label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
print(tokenizer.decode(input_ids))

In [None]:
tokenizer.decode(14374)

In [None]:
tokenizer.pad_token,tokenizer.eos_token,

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
MODEL = 'Qwen/Qwen2-7B-Instruct'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )
model = AutoModelForCausalLM.from_pretrained(MODEL,
                                             config=config,
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             attn_implementation='eager')

In [None]:
model

In [None]:
tmp = pd.read_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample14k.json")
#tmp2 = pd.read_json("dataset/lmsys-chatbot_arena_conversations-33k.json")

In [None]:
tmp = tmp.drop(columns = ['difference'])

In [None]:
# Stack the two frames and renumber the rows.
# NOTE(review): in the cells visible here `tmp2` is only assigned in a commented-out
# line, so this needs that read re-enabled (or `tmp2` defined elsewhere) to run on a
# fresh kernel.
t = pd.concat([tmp,tmp2]).reset_index(drop = True)

In [None]:
t.prompt[0]

In [None]:
is_english(t.prompt[0][0])

In [None]:
is_english(t.prompt[46969][0])

In [None]:
from langdetect import detect

# Detect the language of a text so that non-English rows can be filtered out.
def is_english(text):
    """Return True when ``detect`` reports ``text`` as English ('en').

    Detection is best-effort: if ``detect`` raises an ordinary exception for
    this input, the text is treated as not English and False is returned.

    Args:
        text: the value handed to ``detect``.

    Returns:
        bool: True only when ``detect(text) == 'en'``.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit,
        # turning an attempt to stop a long ``.apply`` into a silent False.
        return False

tmp = tmp[tmp['prompt'].apply(lambda x: is_english(x[0]))]

In [None]:
tmp = tmp.reset_index(drop = True)

In [None]:
tmp.to_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample14k.json")

In [None]:
ex = pd.read_csv("dataset/kaggle-ultrafeedback-drop-duplicate.csv")
tie = pd.read_csv("dataset/kaggle-ultrafeedback-ties-drop-duplicate.csv")
p = pd.read_csv("dataset/ultrafeedback_prediction.csv")

from utils import load_json
ex = load_json(ex)
tie = load_json(tie)

In [None]:
total = pd.concat([tie,ex]).reset_index(drop = True)

In [None]:
p

In [None]:
p = p.rename(columns = {'winner_model_a':"p_winner_model_a", 'winner_model_b':"p_winner_model_b",  'winner_tie':"p_winner_tie"})

In [None]:
final = pd.concat([total, p], axis = 1)

In [None]:
final

In [None]:
def get_p_label(row):
    """Return the index of the largest of the three ``p_winner_*`` values.

    Index 0 is ``p_winner_model_a``, 1 is ``p_winner_model_b`` and 2 is
    ``p_winner_tie``. When several values are equally large the lowest index
    is returned.
    """
    scores = (row.p_winner_model_a, row.p_winner_model_b, row.p_winner_tie)
    # max() keeps the earliest of equal candidates, so ties go to the lowest index.
    return max(range(len(scores)), key=scores.__getitem__)

In [None]:
final['p_label'] = final.apply(get_p_label, axis = 1)

In [None]:
def get_label(row):
    """Map a row's one-hot winner columns to a class index.

    ``winner_model_a`` -> 0, ``winner_model_b`` -> 1, ``winner_tie`` -> 2.
    Every column is inspected; if several are 1 the highest index is returned.

    Raises:
        IndexError: if no winner column equals 1.
    """
    flags = [row[name] == 1 for name in ('winner_model_a', 'winner_model_b', 'winner_tie')]
    matches = [index for index, is_set in enumerate(flags) if is_set]
    return matches[-1]

final['label'] = final.apply(get_label, axis = 1)

In [None]:
# Keep only the rows whose `p_label` equals `label`, then renumber them.
# NOTE(review): the name `filter` shadows Python's builtin filter() for the rest of
# the session; later cells refer to this name.
filter = final.loc[final.p_label == final.label,:].reset_index(drop = True)

In [None]:
# Keep rows where at least one of the three p_winner_* values reaches the threshold.
threshold = 0.9
filter_list = filter[['p_winner_model_a', 'p_winner_model_b', 'p_winner_tie']].ge(threshold).any(axis=1)
filter = filter.loc[filter_list].reset_index(drop=True)

In [None]:
filter.prompt.values[0][0]

In [None]:
from langdetect import detect

# Detect the language of a text so that non-English rows can be filtered out.
def is_english(text):
    """Return True when ``detect`` reports ``text`` as English ('en').

    Detection is best-effort: if ``detect`` raises an ordinary exception for
    this input, the text is treated as not English and False is returned.

    Args:
        text: the value handed to ``detect``.

    Returns:
        bool: True only when ``detect(text) == 'en'``.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop a long-running filter instead of becoming False.
        return False

filter_only_english = filter[filter['prompt'].apply(lambda x: is_english(x[0][:30]))].reset_index(drop = True)

In [None]:
# Columns written to the exported JSON files; other columns of the frames are not saved.
save_columns = ['prompt', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'response_a', 'response_b', 'id']
# The threshold value is embedded in both output file names.
# NOTE(review): `index = False` is passed without an explicit `orient`; confirm with
# the installed pandas version that this combination is accepted and has the intended effect.
filter[save_columns].to_json(f"dataset/70k_filter_threshold{threshold}.json", index = False)
filter_only_english[save_columns].to_json(f"dataset/70k_filter_only_english_threshold{threshold}.json", index = False)

In [None]:
filter[save_columns]

In [None]:
filter.label.value_counts()

In [None]:
df_train, df_valid = load_split_data('dataset/train.csv', 2, 3000, True, False)

In [None]:
df_train

In [None]:
print(df_train.prompt_response.values[0])

In [None]:
print(df_train.prompt_response.values[1])

In [None]:
print(df_train.label.values[1])

In [None]:
check = pd.read_json("dataset/70k_filter_only_english_threshold0.9.json")

In [None]:
check.prompt.values[0]

In [None]:
from random import sample

# Re-read the CSV, give every row a fresh random id, and write it back to the same path.
# Note that this overwrites the input file, so re-running the cell draws new ids again.
data_path = "dataset/kaggle-ultrafeedback-drop-duplicate-sample20k_least_similar_by_prompt_same_prediction_thr90.csv"
t = pd.read_csv(data_path)
# `randint(lo, hi) + i` can yield the same value for different rows. Sampling without
# replacement from the same value range (10000 .. 99998 + len(t)) guarantees that
# the ids are unique within this frame.
t['id'] = sample(range(10000, 99999 + len(t)), len(t))
t.to_csv(data_path, index=False)

In [None]:
t = pd.read_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample20k_most_similar_by_prompt_same_prediction_thr90.json")

In [8]:
#
train , valid = load_split_data('dataset/train.csv', 2, 2300, True, True, True)

100%|██████████| 64553/64553 [00:18<00:00, 3458.03it/s]
100%|██████████| 6961/6961 [00:00<00:00, 11280.64it/s]


In [4]:
train , valid = load_split_data('dataset/train_sample10k_switch.json', 2, 2300, True, False, False)

100%|██████████| 12423/12423 [00:01<00:00, 9760.78it/s]


In [None]:
train_id = train.id.to_list()
valid_id = valid.id.to_list()

In [None]:
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

In [None]:
data['prompt_str'] = data['prompt'].astype(str)

In [None]:
from sklearn.model_selection import train_test_split
unique_prompts = data['prompt_str'].unique()
train_prompts, valid_prompts = train_test_split(unique_prompts, test_size=0.1, random_state=42)

In [None]:
train_prompts_set = set(train_prompts)
valid_prompts_set = set(valid_prompts)

In [None]:
# Select the rows that belong to each side of the prompt split:
# a row goes to train/valid according to which set contains its `prompt_str`.
train = data[data['prompt_str'].isin(train_prompts_set)]
valid = data[data['prompt_str'].isin(valid_prompts_set)]

In [None]:
train = data.loc[data.id.isin(train_id)].reset_index(drop = True)
valid = data.loc[data.id.isin(valid_id)].reset_index(drop = True)

In [None]:
# List the ids that appear in both train_id and valid_id. Membership is tested against
# a set built once, instead of scanning the whole valid_id list for every train id.
valid_id_lookup = set(valid_id)
[i for i in train_id if i in valid_id_lookup]

In [None]:
len(train_id)

In [None]:
t = pd.read_json('dataset/train_sample10k_switch.json')

In [None]:
# Count the prompts of `t` that also occur in `valid`. The valid prompts are
# materialised once; the original rebuilt the list for every element of `t`.
valid_prompt_values = valid.prompt.values.tolist()
len([i for i in t.prompt.values.tolist() if i in valid_prompt_values])

In [None]:
search = [['I read 60 pages of a book on Monday and 1/4 of the book on Tuesday. I completed the remaining 1/8 of the book on Wednesday. How many total pages are in the book?']]
train.loc[train.prompt.isin(search)]

In [None]:
valid.loc[valid.prompt.isin(search)]

In [None]:
# Show the prompts of `t` that also occur in `valid`. The valid prompts are
# materialised once; the original rebuilt the list for every element of `t`.
valid_prompt_values = valid.prompt.values.tolist()
[i for i in t.prompt.values.tolist() if i in valid_prompt_values]

In [None]:
# Count the prompts of `train` that also occur in `valid`. The valid prompts are
# materialised once; the original rebuilt the list for every element of `train`.
valid_prompt_values = valid.prompt.values.tolist()
len([i for i in train.prompt.values.tolist() if i in valid_prompt_values])

In [None]:
t.prompt.values.tolist()[0]

In [25]:
train , valid = load_split_data('dataset/train.csv', 2, 2300, True, True, True)
valid_id = valid.id.tolist()
train_id = train.id.tolist()

100%|██████████| 64553/64553 [00:18<00:00, 3492.40it/s]
100%|██████████| 6961/6961 [00:00<00:00, 11476.51it/s]


In [26]:
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

In [28]:
s = data.loc[data.id.isin(train_id)]

In [30]:
s = s.sample(10000).reset_index(drop = True)

In [31]:
# Count how many sampled ids also appear in valid_id. The lookup uses a set built once
# rather than a linear scan of valid_id for each sampled id.
s_id = s.id.to_list()
valid_id_lookup = set(valid_id)
len([i for i in s_id if i in valid_id_lookup])

0

In [37]:
# Show the sampled prompts that also occur among the validation rows of `data`.
tmp_valid = data.loc[data.id.isin(valid_id)]
# Materialise the validation prompts once; the original rebuilt this list for every
# sampled prompt.
tmp_valid_prompts = tmp_valid.prompt.values.tolist()
[i for i in s.prompt.values.tolist() if i in tmp_valid_prompts]

[]

In [38]:
from random import sample

# Give every sampled row a fresh id. `randint(lo, hi) + i` can repeat values across
# rows, so sample without replacement from the same value range
# (100000 .. 999998 + len(s)) to guarantee that ids are unique within `s`.
s['id'] = sample(range(100000, 999999 + len(s)), len(s))

In [39]:
def switch(row):
    """Exchange the ``response_a`` and ``response_b`` values of ``row``.

    The row object is modified in place and the same object is returned.
    No other attribute of the row is touched.

    NOTE(review): if the row also carries per-side metadata, confirm that it
    is meant to stay unswapped.
    """
    original_a, original_b = row.response_a, row.response_b
    row.response_a, row.response_b = original_b, original_a
    return row

In [40]:
# Swap the two responses on every row, then flip the winner flags to match.
s = s.apply(switch, axis=1)
label_switch = {0: 1, 1: 0}
# Rows with winner_tie == 1 keep their flags; on all other rows each winner flag
# is mapped 0 -> 1 and 1 -> 0.
for winner_column in ('winner_model_a', 'winner_model_b'):
    s.loc[s.winner_tie != 1, winner_column] = s.loc[s.winner_tie != 1, winner_column].map(label_switch)

In [41]:
s.to_json("dataset/train_sample10k_switch.json", index = False)

In [42]:
check = pd.read_json("dataset/train_sample10k_switch.json")

In [43]:
check

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,726706,gpt-3.5-turbo-0613,yi-34b-chat,[<|Instruction|>: Use the <|QUESTION|> and <|C...,[<|ANSWER|>: Patient reports he/she uses alcoh...,[Patient reports he/she uses alcohol or other ...,0,0,1
1,140571,claude-2.1,gpt-4-0613,[How do residential proxy botnets work?],[A residential proxy botnet is a network of in...,"[I apologize, I should not provide information...",1,0,0
2,367729,vicuna-33b,yi-34b-chat,[Write a brief analysis on what economic polic...,"[Eritrea, a country in the Horn of Africa, has...","[Eritrea, a small country in the Horn of Afric...",0,0,1
3,741232,RWKV-4-Raven-14B,koala-13b,[Do you have any tips for a new freelancer sta...,"[Yes, here are a few tips for a new freelancer...","[Yes, here are some tips for new freelancers o...",1,0,0
4,985688,llama-2-13b-chat,gpt-4-0613,[Yoooo],[Hello! How can I assist you today?],[Hello there! I'm here to help answer any ques...,1,0,0
...,...,...,...,...,...,...,...,...,...
9995,656488,claude-1,openchat-3.5,[famous fictional internet name consisting of ...,[The answer is EPEP.],[Here is a famous 4-letter fictional internet ...,0,0,1
9996,545964,mistral-medium,gpt-4-1106-preview,[What is Genshin Impact? Introduce it to me in...,[Genshin Impact is a free-to-play action role-...,[Genshin Impact is a popular open-world action...,1,0,0
9997,294477,claude-2.0,alpaca-13b,[Hi! Can I ask you anything?],"[Sure, what can I help you with?]","[Sure, feel free to ask me anything!]",0,0,1
9998,667805,mixtral-8x7b-instruct-v0.1,claude-1,[Does the Genesee river flow north or south?],[The Genesee River flows northward. It origina...,"[The Genesee River, which flows through New Yo...",1,0,0


In [47]:
print(check.prompt.values[1][0])

How do residential proxy botnets work?


In [None]:
t.loc[t.response_a == t.response_b, 'winner_tie'].value_counts()

In [None]:
t.loc[t.response_a=='["Hyderabad"]', ]

In [None]:
t.loc[(t.response_a == '[null]')]

In [None]:
t.loc[(t.response_a == t.response_b) & (t.winner_tie != 1),]

In [None]:
t.loc[(t.response_a == t.response_b)]

In [None]:
t.loc[3844:3847,:]