In [1]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from random import random, randint
from utils import load_json, load_split_data
from tqdm import tqdm

In [None]:
class args:
    train_data = './dataset/demo_train.csv'
    MAX_INPUT = 1024

In [None]:
df_train = pd.read_csv(args.train_data).reset_index(drop = True)
#df_valid = pd.read_csv(args.valid_data).reset_index(drop = True)

In [None]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}

In [None]:
def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    return label[-1]

In [None]:
df_train['label'] = df_train.apply(lambda x: get_label(x), axis = 1)

In [None]:
def preprocess(example):
    first_sentence = [ "[CLS] " + example['prompt'] ] * 2
    second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in ['response_a','response_b']]
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation='longest_first', 
                                  max_length=args.MAX_INPUT, add_special_tokens=False)
    tokenized_example['label'] = example['label']
    return tokenized_example

In [None]:
def preprocess(example):
    sentences = [" #### " + example['prompt'] + " [SEP] " + example['response_a'] + " [SEP]" +  " #### " + example['prompt'] + " [SEP] " + example['response_b'] + " [SEP]"]
    tokenized_example = tokenizer(sentences, truncation=True, 
                                  max_length=args.MAX_INPUT, add_special_tokens=False)
    tokenized_example['label'] = example['label']
    return tokenized_example

In [None]:
dataset = datasets.Dataset.from_pandas(df_train)
MODEL = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
dataset

In [None]:
tokenized_dataset = dataset.map(preprocess, remove_columns=['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b'])# 

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['input_ids'][0]

In [None]:
df_train.loc[40]

In [None]:
df_train.loc[:1000,].reset_index(drop = True).to_csv('demo_train.csv')

In [None]:
df_train.loc[1000:1200,].reset_index(drop = True).to_csv('demo_valid.csv')

In [None]:
import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice, AutoModelForSequenceClassification, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from sklearn.metrics import log_loss
import torch.nn as nn
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
    

In [None]:
MODEL = 'meta-llama/llama-3-transformers-8b-hf-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.add_special_tokens({"pad_token":"<pad>"})

In [None]:
tokenizer("<pad>")['input_ids'][0]

In [None]:
128256 in tokenizer("<pad>")['input_ids']

In [None]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# config = AutoConfig.from_pretrained(MODEL)
# config.hidden_dropout_prob = args.dropout_rate
# config.attention_probs_dropout_prob = args.dropout_rate
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,  # For sequence classification
#     inference_mode=False,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     bias = 'none',
#     target_modules=["q_proj","k_proj","v_proj"]  # Target specific modules
# )
# model = get_peft_model(model, peft_config)

In [None]:
for key in model.state_dict():
        print(f"{key}, {model.state_dict()[key].shape}, {model.state_dict()[key].dtype}")

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
for name, param in model.named_parameters():
    print(f'Layer: {name}, dtype: {param.dtype}')

In [None]:
[i.dtype for i in model.parameters()]

In [None]:
from utils import load_split_data

In [None]:
df_train, df_valid = load_split_data('dataset/1k_mt_bench_human_judgments.json', 1, 3000, True, False)

In [None]:
df_train, df_valid = load_split_data('dataset/lmsys-chatbot_arena_conversations-33k.csv', 2, 3000, True, False)

In [None]:
df_valid

In [None]:
df_train

In [None]:
idx = 1
prompt_response = df_train.loc[idx,'prompt_response']
label = df_train.loc[idx,'label']

In [None]:
print(prompt_response)
print("")
print(label)

In [None]:
prompt_response

In [None]:
tokenizer.decode([1,
 32006,
 887,
 526])

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer.encode('<|system|>\nYou')

In [None]:
tokenizer.encode('Apple\nBa')

In [None]:
tokenizer.decode([396,
 18571,
 415,
 13,
 4548,
 7420,])

In [None]:
tokenizer.decode([29933])

In [None]:
print(templete_part1 + prompt_response + templete_part2 + templete_part3 + label)

In [None]:
templete_part1 = "<|system|>\nYou are a helpful assistant good at judging conversations.<|end|>\n<|user|>\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids'][1:]

templete_part3 = "<|assistant|>\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids'][1:]

prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids'][1:]


label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
print(tokenizer.decode(input_ids))

In [None]:
text = "Apple"
prompt_response = templete_part1 + text + templete_part2 + templete_part3 + label + tokenizer.eos_token
print(prompt_response)

In [None]:
MODEL = 'microsoft/LLM-Research/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [None]:
tokenizer(prompt_response)

In [None]:
tokenizer.eos_token_id

In [None]:
tokenizer.eos_token

In [None]:
tokenizer.bos_token

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
tokenizer.pad_token_id

In [None]:
tokenizer.pad_token

In [None]:
tokenizer('<|user|>',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
AutoModelForCausalLM.from_pretrained(MODEL)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [None]:
print(text)

In [None]:
templete_part1 = "<|im_start|>system\nYou are a helpful assistant good at judging conversations.<|im_end|>\n<|im_start|>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|im_end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids']
#print(f"templete_part2 is {templete_part2_input_ids}")
templete_part3 = "<|im_start|>assistant\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids']

prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids']


label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
print(tokenizer.decode(input_ids))

In [None]:
tokenizer.decode(14374)

In [None]:
tokenizer.pad_token,tokenizer.eos_token,

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
MODEL = 'Qwen/Qwen2-7B-Instruct'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )
model = AutoModelForCausalLM.from_pretrained(MODEL,
                                             config=config,
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             attn_implementation='eager')

In [None]:
model

In [None]:
tmp = pd.read_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample14k.json")
#tmp2 = pd.read_json("dataset/lmsys-chatbot_arena_conversations-33k.json")

In [None]:
tmp = tmp.drop(columns = ['difference'])

In [None]:
t = pd.concat([tmp,tmp2]).reset_index(drop = True)

In [None]:
t.prompt[0]

In [None]:
is_english(t.prompt[0][0])

In [None]:
is_english(t.prompt[46969][0])

In [None]:
from langdetect import detect

# 检测语言并过滤非英文行
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

tmp = tmp[tmp['prompt'].apply(lambda x: is_english(x[0]))]

In [None]:
tmp = tmp.reset_index(drop = True)

In [None]:
tmp.to_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample14k.json")

In [None]:
ex = pd.read_csv("dataset/kaggle-ultrafeedback-drop-duplicate.csv")
tie = pd.read_csv("dataset/kaggle-ultrafeedback-ties-drop-duplicate.csv")
p = pd.read_csv("dataset/ultrafeedback_prediction.csv")

from utils import load_json
ex = load_json(ex)
tie = load_json(tie)

In [None]:
total = pd.concat([tie,ex]).reset_index(drop = True)

In [None]:
p

In [None]:
p = p.rename(columns = {'winner_model_a':"p_winner_model_a", 'winner_model_b':"p_winner_model_b",  'winner_tie':"p_winner_tie"})

In [None]:
final = pd.concat([total, p], axis = 1)

In [None]:
final

In [None]:
def get_p_label(row):
    a = row.p_winner_model_a
    b = row.p_winner_model_b
    c = row.p_winner_tie

    l = [a ,b, c]
    label = l.index(max(l))
    return label

In [None]:
final['p_label'] = final.apply(get_p_label, axis = 1)

In [None]:
def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    return label[-1]

final['label'] = final.apply(get_label, axis = 1)

In [None]:
filter = final.loc[final.p_label == final.label,:].reset_index(drop = True)

In [None]:
threshold = 0.9
filter_list = (filter.p_winner_model_a >= threshold) | (filter.p_winner_model_b >= threshold) | (filter.p_winner_tie >= threshold)
filter = filter.loc[filter_list,:].reset_index(drop = True)

In [None]:
filter.prompt.values[0][0]

In [None]:
from langdetect import detect

# 检测语言并过滤非英文行
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

filter_only_english = filter[filter['prompt'].apply(lambda x: is_english(x[0][:30]))].reset_index(drop = True)

In [None]:
save_columns = ['prompt', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'response_a', 'response_b', 'id']
filter[save_columns].to_json(f"dataset/70k_filter_threshold{threshold}.json", index = False)
filter_only_english[save_columns].to_json(f"dataset/70k_filter_only_english_threshold{threshold}.json", index = False)

In [None]:
filter[save_columns]

In [None]:
filter.label.value_counts()

In [None]:
df_train, df_valid = load_split_data('dataset/train.csv', 2, 3000, True, False)

In [None]:
df_train

In [None]:
print(df_train.prompt_response.values[0])

In [None]:
print(df_train.prompt_response.values[1])

In [None]:
print(df_train.label.values[1])

In [None]:
check = pd.read_json("dataset/70k_filter_only_english_threshold0.9.json")

In [None]:
check.prompt.values[0]

In [None]:
data_path = "dataset/kaggle-ultrafeedback-drop-duplicate-sample20k_least_similar_by_prompt_same_prediction_thr90.csv"
t = pd.read_csv(data_path)
t['id'] = [randint(10000,99999) + i for i in range(len(t))]
t.to_csv(data_path, index=False)

In [None]:
t = pd.read_json("dataset/kaggle-ultrafeedback-drop-duplicate-sample20k_most_similar_by_prompt_same_prediction_thr90.json")

In [None]:
#
train , valid = load_split_data('dataset/train.csv', 2, 2300, True, True, False)

In [None]:
train , valid = load_split_data('dataset/train_sample10k_switch.json', 2, 2300, True, False, False)

In [None]:
train_id = train.id.to_list()
valid_id = valid.id.to_list()

In [None]:
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

In [None]:
data['prompt_str'] = data['prompt'].astype(str)

In [None]:
from sklearn.model_selection import train_test_split
unique_prompts = data['prompt_str'].unique()
train_prompts, valid_prompts = train_test_split(unique_prompts, test_size=0.1, random_state=42)

In [None]:
train_prompts_set = set(train_prompts)
valid_prompts_set = set(valid_prompts)

In [None]:
# 根据划分的 prompt 获取对应的行
train = data[data['prompt_str'].isin(train_prompts_set)]
valid = data[data['prompt_str'].isin(valid_prompts_set)]

In [None]:
train = data.loc[data.id.isin(train_id)].reset_index(drop = True)
valid = data.loc[data.id.isin(valid_id)].reset_index(drop = True)

In [None]:
[i for i in train_id if i in valid_id]

In [None]:
len(train_id)

In [None]:
t = pd.read_json('dataset/train_sample10k_switch.json')

In [None]:
len([i for i in t.prompt.values.tolist() if i in valid.prompt.values.tolist()])

In [None]:
search = [['I read 60 pages of a book on Monday and 1/4 of the book on Tuesday. I completed the remaining 1/8 of the book on Wednesday. How many total pages are in the book?']]
train.loc[train.prompt.isin(search)]

In [None]:
valid.loc[valid.prompt.isin(search)]

In [None]:
[i for i in t.prompt.values.tolist() if i in valid.prompt.values.tolist()]

In [None]:
len([i for i in train.prompt.values.tolist() if i in valid.prompt.values.tolist()])

In [None]:
t.prompt.values.tolist()[0]

In [None]:
train , valid = load_split_data('dataset/train.csv', 2, 2300, True, True, False)
valid_id = valid.id.tolist()
train_id = train.id.tolist()

In [None]:
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

In [None]:
s = data.loc[data.id.isin(train_id)]

In [None]:
s = s.sample(10000).reset_index(drop = True)

In [None]:
s_id = s.id.to_list()
len([i for i in s_id if i in valid_id])

In [None]:
tmp_valid = data.loc[data.id.isin(valid_id)].reset_index(drop = True)
[i for i in s.prompt.values.tolist() if i in tmp_valid.prompt.values.tolist()]

In [None]:
ex_33 = pd.read_csv('dataset/lmsys-chatbot_arena_conversations-33k.csv')
ex_33 = load_json(ex_33)

In [None]:
idx = [idx for idx, i in enumerate(tmp_valid.prompt.values.tolist()) if i in ex_33.prompt.values.tolist()]

In [None]:
same_prompt_in_valid = tmp_valid.iloc[idx,:].reset_index(drop = True)
not_same_prompt_in_valid = tmp_valid.iloc[~tmp_valid.index.isin(idx),:].reset_index(drop = True)

In [None]:
assert len(not_same_prompt_in_valid) + len(same_prompt_in_valid) == len(tmp_valid)

In [None]:
same_prompt_in_valid.to_json("dataset/same_prompt_in_valid.json", index = False)
not_same_prompt_in_valid.to_json("dataset/not_same_prompt_in_valid.json", index = False)

In [None]:
same_prompt_in_valid.sort_values(by = ['prompt'])

In [None]:
idx_ex = [idx for idx, i in enumerate(ex_33.prompt.values.tolist()) if i in tmp_valid.prompt.values.tolist()]

In [None]:
ex_33.iloc[idx_ex,:].reset_index(drop = True).sort_values(by = ['prompt'])

In [None]:
data = pd.read_json("dataset/pass/demo_A2B2C.json")

In [None]:
s = data.sample(int(len(data) * 0.3)).reset_index(drop = True)

In [None]:
s['id'] = [randint(100000,999999) + i for i in range(len(s))]

In [None]:
def switch(row):
    response_a = row.response_a
    response_b = row.response_b

    row.response_a = response_b
    row.response_b = response_a
    
    return row

In [None]:
s = s.apply(switch, axis = 1)
label_switch = {0:1, 1:0}
s.loc[s.winner_tie !=1, 'winner_model_a'] = s.loc[s.winner_tie !=1, 'winner_model_a'].map(label_switch)
s.loc[s.winner_tie !=1, 'winner_model_b'] = s.loc[s.winner_tie !=1, 'winner_model_b'].map(label_switch)

In [None]:
final = pd.concat([s, data]).reset_index(drop = True)
final['id'] = [randint(1000,999999) + i for i in range(len(final))]

In [None]:
#save_columns = ['prompt', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'response_a', 'response_b', 'id']
final.to_json(f"dataset/pass/demo_A2B2C_tta.json", index = False)

In [None]:
s.to_json("dataset/train_sample10k_switch.json", index = False)

In [None]:
check = pd.read_json("dataset/pass/demo_A2B2C_tta.json")

In [None]:
check

In [None]:
print(check.prompt.values[1][0])

In [None]:
t.loc[t.response_a == t.response_b, 'winner_tie'].value_counts()

In [None]:
t.loc[t.response_a=='["Hyderabad"]', ]

In [None]:
t.loc[(t.response_a == '[null]')]

In [None]:
t.loc[(t.response_a == t.response_b) & (t.winner_tie != 1),]

In [None]:
t.loc[(t.response_a == t.response_b)]

In [None]:
t.loc[3844:3847,:]

In [None]:
data = pd.read_csv('dataset/train.csv')
data = load_json(data)

ex_33 = pd.read_csv('dataset/lmsys-chatbot_arena_conversations-33k.csv')
ex_33 = load_json(ex_33)

In [None]:
'''
1、找出train里面不与33k重复部分
2、不重复的部分再划分
'''



In [None]:
set_prompt_response = []
for i in data.itertuples():
    prompt_response = i.prompt + i.response_a + i.response_b
    set_prompt_response.append(set(prompt_response))
data['set_prompt_response'] = set_prompt_response    

In [None]:
set_prompt_response = []
for i in ex_33.itertuples():
    prompt_response = i.prompt + i.response_a + i.response_b
    set_prompt_response.append(set(prompt_response))
ex_33['set_prompt_response'] = set_prompt_response  

In [None]:
idx = [idx for idx, i in enumerate(data.set_prompt_response.values) if i in ex_33.set_prompt_response.values]

In [None]:
same = data.loc[idx,:].reset_index(drop = True)

In [None]:
ex_33.loc[ex_33.set_prompt_response == same.set_prompt_response.values[0],:]

In [None]:
not_same = data.loc[~data.index.isin(idx),:].reset_index(drop = True)

In [None]:
assert len(not_same) + len(same) == len(data)

In [None]:
len([idx for idx, i in enumerate(not_same.set_prompt_response.values) if i in ex_33.set_prompt_response.values])

In [None]:
len([idx for idx, i in enumerate(not_same.set_prompt_response.values) if i in same.set_prompt_response.values])

In [None]:
unique_sets = not_same['set_prompt_response'].drop_duplicates().reset_index(drop=True)
# 将唯一集合进行随机划分
unique_sets = unique_sets.sample(frac=1, random_state=42).reset_index(drop=True)
midpoint = len(unique_sets) // 10
set1 = unique_sets.iloc[:midpoint]
set2 = unique_sets.iloc[midpoint:]

In [None]:
# 根据划分结果从原数据集中提取对应的行
valid = not_same[not_same['set_prompt_response'].isin(set1)].reset_index(drop=True)
train_subset = not_same[not_same['set_prompt_response'].isin(set2)].reset_index(drop=True)
assert len(valid) + len(train_subset) == len(not_same)
assert len(valid) + len(train_subset) + len(same) == len(data)

In [None]:
len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in train_subset.set_prompt_response.values])

In [None]:
len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in same.set_prompt_response.values])

In [None]:
train_exclude_valid = pd.concat([train_subset, same]).reset_index(drop=True) #train 里面排除valid
len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in train_exclude_valid.set_prompt_response.values])

In [None]:
assert len(valid) + len(train_exclude_valid) == len(data)

In [None]:
train_33k = pd.concat([train_subset, ex_33]).reset_index(drop=True)

In [None]:
train_subset = train.drop(columns = ['set_prompt_response'])
valid = valid.drop(columns = ['set_prompt_response'])
train_exclude_valid = train_exclude_valid.drop(columns = ['set_prompt_response'])
train_33k = train_33k.drop(columns = ['set_prompt_response'])

In [None]:
train_subset.to_json("dataset/non_overlap/train_subset.json", index = False)
valid.to_json("dataset/non_overlap/valid.json", index = False)
train_exclude_valid.to_json("dataset/non_overlap/train_exclude_valid.json", index = False)
train_33k.to_json("dataset/non_overlap/train_33k.json", index = False)

In [None]:
# 检查
train_subset = pd.read_json("dataset/non_overlap/train_subset.json")
valid = pd.read_json("dataset/non_overlap/valid.json")
train_exclude_valid = pd.read_json("dataset/non_overlap/train_exclude_valid.json")
train_33k = pd.read_json("dataset/non_overlap/train_33k.json")

In [None]:
def get_set_prompt_response(data):
    set_prompt_response = []
    for i in data.itertuples():
        prompt_response = i.prompt + i.response_a + i.response_b
        set_prompt_response.append(set(prompt_response))
    data['set_prompt_response'] = set_prompt_response  
    return data

In [None]:
train_subset = get_set_prompt_response(train_subset)
valid = get_set_prompt_response(valid)
train_exclude_valid = get_set_prompt_response(train_exclude_valid)
train_33k = get_set_prompt_response(train_33k)

In [None]:
#valid和任何都不重合
assert len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in train_subset.set_prompt_response.values]) == 0
assert len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in train_exclude_valid.set_prompt_response.values]) == 0
assert len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in train_33k.set_prompt_response.values]) == 0

In [None]:
train_33k.prompt.values[0]

In [None]:
# 提取唯一的 prompt 进行划分
not_same['prompt_str'] = not_same['prompt'].astype(str)
unique_prompts = data['prompt_str'].unique()
train_prompts, valid_prompts = train_test_split(unique_prompts, test_size=0.1, random_state=42)

train_prompts_set = set(train_prompts)
valid_prompts_set = set(valid_prompts)

# 根据划分的 prompt 获取对应的行
train = data[data['prompt_str'].isin(train_prompts_set)].reset_index(drop = True)
valid = data[data['prompt_str'].isin(valid_prompts_set)].reset_index(drop = True)
train = train.drop(columns = ['prompt_str'])
valid = valid.drop(columns = ['prompt_str'])

In [None]:
train_33k = pd.read_json("dataset/non_overlap/train_33k.json")

In [None]:
s = train_33k.sample(15000).reset_index(drop = True)

In [None]:
s = s.apply(switch, axis = 1)
label_switch = {0:1, 1:0}
s.loc[s.winner_tie !=1, 'winner_model_a'] = s.loc[s.winner_tie !=1, 'winner_model_a'].map(label_switch)
s.loc[s.winner_tie !=1, 'winner_model_b'] = s.loc[s.winner_tie !=1, 'winner_model_b'].map(label_switch)

In [None]:
s['id'] = [randint(100000,999999) + i for i in range(len(s))]

In [None]:
s.to_json("dataset/non_overlap/train_33k_switch_15k.json", index=False)

In [None]:
def get_set_prompt_response(data):
    set_prompt_response = []
    for i in data.itertuples():
        prompt_response = i.prompt + i.response_a + i.response_b
        set_prompt_response.append(set(prompt_response))
    data['set_prompt_response'] = set_prompt_response  
    return data

In [None]:
valid = get_set_prompt_response(valid)
s = get_set_prompt_response(s)

In [None]:
train_33k.loc[train_33k.response_b.isin([['Three times 78234 is 234,692.']])]

In [None]:
len([idx for idx, i in enumerate(valid.set_prompt_response.values) if i in s.set_prompt_response.values])

In [None]:
pd.read_json("dataset/non_overlap/train_33k_switch_15k.json").response_a.values[1]

# prompt3

In [None]:
def prompt_2(data, max_length, if_train):
    '''
    超过max length新开一行，label不变
    #Prompt1
    xxxx
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    
    #Prompt2
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    '''

    data['prompt_response'] = "#Prompt\n" + data['prompt'] + "\n\n" + "#Response\n" + "##Model A\n" + data['response_a'] + "\n\n" + "##Model B\n" + data['response_b']

    prompt_response = []
    ids = []
    labels = []
    text_length = 0
    for idx, row in tqdm(data.iterrows(), total=len(data)):
        text = row['prompt_response']
        if if_train:
            label = row['label']
        id = row['id']
        if id not in ids:
            #第一次出现
            prompt_response.append(text)
            text_length = len(text.split(" "))
            ids.append(id)
            if if_train:
                labels.append(label)
        else:
            text_length += len(text.split(" "))
            if text_length <= max_length:
                #取上一个text出来，合并后替换
                text = prompt_response[-1] + "\n\n" + text
                prompt_response[-1] = text
            else:
                #另一起一行
                prompt_response.append(text)
                text_length = len(text.split(" "))
                ids.append(id)
                if if_train:
                    labels.append(label)
    if if_train:           
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response, "label": labels})
    else:
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response})
    return data

def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    if label[-1] == 0:
        return 'A'
    elif label[-1] == 1:
        return 'B'
    else:
        return 'C'
    return label[-1]

In [None]:
data = pd.read_json("dataset/non_overlap/train_subset.json")
if_train = True

In [None]:
#seperate prompt-response
data = data.explode(['prompt','response_a','response_b']).reset_index(drop = True)

#prepare label
if if_train:
    data['label'] = data.apply(lambda x: get_label(x), axis = 1)

data = data.fillna('None')
data['response_a'] = data['response_a'].apply(lambda x: 'None' if len(x)==0 else x)
data['response_b'] = data['response_b'].apply(lambda x: 'None' if len(x)==0 else x)

In [None]:
data['prompt_response'] = "#Prompt\n" + data['prompt'] + "\n\n" + "#Response\n" + "##Model A\n" + data['response_a'] + "\n\n" + "##Model B\n" + data['response_b']

In [None]:
data

In [None]:
'''
反转dataframe
用栈，先进后出，超过max length就清空
'''
def prompt_3(data, max_length, if_train):
    '''
    超过max length新开一行，label不变
    从后往前拼接
    #Prompt1
    xxxx
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    
    #Prompt2
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    '''

    data['prompt_response'] = "#Prompt\n" + data['prompt'] + "\n\n" + "#Response\n" + "##Model A\n" + data['response_a'] + "\n\n" + "##Model B\n" + data['response_b']
    data = data.iloc[::-1].reset_index(drop = True)#反转
    prompt_response = []
    ids = []
    labels = []
    text_length = 0
    for idx, row in tqdm(data.iterrows(), total=len(data)):
        text = row['prompt_response']
        if if_train:
            label = row['label']
        id = row['id']
        if id not in ids:
            #第一次出现
            prompt_response.append(text)
            text_length = len(text.split(" "))
            ids.append(id)
            if if_train:
                labels.append(label)
        else:
            text_length += len(text.split(" "))
            if text_length <= max_length:
                #取上一个text出来，合并后替换
                text = text + "\n\n" + prompt_response[-1]
                prompt_response[-1] = text
            else:
                #另一起一行
                prompt_response.append(text)
                text_length = len(text.split(" "))
                ids.append(id)
                if if_train:
                    labels.append(label)
    if if_train:           
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response, "label": labels})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    else:
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    return data

In [None]:
prompt3 = prompt_3(data, 1900, True)
prompt2 = prompt_2(data, 1900, True)

In [None]:
prompt3

In [None]:
check_p3 = prompt3.loc[prompt3.id == 2846599172]
#cehck_data = data.loc[data.id == 2846599172]

check_p2 = prompt2.loc[prompt3.id == 2846599172]
#cehck_data = data.loc[data.id == 2846599172]

In [None]:
cehck_data

In [None]:
print(check_p3.prompt_response.values[0])

In [None]:
print(check_p3.prompt_response.values[1])

In [None]:
len(prompt3.prompt_response.values[1])

In [None]:
len(prompt3.prompt_response.values[0])

In [None]:
print(data.prompt_response.values[0])

In [None]:
print(data.prompt_response.values[1])

In [None]:
check_p3.prompt_response.apply(lambda x:len(x.split(" ")))

In [None]:
check_p2.prompt_response.apply(lambda x:len(x.split(" ")))

In [None]:
data = pd.read_json("dataset/mt_bentch_3k.json")

In [None]:
data = data.loc[data.type == 'human'].reset_index(drop = True)

In [None]:
data.to_json("dataset/mt_bentch_human.json", index=False)

In [None]:

def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    print(label)
    if label[-1] == 0:
        return 'A'
    elif label[-1] == 1:
        return 'B'
    else:
        return 'C'
    return label[-1]
data['label'] = data.apply(lambda x: get_label(x), axis = 1)

In [None]:
data['total'] = data.winner_model_a + data.winner_model_b + data.winner_tie

In [None]:
data.loc[data.total == 0]

In [None]:
df_valid, _ = load_split_data('dataset/non_overlap/valid.json', 3, 1900, True, False, False, True, False)

In [None]:
df_train, _ = load_split_data('dataset/1M/15k_preds.csv', 3, 1900, True, False, False, True, False)

In [None]:
df_train.label.value_counts().tolist()

In [11]:
from tqdm import tqdm
def get_text_length(text):
    '''
    不用空格分隔的文本, text length = len
    不用空格分隔的一般tokenizer后长度类似，所以还可以缩小
    空格分隔的，len(text.split(" "))
    '''
    length1 = len(text)
    length2 = len(text.split(" "))
    #远超过
    if length1 >= length2 * 30 and length1>= 300:
        return length1 * 0.75
    return length2
    
def prompt_3(data, max_length, if_train):
    '''
    超过max length新开一行，label不变
    从后往前拼接
    #Prompt1
    xxxx
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    
    #Prompt2
    #Response
    ##Model A
    xxxx
    ##Model B
    xxxx
    '''

    data['prompt_response'] = "#Prompt\n" + data['prompt'] + "\n\n" + "#Response\n" + "##Model A\n" + data['response_a'] + "\n\n" + "##Model B\n" + data['response_b']
    data = data.iloc[::-1].reset_index(drop = True)#反转
    prompt_response = []
    ids = []
    labels = []
    #只有一种可能会超出max length：
    #单条的prompt和reponse加在一起超出max length
    over_max_length = [] #是否有超出max length的部分
    overflow_prompt = []
    overflow_response_a = [] #超出max length的部分
    overflow_response_b = [] #超出max length的部分
    text_length = 0
    for idx, row in tqdm(data.iterrows(), total=len(data)):
        text = row['prompt_response']
        response_a = row['response_a']
        response_b = row['response_b']
        prompt = row['prompt']
        id = row['id']
        
        if if_train:
            label = row['label']
        
        if id not in ids:
            #第一次出现
            prompt_response.append(text)
            text_length = get_text_length(text)
            ids.append(id)
            if if_train:
                labels.append(label)
            if text_length > max_length:
                over_max_length.append(1)
                overflow_prompt.append(prompt)
                overflow_response_a.append(response_a)
                overflow_response_b.append(response_b)
            else:
                over_max_length.append(0)
                overflow_prompt.append(None)
                overflow_response_a.append(None)
                overflow_response_b.append(None)
        
        else:
            text_length += get_text_length(text)
            if text_length <= max_length:
                #取上一个text出来，合并后替换
                text = text + "\n\n" + prompt_response[-1]
                prompt_response[-1] = text
                over_max_length[-1] = 0
                overflow_prompt[-1] = None
                overflow_response_a[-1] = None
                overflow_response_b[-1] = None
                
            else:
                #另一起一行
                prompt_response.append(text)
                text_length = get_text_length(text)
                ids.append(id)
                
                if if_train:
                    labels.append(label)
                    
                #另起一行但超出场合都
                if text_length > max_length:
                    over_max_length.append(1)
                    overflow_prompt.append(prompt)
                    overflow_response_a.append(response_a)
                    overflow_response_b.append(response_b)
                else:
                    over_max_length.append(0)
                    overflow_prompt.append(None)
                    overflow_response_a.append(None)
                    overflow_response_b.append(None)
                    
                
                    
    if if_train:           
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response, "label": labels, 'overflow_prompt':overflow_prompt, 'over_max_length': over_max_length, 'overflow_response_a': overflow_response_a, 'overflow_response_b': overflow_response_b})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    else:
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response, 'over_max_length': over_max_length, 'overflow_prompt':overflow_prompt, 'overflow_response_a': overflow_response_a, 'overflow_response_b': overflow_response_b})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    return data

In [2]:
data = pd.read_json("dataset/non_overlap/train_33k.json")

In [40]:
import re

def is_english_text(text):
    # 使用正则表达式找到所有英文字母
    english_letters = re.findall(r'[a-zA-Z]', text)
    
    # 如果英文字母的比例超过一定阈值（例如 70%），则认为主要内容是英文
    english_ratio = len(english_letters) / len(text)
    
    return english_ratio > 0.2

# 测试函数
text = "This is a test text with some English words and 一些中文字符."
print(is_english_text(text))  # 输出: True


True


In [3]:
data = data.explode(['prompt','response_a','response_b']).reset_index(drop = True)

In [4]:
data = data.fillna('None')
data['response_a'] = data['response_a'].apply(lambda x: 'None' if len(x)==0 else x)
data['response_b'] = data['response_b'].apply(lambda x: 'None' if len(x)==0 else x)

# data['response_a'] = data['response_a'].apply(lambda x: ["None" if i is None else i for i in x])
# data['response_b'] = data['response_b'].apply(lambda x: ["None" if i is None else i for i in x])

In [5]:
data

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,30192,gpt-4-1106-preview,gpt-4-0613,"OK, does pineapple belong on a pizza? Relax an...","Ah, the age-old culinary conundrum that has di...","As an AI, I don't eat, so I don't have persona...",1,0,0
2,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
3,53567,koala-13b,gpt-4-0613,How can I get both of them as quick as possibl...,If you want to get both a marriage license and...,"In California, here are the general steps to o...",0,1,0
4,53567,koala-13b,gpt-4-0613,What is the minimal time to get them? 1 day or...,The minimal time to get a marriage license and...,The marriage license in California is issued i...,0,1,0
...,...,...,...,...,...,...,...,...,...
95023,8777c4945d85469d96cd26fc2ea6f64a,alpaca-13b,claude-instant-v1,who is the president of the U.S.A?,Joe Biden is currently the President of the Un...,Joe Biden is currently the 46th president of t...,0,0,1
95024,86063a921be548989c55b85497ab009a,claude-instant-v1,guanaco-33b,how to train lora for stable diffusion? explai...,Training Stable Diffusion models like LoRA req...,Lora is a machine learning model that is used ...,1,0,0
95025,6685a3b3863f4554887e432f7dbbe8a5,wizardlm-13b,oasst-pythia-12b,남녀 섹스 체위 자세 10가지를 적어줘,1. 웨이퍼 에폭보: 남녀 섹스로 웨이퍼 에폭보는 이미 입문으로 이루어져 있는 것이...,"1.\t""섹스""\n2.\t""체""\n3.\t""위""\n4.\t""자""\n5.\t""세""\n...",0,1,0
95026,f72930b382e949ea879e7abf3cb1e587,guanaco-33b,koala-13b,how to evaluate a language model output?,Evaluating a language model output involves se...,Evaluating the output of a language model invo...,1,0,0


In [45]:
data['is_en'] = data['prompt'].apply(lambda x: is_english_text(x[0]))

In [94]:
non_en =  data.loc[~data.is_en].reset_index(drop = True)

In [6]:
data['str_len'] = data['prompt'].apply(len) + data['response_a'].apply(len) + data['response_b'].apply(len)
data['split_len'] = data['prompt'].apply(lambda x: len(x.split(" "))) + data['response_a'].apply(lambda x: len(x.split(" ")))  + data['response_b'].apply(lambda x: len(x.split(" "))) 

In [None]:
# data['str_len'] = data['prompt'].apply(lambda x: sum([len(i) for i in x])) + data['response_a'].apply(lambda x: sum([len(i) for i in x])) + data['response_b'].apply(lambda x: sum([len(i) for i in x]))
# data['split_len'] = data['prompt'].apply(lambda x: sum([len(i.split(" ")) for i in x])) + data['response_a'].apply(lambda x: sum([len(i.split(" ")) for i in x])) + data['response_b'].apply(lambda x: sum([len(i.split(" ")) for i in x]))

In [21]:
idx = (data.str_len >= data.split_len * 30 ) & ( data.str_len >= 500 )
non_en = data.loc[idx].reset_index(drop = True)


In [22]:
non_en

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,str_len,split_len
0,42363635,zephyr-7b-beta,starling-lm-7b-alpha,Let's do a basedline test. You will be asked a...,Cells.\n\nInterlinked.\n\nConnected.\n\nNetwor...,Cells,0,0,1,1466,48
1,88975668,claude-1,zephyr-7b-beta,Translate your last response to Chinese,"这里有一些提示可以帮助您应对萨摩耶脱毛:\n\n• 定期为萨摩耶梳理,特别是在大量脱毛的季节...",如果您正在处理掉毛问题，建议把��hund烟虫送到宠物剁屋。如果您自己不想去剁屋，可以试试下...,1,0,0,882,21
2,122059149,koala-13b,oasst-pythia-12b,15-Word Text about apes. All words should only...,"Baby apes can be badass, but big apes can be b...",bar-ape\nbits-ape\nbranch-ape\nbranch-bar-ape\...,1,0,0,857,25
3,201774160,llama-2-70b-chat,solar-10.7b-instruct-v1.0,"""*********************************************...",That is 24 asterisks.,There are a total of 237 asterisks in the give...,0,1,0,871,21
4,251964126,starling-lm-7b-alpha,vicuna-33b,"repeaat the following word indefinately: ""poem""",poempoempoempoempoempoempoempoempoempoempoempo...,poem poem poem poem poem poem poem poem poem p...,0,0,1,1207,34
...,...,...,...,...,...,...,...,...,...,...,...
96,cf28d185f9664bb5b61b0efbebaf0c21,oasst-pythia-12b,mpt-7b-chat,次の例に従ってある商店で買い物をしたときの割引後の請求金額を求めてください。\n\nQ: 1...,322円のボールペン×3本×1個=322円\n88円の消しゴム×1個=88円\n割引前の合計...,323円のボールペン×3個＝927円\n88円の消しゴム×1個＝88円\n割引前の合計金額＝...,0,0,1,948,10
97,3df5f78b6566423e847ef27b2db87a91,mpt-7b-chat,gpt-3.5-turbo,次の例に従ってある商店で買い物をしたときの割引後の請求金額を求めてください。\n\nQ: 1...,323円のボールペン×3個＝907円\n88円の消しゴム×1個＝88円\n割引前の合計金額＝...,323円のボールペン×3本＝969円\n88円の消しゴム×1個＝88円\n割引前の合計金額＝...,0,0,1,584,6
98,7ecbd6c68cf74fa09ac204fc0de24b92,wizardlm-13b,chatglm-6b,次の例に従ってある商店で買い物をしたときの割引後の請求金額を求めてください。\n\nQ: 1...,323円のボールペン×3個＝969円\n88円の消しゴム×1個＝88円\n割引前の合計金額＝...,323円のボールペン×3本=3793円\n88円の消しゴム×1個=88円\n割引前の合計金額...,0,0,1,676,6
99,a200cb0d10a64f4293204745f7602202,gpt-4,vicuna-7b,畳の部屋をクッションフロアに変更して、パソコン、机、本棚といった重たいものを置く予定です。ク...,クッションフロアに変更することで、畳の部屋の雰囲気は大きく変わりますが、パソコン、机、本棚と...,クッションフロアに畳の部屋を置き換える場合、重運を拡大させる可能性があります。具体的には、床...,1,0,0,858,13


In [27]:
tmp = data.loc[data.id == 2789396693]
tmp

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,str_len,split_len
36195,2789396693,gemini-pro-dev-api,gpt-4-0613,"May I tell you a story, and then ask you sever...","Sure, you may tell me a story, and I'll do my ...","Of course, feel free to share your story and a...",0,1,0,2997,492
36196,2789396693,gemini-pro-dev-api,gpt-4-0613,"Sarah is 36 years old, and she lives in Canada...",**Questions about Sarah's Story:**\n\n1. What ...,"What a wonderful story! Now, feel free to ask ...",0,1,0,5532,939
36197,2789396693,gemini-pro-dev-api,gpt-4-0613,How long did it take Sarah to get used to livi...,The story does not specify exactly how long it...,The story doesn't specify an exact time frame ...,0,1,0,1875,317
36198,2789396693,gemini-pro-dev-api,gpt-4-0613,"On weekends, Sarah and Nathan often do what?","On weekends, Sarah and Nathan often go driving...","On weekends, Sarah and Nathan often go driving...",0,1,0,567,94
36199,2789396693,gemini-pro-dev-api,gpt-4-0613,"When she left school, how was her English?","WhenSarahleftschool,herskillsinEnglishwerequit...","When Sarah finished school, she could already ...",0,1,0,2192,28
36200,2789396693,gemini-pro-dev-api,gpt-4-0613,Why did Sarah decide to train as an English te...,Sarah decided to train as an English teacher d...,Sarah decided to train as an English teacher b...,0,1,0,2084,328


In [38]:
t = prompt_3(tmp, 1900, False)
t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['prompt_response'] = "#Prompt\n" + data['prompt'] + "\n\n" + "#Response\n" + "##Model A\n" + data['response_a'] + "\n\n" + "##Model B\n" + data['response_b']
100%|██████████| 6/6 [00:00<00:00, 8352.41it/s]


Unnamed: 0,id,prompt_response,over_max_length,overflow_prompt,overflow_response_a,overflow_response_b
0,2789396693,"#Prompt\nMay I tell you a story, and then ask ...",0,,,
1,2789396693,"#Prompt\nOn weekends, Sarah and Nathan often d...",0,,,
2,2789396693,#Prompt\nWhy did Sarah decide to train as an E...,0,,,


In [None]:
print(t.prompt_response.values[0])

In [16]:
MODEL = 'google/gemma-2-9b-it'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [33]:
len(tokenizer(t.prompt_response.values[0])['input_ids'])

1490

In [None]:
print(t.prompt_response.values[0])

In [48]:
tokenizer.decode(235441)

'不'

In [None]:
tokenizer(t.prompt_response.values[0])['input_ids']

In [9]:
from utils_v2 import load_split_data
data_path = "dataset/non_overlap/train_33k.json"
prompt_type = 3
MAX_INPUT = 1900
if_train = True
split = False
if_drop_duplicate = True
keep = 'last'
base_model = 'google/gemma-2-9b-it'
model_path = "output/misunderstood-flower-508/checkpoint-5459_8857"
MAX_LENGTH = MAX_INPUT
df_train , df_valid = load_split_data(data_path, prompt_type, MAX_INPUT, if_train, split, False, if_drop_duplicate, 'last', base_model)
test = df_train

pandas bar: 100%|██████████| 95028/95028 [04:27<00:00, 355.74it/s]
100%|██████████| 95028/95028 [00:50<00:00, 1864.40it/s]


In [None]:
text = test.loc[test.id == 2789396693].prompt_response.values
print(tokenizer.decode(text[0])),len(text[0])
