In [1]:
import argparse
from typing import Optional, Union

import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from dataclasses import dataclass

import datasets
from datasets import Dataset

from sklearn.metrics import log_loss

from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

from peft import (
    get_peft_config,
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
import os

import random

def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    if label[-1] == 0:
        return 'A'
    elif label[-1] == 1:
        return 'B'
    else:
        return 'C'
    return label[-1]

from tqdm import tqdm

In [2]:
data = pd.read_json("dataset/non_overlap/train_subset.json")

In [3]:
#seperate prompt-response
data = data.explode(['prompt','response_a','response_b']).reset_index(drop = True)

data['label'] = data.apply(lambda x: get_label(x), axis = 1)

data = data.fillna('None')
data['response_a'] = data['response_a'].apply(lambda x: 'None' if len(x)==0 else x)
data['response_b'] = data['response_b'].apply(lambda x: 'None' if len(x)==0 else x)

In [7]:
MODEL = 'google/gemma-2-9b-it'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [95]:
templete_part1 = "<start_of_turn>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<end_of_turn>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=False, padding=False)['input_ids']
#print(f"templete_part2 is {templete_part2_input_ids}")
templete_part3 = "<start_of_turn>model\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=False, padding=False)['input_ids']

use_in_prompt_1 = tokenizer(text="#Prompt\n", add_special_tokens=False, padding=False)['input_ids']
use_in_prompt_2 = tokenizer(text="\n\n" + "#Response\n" + "##Model A\n", add_special_tokens=False, padding=False)['input_ids']
use_in_prompt_3 = tokenizer(text="\n\n" + "##Model B\n", add_special_tokens=False, padding=False)['input_ids']

templete_part4_input_ids = tokenizer(text="\n\n", add_special_tokens=False, padding=False)['input_ids']
eos_token_id = tokenizer.eos_token_id

In [6]:
data = data.sample(1000).reset_index(drop = True)

In [7]:
def tokenize(row, tokenizer):

    now_data = row
    response_a = row['response_a']
    response_a_input_ids = tokenizer(text=response_a, add_special_tokens=False, padding=False)['input_ids']
    row['response_a_input_ids'] = response_a_input_ids
    
    response_b = row['response_b']
    response_b_input_ids = tokenizer(text=response_b, add_special_tokens=False, padding=False)['input_ids']
    row['response_b_input_ids'] = response_b_input_ids
    
    prompt = row['prompt']
    prompt_input_ids = tokenizer(text=prompt, add_special_tokens=False, padding=False)['input_ids']
    row['prompt_input_ids'] = prompt_input_ids
    
    label = now_data['label']
    label_ids = tokenizer.encode(text=label, add_special_tokens=False)
    row['label_ids'] = label_ids

    return row

In [8]:
data = data.apply(lambda x: tokenize(x, tokenizer), axis = 1)

In [103]:
def adjust_values(A, B, a_space, b_space, ex_space):
    # 计算A和a_space的差值
    a_diff = a_space - A
    b_diff = b_space - B
    
    # 第一种情况：A小于a_space，B小于b_space
    if A < a_space and B < b_space:
        ex_space += a_diff + b_diff
        return A, B, ex_space

    # 第二种情况：如果A和B都各自大于自己的space
    elif A > a_space and B > b_space:
        total_extra_needed = (A - a_space) + (B - b_space)
        if total_extra_needed > ex_space:
            A = int(a_space + ex_space / 2)
            B = int(b_space + ex_space / 2)
            ex_space = 0
        else:
            a_space = A
            b_space = B
            ex_space -= total_extra_needed
            
        return A, B, ex_space
        
    # 第三种情况：A或者B其中有一个大于a_space, b_space
    elif A >= a_space or B >= b_space:
        # 如果A大于a_space但是B小于b_space
        if A >= a_space and B <= b_space:
            extra_needed = A - a_space
            ex_space += b_space - B
            #够用
            if ex_space >= extra_needed:
                ex_space -= extra_needed
                
            else:
                #不够用
                #b_space = B + available_space
                A = a_space + ex_space
                ex_space = 0

        # 如果B大于b_space但是A小于a_space
        elif B > b_space and A < a_space:
            extra_needed = B - b_space
            ex_space += a_space - A
            
            if ex_space >= extra_needed:
                ex_space -= extra_needed
                
            else:
                B = b_space + ex_space
                ex_space = 0

        return A, B, ex_space
    

def adjust(current_lengths, prompt_length_space=300, response_length_space=800):
    prompt_length = current_lengths[0]
    response_a_length = current_lengths[1]
    response_b_length = current_lengths[2]
    #先看prompt的额度
    ex_space = max(0, prompt_length_space - prompt_length)
    response_a_length, response_b_length, ex_space = adjust_values(response_a_length, response_b_length, response_length_space, response_length_space, ex_space)
    prompt_length = min(prompt_length, prompt_length_space)
    prompt_length += ex_space

    return prompt_length, response_a_length, response_b_length

In [101]:
def over_max_length(prompt_input_ids, model_a_input_ids, model_b_input_ids, max_length):
    '''
    单条超出max length
    '''
    length = [len(prompt_input_ids), len(model_a_input_ids), len(model_b_input_ids)]
    prompt_length = int(max_length // 5)
    response_length = int((max_length - prompt_length) // 2)
    prompt_max_length, a_max_length, b_max_length = adjust(length, prompt_length, response_length)
    prompt_ids = prompt_input_ids[:prompt_max_length] + templete_part4_input_ids
    model_a_input_ids = model_a_input_ids[:a_max_length] + templete_part4_input_ids
    model_b_input_ids = model_b_input_ids[:b_max_length] + templete_part4_input_ids
    prompt_response_ids = prompt_ids + model_a_input_ids + model_b_input_ids
    return prompt_response_ids

In [102]:
def add_template(prompt_response_ids):
    input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids
    return input_ids

# make prompt

In [104]:
def prompt_3(data, max_length, if_train):
    data = data.iloc[::-1].reset_index(drop = True)#反转
    prompt_response = []
    ids = []
    labels = []
    #只有一种可能会超出max length：
    #单条的prompt和reponse加在一起超出max length
    
    text_length = 0
    for idx, row in tqdm(data.iterrows(), total=len(data)):
        # data['prompt_response_ids'] = use_in_prompt_1 + data['prompt_input_ids'] + use_in_prompt_2 + data['response_a_input_ids'] + use_in_prompt_3 + data['response_b_input_ids']
        # text = row['prompt_response_ids']
        text = use_in_prompt_1 + row['prompt_input_ids'] + use_in_prompt_2 + row['response_a_input_ids'] + use_in_prompt_3 + row['response_b_input_ids']
        response_a = row['response_a_input_ids']
        response_b = row['response_b_input_ids']
        prompt = row['prompt_input_ids']
        id = row['id']
        
        if if_train:
            label = row['label_ids']
        
        if id not in ids:
            #第一次出现
            text_length = len(text)
            ids.append(id)
            if if_train:
                labels.append(label)
            if text_length > max_length:
                text = over_max_length(prompt_input_ids = prompt, model_a_input_ids = response_a, model_b_input_ids = response_b, max_length = max_length)\
                
            text = add_template(text)
            prompt_response.append(text)
        
        else:
            text_length += len(text)
            if text_length <= max_length:
                #取上一个text出来，合并后替换
                text = text + templete_part4_input_ids + prompt_response[-1]
                prompt_response[-1] = text
                
            else:
                #另一起一行
                text_length = len(text)
                ids.append(id)
                
                if if_train:
                    labels.append(label)
                    
                #另起一行但超出长度
                if text_length > max_length:
                    text = over_max_length(prompt_input_ids = prompt, model_a_input_ids = response_a, model_b_input_ids = response_b, max_length = max_length)
                
                text = add_template(text)
                prompt_response.append(text)
                    
                
                    
    if if_train:           
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response, "label": labels})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    else:
        data = pd.DataFrame({'id': ids, 'prompt_response': prompt_response})
        data = data.iloc[::-1].reset_index(drop = True)#反转
    return data

In [105]:
final = prompt_3(data, max_length = 1900, if_train = True)

100%|██████████| 1000/1000 [00:00<00:00, 8356.32it/s]


In [106]:
final['length'] = final['prompt_response'].apply(len)

In [107]:
idx = final['length'] == final['length'].max()
final.loc[idx]

Unnamed: 0,id,prompt_response,label,length
70,2639720283,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235305],1953
255,685334275,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235280],1953
376,1383698196,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235280],1953
466,3837487706,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235280],1953
827,3712274856,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235280],1953
851,2581731284,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235288],1953
858,3071276906,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235305],1953
976,4097865015,"[2, 106, 1645, 108, 4858, 708, 1378, 2872, 235...",[235288],1953


In [110]:
print(tokenizer.decode(final.prompt_response.values[1]))

<bos><start_of_turn>user
Here are two question-answering dialogues. Compare two model performance on answering question, determine which is better.

#Prompt
What is discursive reasoning

#Response
##Model A
Discursive reasoning is a type of reasoning that involves making arguments and defending them using logical reasoning. It involves constructing a series of statements or claims that can be challenged and supported by evidence. Discursive reasoning is often used in debates and discussions, and is a key part of critical thinking.

##Model B
Hello! I'm here to help answer your questions safely and helpfully. Discursive reasoning is a type of thinking and reasoning that involves evaluating and analyzing information, arguments, and beliefs in a systematic and logical way. It involves considering multiple perspectives, weighing evidence, and making connections between ideas.

Discursive reasoning is an important skill for critical thinking, problem-solving, and effective communication. It

In [8]:
tqdm.pandas(desc = 'pandas bar')

In [1]:
from utils_v2 import *

In [2]:
tmp_train , _ = load_split_data("dataset/non_overlap/train_subset.json", 3, 1900, True, False, False, False, 'last', 'google/gemma-2-9b-it')

pandas bar: 100%|██████████| 55745/55745 [02:56<00:00, 315.25it/s]
100%|██████████| 55745/55745 [00:19<00:00, 2818.03it/s]


In [3]:
tmp_train['length'] = tmp_train['prompt_response'].apply(len)

In [4]:
idx = tmp_train['length'] == tmp_train['length'].max()
tmp_train.loc[idx]

Unnamed: 0,id,prompt_response,label,length
39,3254113,"[235345, 55440, 108, 3611, 692, 1707, 3104, 48...",[235288],1916
416,35088788,"[235345, 55440, 108, 24926, 5598, 476, 4866, 1...",[235288],1916
447,37697874,"[235345, 55440, 108, 11071, 235292, 1646, 708,...",[235288],1916
497,43053669,"[235345, 55440, 108, 156910, 889, 736, 5078, 2...",[235280],1916
554,48531611,"[235345, 55440, 108, 108, 235345, 1915, 664, 1...",[235305],1916
...,...,...,...,...
46553,4280501571,"[235345, 55440, 108, 1638, 608, 7588, 8293, 61...",[235305],1916
46570,4281980776,"[235345, 55440, 108, 90822, 573, 2412, 3409, 2...",[235280],1916
46637,4289338231,"[235345, 55440, 108, 49688, 573, 2412, 2793, 7...",[235288],1916
46648,4290839285,"[235345, 55440, 108, 19584, 145739, 26399, 235...",[235305],1916


In [5]:
data = pd.read_json("dataset/non_overlap/train_subset.json")

In [31]:
data[data['id'] == 35088788].prompt.values[0][1]

'please make it shorter, reddit style reply'

In [6]:
tmp_train[tmp_train['id'] == 35088788]

Unnamed: 0,id,prompt_response,label,length
416,35088788,"[235345, 55440, 108, 24926, 5598, 476, 4866, 1...",[235288],1916
417,35088788,"[235345, 55440, 108, 24926, 1501, 665, 25270, ...",[235288],380


In [4]:
len(tmp_train.prompt_response.values[0])

1178

In [2]:
MODEL = 'google/gemma-2-9b-it'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [3]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-2-9b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='left', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_

In [9]:
print(tokenizer.decode(tmp_train.prompt_response.values[417]))

#Prompt
please make it shorter, reddit style reply

#Response
##Model A
Hey [username], I feel the same way as you regarding XMR and BCH. While getting delisted can be tough, it could also spur innovation. Excited to see what new ideas emerge! And I couldn't agree more about XMR's focus on privacy aligning with the Cypherpunk's Manifesto. As someone who values privacy, I'm glad XMR is becoming less of a speculator's game. I'm also selling stuff on Monero Market now! [Insert link] Let's work together towards a future where privacy is a right, not a luxury. 🙌🔒🔑 #XMR #Cypherpunks #PrivacyMatters #AnonymityRocks

P.S. If you're interested in buying any of my items on Monero Market, check them out! [Insert link]

##Model B
Hey!

Totally vibing with your take on XMR's situation! Watching Bitcoin's journey had me worried too, especially seeing how the quick-buck mindset overshadowed the real essence of crypto. It's refreshing to see someone else who values privacy and the core principles laid