In [2]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoConfig,
    EarlyStoppingCallback,
    AutoModelForCausalLM,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    RobertaForMultipleChoice,
    AutoModelForSequenceClassification,
    LlamaModel,
    LlamaForSequenceClassification,
    BitsAndBytesConfig,
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    TrainerCallback,
)
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset

In [None]:
class args:
    """Plain namespace holding the settings read by the cells below."""
    # Maximum number of tokens kept per tokenized example.
    MAX_INPUT = 1024
    # Path of the CSV that the training frame is read from.
    train_data = './dataset/demo_train.csv'

In [None]:
# Read the training CSV, then replace the index with a fresh 0..n-1 range.
df_train = pd.read_csv(args.train_data)
df_train = df_train.reset_index(drop=True)
#df_valid = pd.read_csv(args.valid_data).reset_index(drop = True)

In [None]:
# Lookup from option letter to its position: 'A' -> 0 ... 'E' -> 4.
option_to_index = dict(zip('ABCDE', range(5)))

In [None]:
def get_label(row):
    """Map a row's one-hot winner columns to an integer class label.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed by 'winner_model_a', 'winner_model_b' and 'winner_tie'.

    Returns:
        int: 0 for 'winner_model_a', 1 for 'winner_model_b', 2 for
        'winner_tie'. If more than one column equals 1, the highest index wins.

    Raises:
        IndexError: If none of the three columns equals 1.
    """
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    matches = [idx for idx, column in enumerate(winner_columns) if row[column] == 1]
    if not matches:
        # Keep the exception type of the bare `label[-1]` lookup, but say what
        # is wrong instead of the generic "list index out of range".
        raise IndexError(
            f"no winner column is set to 1 (expected one of {winner_columns})"
        )
    return matches[-1]

In [None]:
# Compute the integer class label for every row and store it in a new column.
df_train['label'] = df_train.apply(get_label, axis=1)

In [None]:
def preprocess(example):
    """Tokenize one example as two (prompt, prompt + response) sentence pairs.

    The first element of each pair is "[CLS] " + prompt; the second is the
    prompt followed by response_a (pair 0) or response_b (pair 1), with
    literal " #### " / " [SEP] " markers. Uses the module-level `tokenizer`
    and `args.MAX_INPUT`.

    NOTE: a later cell defines another function named `preprocess`; when the
    notebook runs top to bottom that definition replaces this one.

    Args:
        example: Mapping with 'prompt', 'response_a', 'response_b' and 'label'.

    Returns:
        The tokenizer output for the two pairs, with 'label' added.
    """
    prompt = example['prompt']
    first_sentence = ["[CLS] " + prompt, "[CLS] " + prompt]
    second_sentences = []
    for option in ('response_a', 'response_b'):
        second_sentences.append(" #### " + prompt + " [SEP] " + example[option] + " [SEP]")
    encoded = tokenizer(
        first_sentence,
        second_sentences,
        truncation='longest_first',
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = example['label']
    return encoded

In [None]:
def preprocess(example):
    """Tokenize one example as a single sequence holding both responses.

    The text is " #### prompt [SEP] response_a [SEP]" immediately followed by
    " #### prompt [SEP] response_b [SEP]". It is passed to the module-level
    `tokenizer` inside a one-element list, so the tokenizer receives a batch
    of one; sequences are truncated to `args.MAX_INPUT`.

    Args:
        example: Mapping with 'prompt', 'response_a', 'response_b' and 'label'.

    Returns:
        The tokenizer output for the one-element batch, with 'label' added.
    """
    prompt = example['prompt']
    segment_a = " #### " + prompt + " [SEP] " + example['response_a'] + " [SEP]"
    segment_b = " #### " + prompt + " [SEP] " + example['response_b'] + " [SEP]"
    encoded = tokenizer(
        [segment_a + segment_b],
        truncation=True,
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = example['label']
    return encoded

In [None]:
# Checkpoint whose tokenizer is used by the preprocessing functions above.
MODEL = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Wrap the labelled training frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df_train)

In [None]:
dataset

In [None]:
# Run `preprocess` over every row; the listed raw columns are removed from the result.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b'],
)

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['input_ids'][0]

In [None]:
df_train.loc[40]

In [None]:
# Write the first 1000 rows (positions 0..999) as the demo training split.
# `.iloc[:1000]` is end-exclusive. The label-based `.loc[:1000]` used before
# also included row 1000, which the demo validation export (`.loc[1000:1200]`)
# contains as well, so one row leaked between the two files.
df_train.iloc[:1000].reset_index(drop = True).to_csv('demo_train.csv')

In [None]:
# Rows labelled 1000 through 1200 (label slices include both ends) are written
# out as the demo validation split, re-indexed from 0.
df_train.loc[1000:1200].reset_index(drop=True).to_csv('demo_valid.csv')

In [3]:
import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice, AutoModelForSequenceClassification, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from sklearn.metrics import log_loss
import torch.nn as nn
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
    

In [None]:
# Load the tokenizer for the Llama-3 checkpoint and register "<pad>" as its pad token.
MODEL = 'meta-llama/llama-3-transformers-8b-hf-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): if "<pad>" is not already in the vocabulary this grows the
# tokenizer, so a model loaded from MODEL presumably needs its embeddings
# resized to len(tokenizer). One later loading cell does that, the other does not.
tokenizer.add_special_tokens({"pad_token":"<pad>"})

In [None]:
tokenizer("<pad>")['input_ids'][0]

In [None]:
128256 in tokenizer("<pad>")['input_ids']

In [None]:
# 4-bit NF4 quantization settings with bfloat16 as compute dtype and double
# quantization turned off.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
# Load the checkpoint as a 3-label sequence classifier using the quantization
# settings above; device_map="auto" leaves device placement to the library.
# NOTE(review): unlike the second loading cell further down, this one neither
# sets model.config.pad_token_id nor resizes the embeddings after "<pad>" was
# added to the tokenizer. Confirm that is intended before using this model
# with padded batches.
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")

In [None]:
# Report the dtype of every named parameter of the loaded model.
for layer_name, weight in model.named_parameters():
    print(f'Layer: {layer_name}, dtype: {weight.dtype}')

In [None]:
# Same 4-bit NF4 settings as the previous loading cell, except that the
# compute dtype is float16 here.
# NOTE(review): bnb_4bit_compute_dtype is float16 while torch_dtype below is
# bfloat16. Confirm the mismatch is a deliberate experiment and not a leftover.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )
    
#config = AutoConfig.from_pretrained(args.MODEL)
# Reload the checkpoint as a 3-label sequence classifier (rebinds `model`).
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=3,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    #config = config,
    device_map="auto")
# Tell the model which id is padding, then resize the embedding matrix to the
# tokenizer's current size (the tokenizer cell registered "<pad>").
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# config = AutoConfig.from_pretrained(MODEL)
# config.hidden_dropout_prob = args.dropout_rate
# config.attention_probs_dropout_prob = args.dropout_rate
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,  # For sequence classification
#     inference_mode=False,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     bias = 'none',
#     target_modules=["q_proj","k_proj","v_proj"]  # Target specific modules
# )
# model = get_peft_model(model, peft_config)

In [None]:
# Print name, shape and dtype for every state_dict entry. The state dict is
# built once up front: calling model.state_dict() inside the loop rebuilt the
# whole mapping three times for every key.
state = model.state_dict()
for key, tensor in state.items():
    print(f"{key}, {tensor.shape}, {tensor.dtype}")

In [None]:
# Report the dtype of every named parameter of the current `model`.
for layer_name, weight in model.named_parameters():
    print(f'Layer: {layer_name}, dtype: {weight.dtype}')

In [None]:
# Report the dtype of every named parameter (same loop as the preceding cell).
for layer_name, weight in model.named_parameters():
    print(f'Layer: {layer_name}, dtype: {weight.dtype}')

In [None]:
# Collect the dtype of each model parameter into a list for display.
[weight.dtype for weight in model.parameters()]

In [1]:
from utils import load_split_data

In [8]:
pd.read_csv('dataset/lmsys-chatbot_arena_conversations-33k.csv')

Unnamed: 0,id,model_a,model_b,winner_model_a,winner_model_b,winner_tie,prompt,response_a,response_b
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,0,1,0,['What is the difference between OpenCL and CU...,['OpenCL and CUDA are two different programmin...,['OpenCL and CUDA are both programming languag...
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,0,0,1,['Why did my parent not invite me to their wed...,['It is possible that your parent did not invi...,['It is likely that they wanted to keep the gu...
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,0,1,0,"['Fuji vs. Nikon, which is better?']",['Both Fuji and Nikon are popular camera brand...,"[""This is a subjective question and the answer..."
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,0,1,0,['How to build an arena for chatbots?'],['Building an arena for chatbots can be done b...,['Building an arena for chatbots is a great wa...
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,1,0,0,['When is it today?'],"[""I'm sorry, I cannot determine the current da...","['Today is February 23, 2023.']"
...,...,...,...,...,...,...,...,...,...
32995,8777c4945d85469d96cd26fc2ea6f64a,alpaca-13b,claude-instant-v1,0,0,1,['who is the president of the U.S.A?'],['Joe Biden is currently the President of the ...,['Joe Biden is currently the 46th president of...
32996,86063a921be548989c55b85497ab009a,claude-instant-v1,guanaco-33b,1,0,0,['how to train lora for stable diffusion? expl...,"[""Training Stable Diffusion models like LoRA r...","[""Lora is a machine learning model that is use..."
32997,6685a3b3863f4554887e432f7dbbe8a5,wizardlm-13b,oasst-pythia-12b,0,1,0,['남녀 섹스 체위 자세 10가지를 적어줘'],['1. 웨이퍼 에폭보: 남녀 섹스로 웨이퍼 에폭보는 이미 입문으로 이루어져 있는 ...,"['1.\t""섹스""\n2.\t""체""\n3.\t""위""\n4.\t""자""\n5.\t""세""..."
32998,f72930b382e949ea879e7abf3cb1e587,guanaco-33b,koala-13b,1,0,0,['how to evaluate a language model output?'],"[""Evaluating a language model output involves ...","[""Evaluating the output of a language model in..."


In [7]:
# Build the train/validation frames from the 33k arena CSV.
# NOTE(review): the positional arguments (2, 3000, True) are defined by
# utils.load_split_data, which is not visible here. TODO: confirm their meaning
# and pass them by keyword.
df_train, df_valid = load_split_data('dataset/lmsys-chatbot_arena_conversations-33k.csv', 2, 3000, True)

100%|██████████| 39283/39283 [00:16<00:00, 2343.84it/s]


In [2]:
# Same call as the previous cell with one extra positional argument (False);
# this rebinds df_train and df_valid.
# NOTE(review): the meaning of the extra flag is defined by
# utils.load_split_data, which is not visible here. TODO: confirm it and pass
# it by keyword.
df_train, df_valid = load_split_data('dataset/lmsys-chatbot_arena_conversations-33k.csv', 2, 3000, True, False)

100%|██████████| 39283/39283 [00:16<00:00, 2331.52it/s]


In [4]:
df_valid

In [5]:
df_train

Unnamed: 0,id,prompt_response,label
0,58210e39b3fd4441a2bd4a518bb44c2d,#Prompt\nWhat is the difference between OpenCL...,B
1,2564acd09e3942fd97657d05282d4389,#Prompt\nWhy did my parent not invite me to th...,C
2,90bfd142157948aba01931726c888e7f,"#Prompt\nFuji vs. Nikon, which is better?\n\n#...",B
3,a7c5accc53e649a3bc6b2e41d962ebc4,#Prompt\nHow to build an arena for chatbots?\n...,B
4,adf27e819a3c494cb6e993f0c660e097,#Prompt\nWhen is it today?\n\n#Response\n##Mod...,A
...,...,...,...
33013,8777c4945d85469d96cd26fc2ea6f64a,#Prompt\nwho is the president of the U.S.A?\n\...,C
33014,86063a921be548989c55b85497ab009a,#Prompt\nhow to train lora for stable diffusio...,A
33015,6685a3b3863f4554887e432f7dbbe8a5,#Prompt\n남녀 섹스 체위 자세 10가지를 적어줘\n\n#Response\n#...,B
33016,f72930b382e949ea879e7abf3cb1e587,#Prompt\nhow to evaluate a language model outp...,A


In [None]:

# Take the row labelled 1 as the worked example for the template cells below.
prompt_response, label = df_train.loc[1, ['prompt_response', 'label']]

In [None]:
prompt_response

In [None]:
# Decode a short hand-picked id sequence back to text.
# NOTE(review): these ids presumably come from the Phi-3 vocabulary probed in
# the surrounding cells, yet the Phi-3 tokenizer is only loaded further down.
# Confirm which `tokenizer` is bound when this runs.
tokenizer.decode([1,
 32006,
 887,
 526])

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer.encode('<|system|>\nYou')

In [None]:
tokenizer.encode('Apple\nBa')

In [None]:
tokenizer.decode([396,
 18571,
 415,
 13,
 4548,
 7420,])

In [None]:
tokenizer.decode([29933])

In [None]:
# Preview the full prompt text: template pieces + sample text + label.
# NOTE: in the visible notebook templete_part1/2/3 are first assigned in the
# cell after this one, so on a fresh kernel run top to bottom this line raises
# NameError.
print(templete_part1 + prompt_response + templete_part2 + templete_part3 + label)

In [None]:
# Build one training sequence by hand: system/user preamble + sample text +
# answer options + assistant header + gold label + EOS.
# NOTE(review): the <|system|>/<|end|>/<|user|>/<|assistant|> markers look like
# the Phi-3 chat format, but the Phi-3 tokenizer is only loaded in a later
# cell. Confirm which `tokenizer` this cell is meant to run with.
templete_part1 = "<|system|>\nYou are a helpful assistant good at judging conversations.<|end|>\n<|user|>\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

# The [1:] slices below drop the first id of every later piece, presumably the
# BOS token that add_special_tokens=True prepends, so only part 1 keeps it.
# TODO: confirm the tokenizer in use prepends exactly one token.
templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids'][1:]

templete_part3 = "<|assistant|>\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids'][1:]

# The sample text is truncated to at most 3000 ids before the [1:] slice is applied.
prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids'][1:]


# The label is encoded without special tokens and followed by an explicit EOS id.
label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
# Decode the assembled ids as a visual sanity check.
print(tokenizer.decode(input_ids))

In [None]:
# Assemble the same template as plain text around a toy input and print it.
# NOTE: this rebinds `prompt_response` (previously the sample row's text) to
# the fully templated string, which ends with the label and the tokenizer's
# EOS token. Later cells that read `prompt_response` see this value.
text = "Apple"
prompt_response = templete_part1 + text + templete_part2 + templete_part3 + label + tokenizer.eos_token
print(prompt_response)

In [None]:
# Switch MODEL and `tokenizer` to the Phi-3 mini instruct checkpoint.
# truncation_side='left' makes truncation drop tokens from the start of a
# sequence, so the end of an over-long input is what is kept.
# NOTE(review): the three-segment path looks like a local or mirrored
# directory, not a Hub id. Confirm it resolves in the target environment.
MODEL = 'microsoft/LLM-Research/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')

In [None]:
tokenizer(prompt_response)

In [None]:
tokenizer.eos_token_id

In [None]:
tokenizer.eos_token

In [None]:
tokenizer.bos_token

In [None]:
tokenizer.decode([887])

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
tokenizer.pad_token_id

In [None]:
tokenizer.pad_token

In [None]:
tokenizer('<|user|>',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
AutoModelForCausalLM.from_pretrained(MODEL)

In [None]:
# Load the Qwen2 instruct tokenizer (rebinds `tokenizer`) and render a small
# system + user conversation through its chat template.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# tokenize=False returns the rendered string, not token ids;
# add_generation_prompt=True appends the header that opens the assistant turn.
# NOTE: this rebinds `text`, which an earlier cell set to "Apple".
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [None]:
print(text)

In [None]:
# Build one training sequence in the <|im_start|>/<|im_end|> chat layout:
# system/user preamble + sample text + answer options + assistant header +
# gold label + EOS, then decode it as a visual sanity check.

# Re-read the sample row here. An earlier cell rebinds `prompt_response` to a
# fully templated "Apple" demo string, so relying on the leftover global would
# wrap one chat template inside another when the notebook runs top to bottom.
prompt_response = df_train.loc[1,'prompt_response']
label = df_train.loc[1,'label']

templete_part1 = "<|im_start|>system\nYou are a helpful assistant good at judging conversations.<|im_end|>\n<|im_start|>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n"
templete_part1_input_ids = tokenizer(text=templete_part1, add_special_tokens=True, padding=False)['input_ids']

# NOTE(review): unlike the earlier <|system|>/<|end|> cell, no [1:] slice is
# applied to the pieces below, presumably because this tokenizer does not
# prepend a BOS token. Confirm against the tokenizer actually loaded.
templete_part2 = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<|im_end|>\n"
templete_part2_input_ids = tokenizer(text=templete_part2, add_special_tokens=True, padding=False)['input_ids']
#print(f"templete_part2 is {templete_part2_input_ids}")
templete_part3 = "<|im_start|>assistant\n"
templete_part3_input_ids = tokenizer(text=templete_part3, add_special_tokens=True, padding=False)['input_ids']

# The sample text is truncated to at most 3000 ids.
prompt_response_ids = tokenizer(text=prompt_response, add_special_tokens=True, truncation=True,
                                      max_length=3000, padding=False)['input_ids']


# The label is encoded without special tokens and followed by an explicit EOS id.
label_ids = tokenizer.encode(text=label, add_special_tokens=False)
input_ids = templete_part1_input_ids + prompt_response_ids + templete_part2_input_ids + templete_part3_input_ids + label_ids + [tokenizer.eos_token_id]
print(tokenizer.decode(input_ids))

In [None]:
tokenizer.decode(14374)

In [None]:
tokenizer.pad_token,tokenizer.eos_token,

In [None]:
tokenizer('A',add_special_tokens=True, truncation=True, max_length=1024)['input_ids']

In [None]:
# Load config, tokenizer and a 4-bit quantized causal LM for Qwen2-7B-Instruct.
MODEL = 'Qwen/Qwen2-7B-Instruct'
config = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
# truncation_side='left' makes truncation drop tokens from the start of a sequence.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True, truncation_side = 'left')
# NF4 4-bit quantization with bfloat16 compute; double quantization is
# enabled here, unlike the earlier BitsAndBytesConfig cells.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )
# attn_implementation='eager' selects the plain attention implementation;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(MODEL,
                                             config=config,
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             attn_implementation='eager')

In [None]:
model