In [1]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset

In [3]:
class args:
    """Settings namespace; values are read as class attributes (e.g. ``args.MAX_INPUT``)."""

    # Value passed to the tokenizer as ``max_length``.
    MAX_INPUT = 1024
    # Path of the CSV file the training frame is read from.
    train_data = './dataset/demo_train.csv'

In [4]:
# Read the training CSV, then renumber the rows 0..n-1 (the old index is discarded).
df_train = pd.read_csv(args.train_data)
df_train = df_train.reset_index(drop=True)
#df_valid = pd.read_csv(args.valid_data).reset_index(drop = True)

In [21]:
# Lookup from option letter to its position: 'A' -> 0 ... 'E' -> 4.
option_to_index = dict(zip('ABCDE', range(5)))

In [22]:
def get_label(row):
    """Return the class index of the outcome flagged in ``row``.

    The index is the position of the flagged column in
    ``('winner_model_a', 'winner_model_b', 'winner_tie')``, i.e. 0, 1 or 2.
    If more than one column equals 1, the last flagged column is returned.

    Args:
        row: Object indexable by the three ``winner_*`` column names
            (for example a DataFrame row passed by ``DataFrame.apply``).

    Returns:
        int: 0, 1 or 2.

    Raises:
        ValueError: If none of the three columns equals 1.
    """
    outcome_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    flagged = [idx for idx, column in enumerate(outcome_columns) if row[column] == 1]
    if not flagged:
        # Without this guard the lookup below is ``[][-1]``, which surfaces as a
        # bare "list index out of range" and hides the actual data problem.
        raise ValueError(f"row has no winner flagged in any of {outcome_columns}")
    return flagged[-1]

In [23]:
# Derive the integer class label for every row (axis=1 hands each row to get_label).
df_train['label'] = df_train.apply(get_label, axis=1)

In [35]:
def preprocess(example):
    """Tokenize one example as two text pairs, one per response.

    For each of ``response_a`` and ``response_b`` the pair is:
      * first text:  ``"[CLS] " + prompt``
      * second text: ``" #### " + prompt + " [SEP] " + response + " [SEP]"``

    The tokenizer is called with ``add_special_tokens=False``,
    ``truncation='longest_first'`` and ``max_length=args.MAX_INPUT``.
    Relies on the module-level names ``tokenizer`` and ``args``.

    Args:
        example: Mapping with the keys 'prompt', 'response_a', 'response_b'
            and 'label'.

    Returns:
        The tokenizer output with an added 'label' entry copied from ``example``.
    """
    prompt = example['prompt']
    first_sentence = ["[CLS] " + prompt for _ in range(2)]
    second_sentences = []
    for option in ('response_a', 'response_b'):
        second_sentences.append(" #### " + prompt + " [SEP] " + example[option] + " [SEP]")
    encoded = tokenizer(
        first_sentence,
        second_sentences,
        truncation='longest_first',
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = example['label']
    return encoded

In [5]:
def preprocess(example):
    """Tokenize one example as a single text holding both responses.

    The text is the concatenation, for ``response_a`` then ``response_b``, of
    ``" #### " + prompt + " [SEP] " + response + " [SEP]"``. It is passed to the
    tokenizer inside a one-element list, with ``truncation=True``,
    ``max_length=args.MAX_INPUT`` and ``add_special_tokens=False``.
    Relies on the module-level names ``tokenizer`` and ``args``.

    Args:
        example: Mapping with the keys 'prompt', 'response_a', 'response_b'
            and 'label'.

    Returns:
        The tokenizer output with an added 'label' entry copied from ``example``.
    """
    prompt = example['prompt']
    segments = [
        " #### " + prompt + " [SEP] " + example[key] + " [SEP]"
        for key in ('response_a', 'response_b')
    ]
    # NOTE(review): the tokenizer receives a list with one string (batch form),
    # not a bare string - confirm this nesting is what downstream code expects.
    sentences = ["".join(segments)]
    encoded = tokenizer(
        sentences,
        truncation=True,
        max_length=args.MAX_INPUT,
        add_special_tokens=False,
    )
    encoded['label'] = example['label']
    return encoded

In [6]:
# Checkpoint name and the tokenizer loaded from it (``preprocess`` reads ``tokenizer``).
MODEL = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Wrap the labelled DataFrame as a Dataset.
dataset = Dataset.from_pandas(df_train)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
# Display the Dataset summary (feature names and row count).
dataset

Dataset({
    features: ['Unnamed: 0', 'id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'label'],
    num_rows: 1001
})

In [7]:
# Run ``preprocess`` over every row and drop the listed raw columns;
# columns not named here are carried through unchanged.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b'],
)

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [13]:
# Display the tokenized Dataset summary to see which columns it contains.
tokenized_dataset

Dataset({
    features: ['Unnamed: 0', 'winner_model_a', 'winner_model_b', 'winner_tie', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1001
})

In [None]:
# Inspect the token ids of the first example (row first, then column).
tokenized_dataset[0]['input_ids']

In [32]:
# Inspect the row whose index label is 40.
df_train.loc[40]

Unnamed: 0                                                       40
id                                                          3258431
model_a                                     stablelm-tuned-alpha-7b
model_b                                                  vicuna-13b
prompt            ["The following is a command that the user is ...
response_a                                                  ["Yes"]
response_b                                                   ["NO"]
winner_model_a                                                    1
winner_model_b                                                    0
winner_tie                                                        0
label                                                             0
Name: 40, dtype: object

In [14]:
# Write the first 1000 rows (positions 0..999) as the demo training split.
# ``iloc`` is end-exclusive; the label slice ``.loc[:1000]`` also returned row
# 1000, giving 1001 rows and sharing that row with the validation split, which
# starts at row 1000.
# ``index=False`` keeps the row index out of the file, so reading it back does
# not create an extra 'Unnamed: 0' column.
df_train.iloc[:1000].to_csv('demo_train.csv', index=False)

In [15]:
# Write rows at positions 1000..1199 as the demo validation split.
# ``iloc`` is end-exclusive, so this is exactly 200 rows; the label slice
# ``.loc[1000:1200]`` also returned row 1200 (201 rows).
# ``index=False`` keeps the row index out of the file, so reading it back does
# not create an extra 'Unnamed: 0' column.
df_train.iloc[1000:1200].to_csv('demo_valid.csv', index=False)

In [2]:
import os

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice, AutoModelForSequenceClassification, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset
from sklearn.metrics import log_loss
import torch.nn as nn
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
    

In [3]:
# Checkpoint identifier and the tokenizer loaded from it.
MODEL = 'meta-llama/llama-3-transformers-8b-hf-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Register "<pad>" as the padding token; the call's return value is the cell output.
tokenizer.add_special_tokens(dict(pad_token="<pad>"))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [7]:
# Show the id at position 0 of the encoding of the literal string "<pad>".
tokenizer("<pad>")['input_ids'][0]

128256

In [8]:
# Membership check: is id 128256 among the ids produced for "<pad>"?
128256 in tokenizer("<pad>")['input_ids']

True

In [4]:
# 4-bit NF4 quantization, bfloat16 compute dtype, double quantization off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

#config = AutoConfig.from_pretrained(args.MODEL)
# Load the checkpoint as a 3-label sequence classifier using the quantization
# settings above; device placement is left to device_map="auto".
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    num_labels=3,
    device_map="auto",
    #config = config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/llama-3-transformers-8b-hf-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# List every named parameter together with its dtype.
for param_name, tensor in model.named_parameters():
    print('Layer: {}, dtype: {}'.format(param_name, tensor.dtype))

Layer: model.embed_tokens.weight, dtype: torch.bfloat16
Layer: model.layers.0.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.up_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.down_proj.weight, dtype: torch.uint8
Layer: model.layers.0.input_layernorm.weight, dtype: torch.bfloat16
Layer: model.layers.0.post_attention_layernorm.weight, dtype: torch.bfloat16
Layer: model.layers.1.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.1.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.1

In [3]:
# 4-bit NF4 quantization with double quantization off.
# NOTE(review): the compute dtype here is float16 while the model below is
# loaded with torch_dtype=bfloat16 - confirm the mismatch is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

#config = AutoConfig.from_pretrained(args.MODEL)
# Load the checkpoint as a 3-label sequence classifier using the quantization
# settings above; device placement is left to device_map="auto".
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    num_labels=3,
    device_map="auto",
    #config = config,
)
# Point the model config at the tokenizer's pad id, then resize the embedding
# table to the tokenizer's length; the resize call's return value is the cell output.
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# config = AutoConfig.from_pretrained(MODEL)
# config.hidden_dropout_prob = args.dropout_rate
# config.attention_probs_dropout_prob = args.dropout_rate
# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS,  # For sequence classification
#     inference_mode=False,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     bias = 'none',
#     target_modules=["q_proj","k_proj","v_proj"]  # Target specific modules
# )
# model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/llama-3-transformers-8b-hf-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(128257, 4096)

In [7]:
# Print name, shape and dtype of every state-dict entry.
# The state dict is built once and iterated with .items(); calling
# model.state_dict() again for each field of each key rebuilt it repeatedly.
for key, tensor in model.state_dict().items():
    print(f"{key}, {tensor.shape}, {tensor.dtype}")

model.embed_tokens.weight, torch.Size([128257, 4096]), torch.bfloat16
model.layers.0.self_attn.q_proj.weight, torch.Size([8388608, 1]), torch.uint8
model.layers.0.self_attn.q_proj.weight.absmax, torch.Size([262144]), torch.float32
model.layers.0.self_attn.q_proj.weight.quant_map, torch.Size([16]), torch.float32
model.layers.0.self_attn.q_proj.weight.quant_state.bitsandbytes__nf4, torch.Size([82]), torch.uint8
model.layers.0.self_attn.k_proj.weight, torch.Size([2097152, 1]), torch.uint8
model.layers.0.self_attn.k_proj.weight.absmax, torch.Size([65536]), torch.float32
model.layers.0.self_attn.k_proj.weight.quant_map, torch.Size([16]), torch.float32
model.layers.0.self_attn.k_proj.weight.quant_state.bitsandbytes__nf4, torch.Size([82]), torch.uint8
model.layers.0.self_attn.v_proj.weight, torch.Size([2097152, 1]), torch.uint8
model.layers.0.self_attn.v_proj.weight.absmax, torch.Size([65536]), torch.float32
model.layers.0.self_attn.v_proj.weight.quant_map, torch.Size([16]), torch.float32
mod

In [4]:
# List every named parameter together with its dtype.
for layer_name, weight in model.named_parameters():
    print(f'Layer: {layer_name}, dtype: {weight.dtype}')

Layer: model.embed_tokens.weight, dtype: torch.bfloat16
Layer: model.layers.0.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.up_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.down_proj.weight, dtype: torch.uint8
Layer: model.layers.0.input_layernorm.weight, dtype: torch.bfloat16
Layer: model.layers.0.post_attention_layernorm.weight, dtype: torch.bfloat16
Layer: model.layers.1.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.1.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.1

In [33]:
# List every named parameter together with its dtype.
for entry in model.named_parameters():
    label, weights = entry
    print(f'Layer: {label}, dtype: {weights.dtype}')

Layer: model.embed_tokens.weight, dtype: torch.float16
Layer: model.layers.0.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.0.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.up_proj.weight, dtype: torch.uint8
Layer: model.layers.0.mlp.down_proj.weight, dtype: torch.uint8
Layer: model.layers.0.input_layernorm.weight, dtype: torch.float16
Layer: model.layers.0.post_attention_layernorm.weight, dtype: torch.float16
Layer: model.layers.1.self_attn.q_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.k_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.v_proj.weight, dtype: torch.uint8
Layer: model.layers.1.self_attn.o_proj.weight, dtype: torch.uint8
Layer: model.layers.1.mlp.gate_proj.weight, dtype: torch.uint8
Layer: model.layers.1.ml

In [None]:
# Collect the dtype of every model parameter, in iteration order.
[weight.dtype for weight in model.parameters()]