In [1]:
!cp /kaggle/input/lmsys-utils/utils.py /kaggle/working/utils.py
!pip install -q vllm==0.5.1 --no-index --find-links ../input/lib-vllm051
!pip install -q transformers==4.42.4 --no-index --find-links /kaggle/input/lib-transformers4424
!pip install -q peft==0.11.1 --no-index --find-links /kaggle/input/lib-peft0111
!pip install -q bitsandbytes==0.43.1 --no-index --find-links /kaggle/input/lib-bitsandbytes0431

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.6.1 requires cubinlinker, which is not installed.
cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.6.1 requires ptxcompiler, which is not installed.
cuml 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.6.1 requires cupy-cuda11x>=12.0.0, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.
cudf 24.6.1 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.5.0 which is incompatible.
jupy

In [2]:
import time
from glob import glob
from dataclasses import dataclass

import torch
import sklearn
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, LlamaForSequenceClassification, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from transformers import (
    BitsAndBytesConfig,
    AutoModelForCausalLM,
    AutoTokenizer,

    LlamaPreTrainedModel,
    LlamaModel,
    LlamaForSequenceClassification,

    Gemma2PreTrainedModel,
    Gemma2Model,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,

    PreTrainedTokenizerFast,
    PreTrainedTokenizerBase, 
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    DataCollatorForSeq2Seq,
)
from peft import PeftModel



from utils import get_now_time_fullstring, seed_everything, current_date_time
from utils import process, compute_metrics

import torch
import transformers
import peft
import bitsandbytes

# Log the versions of the libraries this run actually picked up.
for _lib in (torch, transformers, peft, bitsandbytes):
    print(f"{_lib.__name__}.__version__ = {_lib.__version__!r}")

# The notebook places one model replica on each of two GPUs.
assert torch.cuda.device_count() == 2, "Sorry - multi-GPU required!"
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_flash_sdp(True)  # Doesn't have any effect as Flash Attention does not support T4/P100

2024-08-18 02:25:22.090371: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-18 02:25:22.090508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-18 02:25:22.240714: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


torch.__version__ = '2.3.0+cu121'
transformers.__version__ = '4.42.4'
peft.__version__ = '0.11.1'
bitsandbytes.__version__ = '0.43.3'


In [3]:
# ---- Configuration -------------------------------------------------------
weights_path = '/kaggle/input/gemma9b-ru'
MAX_LEN = 2048+1024
PADDING_SIDE = "right"
TRUNCATION_SIDE = "right"

use_softcapping = False

seed_everything(42)


# ---- Test data -----------------------------------------------------------
# `process` comes from utils.py; the tokenizer below iterates over its result
# round by round, so it presumably yields a list of per-round strings (confirm in utils).
df = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
for _text_column in ('prompt', 'response_a', 'response_b'):
    df[_text_column] = df[_text_column].apply(process)
df['id'] = df['id'].astype(str)

# Test-time-augmentation copy: identical rows with the two responses swapped.
df_tta = df.assign(response_a=df["response_b"], response_b=df["response_a"])


# ---- Tokenizer -----------------------------------------------------------
tokenizer = GemmaTokenizerFast.from_pretrained(weights_path)
tokenizer.add_eos_token = True
tokenizer.padding_side = PADDING_SIDE
if TRUNCATION_SIDE == "left":
    tokenizer.truncation_side = "left"
# tokenizer.pad_token = tokenizer.eos_token


    
def tokenize_cls_p3(example, tokenizer, max_length, is_train):
    """Build classification inputs from batched multi-round conversations.

    Each sample is laid out as::

        <bos>## Round 1:\\n### Prompt:...### Response A:...### Response B:...
        [further rounds] <final question><eos>

    Rounds are appended while the running total stays within ``max_length``.
    The first round that does not fit is either clipped (prompt gets 20% of
    the remaining budget, each response 40%, clipped parts are followed by
    ``"......"``) or dropped entirely when fewer than 80 tokens remain; no
    later round is considered after that.

    Args:
        example: Mapping with batched columns ``prompt``, ``response_a`` and
            ``response_b`` (each entry is an iterable of per-round strings).
            When ``is_train`` is true it must also hold ``winner_model_a``,
            ``winner_model_b`` and ``winner_tie``.
        tokenizer: Callable tokenizer exposing ``bos_token_id`` and
            ``eos_token_id``; called with ``add_special_tokens=False``.
        max_length: Token budget for one sample, specials included.
        is_train: Whether to attach ``labels`` (0 = A wins, 1 = B wins, 2 = tie).

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` (lists of lists), plus
        ``labels`` when ``is_train`` is true.
    """

    def encode(text):
        # Raw token ids only; bos/eos are placed manually below.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    ellipsis_ids = encode("......")
    question_ids = encode("\n\n---\nWhich response is better? [A or B or tie]\nAnswer: ")

    def clip(token_ids, limit):
        # Keep the first `limit` ids and mark the cut with the ellipsis.
        if len(token_ids) > limit:
            return token_ids[:limit] + ellipsis_ids
        return token_ids

    # Tokens present in every sample: bos + eos + the closing question.
    reserved = 2 + len(question_ids)

    input_ids = []
    conversations = zip(example['prompt'], example['response_a'], example['response_b'])
    for prompts, answers_a, answers_b in conversations:
        sequence = [tokenizer.bos_token_id]
        used = reserved

        for round_no, (prompt, answer_a, answer_b) in enumerate(zip(prompts, answers_a, answers_b)):
            header_ids = encode(f'\n\n## Round {round_no+1}:' if round_no else f'## Round {round_no+1}:')
            prompt_ids = encode(f'\n### Prompt:\n{prompt}')
            a_ids = encode(f'\n\n### Response A:\n{answer_a}')
            b_ids = encode(f'\n\n### Response B:\n{answer_b}')
            needed = used + len(header_ids) + len(prompt_ids) + len(a_ids) + len(b_ids)

            if needed <= max_length:
                # Whole round fits: take it and move on.
                sequence.extend(header_ids + prompt_ids + a_ids + b_ids)
                used = needed
                continue

            # Round overflows: clip it if the leftover budget is worthwhile,
            # then stop regardless.
            budget = max_length - used - len(header_ids) - 3 * len(ellipsis_ids)
            if budget >= 80:
                sequence.extend(header_ids)
                sequence.extend(clip(prompt_ids, int(budget * 0.2)))
                sequence.extend(clip(a_ids, int(budget * 0.4)))
                sequence.extend(clip(b_ids, int(budget * 0.4)))
            break

        sequence.extend(question_ids)
        sequence.append(tokenizer.eos_token_id)
        input_ids.append(sequence)

    features = {
        "input_ids": input_ids,
        "attention_mask": [[1] * len(sequence) for sequence in input_ids],
    }
    if is_train:
        outcomes = zip(example['winner_model_a'], example['winner_model_b'], example['winner_tie'])
        features["labels"] = [0 if a_win else 1 if b_win else 2 for a_win, b_win, tie in outcomes]
    return features
    

    
    

def infer_process_batch(df, tokenizer, max_length, batch_size):
    """Tokenize a whole DataFrame for inference, ``batch_size`` rows at a time.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b`` columns.
        tokenizer: Tokenizer forwarded to ``tokenize_cls_p3``.
        max_length: Per-sample token budget forwarded to ``tokenize_cls_p3``.
        batch_size: Number of rows handed to ``tokenize_cls_p3`` per call.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` lists, one entry per
        row of ``df`` in row order.
    """
    text_columns = ('prompt', 'response_a', 'response_b')
    collected_ids, collected_masks = [], []

    for offset in range(0, len(df), batch_size):
        chunk = df.iloc[offset:offset + batch_size]
        columns = {name: chunk[name].tolist() for name in text_columns}
        encoded = tokenize_cls_p3(columns, tokenizer, max_length, is_train=False)
        collected_ids += encoded["input_ids"]
        collected_masks += encoded["attention_mask"]

    return {"input_ids": collected_ids, "attention_mask": collected_masks}
    
    
    
# Tokenize the original rows and their swapped-response (TTA) counterpart.
tokenized_results, tokenized_results_tta = (
    infer_process_batch(df=_frame, tokenizer=tokenizer, max_length=MAX_LEN, batch_size=100)
    for _frame in (df, df_tta)
)


def _to_inference_frame(ids, tokenized):
    """Combine ids with their tokenized features and a per-row token count."""
    frame = pd.DataFrame(
        {
            "id": ids,
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
        }
    )
    # Token count per row; used later to sort batches by length.
    frame["length"] = frame["input_ids"].map(len)
    return frame


data = _to_inference_frame(df["id"], tokenized_results)
data_tta = _to_inference_frame(df_tta["id"], tokenized_results_tta)

In [4]:
# 8-bit weight quantization. `bnb_8bit_compute_dtype` / `bnb_8bit_use_double_quant`
# are not BitsAndBytesConfig options (compute dtype and double quantization exist
# only as `bnb_4bit_*` settings); transformers ignored them with an
# "Unused kwargs" warning, so they are dropped here.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)


def _load_classifier(device_map):
    """Load the 3-class Gemma2 classifier from `weights_path` onto `device_map`.

    Args:
        device_map: Device placement passed to `from_pretrained`, e.g. 'cuda:0'.

    Returns:
        The quantized model with pad token id, cache and soft-capping options
        configured for this notebook.
    """
    model = Gemma2ForSequenceClassification.from_pretrained(
        weights_path,
        num_labels=3,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map=device_map)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False
    if not use_softcapping:
        model.config.attn_logit_softcapping = None
    return model


# Load base model on GPU 0
device_0 = torch.device('cuda:0')
base_model_0 = _load_classifier('cuda:0')

# Load base model on GPU 1
device_1 = torch.device('cuda:1')
base_model_1 = _load_classifier('cuda:1')


# No adapter is attached here; the loaded models are used directly.
model_0 = base_model_0
model_1 = base_model_1

Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=2, max_length=MAX_LEN):
    """Run the classifier over pre-tokenized rows and attach class probabilities.

    Rows are consumed in their current order, ``batch_size`` at a time; each
    batch is padded to its longest sequence using the module-level
    ``tokenizer`` and moved to ``device``.

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns.
            It is modified in place.
        model: Callable model whose output exposes ``.logits`` with 3 classes.
        device: Torch device the padded batch is moved to.
        batch_size: Number of rows per forward pass.
        max_length: Not used by this function.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns holding softmax probabilities.
    """
    per_row_proba = []

    for offset in range(0, len(df), batch_size):
        rows = df.iloc[offset:offset + batch_size]
        features = {
            "input_ids": rows["input_ids"].to_list(),
            "attention_mask": rows["attention_mask"].to_list(),
        }
        batch = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**batch.to(device)).logits
        per_row_proba.extend(logits.softmax(-1).cpu().tolist())

    # Column order of the logits: 0 = A wins, 1 = B wins, 2 = tie.
    df["winner_model_a"] = [row[0] for row in per_row_proba]
    df["winner_model_b"] = [row[1] for row in per_row_proba]
    df["winner_tie"] = [row[2] for row in per_row_proba]

    return df

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Sort by input length (longest first) to fully leverage dynamic padding:
# rows of similar length share a batch, so little padding is added.
data     = data.sort_values("length", ascending=False)
data_tta = data_tta.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
sub_1 = data.copy()
sub_2 = data_tta.copy()

# One thread per GPU: original order on model_0, swapped responses on model_1.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

# Materializing the iterator re-raises any exception from the worker threads.
results_list = list(results)
result_df = results_list[0].sort_values("id", ascending=True).reset_index(drop=True)
result_df_tta = results_list[1].sort_values("id", ascending=True).reset_index(drop=True)

# The two runs are averaged row by row below, so they must be aligned on id.
assert result_df["id"].equals(result_df_tta["id"]), "TTA rows are not aligned with the original rows"

proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values
proba_tta = result_df_tta[["winner_model_a", "winner_model_b", "winner_tie"]].values

# Take an explicit copy: assigning into a column slice of `result_df` raises
# SettingWithCopyWarning and is not guaranteed to behave consistently.
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']].copy()
# In the TTA run the responses were swapped, so its "A wins" column is
# evidence for B and vice versa; the tie column is unaffected.
submission_df["winner_model_a"] = (proba[:, 0] + proba_tta[:, 1]) / 2
submission_df["winner_model_b"] = (proba[:, 1] + proba_tta[:, 0]) / 2
submission_df["winner_tie"]     = (proba[:, 2] + proba_tta[:, 2]) / 2
submission_df['id'] = submission_df['id'].astype(str)

In [7]:
# Re-read the raw test file: the rules below match on the unprocessed response
# strings, whereas `df` had `process` applied to those columns.
df2 = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
df2['id'] = df2['id'].astype(str)

# Raw response strings that are treated as "no answer given".
_EMPTY_RESPONSES = ['[null]', '[]', '[ ]', '[  ]', '[""]', '["",""]']
_PROBA_COLUMNS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# Rule 1: response A is empty -> strongly favour B.
a_null_df = df2[df2["response_a"].isin(_EMPTY_RESPONSES)]
a_null_id_list = a_null_df["id"].tolist()
submission_df.loc[submission_df['id'].isin(a_null_id_list), _PROBA_COLUMNS] = [0.04, 0.88, 0.08]


# Rule 2: response B is empty -> strongly favour A.
b_null_df = df2[df2["response_b"].isin(_EMPTY_RESPONSES)]
b_null_id_list = b_null_df["id"].tolist()
submission_df.loc[submission_df['id'].isin(b_null_id_list), _PROBA_COLUMNS] = [0.88, 0.04, 0.08]


# Rule 3 (applied last, so it wins over rules 1 and 2): identical responses -> tie.
same_a_b_df2 = df2[df2["response_a"] == df2["response_b"]]
same_a_b_id_list = same_a_b_df2["id"].tolist()
submission_df.loc[submission_df['id'].isin(same_a_b_id_list), _PROBA_COLUMNS] = [0.06, 0.06, 0.88]


submission_df.to_csv('submission.csv', index=False)
!rm -rf __pycache__

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
