In [32]:
!pip install kaggle

In [None]:
!kaggle competitions download -c lmsys-chatbot-arena

In [31]:
import pandas as pd
import numpy as np
import ast
import re
from typing import List

In [None]:
# Adjust these paths to wherever the competition CSVs live in your environment.
# `encoding_errors='ignore'` makes pandas drop bytes it cannot decode instead of
# raising, so malformed characters disappear silently from the loaded text.
train_df = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv', encoding_errors='ignore')
test_df = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv', encoding_errors='ignore')

In [33]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForSequenceClassification
from functools import partial
import torch

MODEL_CKPT = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
# MODEL_CKPT = "meta-llama/Meta-Llama-3-8B-Instruct"

In [35]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification
from functools import partial
import torch

# Checkpoint whose tokenizer is loaded below; `tokenize` relies on this
# tokenizer's chat template to turn conversations into token ids.
MODEL_CKPT = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
# `use_fast=True` requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT, use_fast=True)

def create_chat(human_chat: List[str],
                ai_chat: List[str]) -> List[dict[str, str]]:
    """Interleave user and assistant turns into a flat chat message list.

    Turns are paired positionally with ``zip``, so pairing stops at the
    shorter of the two inputs.

    Args:
        human_chat: User messages in conversation order.
        ai_chat: Assistant replies in conversation order.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts alternating between
        the "user" and "assistant" roles.
    """
    return [
        {"role": speaker, "content": text}
        for turn in zip(human_chat, ai_chat)
        for speaker, text in zip(("user", "assistant"), turn)
    ]

def tokenize(row, model_response: str = 'response_a'):
    """Render one conversation with the chat template and return its token ids.

    Args:
        row: Record exposing a 'prompt' entry and the entry named by
            ``model_response`` (both are passed to ``create_chat``).
        model_response: Name of the response column paired with the prompts.

    Returns:
        The token ids of the templated conversation as a plain Python list.
    """
    conversation = create_chat(row['prompt'], row[model_response])
    # The template call is asked for a PyTorch tensor; its first row is taken
    # and converted to a Python list before being returned.
    encoded = tokenizer.apply_chat_template(conversation, truncation=True, return_tensors='pt')
    return encoded[0].tolist()

def label_numerical_encoding(row):
    """Collapse the one-hot winner columns of ``row`` into a single class id.

    The columns are inspected in the order 'winner_model_a', 'winner_model_b',
    'winner_tie' and the first one equal to 1 decides the result (0, 1 or 2
    respectively). ``None`` is returned when none of them equals 1.
    """
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    for class_id, column in enumerate(winner_columns):
        if row[column] == 1:
            return class_id
    return None

# Quote bare `null` list items so the stringified list can be parsed afterwards
def replace_null(input_string):
    """Return ``input_string`` with bare ``null`` list items turned into ``"null"``.

    A ``null`` is rewritten only when the character immediately before it is
    ``,`` or ``[`` and the character immediately after it is ``,`` or ``]``.
    The check is purely textual, so a ``null`` preceded by a space is left alone.
    """
    bare_null = re.compile(r'(?<=,|\[)null(?=,|\])')
    return bare_null.sub('"null"', input_string)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Work on a copy so the frame loaded from CSV is left untouched and this cell
# can be re-run from the same starting point.
train_df_cop = train_df.copy()

# Each text column holds a stringified list: quote bare `null` items first, then
# parse the string into a real Python list. Both steps only look at the column's
# own values, so they run column-wise instead of materialising every row.
for text_col in ('prompt', 'response_a', 'response_b'):
    train_df_cop[text_col] = (
        train_df_cop[text_col]
        .apply(replace_null)
        .apply(ast.literal_eval)
    )

# tokenize before-hand to avoid conflicts during transition from dataframe to huggingface's dataset
# (row-wise on purpose: `tokenize` reads the prompt column and a response column together)
train_df_cop['input_ids_a'] = train_df_cop.apply(partial(tokenize, model_response='response_a'), axis=1)
train_df_cop['input_ids_b'] = train_df_cop.apply(partial(tokenize, model_response='response_b'), axis=1)

# obtain numerical labels (row-wise: the label is derived from three winner columns)
train_df_cop['label'] = train_df_cop.apply(label_numerical_encoding, axis=1)

In [14]:
import pandas as pd
# Load the already-preprocessed frame from a pickle file.
# NOTE(review): presumably this is the output of the preprocessing cell above,
# saved in a separate step that is not shown here — confirm.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
data = pd.read_pickle('/kaggle/input/lmsys-preprocessed-data/preprocessed_data.pkl')

In [15]:
# Upper bound on the number of token ids kept per sequence; passed to `truncate`.
MAX_LENGTH = 1536

In [16]:
# Default token-id sequence searched for by `find_token_for_gating`.
# NOTE(review): presumably the ids of the "end of turn + assistant header"
# marker in the tokenizer's chat template — confirm against the vocabulary.
token_pattern = [128009, 128006, 78191, 128007, 271]


def find_token_for_gating(lst, pattern=None):
    """Find the last occurrence of a token pattern in a sequence of ids.

    Args:
        lst: Sequence of token ids to search. Non-list sequences are copied
            into a list once so that slice comparison works.
        pattern: Sequence of ids to look for. Defaults to the module-level
            ``token_pattern`` (looked up at call time).

    Returns:
        The start index of the last occurrence of ``pattern`` in ``lst``,
        or -1 when the pattern does not occur (including when ``lst`` is
        shorter than the pattern).
    """
    if pattern is None:
        pattern = token_pattern
    pattern = list(pattern)
    if not isinstance(lst, list):
        lst = list(lst)

    pattern_len = len(pattern)
    # Walk candidate start positions from the end so the first hit is the last occurrence.
    for start in range(len(lst) - pattern_len, -1, -1):
        if lst[start:start + pattern_len] == pattern:
            return start
    return -1

In [17]:
def truncate(df, max_length):
    """Cap the token-id lists in 'input_ids_a' and 'input_ids_b' at ``max_length`` items.

    Every list keeps its first ``max_length`` elements; lists that are already
    short enough are left as they are.

    Args:
        df: Frame holding the 'input_ids_a' and 'input_ids_b' columns. Both
            columns are overwritten in place.
        max_length: Maximum number of ids to keep per list.

    Returns:
        The same ``df`` object, with both columns truncated.
    """
    for col in ('input_ids_a', 'input_ids_b'):
        # Slicing already clamps to the sequence length, so no explicit min() is
        # needed. Applying on the column (not row-wise on the frame) always
        # yields a Series, including when the frame has no rows.
        df[col] = df[col].apply(lambda ids: ids[:max_length])
    return df

# Cap both token-id columns at MAX_LENGTH entries.
data = truncate(data, MAX_LENGTH)

In [22]:
# Find the rows whose `input_ids_a` no longer contain the gating token pattern
# (find_token_for_gating returns a negative value for them).
# Index *labels* are collected rather than positions, because `exp` is later
# passed to `data.drop(index=...)`, which matches on labels; the two only
# coincide when the index is a default 0..n-1 range.
missing_pattern = data['input_ids_a'].apply(find_token_for_gating) < 0
exp = data.index[missing_pattern].tolist()

In [19]:
# Remove the rows listed in `exp` from the training data.
# `drop(index=...)` matches on index labels and returns a new frame.
data = data.drop(index=exp)

In [23]:
exp

[]

In [21]:
data

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,input_ids_a,input_ids_b,label
0,30192,gpt-4-1106-preview,gpt-4-0613,[Is it morally right to try to have a certain ...,[The question of whether it is morally right t...,"[As an AI, I don't have personal beliefs or op...",1,0,0,"[128006, 882, 128007, 271, 3957, 433, 57323, 1...","[128006, 882, 128007, 271, 3957, 433, 57323, 1...",0
1,53567,koala-13b,gpt-4-0613,[What is the difference between marriage licen...,[A marriage license is a legal document that a...,[A marriage license and a marriage certificate...,0,1,0,"[128006, 882, 128007, 271, 3923, 374, 279, 681...","[128006, 882, 128007, 271, 3923, 374, 279, 681...",1
2,65089,gpt-3.5-turbo-0613,mistral-medium,[explain function calling. how would you call ...,[Function calling is the process of invoking o...,[Function calling is the process of invoking a...,0,0,1,"[128006, 882, 128007, 271, 95444, 734, 8260, 1...","[128006, 882, 128007, 271, 95444, 734, 8260, 1...",2
3,96401,llama-2-13b-chat,mistral-7b-instruct,[How can I create a test set for a very rare c...,[Creating a test set for a very rare category ...,[When building a classifier for a very rare ca...,1,0,0,"[128006, 882, 128007, 271, 4438, 649, 358, 189...","[128006, 882, 128007, 271, 4438, 649, 358, 189...",0
4,198779,koala-13b,gpt-3.5-turbo-0314,[What is the best way to travel from Tel-Aviv ...,[The best way to travel from Tel Aviv to Jerus...,[The best way to travel from Tel-Aviv to Jerus...,0,1,0,"[128006, 882, 128007, 271, 3923, 374, 279, 188...","[128006, 882, 128007, 271, 3923, 374, 279, 188...",1
...,...,...,...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[A simple mnemonic for π:\n""How I wish I could...","[Sure, let's break it down:\n\n1. ""How"" has 3 ...",[Here is how that mnemonic represents the digi...,1,0,0,"[128006, 882, 128007, 271, 32, 4382, 87901, 36...","[128006, 882, 128007, 271, 32, 4382, 87901, 36...",0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[In python, implement a naive Bayes with gauss...",[Here is an implementation of a naive Bayes cl...,[Sure! Here's an implementation of a naive Bay...,1,0,0,"[128006, 882, 128007, 271, 644, 10344, 11, 430...","[128006, 882, 128007, 271, 644, 10344, 11, 430...",0
57474,4294710549,claude-1,alpaca-13b,"[is it unethical to work on building weapons?,...",[Working on weapons technology raises some eth...,[It depends on the context. Weapons can be use...,1,0,0,"[128006, 882, 128007, 271, 285, 433, 89735, 31...","[128006, 882, 128007, 271, 285, 433, 89735, 31...",0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[If a bait contains 0,0025% bromadiolon then h...",[Bromadiolone is a rodenticide which is most o...,"[As an AI language model, I do not promote or ...",0,1,0,"[128006, 882, 128007, 271, 2746, 264, 49160, 5...","[128006, 882, 128007, 271, 2746, 264, 49160, 5...",1


In [11]:
data

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,input_ids_a,input_ids_b,label
0,30192,gpt-4-1106-preview,gpt-4-0613,[Is it morally right to try to have a certain ...,[The question of whether it is morally right t...,"[As an AI, I don't have personal beliefs or op...",1,0,0,"[128006, 882, 128007, 271, 3957, 433, 57323, 1...","[128006, 882, 128007, 271, 3957, 433, 57323, 1...",0
1,53567,koala-13b,gpt-4-0613,[What is the difference between marriage licen...,[A marriage license is a legal document that a...,[A marriage license and a marriage certificate...,0,1,0,"[128006, 882, 128007, 271, 3923, 374, 279, 681...","[128006, 882, 128007, 271, 3923, 374, 279, 681...",1
2,65089,gpt-3.5-turbo-0613,mistral-medium,[explain function calling. how would you call ...,[Function calling is the process of invoking o...,[Function calling is the process of invoking a...,0,0,1,"[128006, 882, 128007, 271, 95444, 734, 8260, 1...","[128006, 882, 128007, 271, 95444, 734, 8260, 1...",2
3,96401,llama-2-13b-chat,mistral-7b-instruct,[How can I create a test set for a very rare c...,[Creating a test set for a very rare category ...,[When building a classifier for a very rare ca...,1,0,0,"[128006, 882, 128007, 271, 4438, 649, 358, 189...","[128006, 882, 128007, 271, 4438, 649, 358, 189...",0
4,198779,koala-13b,gpt-3.5-turbo-0314,[What is the best way to travel from Tel-Aviv ...,[The best way to travel from Tel Aviv to Jerus...,[The best way to travel from Tel-Aviv to Jerus...,0,1,0,"[128006, 882, 128007, 271, 3923, 374, 279, 188...","[128006, 882, 128007, 271, 3923, 374, 279, 188...",1
...,...,...,...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[A simple mnemonic for π:\n""How I wish I could...","[Sure, let's break it down:\n\n1. ""How"" has 3 ...",[Here is how that mnemonic represents the digi...,1,0,0,"[128006, 882, 128007, 271, 32, 4382, 87901, 36...","[128006, 882, 128007, 271, 32, 4382, 87901, 36...",0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[In python, implement a naive Bayes with gauss...",[Here is an implementation of a naive Bayes cl...,[Sure! Here's an implementation of a naive Bay...,1,0,0,"[128006, 882, 128007, 271, 644, 10344, 11, 430...","[128006, 882, 128007, 271, 644, 10344, 11, 430...",0
57474,4294710549,claude-1,alpaca-13b,"[is it unethical to work on building weapons?,...",[Working on weapons technology raises some eth...,[It depends on the context. Weapons can be use...,1,0,0,"[128006, 882, 128007, 271, 285, 433, 89735, 31...","[128006, 882, 128007, 271, 285, 433, 89735, 31...",0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[If a bait contains 0,0025% bromadiolon then h...",[Bromadiolone is a rodenticide which is most o...,"[As an AI language model, I do not promote or ...",0,1,0,"[128006, 882, 128007, 271, 2746, 264, 49160, 5...","[128006, 882, 128007, 271, 2746, 264, 49160, 5...",1


In [17]:
# Persist the truncated and filtered frame to the working directory for later runs.
data.to_pickle('preprocessed_data_v2.pkl')