In [1]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset

In [3]:
df_train = pd.read_csv("dataset/random_train.csv")


In [5]:
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return  ' '.join(sentences)

In [6]:
df_train.loc[:, 'prompt'] = df_train['prompt'].apply(process)
df_train.loc[:, 'response_a'] = df_train['response_a'].apply(process)
df_train.loc[:, 'response_b'] = df_train['response_b'].apply(process)

In [12]:
t = df_train.loc[df_train.index % 4 != 0,].reset_index(drop = True)

In [13]:
t.loc[20,:]

Unnamed: 0                                                       27
id                                                          2051408
model_a                                                openchat-3.5
model_b                                                 chatglm2-6b
prompt              What\u2019s the best prank to play on a friend?
response_a        There are many types of pranks that can be pla...
response_b        It's important to remember that pranks can be ...
winner_model_a                                                    0
winner_model_b                                                    0
winner_tie                                                        1
label                                                             2
Name: 20, dtype: object

In [4]:
def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    return label[-1]
    
df_train = pd.read_csv('dataset/train.csv').reset_index(drop = True)
df_train['label'] = df_train.apply(lambda x: get_label(x), axis = 1)

In [8]:
df_train.loc[df_train.index % 4 != 0,]

Unnamed: 0.1,Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,label
1,1,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,2
2,2,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,0
3,3,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,1
5,5,313413,mixtral-8x7b-instruct-v0.1,vicuna-13b,Why water is not used in bath tub?,Water is actually used in a bath tub. A bath t...,Water is not used in a bath tub because it is ...,1,0,0,0
6,6,370945,gemini-pro,claude-2.0,"\""Bacteria is life on Mars but a heartbeat isn...",Dune,This quote seems to be referencing the debate ...,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
51722,51722,4294452769,gpt-4-1106-preview,openchat-3.5,Old McDonald harvested 4 pumpkins this year. H...,Old McDonald originally harvested 4 pumpkins. ...,Old McDonald had 4 pumpkins initially and now ...,0,0,1,2
51723,51723,4294633312,pplx-7b-online,gpt-4-0613,abc,It seems like you're referring to the Californ...,"Hello! It seems like you've typed \""abc.\"" How...",1,0,0,0
51725,51725,4294692063,claude-2.0,llama-2-13b-chat,"In python, implement a naive Bayes with gaussi...",Here is an implementation of a naive Bayes cla...,Sure! Here's an implementation of a naive Baye...,1,0,0,0
51726,51726,4294710549,claude-1,alpaca-13b,is it unethical to work on building weapons? a...,Working on weapons technology raises some ethi...,It depends on the context. Weapons can be used...,1,0,0,0


In [6]:
df_train.loc[df_train.index % 20 == 0,].reset_index(drop = True).to_csv('dataset/random_valid.csv')
df_train.loc[df_train.index % 20 != 0,].reset_index(drop = True).to_csv('dataset/random_train.csv')

In [5]:
df_train

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,label
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0,1
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1,2
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[""A simple mnemonic for \u03c0:\n\""How I wish ...","[""Sure, let's break it down:\n\n1. \""How\"" has...","[""Here is how that mnemonic represents the dig...",1,0,0,0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[""In python, implement a naive Bayes with gaus...","[""Here is an implementation of a naive Bayes c...","[""Sure! Here's an implementation of a naive Ba...",1,0,0,0
57474,4294710549,claude-1,alpaca-13b,"[""is it unethical to work on building weapons?...","[""Working on weapons technology raises some et...","[""It depends on the context. Weapons can be us...",1,0,0,0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[""If a bait contains 0,0025% bromadiolon then ...","[""Bromadiolone is a rodenticide which is most ...","[""As an AI language model, I do not promote or...",0,1,0,1
