In [14]:
import pandas as pd

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoConfig
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, RobertaForMultipleChoice
import argparse
from transformers import get_polynomial_decay_schedule_with_warmup, TrainerCallback
import datasets
from datasets import Dataset

In [9]:
df_train = pd.read_csv("dataset/random_train.csv")

In [None]:
def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return  ' '.join(sentences)

In [None]:
df_train.loc[:, 'prompt'] = df_train['prompt'].apply(process)
df_train.loc[:, 'response_a'] = df_train['response_a'].apply(process)
df_train.loc[:, 'response_b'] = df_train['response_b'].apply(process)

In [None]:
t = df_train.loc[df_train.index % 4 != 0,].reset_index(drop = True)

In [None]:
t.loc[20,:]

In [None]:
def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    return label[-1]
    
df_train = pd.read_csv('dataset/train.csv').reset_index(drop = True)
df_train['label'] = df_train.apply(lambda x: get_label(x), axis = 1)

In [None]:
df_train.loc[df_train.index % 4 != 0,]

In [None]:
df_train.loc[df_train.index % 20 == 0,].reset_index(drop = True).to_csv('dataset/random_valid.csv')
df_train.loc[df_train.index % 20 != 0,].reset_index(drop = True).to_csv('dataset/random_train.csv')

In [None]:
df_train.loc[df_train.index % 20 == 0,]

# For instruction

In [11]:
def get_label(row):
    label = [idx for idx, option in enumerate(['winner_model_a','winner_model_b','winner_tie']) if row[option] == 1]
    if label[-1] == 0:
        return 'A'
    elif label[-1] == 1:
        return 'B'
    else:
        return 'C'
    return label[-1]
    
train = pd.read_csv('dataset/train.csv').reset_index(drop = True)
train['label'] = train.apply(lambda x: get_label(x), axis = 1)

In [12]:
def preprocess(row):
    row['response_a'] = row['response_a'].replace("null",'"null"')
    row['response_b'] = row['response_b'].replace("null",'"null"')
    if row['prompt'][-3:] == ',"]':
        row['prompt'] = row['prompt'][:-3] + ']'
    return row

train= train.apply(lambda x: preprocess(x), axis = 1)

In [13]:
from tqdm import tqdm
output_as = []
output_bs = []
scuess_idx = []
labels = []
ids = []
for idx, row in tqdm(train.iterrows(), total=len(train)):
    try:
        prompt = row["prompt"]
        response_a = row["response_a"]
        response_b = row["response_b"]
        label = row['label']
        id = row['id']
        if prompt.find('","') != -1:
            #不止一个prompt
            #cnt = prompt.count('","')
            prompt = row["prompt"].split('","')
            cnt = len(prompt)
            response_a = row["response_a"].split('","')
            response_b = row["response_b"].split('","')
            for sentence_idx in range(cnt):
                output_a = "###Model A\n"
                output_b = "###Model B\n"
                output_a += "Prompt: " + prompt[sentence_idx][2:].strip() + "\n"
                output_a += "Response: " + response_a[sentence_idx][2:].strip() + "\n\n"
                output_b += "Prompt: " + prompt[sentence_idx][2:].strip() + "\n"
                output_b += "Response: " +response_b[sentence_idx][2:].strip() + "\n\n"
                output_as.append(output_a)
                output_bs.append(output_b)
                scuess_idx.append(idx)
                labels.append(label)
                ids.append(id)
        else:
            output_a = "###Model A\nPrompt: " + row["prompt"][2:-2].strip() + "\nResponse: " + row["response_a"][2:-2].strip() + "\n\n"
            output_b = "###Model B\nPrompt: " + row["prompt"][2:-2].strip() + "\nResponse: " + row["response_b"][2:-2].strip() + "\n\n"
            output_as.append(output_a)
            output_bs.append(output_b)
            scuess_idx.append(idx)
            labels.append(label)
            ids.append(id)
        
    except Exception as e:
        print(f"Error: {idx}")
        print(f"{e}")
        output_a = "###Model A\nPrompt: " + row["prompt"][2:-2].strip() + "\nResponse: " + row["response_a"].strip() + "\n\n"
        output_b = "###Model B\nPrompt: " + row["prompt"][2:-2].strip() + "\nResponse: " + row["response_b"].strip() + "\n\n"
        output_as.append(output_a)
        output_bs.append(output_b)
        scuess_idx.append(idx)
        labels.append(label)
        ids.append(id)

100%|██████████| 57477/57477 [00:03<00:00, 16292.07it/s]


In [6]:
assert len(set(ids)) == len(train)

In [14]:
train = pd.DataFrame({'id': ids, 'instruction_a': output_as, 'instruction_b': output_bs, 'label': labels })

In [15]:
train

Unnamed: 0,id,instruction_a,instruction_b,label
0,30192,###Model A\nPrompt: Is it morally right to try...,###Model B\nPrompt: Is it morally right to try...,A
1,30192,"###Model A\nPrompt: , does pineapple belong on...","###Model B\nPrompt: , does pineapple belong on...",A
2,53567,###Model A\nPrompt: What is the difference bet...,###Model B\nPrompt: What is the difference bet...,B
3,53567,###Model A\nPrompt: w can I get both of them a...,###Model B\nPrompt: w can I get both of them a...,B
4,53567,###Model A\nPrompt: at is the minimal time to ...,###Model B\nPrompt: at is the minimal time to ...,B
...,...,...,...,...
71509,4294710549,###Model A\nPrompt: e deterrents necessary in ...,###Model B\nPrompt: e deterrents necessary in ...,A
71510,4294710549,###Model A\nPrompt: w realistic is it for toda...,###Model B\nPrompt: w realistic is it for toda...,A
71511,4294710549,###Model A\nPrompt: n you explain concisely wh...,###Model B\nPrompt: n you explain concisely wh...,A
71512,4294899228,"###Model A\nPrompt: If a bait contains 0,0025%...","###Model B\nPrompt: If a bait contains 0,0025%...",B


In [16]:
idx = train.id.unique()
valid_idx = [idx[i] for i in range(len(idx)) if i % 20 == 0]

In [17]:
train.loc[train.id.isin(valid_idx),].to_csv('dataset/random_instruction_valid.csv', index = False)
train.loc[~train.id.isin(valid_idx),].to_csv('dataset/random_instruction_train.csv', index = False)

# 每个对话一段文本，做classification

In [11]:
valid = pd.read_csv("dataset/random_instruction_valid.csv")
train = pd.read_csv("dataset/random_instruction_train.csv")

In [None]:
random_multi_valid

In [12]:
str2num = {'A':0, "B":1, "C":2}
train['label'] = train.label.map(str2num)
valid['label'] = valid.label.map(str2num)

In [13]:
train.to_csv('dataset/random_multi_turn_cls_train.csv', index = False)
valid.to_csv('dataset/random_multi_turn_cls_valid.csv', index = False)

# 所有对话拼接一起

In [15]:
valid = pd.read_csv("dataset/random_instruction_valid.csv")
train = pd.read_csv("dataset/random_instruction_train.csv")

In [16]:
%%time
# 定义一个函数来删除###Model A\n
def remove(row):
    row['instruction_a'] = row['instruction_a'].replace('###Model A\n', '')
    row['instruction_b'] = row['instruction_b'].replace('###Model B\n', '')
    return row
# 添加
def add(row):
    row['instruction_a'] = "###Model A\n" + row['instruction_a']
    row['instruction_b'] = '###Model B\n' + row['instruction_b']
    return row
    
def main(data):
    data['instruction_a'] = data['instruction_a'].apply(lambda x: x.replace('###Model A\n', ''))
    data['instruction_b'] = data['instruction_b'].apply(lambda x: x.replace('###Model B\n', ''))
    data = data.groupby('id').agg({'instruction_a': ''.join, 'instruction_b': ''.join, 'label': lambda x: list(x)[0]}).reset_index()
    data['instruction_a'] = "###Model A\n" + data['instruction_a']
    data['instruction_b'] = "###Model B\n" + data['instruction_b']
    data['prompt_response'] = data['instruction_a'] + data['instruction_b']
    return data
train = main(train)
valid = main(valid)

CPU times: user 2.4 s, sys: 507 ms, total: 2.91 s
Wall time: 2.91 s


In [11]:
train.to_csv('dataset/random_all_in_one_train.csv', index = False)
valid.to_csv('dataset/random_all_in_one_valid.csv', index = False)

In [13]:
pd.read_csv('dataset/random_all_in_one_train.csv')

Unnamed: 0,id,instruction_a,instruction_b,label,prompt_response
0,53567,###Model A\nPrompt: What is the difference bet...,###Model B\nPrompt: What is the difference bet...,B,###Model A\nPrompt: What is the difference bet...
1,65089,###Model A\nPrompt: explain function calling. ...,###Model B\nPrompt: explain function calling. ...,C,###Model A\nPrompt: explain function calling. ...
2,96401,###Model A\nPrompt: How can I create a test se...,###Model B\nPrompt: How can I create a test se...,A,###Model A\nPrompt: How can I create a test se...
3,198779,###Model A\nPrompt: What is the best way to tr...,###Model B\nPrompt: What is the best way to tr...,B,###Model A\nPrompt: What is the best way to tr...
4,292873,"###Model A\nPrompt: Construct a rap battle, in...","###Model B\nPrompt: Construct a rap battle, in...",B,"###Model A\nPrompt: Construct a rap battle, in..."
...,...,...,...,...,...
54598,4294656694,###Model A\nPrompt: A simple mnemonic for \u03...,###Model B\nPrompt: A simple mnemonic for \u03...,A,###Model A\nPrompt: A simple mnemonic for \u03...
54599,4294692063,"###Model A\nPrompt: In python, implement a nai...","###Model B\nPrompt: In python, implement a nai...",A,"###Model A\nPrompt: In python, implement a nai..."
54600,4294710549,###Model A\nPrompt: is it unethical to work on...,###Model B\nPrompt: is it unethical to work on...,A,###Model A\nPrompt: is it unethical to work on...
54601,4294899228,"###Model A\nPrompt: If a bait contains 0,0025%...","###Model B\nPrompt: If a bait contains 0,0025%...",B,"###Model A\nPrompt: If a bait contains 0,0025%..."


In [12]:
# classification
str2num = {'A':0, "B":1, "C":2}
train['label'] = train.label.map(str2num)
valid['label'] = valid.label.map(str2num)

train.to_csv('dataset/random_all_in_one_cls_train.csv', index = False)
valid.to_csv('dataset/random_all_in_one_cls_valid.csv', index = False)

In [18]:
all = pd.concat([train, valid]).reset_index(drop = True)

In [21]:
all.to_csv("dataset/full_all_in_one.csv", index = False)