In [124]:
import ast

import pandas as pd
from pandas import DataFrame

# Input locations: the old merged game data (JSON lines) and the new
# reference-game data (CSV).
path_data_old: str = '../../data/processed/all_games_merged_old_processed.jsonl'
path_referencegame_new: str = '../../data/training_data/D70005_LIGHT.csv'

reference_game_new: pd.DataFrame = pd.read_csv(filepath_or_buffer=path_referencegame_new)
all_old_data: pd.DataFrame = pd.read_json(path_or_buf=path_data_old, lines=True)

# Taboo rows get their own frame so they can be filtered independently
# of the other games.
df_taboo = all_old_data.loc[all_old_data['game'].eq('taboo')]

In [125]:
# Model pairings (same model on both sides) that are treated as the top three.
top_3_models: list = [
    'gpt-4-0613-t0.0--gpt-4-0613-t0.0',
    'claude-v1.3-t0.0--claude-v1.3-t0.0',
    'gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0',
]

# Games taken from the old data set as they are.
# 'taboo' is intentionally not part of this list; it lives in `df_taboo`.
games_to_consider: list = [
    'imagegame',
    'privateshared',
    'wordle',
    'wordle_withclue',
    'wordle_withcritic',
]

is_considered_game = all_old_data['game'].isin(games_to_consider)
data_old_no_reference_game: pd.DataFrame = all_old_data.loc[is_considered_game]

In [126]:
# Keep only the rows whose `Success` flag equals 1.
data_old_no_reference_game_success = data_old_no_reference_game.loc[
    data_old_no_reference_game['Success'].eq(1)
]

# Taboo rows must additionally belong to one of the top-3 model pairings.
taboo_is_success = df_taboo['Success'].eq(1)
taboo_is_top_model = df_taboo['model'].isin(top_3_models)
data_taboo_success = df_taboo.loc[taboo_is_success & taboo_is_top_model]

In [127]:
# Row counts of the three source frames, shown as one tuple.
tuple(
    len(frame)
    for frame in (data_old_no_reference_game_success, reference_game_new, data_taboo_success)
)

(976, 858, 338)

In [128]:
# Remove the 'Unnamed: 0' column (presumably an index written by an earlier
# to_csv call -- TODO confirm) and the 'text' column.
reference_game_clean = reference_game_new.drop(columns=['Unnamed: 0', 'text'])

In [129]:
# The chat column comes out of the CSV as text and has to be turned back into
# Python objects. `ast.literal_eval` only accepts literals (lists, dicts,
# strings, numbers, ...), so unlike `eval` it cannot execute code that happens
# to be embedded in the data file.
# The isinstance guard leaves already-parsed values alone, which makes this
# cell safe to run more than once.
reference_game_clean['chat'] = reference_game_clean['chat'].map(
    lambda chat: ast.literal_eval(chat) if isinstance(chat, str) else chat
)

In [130]:
# Stack the three sources row-wise. Each frame keeps its own row labels,
# so the resulting index is not renumbered here.
frames_to_merge = [
    data_old_no_reference_game_success,
    reference_game_clean,
    data_taboo_success,
]
merged_data = pd.concat(frames_to_merge, axis=0)

In [131]:
# Number of rows after merging.
merged_data.shape[0]

2172

In [132]:
import copy


def ensure_alternating_roles(messages, cull_system_message: bool = True):
    """
    Return a copy of ``messages`` in which no two neighbouring messages share a role.

    The messages format assumes alternating roles of user and assistant. Whenever
    several consecutive messages carry the same ``role`` they are collapsed into the
    first message of that run: the ``content`` values are joined with two newlines,
    all other keys of the first message are kept unchanged.

    The input is deep-copied first, so neither the passed list nor the message
    dicts inside it are modified. An empty input yields an empty list.

    :param messages: list of message dicts, each providing a ``role`` and a ``content`` key
    :param cull_system_message: if True, a leading ``system`` message with empty content is removed
    :return: a new list of messages with the alternating roles ensured
    """
    delimiter = "\n\n"
    remaining = copy.deepcopy(messages)

    # Only look at the first message when there is one; an empty conversation
    # must not raise.
    if cull_system_message and remaining:
        first = remaining[0]
        if first['role'] == "system" and not first["content"]:
            remaining = remaining[1:]

    # Single pass: either extend the previously kept message or start a new one.
    merged = []
    for message in remaining:
        if merged and merged[-1]["role"] == message["role"]:
            previous = merged[-1]
            previous['content'] = f"{previous['content']}{delimiter}{message['content']}"
        else:
            merged.append(message)
    return merged

In [134]:
def prepare_qa_data(data: DataFrame) -> dict:
    """
    Expand every row of ``data`` into one output row per conversation prefix.

    For all games except ``privateshared`` one output row is produced for each
    message whose ``role`` is ``'assistant'``; its ``chat`` value is the
    conversation up to and including that message.

    For ``privateshared`` the chat items are split by their ``type``: items whose
    type contains ``"message"`` are collected for the whole chat, items whose type
    contains ``"probe"`` are collected per run of consecutive probe items. At the
    end of each probe run (next item is not a probe, or the chat ends) one output
    row is produced whose ``chat`` is all message items seen so far followed by
    the items of that probe run only. Items of any other type are ignored.

    All columns other than ``chat`` are copied unchanged from the source row.

    :param data: frame with at least a ``chat`` column (list of dicts) and a ``game`` column
    :return: dict mapping every column name of ``data`` to a list of values, one
             entry per output row; the result is not a DataFrame and has to be
             converted by the caller (e.g. with ``pd.DataFrame.from_dict``)
    """
    columns = list(data.columns)
    result_data = {column: [] for column in columns}

    def emit(row, chat_prefix):
        # Add one output row: a copy of `row` with the chat replaced by `chat_prefix`.
        for column in columns:
            result_data[column].append(chat_prefix if column == 'chat' else row[column])

    for _, row in data.iterrows():
        chat = row['chat']

        if row['game'] == 'privateshared':
            last_index = len(chat) - 1
            messages = []
            probes = []
            for turn_idx, item in enumerate(chat):
                if "message" in item["type"]:
                    messages.append(item)
                elif "probe" in item["type"]:
                    probes.append(item)
                    # A probe run ends at the last item or when the next item is no probe.
                    ends_probe_run = (
                        turn_idx == last_index
                        or "probe" not in chat[turn_idx + 1]["type"]
                    )
                    if ends_probe_run:
                        emit(row, messages + probes)
                        probes = []  # later samples do not repeat earlier probes
        else:
            for turn_idx, item in enumerate(chat):
                if item['role'] == 'assistant':
                    # For the final message this slice is simply the whole chat.
                    emit(row, chat[:turn_idx + 1])
    return result_data

In [135]:
# Collapse consecutive same-role messages so every chat alternates between roles.
merged_data.chat = merged_data.chat.apply(ensure_alternating_roles)

# Shuffle with a fixed seed so that re-running the notebook from a fresh kernel
# produces the same row order (and therefore the same exported file).
SHUFFLE_SEED: int = 42
merged_data_shuffled = (
    merged_data
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Expand each episode into one row per conversation prefix and turn the
# resulting dict of columns back into a frame.
result_data = prepare_qa_data(merged_data_shuffled)
result_df = pd.DataFrame.from_dict(result_data)

In [136]:
# Row counts before shuffling, after shuffling and after the expansion.
tuple(map(len, (merged_data_shuffled, merged_data, result_df)))

(2172, 2172, 4531)

In [137]:
# Number of rows per game before the expansion.
merged_data_shuffled['game'].value_counts()

game
referencegame        858
wordle_withcritic    395
taboo                338
wordle_withclue      268
imagegame            246
privateshared         41
wordle                26
Name: count, dtype: int64

In [138]:
# Number of rows per game after the expansion.
result_df['game'].value_counts()

game
imagegame            1157
wordle_withcritic    1113
referencegame         858
wordle_withclue       506
taboo                 438
privateshared         336
wordle                123
Name: count, dtype: int64

In [140]:
# Destination of the expanded data set; the frame's index is not written.
save_path: str = '../../data/training_data/D90000_LIGHT.csv'
result_df.to_csv(path_or_buf=save_path, index=False)

In [114]:
# Read the export back from exactly the path it was written to. The file name
# used here before ('D90000_light.csv') differed in case from `save_path`, so
# the read failed on case-sensitive file systems.
d = pd.read_csv(save_path)

In [139]:
# Number of rows in the exported frame.
result_df.shape[0]

4531