In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import copy
#from src.utils.utils import ensure_alternating_roles

# JSON-lines file read below; every line of the file becomes one row of `data`.
path: str = '../../data/processed/privateshared_old_processed.jsonl'
data = pd.read_json(path, lines=True)

In [2]:
def ensure_alternating_roles(messages, cull_system_message: bool = True):
    """
    Return a copy of ``messages`` in which no two neighbouring entries share a role.

    The messages format assumes alternating roles of user and assistant. Whenever two or
    more consecutive messages carry the same ``role`` they are collapsed into the first
    one of the run: their ``content`` values are joined with a blank line (``"\\n\\n"``),
    all other keys of the first message of the run are kept as they are.

    The input is deep-copied first, so neither the passed list nor its message dicts are
    modified. An empty list is returned unchanged (as a new empty list).

    :param messages: list of message dicts, each providing a ``role`` and a ``content`` key
    :param cull_system_message: if True, a leading message with role ``"system"`` and
        empty (falsy) content is dropped before merging
    :return: a new messages list with the alternating roles ensured
    """
    _messages = copy.deepcopy(messages)

    # Check for emptiness before peeking at the first entry; an empty conversation
    # must not raise an IndexError.
    if cull_system_message and _messages:
        first = _messages[0]
        if first['role'] == "system" and not first["content"]:
            del _messages[0]

    delimiter = "\n\n"

    # Single pass: either fold the message into the previously kept one (same role)
    # or keep it as the start of a new run.
    merged = []
    for message in _messages:
        if merged and merged[-1]["role"] == message["role"]:
            previous = merged[-1]
            previous['content'] = f"{previous['content']}{delimiter}{message['content']}"
        else:
            merged.append(message)
    return merged


In [48]:
def prepare_qa_data_psh1(data: pd.DataFrame) -> dict:
    """
    Split every dialogue into growing prefixes, cutting only at 'send message' turns.

    Because a dialogue is only ever cut directly in front of a turn whose ``type`` is
    exactly ``'send message'``, turns of any other type that follow each other
    (e.g. a probing block) are never separated.

    For each row of ``data`` the following samples are generated:

    * for every ``'send message'`` turn: all turns preceding it (skipped when there are none);
    * the complete dialogue, if its final turn is not a ``'send message'`` turn.

    All columns other than ``chat`` are copied unchanged from the originating row.

    :param data: frame with a ``chat`` column; each cell is a list of turn dicts that
        provide a ``type`` key
    :return: dict mapping every column name of ``data`` to the list of values of the
        generated samples; wrap it in ``pd.DataFrame(...)`` to obtain a frame
    """
    columns = list(data.columns)
    result_data = {column: [] for column in columns}

    def _emit(row, chat_value):
        # Add one sample: the given chat plus the row's remaining columns.
        for column in columns:
            result_data[column].append(chat_value if column == 'chat' else row[column])

    for _, row in data.iterrows():
        chat = row['chat']
        chat_last_index = len(chat) - 1

        for turn_idx, item in enumerate(chat):
            if item['type'] == 'send message':
                prefix = chat[:turn_idx]  # everything above this "send message"
                if prefix:
                    _emit(row, prefix)
            elif turn_idx == chat_last_index:
                # NOTE: maybe this should also test whether the last entry is an assistant turn?
                _emit(row, chat)
    return result_data

In [None]:
# def prepare_qa_data_psh2(data: pd.DataFrame) -> pd.DataFrame:
#     """
#     Splits privateshared dialogues in such a way that only a sample of each probing block remains together.
#     """
#     result_data: pd.DataFrame = {key: [] for key in data.columns}

#     for i, row in data.iterrows():
#         chat = row['chat']
#         chat_last_index = len(chat)-1
#         messages = []
#         probes = []
#         for i, item in enumerate(chat):
#             if "message" in item["type"]:
#                 messages.append(item)
#             elif "probe" in item["type"]:
#                 if i != chat_last_index:
#                     if "probe" in chat[i+1]["type"]: # if next turn is also part of probing
#                         probes.append(item)
#                     else:
#                         probes.append(item)
#                         for column in data.columns:
#                             if column == 'chat':
#                                 # sample probes here
#                                 result_data[column].append(messages + probes) # append everything above "send message"
#                                 probes = []
#                             else: 
#                                 result_data[column].append(row[column])
                            
                        
#                 else:
#                     for column in data.columns:
#                         probes.append(item)
#                         if column == 'chat':
#                             result_data[column].append(messages + probes) # append everything above "send message"
#                         else: 
#                             result_data[column].append(row[column])


#     return result_data

In [6]:
def prepare_qa_data_psh3(data: pd.DataFrame) -> dict:
    """
    Split every dialogue into one sample per probing block.

    Turns are classified by their ``type`` string: a turn whose type contains
    ``"message"`` belongs to the regular dialogue, otherwise a turn whose type contains
    ``"probe"`` belongs to a probing block. A probing block is a maximal run of
    consecutive probe turns. Whenever such a block ends (the next turn is not a probe
    turn, or the dialogue is over) one sample is generated whose ``chat`` consists of all
    message turns seen so far followed by the turns of that single block. Earlier probing
    blocks are not part of later samples.

    All columns other than ``chat`` are copied unchanged from the originating row.

    :param data: frame with a ``chat`` column; each cell is a list of turn dicts that
        provide a ``type`` key
    :return: dict mapping every column name of ``data`` to the list of values of the
        generated samples; wrap it in ``pd.DataFrame(...)`` to obtain a frame
    """
    columns = list(data.columns)
    result_data = {column: [] for column in columns}

    for _, row in data.iterrows():
        chat = row['chat']
        chat_last_index = len(chat) - 1
        messages = []  # all message turns of this dialogue seen so far
        probes = []    # turns of the probing block currently being collected

        for turn_idx, item in enumerate(chat):
            item_type = item["type"]
            if "message" in item_type:
                messages.append(item)
            elif "probe" in item_type:
                probes.append(item)

                # The block goes on if there is a next turn and it is a probe turn as well.
                block_continues = (
                    turn_idx != chat_last_index
                    and "probe" in chat[turn_idx + 1]["type"]
                )
                if block_continues:
                    continue

                # Block finished: emit dialogue-so-far plus this one block, then start over.
                sample = messages + probes
                for column in columns:
                    result_data[column].append(sample if column == 'chat' else row[column])
                probes = []

    return result_data

In [None]:
# Build the "p1" training file: merge consecutive same-role turns first, then split the
# successful dialogues with prepare_qa_data_psh1 and write the samples to CSV.
save_path: str = '../../data/training_data/D8000p1.csv'

# Work on an explicit copy of the filtered rows: the 'chat' column is replaced below, and
# that assignment should target an independent frame instead of a slice of `data`
# (the chained-assignment warning is switched off globally at the top of the notebook).
data_successful = data[data.Success == 1].copy()
print(len(data_successful))
data_successful["chat"] = data_successful["chat"].apply(ensure_alternating_roles)
result = prepare_qa_data_psh1(data_successful)
data_collapsed = pd.DataFrame(result)
print(len(data), len(data_collapsed))
#data_collapsed[["chat"]].iloc[:15].to_csv("psh1.csv")

data_collapsed.to_csv(save_path, index=False)


41
2497 336


In [11]:
# Build the "p3" training file: split the successful dialogues with prepare_qa_data_psh3,
# merge consecutive same-role turns afterwards, and write the samples to CSV.
save_path: str = '../../data/training_data/D8000p3.csv'

successful_mask = data["Success"] == 1
data_successful = data.loc[successful_mask]
print(len(data_successful))

result = prepare_qa_data_psh3(data_successful)
data_collapsed = pd.DataFrame(result)

# important to merge *after* splitting so that the first probe question is not automatically appended to the prompt
data_collapsed["chat"] = data_collapsed["chat"].apply(ensure_alternating_roles)

print(len(data), len(data_collapsed))
#data_collapsed[["chat"]].iloc[:33].to_csv("psh3_ear.csv")

data_collapsed.to_csv(save_path, index=False)

41
2497 336
