In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import copy
#from src.utils.utils import ensure_alternating_roles

path: str = ('../../data/processed/privateshared_old_processed.jsonl')
data = pd.read_json(path, lines=True)

In [237]:
data.benchmark_version.unique()

array(['v0.9', 'v1.0'], dtype=object)

In [24]:
def ensure_alternating_roles(messages, cull_system_message: bool = True):
    """
    The messages format assumes alternating roles of user and assistant. This method checks, if this constraint
    is satisfied. If this is not the case and there are consecutive user or assistant messages,
    then these are merged into a single one.

    :param cull_system_message:
    :param messages: to be checked
    :return: a new messages object with the alternating roles ensured
    """
    consec_msg = 0
    _messages = copy.deepcopy(messages)

    if cull_system_message:
        if _messages[0]['role'] == "system" and not _messages[0]["content"]:
            del _messages[0]

    def is_same_role(msg1, msg2):
        return msg1["role"] == msg2["role"]

    delimiter = "\n\n"

    def join_content(msg1, msg2):
        return f"{msg1['content']}{delimiter}{msg2['content']}"

    if len(_messages) <= 1:
        return _messages

    def is_valid(idx):
        return idx < len(_messages)

    msg_idx = 1
    while is_valid(msg_idx):
        prev_message = _messages[msg_idx - 1]
        message = _messages[msg_idx]
        if is_same_role(prev_message, message):
            warn_msg = (f"Found consecutive role assignments. These will be merged into one:\n"
                        f"{prev_message}\n"
                        f"{message}")
            #logger.warning(warn_msg)
            #print("X")
            consec_msg += 1
            prev_message['content'] = join_content(prev_message, message)
            del _messages[msg_idx]
        else:
            msg_idx += 1
    #print(f"{consec_msg} consecutive messages have been merged!")
    return _messages


In [28]:
def prepare_qa_data_psh1(data: pd.DataFrame) -> pd.DataFrame:
    """
    Splits privateshared dialogues in such a way that all probing blocks remain together.
    """
    result_data: pd.DataFrame = {key: [] for key in data.columns}

    for i, row in data.iterrows():
        chat = row['chat']
        game = row['game']

        chat_last_index = len(chat)-1

        for i, item in enumerate(chat):
            if item['type'] == 'send message':
                for column in data.columns:
                    if chat[:i]:
                        if column == 'chat':
                            result_data[column].append(chat[:i]) # append everything above "send message"
                        else: 
                            result_data[column].append(row[column])
                         
            else:
                if i == chat_last_index: # maybe should test whether last entry is also assistant?
                    for key in data.columns:
                            result_data[key].append(row[key])
    return result_data

In [None]:
# def prepare_qa_data_psh2(data: pd.DataFrame) -> pd.DataFrame:
#     """
#     Splits privateshared dialogues in such a way that only a sample of each probing block remains together.
#     """
#     result_data: pd.DataFrame = {key: [] for key in data.columns}

#     for i, row in data.iterrows():
#         chat = row['chat']
#         chat_last_index = len(chat)-1
#         messages = []
#         probes = []
#         for i, item in enumerate(chat):
#             if "message" in item["type"]:
#                 messages.append(item)
#             elif "probe" in item["type"]:
#                 if i != chat_last_index:
#                     if "probe" in chat[i+1]["type"]: # if next turn is also part of probing
#                         probes.append(item)
#                     else:
#                         probes.append(item)
#                         for column in data.columns:
#                             if column == 'chat':
#                                 # sample probes here
#                                 result_data[column].append(messages + probes) # append everything above "send message"
#                                 probes = []
#                             else: 
#                                 result_data[column].append(row[column])
                            
                        
#                 else:
#                     for column in data.columns:
#                         probes.append(item)
#                         if column == 'chat':
#                             result_data[column].append(messages + probes) # append everything above "send message"
#                         else: 
#                             result_data[column].append(row[column])


#     return result_data

In [6]:
def prepare_qa_data_psh3(data: pd.DataFrame) -> pd.DataFrame:
    """
    Splits privateshared dialogues in such a way that all probing blocks remain together and only one probing block remains.
    """
    result_data: pd.DataFrame = {key: [] for key in data.columns}

    for i, row in data.iterrows():
        chat = row['chat']
        chat_last_index = len(chat)-1
        messages = []
        probes = []
        for i, item in enumerate(chat):
            if "message" in item["type"]:
                messages.append(item)
            elif "probe" in item["type"]:
                if i != chat_last_index:
                    if "probe" in chat[i+1]["type"]: # if next turn is also part of probing
                        probes.append(item)
                    else:
                        probes.append(item)
                        for column in data.columns:
                            if column == 'chat':
                                result_data[column].append(messages + probes) # append everything above "send message"
                                probes = []
                            else: 
                                result_data[column].append(row[column])
                            
                        
                else:
                    for column in data.columns:
                        
                        if column == 'chat':
                            probes.append(item)
                            result_data[column].append(messages + probes) # append everything above "send message"
                            probes = []
                        else: 
                            result_data[column].append(row[column])

    return result_data

In [29]:
save_path: str = '../../data/training_data/D8000p1.csv'
data_successful = data[data.Success == 1]
print(len(data_successful))
data_successful.chat = data_successful.chat.apply(ensure_alternating_roles)
result = prepare_qa_data_psh1(data_successful)
data_collapsed = pd.DataFrame(result)
print(len(data), len(data_collapsed))
#data_collapsed[["chat"]].iloc[:15].to_csv("psh1.csv")

#ata_collapsed.to_csv(save_path, index=False)


41
2497 336


In [11]:
save_path: str = '../../data/training_data/D8000p3.csv'
data_successful = data[data.Success == 1]
print(len(data_successful))

result = prepare_qa_data_psh3(data_successful)
data_collapsed = pd.DataFrame(result)

# important to merge *after* splitting so that the first probe question is not automatically appended to the prompt
data_collapsed.chat = data_collapsed.chat.apply(ensure_alternating_roles)

print(len(data), len(data_collapsed))
#data_collapsed[["chat"]].iloc[:33].to_csv("psh3_ear.csv")

data_collapsed.to_csv(save_path, index=False)

41
2497 336


# Rework of p1 -> D70001

In [171]:
def get_probe_question_mask(chat: list) -> list:
    probe_questions_indexes: list = []
    is_probe: bool = False

    for chat_idx, chat_item in enumerate(chat):
        if chat_item['type'] == 'send message':
            if is_probe:
                is_probe = False
        if chat_item['type'] == 'probe question':
            if not is_probe:
                is_probe = True
            probe_questions_indexes.append(chat_idx)
        if chat_item['type'] == 'probe answer':
            probe_questions_indexes.append(chat_idx)

    return probe_questions_indexes

def prepare_qa_data_psh4(data: pd.DataFrame) -> pd.DataFrame:
    result_data: pd.DataFrame = {key: [] for key in data.columns}
    
    for idx, row in data.iterrows():
        chat = row['chat']
        probe_question_mask = get_probe_question_mask(chat)

        chat_last_index = len(chat)

        for i, item in enumerate(chat):
            if i in probe_question_mask:
                
                # check that we are at the end of a probe block
                if i+1 not in probe_question_mask:
                    for key in data.columns:
                        if key != 'chat':
                            result_data[key].append(row[key])
                        else:
                            if i + 1 < chat_last_index:
                                result_data[key].append(chat[:i + 1])
                            else:
                                result_data[key].append(row[key])     
                continue
                
            if item['role'] == 'assistant':
                for key in data.columns:
                    if key != 'chat':
                        result_data[key].append(row[key])
                    else:
                        if i + 1 < chat_last_index:
                            result_data[key].append(chat[:i + 1])
                        else:
                            result_data[key].append(row[key])
    return result_data

In [172]:
data_successful = data[data.Success == 1]
print(len(data_successful))

result = prepare_qa_data_psh4(data_successful)
data_collapsed_ph4 = pd.DataFrame(result)

# important to merge *after* splitting so that the first probe question is not automatically appended to the prompt
data_collapsed_ph4.chat = data_collapsed_ph4.chat.apply(ensure_alternating_roles)

print(len(data), len(data_collapsed_ph4))

41
2497 631


In [173]:
save_path: str = '../../data/training_data/D70001.csv'
data_collapsed_ph4.to_csv(save_path, index=False)

# P70002

In [229]:
import random

def get_probe_question_mask(chat: list) -> (list, list):
    probe_questions_indexes: list = []
    probe_mask: list = []
    is_probe: bool = False

    for chat_idx, chat_item in enumerate(chat):
        if chat_item['type'] == 'send message':
            if is_probe:
                is_probe = False
        if chat_item['type'] == 'probe question':
            if not is_probe:
                is_probe = True
            probe_questions_indexes.append(chat_idx)
            probe_mask.append(chat_idx)
        if chat_item['type'] == 'probe answer':
            probe_mask.append(chat_idx)
            
    return probe_questions_indexes, probe_mask

def sample(idx_list: list, fraction: float) -> list:
    num_samples: int = int(len(idx_list) * (fraction/2))
    new_samples = random.sample(idx_list, num_samples)
    
    return_samples = []
    
    for n in new_samples:
        if n%2 == 0:
            return_samples.append(n)
            return_samples.append(n-1)
        else:
            return_samples.append(n)
            return_samples.append(n+1)
    
    return list(set(return_samples))
            

def prepare_qa_data_psh2(data: pd.DataFrame) -> pd.DataFrame:
    result_data: pd.DataFrame = {key: [] for key in data.columns}

    for idx, row in data.iterrows():
        chat = row['chat']
        probe_question_mask, _ = get_probe_question_mask(chat)
        probe_questions_to_drop = sample(probe_question_mask, 0.9)
        
        reduced_chat = [c for i, c in enumerate(chat) if i not in probe_questions_to_drop]
        
        _, probe_mask = get_probe_question_mask(reduced_chat)
        chat_last_index = len(reduced_chat)

        for i, item in enumerate(reduced_chat):
            if i in probe_mask:

                # check that we are at the end of a probe block
                if i+1 not in probe_mask:
                    for key in data.columns:
                        if key != 'chat':
                            result_data[key].append(row[key])
                        else:
                            if i + 1 < chat_last_index:
                                result_data[key].append(reduced_chat[:i + 1])
                            else:
                                result_data[key].append(row[key])
                continue

            if item['role'] == 'assistant':
                for key in data.columns:
                    if key != 'chat':
                        result_data[key].append(row[key])
                    else:
                        if i + 1 < chat_last_index:
                            result_data[key].append(reduced_chat[:i + 1])
                        else:
                            result_data[key].append(row[key])
    return result_data

In [230]:
data_successful = data[data.Success == 1]
print(len(data_successful))

result = prepare_qa_data_psh2(data_successful)
data_collapsed_ph2 = pd.DataFrame(result)

# important to merge *after* splitting so that the first probe question is not automatically appended to the prompt
data_collapsed_ph2.chat = data_collapsed_ph2.chat.apply(ensure_alternating_roles)

print(len(data_collapsed_ph4), len(data_collapsed_ph2))

41
631 629


In [231]:
lenC = 0
for i, r in data_collapsed_ph4.iterrows():
    lenC += len(r['chat'])

In [232]:
lenC 

63780

In [233]:
lenB = 0
for i, r in data_collapsed_ph2.iterrows():
    lenB += len(r['chat'])

In [234]:
lenB

39842

In [4]:
save_path: str = '../../data/training_data/D70002.csv'
#data_collapsed_ph2.to_csv(save_path, index=False)

In [5]:
df = pd.read_csv(save_path)

In [6]:
df.head()

Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat,target,text
0,privateshared,v0.9,4,claude-v1.3-t0.0--claude-v1.3-t0.0,0_travel-booking,episode_4,0,0,1,"[{'role': 'user', 'content': 'You are a custom...","{'from': 'Stockholm', 'to': 'Madrid', 'by': 'P...",<|begin_of_text|><|start_header_id|>system<|en...
1,privateshared,v0.9,4,claude-v1.3-t0.0--claude-v1.3-t0.0,0_travel-booking,episode_4,0,0,1,"[{'role': 'user', 'content': 'You are a custom...","{'from': 'Stockholm', 'to': 'Madrid', 'by': 'P...",<|begin_of_text|><|start_header_id|>system<|en...
2,privateshared,v0.9,4,claude-v1.3-t0.0--claude-v1.3-t0.0,0_travel-booking,episode_4,0,0,1,"[{'role': 'user', 'content': 'You are a custom...","{'from': 'Stockholm', 'to': 'Madrid', 'by': 'P...",<|begin_of_text|><|start_header_id|>system<|en...
3,privateshared,v0.9,4,claude-v1.3-t0.0--claude-v1.3-t0.0,0_travel-booking,episode_4,0,0,1,"[{'role': 'user', 'content': 'You are a custom...","{'from': 'Stockholm', 'to': 'Madrid', 'by': 'P...",<|begin_of_text|><|start_header_id|>system<|en...
4,privateshared,v0.9,4,claude-v1.3-t0.0--claude-v1.3-t0.0,0_travel-booking,episode_4,0,0,1,"[{'role': 'user', 'content': 'You are a custom...","{'from': 'Stockholm', 'to': 'Madrid', 'by': 'P...",<|begin_of_text|><|start_header_id|>system<|en...


In [7]:
print(df.text.iloc[10])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a customer of a travel agency. Here is a description of the details of the travel you want to make:

WHAT: Travel
FROM: Stockholm
TO: Madrid
BY: Plane
CLASS: The most comfortable
WHEN: On April 10

The travel agent does not know about it yet. 

Questions from the travel agent will start with TRAVEL-AGENT. Please reply in the form: ANSWER: <some text>

I will also ask you questions. These questions will start with ME: . Please answer these questions with: ASIDE: <some text>

Important: Give short, direct answers!

Let us start.

ME:  Does the travel agent already know your origin? Please answer yes or no.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

ASIDE: No<|eot_id|><|start_header_id|>user<|end_header_id|>

ME:  Does the travel agent already know your destination? Please answer yes or no.<|eot