# Datasets
- D40001 Balanced before splitting random sampling of models
- D40002 Balanced after splitting random sampling of models 
- D40003 Balanced before splitting random sampling of models  (Oversampled, Private-shared, wordle)
- D40004 Balanced after splitting random sampling of models  (Oversampled, Private-shared, wordle)
- D40005 Balanced before splitting sample from top-scores top models
- D40006 Balanced after splitting sample from top-scores top models
- D40007 Balanced before splitting sample from top-scores top models  (Oversampled, Private-shared, wordle)
- D40008 Balanced after splitting sample from top-scores top models  (Oversampled, Private-shared, wordle)

In [1]:
import pandas as pd
from pandas import DataFrame
import random


In [2]:
path_taboo: str = "../../data/processed/processed_extended_data/taboo_old_processed.jsonl"
path_imagegame: str = "../../data/processed/processed_extended_data/imagegame_old_processed.jsonl"
path_referencegame: str = "../../data/processed/processed_extended_data/referencegame_new_processed.jsonl"
path_privateshared: str = "../../data/processed/processed_extended_data/privateshared_old_processed.jsonl"
path_wordle: str = "../../data/processed/processed_extended_data/wordle_old_processed.jsonl"
path_wordle_withclue: str = "../../data/processed/processed_extended_data/wordle_withclue_old_processed.jsonl"
path_wordle_withcritic: str = "../../data/processed/processed_extended_data/wordle_withcritic_old_processed.jsonl"

In [3]:
df_taboo = pd.read_json(path_taboo, lines=True)
df_imagegame = pd.read_json(path_imagegame, lines=True)
df_referencegame = pd.read_json(path_referencegame, lines=True)
df_privateshared = pd.read_json(path_privateshared, lines=True)
df_wordle = pd.read_json(path_wordle, lines=True)
df_wordle_withclue = pd.read_json(path_wordle_withclue, lines=True)
df_wordle_withcritic = pd.read_json(path_wordle_withcritic, lines=True)

In [4]:
df_taboo_success = df_taboo[df_taboo.Success == 1]
df_imagegame_success = df_imagegame[df_imagegame.Success == 1]
df_referencegame_success = df_referencegame[df_referencegame.Success == 1]
df_privateshared_success = df_privateshared[df_privateshared.Success == 1]
df_wordle_success = df_wordle[df_wordle.Success == 1]
df_wordle_withclue_success = df_wordle_withclue[df_wordle_withclue.Success == 1]
df_wordle_withcritic_success = df_wordle_withcritic[df_wordle_withcritic.Success == 1]

In [5]:
print(f"Taboo Data: {len(df_taboo_success)}")
print(f"Image Data: {len(df_imagegame_success)}")
print(f"Reference Game Data: {len(df_referencegame_success)}")
print(f"Private Shared Data: {len(df_privateshared_success)}")
print(f"Wordle Data: {len(df_wordle_success)}")
print(f"Wordle With Clue Data: {len(df_wordle_withclue_success)}")
print(f"Wordle With Critic Data: {len(df_wordle_withcritic_success)}")

Taboo Data: 1668
Image Data: 246
Reference Game Data: 19278
Private Shared Data: 41
Wordle Data: 26
Wordle With Clue Data: 268
Wordle With Critic Data: 395


## Utils

In [6]:
path_v09: str = '../../data/scores_v09_v10/results_v09.csv'
path_v10: str = '../../data/scores_v09_v10/results_v10.csv'
top_models_v09 = pd.read_csv(path_v09)
top_models_v10 = pd.read_csv(path_v10)

top_models = pd.concat([top_models_v09,top_models_v10])
top_models.sort_values(by=['-, clemscore'], ascending=False, inplace=True)
top_models.head()

model_rankings = list(top_models['Unnamed: 0'])
model_rankings_set = []
for m in model_rankings:
    if m not in model_rankings_set:
        model_rankings_set.append(m)
model_rankings_set

['gpt-4-0613-t0.0--gpt-4-0613-t0.0',
 'gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0',
 'gpt-4-t0.0--gpt-4-t0.0',
 'gpt-4-0314-t0.0--gpt-4-0314-t0.0',
 'gpt-4-t0.0--gpt-3.5-turbo-t0.0',
 'gpt-3.5-turbo-t0.0--gpt-4-t0.0',
 'claude-v1.3-t0.0--claude-v1.3-t0.0',
 'gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0',
 'claude-2.1-t0.0--claude-2.1-t0.0',
 'claude-2-t0.0--claude-2-t0.0',
 'gpt-3.5-turbo-0613-t0.0--gpt-3.5-turbo-0613-t0.0',
 'gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0',
 'openchat_3.5-t0.0--openchat_3.5-t0.0',
 'mistral-medium-t0.0--mistral-medium-t0.0',
 'Mixtral-8x7B-Instruct-v0.1-t0.0--Mixtral-8x7B-Instruct-v0.1-t0.0',
 'openchat-3.5-1210-t0.0--openchat-3.5-1210-t0.0',
 'sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-llama-2-70b-v1.1-t0.0',
 'Yi-34B-Chat-t0.0--Yi-34B-Chat-t0.0',
 'WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0',
 'tulu-2-dpo-70b-t0.0--tulu-2-dpo-70b-t0.0',
 'text-davinci-003-t0.0--text-davinci-003-t0.0',
 'SUS-Chat-34B-t0.0--SUS-Chat-34B-t0.0',
 'claude-instant

In [11]:
path_v15: str = '../../data/scores_v15_v16/results_v15.csv'
path_v16: str = '../../data/scores_v15_v16/results_v16.csv'
top_models_v15 = pd.read_csv(path_v15)
top_models_v16 = pd.read_csv(path_v16)

top_models_new = pd.concat([top_models_v09,top_models_v10])
top_models_new.sort_values(by=['-, clemscore'], ascending=False, inplace=True)

model_rankings_new = list(top_models_new['Unnamed: 0'])
model_rankings_set_new = []
for m in model_rankings_new:
    if m not in model_rankings_set_new:
        model_rankings_set_new.append(m)
model_rankings_set_new

['gpt-4-0613-t0.0--gpt-4-0613-t0.0',
 'gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0',
 'gpt-4-t0.0--gpt-4-t0.0',
 'gpt-4-0314-t0.0--gpt-4-0314-t0.0',
 'gpt-4-t0.0--gpt-3.5-turbo-t0.0',
 'gpt-3.5-turbo-t0.0--gpt-4-t0.0',
 'claude-v1.3-t0.0--claude-v1.3-t0.0',
 'gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0',
 'claude-2.1-t0.0--claude-2.1-t0.0',
 'claude-2-t0.0--claude-2-t0.0',
 'gpt-3.5-turbo-0613-t0.0--gpt-3.5-turbo-0613-t0.0',
 'gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0',
 'openchat_3.5-t0.0--openchat_3.5-t0.0',
 'mistral-medium-t0.0--mistral-medium-t0.0',
 'Mixtral-8x7B-Instruct-v0.1-t0.0--Mixtral-8x7B-Instruct-v0.1-t0.0',
 'openchat-3.5-1210-t0.0--openchat-3.5-1210-t0.0',
 'sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-llama-2-70b-v1.1-t0.0',
 'Yi-34B-Chat-t0.0--Yi-34B-Chat-t0.0',
 'WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0',
 'tulu-2-dpo-70b-t0.0--tulu-2-dpo-70b-t0.0',
 'text-davinci-003-t0.0--text-davinci-003-t0.0',
 'SUS-Chat-34B-t0.0--SUS-Chat-34B-t0.0',
 'claude-instant

In [12]:
def prepare_reduced_data(df: DataFrame, max_size: int = 2):
    sorted_data = df.sort_values(by=['experiment', 'episode', 'model', 'player'])
    data_lookup: dict = {}
    all_indices = []

    for i, row in sorted_data.iterrows():
        key = row['experiment'] + row['episode']
        if key not in data_lookup:
            data_lookup[key] = {
                'player 1' : [],
                'player 2' : []
            }
        data_lookup[key][row.player].append({"model": row['model'], 'index': i})

    for key in data_lookup:
        p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']

        try:
            assert len(p1) == len(p2)
        except AssertionError:
            # sometimes there is an unequal number of turns so we have to even them 
            # only keep the ones that are present in p1 and p2
            p1 = [p for p in p1 if p['model'] in [t['model'] for t in p2]]
            p2 = [p for p in p2 if p['model'] in [t['model'] for t in p1]]
            assert len(p1) == len(p2)
    
        if len(p1) > max_size:
            # pick 2 random numbers from
            indexes = random.sample(range(0, len(p1) - 1), max_size)
            data_lookup[key]['player 1'] = [p1[i]['index'] for i in indexes]
            data_lookup[key]['player 2'] = [p2[i]['index'] for i in indexes]
        else:
            data_lookup[key]['player 1'] = [t['index'] for t in p1]
            data_lookup[key]['player 2'] = [t['index'] for t in p2]

    for key in data_lookup:
        p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']
        assert len(p1) == len(p2)
        assert len(p1) <= max_size
    
        all_indices.extend(p1)
        all_indices.extend(p2)

    reduced_data = sorted_data.loc[all_indices]
    
    return reduced_data

def prepare_reduced_data_with_model_ranking(df: DataFrame, max_size: int = 2):
    sorted_data = df.sort_values(by=['experiment', 'episode', 'model', 'player'])
    data_lookup: dict = {}
    all_indices = []

    for i, row in sorted_data.iterrows():
        key = row['experiment'] + row['episode']
        if key not in data_lookup:
            data_lookup[key] = {
                'player 1' : [],
                'player 2' : []
            }
        data_lookup[key][row.player].append({"model": row['model'], 'index': i})

    for key in data_lookup:
        p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']

        try:
            assert len(p1) == len(p2)
        except AssertionError:
            # sometimes there is an unequal number of turns so we have to even them 
            # only keep the ones that are present in p1 and p2
            p1 = [p for p in p1 if p['model'] in [t['model'] for t in p2]]
            p2 = [p for p in p2 if p['model'] in [t['model'] for t in p1]]
            assert len(p1) == len(p2)

        if len(p1) > max_size:
            # pick 2 random numbers from
            for p, _p in zip(p1, p2): 
                try:
                    p['sort_index'] = model_rankings_set.index(p['model'])
                    _p['sort_index'] = model_rankings_set.index(_p['model'])
                except ValueError:
                    try:
                        p['sort_index'] = model_rankings_set_new.index(p['model'])
                        _p['sort_index'] = model_rankings_set_new.index(_p['model'])
                    except ValueError:
                        p['sort_index'] = 1000
                        _p['sort_index'] = 1000
                        
                     
            
            p1 = sorted(p1, key=lambda p: p['sort_index'], reverse=False)
            p2 = sorted(p2, key=lambda p: p['sort_index'], reverse=False)

            data_lookup[key]['player 1'] = [p['index'] for p in p1[:max_size]]
            data_lookup[key]['player 2'] = [p['index'] for p in p2[:max_size]]
        else:
            data_lookup[key]['player 1'] = [t['index'] for t in p1]
            data_lookup[key]['player 2'] = [t['index'] for t in p2]

    for key in data_lookup:
        p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']
        assert len(p1) == len(p2)
        assert len(p1) <= max_size

        all_indices.extend(p1)
        all_indices.extend(p2)

    reduced_data = sorted_data.loc[all_indices]

    return reduced_data

def prepare_reduced_data_single_player(df: DataFrame, max_size: int = 2):
    sorted_data = df.sort_values(by=['experiment', 'episode', 'model'])
    data_lookup: dict = {}
    all_indices = []

    for i, row in sorted_data.iterrows():
        key = row['experiment'] + row['episode']
        if key not in data_lookup:
            data_lookup[key] = []
        data_lookup[key].append({"model": row['model'], 'index': i})

    for key in data_lookup:
        p1 = data_lookup[key]

        if len(p1) > max_size:
            # pick 2 random numbers from
            indexes = random.sample(range(0, len(p1) - 1), max_size)
            data_lookup[key] = [p1[i]['index'] for i in indexes]
        else:
            data_lookup[key] = [t['index'] for t in p1]

    for key in data_lookup:
        p1 = data_lookup[key]
        assert len(p1) <= max_size

        all_indices.extend(p1)

    reduced_data = sorted_data.loc[all_indices]

    return reduced_data

def prepare_reduced_data_single_player_model_ranking(df: DataFrame, max_size: int = 2):
    sorted_data = df.sort_values(by=['experiment', 'episode', 'model'])
    data_lookup: dict = {}
    all_indices = []

    for i, row in sorted_data.iterrows():
        key = row['experiment'] + row['episode']
        if key not in data_lookup:
            data_lookup[key] = []
        data_lookup[key].append({"model": row['model'], 'index': i})

    for key in data_lookup:
        p1 = data_lookup[key]

        if len(p1) > max_size:
            # pick 2 random numbers from
            # pick 2 random numbers from
            for p in p1:
                p['sort_index'] = model_rankings_set.index(p['model'])

            p1 = sorted(p1, key=lambda p: p['sort_index'], reverse=False)

            data_lookup[key] = [p['index'] for p in p1[:max_size]]
        else:
            data_lookup[key] = [t['index'] for t in p1]

    for key in data_lookup:
        p1 = data_lookup[key]
        assert len(p1) <= max_size

        all_indices.extend(p1)

    reduced_data = sorted_data.loc[all_indices]

    return reduced_data

def prepare_oversampled_data(df: DataFrame, over_sample_from: int = 2, over_sample_until: int = 3):
    sorted_data = df.sort_values(by=['experiment', 'episode', 'model'])
    data_lookup: dict = {}
    all_indices = []

    for i, row in sorted_data.iterrows():
        key = row['experiment'] + row['episode']
        if key not in data_lookup:
            data_lookup[key] = []
        data_lookup[key].append({"model": row['model'], 'index': i})

    for key in data_lookup:
        p1 = data_lookup[key]
        
        # check if we should oversample or not
        if len(p1) < over_sample_from:
            if len(p1) == 1:
                p1 = [p1[0]['index']] * over_sample_until
                data_lookup[key] = p1
            else:
                num_samples = over_sample_until - len(p1)
                # pick random numbers according to the needed amount
    
                indexes = [random.sample(range(0, len(p1) - 1), 1)[0] for n in range(num_samples)]
                p1.extend([p1[i] for i in indexes])
                data_lookup[key] = [p['index'] for p in p1]
        else:
            data_lookup[key] = [t['index'] for t in p1]

    for key in data_lookup:
        p1 = data_lookup[key]
        
        all_indices.extend(p1)
    
    oversampled_data = sorted_data.loc[all_indices]

    return oversampled_data


# Reduce Taboo
### Steps: 
- take only top score episodes
- only allow certain number of duplicates

In the data when only considering the top scores 56/60 episodes are represented, but some of them occur 66 times
Down size the dataset to about 200 by only allowing each episode to occur 4 times at max which is effectively 2 time since there is always player 1 and player 2

In [13]:
taboo_data_D40001 = prepare_reduced_data(df_taboo_success, max_size=2)
taboo_data_D40002 = taboo_data_D40001.copy()
taboo_data_D40003 = taboo_data_D40001.copy()
taboo_data_D40004 = taboo_data_D40001.copy()

taboo_data_D40005 = prepare_reduced_data_with_model_ranking(df_taboo_success, max_size=2)
taboo_data_D40006 = taboo_data_D40005.copy()
taboo_data_D40007 = taboo_data_D40005.copy()
taboo_data_D40008 = taboo_data_D40005.copy()

# Reduced Referencegame Data

Reference game data is used from benchmark version 1.6 which means that some episodes will be dropped out of the dataset.

However, there is much more data available on benchmark version 1.6 which means that every episode will be represented at a maximum of 1 time. This still results in overall 288 instances 144 player 1 and 144 player 2. All episodes are represented. 144/180 will be part of training data and the other 36 are testing data.

In [14]:
positive_episodes: dict = {'0_line_grids_rows': ['episode_25', 'episode_18'],
                           '1_line_grids_columns': ['episode_10', 'episode_0'],
                           '2_diagonal_grids': ['episode_27', 'episode_24'],
                           '3_letter_grids': ['episode_8', 'episode_11'],
                           '4_shape_grids': ['episode_15', 'episode_24'],
                           '5_random_grids': ['episode_28', 'episode_0']}

negative_episodes: dict = {'0_line_grids_rows': ['episode_29', 'episode_7', 'episode_10', 'episode_15'],
                           '1_line_grids_columns': ['episode_15', 'episode_5', 'episode_3', 'episode_22'],
                           '2_diagonal_grids': ['episode_5', 'episode_29', 'episode_17', 'episode_8'],
                           '3_letter_grids': ['episode_29', 'episode_22', 'episode_24', 'episode_13'],
                           '4_shape_grids': ['episode_13', 'episode_9', 'episode_17', 'episode_1'],
                           '5_random_grids': ['episode_23', 'episode_17', 'episode_1', 'episode_20']}

test_episodes: dict = {key: [*positive_episodes[key], *negative_episodes[key]] for key in positive_episodes.keys()}

In [15]:
def filter_dataframe(df, episode_dict):
    # Apply filtering logic row by row
    mask = df.apply(lambda row: row['episode'] not in episode_dict[row['experiment']], axis=1)
    return df[mask]

# Filtered DataFrame
filtered_reference_data_success = filter_dataframe(df_referencegame_success, test_episodes)
stats = filtered_reference_data_success.groupby('experiment').episode.unique()

In [16]:
reference_game_data_D40001 = prepare_reduced_data(filtered_reference_data_success, max_size=1)
reference_game_data_D40002 = reference_game_data_D40001.copy()
reference_game_data_D40003 = reference_game_data_D40001.copy()
reference_game_data_D40004 = reference_game_data_D40001.copy()

reference_game_data_D40005 = prepare_reduced_data_with_model_ranking(filtered_reference_data_success, max_size=1)
reference_game_data_D40006 = reference_game_data_D40005.copy()
reference_game_data_D40007 = reference_game_data_D40005.copy()
reference_game_data_D40008 = reference_game_data_D40005.copy()

# Wordle With Critic:

In [17]:
df_wordle_withcritic_success.groupby('experiment').episode.value_counts()

experiment                                 episode  
0_high_frequency_words_clue_with_critic    episode_1    24
                                           episode_5    24
                                           episode_7    22
                                           episode_6    18
                                           episode_2    14
                                           episode_3    12
                                           episode_4    10
                                           episode_0     6
                                           episode_9     2
1_medium_frequency_words_clue_with_critic  episode_1    20
                                           episode_8    18
                                           episode_3    16
                                           episode_9    12
                                           episode_2    10
                                           episode_0     6
                                           episode_4     6
   

In [18]:
wordle_with_critic_game_data_D40001 = prepare_reduced_data(df_wordle_withcritic_success, max_size=5)
wordle_with_critic_game_data_D40002 = prepare_reduced_data(df_wordle_withcritic_success, max_size=2)
wordle_with_critic_game_data_D40003 = wordle_with_critic_game_data_D40001.copy()
wordle_with_critic_game_data_D40004 = wordle_with_critic_game_data_D40002.copy()

wordle_with_critic_game_data_D40005 = prepare_reduced_data_with_model_ranking(df_wordle_withcritic_success, max_size=5)
wordle_with_critic_game_data_D40006 = prepare_reduced_data_with_model_ranking(df_wordle_withcritic_success, max_size=2)
wordle_with_critic_game_data_D40007 = wordle_with_critic_game_data_D40005.copy()
wordle_with_critic_game_data_D40008 = wordle_with_critic_game_data_D40006.copy()

# Oversample privateshared

In [19]:
privateshared_game_data_D40001 = df_privateshared_success.copy()
privateshared_game_data_D40002 = df_privateshared_success.copy()
privateshared_game_data_D40003 = prepare_oversampled_data(df_privateshared_success, over_sample_from = 6, over_sample_until = 7)
privateshared_game_data_D40004 = prepare_oversampled_data(df_privateshared_success, over_sample_from = 1, over_sample_until = 2)

privateshared_game_data_D40005 = df_privateshared_success.copy()
privateshared_game_data_D40006 = df_privateshared_success.copy()
privateshared_game_data_D40007 = prepare_oversampled_data(df_privateshared_success, over_sample_from = 6, over_sample_until = 7)
privateshared_game_data_D40008 = prepare_oversampled_data(df_privateshared_success, over_sample_from = 1, over_sample_until = 2)

In [20]:
oversampled_privateshared_data.groupby('experiment').episode.value_counts()

NameError: name 'oversampled_privateshared_data' is not defined

# Oversample Wordle

In [21]:
wordle_game_data_D40001 = df_wordle_success.copy()
wordle_game_data_D40002 = df_wordle_success.copy()
wordle_game_data_D40003 = prepare_oversampled_data(df_wordle_success, over_sample_from = 6, over_sample_until = 16)
wordle_game_data_D40004 = prepare_oversampled_data(df_wordle_success, over_sample_from = 6, over_sample_until = 5)

wordle_game_data_D40005 = df_wordle_success.copy()
wordle_game_data_D40006 = df_wordle_success.copy()
wordle_game_data_D40007 = wordle_game_data_D40003.copy()
wordle_game_data_D40008 = wordle_game_data_D40004.copy()

In [22]:
len(wordle_game_data_D40003)

208

# Wordle Withclue

In [23]:
wordle_withclue_game_data_D40001 = df_wordle_withclue_success.copy()
wordle_withclue_game_data_D40002 = prepare_reduced_data_single_player(df_wordle_withclue_success, max_size = 8)
wordle_withclue_game_data_D40003 = df_wordle_withclue_success.copy()
wordle_withclue_game_data_D40004 = wordle_withclue_game_data_D40002.copy()

wordle_withclue_game_data_D40005 = df_wordle_withclue_success.copy()
wordle_withclue_game_data_D40006 = prepare_reduced_data_single_player_model_ranking(df_wordle_withclue_success, max_size = 8)
wordle_withclue_game_data_D40007 = df_wordle_withclue_success.copy()
wordle_withclue_game_data_D40008 = wordle_withclue_game_data_D40006.copy()

In [24]:
wordle_withclue_game_data_D40002.groupby('experiment').episode.value_counts()

experiment                               episode  
0_high_frequency_words_clue_no_critic    episode_1    8
                                         episode_4    8
                                         episode_5    8
                                         episode_6    8
                                         episode_7    8
                                         episode_2    6
                                         episode_3    4
                                         episode_9    1
1_medium_frequency_words_clue_no_critic  episode_1    8
                                         episode_3    8
                                         episode_4    8
                                         episode_8    8
                                         episode_9    8
                                         episode_2    6
                                         episode_0    4
                                         episode_5    3
                                         episode_7   

# Image Game

In [25]:
imagegamegame_data_D40004.groupby('experiment').episode.value_counts()

NameError: name 'imagegamegame_data_D40004' is not defined

In [26]:
imagegame_game_data_D40001 = df_imagegame_success.copy()
imagegame_game_data_D40002 = prepare_reduced_data(df_imagegame_success, max_size=1)
imagegame_game_data_D40003 = df_imagegame_success.copy()
imagegame_game_data_D40004 = imagegame_game_data_D40002.copy()

imagegame_game_data_D40005 = df_imagegame_success.copy()
imagegame_game_data_D40006 = prepare_reduced_data_with_model_ranking(df_imagegame_success, max_size=1)
imagegame_game_data_D40007 = df_imagegame_success.copy()
imagegame_game_data_D40008 = imagegame_game_data_D40006.copy()

In [27]:
len(imagegame_game_data_D40003), len(imagegamegame_data_D40004)

NameError: name 'imagegamegame_data_D40004' is not defined

# Merge prepared data together

In [28]:
columns_to_keep = ['game', 'benchmark_version', 'game_id', 'model', 'experiment',
'episode', 'Aborted', 'Lose', 'Success', 'chat']

In [29]:
data_D40001 = [
    taboo_data_D40001,
    reference_game_data_D40001, 
    imagegame_game_data_D40001, 
    wordle_with_critic_game_data_D40001, 
    wordle_withclue_game_data_D40001, 
    wordle_game_data_D40001, 
    privateshared_game_data_D40001,]

data_D40002 = [
    taboo_data_D40002,
    reference_game_data_D40002,
    imagegame_game_data_D40002,
    wordle_with_critic_game_data_D40002,
    wordle_withclue_game_data_D40002,
    wordle_game_data_D40002,
    privateshared_game_data_D40002,]

data_D40003 = [
    taboo_data_D40003,
    reference_game_data_D40003,
    imagegame_game_data_D40003,
    wordle_with_critic_game_data_D40003,
    wordle_withclue_game_data_D40003,
    wordle_game_data_D40003,
    privateshared_game_data_D40003,]

data_D40004 = [
    taboo_data_D40004,
    reference_game_data_D40004,
    imagegame_game_data_D40004,
    wordle_with_critic_game_data_D40004,
    wordle_withclue_game_data_D40004,
    wordle_game_data_D40004,
    privateshared_game_data_D40004,]

data_D40005 = [
    taboo_data_D40005,
    reference_game_data_D40005,
    imagegame_game_data_D40005,
    wordle_with_critic_game_data_D40005,
    wordle_withclue_game_data_D40005,
    wordle_game_data_D40005,
    privateshared_game_data_D40005,]

data_D40006 = [
    taboo_data_D40006,
    reference_game_data_D40006,
    imagegame_game_data_D40006,
    wordle_with_critic_game_data_D40006,
    wordle_withclue_game_data_D40006,
    wordle_game_data_D40006,
    privateshared_game_data_D40006,]

data_D40007 = [
    taboo_data_D40007,
    reference_game_data_D40007,
    imagegame_game_data_D40007,
    wordle_with_critic_game_data_D40007,
    wordle_withclue_game_data_D40007,
    wordle_game_data_D40007,
    privateshared_game_data_D40007,]

data_D40008 = [
    taboo_data_D40008,
    reference_game_data_D40008,
    imagegame_game_data_D40008,
    wordle_with_critic_game_data_D40008,
    wordle_withclue_game_data_D40008,
    wordle_game_data_D40008,
    privateshared_game_data_D40008,]

In [30]:
data_D40001 = [tdf[columns_to_keep] for tdf in data_D40001]
data_D40002 = [tdf[columns_to_keep] for tdf in data_D40002]
data_D40003 = [tdf[columns_to_keep] for tdf in data_D40003]
data_D40004 = [tdf[columns_to_keep] for tdf in data_D40004]
data_D40005 = [tdf[columns_to_keep] for tdf in data_D40005]
data_D40006 = [tdf[columns_to_keep] for tdf in data_D40006]
data_D40007 = [tdf[columns_to_keep] for tdf in data_D40007]
data_D40008 = [tdf[columns_to_keep] for tdf in data_D40008]


In [31]:
merged_data_D40001 = pd.concat(data_D40001)
merged_data_D40002 = pd.concat(data_D40002)
merged_data_D40003 = pd.concat(data_D40003)
merged_data_D40004 = pd.concat(data_D40004)
merged_data_D40005 = pd.concat(data_D40005)
merged_data_D40006 = pd.concat(data_D40006)
merged_data_D40007 = pd.concat(data_D40007)
merged_data_D40008 = pd.concat(data_D40008)

In [33]:
print("D40001:", len(merged_data_D40001))
print("D40002:", len(merged_data_D40002))
print("D40003:", len(merged_data_D40003))
print("D40004:", len(merged_data_D40004))
print("D40005:", len(merged_data_D40005))
print("D40006:", len(merged_data_D40006))
print("D40007:", len(merged_data_D40007))
print("D40008:", len(merged_data_D40008))

D40001: 1325
D40002: 926
D40003: 1669
D40004: 965
D40005: 1325
D40006: 926
D40007: 1669
D40008: 965


In [34]:
import copy

def ensure_alternating_roles(messages, cull_system_message: bool = True):
    """
    The messages format assumes alternating roles of user and assistant. This method checks, if this constraint
    is satisfied. If this is not the case and there are consecutive user or assistant messages,
    then these are merged into a single one.

    :param cull_system_message:
    :param messages: to be checked
    :return: a new messages object with the alternating roles ensured
    """
    consec_msg = 0
    _messages = copy.deepcopy(messages)

    if cull_system_message:
        if _messages[0]['role'] == "system" and not _messages[0]["content"]:
            del _messages[0]

    def is_same_role(msg1, msg2):
        return msg1["role"] == msg2["role"]

    delimiter = "\n\n"

    def join_content(msg1, msg2):
        return f"{msg1['content']}{delimiter}{msg2['content']}"

    if len(_messages) <= 1:
        return _messages

    def is_valid(idx):
        return idx < len(_messages)

    msg_idx = 1
    while is_valid(msg_idx):
        prev_message = _messages[msg_idx - 1]
        message = _messages[msg_idx]
        if is_same_role(prev_message, message):
            warn_msg = (f"Found consecutive role assignments. These will be merged into one:\n"
                        f"{prev_message}\n"
                        f"{message}")
            #logger.warning(warn_msg)
            #print("X")
            consec_msg += 1
            prev_message['content'] = join_content(prev_message, message)
            del _messages[msg_idx]
        else:
            msg_idx += 1
    #print(f"{consec_msg} consecutive messages have been merged!")
    return _messages

In [35]:
def prepare_qa_data(data: DataFrame) -> DataFrame:
    result_data: pd.DataFrame = {key: [] for key in data.columns}

    for i, row in data.iterrows():
        chat = row['chat']
        game = row['game']

        chat_last_index = len(chat)

        if game == 'privateshared':
            chat_last_index = len(chat)-1
            messages = []
            probes = []
            for i, item in enumerate(chat):
                if "message" in item["type"]:
                    messages.append(item)
                elif "probe" in item["type"]:
                    if i != chat_last_index:
                        if "probe" in chat[i+1]["type"]: # if next turn is also part of probing
                            probes.append(item)
                        else:
                            probes.append(item)
                            for column in data.columns:
                                if column == 'chat':
                                    result_data[column].append(messages + probes) # append everything above "send message"
                                    probes = []
                                else:
                                    result_data[column].append(row[column])


                    else:
                        for column in data.columns:

                            if column == 'chat':
                                probes.append(item)
                                result_data[column].append(messages + probes) # append everything above "send message"
                                probes = []
                            else:
                                result_data[column].append(row[column])
        else:
            for i, item in enumerate(chat):
                if item['role'] == 'assistant':
                    for key in data.columns:
                        if key != 'chat':
                            result_data[key].append(row[key])
                        else:
                            if i + 1 < chat_last_index:
                                result_data[key].append(chat[:i + 1])
                            else:
                                result_data[key].append(row[key])
    return result_data

In [36]:
merged_data_D40001.chat = merged_data_D40001.chat.apply(ensure_alternating_roles)
merged_data_D40002.chat = merged_data_D40002.chat.apply(ensure_alternating_roles)
merged_data_D40003.chat = merged_data_D40003.chat.apply(ensure_alternating_roles)
merged_data_D40004.chat = merged_data_D40004.chat.apply(ensure_alternating_roles)
merged_data_D40005.chat = merged_data_D40005.chat.apply(ensure_alternating_roles)
merged_data_D40006.chat = merged_data_D40006.chat.apply(ensure_alternating_roles)
merged_data_D40007.chat = merged_data_D40007.chat.apply(ensure_alternating_roles)
merged_data_D40008.chat = merged_data_D40008.chat.apply(ensure_alternating_roles)


In [37]:
result_data_D40001 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40001.sample(frac=1).reset_index(drop=True)))
result_data_D40002 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40002.sample(frac=1).reset_index(drop=True)))
result_data_D40003 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40003.sample(frac=1).reset_index(drop=True)))
result_data_D40004 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40004.sample(frac=1).reset_index(drop=True)))
result_data_D40005 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40005.sample(frac=1).reset_index(drop=True)))
result_data_D40006 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40006.sample(frac=1).reset_index(drop=True)))
result_data_D40007 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40007.sample(frac=1).reset_index(drop=True)))
result_data_D40008 = pd.DataFrame.from_dict(prepare_qa_data(merged_data_D40008.sample(frac=1).reset_index(drop=True)))


In [39]:
result_data_D40005.game.value_counts() / len(result_data_D40005)

game
imagegame            0.338106
wordle_withcritic    0.205728
wordle_withclue      0.147867
privateshared        0.098188
taboo                0.090006
referencegame        0.084161
wordle               0.035944
Name: count, dtype: float64

In [596]:
result_data_D40002.game.value_counts()


game
imagegame            371
privateshared        336
wordle_withclue      331
wordle_withcritic    309
taboo                294
referencegame        288
wordle               123
Name: count, dtype: int64

In [597]:
result_data_D40003.game.value_counts()

game
privateshared        1708
imagegame            1157
wordle               1036
wordle_withcritic     701
wordle_withclue       506
taboo                 294
referencegame         288
Name: count, dtype: int64

In [598]:
result_data_D40004.game.value_counts()

game
imagegame            371
privateshared        336
wordle_withclue      331
wordle               322
wordle_withcritic    309
taboo                294
referencegame        288
Name: count, dtype: int64

In [600]:
result_data_D40001.to_csv("../../data/training_data/D40001.csv", index=False)
result_data_D40002.to_csv("../../data/training_data/D40002.csv", index=False)
result_data_D40003.to_csv("../../data/training_data/D40003.csv", index=False)
result_data_D40004.to_csv("../../data/training_data/D40004.csv", index=False)

result_data_D40005.to_csv("../../data/training_data/D40005.csv", index=False)
result_data_D40006.to_csv("../../data/training_data/D40006.csv", index=False)
result_data_D40007.to_csv("../../data/training_data/D40007.csv", index=False)
result_data_D40008.to_csv("../../data/training_data/D40008.csv", index=False)