# Datasets
- D40001 Balanced before splitting
- D40002 Balanced after splitting
- D40003 Balanced before splitting (Oversampled, Private-shared)
- D40004 Balanced after splitting (Oversampled, Private-shared)

In [108]:
import pandas as pd

In [163]:
# Relative directory that holds all processed JSON-lines exports used below.
# Kept in one place so the location only has to be changed once.
PROCESSED_DATA_DIR: str = "../../data/processed/processed_extended_data"

# One processed export per game. Note that the reference game file is the
# "new" export while every other game uses the "old" export.
path_taboo: str = f"{PROCESSED_DATA_DIR}/taboo_old_processed.jsonl"
path_imagegame: str = f"{PROCESSED_DATA_DIR}/imagegame_old_processed.jsonl"
path_referencegame: str = f"{PROCESSED_DATA_DIR}/referencegame_new_processed.jsonl"
path_privateshared: str = f"{PROCESSED_DATA_DIR}/privateshared_old_processed.jsonl"
path_wordle: str = f"{PROCESSED_DATA_DIR}/wordle_old_processed.jsonl"
path_wordle_withclue: str = f"{PROCESSED_DATA_DIR}/wordle_withclue_old_processed.jsonl"
path_wordle_withcritic: str = f"{PROCESSED_DATA_DIR}/wordle_withcritic_old_processed.jsonl"

In [164]:
def _read_jsonl(path: str) -> pd.DataFrame:
    """Read a JSON-lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# One DataFrame per game, loaded from the paths configured above.
df_taboo = _read_jsonl(path_taboo)
df_imagegame = _read_jsonl(path_imagegame)
df_referencegame = _read_jsonl(path_referencegame)
df_privateshared = _read_jsonl(path_privateshared)
df_wordle = _read_jsonl(path_wordle)
df_wordle_withclue = _read_jsonl(path_wordle_withclue)
df_wordle_withcritic = _read_jsonl(path_wordle_withcritic)

In [165]:
# For every game keep only the rows whose Success column equals 1.
df_taboo_success = df_taboo.loc[df_taboo["Success"] == 1]
df_imagegame_success = df_imagegame.loc[df_imagegame["Success"] == 1]
df_referencegame_success = df_referencegame.loc[df_referencegame["Success"] == 1]
df_privateshared_success = df_privateshared.loc[df_privateshared["Success"] == 1]
df_wordle_success = df_wordle.loc[df_wordle["Success"] == 1]
df_wordle_withclue_success = df_wordle_withclue.loc[df_wordle_withclue["Success"] == 1]
df_wordle_withcritic_success = df_wordle_withcritic.loc[df_wordle_withcritic["Success"] == 1]

In [166]:
# Report how many successful rows each game contributes.
success_frames = {
    "Taboo Data": df_taboo_success,
    "Image Data": df_imagegame_success,
    "Reference Game Data": df_referencegame_success,
    "Private Shared Data": df_privateshared_success,
    "Wordle Data": df_wordle_success,
    "Wordle With Clue Data": df_wordle_withclue_success,
    "Wordle With Critic Data": df_wordle_withcritic_success,
}
for label, frame in success_frames.items():
    print(f"{label}: {len(frame)}")

Taboo Data: 1668
Image Data: 246
Reference Game Data: 19278
Private Shared Data: 41
Wordle Data: 26
Wordle With Clue Data: 268
Wordle With Critic Data: 395


# Reduce Taboo
### Steps: 
- take only top score episodes
- only allow certain number of duplicates

When only the top-scoring episodes are considered, 56 of the 60 episodes are represented, but some of them occur 66 times.
Downsize the dataset to about 200 by allowing each episode to occur at most 4 times, which is effectively 2 times, since each occurrence is recorded once for player 1 and once for player 2.

In [167]:
# Identifiers of the ten model pairings referred to as "top 10" in this notebook.
# Each entry has the form "<model>--<model>"; all but one pair a model with itself.
top_10_models: list = [
    "gpt-4-0613-t0.0--gpt-4-0613-t0.0",
    "claude-v1.3-t0.0--claude-v1.3-t0.0",
    "gpt-4-1106-preview-t0.0--gpt-4-1106-preview-t0.0",
    "gpt-4-t0.0--gpt-4-t0.0",
    "gpt-4-0314-t0.0--gpt-4-0314-t0.0",
    "claude-2.1-t0.0--claude-2.1-t0.0",
    "gpt-4-t0.0--gpt-3.5-turbo-t0.0",  # the only mixed pairing in the list
    "claude-2-t0.0--claude-2-t0.0",
    "gpt-3.5-turbo-1106-t0.0--gpt-3.5-turbo-1106-t0.0",
    "gpt-3.5-turbo-0613-t0.0--gpt-3.5-turbo-0613-t0.0",
]

In [168]:
# Restrict the successful taboo rows to those whose main_score equals 100.
is_top_score = df_taboo_success["main_score"] == 100.00
df_taboo_success_top_score = df_taboo_success.loc[is_top_score]

In [169]:
# Count how often each episode occurs within every experiment.
df_taboo_success_top_score.groupby(["experiment"])["episode"].value_counts()

experiment   episode   
0_high_en    episode_15    62
             episode_7     60
             episode_6     52
             episode_5     50
             episode_10    48
             episode_17    48
             episode_0     42
             episode_11    42
             episode_9     36
             episode_1     34
             episode_13    34
             episode_4     28
             episode_3     24
             episode_12    12
             episode_16    12
             episode_2     12
             episode_8      8
             episode_18     4
             episode_19     4
             episode_14     2
1_medium_en  episode_17    54
             episode_15    46
             episode_1     44
             episode_5     36
             episode_10    32
             episode_13    32
             episode_6     30
             episode_12    24
             episode_16    22
             episode_9     20
             episode_0     18
             episode_19    18
             epi

In [170]:
# Display the top-score taboo rows ordered by experiment, episode, model and player.
# The sorted frame is only shown here; df_taboo_success_top_score keeps its order.
df_taboo_success_top_score.sort_values(["experiment", "episode", "model", "player"])

Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat,target,main_score,player
996,taboo,v1.0,0,WizardLM-13b-v1.2-t0.0--WizardLM-13b-v1.2-t0.0,0_high_en,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",length,100.0,player 1
4057,taboo,v1.0,0,WizardLM-13b-v1.2-t0.0--WizardLM-13b-v1.2-t0.0,0_high_en,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",length,100.0,player 2
1055,taboo,v1.0,0,WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0,0_high_en,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",length,100.0,player 1
4116,taboo,v1.0,0,WizardLM-70b-v1.0-t0.0--WizardLM-70b-v1.0-t0.0,0_high_en,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",length,100.0,player 2
1173,taboo,v1.0,0,claude-2-t0.0--claude-2-t0.0,0_high_en,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",length,100.0,player 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5767,taboo,v1.0,9,tulu-2-dpo-70b-t0.0--tulu-2-dpo-70b-t0.0,2_low_en,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",undo,100.0,player 2
2824,taboo,v1.0,9,vicuna-13b-v1.5-t0.0--vicuna-13b-v1.5-t0.0,2_low_en,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",undo,100.0,player 1
5885,taboo,v1.0,9,vicuna-13b-v1.5-t0.0--vicuna-13b-v1.5-t0.0,2_low_en,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",undo,100.0,player 2
2883,taboo,v1.0,9,vicuna-33b-v1.3-t0.0--vicuna-33b-v1.3-t0.0,2_low_en,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are playing ...",undo,100.0,player 1


In [171]:
# Group the top-score taboo rows by experiment+episode and, inside each group,
# by player. Every entry remembers the model name and the row's index label.
data_lookup: dict = {}
for row_index, row in df_taboo_success_top_score.iterrows():
    key = row["experiment"] + row["episode"]
    # Create the two empty per-player lists the first time a key is seen.
    per_player = data_lookup.setdefault(key, {"player 1": [], "player 2": []})
    per_player[row["player"]].append({"model": row["model"], "index": row_index})
        

In [172]:
# D40001: downsample by picking at random
# D40002: downsample considering the top 10 models; the rest is picked at random

In [173]:
import random

# Maximum number of player 1 / player 2 pairs kept per experiment+episode key.
max_size = 2
# Dedicated, seeded generator so that re-running the notebook selects the same rows.
rng = random.Random(42)

# NOTE: this loop rewrites data_lookup in place: the lists of
# {"model", "index"} dicts become lists of bare row index labels. data_lookup
# therefore has to be rebuilt before this cell can be run a second time.
for key in data_lookup:
    p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']
    assert len(p1) == len(p2), f"unequal number of player 1 / player 2 rows for {key}"
    # NOTE(review): entries are paired by list position, which presumes that
    # p1[i] and p2[i] belong to the same game -- confirm against the data.

    if len(p1) > max_size:
        # Draw max_size distinct positions from the *whole* list. The previous
        # bound, range(0, len(p1) - 1), excluded the last position, so the
        # final pair of every key could never be selected.
        indexes = rng.sample(range(len(p1)), max_size)
    else:
        # Few enough pairs: keep all of them in their original order.
        indexes = range(len(p1))

    data_lookup[key]['player 1'] = [p1[i]['index'] for i in indexes]
    data_lookup[key]['player 2'] = [p2[i]['index'] for i in indexes]

In [174]:
# Flatten the per-key selections into a single list of row index labels:
# for every key first the player 1 rows, then the player 2 rows.
all_indices = []
for selection in data_lookup.values():
    first, second = selection["player 1"], selection["player 2"]
    # Both players must contribute the same number of rows, and at most two each.
    assert len(first) == len(second)
    assert len(first) <= 2

    all_indices += first
    all_indices += second

In [175]:
# Pick the selected rows (all columns) by their index labels.
reduced_taboo_data = df_taboo_success_top_score.loc[all_indices, :]

In [176]:
# Number of rows in the reduced taboo data.
reduced_taboo_data.shape[0]

220

# Reduced Referencegame Data

In [179]:
# Count how often each episode occurs within every reference game experiment.
df_referencegame_success.groupby(["experiment"])["episode"].value_counts()

experiment         episode   
0_line_grids_rows  episode_0     162
                   episode_24    160
                   episode_26    148
                   episode_3     132
                   episode_21    126
                                ... 
5_random_grids     episode_26     78
                   episode_25     76
                   episode_10     74
                   episode_22     72
                   episode_11     64
Name: count, Length: 180, dtype: int64

In [181]:
# Display the successful reference game rows ordered by experiment, episode,
# model and player. The sorted frame is only shown here; it is not stored.
df_referencegame_success.sort_values(["experiment", "episode", "model", "player"])

Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat,target,main_score,request_count,request_ratio,average_expression_tokens,player
7200,referencegame,v1.5_quantized,0,CapybaraHermes-2.5-Mistral-7B-GGUF-q4-t0.0--Ca...,0_line_grids_rows,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,6.0,player 1
30240,referencegame,v1.5_quantized,0,CapybaraHermes-2.5-Mistral-7B-GGUF-q4-t0.0--Ca...,0_line_grids_rows,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,6.0,player 2
7380,referencegame,v1.5_quantized,0,CapybaraHermes-2.5-Mistral-7B-GGUF-q5-k-s-t0.0...,0_line_grids_rows,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,10.0,player 1
30420,referencegame,v1.5_quantized,0,CapybaraHermes-2.5-Mistral-7B-GGUF-q5-k-s-t0.0...,0_line_grids_rows,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,10.0,player 2
7560,referencegame,v1.5_quantized,0,CapybaraHermes-2.5-Mistral-7B-GGUF-q5-t0.0--Ca...,0_line_grids_rows,episode_0,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,12.0,player 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43019,referencegame,v1.6,9,openchat_3.5-t0.0--openchat_3.5-t0.0,5_random_grids,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are given th...","[first, 1st, 1]",100.0,2,1.0,14.0,player 2
6659,referencegame,v1.5,9,sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-l...,5_random_grids,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,29.0,player 1
20339,referencegame,v1.6,9,sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-l...,5_random_grids,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are given th...","[first, 1st, 1]",100.0,2,1.0,29.0,player 1
29699,referencegame,v1.5,9,sheep-duck-llama-2-70b-v1.1-t0.0--sheep-duck-l...,5_random_grids,episode_9,0,0,1,"[{'role': 'user', 'content': 'You are given th...",first,100.0,2,1.0,29.0,player 2


In [186]:
# Group the successful reference game rows by experiment+episode and, inside
# each group, by player. Every entry stores the model name and the row's index label.
data_lookup: dict = {}
for row_index, row in df_referencegame_success.iterrows():
    key = row["experiment"] + row["episode"]
    entry = {"model": row["model"], "index": row_index}
    # The two empty per-player lists are created the first time a key is seen.
    data_lookup.setdefault(key, {"player 1": [], "player 2": []})[row["player"]].append(entry)

In [187]:
import random

# Maximum number of player 1 / player 2 pairs kept per experiment+episode key.
max_size = 2
# Dedicated, seeded generator so that re-running the notebook selects the same rows.
rng = random.Random(42)

# NOTE: this loop rewrites data_lookup in place: the lists of
# {"model", "index"} dicts become lists of bare row index labels. data_lookup
# therefore has to be rebuilt before this cell can be run a second time.
for key in data_lookup:
    p1, p2 = data_lookup[key]['player 1'], data_lookup[key]['player 2']
    assert len(p1) == len(p2), f"unequal number of player 1 / player 2 rows for {key}"
    # NOTE(review): entries are paired by list position, which presumes that
    # p1[i] and p2[i] belong to the same game -- confirm against the data.

    if len(p1) > max_size:
        # Draw max_size distinct positions from the *whole* list. The previous
        # bound, range(0, len(p1) - 1), excluded the last position, so the
        # final pair of every key could never be selected.
        indexes = rng.sample(range(len(p1)), max_size)
    else:
        # Few enough pairs: keep all of them in their original order.
        indexes = range(len(p1))

    data_lookup[key]['player 1'] = [p1[i]['index'] for i in indexes]
    data_lookup[key]['player 2'] = [p2[i]['index'] for i in indexes]

In [188]:
# Flatten the per-key selections into a single list of row index labels:
# for every key first the player 1 rows, then the player 2 rows.
all_indices = []
for selection in data_lookup.values():
    first, second = selection["player 1"], selection["player 2"]
    # Both players must contribute the same number of rows, and at most two each.
    assert len(first) == len(second)
    assert len(first) <= 2

    all_indices += first
    all_indices += second

In [189]:
# Pick the selected rows (all columns) by their index labels.
reduced_reference_game_data = df_referencegame_success.loc[all_indices, :]

In [190]:
# Number of rows in the reduced reference game data.
reduced_reference_game_data.shape[0]

720