In [2]:
import json
import os

import pandas as pd

# Folder that holds raw.csv and the per-model result folders read further below.
benchmark_results_dir: str = '../../benchmark_results'

path_data: str = f'{benchmark_results_dir}/raw.csv'
raw_results = pd.read_csv(path_data)

# only consider these models for eval
# (runs without the "_REV" suffix, plus the "4bit-plain" model)
models_to_consider_no_rev: list = ['Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0',
                            'llama3.1-sft-e1-_E1_D10001-t0.0--llama3.1-sft-e1-_E1_D10001-t0.0',
                            'llama3.1-sft-e1-_E1_D20001-t0.0--llama3.1-sft-e1-_E1_D20001-t0.0',
                            'llama3.1-sft-e1-_E1_D20002-t0.0--llama3.1-sft-e1-_E1_D20002-t0.0',
                            'llama3.1-sft-e1-_E1_D20003-t0.0--llama3.1-sft-e1-_E1_D20003-t0.0',
                            'llama3.1-sft-e1-_E1_D30001-t0.0--llama3.1-sft-e1-_E1_D30001-t0.0',
                            'llama3.1-sft-e1-_E1_D30002-t0.0--llama3.1-sft-e1-_E1_D30002-t0.0',
                            'llama3.1-sft-e1-_E1_D30003-t0.0--llama3.1-sft-e1-_E1_D30003-t0.0',
                            'llama3.1-sft-e1-_E1_D30004-t0.0--llama3.1-sft-e1-_E1_D30004-t0.0']

# Entries of the results folder whose name contains "_REV" or "4bit-plain".
# os.listdir returns entries in arbitrary order, so sort for a reproducible list.
directories = sorted(os.listdir(benchmark_results_dir))
directories = [r for r in directories if '_REV' in r or '4bit-plain' in r]
models_to_consider_rev = directories

# the models to consider
# (assign models_to_consider_no_rev here instead to evaluate the non-REV runs)
models_to_consider = models_to_consider_rev

In [3]:
# Columns retained from the pivoted results table.
columns_to_keep_raw_csv: list = [
    # columns that identify one episode
    'game', 'model', 'experiment', 'episode',
    # per-episode outcome metrics
    'Aborted', 'Lose', 'Success',
]

def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    """Pivot long-format results into one row per episode and keep selected columns.

    Args:
        data: Long-format frame containing the columns 'game', 'model',
            'experiment', 'episode', 'metric' and 'value'.
        columns_to_keep: Names of the columns to retain after pivoting. Names
            that do not exist in the pivoted frame are ignored.

    Returns:
        A new frame with one row per (game, model, experiment, episode) and one
        column per retained name; each distinct 'metric' value becomes a column
        filled from 'value'.
    """
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in df.columns if column not in columns_to_keep]
    # `columns=` already selects the column axis; no contradictory `axis` argument is passed.
    return df.drop(columns=columns_to_drop)

# One row per episode, restricted to the identifier and outcome columns.
clean_csv_data: pd.DataFrame = group_raw_csv(raw_results, columns_to_keep_raw_csv)

In [4]:
# Preview the first five rows of the pivoted table.
clean_csv_data.head(n=5)

metric,game,model,experiment,episode,Aborted,Lose,Success
0,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_0,1.0,0.0,0.0
1,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_1,1.0,0.0,0.0
2,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_10,1.0,0.0,0.0
3,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_11,1.0,0.0,0.0
4,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_12,1.0,0.0,0.0


In [5]:
# Successful taboo episodes played by one of the selected models.
is_taboo = clean_csv_data['game'] == 'taboo'
is_success = clean_csv_data['Success'] == 1.0
is_selected_model = clean_csv_data['model'].isin(models_to_consider)
taboo_successes = clean_csv_data[is_taboo & is_success & is_selected_model]

In [6]:
def prepare_instance_data(path: str) -> dict:
    """Read a JSON file and return its parsed content.

    Args:
        path: Location of the JSON file to load.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [7]:
# Maps each model name to a list holding, per successful taboo episode,
# the number of entries under 'turn scores' in that episode's scores.json.
results = {}

for _, row in taboo_successes.iterrows():
    scores_path: str = f"../../benchmark_results/{row['model']}/{row['game']}/{row['experiment']}/{row['episode']}/scores.json"
    episode_scores = prepare_instance_data(scores_path)
    turn_count = len(episode_scores['turn scores'].keys())
    # setdefault creates the per-model list the first time a model is seen.
    results.setdefault(row['model'], []).append(turn_count)

In [8]:
# Per model: mean turn count over its successful episodes / number of such episodes.
for model_name, turn_counts in results.items():
    mean_turns = sum(turn_counts) / len(turn_counts)
    print(f'{model_name:<40}: {mean_turns} / {len(turn_counts)}')

Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0: 1.5333333333333334 / 30
llama3.1-sft-e1-_E1_D10001_REV-t0.0--llama3.1-sft-e1-_E1_D10001_REV-t0.0: 1.4516129032258065 / 31
llama3.1-sft-e1-_E1_D20001_REV-t0.0--llama3.1-sft-e1-_E1_D20001_REV-t0.0: 1.2941176470588236 / 34
llama3.1-sft-e1-_E1_D20002_REV-t0.0--llama3.1-sft-e1-_E1_D20002_REV-t0.0: 1.3428571428571427 / 35
llama3.1-sft-e1-_E1_D20003_REV-t0.0--llama3.1-sft-e1-_E1_D20003_REV-t0.0: 1.5357142857142858 / 28
llama3.1-sft-e1-_E1_D30002_REV-t0.0--llama3.1-sft-e1-_E1_D30002_REV-t0.0: 1.575 / 40
llama3.1-sft-e1-_E1_D30003_REV-t0.0--llama3.1-sft-e1-_E1_D30003_REV-t0.0: 1.5116279069767442 / 43
llama3.1-sft-e1-_E1_D30004_REV-t0.0--llama3.1-sft-e1-_E1_D30004_REV-t0.0: 1.4193548387096775 / 31
llama3.1-sft-e1-_E1_D50001_REV-t0.0--llama3.1-sft-e1-_E1_D50001_REV-t0.0: 1.5862068965517242 / 29
llama3.1-sft-e1-_E1_D50002_REV-t0.0--llama3.1-sft-e1-_E1_D50002_REV-t0.0: 1.5185185185185186 / 27
llama3.1-sft-e1-_E1_D50003_

In [22]:
# Per model: mean turn count over its successful episodes / number of such episodes.
# NOTE(review): the saved output of this cell lists the non-"_REV" model names, so it
# appears to come from a run with models_to_consider = models_to_consider_no_rev — confirm.
for model_name, turn_counts in results.items():
    mean_turns = sum(turn_counts) / len(turn_counts)
    print(f'{model_name:<40}: {mean_turns} / {len(turn_counts)}')

Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0: 1.5333333333333334 / 30
llama3.1-sft-e1-_E1_D10001-t0.0--llama3.1-sft-e1-_E1_D10001-t0.0: 1.5 / 40
llama3.1-sft-e1-_E1_D20001-t0.0--llama3.1-sft-e1-_E1_D20001-t0.0: 1.3902439024390243 / 41
llama3.1-sft-e1-_E1_D20002-t0.0--llama3.1-sft-e1-_E1_D20002-t0.0: 1.4634146341463414 / 41
llama3.1-sft-e1-_E1_D20003-t0.0--llama3.1-sft-e1-_E1_D20003-t0.0: 1.2972972972972974 / 37
llama3.1-sft-e1-_E1_D30001-t0.0--llama3.1-sft-e1-_E1_D30001-t0.0: 1.40625 / 32
llama3.1-sft-e1-_E1_D30002-t0.0--llama3.1-sft-e1-_E1_D30002-t0.0: 1.6216216216216217 / 37
llama3.1-sft-e1-_E1_D30003-t0.0--llama3.1-sft-e1-_E1_D30003-t0.0: 1.5945945945945945 / 37
llama3.1-sft-e1-_E1_D30004-t0.0--llama3.1-sft-e1-_E1_D30004-t0.0: 1.34375 / 32


## Finding:
Compared with the 4bit-plain baseline, the average number of turns until a game is successfully ended slightly decreased for most models, while for two it slightly increased.

The overall number of successful games increased for all fine-tuned models.

In [23]:
# Mean of the per-model average turn counts, skipping every model whose key starts
# with 'U' (in the saved outputs this is only the Unsloth 4bit-plain entry).
# The divisor is the number of models actually averaged, not len(results) - 1,
# so the value stays correct when zero or several keys start with 'U'.
non_u_means = [sum(turns) / len(turns) for model, turns in results.items() if not model.startswith('U')]
print(sum(non_u_means) / len(non_u_means) if non_u_means else float('nan'))


1.45214650626236
