In [1]:
import pandas as pd
import json
# Location of the raw benchmark results CSV, given relative to the notebook's working directory.
path_data: str = '../../benchmark_results/raw.csv'
# Raw results table. group_raw_csv pivots it on the columns
# game, model, experiment, episode, metric and value, so those must be present.
raw_results: pd.DataFrame = pd.read_csv(path_data)


# Only these models are considered for the evaluation.
# Every identifier has the form '<name>-t0.0--<name>-t0.0' (the same name on both
# sides of '--'), so the list is generated from the bare model names.
models_to_consider: list = [
    f'{name}-t0.0--{name}-t0.0'
    for name in (
        'Unsloth-meta-llama-3.1-4bit-plain',
        *(
            f'llama3.1-sft-e1-_E1_{tag}'
            for tag in (
                'D10001',
                'D20001', 'D20002', 'D20003',
                'D30001', 'D30002', 'D30003', 'D30004',
            )
        ),
    )
]

In [2]:
# Columns retained after pivoting: the four pivot index columns
# (game, model, experiment, episode) plus the Aborted / Lose / Success metric columns.
columns_to_keep_raw_csv: list = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']

def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    """Pivot long-format results into one row per episode and keep selected columns.

    Args:
        data: Frame with the columns ``game``, ``model``, ``experiment``,
            ``episode``, ``metric`` and ``value``.
        columns_to_keep: Names of the columns (index columns and/or metric
            names) that should remain in the result.

    Returns:
        A frame with one row per (game, model, experiment, episode) combination
        and one column per retained metric. ``pivot_table`` aggregates with its
        default function (mean) if a combination has several values for the
        same metric.
    """
    episode_keys = ['game', 'model', 'experiment', 'episode']
    wide = data.pivot_table(index=episode_keys, columns=['metric'], values='value')
    # Turn the episode keys back into ordinary columns.
    flat = wide.reset_index()

    # Everything that was not explicitly requested is discarded.
    unwanted = [name for name in flat.columns if name not in columns_to_keep]
    return flat.drop(columns=unwanted)

# Pivot the raw results and keep only the columns named in columns_to_keep_raw_csv.
clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_results, columns_to_keep=columns_to_keep_raw_csv)

In [3]:
# Preview the first rows of the pivoted frame (one row per game/model/experiment/episode).
clean_csv_data.head()

metric,game,model,experiment,episode,Aborted,Lose,Success
0,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_0,1.0,0.0,0.0
1,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_1,1.0,0.0,0.0
2,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_10,1.0,0.0,0.0
3,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_11,1.0,0.0,0.0
4,imagegame,ClemBench-SFT-Nicola-Test-t0.0--ClemBench-SFT-...,0_compact_grids,episode_12,1.0,0.0,0.0


In [12]:
# imagegame episodes of the selected models that were NOT aborted (Aborted == 0.0).
# Despite the name, the filter does not look at Success/Lose, so lost episodes remain too.
imagegame_successes = clean_csv_data.loc[
    clean_csv_data['game'].eq('imagegame')
    & clean_csv_data['Aborted'].eq(0.0)
    & clean_csv_data['model'].isin(models_to_consider)
]
# Drop the '1_random_grids' experiment from the comparison.
immage_game_no_random_grids = imagegame_successes.loc[
    imagegame_successes['experiment'].ne('1_random_grids')
]

In [13]:
def prepare_instance_data(path: str) -> dict:
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (the callers in this notebook expect a dict).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, which can mis-decode non-ASCII content on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [14]:
# Maps each model name to a list with one entry per episode: the number of keys
# in that episode's 'turn scores' mapping (presumably one key per turn — confirm).
results = {}

for _row_label, row in immage_game_no_random_grids.iterrows():
    # Every episode has its own scores.json below the model/game/experiment directories.
    path: str = f"../../benchmark_results/{row['model']}/{row['game']}/{row['experiment']}/{row['episode']}/scores.json"
    data = prepare_instance_data(path)
    # setdefault creates the per-model list on first sight of a model.
    results.setdefault(row['model'], []).append(len(data['turn scores'].keys()))

In [15]:
# One line per model: "<model>: <mean entries per episode> / <number of episodes>".
for model_name, turn_counts in results.items():
    episode_count = len(turn_counts)
    mean_turns = sum(turn_counts) / episode_count
    print(f'{model_name:<40}: {mean_turns} / {episode_count}')

Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0: 14.789473684210526 / 19
llama3.1-sft-e1-_E1_D10001-t0.0--llama3.1-sft-e1-_E1_D10001-t0.0: 3.5 / 4
llama3.1-sft-e1-_E1_D20002-t0.0--llama3.1-sft-e1-_E1_D20002-t0.0: 4.888888888888889 / 18
llama3.1-sft-e1-_E1_D20003-t0.0--llama3.1-sft-e1-_E1_D20003-t0.0: 7.625 / 16
llama3.1-sft-e1-_E1_D30001-t0.0--llama3.1-sft-e1-_E1_D30001-t0.0: 8.166666666666666 / 18
llama3.1-sft-e1-_E1_D30002-t0.0--llama3.1-sft-e1-_E1_D30002-t0.0: 6.666666666666667 / 9
llama3.1-sft-e1-_E1_D30003-t0.0--llama3.1-sft-e1-_E1_D30003-t0.0: 5.421052631578948 / 19
llama3.1-sft-e1-_E1_D30004-t0.0--llama3.1-sft-e1-_E1_D30004-t0.0: 7.631578947368421 / 19


In [18]:
# Mean of the per-model averages over all models except the baseline
# (the baseline is the only considered model whose name starts with 'U').
# The denominator is the number of models actually averaged, so the result stays
# correct even if the baseline is missing from `results` (e.g. all its episodes
# were aborted) or more than one model name starts with 'U'.
sft_model_means = [
    sum(turn_counts) / len(turn_counts)
    for model_name, turn_counts in results.items()
    if not model_name.startswith('U')
]
print(sum(sft_model_means) / len(sft_model_means))

6.271407685881371


## Finding:
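The average number of turns per non-aborted imagegame episode (excluding the `1_random_grids` experiment) decreased drastically after fine-tuning: from about 14.8 for the baseline model to between 3.5 and 8.2 for the fine-tuned models (mean of the per-model averages ≈ 6.3).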