In [21]:
import pandas as pd
import json
import os
path_data: str = '../../benchmark_results/raw.csv'
raw_results = pd.read_csv(path_data)

# the game to observe
game = 'wordle_withclue'

# only consider these models for eval
models_to_consider_no_rev: list = ['Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0',
                            'llama3.1-sft-e1-_E1_D10001-t0.0--llama3.1-sft-e1-_E1_D10001-t0.0',
                            'llama3.1-sft-e1-_E1_D20001-t0.0--llama3.1-sft-e1-_E1_D20001-t0.0',
                            'llama3.1-sft-e1-_E1_D20002-t0.0--llama3.1-sft-e1-_E1_D20002-t0.0',
                            'llama3.1-sft-e1-_E1_D20003-t0.0--llama3.1-sft-e1-_E1_D20003-t0.0',
                            'llama3.1-sft-e1-_E1_D30001-t0.0--llama3.1-sft-e1-_E1_D30001-t0.0',
                            'llama3.1-sft-e1-_E1_D30002-t0.0--llama3.1-sft-e1-_E1_D30002-t0.0',
                            'llama3.1-sft-e1-_E1_D30003-t0.0--llama3.1-sft-e1-_E1_D30003-t0.0',
                            'llama3.1-sft-e1-_E1_D30004-t0.0--llama3.1-sft-e1-_E1_D30004-t0.0']

directories = os.listdir("../../benchmark_results")
directories = [r for r in directories if '_REV' in r or '4bit-plain' in r]
models_to_consider_rev = directories

# the models to consider
models_to_consider = models_to_consider_rev

In [22]:
columns_to_keep_raw_csv: list = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']

def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    df: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in list(df.keys()) if column not in columns_to_keep]
    return df.drop(columns=columns_to_drop, axis=0)

clean_csv_data: pd.DataFrame = group_raw_csv(data=raw_results, columns_to_keep=columns_to_keep_raw_csv)

In [23]:
taboo_successes = clean_csv_data[(clean_csv_data.game == game) & (clean_csv_data.Success == 1.0) & (clean_csv_data.model.isin(models_to_consider))]

In [24]:
def prepare_instance_data(path: str) -> dict:
    with open(path, 'r') as f:
        data: dict = json.load(f)
        return data

In [25]:
results = {}

for idx, row in taboo_successes.iterrows():
    path: str = f"../../benchmark_results/{row['model']}/{row['game']}/{row['experiment']}/{row['episode']}/scores.json"
    data = prepare_instance_data(path)
    if row['model'] in results.keys():
        results[row['model']].append(len(data['turn scores'].keys()))
    else:
        results[row['model']] = [len(data['turn scores'].keys())]

In [26]:
for key in results.keys():
    print(f'{key:<40}: {sum(results[key])/len(results[key])} / {len(results[key])}')

Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0: 3.3333333333333335 / 6
llama3.1-sft-e1-_E1_D10001_REV-t0.0--llama3.1-sft-e1-_E1_D10001_REV-t0.0: 1.5 / 6
llama3.1-sft-e1-_E1_D20001_REV-t0.0--llama3.1-sft-e1-_E1_D20001_REV-t0.0: 1.7 / 10
llama3.1-sft-e1-_E1_D20002_REV-t0.0--llama3.1-sft-e1-_E1_D20002_REV-t0.0: 1.625 / 8
llama3.1-sft-e1-_E1_D20003_REV-t0.0--llama3.1-sft-e1-_E1_D20003_REV-t0.0: 1.25 / 4
llama3.1-sft-e1-_E1_D30002_REV-t0.0--llama3.1-sft-e1-_E1_D30002_REV-t0.0: 2.0 / 7
llama3.1-sft-e1-_E1_D30003_REV-t0.0--llama3.1-sft-e1-_E1_D30003_REV-t0.0: 2.111111111111111 / 9
llama3.1-sft-e1-_E1_D30004_REV-t0.0--llama3.1-sft-e1-_E1_D30004_REV-t0.0: 2.125 / 8
llama3.1-sft-e1-_E1_D50001_REV-t0.0--llama3.1-sft-e1-_E1_D50001_REV-t0.0: 2.0 / 4
llama3.1-sft-e1-_E1_D50002_REV-t0.0--llama3.1-sft-e1-_E1_D50002_REV-t0.0: 2.375 / 8
llama3.1-sft-e1-_E1_D50003_REV-t0.0--llama3.1-sft-e1-_E1_D50003_REV-t0.0: 1.5 / 6
llama3.1-sft-e1-_E1_D50004_REV-t0.0--llama3.1-sft-e1-_E

## Finding:
## Wordle
only two models show moinor decrease of average turn length

## Wordle withclue
all models show a decrease in average turn length. some of them more than one turn less

## wordle withcritic
some models show a minor decrease in the average turn length while soe show a slight increase

In [48]:
print(sum([sum(results[key])/len(results[key]) for key in results.keys() if not key.startswith('U')])/(len(results)))


2.1568783068783066
