In [42]:
import json
import pandas as pd

# Root directory that holds the benchmark results to analyze.
base_path_results: str = '/Users/nicolahorst/Documents/GitHub/clem-project_playpen/benchmark_results/llama70B'

# Raw per-metric CSV, expected directly inside the results directory.
path_raw_csv: str = f"{base_path_results}/raw.csv"

# Identifier of the model to analyze; it is compared against the `model` column.
model_name: str = "llama-3.1-70B-DPO_dialogue-t0.0--llama-3.1-70B-DPO_dialogue-t0.0"

# Games for which the abort reason can be read automatically from interactions.json.
automatic_error_retrieval: list = [
    "guesswhat",
    "wordle_withclue",
    "wordle_withcritic",
    "wordle",
    "adventuregame",
    "codenames",
]

# Games for which the abort reason has to be looked up manually in the transcripts.
manual_error_retrieval: list = [
    "matchit_ascii",
    "textmapworld",
    "textmapworld_graphreasoning",
    "textmapworld_specificroom",
    "imagegame",
    "privateshared",
    "taboo",
    "referencegame",
]

# Identifier columns of the pivoted frame; every other column is treated as a metric.
std_cols: list = ["game", "model", "experiment", "episode"]



In [43]:
def group_raw_csv(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot the long-format raw results into one row per episode.

    Args:
        df: Long-format frame with the columns ``game``, ``model``,
            ``experiment``, ``episode``, ``metric`` and ``value``.
            The frame is not modified.

    Returns:
        A new frame with one row per (game, model, experiment, episode)
        combination and one column per distinct metric. Metric cells hold
        the values as strings; a metric that is absent for an episode is
        missing in that row.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when the same
            (game, model, experiment, episode, metric) combination occurs
            more than once.
    """
    # Cast on a copy: assigning to df.metric / df.value directly would
    # silently change the dtypes of the caller's frame.
    as_strings = df.assign(
        metric=df["metric"].astype('string'),
        value=df["value"].astype('string'),
    )
    df_pivoted = as_strings.pivot(index=['game', 'model', 'experiment', 'episode'],
                                  columns='metric',
                                  values='value')

    # Turn the four index levels back into regular columns.
    return df_pivoted.reset_index()

def collect_errors(data: pd.DataFrame, base_dir: str) -> dict:
    """Count the error messages stored at the end of each episode's interactions.

    For every row of ``data`` the file
    ``<base_dir>/<model>/<game>/<experiment>/<episode>/interactions.json``
    is read and the last entry of its ``"turns"`` list is inspected.

    Args:
        data: Frame with the string columns ``model``, ``game``,
            ``experiment`` and ``episode``; one row per episode.
        base_dir: Root directory of the results tree.

    Returns:
        A nested dict ``{action_type: {error_type: count}}``. For codenames
        the outer key is the ``player`` field and the inner key the ``type``
        field of the last event's action content; for the other supported
        games the outer key is the last event's action type and the inner
        key the second-to-last event's action content. Rows of any other
        game are skipped.

    Raises:
        OSError: If an ``interactions.json`` file cannot be opened.
        KeyError, IndexError: If a file does not have the expected layout.
    """
    # Games that share the "last two events of the last turn" layout.
    last_turn_games = ("guesswhat", "wordle_withclue", "wordle_withcritic", "wordle", "adventuregame")

    errors: dict = {}
    for _, row in data.iterrows():
        interactions_path = "/".join(
            [base_dir, row["model"], row["game"], row["experiment"], row["episode"], "interactions.json"]
        )
        # Context manager closes the handle deterministically; JSON is UTF-8 by specification.
        with open(interactions_path, encoding="utf-8") as interactions_file:
            interaction_data = json.load(interactions_file)

        action_type = None
        error_type = None

        if row["game"] in last_turn_games:
            last_turn = interaction_data["turns"][-1]
            action_type = last_turn[-1]["action"]["type"]
            error_type = last_turn[-2]["action"]["content"]
        elif row["game"] == "codenames":
            content = interaction_data["turns"][-1][-1]["action"]["content"]
            action_type = content["player"]
            error_type = content["type"]

        # Skip if nothing was found (unsupported game or null entries).
        if action_type is None or error_type is None:
            continue

        per_action = errors.setdefault(action_type, {})
        per_action[error_type] = per_action.get(error_type, 0) + 1
    return errors



# Read the raw long-format results.
raw_data: pd.DataFrame = pd.read_csv(path_raw_csv, index_col=0)

# Pivot so that every episode occupies exactly one row.
df_pivoted = group_raw_csv(raw_data)

# Everything that is not an identifier column is a metric and is made numeric;
# values that cannot be parsed become NaN.
metric_columns = [name for name in df_pivoted.columns if name not in std_cols]
for col in metric_columns:
    df_pivoted[col] = pd.to_numeric(df_pivoted[col], errors='coerce')

# These two conditions do not depend on the game, so build them once.
is_aborted = df_pivoted.Aborted == 1
is_model = df_pivoted.model == model_name

# Print the error counts of the aborted episodes, game by game.
for game in automatic_error_retrieval:
    df_aborted = df_pivoted[is_aborted & is_model & (df_pivoted.game == game)]
    print(game)
    print(collect_errors(df_aborted, base_path_results))


guesswhat
{'invalid content': {'Invalid question. Asking about the part of speech (POS) of the target word is not allowed.': 3}}
wordle_withclue
{'invalid format': {"Guess does not conform to the format rules\nError: The response should contain only the 'guess:' and 'explanation:' keywords and associated information.": 19, "Guess does not conform to the format rules\nError: The response should contain the 'guess:' keyword only once.": 8, 'Guess does not conform to the format rules\nError: The length of the guessed word is not 5.': 1}}
wordle_withcritic
{'invalid format': {"Guess does not conform to the format rules\nError: The response should contain only the 'guess:' and 'explanation:' keywords and associated information.": 7, 'Guess does not conform to the format rules\nError: The length of the guessed word is not 5.': 4, "Guess does not conform to the format rules\nError: The response should contain the 'guess:' keyword only once.": 19}}
wordle
{'invalid format': {"Guess does not co

## Manual error retrieval

In [50]:
import webbrowser

# Games inspected by hand (presumably the ones still to be done); the list is
# not referenced by the visible code below.
todo: list = ["matchit_ascii", "textmapworld", "textmapworld_graphreasoning", "textmapworld_specificroom"]

# Game whose aborted episodes are opened for manual inspection.
game_to_analyze: str = "textmapworld_specificroom"

# Read the raw long-format results.
raw_data: pd.DataFrame = pd.read_csv(path_raw_csv, index_col=0)

# Pivot so that every episode occupies exactly one row.
df_pivoted = group_raw_csv(raw_data)

# Identifier columns stay as they are; metric columns become numeric (NaN if unparsable).
for col in df_pivoted.columns:
    if col in std_cols:
        continue
    df_pivoted[col] = pd.to_numeric(df_pivoted[col], errors='coerce')

# Keep only the aborted episodes of the chosen model and game.
aborted_mask = df_pivoted.Aborted == 1
model_mask = df_pivoted.model == model_name
game_mask = df_pivoted.game == game_to_analyze
df_aborted = df_pivoted[aborted_mask & model_mask & game_mask]

# Collect the path of every aborted episode's HTML transcript.
transcript_html_filenames: list = []
for i, row in df_aborted.iterrows():
    html_file_path = "/".join(
        [base_path_results, row["model"], row["game"], row["experiment"], row["episode"], "transcript.html"]
    )
    transcript_html_filenames.append(html_file_path)

print(len(transcript_html_filenames), "Aborted Episode Files")
# open all transcripts

2 Aborted Episode Files


In [51]:
### Open all transcripts automatically
from pathlib import Path

# new=2 asks webbrowser.open for a new browser tab where the browser supports it.
new = 2
for html in transcript_html_filenames:
    # as_uri() builds a well-formed file URL: it percent-encodes spaces and
    # other reserved characters, which plain "file://" + path does not.
    # absolute() only anchors relative paths; it does not resolve symlinks.
    url = Path(html).absolute().as_uri()
    print(url)
    webbrowser.open(url, new=new)

file:///Users/nicolahorst/Documents/GitHub/clem-project_playpen/benchmark_results/llama70B/llama-3.1-70B-DPO_dialogue-t0.0--llama-3.1-70B-DPO_dialogue-t0.0/textmapworld_specificroom/1_close/episode_8/transcript.html
file:///Users/nicolahorst/Documents/GitHub/clem-project_playpen/benchmark_results/llama70B/llama-3.1-70B-DPO_dialogue-t0.0--llama-3.1-70B-DPO_dialogue-t0.0/textmapworld_specificroom/2_far/episode_6/transcript.html


In [39]:
# Load the results table from the configured results directory instead of
# repeating the absolute path; the first CSV column becomes the index.
df = pd.read_csv(base_path_results + '/results.csv', index_col=0)

In [41]:
# Preview the first rows of the frame loaded from results.csv.
df.head()

Unnamed: 0,"-, clemscore","adventuregame, % Played","adventuregame, Quality Score","adventuregame, Quality Score (std)","all, Average % Played","all, Average Quality Score","codenames, % Played","codenames, Quality Score","codenames, Quality Score (std)","guesswhat, % Played",...,"textmapworld_specificroom, Quality Score (std)","wordle, % Played","wordle, Quality Score","wordle, Quality Score (std)","wordle_withclue, % Played","wordle_withclue, Quality Score","wordle_withclue, Quality Score (std)","wordle_withcritic, % Played","wordle_withcritic, Quality Score","wordle_withcritic, Quality Score (std)"
llama-3.1-70B-DPO_dialogue-t0.0--llama-3.1-70B-DPO_dialogue-t0.0,41.48,85.16,59.38,40.76,63.76,65.06,37.69,38.78,49.23,95.0,...,0.0,0.0,,,6.67,50.0,70.71,0.0,,
llama-3.1-70B-DPO_turn-t0.0--llama-3.1-70B-DPO_turn-t0.0,45.09,86.72,55.21,41.92,77.56,58.13,77.69,29.7,45.92,91.67,...,0.0,86.67,1.54,5.43,0.0,,,0.0,,
