In [1]:
import json

import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.patches as mpatches
import numpy as np

In [5]:
# Raw benchmark results in long format, located relative to the notebook's working directory.
path_raw_results: str = '../../benchmark_results/raw.csv'
# NOTE(review): `df` is reassigned further down when the per-game outcome table is built,
# so re-running the grouping cell after that point would pivot the wrong frame.
df: pd.DataFrame = pd.read_csv(path_raw_results)

# Full model-pair identifier. It is not referenced again in the visible cells, which
# repeat the same literal instead — presumably meant as the shared constant; confirm.
base_model_name: str = 'Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0'

## Group Data

In [6]:
# Episode identifier columns plus the three outcome metrics that survive the pivot.
columns_to_keep_raw_csv: list[str] = ['game', 'model', 'experiment', 'episode', 'Aborted', 'Lose', 'Success']

def group_raw_csv(data: pd.DataFrame, columns_to_keep: list[str]) -> pd.DataFrame:
    """Pivot long-format metric rows into one row per episode.

    Args:
        data: Long-format frame with at least the columns ``game``, ``model``,
            ``experiment``, ``episode``, ``metric`` and ``value``.
        columns_to_keep: Names of the columns (identifiers and metrics) to
            retain in the result; every other column is dropped. Names that
            do not occur in the pivoted frame are ignored.

    Returns:
        A new frame with one row per (game, model, experiment, episode)
        combination and one column per retained metric. Several values for
        the same metric of one episode are combined by ``pivot_table``'s
        default aggregation.
    """
    # Use a local name that does not shadow the notebook-level ``df``.
    pivoted: pd.DataFrame = data.pivot_table(
        index=['game', 'model', 'experiment', 'episode'],
        columns=['metric'],
        values='value'
    ).reset_index()

    columns_to_drop: list = [column for column in pivoted.columns if column not in columns_to_keep]
    # Drop by column label only; an ``axis`` argument has no effect when
    # ``columns=`` is given, so it is not passed.
    return pivoted.drop(columns=columns_to_drop)

# One row per episode, reduced to the identifier and outcome columns listed above.
clean_csv_data: pd.DataFrame = group_raw_csv(data=df, columns_to_keep=columns_to_keep_raw_csv)

## Utils

In [7]:
# this works for taboo, and all wordle games
def get_term_from_episode(row):
    """Return the target word of the episode described by ``row``.

    The word is read from the ``target_word`` entry of the episode's
    ``instance.json`` located at
    ``../../benchmark_results/<model>/<game>/<experiment>/<episode>/``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``model``, ``game``, ``experiment`` and ``episode``.

    Returns:
        The value stored under ``target_word`` in the instance file.

    Raises:
        FileNotFoundError: If the instance file does not exist.
        KeyError: If ``row`` lacks a required key or the file has no
            ``target_word`` entry.
    """
    model_name = row['model']
    game = row['game']
    experiment = row['experiment']
    episode = row['episode']

    path: str = f'../../benchmark_results/{model_name}/{game}/{experiment}/{episode}/instance.json'
    # The context manager closes the handle right away; this function is called
    # once per episode, so leaving files open would accumulate descriptors.
    with open(path, encoding='utf-8') as instance_file:
        instance: dict = json.load(instance_file)
    return instance['target_word']

# this works for image game and referencegame and private shared
def get_generic_turn_name(row):
    """Build a ``<experiment>-<episode number>`` label for an episode.

    The first two characters of the experiment name are cut off and the part
    of the episode name after its first underscore is appended, e.g.
    ``0_line_grids_rows`` / ``episode_12`` becomes ``line_grids_rows-12``.

    Args:
        row: Mapping-like object providing the keys ``experiment`` and
            ``episode``.

    Returns:
        The combined label as a string.
    """
    experiment_label = row['experiment'][2:]
    episode_number = row['episode'].split("_")[1]
    return "{}-{}".format(experiment_label, episode_number)

def prepare_model_names(raw_names: list) -> list:
    """Shorten raw model-pair identifiers for display.

    For each name, everything from the first ``--`` onwards is discarded,
    and from what remains everything from the first ``-t0`` onwards.

    Args:
        raw_names: List of raw model identifier strings.

    Returns:
        A new list with the shortened names, in the same order.
    """
    short_names = []
    for raw_name in raw_names:
        first_model, _, _ = raw_name.partition("--")
        base_name, _, _ = first_model.partition("-t0")
        short_names.append(base_name)
    return short_names

def prepare_value(row):
    """Collapse an episode's outcome flags into a single label.

    The flags are checked in the order ``Success``, ``Lose``, ``Aborted``;
    the first one equal to ``1.0`` decides the result.

    Args:
        row: Mapping-like object providing the keys ``Success``, ``Lose``
            and ``Aborted``.

    Returns:
        ``'Success'``, ``'Loss'`` or ``'Aborted'``, or ``'Undefined'`` when
        none of the flags equals ``1.0``.
    """
    # (flag column, label) pairs in priority order.
    outcome_labels = (
        ('Success', 'Success'),
        ('Lose', 'Loss'),
        ('Aborted', 'Aborted'),
    )
    for flag, label in outcome_labels:
        if row[flag] == 1.0:
            return label
    return 'Undefined'

def prepare_model_data(model_name, data_frame: pd.DataFrame, get_turn_name) -> dict:
    """Map each episode label of one model to its outcome label.

    Args:
        model_name: Value of the ``model`` column whose rows are selected.
        data_frame: Frame with a ``model`` column plus whatever columns
            ``get_turn_name`` and ``prepare_value`` read from a row.
        get_turn_name: Callable that receives a row and returns the key
            under which the row's outcome is stored.

    Returns:
        A dict ``{turn name: outcome label}``. If two rows yield the same
        turn name, the later row overwrites the earlier one.
    """
    # Local name chosen so the notebook-level ``df`` is not shadowed.
    model_rows: pd.DataFrame = data_frame[data_frame.model == model_name]
    return {
        get_turn_name(row): prepare_value(row)
        for _, row in model_rows.iterrows()
    }


def reorder_columns(df: pd.DataFrame) -> tuple[list, list]:
    """Compute a column order grouped by the outcome found in the first row.

    Columns are emitted in six consecutive groups, each keeping the original
    relative order:

    1. columns containing only ``'Aborted'``
    2. other columns whose first value is ``'Aborted'``
    3. columns containing only ``'Loss'``
    4. other columns whose first value is ``'Loss'``
    5. columns containing only ``'Success'``
    6. other columns whose first value is ``'Success'``

    Args:
        df: Frame whose cells hold outcome labels.

    Returns:
        ``(rearranged_columns, original_columns)`` — two lists of column
        names; the frame itself is not modified.

    Raises:
        AssertionError: If some column's first value is none of the three
            labels above, so that it cannot be placed in any group.
    """
    original_columns = df.columns.tolist()
    remaining_columns = list(original_columns)
    rearranged_columns = []

    for outcome in ('Aborted', 'Loss', 'Success'):
        # First the columns that hold this outcome exclusively, then the
        # ones that merely start with it.
        for uniform_only in (True, False):
            selected = [
                column for column in remaining_columns
                if (not uniform_only or len(df[column].unique()) == 1)
                and df[column].iloc[0] == outcome
            ]
            rearranged_columns.extend(selected)
            remaining_columns = [
                column for column in remaining_columns if column not in selected
            ]

    assert len(rearranged_columns) == len(original_columns), (
        f'columns with an unexpected first value: {remaining_columns}'
    )

    return rearranged_columns, original_columns

In [8]:
# list of all games, sorted so the order is the same on every kernel restart
# (plain set iteration order of strings varies between Python processes)
games = sorted(set(clean_csv_data.game))

# only consider these models for eval
models_to_consider: list = ['Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0',]

# Per game: the function that turns a result row into the episode label.
# Word games use the target word read from the instance file, the others a
# generic "<experiment>-<episode number>" label.
turn_extraction_lookup = {
    'wordle': get_term_from_episode,
    'wordle_withclue': get_term_from_episode,
    'wordle_withcritic': get_term_from_episode,
    'taboo': get_term_from_episode,
    'imagegame': get_generic_turn_name,
    'referencegame': get_generic_turn_name,
    'privateshared': get_generic_turn_name,
}

# Per-game height values; presumably figure heights for plots produced
# elsewhere in the notebook — not used in the cells shown here, confirm.
custom_height_lookup = {
    'wordle': 3,
    'wordle_withclue': 3,
    'wordle_withcritic': 3,
    'taboo': 3.7,
    'imagegame': 3.8,
    'referencegame': 5,
    'privateshared': 3.7,
}

# Per-game width values; presumably the matching figure widths — confirm.
custom_width_lookup = {
    'wordle': 11,
    'wordle_withclue': 11,
    'wordle_withcritic': 11,
    'taboo': 15,
    'imagegame': 11,
    'referencegame': 30,
    'privateshared': 15,
}

In [30]:
# Build a per-episode outcome table for one game and one model.
game = 'referencegame'
model_name: str = 'Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0'
game_data = clean_csv_data[clean_csv_data.game == game]
# Dict mapping episode label -> outcome label for the selected model.
prep_data = prepare_model_data(model_name, game_data, turn_extraction_lookup[game])
short_name = prepare_model_names([model_name])
plot_data = {short_name[0]: prep_data}
# NOTE(review): this rebinds `df`, which held the raw CSV until here; re-running
# the grouping cell after this one would therefore operate on this table instead.
df = pd.DataFrame(plot_data)
# The index holds the episode labels; expose them as a regular column.
df["experiment"] = df.index
# Part of the label before the first "-".
df["raw_exp"] = df.experiment.map(lambda x: x.split("-")[0])
df.head()

Unnamed: 0,Unsloth-meta-llama-3.1-4bit-plain,experiment,raw_exp
line_grids_rows-0,Success,line_grids_rows-0,line_grids_rows
line_grids_rows-1,Loss,line_grids_rows-1,line_grids_rows
line_grids_rows-10,Loss,line_grids_rows-10,line_grids_rows
line_grids_rows-11,Loss,line_grids_rows-11,line_grids_rows
line_grids_rows-12,Success,line_grids_rows-12,line_grids_rows


In [41]:
# How many episodes to draw per experiment for each outcome.
N_POSITIVE_SAMPLES: int = 2
N_NEGATIVE_SAMPLES: int = 4
# A single seeded generator for the whole cell: re-running the cell reproduces
# the same selection, while the individual draws still differ from each other.
sample_rng = np.random.default_rng(42)

# Column holding the outcome labels of the model under study.
outcome_column: str = short_name[0]

positive_episodes: dict = {}
negative_episodes: dict = {}

unique_experiments = df.raw_exp.unique()

for experiment in unique_experiments:
    exp_df = df[df.raw_exp == experiment]
    positive_df = exp_df[exp_df[outcome_column] == 'Success']
    negative_df = exp_df[exp_df[outcome_column] == 'Loss']

    # Draw random successful and lost episodes; `sample` raises ValueError when
    # an experiment has fewer matching episodes than requested.
    p = positive_df.sample(N_POSITIVE_SAMPLES, random_state=sample_rng).reset_index(drop=True)
    n = negative_df.sample(N_NEGATIVE_SAMPLES, random_state=sample_rng).reset_index(drop=True)

    # Labels look like "<experiment>-<number>"; turn them back into episode names.
    positive_episodes[experiment] = ['episode_' + t.split('-')[-1] for t in list(p.experiment)]
    negative_episodes[experiment] = ['episode_' + t.split('-')[-1] for t in list(n.experiment)]



In [42]:
# Sampled episodes with outcome 'Success', keyed by experiment.
positive_episodes

{'line_grids_rows': ['episode_25', 'episode_18'],
 'line_grids_columns': ['episode_10', 'episode_0'],
 'diagonal_grids': ['episode_27', 'episode_24'],
 'letter_grids': ['episode_8', 'episode_11'],
 'shape_grids': ['episode_15', 'episode_24'],
 'random_grids': ['episode_28', 'episode_0']}

In [43]:
# Sampled episodes with outcome 'Loss', keyed by experiment.
negative_episodes

{'line_grids_rows': ['episode_29', 'episode_7', 'episode_10', 'episode_15'],
 'line_grids_columns': ['episode_15', 'episode_5', 'episode_3', 'episode_22'],
 'diagonal_grids': ['episode_5', 'episode_29', 'episode_17', 'episode_8'],
 'letter_grids': ['episode_29', 'episode_22', 'episode_24', 'episode_13'],
 'shape_grids': ['episode_13', 'episode_9', 'episode_17', 'episode_1'],
 'random_grids': ['episode_23', 'episode_17', 'episode_1', 'episode_20']}