In [1]:
import pandas as pd
import re
import json
import os

In [2]:
save_dir = "../../results/results_tables/"
results_dir = "../../results/clembench_v1.6/"
results_dir2 = "../../results/clembench_v2.0/"
gen_results_dir = "../../results/general_benchmarks/"
results_csv = ["baseline", "D1","D2", "D3", "D5", "D6", "D7"]#, "DM"]
raw_csv = ["baseline", "D1","D2", "D3","D4", "D5", "D6", "D7", "D9_top10"] 
games = ["imagegame", "privateshared", "referencegame", "taboo", "wordle", "wordle_withclue", "wordle_withcritic"]
games_v2 = ["adventuregame", "codenames", "guesswhat", "matchit_ascii", "textmapworld", "textmapworld_graphreasoning", "textmapworld_specificroom"]
tasks = ["mmlu", "social_iqa", "piqa"]
columns = ["model"] + tasks

# epsiodes referencegame which were *not* seen in the training data
ref_training_episodes = {
    '0_line_grids_rows': ['episode_25',  'episode_18',  'episode_29',  'episode_7',  'episode_10',  'episode_15'],
    '1_line_grids_columns': ['episode_10',  'episode_0',  'episode_15',  'episode_5',  'episode_3',  'episode_22'],'2_diagonal_grids': ['episode_27',  'episode_24',  'episode_5',  'episode_29',  'episode_17',  'episode_8'],
    '3_letter_grids': ['episode_8',  'episode_11',  'episode_29',  'episode_22',  'episode_24',  'episode_13'],
    '4_shape_grids': ['episode_15',  'episode_24',  'episode_13',  'episode_9',  'episode_17',  'episode_1'],
    '5_random_grids': ['episode_28',  'episode_0',  'episode_23',  'episode_17',  'episode_1',  'episode_20']}

In [3]:
# adapted from bencheval.py
def save_clem_table(df: pd.DataFrame) -> None:
    """Create benchmark results as a table."""
    #########
    df_aux = df[df['metric'].isin(["Played", "Main Score"])]

    # compute mean benchscore and mean played (which is binary, so a proportion)
    df_a = (df_aux.groupby(['game', 'model', 'metric'])
                  .mean(numeric_only=True)
                  .reset_index())
    df_a.loc[df_a.metric == "Played", 'value'] *= 100
    df_a = df_a.round(2)
    df_a['metric'].replace(
        {"Played": '% Played'},
        inplace=True)

    # compute the std of benchscore
    df_aux_b = df_aux[df_aux.metric == "Main Score"]
    df_b = (df_aux_b.groupby(['game', 'model', 'metric'])
                    .std(numeric_only=True)
                    .reset_index()
                    .round(2))
    df_b['metric'].replace(
        {"Main Score": "Main Score"+' (std)'},
        inplace=True)

    # compute the macro-average main score over games, per model
    df_all = (df_a.groupby(['model', 'metric'])
                  .mean(numeric_only=True)
                  .reset_index()
                  .round(2))
    # add columns for standard format in concatenation below
    df_all['game'] = 'all'
    df_all['metric'] = 'Average ' + df_all['metric']

    # merge all data and make it one model per row
    df_full = pd.concat([df_a, df_b, df_all], axis=0, ignore_index=True)
    # sort just so all metrics are close to each other in a game column
    df_full.sort_values(by=['game', 'metric'], inplace=True)
    # rename according to paper
    df_full['metric'] = df_full['metric'].str.replace("Main Score", 'Quality Score')
    df_full = df_full.pivot(columns=['game', 'metric'], index=['model'])
    df_full = df_full.droplevel(0, axis=1)

    # compute clemscores and add to df
    clemscore = ((df_full[('all', 'Average % Played')] / 100)
                 * df_full[('all', 'Average Quality Score')])
    clemscore = clemscore.round(2).to_frame(name=('-', 'clemscore'))
    df_results = pd.concat([clemscore, df_full], axis=1)

    # flatten header
    df_results.index.name = None
    df_results.columns = df_results.columns.to_flat_index() 
    df_results.columns = [', '.join(x) for x in df_results.columns]

    # save table
    #df_results.to_csv(Path(path) / f'{TABLE_NAME}.csv')
    #df_results.to_html(Path(path) / f'{TABLE_NAME}.html')
    #print(f'\n Saved results into {path}/{TABLE_NAME}.csv and .html')
    return df_results

In [4]:
# remove referencegame instances that appeared in the training data (ref_training_episodes)
def remove_referencegame_instances(df: pd.DataFrame, ref_training_episodes: dict) -> pd.DataFrame:
    """
    Creates a results table like results.csv excluding the specified referencegame episodes (that have been part of the training data) from raw.csv.

    """
    ind_to_remove = []
    for experiment, episodes in ref_training_episodes.items():
        for episode in episodes:
            ind = df[((df.experiment == experiment) & (df.episode == episode))].index
            ind_to_remove += list(ind)

    ref_keep = df.loc[ind_to_remove]
    df1_no_reftraining = pd.concat([df[~(df.game == "referencegame")], ref_keep])
    #
    # print("Results Top10 without referencegame episodes from training (final clemscore?)")
    #full final results
    ffr = save_clem_table(df1_no_reftraining)
    ffr[["-, clemscore", "all, Average % Played", "all, Average Quality Score"]]
    return ffr

In [3]:
def prettify_df(df):
    #exclude cols
    df = df.loc[:,~df.columns.str.contains('std', case=False)]
    df = (df.round(decimals=2)
          .sort_values(by = "short_name")
          )
    # shorten model names
    if "--" in df.index:
        short_names = [x.split("--")[0][:-5] for x in list(df.index)]
        short_dict = dict(zip(list(df.index), short_names))
        df = df.rename(index=short_dict)
    if "--" in df.short_name:
        short_names = [x.split("--")[0][:-5] for x in list(df.index)]
        short_dict = dict(zip(list(df.index), short_names))
        df = df.rename(index=short_dict)
    return df

def save_as_csv_and_tex(df, experiment, save_dir):
    df.to_csv(save_dir + "results_" + experiment + ".csv")
    df.to_latex(save_dir + "results_" + experiment + ".tex", 
            float_format="%.2f",
            escape = True,
            caption = "Results on experiment " + experiment + ".", 
            label = "tab:results-" + experiment,
            position= "h!"
            )

### Short v2.0 intermezzo

In [4]:
df = pd.read_csv(results_dir2 + "results.csv")
df.columns.values[0] = "short_name"
df2 = pd.read_csv(results_dir2 + "results_leaderboard.csv")
df2.columns.values[0] = "short_name"
models_unquantized = ["Meta-Llama-3.1-70B-Instruct-t0.0--Meta-Llama-3.1-70B-Instruct-t0.0", "Meta-Llama-3.1-8B-Instruct-t0.0--Meta-Llama-3.1-8B-Instruct-t0.0"]
# models = ["Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0", "Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0", "llama3.1-8B-sft-e1-DFINAL-t0.0--llama3.1-8B-sft-e1-DFINAL-t0.0", ]

models = ["Meta-Llama-3.1-8B-Instruct-t0.0--Meta-Llama-3.1-8B-Instruct-t0.0", 
           "Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0", 
           "llama3.1-8B-sft-e1-DFINAL-t0.0--llama3.1-8B-sft-e1-DFINAL-t0.0", 
           "llama3.1-8B-sft-e1-DFINAL_1.1K-steps-t0.0--llama3.1-8B-sft-e1-DFINAL_1.1K-steps-t0.0",
           "llama3.1-8B-sft-e1-DFINAL_1.7K-steps-t0.0--llama3.1-8B-sft-e1-DFINAL_1.7K-steps-t0.0",
           "llama3.1-sft-e1-_E1_D40005-t0.0--llama3.1-sft-e1-_E1_D40005-t0.0", 
           "Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0", 
           "llama3.1-70B-sft-e1-DFINAL_1.7K-steps-t0.0--llama3.1-70B-sft-e1-DFINAL_1.7K-steps-t0.0"]
df = df[df.short_name.isin(models)]
df2 = df2[df2.short_name.isin(models_unquantized)]
dfs = pd.concat([df,df2])
dfs = prettify_df(dfs)
dfs
dfs = df

In [5]:
prettify_df(df)

Unnamed: 0,short_name,"-, clemscore","adventuregame, % Played","adventuregame, Quality Score","all, Average % Played","all, Average Quality Score","codenames, % Played","codenames, Quality Score","guesswhat, % Played","guesswhat, Quality Score",...,"textmapworld_graphreasoning, % Played","textmapworld_graphreasoning, Quality Score","textmapworld_specificroom, % Played","textmapworld_specificroom, Quality Score","wordle, % Played","wordle, Quality Score","wordle_withclue, % Played","wordle_withclue, Quality Score","wordle_withcritic, % Played","wordle_withcritic, Quality Score"
4,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,22.36,35.94,33.85,56.51,39.57,43.08,16.07,90.0,11.73,...,20.0,44.33,56.67,94.12,36.67,0.0,0.0,,6.67,50.0
5,Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsl...,42.52,88.28,56.77,73.58,57.79,84.62,36.36,91.67,30.3,...,23.33,50.25,90.0,100.0,76.67,1.3,6.67,50.0,0.0,
6,llama3.1-70B-sft-e1-DFINAL_1.7K-steps-t0.0--ll...,45.69,71.09,56.77,72.17,63.31,54.62,32.39,90.0,44.44,...,46.67,55.24,93.33,100.0,83.33,13.2,26.67,32.29,26.67,75.0
0,llama3.1-8B-sft-e1-DFINAL-t0.0--llama3.1-8B-sf...,25.9,15.62,11.46,57.49,45.05,0.77,0.0,96.67,18.39,...,0.0,,70.0,100.0,63.33,13.68,0.0,,0.0,
1,llama3.1-8B-sft-e1-DFINAL_1.1K-steps-t0.0--lla...,28.25,10.94,17.19,63.75,44.31,9.23,16.67,98.33,12.43,...,26.67,34.6,63.33,94.74,63.33,7.89,16.67,25.0,20.0,38.89
2,llama3.1-8B-sft-e1-DFINAL_1.7K-steps-t0.0--lla...,28.86,7.03,14.84,58.72,49.14,5.38,14.29,85.0,30.07,...,0.0,,50.0,100.0,70.0,1.43,6.67,25.0,10.0,75.0
3,llama3.1-sft-e1-_E1_D40005-t0.0--llama3.1-sft-...,26.6,21.26,14.96,54.6,48.72,9.23,41.67,73.33,36.36,...,0.0,,66.67,95.0,46.67,7.14,0.0,,0.0,


In [8]:
# dfs = pd.concat([df,df2])
# dfs = prettify_df(dfs)
prettify_df(dfs).to_csv(save_dir + "clembench_v2.0_results_steps.csv", index=False)



In [6]:
#games = ["imagegame", "privateshared", "referencegame", "taboo"] + games_v2
# games = ["imagegame", "privateshared", "referencegame", "taboo", "wordle", "wordle_withclue", "wordle_withcritic"]

played = [x + ", % Played" for x in games_v2]
quality = [x + ", Quality Score" for x in games_v2]
pd.concat([dfs.short_name, dfs[played].mean(axis=1)], axis = 1)

Unnamed: 0,short_name,0
0,llama3.1-8B-sft-e1-DFINAL-t0.0--llama3.1-8B-sf...,49.508571
1,llama3.1-8B-sft-e1-DFINAL_1.1K-steps-t0.0--lla...,56.071429
2,llama3.1-8B-sft-e1-DFINAL_1.7K-steps-t0.0--lla...,47.915714
3,llama3.1-sft-e1-_E1_D40005-t0.0--llama3.1-sft-...,45.641429
4,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,54.527143
5,Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsl...,82.2
6,llama3.1-70B-sft-e1-DFINAL_1.7K-steps-t0.0--ll...,78.815714


In [10]:
pd.concat([dfs.short_name, dfs[quality].fillna(value = 0).mean(axis=1)], axis = 1)
pd.concat([dfs.short_name, dfs[quality].mean(axis=1)], axis = 1)

Unnamed: 0,short_name,0
0,llama3.1-8B-sft-e1-DFINAL-t0.0--llama3.1-8B-sf...,35.176667
1,llama3.1-8B-sft-e1-DFINAL_1.1K-steps-t0.0--lla...,38.982857
2,llama3.1-8B-sft-e1-DFINAL_1.7K-steps-t0.0--lla...,45.693333
3,llama3.1-sft-e1-_E1_D40005-t0.0--llama3.1-sft-...,47.648333
4,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,45.08
5,Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsl...,57.487143
6,llama3.1-70B-sft-e1-DFINAL_1.7K-steps-t0.0--ll...,62.068571


In [11]:
dfs[quality]#.fillna(value = 0)

Unnamed: 0,"imagegame, Quality Score","privateshared, Quality Score","referencegame, Quality Score","taboo, Quality Score","wordle, Quality Score","wordle_withclue, Quality Score","wordle_withcritic, Quality Score"
0,95.36,96.27,40.0,39.17,13.68,,
1,94.41,96.32,40.0,45.0,7.89,25.0,38.89
2,94.46,97.35,33.33,38.06,1.43,25.0,75.0
3,87.43,94.37,33.33,27.78,7.14,,
4,54.62,23.48,38.89,31.92,0.0,,50.0


## Prepare result tables including general benchmarks (if existing)

In [None]:
pattern_short_name = r"D\d{5}_*\w*|llama-3.1"

# all_results = []

for experiment in raw_csv:
    # read raw
    df_raw = pd.read_csv(results_dir + experiment + "/raw.csv", index_col = 0)
    df_raw = df_raw[df_raw.game.isin(games)] # exclude matchit_ascii!
    
    df = remove_referencegame_instances(df_raw, ref_training_episodes)

    short_names = [re.search(pattern_short_name, name).group() for name in df.index]
    df["short_name"] = short_names

    results = []
    double_but_ok = 0
    double_bad = 0
    # create general results
    for directory in os.listdir(gen_results_dir):   # single model directories
        directory = gen_results_dir + directory
        if any(name in directory for name in short_names): # is there general benchmark data for this experiment?
            if os.path.isdir(directory):
                model_results = dict.fromkeys(columns)
                model_results["model"] = directory
                subfiles = os.listdir(directory)
                # prüfen, dass das eigentlich files sind und json!
                for file in subfiles:
                    file = directory + "/" + file
                    with open(file) as f:
                        jsonfile = json.load(f)
                    for task in tasks:
                        if task in jsonfile["results"]:
                            accuracy = jsonfile["results"][task]["acc,none"]
                            if model_results[task] is None:
                                model_results[task] = accuracy
                            else:
                                if accuracy == model_results[task]:
                                    double_but_ok += 1
                                else:
                                    #print("ATTENTION FOR", directory)
                                    #print(task)
                                    #print("Trying to set different")
                                    double_bad += 1
                results.append(model_results)


    try:    
        results_df = pd.concat([pd.DataFrame(x, index = [0]) for x in results]).reset_index()
        results_df["short_name"] = [re.search(pattern_short_name, name).group() for name in results_df.model]
        results_df[tasks]*=100
        final_df = pd.merge(results_df[["short_name"] + tasks], df, 
                        how = "outer",
                        on = "short_name",
                        ).set_index("short_name")
    except ValueError:
        print("No general benchmark files for " + experiment)
        final_df = df
    
    # all_results.append(final_df)
      
    save_as_csv_and_tex(prettify_df(final_df), experiment, save_dir= save_dir)

In [None]:
# uncomment only if new table will not be committed or changes for google sheets will be done befor (i.e. putting llama in first row)
# save_as_csv_and_tex(prettify_df(pd.concat(all_results)),"all", save_dir=save_dir)


### Prepare result tables without general benchmarks



In [None]:
# Prepare simple results tables
for experiment in results_csv:
    df = pd.read_csv(results_dir +  experiment + "/results.csv", index_col=0)

    #TODO: only keep games in game list (i.e. exclude matchit_ascii to not be part of the evaluation)
    #exclude cols
    df = df.loc[:,~df.columns.str.contains('std', case=False)]
    # shorten model names
    short_names = [x.split("--")[0][:-5] for x in list(df.index)]
    short_dict = dict(zip(list(df.index), short_names))
    df = df.rename(index=short_dict)

    
# prepare results tables for experiment which need referencegame episodes removed
for experiment in raw_csv:
    df = pd.read_csv(results_dir + experiment + "/raw.csv", index_col = 0)
    #df["full_inst_name"] = df["experiment"].str[2:] + "_" + df["episode"]
    df = remove_referencegame_instances(df, ref_training_episodes)

    # rename_dict = {}
    # for name1 in results_df.model: 
    #     for name2 in df.index:
    #         if name1 in name2:
    #             print(name1)
    #             rename_dict[name1] = name2

    # results_df = results_df.replace(rename_dict)

    # df = pd.merge(results_df, df, right_index=True, left_on = "model", how = 'inner')
    # df = df.drop(columns=["index"], axis=1)
    # print(df.head(n= 1))
    # df.to_csv(save_dir + "results_tables/results_" + experiment + ".csv",
    #           index = False)
    df = df.drop(columns=["matchit_ascii, % Played", "matchit_ascii, Quality Score"], axis=1, errors = "ignore")
    df.to_csv(save_dir + "results_tables/results_" + experiment + ".csv")

    

### Include general benchmarks playground

In [26]:
directory = "../../results/general_benchmarks/unsloth__meta-llama-3.1-8b-instruct-bnb-4bit"
subfiles = os.listdir(directory)
        # prüfen, dass das eigentlich files sind und json!
for file in subfiles:
    file = directory + "/" + file    
    with open(file) as f:
        jsonfile = json.load(f)
    for task in tasks:
        if task in jsonfile["results"]:
            accuracy = jsonfile["results"][task]["acc,none"]
            print(accuracy)
            
        if model_results[task] is None:
            model_results[task] = accuracy
        else:
            if accuracy == model_results[task]:
                double_but_ok += 1
            else:
                #print("ATTENTION FOR", directory)
                #print(task)
                #print("Trying to set different")
                double_bad += 1

0.48311156601842375
0.6589517162797323
0.7927094668117519


In [None]:
### HIER

pattern_short_name = r"D\d{5}_*\w*|llama-3.1"

for experiment in raw_csv:
    # read raw
    df_raw = pd.read_csv(results_dir + experiment + "/raw.csv", index_col = 0)
    df_raw = df_raw[df_raw.game.isin(games)] # exclude matchit_ascii!
    
    df = remove_referencegame_instances(df_raw, ref_training_episodes)

    short_names = [re.search(pattern_short_name, name).group() for name in df.index]
    df["short_name"] = short_names

    results = []
    double_but_ok = 0
    double_bad = 0
    # create general results
    for directory in os.listdir(gen_results_dir):   # single model directories
        directory = gen_results_dir + directory
        if any(name in directory for name in short_names): # is there general benchmark data for this experiment?
            if os.path.isdir(directory):
                model_results = dict.fromkeys(columns)
                model_results["model"] = directory
                subfiles = os.listdir(directory)
                # prüfen, dass das eigentlich files sind und json!
                for file in subfiles:
                    file = directory + "/" + file
                    with open(file) as f:
                        jsonfile = json.load(f)
                    for task in tasks:
                        if task in jsonfile["results"]:
                            accuracy = jsonfile["results"][task]["acc,none"]
                            if model_results[task] is None:
                                model_results[task] = accuracy
                            else:
                                if accuracy == model_results[task]:
                                    double_but_ok += 1
                                else:
                                    #print("ATTENTION FOR", directory)
                                    #print(task)
                                    #print("Trying to set different")
                                    double_bad += 1
                results.append(model_results)


    try:    
        results_df = pd.concat([pd.DataFrame(x, index = [0]) for x in results]).reset_index()
        results_df["short_name"] = [re.search(pattern_short_name, name).group() for name in results_df.model]
        results_df[tasks]*=100
        final_df = pd.merge(results_df[["short_name"] + tasks], df, 
                        on = "short_name",
                        ).set_index("short_name")
    except ValueError:
        print("No general benchmark files for " + experiment)
        final_df = df
            
    save_as_csv_and_tex(prettify_df(final_df), experiment, save_dir= save_dir)
    
    

In [20]:
results_df = pd.concat([pd.DataFrame(x, index = [0]) for x in results]).reset_index()
#results_df["model"] = results_df.model.str[94:]
results_df.sort_values("mmlu")

Unnamed: 0,index,model,mmlu,social_iqa,piqa
9,0,../../results/general_benchmarks/clembench-pla...,0.496226,0.493347,0.761153
18,0,../../results/general_benchmarks/clembench-pla...,0.517092,0.494371,0.762786
6,0,../../results/general_benchmarks/clembench-pla...,0.54921,0.48567,0.773667
31,0,../../results/general_benchmarks/clembench-pla...,0.589588,0.484647,0.781284
1,0,../../results/general_benchmarks/clembench-pla...,0.59849,0.490788,0.792165
12,0,../../results/general_benchmarks/clembench-pla...,0.603404,0.494882,0.779652
21,0,../../results/general_benchmarks/clembench-pla...,0.625196,0.490788,0.788357
0,0,../../results/general_benchmarks/clembench-pla...,0.627475,0.627475,0.627475
20,0,../../results/general_benchmarks/clembench-pla...,0.644068,0.644068,0.644068
15,0,../../results/general_benchmarks/clembench-pla...,0.644353,0.479529,0.789989


In [None]:
results_df = pd.concat([pd.DataFrame(x, index = [0]) for x in results]).reset_index()
results_df["model"] = results_df.model.str[94:]


df = pd.read_csv(results_dir + "D4" + "/raw.csv", index_col = 0)
df = remove_referencegame_instances(df, ref_training_episodes)
df = df.loc[:,~df.columns.str.contains('std', case=False)]
short_names = [x.split("--")[0][:-5] for x in list(df.index)]
short_dict = dict(zip(list(df.index), short_names))
df = df.rename(index=short_dict)
rename_dict = {}
for name1 in results_df.model: 
    for name2 in df.index:
        if name1 in name2:
            print(name1)
            rename_dict[name1] = name2

results_df = results_df.replace(rename_dict)

df = pd.merge(results_df, df, right_index=True, left_on = "model", how = 'inner')
df = df.drop(columns=["index"], axis=1)
df