In [1]:
# %pip install tabulate

In [2]:
import pandas as pd

In [3]:
from glob import glob

In [4]:
import json

In [5]:
# results/korean_origin_bench/20b/00_shot.json
def _get_metric_name(v):
    metrics = ['f1', 'macro_f1', 'acc_norm', 'acc']
    for m in metrics:
        if v.get(m):
            return {
                'metric': m,
                'value': v[m],
            }

def get_df_klue(path, model_name=''):
    """Load the per-shot result files under ``path`` into one score table.

    Looks for ``{path}/{n}_shot.json`` with ``n`` in 0, 5, 10 and 50. Files
    that do not exist are skipped. For every task in a file's ``'results'``
    mapping, the metric chosen by ``_get_metric_name`` becomes one row
    labelled ``"<task> (<metric>)"``.

    Args:
        path: Directory that contains the ``*_shot.json`` files.
        model_name: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with one row per task and one column per shot count that
        was actually found on disk. The table is also printed as markdown.
    """
    data = []
    found_shots = []
    for n_shot in (0, 5, 10, 50):
        shot = f'{path}/{n_shot}_shot.json'
        try:
            # Context manager so the file handle is closed deterministically.
            with open(shot) as fp:
                results = json.load(fp)['results']
        except FileNotFoundError:
            continue

        row = {}
        for task, scores in results.items():
            picked = _get_metric_name(scores)
            if picked is None:
                # No recognised metric for this task; leave it out of the row
                # instead of failing on a None lookup.
                continue
            row[f"{task} ({picked['metric']})"] = picked['value']

        data.append(row)
        # Remember which shot count this row belongs to, so a missing file in
        # the middle (e.g. no 5_shot.json) cannot shift later column labels.
        found_shots.append(n_shot)

    df = pd.DataFrame(data, index=found_shots).T
    print(df.to_markdown())
    return df

In [6]:
# Score table for one run directory; the returned frame is this cell's display value.
get_df_klue(
    'results/klue_etc_bench/home/jovyan/beomi/llama2-koen-13b/60b',
    model_name='llama2-koen-13b',
)

|                                  |        0 |        5 |       10 |       50 |
|:---------------------------------|---------:|---------:|---------:|---------:|
| kohatespeech (macro_f1)          | 0.278224 | 0.378693 | 0.370702 | 0.509343 |
| kohatespeech_apeach (macro_f1)   | 0.337667 | 0.556898 | 0.581788 | 0.667511 |
| kohatespeech_gen_bias (macro_f1) | 0.248404 | 0.484745 | 0.473659 | 0.461714 |
| korunsmile (f1)                  | 0.327145 | 0.329163 | 0.347889 | 0.395522 |
| nsmc (acc)                       | 0.6442   | 0.87702  | 0.89982  | 0.90984  |
| pawsx_ko (acc)                   | 0.5355   | 0.5455   | 0.5435   | 0.5255   |


Unnamed: 0,0,5,10,50
kohatespeech (macro_f1),0.278224,0.378693,0.370702,0.509343
kohatespeech_apeach (macro_f1),0.337667,0.556898,0.581788,0.667511
kohatespeech_gen_bias (macro_f1),0.248404,0.484745,0.473659,0.461714
korunsmile (f1),0.327145,0.329163,0.347889,0.395522
nsmc (acc),0.6442,0.87702,0.89982,0.90984
pawsx_ko (acc),0.5355,0.5455,0.5435,0.5255


In [7]:
# Every path two levels below results/all, in sorted order so the listing is stable.
various_models = glob('results/all/*/*')
various_models.sort()
various_models

['results/all/42MARU/GenAI-llama2-ko-en-platypus',
 'results/all/EleutherAI/polyglot-ko-12.8b',
 'results/all/EleutherAI/polyglot-ko-5.8b',
 'results/all/HumanF-MarkrAI/pub-llama-13B-v3',
 'results/all/KT-AI/midm-bitext-S-7B-inst-v1',
 'results/all/beomi/Yi-Ko-6B',
 'results/all/beomi/Yi-Ko-6B-20B',
 'results/all/beomi/llama-2-ko-7b',
 'results/all/beomi/llama-2-ko-7b-emb-dev',
 'results/all/beomi/llama-2-ko-7b-emb-dev-7B',
 'results/all/beomi/open-llama-2-ko-7b',
 'results/all/beomi/open-llama-2-ko-7b-dev',
 'results/all/beomi/open-llama-2-ko-7b-dev-v1',
 'results/all/hyunseoki/ko-en-llama2-13b',
 'results/all/jyoung105/KoR-Orca-Platypus-13B-neft',
 'results/all/kyujinpy/KoR-Orca-Platypus-13B']

In [8]:
# For each collected path: print the path, let get_df_klue print its markdown
# score table (the returned DataFrame is discarded), then print a blank separator.
for model in various_models:
    print(model)
    get_df_klue(model)
    print()

results/all/42MARU/GenAI-llama2-ko-en-platypus
|                                  |        0 |        5 |       10 |
|:---------------------------------|---------:|---------:|---------:|
| kobest_boolq (macro_f1)          | 0.75496  | 0.813542 | 0.826657 |
| kobest_copa (macro_f1)           | 0.789858 | 0.8109   | 0.812864 |
| kobest_hellaswag (macro_f1)      | 0.46552  | 0.481476 | 0.487711 |
| kobest_sentineg (macro_f1)       | 0.599451 | 0.942065 | 0.952136 |
| kohatespeech (macro_f1)          | 0.409181 | 0.368872 | 0.407597 |
| kohatespeech_apeach (macro_f1)   | 0.340951 | 0.599693 | 0.63309  |
| kohatespeech_gen_bias (macro_f1) | 0.137929 | 0.500808 | 0.47299  |
| korunsmile (f1)                  | 0.4154   | 0.343849 | 0.385809 |
| nsmc (acc)                       | 0.58484  | 0.8477   | 0.86622  |
| pawsx_ko (acc)                   | 0.5405   | 0.531    | 0.535    |

results/all/EleutherAI/polyglot-ko-12.8b
|                                  |        0 |        5 |       10 |  

In [9]:

# results/all/beomi/llama-2-ko-7b-emb-dev
# |                                  |        0 |        5 |       10 |       50 |
# |:---------------------------------|---------:|---------:|---------:|---------:|
# | kobest_boolq (macro_f1)          | 0.337452 | 0.570559 | 0.585608 | 0.567486 |
# | kobest_copa (macro_f1)           | 0.665517 | 0.69096  | 0.686659 | 0.698672 |
# | kobest_hellaswag (macro_f1)      | 0.383174 | 0.379488 | 0.380637 | 0.383733 |
# | kobest_sentineg (macro_f1)       | 0.367966 | 0.91937  | 0.96473  | 0.947103 |
# | kohatespeech (macro_f1)          | 0.300627 | 0.362646 | 0.364923 | 0.287482 |
# | kohatespeech_apeach (macro_f1)   | 0.337667 | 0.486543 | 0.452716 | 0.335808 |
# | kohatespeech_gen_bias (macro_f1) | 0.124535 | 0.489203 | 0.461098 | 0.461714 |
# | korunsmile (f1)                  | 0.35822  | 0.367208 | 0.381937 | 0.376735 |
# | nsmc (acc)                       | 0.50714  | 0.80386  | 0.83964  | 0.86138  |
# | pawsx_ko (acc)                   | 0.539    | 0.4915   | 0.5225   | 0.521    |
