## How to download the HELM dataset

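Run the following in a terminal, from the directory that contains this notebook (the code below reads `benchmark_output/runs/v0.3.0` as a relative path):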

```bash
export LEADERBOARD_VERSION=v0.3.0
curl -O https://storage.googleapis.com/crfm-helm-public/benchmark_output/archives/${LEADERBOARD_VERSION}/run_stats.zip
mkdir -p benchmark_output/runs/${LEADERBOARD_VERSION}
unzip run_stats.zip -d benchmark_output/runs/${LEADERBOARD_VERSION}
```


In [1]:
import os
import json
import pandas as pd

Compile all of the downloaded leaderboard JSON files into a single CSV

In [2]:
# Collect one flattened DataFrame per JSON file found under the run directory.
rows = []

# access all subdirectories
for root, dirs, files in os.walk("benchmark_output/runs/v0.3.0"):
    for file in files:
        if not file.endswith(".json"):
            continue
        filepath = os.path.join(root, file)

        # JSON is UTF-8 by specification; pass the encoding explicitly so the
        # result does not depend on the platform's locale default.
        with open(filepath, "r", encoding="utf-8") as f:
            # Named `record` (not `data`) so the parsed payload does not occupy
            # the global name that later holds the loaded DataFrame.
            record = json.load(f)

        # json_normalize flattens nested keys into dotted column names
        # (e.g. "name.name", "adapter_spec.model").
        rows.append(pd.json_normalize(record))

In [3]:
# Fail early with an actionable message: pd.concat raises a bare
# "No objects to concatenate" ValueError when given an empty list.
if not rows:
    raise ValueError(
        "No JSON files were loaded; download and unzip the HELM run stats "
        "before running this cell."
    )

# Stack the per-file frames into one table with a fresh 0..n-1 index.
df = pd.concat(rows, ignore_index=True)

In [4]:
# Persist the combined table; the index is omitted so it is not written as an extra column.
df.to_csv(path_or_buf="helm_leaderboard_all_results.csv", index=False)

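Run from here to load the saved CSV and clean it. The imports cell at the top must still be run first; the compile cells above can be skipped once the CSV exists.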

In [8]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, at the cost of higher peak memory while parsing.
# NOTE(review): the warning echoed in this cell's saved output looks like a
# mixed-type DtypeWarning, which this option avoids — confirm on a fresh run.
data = pd.read_csv("helm_leaderboard_all_results.csv", low_memory=False)

  data = pd.read_csv("helm_leaderboard_all_results.csv")


In [9]:
# Preview the raw table loaded from the CSV.
data

Unnamed: 0,count,sum,sum_squared,min,max,mean,variance,stddev,name.name,name.split,...,scenario_spec.args.topic,scenario_spec.args.dataset,scenario_spec.args.datatag,scenario_spec.args.dataset_name,scenario_spec.args.sampling_min_length,scenario_spec.args.sampling_max_length,scenario_spec.args.doc_max_length,scenario_spec.args.difficulty,scenario_spec.args.gender,scenario_spec.args.num_parenthesis_pairs
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_references,test,...,,,,,,,,,,
1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_train_trials,test,...,,,,,,,,,,
2,1.0,1024.0,1048576.0,1024.0,1024.0,1024.0,0.0,0.0,num_prompt_tokens,test,...,,,,,,,,,,
3,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_completion_tokens,test,...,,,,,,,,,,
4,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_output_tokens,test,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2799511,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,estimated_num_tokens_cost,,...,,,,,,,,,,
2799512,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,num_completions,,...,,,,,,,,,,
2799513,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,max_num_output_tokens,,...,,,,,,,,,,
2799514,1.0,957.0,915849.0,957.0,957.0,957.0,0.0,0.0,num_requests,,...,,,,,,,,,,


In [34]:
# Drop every column whose name starts with one of these prefixes, except the
# two columns that carry the metric name and the model identifier.
metadata_prefixes = ("scenario", "name", "adapter", "data_", "metric", "groups")
drop_cols = [col for col in data.columns if col.startswith(metadata_prefixes)]
for kept in ("name.name", "adapter_spec.model"):
    # list.remove raises ValueError if the column is missing from the input.
    drop_cols.remove(kept)

data_clean = data.drop(columns=drop_cols)
data_clean

Unnamed: 0,count,sum,sum_squared,min,max,mean,variance,stddev,name.name,adapter_spec.model
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_references,
1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_train_trials,
2,1.0,1024.0,1048576.0,1024.0,1024.0,1024.0,0.0,0.0,num_prompt_tokens,
3,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_completion_tokens,
4,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_output_tokens,
...,...,...,...,...,...,...,...,...,...,...
2799511,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,estimated_num_tokens_cost,
2799512,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,num_completions,
2799513,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,max_num_output_tokens,
2799514,1.0,957.0,915849.0,957.0,957.0,957.0,0.0,0.0,num_requests,


In [35]:
# Fill each row's model from the next non-null "adapter_spec.model" value below it.
# Read the column from `data` (the frame loaded from the CSV) rather than `df`,
# which only exists when the compile cells were run in this kernel.
# NOTE(review): back-filling relies on the row order produced when the CSV was
# compiled — confirm each run's stat rows precede its model row.
data_clean["model"] = data["adapter_spec.model"].bfill()
data_clean

Unnamed: 0,count,sum,sum_squared,min,max,mean,variance,stddev,name.name,adapter_spec.model,model
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_references,,cohere/small-20220720
1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,num_train_trials,,cohere/small-20220720
2,1.0,1024.0,1048576.0,1024.0,1024.0,1024.0,0.0,0.0,num_prompt_tokens,,cohere/small-20220720
3,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_completion_tokens,,cohere/small-20220720
4,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0,num_output_tokens,,cohere/small-20220720
...,...,...,...,...,...,...,...,...,...,...,...
2799511,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,estimated_num_tokens_cost,,ai21/j1-jumbo
2799512,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,num_completions,,ai21/j1-jumbo
2799513,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0,max_num_output_tokens,,ai21/j1-jumbo
2799514,1.0,957.0,915849.0,957.0,957.0,957.0,0.0,0.0,num_requests,,ai21/j1-jumbo


In [36]:
# Tidy the table: drop the raw model column, discard rows with any missing
# value, rename "name.name" to "metric", and lead with "model" and "metric".
tidy = (
    data_clean
    .drop(columns="adapter_spec.model")
    .dropna()
    .rename(columns={"name.name": "metric"})
)
leading = ["model", "metric"]
trailing = [col for col in tidy.columns if col not in leading]
data_clean = tidy[leading + trailing]
data_clean

Unnamed: 0,model,metric,count,sum,sum_squared,min,max,mean,variance,stddev
0,cohere/small-20220720,num_references,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,cohere/small-20220720,num_train_trials,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
2,cohere/small-20220720,num_prompt_tokens,1.0,1024.0,1048576.0,1024.0,1024.0,1024.0,0.0,0.0
3,cohere/small-20220720,num_completion_tokens,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0
4,cohere/small-20220720,num_output_tokens,1.0,32.0,1024.0,32.0,32.0,32.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2799510,ai21/j1-jumbo,num_bytes,3.0,6.0,12.0,2.0,2.0,2.0,0.0,0.0
2799511,ai21/j1-jumbo,estimated_num_tokens_cost,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0
2799512,ai21/j1-jumbo,num_completions,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0
2799513,ai21/j1-jumbo,max_num_output_tokens,957.0,957.0,957.0,1.0,1.0,1.0,0.0,0.0


In [37]:
# Save the cleaned table; the index is omitted so it is not written as an extra column.
data_clean.to_csv(path_or_buf="helm_leaderboard_all_results_clean.csv", index=False)

In [38]:
# Distinct model identifiers, in order of first appearance.
model_column = data_clean["model"]
models = model_column.unique()

In [39]:
# Show every distinct model identifier.
print(models)

['cohere/small-20220720' 'cohere/xlarge-20220609' 'openai/ada'
 'together/t5-11b' 'together/redpajama-incite-instruct-3b-v1'
 'together/opt-175b' 'openai/text-davinci-002' 'lmsys/vicuna-7b-v1.3'
 'openai/text-ada-001' 'ai21/j1-large' 'ai21/j1-grande' 'mosaicml/mpt-30b'
 'openai/curie' 'together/ul2' 'anthropic/stanford-online-all-v4-s3'
 'meta/llama-65b' 'cohere/command-medium-beta' 'together/yalm'
 'meta/llama-2-7b' 'openai/text-babbage-001' 'openai/text-davinci-003'
 'meta/llama-30b' 'openai/davinci' 'together/gpt-neox-20b'
 'openai/babbage' 'cohere/medium-20220720' 'ai21/j1-jumbo'
 'together/gpt-j-6b' 'together/opt-66b'
 'together/redpajama-incite-instruct-7b' 'together/bloom' 'together/glm'
 'AlephAlpha/luminous-supreme' 'openai/gpt-3.5-turbo-0301' 'ai21/j2-large'
 'ai21/j2-jumbo' 'lmsys/vicuna-13b-v1.3' 'cohere/xlarge-20221108'
 'AlephAlpha/luminous-extended' 'cohere/command-xlarge-beta'
 'openai/code-cushman-001' 'AlephAlpha/luminous-base'
 'eleutherai/pythia-12b-v0' 'openai/text

In [41]:
# Distinct metric names, in order of first appearance.
metric_column = data_clean["metric"]
metrics = metric_column.unique()
print(metrics)

['num_references' 'num_train_trials' 'num_prompt_tokens'
 'num_completion_tokens' 'num_output_tokens' 'inference_runtime'
 'batch_size' 'inference_denoised_runtime' 'finish_reason_length'
 'finish_reason_stop' 'finish_reason_endoftext' 'finish_reason_unknown'
 'num_train_instances' 'prompt_truncated' 'max_prob' 'exact_match'
 'logprob' 'num_perplexity_tokens' 'num_bytes' 'perplexity'
 'bits_per_byte' 'logprob_per_byte' 'ece_10_bin' 'ece_1_bin'
 'selective_cov_acc_area' 'selective_acc@10' 'platt_ece_10_bin'
 'platt_ece_1_bin' 'num_instances' 'expected_max_toxicity'
 'max_toxicity_probability' 'toxic_frac' 'estimated_num_tokens_cost'
 'num_completions' 'max_num_output_tokens' 'num_requests'
 'inference_idealized_runtime' 'exact_match@5' 'quasi_exact_match'
 'quasi_exact_match@5' 'prefix_exact_match' 'prefix_exact_match@5'
 'quasi_prefix_exact_match' 'quasi_prefix_exact_match@5' 'platt_coef'
 'platt_intercept'
 'bias_metric:mode=representation,demographic_category=race' 'Success@1'
 'Succ