In [None]:
# Let's install lm-eval
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness
!pip install -e .

In [None]:
# Install wandb extra for lm-eval
!pip install lm_eval[wandb]

## Usage
The goal here is to run evals on the following datasets:
```python
TASKS = [
    "wikitext",         # Perplexity on WikiText
    "lambada_openai",   # Cloze/Prediction task
    "hellaswag",        # Commonsense NLI
    "piqa",             # Physical Interaction QA
    "winogrande",       # Commonsense Reasoning (Winograd Schema)
    "arc_easy",         # AI2 Reasoning Challenge (Easy)
    "arc_challenge",    # AI2 Reasoning Challenge (Challenge)
    "openbookqa",       # Open Book Question Answering
    "mmlu",             # Massive Multitask Language Understanding
    "gsm8k",            # Grade School Math
]
```

All datasets are to be evaluated zero-shot, except `mmlu` and `gsm8k`:

- `mmlu` should use 5-shot.
- `gsm8k` should use 8-shot.

In [None]:
# Use lm-eval to evaluate a Hugging Face model according to the instructions below.
#
# All datasets are to be evaluated zero-shot, except `mmlu` and `gsm8k`:
#   - `mmlu` should use 5-shot
#   - `gsm8k` should use 8-shot
#
# The evaluation is run with the `lm_eval` command further down in this cell,
# on the following tasks:
#
# TASKS = [
#     "wikitext",         # Perplexity on WikiText
#     "lambada_openai",   # Cloze/Prediction task
#     "hellaswag",        # Commonsense NLI
#     "piqa",             # Physical Interaction QA
#     "winogrande",       # Commonsense Reasoning (Winograd Schema)
#     "arc_easy",         # AI2 Reasoning Challenge (Easy)
#     "arc_challenge",    # AI2 Reasoning Challenge (Challenge)
#     "openbookqa",       # Open Book Question Answering
#     "mmlu",             # Massive Multitask Language Understanding
#     "gsm8k",            # Grade School Math
# ]

# New tasks to test on. This list is not referenced by any lm_eval command
# visible in this part of the notebook.
# NOTE(review): an earlier comment here named SciQ, RACE, ReCORD, SST, MRPC, RTE,
# MultiNLI, WSC273 and WiC, which does not match the entries below. Confirm
# which set is intended and that every name is a registered lm-eval task.
NEW_TASKS = [
    "sciq", "race", "super_glue", "swag", "anli",
    "xnli", "wsc273", "pubmedqa", "mathqa", "siqa",
]

# Hugging Face checkpoints to evaluate. A later cell loops over this list;
# every entry is a `kokolamba/SubspaceDecoder_<variant>` repository.
model_checkpoints = [
    f"kokolamba/SubspaceDecoder_{variant}"
    for variant in (
        "mla192-96-0",
        "mla192-96-192",
        "mla0-128-0",
        "mla0-96-192",
        "mla0-0-192",
        "mla0-0-0",
        "mla192-0-0",
        "mha",
    )
]

import subprocess

# Pick one model checkpoint
ckpt = "kokolamba/SubspaceDecoder_mha"

# Tasks grouped by the number of few-shot examples they must be evaluated with.
# `--num_fewshot` is a single value per `lm_eval` invocation, so each group gets
# its own run: everything is zero-shot except mmlu (5-shot) and GSM8K (8-shot).
FEWSHOT_TASK_GROUPS = {
    0: [
        "wikitext",
        "lambada_openai",
        "hellaswag",
        "piqa",
        "winogrande",  # previously misspelled "winograde", which is not a task name
        "arc_easy",
        "arc_challenge",
        "openbookqa",
    ],
    5: ["mmlu"],
    8: ["gsm8k_cot"],
}

for num_fewshot, tasks in FEWSHOT_TASK_GROUPS.items():
    print(f"Running {num_fewshot}-shot evaluation for {ckpt} on {', '.join(tasks)}...")
    cmd = [
        "lm_eval",
        "--model", "hf",
        # trust_remote_code=True lets the checkpoint's own modelling code run locally.
        "--model_args", f"pretrained={ckpt},trust_remote_code=True",
        "--tasks", ",".join(tasks),
        "--num_fewshot", str(num_fewshot),
        "--device", "cuda:0",
        "--batch_size", "auto:4",
        "--output_path", "results",
        "--wandb_args", "project=subspace-decoder-lm-harness-results",
        "--log_samples",
        # Smoke-test setting: only 10 examples per task. Remove for real metrics.
        "--limit", "10",
    ]
    # check=True raises CalledProcessError if lm_eval exits with a non-zero
    # status, so a failing group stops the cell instead of being ignored.
    subprocess.run(cmd, check=True)

In [None]:
# Run the evaluation for every checkpoint in `model_checkpoints` (defined in the
# previous cell), one lm_eval subprocess per (checkpoint, few-shot group).
import subprocess

# Tasks grouped by the number of few-shot examples they must be evaluated with.
# `--num_fewshot` is a single value per `lm_eval` invocation, so each group gets
# its own run: everything is zero-shot except mmlu (5-shot) and GSM8K (8-shot).
FEWSHOT_TASK_GROUPS = {
    0: [
        "wikitext",
        "lambada_openai",
        "hellaswag",
        "piqa",
        "winogrande",  # previously misspelled "winograde", which is not a task name
        "arc_easy",
        "arc_challenge",
        "openbookqa",
    ],
    5: ["mmlu"],
    8: ["gsm8k_cot"],
}

for ckpt in model_checkpoints:
    for num_fewshot, tasks in FEWSHOT_TASK_GROUPS.items():
        print(f"Running {num_fewshot}-shot evaluation for {ckpt} on {', '.join(tasks)}...")
        cmd = [
            "lm_eval",
            "--model", "hf",
            # trust_remote_code=True lets the checkpoint's own modelling code run locally.
            "--model_args", f"pretrained={ckpt},trust_remote_code=True",
            "--tasks", ",".join(tasks),
            "--num_fewshot", str(num_fewshot),
            "--device", "cuda:0",
            "--batch_size", "auto:4",
            "--output_path", "results",
            "--wandb_args", "project=subspace-decoder-lm-harness-results",
            "--log_samples",
            # Smoke-test setting: only 10 examples per task. Remove for real metrics.
            "--limit", "10",
        ]
        # check=True raises CalledProcessError on a non-zero exit status, which
        # stops the loop at the first failing run rather than continuing silently.
        subprocess.run(cmd, check=True)