In [None]:
# @title # 🧐 LLM AutoEval
# @markdown > 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)
# @markdown ❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).
# @markdown * This notebook allows you to **automatically evaluate your LLMs** using RunPod (please consider using my [referral link](https://runpod.io?ref=9nvk2srl)).
# @markdown * The results are automatically uploaded to [GitHub Gist](https://gist.github.com/) and the pod is destroyed (you can safely close this tab).
# @markdown * For further details, see the project on 💻 [GitHub](https://github.com/mlabonne/llm-autoeval).
# @markdown ---
# @markdown ## 🔑 Tokens
# @markdown Enter the names of the Colab secrets (Secrets tab) that store your tokens, not the token values themselves.

RUNPOD_TOKEN = "runpod" # @param {type:"string"}
GITHUB_TOKEN = "github" # @param {type:"string"}
HF_TOKEN = "HF_TOKEN" # @param {type:"string"}
!pip install runpod requests

import runpod
import requests
from google.colab import userdata
from huggingface_hub import HfApi

# Fetch Tokens from Secrets
# RUNPOD_TOKEN, GITHUB_TOKEN and HF_TOKEN are secret names; userdata.get() is
# called with each name to obtain the corresponding token value.
# The RunPod key is stored on the runpod module itself.
runpod.api_key = userdata.get(RUNPOD_TOKEN)
# These two are verified below and later forwarded to the pod as environment variables.
GITHUB_API_TOKEN = userdata.get(GITHUB_TOKEN)
HUGGINGFACE_TOKEN = userdata.get(HF_TOKEN)

# Step 1: Verify RUNPOD_TOKEN
def verify_runpod_token(api_key):
    """Report on stdout whether a RunPod API key is accepted.

    The key is assigned to ``runpod.api_key`` (and stays assigned afterwards),
    then ``runpod.get_user()`` is called once. Any exception from that call is
    reported as an invalid token instead of being raised.

    Args:
        api_key: RunPod API key to check.

    Returns:
        None. The verdict is printed.
    """
    runpod.api_key = api_key
    try:
        # Only success/failure of the call matters; the returned user data is not used.
        runpod.get_user()
    except Exception as err:
        print(f"RunPod token is invalid. Error: {err}")
    else:
        print("RunPod token is valid.")

# Step 2: Verify GITHUB_TOKEN
def verify_github_token(token, timeout=30):
    """Report on stdout whether a GitHub token can create gists.

    A small secret (non-public) test gist is created through the GitHub REST
    API and, if creation succeeds, deleted again. Network-level failures are
    caught and printed instead of propagating, so a connectivity problem does
    not abort the rest of the verification cell.

    Args:
        token: GitHub token, sent as a ``Bearer`` authorization header.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. The verdict is printed.
    """
    headers = {"Authorization": f"Bearer {token}"}
    # Test creating a gist to verify permissions
    gist_data = {
        "description": "Test gist for verifying token",
        "public": False,
        "files": {
            "test.txt": {
                "content": "This is a test gist."
            }
        }
    }

    try:
        response = requests.post(
            "https://api.github.com/gists",
            headers=headers,
            json=gist_data,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"GitHub token could not be verified. Error: {e}")
        return

    if response.status_code != 201:
        print(f"GitHub token is invalid or lacks gist creation permissions. Status code: {response.status_code}, Response: {response.text}")
        return

    print("GitHub token is valid and can create gists.")

    # Delete the test gist so the check leaves nothing behind.
    gist_url = response.json()['url']
    try:
        delete_response = requests.delete(gist_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Include the URL so the leftover gist can be removed by hand.
        print(f"Failed to delete test gist {gist_url}. Error: {e}")
        return

    if delete_response.status_code == 204:
        print("Test gist deleted successfully.")
    else:
        print(f"Failed to delete test gist. Status code: {delete_response.status_code}, Response: {delete_response.text}")

# Step 3: Verify HF_TOKEN
def verify_hf_token(token):
    """Report on stdout whether a Hugging Face token is accepted.

    Calls ``HfApi().whoami(token)`` and prints the account name on success.
    Any exception from that call is reported as an invalid token instead of
    being raised.

    Args:
        token: Hugging Face access token to check.

    Returns:
        None. The verdict is printed.
    """
    # whoami() treats its token argument as optional (per the huggingface_hub
    # docs it then falls back to the locally saved token), so an empty value
    # could validate a different credential than the one forwarded to the pod.
    if not token:
        print("Hugging Face token is invalid. Error: no token provided.")
        return

    api = HfApi()
    try:
        user_info = api.whoami(token)
        print("Hugging Face token is valid.")
        print(f"User: {user_info['name']}")
    except Exception as e:
        print(f"Hugging Face token is invalid. Error: {e}")

# Check every credential in turn; each verifier prints its own verdict.
print("verifying tokens...")
for verifier, credential in (
    (verify_runpod_token, runpod.api_key),
    (verify_github_token, GITHUB_API_TOKEN),
    (verify_hf_token, HUGGINGFACE_TOKEN),
):
    verifier(credential)

# --- Original Script ---

In [2]:

!pip install -qqq runpod --progress-bar off

import runpod
from google.colab import userdata

# @markdown ---

# @markdown ## 🔍 Evaluation
# Model identifier: its last "/"-separated component is used in the pod name,
# and the full value is passed to the pod as the MODEL_ID environment variable.
MODEL_ID = "cstr/Spaetzle-v85-7b" # @param {type:"string"}
BENCHMARK = "nous" # @param ["nous", "eq-bench", "openllm", "lighteval"]

# @markdown For lighteval, select tasks as specified in the [readme](https://github.com/huggingface/lighteval?tab=readme-ov-file#usage) or in the list of [recommended tasks](https://github.com/huggingface/lighteval/blob/main/tasks_examples/recommended_set.txt).

# Passed to the pod as LIGHT_EVAL_TASK whichever benchmark is selected.
LIGHTEVAL_TASK = "leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|0|0" # @param {type:"string"}

# @markdown ---

# @markdown ## ☁️ Cloud GPU

# How these settings reach runpod.create_pod() below:
#   GPU -> gpu_type_id, NUMBER_OF_GPUS -> gpu_count (also sent as an env var),
#   CONTAINER_DISK -> container_disk_in_gb, CLOUD_TYPE -> cloud_type.
# REPO, TRUST_REMOTE_CODE, PRIVATE_GIST and DEBUG are passed to the pod as
# environment variables of the same name.
GPU = "NVIDIA RTX A6000" # @param ["NVIDIA A100 80GB PCIe", "NVIDIA A100-SXM4-80GB", "NVIDIA A30", "NVIDIA A40", "NVIDIA GeForce RTX 3070", "NVIDIA GeForce RTX 3080", "NVIDIA GeForce RTX 3080 Ti", "NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 3090 Ti", "NVIDIA GeForce RTX 4070 Ti", "NVIDIA GeForce RTX 4080", "NVIDIA GeForce RTX 4090", "NVIDIA H100 80GB HBM3", "NVIDIA H100 PCIe", "NVIDIA L4", "NVIDIA L40", "NVIDIA RTX 4000 Ada Generation", "NVIDIA RTX 4000 SFF Ada Generation", "NVIDIA RTX 5000 Ada Generation", "NVIDIA RTX 6000 Ada Generation", "NVIDIA RTX A2000", "NVIDIA RTX A4000", "NVIDIA RTX A4500", "NVIDIA RTX A5000", "NVIDIA RTX A6000", "Tesla V100-FHHL-16GB", "Tesla V100-PCIE-16GB", "Tesla V100-SXM2-16GB", "Tesla V100-SXM2-32GB"]
NUMBER_OF_GPUS = 1 # @param {type:"slider", min:1, max:8, step:1}
CONTAINER_DISK = 75 # @param {type:"slider", min:50, max:500, step:25}
CLOUD_TYPE = "COMMUNITY" # @param ["COMMUNITY", "SECURE"]
REPO = "https://github.com/mlabonne/llm-autoeval.git" # @param {type:"string"}
TRUST_REMOTE_CODE = False # @param {type:"boolean"}
PRIVATE_GIST = True # @param {type:"boolean"}
DEBUG = False # @param {type:"boolean"}

# @markdown ---

# Launch the RunPod pod for this evaluation.
# The benchmark choice, model, flags and tokens are handed to the pod as
# environment variables; GITHUB_API_TOKEN and HUGGINGFACE_TOKEN come from the
# token cell above.
pod_environment = {
    "BENCHMARK": BENCHMARK,
    "MODEL_ID": MODEL_ID,
    "REPO": REPO,
    "TRUST_REMOTE_CODE": TRUST_REMOTE_CODE,
    "PRIVATE_GIST": PRIVATE_GIST,
    "DEBUG": DEBUG,
    "GITHUB_API_TOKEN": GITHUB_API_TOKEN,
    "HUGGINGFACE_TOKEN": HUGGINGFACE_TOKEN,
    "LIGHT_EVAL_TASK": LIGHTEVAL_TASK,
    "NUMBER_OF_GPUS": NUMBER_OF_GPUS
}

# The pod is labelled with the model's short name (text after the last "/") and the benchmark.
pod_label = f"Eval {MODEL_ID.split('/')[-1]} on {BENCHMARK.capitalize()}"

pod = runpod.create_pod(
    name=pod_label,
    image_name="runpod/pytorch:2.0.1-py3.10-cuda11.8.0-devel-ubuntu22.04",
    gpu_type_id=GPU,
    cloud_type=CLOUD_TYPE,
    gpu_count=NUMBER_OF_GPUS,
    volume_in_gb=0,
    container_disk_in_gb=CONTAINER_DISK,
    template_id="au6nz6emhk",
    env=pod_environment,
)

print("Pod started: https://www.runpod.io/console/pods")

Pod started: https://www.runpod.io/console/pods
