In [3]:
import re
import torch
import json
from tqdm import tqdm
from datasets import load_dataset, get_dataset_config_names
from transformers import AutoModelForCausalLM, AutoTokenizer

def evaluate_model_on_dataset(model_name: str, dataset_name: str, subsets: [], num_samples: int):
    """
    model_name: Hugging Face에서 사용할 모델 이름 (예: "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct")
    dataset_name: 데이터셋 이름 (예: "HAERAE-HUB/KMMLU")
    num_samples: 각 subset에서 평가할 샘플 수
    """
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)    

    result = {}
    option_map = {1: "A", 2: "B", 3: "C", 4: "D"}
    
    for subset in tqdm(subsets, desc="Evaluating subsets"):
        subset_result = {"predict": []}
        try:
            ds = load_dataset(dataset_name, subset)
        except Exception as e:
            subset_result["predict"].append(f"데이터셋 로드 중 오류 발생: {e}")
            result[subset] = subset_result
            continue
        
        test_set = ds["test"]
        if test_set.num_rows < num_samples:
            samples = test_set
        else:
            samples = test_set.select(range(num_samples))
        
        correct = 0
        total = len(samples)
        
        for idx, sample in enumerate(samples):
            question = sample["question"]
            option_A = sample["A"]
            option_B = sample["B"]
            option_C = sample["C"]
            option_D = sample["D"]
            
            if isinstance(sample["answer"], int):
                true_answer = option_map.get(sample["answer"], "")
            else:
                true_answer = sample["answer"].strip().upper()
            
            prompt = (
                f"Question: {question}\n"
                f"A. {option_A}\n"
                f"B. {option_B}\n"
                f"C. {option_C}\n"
                f"D. {option_D}\n"
                f"Answer:"
            )
            
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
            output = model.generate(
                input_ids,
                max_new_tokens=20,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
            )
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            
            answer_text = generated_text.split("Answer:")[-1].strip()
            predicted = ""
            for char in answer_text:
                if char.upper() in ["A", "B", "C", "D"]:
                    predicted = char.upper()
                    break
            
            if predicted == true_answer:
                correct += 1
            
            predict_str = f"[{idx+1:03d}] 정답: {true_answer} / 예측: {predicted}"
            subset_result["predict"].append(predict_str)
        
        accuracy = correct / total if total > 0 else 0
        subset_result["accuracy"] = accuracy
        result[subset] = subset_result

    all_accuracies = [v["accuracy"] for v in result.values() if "accuracy" in v]
    mean_accuracy = sum(all_accuracies) / len(all_accuracies) if all_accuracies else 0
    print(f"Mean Accuracy across all subsets: {mean_accuracy:.2%}")
    
    # 결과를 JSON 파일로 저장
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
        
    return result

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"
dataset_name = "HAERAE-HUB/KMMLU"
subsets = get_dataset_config_names(dataset_name)
num_samples = 3000

print(f"*** Evaluation of {dataset_name} ***")
results = evaluate_model_on_dataset(model_name, dataset_name, subsets, num_samples)
for subset in subsets:
    print(f"Accuracy on {subset} : {results[subset]['accuracy']:.2%}")

*** Evaluation of HAERAE-HUB/KMMLU ***


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.82s/it]
Evaluating subsets: 100%|██████████████████████████████████████████████████████████████| 45/45 [29:33<00:00, 39.40s/it]

Mean Accuracy across all subsets: 41.89%
Accuracy on Accounting : 37.00%
Accuracy on Agricultural-Sciences : 33.20%
Accuracy on Aviation-Engineering-and-Maintenance : 44.10%
Accuracy on Biology : 35.60%
Accuracy on Chemical-Engineering : 41.10%
Accuracy on Chemistry : 40.67%
Accuracy on Civil-Engineering : 41.30%
Accuracy on Computer-Science : 66.30%
Accuracy on Construction : 36.40%
Accuracy on Criminal-Law : 30.00%
Accuracy on Ecology : 49.80%
Accuracy on Economics : 40.77%
Accuracy on Education : 46.00%
Accuracy on Electrical-Engineering : 36.40%
Accuracy on Electronics-Engineering : 53.60%
Accuracy on Energy-Management : 30.40%
Accuracy on Environmental-Science : 34.70%
Accuracy on Fashion : 44.30%
Accuracy on Food-Processing : 40.40%
Accuracy on Gas-Technology-and-Engineering : 39.80%
Accuracy on Geomatics : 42.20%
Accuracy on Health : 50.00%
Accuracy on Industrial-Engineer : 44.90%
Accuracy on Information-Technology : 64.20%
Accuracy on Interior-Architecture-and-Design : 54.00%
A




In [7]:
# results.json 파일 불러오기
with open("results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

total_correct = 0
total_samples = 0

# 각 subset에 대해, 맞춘 샘플 수(accuracy * 샘플 수)와 전체 샘플 수를 누적합산
for subset, data in results.items():
    subset_samples = len(data["predict"])  # 해당 subset의 총 샘플 수
    subset_accuracy = data["accuracy"]       # 해당 subset의 정확도
    total_correct += subset_samples * subset_accuracy
    total_samples += subset_samples

# 전체 정확도 계산 (benchmark score)
kmmlu_score = total_correct / total_samples

print("KMMLU Benchmark Score: {:.2%}".format(kmmlu_score))


KMMLU Benchmark Score: 43.16%


### Etc.

```
conda env remove -n exaone_env
conda create -n exaone_env python=3.9 -y
conda activate exaone_env
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
pip install transformers ipykernel accelerate
python -m ipykernel install --user --name exaone_env --display-name EXAONE
```

In [None]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:
import torch
print("PyTorch에서 사용중인 CUDA 버전:", torch.version.cuda)

In [None]:
import transformers
print(transformers.__version__)

In [13]:
from datasets import load_dataset, get_dataset_config_names

# 1. 사용 가능한 config (subset) 목록 불러오기
dataset_name = "HAERAE-HUB/HAE_RAE_BENCH_2.0"
configs = get_dataset_config_names(dataset_name)
print("사용 가능한 config (subset) 목록:\n", configs)
print("Subset의 수", len(configs))

# 2. 각 config별로 데이터셋 로드 및 test split 샘플 수 확인
for config in configs:
    try:
        dataset = load_dataset(dataset_name, config)
        if "test" in dataset:
            test_dataset = dataset["test"]
            print(config, test_dataset.num_rows)
        else:
            print("해당 config에 test split이 존재하지 않습니다.")
    except Exception as e:
        print(f"데이터셋 로드 중 오류 발생: {e}")

사용 가능한 config (subset) 목록:
 ['date_understanding', 'context_definition_alignment', 'proverb_unscrambling', '2_digit_multiply', '3_digit_subtract', 'gsm8k_ko']
Subset의 수 6
date_understanding 475
context_definition_alignment 439
proverb_unscrambling 672
2_digit_multiply 1000
3_digit_subtract 1000
gsm8k_ko 250


In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     trust_remote_code=True,
#     device_map="auto"
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Choose your prompt
# prompt = "스스로를 자랑해 봐"       # Korean example

# messages = [
#     {"role": "system", 
#      "content": "You are EXAONE model from LG AI Research, a helpful assistant."},
#     {"role": "user", "content": prompt}
# ]
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_tensors="pt"
# )

# output = model.generate(
#     input_ids.to("cuda"),
#     eos_token_id=tokenizer.eos_token_id,
#     max_new_tokens=128,
#     do_sample=False,
# )

# print(tokenizer.decode(output[0]))