# Evaluate base model on MMLU and MMLU_RU

## Setup and Imports

In [17]:
# Imports
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import tqdm
import torch_xla.core.xla_model as xm # This should now work

ImportError: cannot import name 'XLA_LIB' from partially initialized module 'torch_xla.core.xla_model' (most likely due to a circular import) (/usr/local/lib/python3.11/dist-packages/torch_xla/core/xla_model.py)

## Model Loading

In [None]:
def load_model_and_tokenizer(model_name, device=None):
    """Load a causal LM and its tokenizer and place the model on one device.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        device: Device to move the model to. When ``None`` (the default),
            the XLA device returned by ``xm.xla_device()`` is used.

    Returns:
        A ``(model, tokenizer, device)`` tuple; ``device`` is the device the
        model was moved to, so callers can send inputs to the same place.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Deliberately no `device_map='auto'`: that hands placement to Accelerate,
    # and moving a dispatched model again with `.to()` conflicts with its
    # hooks (and can raise when modules were offloaded). The model is loaded
    # normally and moved exactly once below.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    if device is None:
        # Default target: the XLA (TPU) device.
        device = xm.xla_device()
    model = model.to(device)
    return model, tokenizer, device

## Data Loading

In [None]:
def load_mmlu_data(subjects=None):
    """Load the MMLU-RU test split for the given subjects into one DataFrame.

    Args:
        subjects: Iterable of subject (config) names, or a single subject
            name as a string. When ``None``, a small built-in default list
            is used.

    Returns:
        A DataFrame with the rows of every successfully loaded subject,
        plus a ``subject`` column naming the subject each row came from.

    Raises:
        ValueError: If no subject could be loaded (empty ``subjects`` or
            every load failed).
    """
    if subjects is None:
        # You can modify this list to include only subjects you want to evaluate
        subjects = [
            'abstract_algebra',
            'college_mathematics',
            'machine_learning',
            'college_physics'
        ]
    elif isinstance(subjects, str):
        # A bare string would otherwise be iterated character by character.
        subjects = [subjects]

    dfs = []
    for subject in subjects:
        # Best-effort: a subject that fails to load is reported and skipped
        # so the remaining subjects can still be evaluated.
        try:
            dataset = load_dataset("NLPCoreTeam/mmlu_ru", subject, split="test")
            df = dataset.to_pandas()
        except Exception as e:
            print(f"Error loading {subject}: {e}")
            continue
        df['subject'] = subject
        dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            "No MMLU-RU subjects could be loaded; check the subject names "
            "and the errors printed above."
        )

    return pd.concat(dfs, ignore_index=True)

## Evaluation Functions

In [None]:
def format_prompt(row):
    """Build the multiple-choice prompt text for one dataset row.

    Reads ``row['question_ru']`` and ``row['choices_ru']``. Each choice is
    written on its own line, labelled with consecutive capital letters
    starting at ``A``, and the prompt ends with ``Answer:``.
    """
    header = f"Question: {row['question_ru']}\nChoices:\n"
    option_lines = [
        f"{chr(ord('A') + position)}. {option}\n"
        for position, option in enumerate(row['choices_ru'])
    ]
    return header + "".join(option_lines) + "Answer:"


def _extract_choice(generated):
    """Return the first standalone capital letter A-D in *generated*, or 'X'."""
    import re  # local import keeps this block self-contained

    # Word boundaries stop letters inside words (e.g. the "A" of "Answer")
    # from being mistaken for a choice.
    match = re.search(r"\b([ABCD])\b", generated)
    return match.group(1) if match else 'X'


def _answer_letter(answer):
    """Normalize a gold label (integer class index or letter) to a capital letter."""
    label = str(answer).strip()
    if label.isdigit():
        # NOTE(review): the MMLU-RU dataset appears to store `answer` as an
        # integer class index (0 -> A, 1 -> B, ...) -- confirm against the
        # dataset card.
        return chr(ord('A') + int(label))
    return label.upper()


def evaluate_model(model, tokenizer, df, device):
    """Evaluate the model on every row of *df* with greedy decoding.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        df: DataFrame with the columns read here: ``question_ru``,
            ``choices_ru``, ``answer`` and ``subject``.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        A DataFrame with one row per question and the columns ``subject``,
        ``question``, ``correct_answer`` (gold label as a letter),
        ``predicted_answer`` (``'X'`` when no choice letter was found) and
        ``correct``. The columns are present even when *df* is empty.
    """
    columns = ['subject', 'question', 'correct_answer', 'predicted_answer', 'correct']
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = format_prompt(row)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Greedy decoding; `temperature` is a sampling-only setting, so it
            # is not passed alongside do_sample=False.
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False
            )

        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) breaks whenever decoding does not reproduce the
        # prompt text exactly.
        generated = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        pred = _extract_choice(generated)
        gold = _answer_letter(row['answer'])

        results.append({
            'subject': row['subject'],
            'question': row['question_ru'],
            'correct_answer': gold,
            'predicted_answer': pred,
            'correct': pred == gold
        })

    return pd.DataFrame(results, columns=columns)


In [None]:
# 1. Load model
# `model_name` is the model identifier handed to the loader; it is also reused
# later to build the results file name.
model_name = "Qwen/Qwen2.5-3B-Instruct"  # e.g., "Qwen/Qwen2.5-7B-Instruct"
model, tokenizer, device = load_model_and_tokenizer(model_name)

In [None]:
# 2. Load data
# Called without arguments, so load_mmlu_data() uses its built-in default
# subject list.
eval_df = load_mmlu_data()

In [None]:
# 3. Run evaluation
# Returns a DataFrame with one result row per question in eval_df.
results_df = evaluate_model(model, tokenizer, eval_df, device)

In [None]:
# 4. Calculate and display results
# The mean of the boolean `correct` column is the fraction of correct
# answers, overall and within each subject.
accuracy = results_df['correct'].mean()
subject_accuracy = results_df['correct'].groupby(results_df['subject']).mean()

print(f"Overall accuracy: {accuracy:.2%}")
print("\nAccuracy by subject:")
print(subject_accuracy)

In [None]:
# 5. Save results
# '/' in the model name is replaced with '_' so the name can be embedded in a
# file name; the DataFrame index is not written.
results_df.to_csv(f"mmlu_results_{model_name.replace('/', '_')}.csv", index=False)