In [None]:
!pip install transformers datasets torch

In [None]:
import gc
import os
import re
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Fetch the "college_mathematics" configuration of the cais/mmlu dataset and
# keep a handle on its "test" split for the evaluation cells below.
dataset = load_dataset(path="cais/mmlu", name="college_mathematics")
test_data = dataset["test"]

In [None]:
def load_model_and_tokenizer(model_name, token=None):
    """Load a causal language model and its tokenizer via ``from_pretrained``.

    Args:
        model_name: Model identifier passed unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        token: Optional Hugging Face access token. When omitted, the value of
            the ``HF_TOKEN`` environment variable is used; if that is unset as
            well, ``None`` is passed through to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Credentials must never be hardcoded in a notebook: anyone who can read the
    # file can use them. Any token that was previously committed here should be
    # treated as leaked and revoked.
    if token is None:
        token = os.environ.get("HF_TOKEN")

    # `token` is the current keyword for authentication; `use_auth_token` is deprecated.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=token)
    return model, tokenizer

In [None]:
def evaluate_model(model, tokenizer, test_data, prompt_template):
    """Measure answer accuracy and mean generation latency over ``test_data``.

    For every row a prompt is built from ``prompt_template``, the model
    generates up to 100 new tokens, and the text following an
    "answer is" / "Answer:" style phrase is compared with the correct choice.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable producing the model inputs (must include
            ``"input_ids"``) and exposing ``decode``.
        test_data: Indexable collection of rows with ``'question'``,
            ``'choices'`` and ``'answer'`` keys, where ``'answer'`` is the
            0-based index of the correct entry in ``'choices'``.
        prompt_template: Format string with ``{question}`` and ``{options}``
            placeholders.

    Returns:
        ``(accuracy, avg_inference_time)``: the fraction of rows answered
        correctly and the mean seconds spent in ``generate``. Both are ``0.0``
        for an empty ``test_data``.
    """
    total = len(test_data)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0

    answer_pattern = re.compile(
        r"(The correct answer is|Answer is|Answer\s*:|answer is)\s*(.+)",
        re.IGNORECASE,
    )
    total_time = 0.0
    correct = 0

    for i in range(total):
        # Read the row once instead of re-indexing the dataset for every field.
        row = test_data[i]
        options = list(row['choices'])
        # 'answer' is already a 0-based index into 'choices'; subtracting one
        # would map answer 0 to the last option and shift every other answer.
        correct_answer = options[row['answer']]

        prompt = prompt_template.format(question=row['question'], options="\n".join(options))
        inputs = tokenizer(prompt, return_tensors="pt")
        prompt_length = inputs["input_ids"].shape[1]

        start_time = time.perf_counter()
        outputs = model.generate(**inputs, max_new_tokens=100)
        total_time += time.perf_counter() - start_time

        # Causal LMs return the prompt followed by the continuation. Decode only
        # the newly generated tokens so the answer pattern cannot match text
        # that came from the question itself.
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        match = answer_pattern.search(generated_text)
        generated_answer = match.group(2).strip() if match else None

        # Lenient substring comparison in either direction (kept from the
        # original scoring rule): the model rarely repeats the option verbatim.
        if generated_answer and (correct_answer in generated_answer or generated_answer in correct_answer):
            correct += 1

    accuracy = correct / total
    avg_inference_time = total_time / total

    return accuracy, avg_inference_time

In [None]:
def run_evaluation(models, test_data):
    """Evaluate each model with the chain-of-thought prompt and collect the scores.

    Args:
        models: Mapping of display name to the identifier handed to
            ``load_model_and_tokenizer``.
        test_data: Dataset forwarded unchanged to ``evaluate_model``.

    Returns:
        Dict keyed by ``(model_name, 'chain_of_thought')`` whose values are
        ``(accuracy, avg_inference_time)`` tuples.
    """
    prompt_template = "Choose answer of given question from below options.\nQuestion: {question}\n{options}\nThink step by step."
    results = {}

    for model_name, model_path in models.items():
        print(f"Loading model: {model_name}")
        model, tokenizer = load_model_and_tokenizer(model_path)

        try:
            print(f"Evaluating {model_name} with Chain of thought prompt...")
            accuracy, avg_inference_time = evaluate_model(model, tokenizer, test_data, prompt_template)
        finally:
            # Drop the references before the next model is loaded; otherwise the
            # previous model stays alive while the next one is being created and
            # peak memory holds two models at once.
            del model, tokenizer
            gc.collect()

        results[(model_name, 'chain_of_thought')] = (accuracy, avg_inference_time)

        print(f"Accuracy: {accuracy}, Avg Inference Time: {avg_inference_time}")
        print()

    return results

In [None]:
# Display name -> Hugging Face Hub repository id, in evaluation order.
models = dict([
    ("Gemma-2B", "google/gemma-2b-it"),
    ("Phi-3.5-mini", "microsoft/Phi-3.5-mini-instruct"),
    ("Meta-Llama-3.1-8B", "meta-llama/Meta-Llama-3.1-8B-Instruct"),
])

In [None]:
# Run the full benchmark: every configured model against the test split.
results = run_evaluation(models=models, test_data=test_data)