In [11]:
!pip install transformers datasets torch requests together

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import re
import requests
import json
import os
from together import Together

In [13]:
together_api_url = "https://api.together.ai/v1/inference"
api_key = "7aef0d2616f79d18ea5200186b3182367fa9bbf330bfcc889c1f1e8f1f1d26e1"

In [14]:
dataset = load_dataset("cais/mmlu", "college_mathematics")
test_data = dataset['test']

In [15]:
def generate_response(prompt, api_key):
    client = Together(api_key=api_key)
    
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    
    return response.choices[0].message.content


In [16]:
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token="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")
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token="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")
    return model, tokenizer

In [17]:
def evaluate_model(model, tokenizer, test_data, prompt_template):
    total_time = 0
    correct = 0
    total = len(test_data)

    for i in range(total):
        question = test_data[i]['question']
        options = [test_data[i]['choices'][j] for j in range(4)]
        correct_answer = test_data[i]['choices'][test_data[i]['answer'] - 1]

        prompt = prompt_template.format(question=question, options="\n".join(options))

        inputs = tokenizer(prompt, return_tensors="pt")

        start_time = time.time()
        outputs = model.generate(**inputs, max_new_tokens=100)
        inference_time = time.time() - start_time
        total_time += inference_time

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        match = re.search(r"(The correct answer is|Answer is|Answer\s*:|answer is)\s*(.+)", generated_text, re.IGNORECASE)
        if match:
            generated_answer = match.group(2).strip()
        else:
            generated_answer = None
        
        if generated_answer and (correct_answer in generated_answer or generated_answer in correct_answer):
            correct += 1
    
    accuracy = correct / total
    avg_inference_time = total_time / total

    return accuracy, avg_inference_time

In [18]:
def run_evaluation(models, test_data):
    results = {}
    prompt_template = "Choose the answer to the given question from below options.\nQuestion: {question}\n{options}"

    for model_name, model_path in models.items():
        print(f"Loading model: {model_name}")
        model, tokenizer = load_model_and_tokenizer(model_path)

        print(f"Evaluating {model_name} with Zero Shot prompt...")
        accuracy, avg_inference_time = evaluate_model(model, tokenizer, test_data, prompt_template)
        results[(model_name, 'chain_of_thought')] = (accuracy, avg_inference_time)

        print(f"Accuracy: {accuracy}, Avg Inference Time: {avg_inference_time}")
        print()

    return results

In [19]:
models = {
#     "Gemma-2B": "google/gemma-2b-it",
#     "Phi-3.5-mini": "microsoft/Phi-3.5-mini-instruct",
    "Meta-Llama-3.1-8B": "meta-llama/Meta-Llama-3.1-8B-Instruct"
}

In [20]:
results = run_evaluation(models, test_data)

Loading model: Meta-Llama-3.1-8B


Downloading shards: 100%|██████████| 4/4 [00:43<00:00, 10.75s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Evaluating Meta-Llama-3.1-8B with Zero Shot prompt...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Accuracy: 0.1, Avg Inference Time: 29.002120382785797

