Test Gemini vs Ensemble for MMLU

In [1]:
from pprint import pprint
from datetime import datetime
from datasets import load_dataset

from utils import Annotate

In [2]:
# Fixed seed so the shuffle used for sampling below is repeatable.
seed = 42
# Date stamp for this run, formatted as YYYYMMDD.
now = f"{datetime.now():%Y%m%d}"

In [3]:
# Load the "all" configuration of cais/mmlu, then keep a small development
# sample: the test split is shuffled with the notebook seed and its first
# 20 rows are retained.
dataset = (
    load_dataset("cais/mmlu", "all")['test']
    .shuffle(seed=seed)
    .select(range(20))
)

# User-provided, free-text description of the dataset.
# It is handed to the prompt formatting step as the `description` keyword.
DESCRIPTION = """
This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge.
The test spans subjects in the humanities, social sciences, hard sciences, and other areas that are important for some people to learn.
To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability.
This covers 57 subjects  across STEM, the humanities, the social sciences, and more. 
It ranges in difficulty from an elementary level to an advanced professional level, and it tests both world knowledge and problem solving ability. 
Subjects range from traditional areas, such as mathematics and history, to more specialized areas like law and ethics.
"""


In [4]:
# Prompt template for a single multiple-choice question.
# Placeholders filled via str.format:
#   {datapoint} - the question text
#   {labels}    - the candidate answers
# The <CHOICES> section is closed with a matching </CHOICES> tag so the
# delimiters referenced in the instructions are consistent.
gemini_prompt_template = """
<QUESTION>
{datapoint}
</QUESTION>
------------

<CHOICES>
{labels}
</CHOICES>
------------

INSTRUCTION:
- read the above question carefully.
- you are given 4 choices separated by comma in <CHOICES>.
- take your time and pick the precise correct answer from <CHOICES> for the given <QUESTION>.
- remember that there is always only one correct answer.
- return the exact correct answer from <CHOICES>. Don't provide explanations.
"""

In [5]:
# Render one prompt per sampled row.
# NOTE(review): the template defined in this notebook has no {description}
# field, so the `description` keyword is accepted by str.format but does not
# appear in the rendered text - confirm whether that is intended.
prompt = [
    gemini_prompt_template.format(
        description=DESCRIPTION,
        datapoint=row['question'],
        labels=row['choices'],
    )
    for row in dataset
]
print(len(prompt))

20


In [10]:
# Models to query in this run. "palm" and "gemini" were listed here as
# commented-out entries; only "claude" is active.
models = ["claude"]

# Annotate is imported from the local `utils` module; this instance's
# `classification` coroutine is awaited in a later cell to run the prompts.
ann = Annotate()


In [11]:
# Send every prompt to each model in `models`; the result is indexed by model name below.
# NOTE(review): in the recorded run every claude task failed with
# "'Message' object has no attribute 'text'". Presumably the response parsing
# inside utils.Annotate needs fixing - confirm there, it cannot be fixed from this cell.
r = await ann.classification(prompt, models=models)

Creating tasks: 40it [00:00, 153076.79it/s]           
Gathering claude results:   0%|          | 0/20 [00:00<?, ?it/s]2024-05-28 22:23:54,813/Annotate[ERROR]: claude Task 0 failed: 'Message' object has no attribute 'text'
Gathering claude results:   5%|▌         | 1/20 [00:26<08:22, 26.45s/it]2024-05-28 22:23:54,815/Annotate[ERROR]: claude Task 1 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,816/Annotate[ERROR]: claude Task 2 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,817/Annotate[ERROR]: claude Task 3 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,818/Annotate[ERROR]: claude Task 4 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,818/Annotate[ERROR]: claude Task 5 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,819/Annotate[ERROR]: claude Task 6 failed: 'Message' object has no attribute 'text'
2024-05-28 22:23:54,820/Annotate[ERROR]: claude Task 7 failed: 'Message' object h

In [12]:
# Inspect the per-prompt outputs returned for the "claude" model.
# In the recorded run all 20 entries are None, matching the task failures logged above.
r['claude']

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
# Ground-truth answer text for each sampled row: use the row's `answer` value
# as an index into that row's `choices`.
# Both columns are read once and walked in lockstep, instead of re-reading the
# whole `choices` column for every row.
result = [
    row_choices[answer_idx]
    for row_choices, answer_idx in zip(dataset['choices'], dataset['answer'])
]
pprint(result)