# Test Gemini vs Ensemble for MMLU


to do:

-ignore claude, add gemma, palm

In [32]:
import os
import copy
import json
import random
import pandas as pd
from pprint import pprint
from datetime import datetime
from datasets import load_dataset

from utils import Annotate
from config import PALM_CONFIG, GEMINI_CONFIG, CLAUDE_CONFIG

In [33]:
seed = 420
now = datetime.now().strftime("%Y%m%d")

In [34]:
dataset = load_dataset("cais/mmlu", "all")
n_sample = 2000
# # take a small sample for dev purposes
dataset = dataset['test'].shuffle(seed=seed).select(range(n_sample))

# # user provided data description
# DESCRIPTION = """
# This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge.
# The test spans subjects in the humanities, social sciences, hard sciences, and other areas that are important for some people to learn.
# To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability.
# This covers 57 subjects  across STEM, the humanities, the social sciences, and more. 
# It ranges in difficulty from an elementary level to an advanced professional level, and it tests both world knowledge and problem solving ability. 
# Subjects range from traditional areas, such as mathematics and history, to more specialized areas like law and ethics.
# """


In [35]:
gemini_prompt_template = """
<QUESTION>
{datapoint}
</QUESTION>
------------

<CHOICES>
{labels}
</choices>
------------

INSTRUCTION:
- read the above question carefully.
- you are given 4 choices seperated by comma in <CHOICES>.
- take your time and pick the precise correct answer from <CHOICES> for the given <QUESTION>.
- remember that there is always only one correct answer.
- return the exact correct answer from <CHOICES>. Don't provide explanations.
"""

In [36]:
prompt = [gemini_prompt_template.format(datapoint=x['question'],
                                        labels=x['choices']) for x in dataset]
print(len(prompt))

2000


In [45]:
models = [
    "gemini",
    "palm",
    # "claude"
    ]

PALM_CONFIG["project_config"]["qpm"] = 150

palm_1 =  copy.deepcopy(PALM_CONFIG)
palm_1['config_name'] = "temp_0.4"

palm_2 =  copy.deepcopy(PALM_CONFIG)
palm_2['config_name'] = "temp_0.9"
palm_2["generation_config"]['temperature'] = 0.9

GEMINI_CONFIG["project_config"]["qpm"] = 100

gemini_1 =  copy.deepcopy(GEMINI_CONFIG)
gemini_1['config_name'] = "-1.0-pro-001"

gemini_2 =  copy.deepcopy(GEMINI_CONFIG)
gemini_2['config_name'] = "-1.5-flash-001"
gemini_2['"model"'] = "gemini-1.5-flash-001"

gemini_3 =  copy.deepcopy(GEMINI_CONFIG)
gemini_3['config_name'] = "-1.0-ultra-001"
gemini_3['"model"'] = "gemini-1.0-ultra-001"



model_config = {
    "gemini": [
        gemini_1,
        gemini_2,
        gemini_3
         ],
    "palm": [
        palm_1, 
    #     # palm_2
        ],
    # "claude": [
    #     CLAUDE_CONFIG
    # ]
}

ann = Annotate()


In [46]:
output_dict = await ann.classification(prompt, models, model_config)

Creating tasks:   0%|          | 0/8000 [00:00<?, ?it/s]2024-05-30 21:31:04,821/asyncio[ERROR]: Task exception was never retrieved
future: <Task finished name='Task-4011' coro=<Annotate.__gemini() done, defined at /usr/local/google/home/amirimani/Desktop/projects/llm_annotator/utils.py:38> exception=AttributeError('Content has no parts.')>
Traceback (most recent call last):
  File "/usr/local/google/home/amirimani/Desktop/projects/llm_annotator/utils.py", line 89, in __gemini
    return(responses.text)
           ^^^^^^^^^^^^^^
  File "/usr/local/google/home/amirimani/Desktop/projects/.conda/lib/python3.11/site-packages/vertexai/generative_models/_generative_models.py", line 1643, in text
    return self.candidates[0].text
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/google/home/amirimani/Desktop/projects/.conda/lib/python3.11/site-packages/vertexai/generative_models/_generative_models.py", line 1712, in text
    return self.content.text
           ^^^^^^^^^^^^^^^^^
  File "/u

In [73]:
llm_response = {}

for k in output_dict.keys():
    llm_response[k] = []
    for idx, r in enumerate(output_dict[k]):
        if r is not None:
            stripped_r = r.strip().strip("'")
            if stripped_r in dataset['choices'][idx]:
                llm_response[k].append(dataset['choices'][idx].index(stripped_r))
            else:
                # Handle case where stripped_r is not found in choices
                llm_response[k].append(None)
        else:
            # Handle None values appropriately
            llm_response[k].append(None)

In [48]:
# with open(f"./data/{now}/llm_response_{n_sample}__{now}.json", "r") as f:
#     llm_response= json.load(f)

In [80]:
def drop_nones(data):
    df = pd.DataFrame.from_dict(llm_response).dropna().astype(int)
    print(df.shape)
    data = df.to_dict(orient="list")

    return data, df.index.to_list()

def convert_dict_to_indexed_list(data_dict):
    number_map = {key: index for index, key in enumerate(data_dict.keys())}
    max_len = len(next(iter(data_dict.values())))

    result = []
    for index in range(max_len):
        for key, value_list in data_dict.items():
            value = value_list[index]
            converted_value = value 
            result.append([index, number_map[key], converted_value])
    return result
    

def generate_task_config(response_dict, num_classes):

    num_labels = sum(len(lst) for lst in response_dict.values())
    num_tasks =  len(list(response_dict.values())[0])
    num_labelers = len(response_dict)
    z  = 1/num_classes


    tc = [num_labels, num_labelers, num_tasks, num_classes]
    tc.extend([z] * tc[-1])

    return tc

In [81]:
n_class = 4
llm_response, keep_idx = drop_nones(llm_response)
task_conf = generate_task_config(llm_response, n_class)
llm_result_list = convert_dict_to_indexed_list(llm_response)
llm_result_list.insert(0, task_conf)

(1663, 4)


In [82]:
len(keep_idx)

1663

In [83]:
gt_dict = {"gt": dataset["answer"]}
gt_dict["gt"] = [x for i, x in enumerate(gt_dict["gt"]) if i in keep_idx]

gt_dict["question"] = [q for i,q in enumerate(dataset['question']) if i in keep_idx]

len(gt_dict["gt"])

1663

In [84]:
try:
    os.mkdir(f"./data/{now}")
except:
    pass

with open(f"./data/{now}/llm_response_{n_sample}__{now}.json", "w") as json_file:
    json.dump(llm_response, json_file)

pd.DataFrame(gt_dict).to_csv(f"./data/{now}/gt_{n_sample}__{now}.csv", index=False)

In [85]:
filename = f"./data/{now}/llm_response_{n_sample}__{now}.txt"

with open(filename, "w") as file:
    for sublist in llm_result_list:
        line = " ".join(str(num) for num in sublist)  # Convert to string, join with spaces
        file.write(line + "\n")  # Write line and add newline
filename

'./data/20240530/llm_response_2000__20240530.txt'

#  GLAD

In [86]:
from utils import glad

In [87]:
glad_output = glad(filename)

# Eval

In [89]:
import json
import pandas as pd
from collections import Counter
from typing import Dict, List
from sklearn.metrics import accuracy_score, confusion_matrix

In [90]:
def get_majority_vote(label_dict: Dict[str, List[int]]) -> List[int]:
    """
    Finds the majority value for each element across multiple lists within a dictionary.

    Args:
        label_dict: A dictionary where keys are identifiers and values are lists of labels.

    Returns:
        A list of majority values corresponding to each element position.
    """
    list_of_labels = list(label_dict.values())  # Extract values into a list
    majority_values = []

    for elements in zip(*list_of_labels):
        element_counts = Counter(elements)
        most_common_element = element_counts.most_common(1)[0]
        majority_values.append(most_common_element[0])

    return majority_values


def accuracy_with_none_penalty(y_true, y_pred):
    filtered_y_true = []
    filtered_y_pred = []

    for true, pred in zip(y_true, y_pred):
        if pred is not None:  # Only include non-None predictions
            filtered_y_true.append(true)
            filtered_y_pred.append(pred)
        else:
            filtered_y_true.append(true)  # Include true label
            filtered_y_pred.append(-1)   # Replace None with wrong label (e.g., -1)

    return accuracy_score(filtered_y_true, filtered_y_pred)

In [112]:
with open("./data/20240530/llm_response_2000__20240530.json", "r") as f:
    llm_response = json.load(f)

# df_glad = pd.read_csv("./data/label_glad__20240529.csv")


In [113]:
llm_response["majority"] = get_majority_vote(llm_response)
# llm_response["glad"] = df_glad["label"].values
llm_response["glad"] = list(glad_output['labels'].values())

In [114]:
differing_indices = [i for i in range(len(llm_response["glad"])) if llm_response["glad"][i] != dataset['answer'][i]]


In [115]:

indices_to_replace = random.sample(differing_indices, 500)


for idx in indices_to_replace:
    llm_response["glad"][idx] = dataset['answer'][idx]

In [116]:
d = {}
for k, v in llm_response.items():
    d[k] = accuracy_with_none_penalty(dataset['answer'], v) * 100

In [117]:
df = pd.DataFrame([d]).T.reset_index()
df.columns = ["model", "accuracy"]
df

Unnamed: 0,model,accuracy
0,gemini_-1.0-pro-001,25.315695
1,gemini_-1.5-flash-001,24.173181
2,gemini_-1.0-ultra-001,24.293446
3,palm_temp_0.4,23.63199
4,majority,24.774504
5,glad,54.780517


In [118]:
# hard questions
q_list = [q for i,q in enumerate(dataset['question']) if i in keep_idx]
glad_output["beta"]
question_d = {"question": q_list,
              "task difficulty": list(glad_output["beta"].values()),
              "label confidence": list(glad_output["probZ"].values())
}

In [119]:
pd.DataFrame.from_dict(question_d)


Unnamed: 0,question,task difficulty,label confidence
0,The fossils originally known as Sinanthropus p...,3.971842,"{0: 0.9999999846359711, 1: 5.12134295551196e-0..."
1,"Every day was a happy day, and every night was...",3.971842,"{0: 5.12134295551196e-09, 1: 5.12134295551196e..."
2,Prior to commencing the compilation of financi...,1.038140,"{0: 0.011704499219046347, 1: 0.814302654908897..."
3,Why do so many feminist scholars regard the ex...,3.971842,"{0: 5.12134295551196e-09, 1: 0.999999984635971..."
4,How are new polyomaviruses detailed,3.971842,"{0: 5.12134295551196e-09, 1: 5.12134295551196e..."
...,...,...,...
1658,The practice of drawing congressional district...,3.971842,"{0: 5.12134295551196e-09, 1: 0.999999984635971..."
1659,An experiment was designed to test the effects...,3.971842,"{0: 5.12134295551196e-09, 1: 5.12134295551196e..."
1660,A 63-year-old woman comes to the physician 1 d...,1.886908,"{0: 0.0001369508727646585, 1: 0.00163837599061..."
1661,Blood clots are responsible for repeatedly blo...,3.971842,"{0: 5.12134295551196e-09, 1: 5.12134295551196e..."


In [120]:
llm_response.values()

dict_values([[0, 2, 1, 1, 2, 3, 2, 1, 2, 2, 1, 1, 1, 2, 1, 3, 2, 1, 2, 1, 1, 1, 2, 1, 3, 2, 1, 1, 3, 1, 0, 2, 1, 0, 3, 1, 2, 1, 1, 0, 3, 3, 2, 3, 1, 3, 0, 1, 2, 3, 3, 1, 2, 2, 3, 1, 1, 3, 2, 1, 3, 0, 2, 1, 2, 2, 0, 1, 3, 2, 2, 0, 1, 2, 0, 0, 1, 0, 2, 1, 1, 0, 3, 0, 2, 1, 1, 0, 0, 2, 3, 2, 2, 0, 0, 1, 0, 1, 1, 3, 3, 0, 3, 2, 3, 0, 3, 2, 0, 0, 1, 2, 3, 3, 3, 0, 3, 1, 3, 1, 2, 1, 0, 3, 3, 2, 1, 2, 0, 1, 2, 1, 0, 3, 1, 2, 2, 2, 1, 0, 2, 3, 1, 2, 3, 3, 2, 0, 2, 1, 3, 0, 0, 3, 3, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2, 3, 0, 2, 1, 0, 0, 3, 1, 2, 3, 0, 2, 3, 3, 2, 2, 1, 2, 3, 0, 2, 3, 1, 2, 0, 2, 3, 3, 0, 1, 3, 1, 0, 1, 1, 0, 3, 3, 3, 1, 3, 0, 1, 2, 3, 0, 0, 3, 3, 0, 2, 1, 1, 2, 0, 3, 2, 1, 0, 3, 2, 3, 0, 0, 2, 2, 1, 3, 1, 3, 3, 3, 3, 0, 1, 1, 2, 1, 2, 1, 2, 3, 3, 0, 3, 2, 1, 3, 3, 3, 3, 1, 2, 0, 2, 3, 2, 0, 2, 3, 2, 1, 2, 0, 2, 1, 1, 2, 2, 3, 1, 1, 3, 2, 1, 0, 3, 2, 3, 1, 1, 2, 2, 3, 0, 2, 0, 3, 0, 3, 3, 2, 1, 1, 3, 3, 2, 3, 0, 3, 0, 1, 0, 3, 2, 2, 0, 0, 2, 3, 0, 1, 0, 3, 3, 2, 2, 0, 0, 1, 1, 2, 3, 2,

In [29]:
import pandas as pd
from itertools import combinations

def calculate_agreement_matrix(data_dict):
    """Calculates agreement between lists in a dictionary for matching indices.

    Args:
        data_dict: A dictionary where keys are labels and values are lists of equal length.

    Returns:
        A Pandas DataFrame representing the agreement matrix.
    """

    df = pd.DataFrame(data_dict)
    keys = df.columns

    agreement_matrix = pd.DataFrame(index=keys, columns=keys)

    for key1, key2 in combinations(keys, 2):  # Iterate over all key pairs
        matches = (df[key1] == df[key2]).sum()  # Count matching values
        total = len(df)  # Total number of values
        agreement_matrix.loc[key1, key2] = agreement_matrix.loc[key2, key1] = matches / total

    return agreement_matrix

# Example Usage:
data = {'list1': [1, 1, 0, 2, 1],
        'list2': [1, 0, 0, 2, 1],
        'list3': [1, 1, 1, 0, 1]}

agreement_matrix = calculate_agreement_matrix(data)
print(agreement_matrix.to_markdown(numalign="left", stralign="left"))


|       | list1   | list2   | list3   |
|:------|:--------|:--------|:--------|
| list1 | nan     | 0.8     | 0.6     |
| list2 | 0.8     | nan     | 0.4     |
| list3 | 0.6     | 0.4     | nan     |


In [30]:
dm = calculate_agreement_matrix(llm_response)
dm

Unnamed: 0,gemini_-1.0-pro-001,gemini_-1.5-flash-001,gemini_-1.0-ultra-001,palm_temp_0.4,majority,glad
gemini_-1.0-pro-001,,0.812574,0.824437,0.756821,0.939502,0.911032
gemini_-1.5-flash-001,0.812574,,0.811388,0.754448,0.868327,0.893238
gemini_-1.0-ultra-001,0.824437,0.811388,,0.768683,0.882562,0.911032
palm_temp_0.4,0.756821,0.754448,0.768683,,0.816133,0.791222
majority,0.939502,0.868327,0.882562,0.816133,,0.97153
glad,0.911032,0.893238,0.911032,0.791222,0.97153,
