# TODO:
### NOW:
- ~~enforce output format for gemini~~
- llama, gpt, ~~claude~~
   - send concurrent calls to all models at once
- ~~add evaluation if there is a golden set for individual model~~
- aggregation strategy and tie breaking
   - multiclass classification: ~~majority vote~~, Bayesian approach with GT
      - provide X labels per class
- repeat the same thing for multi-label/ner

### LATER:
- secret management
- update readme
- add images



### nice things to do:
- add tqdm to asyncio calls
- proper logging

# Annotate

In [1]:
from utils import Annotate
from datasets import load_dataset

# Fixed random seed so the dataset shuffle is reproducible between runs.
seed = 42

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prompt template filled once per datapoint with str.format. Placeholders:
#   {description} - free-text description of the dataset
#   {datapoint}   - the text to be labelled
#   {labels}      - the allowed label set
# Despite the "gemini" in its name, the formatted prompts are passed to every
# model listed in VALID_MODELS.
# NOTE(review): the instruction text contains typos ("familirize",
# "dadatapoint"). They are left as-is here because editing the prompt changes
# what the models receive; fix deliberately and re-run the annotation.
gemini_prompt_template = """
<data_description>
{description}
</data_description>
-----------

<context>
{datapoint}
</context>
------------

<labels>
{labels}
</labels>
------------

INSTRUCTION:
- familirize yourself with the data using data_description
- read the context carefully. this is the data point you need to label.
- take your time and label the dadatapoint with the most appropriate option using the provided labels.
- return the result as a single label from the <labels>. Don't provide explanations
"""

In [4]:
# User-provided description of the data; interpolated into each prompt.
DESCRIPTION = """
This is a dataset for binary sentiment classification.
It contains highly polar yelp reviews.
Negative polarity is class 0, and positive class 1.
"""

# Labels the models may choose from.
LABEL_SET = [0, 1]

# Source: https://huggingface.co/datasets/yelp_polarity
dataset = load_dataset("yelp_polarity", split="train")

# Development subset: shuffle with the fixed seed, then keep the first 100 rows.
shuffled = dataset.shuffle(seed=seed)
dataset_sample = shuffled.select(range(100))

In [20]:
# Build one fully formatted prompt per review for the first 20 sampled texts.
texts = dataset_sample["text"][:20]
prompt = [
    gemini_prompt_template.format(
        description=DESCRIPTION,
        datapoint=text,
        labels=LABEL_SET,
    )
    for text in texts
]
print(len(prompt))

20


In [21]:
# Annotation client; its `classification` coroutine is awaited below to
# obtain labels for the prompts.
ann = Annotate()

# Model identifiers passed as `model=` to ann.classification; also the keys of
# the per-model results dict.
VALID_MODELS = ["gemini", "claude"]

In [22]:
d = {}
for m in VALID_MODELS:
    d[m] = await ann.classification(prompt, model=m)

[1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]
[1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]


In [23]:
import json
from pathlib import Path

# Persist the raw per-model labels so the run can be inspected later.
output_path = Path("./data/output/20_sample.json")

# Create the output directory on first use; opening the file alone raises
# FileNotFoundError when ./data/output does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the file identical across platforms.
with output_path.open("w", encoding="utf-8") as json_file:
    json.dump(d, json_file, indent=4)

In [24]:
# Collect the per-model label lists in VALID_MODELS order, so adding or
# removing a model does not require editing this cell.
all_results = [d[m] for m in VALID_MODELS]

## evaluate

In [None]:
from utils import Evaluate

# NOTE(review): this name shadows Python's builtin `eval` for the rest of the
# session; consider renaming (e.g. `evaluator`) together with all of its uses.
eval = Evaluate()

In [None]:
# Toy inputs for trying out the evaluator: three label lists of five items each.
# NOTE(review): this rebinds `all_results`, discarding the model outputs
# collected earlier. y_labels has three entries while the lists only contain
# the values 0 and 1 - confirm this is intended.
y_labels = ["a", "b", "c"]
all_results = [[1, 1, 0, 0, 1], [0, 0, 0, 1, 1], [0, 1, 0, 0, 1]]

In [None]:
# Combine the label lists with the "majority" strategy and ask for a
# visualization; presumably y_labels supplies the class names shown in it -
# verify against Evaluate.classification.
eval.classification(all_results, strategy="majority", visualize=True, y_labels=y_labels)

# Dev

In [28]:
import numpy as np


# Settings for the iterative procedure prototyped in this section
# (presumably an iteration cap and a convergence tolerance - confirm on use).
max_iter, tol = 100, 1e-4

In [27]:
# Number of labels returned by each model, keyed by model name.
label_counts = {m: len(d[m]) for m in VALID_MODELS}

# ASSUMPTION: all llms annotate every datapoint. Fail fast if a model returned
# a different number of labels instead of silently using one model's count.
if len(set(label_counts.values())) > 1:
    raise ValueError(
        f"models returned differing numbers of labels: {label_counts}"
    )

# Number of datapoints (0 when no models are configured) and of labelers.
n_data = next(iter(label_counts.values()), 0)
n_labelers = len(VALID_MODELS)

In [30]:
# Initialize parameters (randomly for demonstration).
# Z: one random boolean per datapoint, True where the uniform draw exceeds 0.5.
Z = np.greater(np.random.rand(n_data), 0.5)


# Random 2-element vector rescaled so its entries sum to 1
# (presumably the prior over the two classes - confirm on use).
alpha = np.random.rand(2)
alpha = alpha / alpha.sum()
alpha


array([0.85167198, 0.14832802])

In [31]:
# Random 2x2 matrix with every row rescaled to sum to 1.
beta = np.random.rand(2, 2)
row_totals = beta.sum(axis=1, keepdims=True)
beta = beta / row_totals
beta

array([[0.53970528, 0.46029472],
       [0.23374726, 0.76625274]])