In [63]:
import yaml


def load_config(filename: str) -> object:
    """Read a YAML file and return its parsed content.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    # The context manager closes the file handle even if reading fails;
    # a bare open(...).read() leaves the handle for the garbage collector.
    with open(filename) as config_file:
        return yaml.safe_load(config_file.read())


# Load the expert roster and display it as the cell output.
# NOTE: `experts` holds the raw parsed YAML here; a later cell rebinds the same
# name to a list of Expert objects.
experts = load_config('experts.yml')
experts

{'experts': [{'model': 'gpt-3.5-turbo'},
  {'model': 'anthropic/claude-3-haiku'},
  {'model': 'perplexity/llama-3-sonar-small-32k-online'},
  {'model': 'google/palm-2-chat-bison-32k'},
  {'model': 'google/gemma-2-9b-it'}]}

In [64]:
def load_template(filename: str) -> str:
    """Read a prompt template file and return its text unchanged.

    Args:
        filename: Path of the template file to read.

    Returns:
        The full content of the file as a string.
    """
    # The context manager closes the file handle even if reading fails;
    # a bare open(...).read() leaves the handle for the garbage collector.
    with open(filename, 'r') as template_file:
        return template_file.read()


# Load the Jinja2 prompt template and display it as the cell output.
template = load_template('prompt_template.txt')
template

'Can you evaluate how well this example conveys its meaning, how well it is organized and structured, whether it fits the theme of the conversation, and whether its responses are accurate?\nPlease rate text below from 1 (poor) to 5 (excellent), RESPONSE ONLY ONE NUMBER:\n\n{{ context }}\n'

In [65]:
import re

from jinja2 import Template
from openai import OpenAI
from settings import OPENAI_API_KEY, OPENAI_BASE_URL


class Expert:
    client = None

    def __init__(self, model, base_url=None, api_key=None, prompt_template=None):
        self.model = model
        self.base_url = base_url or OPENAI_BASE_URL
        self.api_key = api_key or OPENAI_API_KEY
        self.prompt_template = prompt_template

        if self.client is None:
            self.init_client()

    def init_client(self):
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def rate_element(self, data: str) -> int | None:
        prompt = Template(self.prompt_template).render(context=data)
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0,
            n=1,
            max_tokens=5  # Amount of OUTPUT tokens
        )

        # Extract the response text and parse it as an integer rating from 1 to 5
        response_text = response.choices[0].message.content
        try:
            rating = int(''.join(filter(str.isdigit, response_text)))
        except ValueError:
            print("Invalid rating value in response: {}".format(response_text))
            return None

        # Ensure the rating is within the valid range [1, 5]
        if rating < 1 or rating > 5:
            print("Rating out of range: {}".format(rating))
            return None

        return rating

In [66]:
import json
import concurrent.futures
from datasets import load_dataset
import tiktoken

# Reload the configuration and the prompt template, then build one Expert per
# entry under the 'experts' key; each entry's fields become Expert keyword arguments.
config = load_config('experts.yml')
template = load_template('prompt_template.txt')
experts = [Expert(**expert, prompt_template=template) for expert in config['experts']]
encoding = tiktoken.get_encoding("gpt2")  # Tiktoken encoding used to count tokens per sample


# Plain-function wrapper around Expert.rate_element so it can be passed to executor.map
def process_model(expert: Expert, data: str) -> int | None:
    """Ask one expert to rate one sample.

    Args:
        expert: The expert to query.
        data: The text to rate.

    Returns:
        The value returned by ``expert.rate_element(data)``: an integer
        rating, or None when no valid rating was obtained.
    """
    return expert.rate_element(data)


# Load the first 10 rows of the train split
dataset = load_dataset("IlyaGusev/saiga_scored", split="train[:10]")  # Replace with your desired dataset name

# Serialize each sample's "messages" field to a JSON string (non-ASCII characters
# kept as-is); these strings are what the experts rate
data_to_rate = [json.dumps(sample["messages"], ensure_ascii=False) for sample in dataset]

In [67]:
# Collect the ratings of every sample. The list stays index-aligned with
# data_to_rate: each entry is a {model name: rating or None} dict, or None
# when the sample was skipped for being too long.
ratings_by_experts = []
# NOTE: with max_workers=1 the expert calls for a sample run one after another;
# raise it to query the experts in parallel.
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    for i, data in enumerate(data_to_rate):

        # Count the number of tokens in the sample using Tiktoken
        num_tokens = len(encoding.encode(data))
        # Skip processing for large samples with more than 3000 tokens
        if num_tokens > 3000:
            print("Skipping sample {}: Size exceeds limit ({} tokens)".format(i + 1, num_tokens))
            ratings_by_experts.append(None)
            continue

        print("Processing sample {}: {}".format(i + 1, data))
        # executor.map yields results in the same order as `experts`
        ratings = list(executor.map(process_model, experts, [data] * len(experts)))
        ratings_by_expert = {expert.model: ratings[idx] for idx, expert in enumerate(experts)}
        print("Ratings:", ratings)

        # Save results
        ratings_by_experts.append(ratings_by_expert)

Processing sample 1: [{"content": "Ideas generales de la Seguridad Social (qué es, para qué sirve, qué has oído\ndecir de ella...)", "role": "user"}, {"content": "La Seguridad Social es un sistema integral diseñado para proteger a los ciudadanos frente a diversas situaciones de vulnerabilidad o necesidad, tales como la vejez, el desempleo, la enfermedad, la discapacidad, la muerte (protección a los dependientes del fallecido) y la maternidad. Este sistema se basa en principios de solidaridad y redistribución de recursos, donde los trabajadores, empleadores y, en algunos casos, el Estado, contribuyen financieramente para sostenerlo.\n\nLa Seguridad Social tiene varios propósitos fundamentales:\n\n1. **Protección contra la pérdida de ingresos**: Asegura un ingreso básico a las personas que no pueden trabajar debido a vejez, enfermedad, discapacidad, maternidad o desempleo.\n\n2. **Acceso a servicios de salud**: Ofrece cobertura de salud para el tratamiento de enfermedades y condiciones m

In [68]:
# Aggregate the collected ratings: average, majority check and verdict per sample
quorum = len(experts) // 2 + 1  # Minimum number of agreeing experts for a majority vote

# Calculate the average rating and classify it as "bad" or "good" for each sample
for i, data in enumerate(data_to_rate):

    # Count the number of tokens in the sample using Tiktoken
    num_tokens = len(encoding.encode(data))
    # Skip large samples with more than 3000 tokens
    if num_tokens > 3000:
        print("Sample {}: Skipping due to size exceeds limit ({} tokens)".format(i + 1, num_tokens))
        continue

    # Keep only the ratings that were obtained successfully
    valid_ratings = [
        ratings_by_experts[i][expert.model]
        for expert in experts
        if ratings_by_experts[i][expert.model] is not None
    ]
    # Without this guard the average below divides by zero when every expert failed
    if not valid_ratings:
        print("Sample {}: No valid ratings, skipping".format(i + 1))
        continue

    average_rating = sum(valid_ratings) / len(valid_ratings)
    print("Sample {}: Average rating is {:.2f}".format(i + 1, average_rating))

    # A majority exists when at least `quorum` experts gave the same rating.
    # Comparing against the truncated average misses it: ratings 5, 5, 5, 4
    # average 4.75, which truncates to 4 and matches a single expert only.
    top_votes = max(valid_ratings.count(rating) for rating in set(valid_ratings))
    majority_vote = top_votes >= quorum

    if not majority_vote:
        print("Warning: Majority vote not achieved.")

    # Classify the average rating as "bad" or "good" based on a threshold of 3.5
    if average_rating < 3.5:
        print("Classification: Bad")
    else:
        print("Classification: Good")

Sample 1: Average rating is 4.75
Classification: Good
Sample 2: Average rating is 4.25
Classification: Good
Sample 3: Average rating is 4.00
Classification: Good
Sample 4: Average rating is 4.50
Classification: Good
Sample 5: Skipping due to size exceeds limit (3728 tokens)
Sample 6: Skipping due to size exceeds limit (10593 tokens)
Sample 7: Average rating is 4.67
Classification: Good
Sample 8: Average rating is 4.67
Classification: Good
Sample 9: Skipping due to size exceeds limit (7171 tokens)
Sample 10: Average rating is 4.50
Classification: Good
