NOTE: Some data has been redacted (sensitive information, etc.)

In [None]:
from dotenv import load_dotenv
import os
import openai
import tag_machine_api
import json
import psycopg
from pathlib import Path
import datasets
from collections import defaultdict
from tqdm import tqdm
import yaml
import random
import time
import csv
import gzip
import re
import base64
import magic
from PIL import Image
import io
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
from traceback import format_exc
from typing import TypeVar, Union, Any, Callable
from psycopg.types.json import Jsonb
from collections import Counter
import itertools
from contextlib import contextmanager
from pydantic import BaseModel, ConfigDict, model_validator, Field, computed_field
from psycopg.rows import dict_row, class_row
import requests
import pandas as pd
import matplotlib.pyplot as plt
import math
import dataclasses

In [None]:
# Identifier of the model whose responses are being generated right now.
# It is sent as the `model` argument of the chat-completion request and recorded
# on every sample as the responding model. The commented-out assignments are
# alternative model identifiers.
#CURRENT_RESPONSE_MODEL = "8gjfxjdm"
CURRENT_RESPONSE_MODEL = "5i5xmxdx"
#CURRENT_RESPONSE_MODEL = "kiqxrbng"

In [None]:
# Load variables from a .env file into the environment, then create the shared
# Tag Machine API client using the token in TAG_MACHINE_TOKEN.
load_dotenv()
api = tag_machine_api.TagMachineAPI(os.environ["TAG_MACHINE_TOKEN"], "XXXXX")

@contextmanager
def db_conn(**kwargs):
	"""
	Open a connection to the local Postgres database and yield ``(conn, cur)``.

	The connection targets database ``postgres`` as user ``postgres`` through the
	``pg-socket`` path located in the parent of the current working directory.
	Any extra keyword arguments are forwarded to ``psycopg.connect``
	(for example ``row_factory``).

	This helper never commits: callers that write must call ``conn.commit()``
	themselves before leaving the ``with`` block. The cursor and the connection
	are always closed on exit, and exceptions raised inside the ``with`` body
	propagate unchanged.
	"""
	conn = psycopg.connect(dbname='postgres', user='postgres', host=str(Path.cwd().parent / 'pg-socket'), **kwargs)
	try:
		with conn.cursor() as cur:
			yield conn, cur
	finally:
		# Close unconditionally so a failing body does not leak the connection.
		conn.close()

In [None]:
class EvalSample(BaseModel):
	"""
	One pairwise-preference evaluation sample (image + prompt + two responses + judge verdict).

	Instances are frozen; every "mutating" helper returns a modified copy via
	``model_copy``. Rows are stored in the ``alignment_preferences_ai`` table, where
	the system/user prompt pair is kept in a single ``messages`` JSON column.
	"""
	model_config = ConfigDict(frozen=True, revalidate_instances="always", extra='forbid', strict=True, validate_by_alias=True, validate_by_name=True)
	# Image hash; can be populated from the `image_hash` key (the database column name) through the validation alias.
	filehash: bytes = Field(validation_alias="image_hash")
	# System prompt and user question (stored together as `messages` in the database).
	system: str
	question: str
	question_type: str
	task_type: str
	# Database row id; None until the sample has been inserted.
	id: int | None = None
	ground_truth_knowledge: str | None = None
	# The two candidate responses and the models that produced them.
	response_a: str | None = None
	response_b: str | None = None
	response_a_model: str | None = None
	response_b_model: str | None = None
	# Judge prompt, judge model and judge output.
	judge_system: str | None = None
	judge_user: str | None = None
	judge_model: str | None = None
	judge_reasoning: str | None = None
	judge_winner: str | None = None
	judge_response_a_score: float | None = None
	judge_response_b_score: float | None = None
	# base64 data URL of the image; filled in by get_image() and never written to the database by this class.
	image_dataurl: str | None = None
	# Unix timestamp in seconds (set from time.time() on insert when missing).
	created_at: int | None = None
	update_sequence: int | None = None
	in_progress: int | None = None

	@computed_field
	@property
	def is_done(self) -> bool:
		"""True once a judge winner has been recorded."""
		return self.judge_winner is not None

	def get_image(self) -> "EvalSample":
		"""Read the image through the API and return a copy with ``image_dataurl`` set to a base64 data URL."""
		image_data = api.read_image(self.filehash)
		# Detect the MIME type from the bytes themselves.
		image_mime = magic.from_buffer(image_data, mime=True)
		image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"
		return self.model_copy(update={"image_dataurl": image_dataurl})
	
	@model_validator(mode="before")
	@classmethod
	def unpack_messages(cls, data: Any) -> Any:
		"""
		Pre-validation hook that expands a ``messages`` entry into ``system`` and ``question``.

		Existing instances are returned untouched. Otherwise the input is copied into a
		dict; when it carries a non-None ``messages`` value, that value must be exactly a
		system message followed by a user message, and the ``messages`` key is removed.
		"""
		if isinstance(data, cls):
			return data
		
		d: dict[str, Any] = dict(data)

		if 'messages' in d and d['messages'] is not None:
			msgs: list[dict[str, str]] = d['messages']

			assert len(msgs) == 2,  f"Expected 2 messages, got {len(msgs)}"
			assert msgs[0]['role'] == 'system', f"Expected system role, got {msgs[0]['role']}"
			assert msgs[1]['role'] == 'user',   f"Expected user role, got {msgs[1]['role']}"

			d['system']   = msgs[0]['content']
			d['question'] = msgs[1]['content']
			# The model forbids extra fields, so the packed form must not remain.
			d.pop('messages', None)

		return d
	
	def add_to_database(self, cur: psycopg.Cursor) -> "EvalSample":
		"""
		Insert this sample as a new row and return a copy carrying the new ``id`` and ``created_at``.

		The sample must not have an id yet. ``created_at`` defaults to the current time.
		The statement is executed on ``cur``; this method does not commit.
		"""
		assert self.id is None, "Sample already has an ID"

		created_at = int(time.time()) if self.created_at is None else self.created_at

		# The parameter tuple below must stay in the same order as the column list.
		cur.execute("""
			INSERT INTO alignment_preferences_ai (
				image_hash,
				messages,
				question_type,
				task_type,
				ground_truth_knowledge,
				response_a,
				response_b,
				response_a_model,
				response_b_model,
				judge_system,
				judge_user,
				judge_model,
				judge_reasoning,
				judge_winner,
			  	judge_response_a_score,
			  	judge_response_b_score,
				created_at,
				in_progress
			) VALUES (
				%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
			) RETURNING id
		""", (
			self.filehash,
			Jsonb([
				{"role": "system", "content": self.system},
				{"role": "user",   "content": self.question}
			]),
			self.question_type,
			self.task_type,
			self.ground_truth_knowledge,
			self.response_a,
			self.response_b,
			self.response_a_model,
			self.response_b_model,
			self.judge_system,
			self.judge_user,
			self.judge_model,
			self.judge_reasoning,
			self.judge_winner,
		  	self.judge_response_a_score,
		  	self.judge_response_b_score,
			created_at,
			self.in_progress
		))

		row = cur.fetchone()
		assert row is not None, "Failed to insert sample into database"

		return self.model_copy(update={"id": row[0], "created_at": created_at})
	
	def update_in_database(self, cur: psycopg.Cursor):
		"""
		Write the mutable columns of this sample back to its existing row.

		The stored image hash, messages, question type and task type must match this
		sample exactly; they are checked first and never updated. The statement is
		executed on ``cur``; this method does not commit.
		"""
		assert self.id is not None, "Sample does not have an ID"

		# Guard against overwriting a row that belongs to a different sample.
		cur.execute("SELECT image_hash, messages, question_type, task_type FROM alignment_preferences_ai WHERE id = %s", (self.id,))
		row = cur.fetchone()
		assert row is not None, "Failed to find sample in database"
		assert row[0] == self.filehash, "Filehash does not match"
		assert row[1] == [{"role": "system", "content": self.system}, {"role": "user", "content": self.question}], "Messages do not match"
		assert row[2] == self.question_type, "Question type does not match"
		assert row[3] == self.task_type, "Task type does not match"

		# The parameter tuple below must stay in the same order as the SET list, with the id last.
		cur.execute("""
			UPDATE alignment_preferences_ai
			SET
				ground_truth_knowledge = %s,
				response_a = %s,
				response_b = %s,
				response_a_model = %s,
				response_b_model = %s,
				judge_system = %s,
				judge_user = %s,
				judge_model = %s,
				judge_reasoning = %s,
				judge_winner = %s,
			  	judge_response_a_score = %s,
			  	judge_response_b_score = %s,
				in_progress = %s
			WHERE id = %s
		""", (
			self.ground_truth_knowledge,
			self.response_a,
			self.response_b,
			self.response_a_model,
			self.response_b_model,
			self.judge_system,
			self.judge_user,
			self.judge_model,
			self.judge_reasoning,
			self.judge_winner,
		  	self.judge_response_a_score,
		  	self.judge_response_b_score,
			self.in_progress,
		  	self.id
		))
	
	@staticmethod
	def get_from_database(conn: psycopg.Connection, id: int) -> "EvalSample | None":
		"""Load the sample with the given row id, or return None when no such row exists."""
		# class_row builds the model directly from the row's columns (messages is unpacked by the validator).
		with conn.cursor(row_factory=class_row(EvalSample)) as cur:
			cur.execute("SELECT * FROM alignment_preferences_ai WHERE id = %s", (id,))
			row = cur.fetchone()
			if row is None:
				return None
			
			return row

class EvalSampleMulti(BaseModel):
	"""
	One multi-response ranking sample (image + prompt + N responses + judge ranking).

	Instances are frozen; every "mutating" helper returns a modified copy via
	``model_copy``. Rows are stored in the ``alignment_rankings_ai`` table, where the
	system/user prompt pair is kept in a single ``messages`` JSON column and
	``responses`` / ``judge_rankings`` are JSON columns.
	"""
	model_config = ConfigDict(frozen=True, revalidate_instances="always", extra='forbid', strict=True, validate_by_alias=True, validate_by_name=True)
	# Image hash; can be populated from the `image_hash` key (the database column name) through the validation alias.
	filehash: bytes = Field(validation_alias="image_hash")
	# System prompt and user question (stored together as `messages` in the database).
	system: str
	question: str
	question_type: str
	ground_truth_knowledge: str | None = None
	# Candidate responses and the single model that produced all of them.
	responses: list[str] | None = None
	responses_model: str | None = None
	# Judge prompt, judge model and judge output.
	judge_system: str | None = None
	judge_user: str | None = None
	judge_model: str | None = None
	judge_reasoning: str | None = None
	judge_rankings: list[int] | None = None
	# base64 data URL of the image; filled in by get_image() and never written to the database by this class.
	image_dataurl: str | None = None
	# Database row id; None until the sample has been inserted.
	id: int | None = None
	# Unix timestamp in seconds (set from time.time() on insert when missing).
	created_at: int | None = None
	update_sequence: int | None = None
	in_progress: int | None = None

	@computed_field
	@property
	def is_done(self) -> bool:
		"""True once judge rankings have been recorded."""
		return self.judge_rankings is not None

	def get_image(self) -> "EvalSampleMulti":
		"""Read the image through the API and return a copy with ``image_dataurl`` set to a base64 data URL."""
		image_data = api.read_image(self.filehash)
		# Detect the MIME type from the bytes themselves.
		image_mime = magic.from_buffer(image_data, mime=True)
		image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"
		return self.model_copy(update={"image_dataurl": image_dataurl})

	@model_validator(mode="before")
	@classmethod
	def unpack_messages(cls, data: Any) -> Any:
		"""
		Pre-validation hook that expands a ``messages`` entry into ``system`` and ``question``.

		Existing instances are returned untouched. Otherwise the input is copied into a
		dict; when it carries a non-None ``messages`` value, that value must be exactly a
		system message followed by a user message, and the ``messages`` key is removed.
		"""
		if isinstance(data, cls):
			return data

		d: dict[str, Any] = dict(data)

		if 'messages' in d and d['messages'] is not None:
			msgs: list[dict[str, str]] = d['messages']

			assert len(msgs) == 2,  f"Expected 2 messages, got {len(msgs)}"
			assert msgs[0]['role'] == 'system', f"Expected system role, got {msgs[0]['role']}"
			assert msgs[1]['role'] == 'user',   f"Expected user role, got {msgs[1]['role']}"

			d['system']   = msgs[0]['content']
			d['question'] = msgs[1]['content']
			# The model forbids extra fields, so the packed form must not remain.
			d.pop('messages', None)

		return d

	def add_to_database(self, cur: psycopg.Cursor) -> "EvalSampleMulti":
		"""
		Insert this sample as a new row and return a copy carrying the new ``id`` and ``created_at``.

		The sample must not have an id yet. ``created_at`` defaults to the current time.
		Missing ``responses`` / ``judge_rankings`` are stored as SQL NULL, matching
		``update_in_database``. The statement is executed on ``cur``; this method does
		not commit.
		"""
		assert self.id is None, "Sample already has an ID"

		created_at = int(time.time()) if self.created_at is None else self.created_at

		# The parameter tuple below must stay in the same order as the column list.
		cur.execute("""
			INSERT INTO alignment_rankings_ai (
				image_hash,
				messages,
				question_type,
				ground_truth_knowledge,
				responses,
				responses_model,
				judge_system,
				judge_user,
				judge_model,
				judge_reasoning,
				judge_rankings,
				created_at,
				in_progress
			) VALUES (
				%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
			) RETURNING id
		""", (
			self.filehash,
			Jsonb([
				{"role": "system", "content": self.system},
				{"role": "user",   "content": self.question}
			]),
			self.question_type,
			self.ground_truth_knowledge,
			# Wrapping None in Jsonb would store the JSON value `null` rather than SQL NULL.
			Jsonb(self.responses) if self.responses is not None else None,
			self.responses_model,
			self.judge_system,
			self.judge_user,
			self.judge_model,
			self.judge_reasoning,
			Jsonb(self.judge_rankings) if self.judge_rankings is not None else None,
			created_at,
			self.in_progress
		))

		row = cur.fetchone()
		assert row is not None, "Failed to insert sample into database"

		return self.model_copy(update={"id": row[0], "created_at": created_at})

	def update_in_database(self, cur: psycopg.Cursor):
		"""
		Write the mutable columns of this sample back to its existing row.

		The stored image hash, messages and question type must match this sample
		exactly; they are checked first and never updated. The statement is executed on
		``cur``; this method does not commit.
		"""
		assert self.id is not None, "Sample does not have an ID"

		# Guard against overwriting a row that belongs to a different sample.
		cur.execute("SELECT image_hash, messages, question_type FROM alignment_rankings_ai WHERE id = %s", (self.id,))
		row = cur.fetchone()
		assert row is not None, "Failed to find sample in database"
		assert row[0] == self.filehash, "Filehash does not match"
		assert row[1] == [{"role": "system", "content": self.system}, {"role": "user", "content": self.question}], "Messages do not match"
		assert row[2] == self.question_type, "Question type does not match"

		# The parameter tuple below must stay in the same order as the SET list, with the id last.
		cur.execute("""
			UPDATE alignment_rankings_ai
			SET
				ground_truth_knowledge = %s,
				responses = %s,
				responses_model = %s,
				judge_system = %s,
				judge_user = %s,
				judge_model = %s,
				judge_reasoning = %s,
				judge_rankings = %s,
				in_progress = %s
			WHERE id = %s
		""", (
			self.ground_truth_knowledge,
			Jsonb(self.responses) if self.responses is not None else None,
			self.responses_model,
			self.judge_system,
			self.judge_user,
			self.judge_model,
			self.judge_reasoning,
			Jsonb(self.judge_rankings) if self.judge_rankings is not None else None,
			self.in_progress,
			self.id
		))

	@staticmethod
	def get_from_database(conn: psycopg.Connection, id: int) -> "EvalSampleMulti | None":
		"""Load the sample with the given row id, or return None when no such row exists."""
		# class_row builds the model directly from the row's columns (messages is unpacked by the validator).
		with conn.cursor(row_factory=class_row(EvalSampleMulti)) as cur:
			cur.execute("SELECT * FROM alignment_rankings_ai WHERE id = %s", (id,))
			return cur.fetchone()


def list_db_preferences() -> list[EvalSample]:
	"""Return every row of ``alignment_preferences_ai`` as an ``EvalSample``."""
	query = "SELECT * FROM alignment_preferences_ai"
	with db_conn(row_factory=class_row(EvalSample)) as (_conn, cursor):
		cursor.execute(query)
		samples = cursor.fetchall()
	return samples # type: ignore


def list_db_rankings() -> list[EvalSampleMulti]:
	"""Return every row of ``alignment_rankings_ai`` as an ``EvalSampleMulti``."""
	query = "SELECT * FROM alignment_rankings_ai"
	with db_conn(row_factory=class_row(EvalSampleMulti)) as (_conn, cursor):
		cursor.execute(query)
		samples = cursor.fetchall()
	return samples # type: ignore

In [None]:
# Danbooru
# Tag id -> tag name for every tag the API reports.
API_TAG_ID_TO_NAME = {tag.id: tag.name for tag in api.fetch_tags()}
# Danbooru tag metadata keyed by tag name, read from a JSON-lines dump (one JSON object per line).
DANBOORU_TAG_METADATA = {}

for line in Path("XXXXX/tags000000000000.json").read_text().splitlines():
	data = json.loads(line)
	DANBOORU_TAG_METADATA[data["name"]] = data

# Numeric Danbooru category code -> category name.
DANBOORU_TAG_CATEGORY_TO_STR = {
	0: "general",
	1: "artist",
	3: "copyright",
	4: "character",
	5: "meta",
}

# e621
# e621 tag metadata keyed by tag name, read from a gzipped CSV export (each value is the CSV row as a dict of strings).
p = Path("XXXXX/tags-2023-07-16.csv.gz")
E621_TAG_METADATA = {}

with gzip.open(p, "rt") as f:
	reader = csv.DictReader(f)
	for row in reader:
		E621_TAG_METADATA[row["name"]] = row

# Numeric e621 category code -> category name.
E621_TAG_CATEGORY_TO_STR = {
	0: "general",
	1: "artist",
	3: "copyright",
	4: "character",
	5: "species",
	6: "invalid",
	7: "meta",
	8: "lore",
}

# Rule34
# Tag name -> category name, loaded from JSON and then normalized:
# the ", ambiguous" qualifier is dropped and "metadata" is shortened to "meta".
RULE34_TAG_TO_CATEGORY = json.loads(Path("XXXXX/rule34_tag_to_category.json").read_text())
RULE34_TAG_TO_CATEGORY = {k: v.replace(", ambiguous", "").replace("metadata", "meta") for k, v in RULE34_TAG_TO_CATEGORY.items()}


def _split_tag_string(raw: str) -> list[str]:
	"""Split a space-separated tag string, dropping empty or whitespace-only entries."""
	return [tag.strip() for tag in raw.split(" ") if tag.strip() != ""]


def _format_categorized_tags(categorized: list[tuple[str, str]], prefixed_categories: list[str]) -> str | None:
	"""
	Render ``(tag, category)`` pairs as one comma-separated string.

	Tags of each category in ``prefixed_categories`` come first, in that category
	order and alphabetically within a category, written as ``category:tag``.
	"general" tags follow, alphabetically and without a prefix. Tags of any other
	category are left out. Returns None when nothing is rendered.
	"""
	by_category: defaultdict[str, list[str]] = defaultdict(list)
	for tag, category in categorized:
		by_category[category].append(tag)

	rendered = [f"{category}:{tag}" for category in prefixed_categories for tag in sorted(by_category[category])]
	rendered.extend(sorted(by_category["general"]))

	return ", ".join(rendered) if rendered else None


def image_to_tags(filehash: bytes) -> dict[str, str]:
	"""
	Build the formatted tag strings known for an image, keyed by booru site.

	Returns a dict with up to three keys (``danbooru``, ``e621``, ``rule34``); a key is
	only present when at least one tag is rendered for that site.

	- danbooru: tags attached to the image in the API; tags without Danbooru metadata
	  are treated as "general".
	- e621: the first key of the ``e621_tags`` attribute, split on spaces; tags without
	  e621 metadata are treated as "general".
	- rule34: the first key of the ``rule34_tags`` attribute, split on spaces; tags
	  without a known category are treated as "invalid" and therefore not rendered.
	"""
	image_metadata = api.get_image_metadata(filehash)
	tag_strings: dict[str, str] = {}

	# Danbooru
	danbooru_pairs = []
	for tag_id in image_metadata.tags.keys():
		tag = API_TAG_ID_TO_NAME[int(tag_id)]
		if tag in DANBOORU_TAG_METADATA:
			category = DANBOORU_TAG_CATEGORY_TO_STR[int(DANBOORU_TAG_METADATA[tag]['category'])]
		else:
			category = "general"
		danbooru_pairs.append((tag, category))

	formatted = _format_categorized_tags(danbooru_pairs, ["artist", "copyright", "character", "meta"])
	if formatted is not None:
		tag_strings['danbooru'] = formatted

	# E621
	if 'e621_tags' in image_metadata.attributes:
		raw_e621 = next(iter(image_metadata.attributes['e621_tags'].keys()), "")

		e621_pairs = []
		# Empty entries (empty attribute, doubled spaces) are dropped, as in the Rule34 branch,
		# so they can no longer surface as a blank "general" tag.
		for tag in _split_tag_string(raw_e621):
			if tag in E621_TAG_METADATA:
				category = E621_TAG_CATEGORY_TO_STR[int(E621_TAG_METADATA[tag]['category'])]
			else:
				category = "general"
			e621_pairs.append((tag, category))

		formatted = _format_categorized_tags(e621_pairs, ["artist", "copyright", "character", "species", "meta", "lore"])
		if formatted is not None:
			tag_strings['e621'] = formatted

	# Rule34
	if 'rule34_tags' in image_metadata.attributes:
		raw_r34 = next(iter(image_metadata.attributes['rule34_tags'].keys()), "")

		# Unknown tags get the "invalid" category, which is never rendered.
		r34_pairs = [(tag, RULE34_TAG_TO_CATEGORY.get(tag, "invalid")) for tag in _split_tag_string(raw_r34)]

		formatted = _format_categorized_tags(r34_pairs, ["artist", "copyright", "character", "species", "meta", "lore"])
		if formatted is not None:
			tag_strings['rule34'] = formatted

	return tag_strings

In [None]:
# Desired image count per image source (used later to cap how many images are taken from each source).
# NOTE(review): the source names are redacted in this copy, so all keys read "XXXXX";
# as written the literal collapses to one entry — presumably the real file has distinct names.
DESIRED_SOURCES = {
	"XXXXX": 2048*4,
	"XXXXX": 128*4,
	"XXXXX": 768*4,
	"XXXXX": 768*4,
	"XXXXX": 512*4,
	"XXXXX": 512*4,
	"XXXXX": 128*4,
	"XXXXX": 128*4,
	"XXXXX": 2048*4,
	"XXXXX": 768*4,
	"XXXXX": 128*4,
	"XXXXX": 512*4,
	"XXXXX": 512*4,
}


# Source name -> set of image hashes (as bytes) that the API lists for that source.
API_IMAGES_BY_SOURCE = {}

for source in tqdm(DESIRED_SOURCES.keys()):
	results = api.search(f"source='{source}'", ["hash"])
	# Every returned hash is expected to be exactly 32 items long.
	assert all(len(filehash) == 32 for filehash in results)
	API_IMAGES_BY_SOURCE[source] = set(filehash.tobytes() for filehash in results)

In [None]:
# Load our training dataset
source_ds = datasets.load_dataset("fancyfeast/joy-captioning-20250408a")
assert isinstance(source_ds, datasets.DatasetDict)
# Every filehash that appears in any split of the dataset.
TRAINED_FILEHASHES = set()

for split in source_ds.keys():
	TRAINED_FILEHASHES.update(source_ds[split]["filehash"])

# Examples of the 'train' split grouped by their question_type.
TRAINED_BY_TYPE = defaultdict(list)
for example in tqdm(source_ds['train']):
	question_type = example["question_type"]
	TRAINED_BY_TYPE[question_type].append(example)

In [None]:
# Load prompt mappings
PROMPT_MAPPINGS = yaml.safe_load(Path("XXXXX/prompt-mappings.yaml").read_text())
# One prompt extension per non-blank line, surrounding whitespace removed.
PROMPT_EXTENSIONS = [line.strip() for line in Path("XXXXX/EXTENSIONS.txt").read_text().splitlines() if line.strip()]

# Gather a list of API images we can use
API_IMAGES_TO_USE = []

for source, desired_count in DESIRED_SOURCES.items():
	# Exclude any images that the model was either trained, tested, or validated on
	api_images = API_IMAGES_BY_SOURCE[source] - TRAINED_FILEHASHES

	# Sort by hash for determinism
	api_images = sorted(list(api_images))

	# Truncate to at most twice the desired count for this source
	api_images = api_images[:desired_count*2]

	API_IMAGES_TO_USE.extend(api_images)

print(f"# API images to use: {len(API_IMAGES_TO_USE)}")

In [None]:
# Parsed contents of names.json; its structure and its consumers are not visible in this part of the notebook.
NAMES = json.loads(Path("XXXXX/names.json").read_text())

In [None]:
# Build our ground truth answers based on our VQA database
# VQA with a vqa_category is human verified
results = api.search("EXISTS(vqa_category) AND EXISTS(questionAnswer)", ["hash", "attributes"])
assert isinstance(results, list)
# (image hash, question text) -> ground truth answer text.
VQA_GROUND_TRUTH_ANSWERS = {}

for result in tqdm(results):
	assert isinstance(result, tag_machine_api.SearchResultImage) and result.attributes is not None and result.hash is not None
	# Each attribute is a mapping expected to hold exactly one key; the single-target unpacking fails otherwise.
	questionAnswer, = result.attributes['questionAnswer'].keys()
	# The key itself is a JSON document with "question" and "answer" fields.
	questionAnswer = json.loads(questionAnswer)
	question = questionAnswer['question'].strip()
	answer = questionAnswer['answer'].strip()
	vqa_category, = result.attributes['vqa_category'].keys()

	# Handle system message being in the question
	# (the <system>...</system> span is cut out; a missing closing tag makes index() raise ValueError)
	if "<system>" in question:
		i = question.index("<system>")
		j = question.index("</system>")
		question = question[:i] + question[j + len("</system>"):]
	
	question = question.strip()

	# Skip entries with a blank category, question or answer
	if vqa_category.strip() == "" or question == "" or answer == "":
		continue

	VQA_GROUND_TRUTH_ANSWERS[(result.hash, question)] = answer

# Add the AllenAI PixMo "ask model anything" question/answer pairs as further ground truth
results = api.search("EXISTS(allenai_pixmo_ask_model_anything_question) AND EXISTS(allenai_pixmo_ask_model_anything_answer)", ["hash", "attributes"])
assert isinstance(results, list)

for result in tqdm(results):
	assert isinstance(result, tag_machine_api.SearchResultImage) and result.attributes is not None and result.hash is not None

	question, = result.attributes['allenai_pixmo_ask_model_anything_question'].keys()
	answer, = result.attributes['allenai_pixmo_ask_model_anything_answer'].keys()

	question = question.strip()
	answer = answer.strip()

	if question == "" or answer == "":
		continue

	# Answers collected in the first pass take precedence
	if (result.hash, question) in VQA_GROUND_TRUTH_ANSWERS:
		continue

	VQA_GROUND_TRUTH_ANSWERS[(result.hash, question)] = answer

print(f"# VQA ground truth answers: {len(VQA_GROUND_TRUTH_ANSWERS)}")

In [None]:
def build_ground_truth_knowledge(filehash: bytes, question: str) -> str | None:
	"""
	Assemble the reference text a judge can use to check responses for an image/question pair.

	Up to two sections are produced, each followed by an explanatory paragraph:
	the image's known tags (from ``image_to_tags``) and the human-written answer for
	this exact (filehash, question) pair, if one exists. Returns the combined text
	with surrounding whitespace removed, or None when neither section applies.
	"""
	sections: list[str] = []

	# Section 1: tags known for the image, one line per booru site
	tag_lines = [f"**{source} tags**: {tag_string}" for source, tag_string in image_to_tags(filehash).items()]
	tag_block = "\n".join(tag_lines).strip()
	if tag_block:
		sections.append(f"<ground_truth_tags>{tag_block}</ground_truth_tags>\n")
		sections.append("You are provided with the tags associated with the user's image between <ground_truth_tags> and </ground_truth_tags> above. You can use these ground truth tags to assist in assessing the accuracy of the responses. You should assume that the ground truth tags are correct but not exhaustive (they may not cover all applicable tags). Note also that the tagging systems of the different booru sites are different and may need to be translated if, for example, the ground truth tags are from danbooru but the user requested e621 tags.\n\n")

	# Section 2: human-written answer for this exact question
	human_answer = VQA_GROUND_TRUTH_ANSWERS.get((filehash, question))
	if human_answer is not None:
		sections.append(f"<ground_truth_knowledge>{human_answer.strip()}</ground_truth_knowledge>\n")
		sections.append("You are provided with a \"ground truth reference\" between <ground_truth_knowledge> and </ground_truth_knowledge> above. This is knowledge or an answer written by a human reviewer to assist in assessing the accuracy of responses.\n\n")

	combined = "".join(sections).strip()
	if combined == "":
		return None
	return combined

In [None]:
# Pool of system prompts; one is picked at random for examples that do not carry their own system message.
system_prompts = [
	"You are a helpful assistant",
	"You are JoyCaption, a helpful AI assistant with vision capabilities.",
	"You are a helpful AI assistant with vision capabilities",
	"You are a reliable and knowledgeable AI assistant. Provide clear, detailed, and helpful responses.",
	"You are an AI assistant with advanced vision capabilities. Interpret visual information and provide clear explanations.",
	"You are a friendly AI assistant here to help. Use your vision and language skills to support users with thorough answers.",
	"You are an open-minded AI assistant called JoyCaption with powerful vision analysis. Your goal is to assist users by interpreting visual content and providing helpful information.",
	"You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
]

In [None]:
# Build our list of possible VQA examples
# Most of these will be in the training dataset, but it's all we have at the moment.
# We filter by the existence of the vqa_category, since only VQAs with a category are human verified.
results = api.search("EXISTS(vqa_category) AND EXISTS(questionAnswer)", ["hash", "attributes"])
assert isinstance(results, list)
vqa_examples = []

for result in results:
	assert isinstance(result, tag_machine_api.SearchResultImage) and result.attributes is not None and result.hash is not None
	# Each attribute is a mapping expected to hold exactly one key; the single-target unpacking fails otherwise.
	vqa_category, = result.attributes['vqa_category'].keys()
	questionAnswer, = result.attributes['questionAnswer'].keys()
	# The key itself is a JSON document with "question" and "answer" fields.
	questionAnswer = json.loads(questionAnswer)
	question = questionAnswer['question'].strip()
	answer = questionAnswer['answer'].strip()

	# Handle system message being in the question:
	# an embedded <system>...</system> span becomes the system message and is cut out of the question;
	# otherwise a system prompt is picked at random.
	if "<system>" in question:
		i = question.index("<system>")
		j = question.index("</system>")
		system_message = question[i + len("<system>"):j]
		question = question[:i] + question[j + len("</system>"):]
	else:
		system_message = random.choice(system_prompts)
	
	question = question.strip()

	# Filter out any VQA examples that are empty or have a benchmark category
	if vqa_category.strip() == "" or question == "" or answer == "" or 'benchmark' in vqa_category.lower():
		continue

	if 'bra' in question.lower() and result.hash.hex()[0] != '0':
		# Filter the bra related questions, so they don't overload the dataset.
		# Keeping only hashes whose first hex digit is '0' retains roughly 1/16th of them, deterministically.
		# NOTE(review): this is a substring test, so it also matches any question whose words merely
		# contain "bra" — confirm that is intended.
		continue

	vqa_examples.append(EvalSample(
		filehash=result.hash,
		system=system_message,
		question=question,
		question_type='questionAnswer',
		task_type='overall',
	))

print(f"# VQA examples: {len(vqa_examples)}")

# Inject some AllenAI vqa data
results = api.search("source='allenai_pixmo_ask_model_anything' AND exists(allenai_pixmo_ask_model_anything_question)", ["hash", "attributes"])
assert isinstance(results, list)
allenai_examples = []

for result in tqdm(results):
	assert isinstance(result, tag_machine_api.SearchResultImage) and result.attributes is not None and result.hash is not None
	if 'vqa_category' in result.attributes:
		# Should already be in our dataset, so skip
		continue

	question, = result.attributes['allenai_pixmo_ask_model_anything_question'].keys()
	answer, = result.attributes['allenai_pixmo_ask_model_anything_answer'].keys()

	question = question.strip()
	answer = answer.strip()
	if question == "" or answer == "":
		continue

	allenai_examples.append(EvalSample(
		filehash=result.hash,
		system=random.choice(system_prompts),
		question=question,
		question_type='questionAnswer',
		task_type='overall',
	))

# Sort by hash so the truncation below always keeps the same images, then cap at 3500 examples
allenai_examples.sort(key=lambda e: e.filehash)
vqa_examples.extend(allenai_examples[:3500])

print(f"Extended VQA to {len(vqa_examples)} examples")

In [None]:
def _longest_internal_repeat(s: str) -> int:
	"""
	Crude check for “aaa… / ab_ab_ab…” patterns inside ONE very-long token.

	For each candidate period (1 up to a quarter of the token length, smallest
	first) the token's own prefix of that length is repeated as many whole times as
	fit into the token; if that repeated string occurs in the token, its length is
	returned. Returns 0 when no candidate period matches.
	"""
	total = len(s)
	largest_period = total // 4                   # need 4× to be interesting
	covered_lengths = (
		period * (total // period)
		for period in range(1, largest_period + 1)
		if s[:period] * (total // period) in s
	)
	return next(covered_lengths, 0)


def is_repper(text: str) -> bool:
	"""
	Conservative detector for useless repetition / tag-spam.

	Returns True only when repetition is very likely harmful.
	Five lightweight heuristics are combined; thresholds are tuned
	to avoid false-positives on ordinary prose or short enumerations.

	Texts with fewer than 30 word tokens are never flagged.
	"""

	# Word tokens: lower-cased runs of word characters (punctuation, including ':', is a separator)
	tokens = [t.lower() for t in re.split(r"\W+", text) if t]
	n = len(tokens)
	if n < 30:                                     # tiny blocks → never flag
		return False

	# --------------------------------------------------- 1
	# ≥10 positions where a token equals the token right before it
	if sum(tokens[i] == tokens[i - 1] for i in range(1, n)) >= 10:
		return True

	# --------------------------------------------------- 2
	# Heavy duplicate ratio on “content” words (len ≥4)
	content = [t for t in tokens if len(t) >= 4]
	if len(content) >= 50:
		dup_count  = len(content) - len(set(content))
		dup_ratio  = dup_count / len(content)
		if dup_ratio > 0.55 and dup_count >= 100:      # both conditions
			return True

	# --------------------------------------------------- 3
	# Repeated 3-gram loops (“water ocean waves …”)
	tris = [' '.join(tokens[i:i + 3]) for i in range(n - 2)]
	if tris:
		_, freq = Counter(tris).most_common(1)[0]
		if freq >= 10 and freq / len(tris) > 0.20:
			return True

	# --------------------------------------------------- 4
	# Colon-style tag duplication  (e.g.   meta:icon_set … )
	# These must be taken from the raw text: the word tokens above are split on
	# non-word characters, so none of them can ever contain a ':'.
	colon_tags = re.findall(r"\w+:\w+", text.lower())
	if len(colon_tags) >= 30:
		tag_dup = len(colon_tags) - len(set(colon_tags))
		if tag_dup >= 20 and tag_dup / len(colon_tags) > 0.30:
			return True

	# --------------------------------------------------- 5A
	#   Common-prefix flood  (tokens differ only by added suffix words)
	if n >= 50:
		for k in (15, 20, 25):                       # test several slice sizes
			counts = Counter(t[:k] for t in tokens)
			top_slice, freq = counts.most_common(1)[0]
			uniq_with_slice = {t for t in tokens if t.startswith(top_slice)}
			if freq / n > 0.18 and len(uniq_with_slice) >= 15:
				return True

	# --------------------------------------------------- 5B
	#   Internal repetition inside ONE very long token
	for tok in tokens:
		if len(tok) > 80 and _longest_internal_repeat(tok) / len(tok) > 0.65:
			return True

	# --------------------------------------------------- 5C
	#   Numeric-suffix families (“foo_1 … foo_40”)
	num_suffix = re.compile(r'^(.+?)_(\d{1,3})$')
	buckets: dict[str, set[int]] = {}
	for t in tokens:
		m = num_suffix.match(t)
		if m:
			buckets.setdefault(m.group(1), set()).add(int(m.group(2)))
	if any(len(nums) >= 15 for nums in buckets.values()):
		return True

	return False


def generate_responses(example: EvalSample) -> EvalSample:
	"""
	Sample two distinct responses for ``example`` from the current response model.

	Up to 10 pairs are generated; the first pair whose texts differ is stored on a
	copy of the example (both attributed to ``CURRENT_RESPONSE_MODEL``).
	Raises RuntimeError when every pair came back identical.
	"""
	attempts_left = 10
	while attempts_left > 0:
		attempts_left -= 1

		first = generate_response(example)
		second = generate_response(example)
		if first == second:
			continue

		changes = {
			"response_a": first,
			"response_b": second,
			"response_a_model": CURRENT_RESPONSE_MODEL,
			"response_b_model": CURRENT_RESPONSE_MODEL,
		}
		return example.model_copy(update=changes)

	raise RuntimeError("Failed to generate two different responses after 10 attempts")


def generate_responses_multi(example: EvalSampleMulti, n: int) -> EvalSampleMulti:
	"""
	Collect ``n`` distinct responses for ``example`` from the current response model.

	At most ``n * 3`` responses are generated. Duplicates collapse because the
	responses are gathered in a set. Returns a copy of the example with
	``responses`` and ``responses_model`` filled in, or raises RuntimeError when
	``n`` distinct responses were not reached.
	"""
	max_attempts = n * 3
	collected = set()

	for attempt in range(max_attempts):
		candidate = generate_response(example)

		# Try to ensure at least one response isn't a repper:
		# once more than half the slots are filled purely with reppers, further
		# reppers are thrown away (except on the very last attempt).
		is_last_attempt = attempt >= max_attempts - 1
		over_half_full = len(collected) > (n // 2)
		if not is_last_attempt and over_half_full and is_repper(candidate) and all(map(is_repper, collected)):
			continue

		collected.add(candidate)

		if len(collected) == n:
			changes = {"responses": list(collected), "responses_model": CURRENT_RESPONSE_MODEL}
			return example.model_copy(update=changes)

	raise RuntimeError("Failed to generate enough different responses")


def generate_response(example: EvalSample | EvalSampleMulti, logprobs: bool = False) -> str | tuple[str, Any]:
	"""
	Ask the current response model for one answer to the example's question about its image.

	The request goes to the OpenAI-compatible server at ``localhost:5052`` with the
	example's system prompt, its question (stripped) and its image data URL. The
	sampling temperature is drawn uniformly from [0.4, 1.0] for every call.

	Args:
		example: sample to answer; ``image_dataurl`` must already be populated.
		logprobs: when True, token log-probabilities are requested and returned too.

	Returns:
		The stripped response text, or ``(text, logprobs_content)`` when ``logprobs`` is True.

	Raises:
		AssertionError: if the image data URL is missing, or the reply does not hold
			exactly one choice with non-None content.
	"""
	assert example.image_dataurl is not None, f"Missing image data URL for {example.filehash}"
	client = openai.Client(base_url="http://localhost:5052/v1", api_key="token-abc123")
	temperature = random.uniform(0.4, 1.0)

	try:
		response = client.chat.completions.create(
			model=CURRENT_RESPONSE_MODEL,
			messages=[
				{
					"role": "system",
					"content": example.system,
				},
				{
					"role": "user",
					"content": [
						{
							"type": "text",
							"text": example.question.strip(),
						},
						{
							"type": "image_url",
							"image_url": {
								"url": example.image_dataurl,
							},
						},
					]
				},
			],
			temperature=temperature,
			top_p=0.95,
			max_tokens=512,
			logprobs=logprobs,
		)
	finally:
		# Release the HTTP client even when the request fails.
		client.close()

	assert len(response.choices) == 1, f"Expected 1 responses, got {len(response.choices)}"
	assert response.choices[0].message.content is not None, "Response content is None"
	response_text = response.choices[0].message.content.strip()
	if logprobs:
		return response_text, response.choices[0].logprobs.content

	return response_text

In [None]:
# Relative weight of each question type (presumably used for sampling; the consumer is not visible here).
# The raw counts are turned into fractions below.
question_types = {
	"questionAnswer": 2000,
	"default-descriptive": 2000,
	"training_prompt": 2000,
	"all_tags": 2000,
	"midjourney": 1000,
	"straight-forward": 2000,
	"social-media": 200,
	"booru-tags": 200,
	"product-listing": 200,
	"art-critic": 200,
	"consise-descriptive": 200,
}

# Normalize so the weights sum to 1
n = sum(question_types.values())
question_types = {k: v / n for k, v in question_types.items()}


def get_random_prompt(question_type: str, filehash: bytes) -> str:
	"""
	Pick a random user prompt for the given question type.

	For 'all_tags' the prompt targets a booru site: the first of danbooru, e621,
	rule34 that is a source containing ``filehash``, otherwise a randomly chosen
	one. For every other type a template variant (plain, word-count or length;
	formal, or occasionally informal for 'default-descriptive') is chosen at random
	and filled with a random word count (multiple of 10) and length description.
	"""
	if question_type == 'all_tags':
		booru_sites = ["danbooru", "e621", "rule34"]
		# Drawn up front even when a matching source is found below.
		fallback_site = random.choice(["danbooru", "e621", "rule34"])
		containing_sources = {name for name, hashes in API_IMAGES_BY_SOURCE.items() if filehash in hashes}
		site = next((candidate for candidate in booru_sites if candidate in containing_sources), fallback_site)

		return random.choice(PROMPT_MAPPINGS['prompts']['all_tags'][site]).strip()

	# Round the word count down to a multiple of 10
	word_count = (random.randint(20, 300) // 10) * 10
	length = random.choice(['very short', 'short', 'medium-length', 'long', 'very long'])

	x = random.random()
	y = random.random()

	# Informal phrasing is only available for 'default-descriptive', about 10% of the time
	if y < 0.1 and question_type == 'default-descriptive':
		tone = 'informal'
	else:
		tone = 'formal'

	# x splits the variants roughly into thirds: plain / word_count / length
	if x < 0.33:
		variant = tone
	elif x < 0.66:
		variant = tone + ',word_count'
	else:
		variant = tone + ',length'

	template = random.choice(PROMPT_MAPPINGS['prompts'][question_type][variant])

	return template.format(
		word_count=word_count,
		length=length,
	).strip()


def _build_hammered_examples_helper(ranking_to_response: Callable[[list[str]], tuple[str, str] | None]):
	"""
	Turn judged rankings into new head-to-head preference examples and store them.

	For every ranking produced by CURRENT_RESPONSE_MODEL that has judge rankings, the
	responses are reordered by `judge_rankings` and handed to `ranking_to_response`,
	which returns the two responses to compare (or None to skip that ranking). Pairs
	that already exist as a preference between the same model, in either A/B order,
	are skipped. The remaining pairs are shuffled into a random A/B order, wrapped in
	an EvalSample with task_type "overall", and inserted into the database.
	"""
	# Index every stored preference by (image, system prompt, question, response A, response B)
	compared = defaultdict(list)
	for pref in list_db_preferences():
		compared[(pref.filehash, pref.system, pref.question, pref.response_a, pref.response_b)].append(pref)

	# Walk the rankings and collect pairs that haven't been compared head-to-head yet
	pending: list[EvalSample] = []
	for ranked in tqdm(list_db_rankings(), desc="Building hammered examples"):
		order = ranked.judge_rankings
		if order is None or ranked.responses_model != CURRENT_RESPONSE_MODEL:
			# Either not judged yet, or generated by a different response model
			continue

		assert len(order) >= 2, f"Rankings are not valid for {ranked.id}: {order}"
		assert ranked.responses is not None and len(ranked.responses) == len(order), f"Responses are not valid for {ranked.id}"

		pair = ranking_to_response([ranked.responses[idx] for idx in order])
		if pair is None:
			continue

		assert len(pair) == 2, f"Expected 2 responses, got {len(pair)}: {pair}"

		# A pair counts as already compared if a same-model preference exists in either A/B order
		context = (ranked.filehash, ranked.system, ranked.question)
		model = ranked.responses_model
		first, second = pair[0], pair[1]
		already_compared = any(
			prior.response_a_model == model and prior.response_b_model == model
			for ordering in ((first, second), (second, first))
			for prior in compared.get(context + ordering, [])
		)
		if already_compared:
			continue

		# Randomize which response ends up as A and which as B
		shuffled = list(pair)
		random.shuffle(shuffled)

		sample = EvalSample(
			filehash=ranked.filehash,
			system=ranked.system,
			question=ranked.question,
			question_type=ranked.question_type,
			task_type="overall",
			response_a=shuffled[0],
			response_b=shuffled[1],
			response_a_model=model,
			response_b_model=model,
			ground_truth_knowledge=build_ground_truth_knowledge(ranked.filehash, ranked.question),
		)
		pending.append(sample)
		# Remember the pair so a later ranking in this same run cannot add it again
		compared[context + (shuffled[0], shuffled[1])].append(sample)

	# Insert everything in a single transaction
	with db_conn() as (conn, cur):
		inserted = [sample.add_to_database(cur) for sample in tqdm(pending)]
		conn.commit()

	print(f"Added {len(inserted)} new examples to the database")


def build_hammered_examples():
	"""
	Build preference pairs from the MultiRanked dataset.
	Each pair is the first (best) and last (worst) response of a ranking.
	"""
	def _first_and_last(ranked_responses: list[str]) -> tuple[str, str]:
		return ranked_responses[0], ranked_responses[-1]

	_build_hammered_examples_helper(_first_and_last)


def build_hammered_examples2():
	"""
	Build preference pairs from the MultiRanked dataset.
	The "best" side is the highest-ranked of the top three responses that is_repper does
	not flag; the "worst" side is the lowest-ranked of the bottom three that it does flag.
	Rankings with fewer than six responses, or with no candidate for either side, are skipped.
	"""
	def _to_responses(ranked_responses: list[str]) -> tuple[str, str] | None:
		if len(ranked_responses) < 6:
			return None

		top_three, bottom_three = ranked_responses[:3], ranked_responses[-3:]
		clean_top = [candidate for candidate in top_three if not is_repper(candidate)]
		flagged_bottom = [candidate for candidate in bottom_three if is_repper(candidate)]

		if not clean_top or not flagged_bottom:
			return None

		return clean_top[0], flagged_bottom[-1]

	_build_hammered_examples_helper(_to_responses)


def build_hammered_examples3():
	"""
	Build preference pairs from the MultiRanked dataset.
	The "best" side is the highest-ranked response that is_repper does not flag; the
	"worst" side is the first response ranked after it that is_repper does flag.
	Rankings with fewer than six responses, or without such a pair, are skipped.
	"""
	def _to_responses(ranked_responses: list[str]) -> tuple[str, str] | None:
		if len(ranked_responses) < 6:
			return None

		not_found = object()
		# One shared iterator: the second search resumes right after the first match
		remaining = iter(ranked_responses)

		best = next((r for r in remaining if not is_repper(r)), not_found)
		if best is not_found:
			return None

		worst = next((r for r in remaining if is_repper(r)), not_found)
		if worst is not_found:
			return None

		return (best, worst)

	_build_hammered_examples_helper(_to_responses)


def build_examples(n: int):
	"""
	Top up the preference table towards `n` examples, split across question types
	according to the `question_types` weights.

	Images that already appear in any stored preference are never reused.
	'questionAnswer' examples are taken from `vqa_examples`; every other type gets an
	unused API image, a random system prompt and a random user prompt. When the pool
	of VQA examples or API images runs dry, fewer examples than requested are created.
	"""
	# Tally what is already stored: count per question type, plus every image already used
	existing_types = defaultdict(int)
	existing_filehashes = set()
	for stored in list_db_preferences():
		existing_types[stored.question_type] += 1
		existing_filehashes.add(stored.filehash)

	# Whatever VQA examples / API images have not been used yet
	remaining_vqa = [e for e in vqa_examples if e.filehash not in existing_filehashes]
	remaining_api_images = list(set(API_IMAGES_TO_USE) - existing_filehashes)

	print(f"# remaining VQA examples: {len(remaining_vqa)}")
	print(f"# remaining API images: {len(remaining_api_images)}")

	random.shuffle(remaining_vqa)
	random.shuffle(remaining_api_images)

	# Desired total per question type, then how many are still missing
	desired = {qtype: int(share * n) for qtype, share in question_types.items()}
	print({qtype: (wanted, existing_types[qtype]) for qtype, wanted in desired.items()})

	target_counts = {qtype: max(0, wanted - existing_types[qtype]) for qtype, wanted in desired.items()}
	print(target_counts)

	examples = []
	for question_type, target_count in target_counts.items():
		if question_type == 'questionAnswer':
			# VQA examples come ready-made; take as many as are available
			available = min(target_count, len(remaining_vqa))
			examples.extend(remaining_vqa.pop() for _ in range(available))
			continue

		# Every other type consumes one unused API image per example
		for _ in range(min(target_count, len(remaining_api_images))):
			filehash = remaining_api_images.pop()
			system = random.choice(system_prompts).strip()
			question = get_random_prompt(question_type, filehash)
			examples.append(EvalSample(
				filehash=filehash,
				system=system,
				question=question,
				question_type=question_type,
				task_type="overall",
				ground_truth_knowledge=build_ground_truth_knowledge(filehash, question),
			))

	print(f"Generated {len(examples)} examples to add to the database")

	# Insert everything in a single transaction
	with db_conn() as (conn, cur):
		examples = [example.add_to_database(cur) for example in tqdm(examples)]
		conn.commit()


def build_ranked_examples(max_n: int | None):
	"""
	Build MultiRanked examples based on the existing examples

	Every stored preference whose (filehash, system, question) does not have a ranking
	yet gets a new EvalSampleMulti, which is then inserted into the database. The
	preferences are visited in random order.

	max_n: upper bound on the number of rankings to create; None means no limit.
	"""
	existing_preferences = list_db_preferences()
	existing_rankings = {
		(ranking.filehash, ranking.system, ranking.question)
		for ranking in list_db_rankings()
	}

	random.shuffle(existing_preferences)

	new_examples = []
	for example in tqdm(existing_preferences, desc="Building ranked examples"):
		# Check the cap before creating anything, so that max_n=0 really creates nothing
		if max_n is not None and len(new_examples) >= max_n:
			break

		key = (example.filehash, example.system, example.question)
		if key in existing_rankings:
			continue

		# Generate a new example
		new_examples.append(EvalSampleMulti(
			filehash=example.filehash,
			system=example.system,
			question=example.question,
			# Take the type from the preference itself; there is no `question_type` variable in this function
			question_type=example.question_type,
			ground_truth_knowledge=build_ground_truth_knowledge(example.filehash, example.question),
		))
		# Several preferences can share the same prompt; only rank it once
		existing_rankings.add(key)

	print(f"Generated {len(new_examples)} rankings to add to the database")

	# Now insert all the new examples into the database
	with db_conn() as (conn, cur):
		for example in tqdm(new_examples):
			example.add_to_database(cur)
		conn.commit()

In [None]:
# System prompt given to the Judge model for pairwise (Response A vs. Response B) comparisons.
# It asks the judge to end its answer with machine-parsable <best>, <response_a_score> and
# <response_b_score> tags, with both scores on a 1-10 scale.
PROMPT = """
You are an expert AI Response Evaluator. Your task is to meticulously analyze and compare two distinct AI-generated responses (`Response A` and `Response B`) provided in response to a specific `User Query` and a `User Image`. These responses were generated based on a specific `Original System Prompt` that defined the AI's persona, constraints, and goals.

**Your Goal:** Determine which response (`Response A` or `Response B`) is objectively better according to the evaluation criteria outlined below. Provide a clear judgment and a detailed, reasoned justification for your choice.

**Input You Will Receive:**

1.  `Original System Prompt`: The instructions the AI models were given to generate their responses. Pay close attention to persona, tone, constraints, required format, and specific tasks mentioned here.
2.  `User Query`: The specific question or instruction from the user.
3.  `User Image`: An image provided by the user, which may be relevant to the query.
4.  `Response A`: One of the AI-generated responses.
5.  `Response B`: One of the AI-generated responses.

**Evaluation Criteria:**

1.  **Adherence to Original System Prompt:**
    *   Did the response follow ALL instructions, constraints, formatting requirements, persona, and tone specified in the `Original System Prompt`?
    *   How well did each response embody the defined persona or role?
2.  **Addressing the User Query:**
    *   Did the response directly, accurately, and completely answer the `User Query`?
    *   Is the response relevant to the user's explicit and implicit needs?
3.  **Image Integration:**
    *   If an image was provided and relevant, did the response appropriately acknowledge, analyze, or utilize the image content as necessitated by the `User Query` and `Original System Prompt`?
4.  **Helpfulness and Usefulness:**
    *   How helpful and practical is the response for the user? Does it provide value?
5.  **Accuracy and Factual Correctness:**
    *   Is the information presented accurate and free from errors? (Acknowledge if you cannot verify).
6.  **Clarity, Conciseness, and Structure:**
    *   Is the response well-organized, easy to understand, and appropriately concise? Is it free from unnecessary jargon or rambling?
7.  **Overall Quality:**
    *   Considering all the above, which response provides a superior user experience?

**Your Task Steps:**

1.  **Understand the Context:** Thoroughly review the `Original System Prompt`, `User Query`, and `User Image`. Understand the *expected* output.
2.  **Analyze Response A:** Evaluate `Response A` against all relevant criteria listed above. Note its strengths and weaknesses.
3.  **Analyze Response B:** Evaluate `Response B` against all relevant criteria listed above. Note its strengths and weaknesses.
4.  **Compare Responses:** Directly compare `Response A` and `Response B` on each relevant criterion. Highlight key differences. Which response performed better on each point?
5.  **Make a Judgment:** State clearly which response is better (`Response A` or `Response B`). If they are very close, state that, but still choose the slightly better one if possible. If one is significantly better, note that. In rare cases where both fail significantly or are equally good/bad in offsetting ways, you may note that, but *still attempt* to identify if one has a slight edge based on primary criteria (like prompt adherence and query addressing).
6.  **Provide Detailed Justification:** Explain *why* you chose one response over the other. Your justification should:
    *   Reference specific aspects of the responses, the `Original System Prompt`, the `User Query`, and the `User Image`.
    *   Clearly explain how the chosen response better meets the criteria.
    *   Point out the specific failings of the less preferred response.
    *   Be structured, clear, and objective.

**Note:**

*   If the user query sets a maximum word count for the response, the response should be within that limit.
*   If the user query asks for the response to be "very short", "short", "medium-length", "long", or "very long" then treat that as an approximate word count limit where very short is ~20 words, short is ~40 words, medium-length is ~60 words, long is ~100 words, and very long is ~200 words.
*   If the user asks for a list of danbooru, e621, or rule34 tags, the response is expected to follow these guidelines unless otherwise specified by the user or system prompt:
	*   Tags should be separated by commas (e.g. "tag1, tag2, tag3").
	*   Tags should be in lowercase.
	*   Tags should use underscores instead of spaces (e.g., "tag_me").
	*   Tags should be relevant to the content of the image and the user query.
	*   Tags should be based on the tagging system used by the site specified in the user query.
	*   Tags belong to categories, such as artist, character, copyright, etc.  The response should prepend the category to all tags except those in the general category (e.g. "artist:tag1, character:tag2, copyright:tag3").
	*   For rule34 or e621, the response should start by listing artist tags, then copyright tags, then character tags, then species tags, then meta tags, then lore tags, and finally general tags.
	*   For danbooru, the response should start with artist tags, then copyright tags, then chracter tags, then meta tags.
	*   Within each category, the tags should be sorted alphabetically.
	*   If the image does not have any tags within a certain category, for example there are no artist tags, then the response should not include that category at all. In other words, it is valid for a response to _not_ have any artist tags listed (or copyright tags, etc).  But if it does list a category the categories must be in the order specified above.
	*   The response should not include any tags that are not in the image or are not relevant to the user query.
	*   Unless specified otherwise by the user or system, the response should include all relavant tags in the image.
	*   Some tags have implicated tags.  For example, if "long_hair" is applicable to an image, the tag "hair" is also applicable, and it is correct to include both tags.
*   Do not bias your evaluations on the order of the responses.  "A" and "B" have been assigned arbitrarily.  You should evaluate them independently and then compare them.

**Output Format:**

Structure your evaluation clearly. You might use headings like:

*   **Context Summary:** (Briefly summarize the task set by the prompt/query)
*   **Analysis of Response A:** (Strengths/Weaknesses against criteria)
*   **Analysis of Response B:** (Strengths/Weaknesses against criteria)
*   **Comparison:** (Direct point-by-point comparison)
*   **Judgment:** (e.g., "Response B is better.")
*   **Justification:** (Detailed reasoning for the judgment)

After your evaluation, at the end of your response, always write a machine parsable output that includes:

* Based on your evaluation, the name of the best response between <best> and </best> tags. Either `<best>Response A</best>` or `<best>Response B</best>`.
* An overall quality score for Response A between 1 and 10, written between <response_a_score> and </response_a_score> tags. A score of 1 means the response is completely useless, and a score of 10 means the response is perfect.
* An overall quality score for Response B between 1 and 10, written between <response_b_score> and </response_b_score> tags. A score of 1 means the response is completely useless, and a score of 10 means the response is perfect.
"""

# Bounds of the score scale that PROMPT asks the judge to use
# (1 = completely useless, 10 = perfect).
PROMPT_SCORE_MIN = 1
PROMPT_SCORE_MAX = 10

# System prompt given to the Judge model when it ranks several responses at once.
# It asks for the final ordering, best to worst, as a comma-separated list of response
# labels inside <ranking></ranking> tags.
MULTI_PROMPT = """
You are an expert AI Response Evaluator. Your task is to meticulously analyze and compare several distinct AI-generated responses (`Response A`, `Response B`, `Response C`, …) that were produced in response to a specific `User Query` and a `User Image`. These responses were generated based on a specific `Original System Prompt` that defined the AI's persona, constraints, and goals.

**Your Goal:** Rank **all** responses from **best to worst** according to the evaluation criteria outlined below. Provide a clear judgment and a detailed, reasoned justification for your ordering.

**Input You Will Receive:**

1.  `Original System Prompt`: The instructions the AI models were given to generate their responses. Pay close attention to persona, tone, constraints, required format, and specific tasks mentioned here.
2.  `User Query`: The specific question or instruction from the user.
3.  `User Image`: An image provided by the user, which may be relevant to the query.
4.  `Response A`, `Response B`, `Response C`, …: The AI-generated responses to be evaluated.

**Evaluation Criteria (apply to each response):**

1.  **Adherence to Original System Prompt:**
    *   Did the response follow ALL instructions, constraints, formatting requirements, persona, and tone specified in the `Original System Prompt`?
    *   How well did each response embody the defined persona or role?
2.  **Addressing the User Query:**
    *   Did the response directly, accurately, and completely answer the `User Query`?
    *   Is the response relevant to the user's explicit and implicit needs?
3.  **Image Integration:**
    *   If an image was provided and relevant, did the response appropriately acknowledge, analyze, or utilize the image content as necessitated by the `User Query` and `Original System Prompt`?
4.  **Helpfulness and Usefulness:**
    *   How helpful and practical is the response for the user? Does it provide value?
5.  **Accuracy and Factual Correctness:**
    *   Is the information presented accurate and free from errors? (Acknowledge if you cannot verify).
6.  **Clarity, Conciseness, and Structure:**
    *   Is the response well-organized, easy to understand, and appropriately concise? Is it free from unnecessary jargon or rambling?
7.  **Overall Quality:**
    *   Considering all the above, how strong is the user experience provided?

**Your Task Steps:**

1.  **Understand the Context:** Thoroughly review the `Original System Prompt`, `User Query`, and `User Image`. Understand the *expected* output.
2.  **Analyze Each Response Individually:** For every response, evaluate it against all relevant criteria listed above. Note its strengths and weaknesses.
3.  **Compare Responses:** Directly contrast the responses on each criterion.
4.  **Rank the Responses:** Order them from **best** (highest overall quality) to **worst** (lowest overall quality).
5.  **Provide Detailed Justification:** Explain *why* each response occupies its position in the ranking. Reference specific aspects of the responses, the `Original System Prompt`, the `User Query`, and the `User Image`. Be structured, clear, and objective.

**Output Format (strict):**

Use the following headings *exactly*:

* **Context Summary:**  
  Briefly summarize what the user asked for and what the system prompt requires.

* **Per-Response Analysis:**  
  For each response in **ranked order**, supply a short bullet-point list of its key strengths and weaknesses.

* **Ranking Explanation:**  
  A concise narrative comparing the responses, highlighting decisive factors for the ordering.

* **Ranking:**  
  A single line listing the response labels from best to worst, comma-separated, enclosed in `<ranking>` tags.  
  Example: `<ranking>Response C, Response A, Response B</ranking>`

**Notes & Special Rules:**

*   If the user query sets a maximum word count for the response, the response should be within that limit.
*   If the user query asks for the response to be "very short", "short", "medium-length", "long", or "very long" then treat that as an approximate word count limit where very short is ~20 words, short is ~40 words, medium-length is ~60 words, long is ~100 words, and very long is ~200 words.
*   If the user query asks for the response to be in a casual tone, the response should be written as if it were written by a human with a casual tone, using contractions, slang, and informal language as appropriate. It should avoid sounding like a robot pretending to be casual.
*   If the user asks for a list of danbooru, e621, or rule34 tags, the response is expected to follow these guidelines unless otherwise specified by the user or system prompt:
	*   Tags should be separated by commas (e.g. "tag1, tag2, tag3").
	*   Tags should be in lowercase.
	*   Tags should use underscores instead of spaces (e.g., "tag_me").
	*   Tags should be relevant to the content of the image and the user query.
	*   Tags should be based on the tagging system used by the site specified in the user query.
	*   Tags belong to categories, such as artist, character, copyright, etc.  The response should prepend the category to all tags except those in the general category (e.g. "artist:tag1, character:tag2, copyright:tag3").
	*   For rule34 or e621, the response should start by listing artist tags, then copyright tags, then character tags, then species tags, then meta tags, then lore tags, and finally general tags.
	*   For danbooru, the response should start with artist tags, then copyright tags, then chracter tags, then meta tags.
	*   Within each category, the tags should be sorted alphabetically.
	*   If the image does not have any tags within a certain category, for example there are no artist tags, then the response should not include that category at all. In other words, it is valid for a response to _not_ have any artist tags listed (or copyright tags, etc).  But if it does list a category the categories must be in the order specified above.
	*   The response should not include any tags that are not in the image or are not relevant to the user query.
	*   Unless specified otherwise by the user or system, the response should include all relavant tags in the image.
	*   Some tags have implicated tags.  For example, if "long_hair" is applicable to an image, the tag "hair" is also applicable, and it is correct to include both tags.
*   Do **not** use the order in which the candidates are presented as a bias when ranking them; judge solely on the criteria above.
"""




# User-message template for the multi-response ranking judge.
# Placeholders: {system_prompt}, {user_query}, {responses}, {ground_truth_reference}.
MULTI_USER_MESSAGE = """
<original_system_prompt>{system_prompt}</original_system_prompt>
<user_query>{user_query}</user_query>
{responses}

---

{ground_truth_reference}

**Now, please perform the evaluation based on the instructions provided in your system prompt. Don't forget to clearly mark the ranking in <ranking></ranking> tags at the end of your response, with a single line, comma separated list of the response labels in ranked order from best to worst.**
"""

# MULTI_USER_MESSAGE_ALL_TAGS = """
# <original_system_prompt>{system_prompt}</original_system_prompt>
# <user_query>{user_query}</user_query>
# {responses}

# ---

# {ground_truth_reference}
# <ground_truth_tags>{ground_truth_tags}</ground_truth_tags>

# **Now, please perform the evaluation based on the instructions provided in your system prompt.**
# Additionally, the ground truth tags associated with the user's image from {source} have been provided to you. You should assume that these are correct, but not exhaustive. You can use these ground truth tags to assist in assessing the accuracy of the responses.
# **Don't forget to clearly mark the ranking in <ranking></ranking> tags at the end of your response, with a single line, comma separated list of the response labels in ranked order from best to worst.**
# """



# User-message template for the pairwise (A vs. B) judge.
# Placeholders: {system_prompt}, {user_query}, {responses}, {ground_truth_reference}.
USER_MESSAGE = """
<original_system_prompt>{system_prompt}</original_system_prompt>
<user_query>{user_query}</user_query>
{responses}

---

{ground_truth_reference}

**Now, please perform the evaluation based on the instructions provided in your system prompt. Don't forget to clearly mark the best response in <best></best> tags, and provide a score for each response between 1 and 10, inside <response_a_score></response_a_score> and <response_b_score></response_b_score> tags.**
"""

# USER_MESSAGE_ALL_TAGS = """
# <original_system_prompt>{system_prompt}</original_system_prompt>
# <user_query>{user_query}</user_query>
# {responses}

# ---

# {ground_truth_reference}
# <ground_truth_tags>{ground_truth_tags}</ground_truth_tags>

# **Now, please perform the evaluation based on the instructions provided in your system prompt.**
# Additionally, the ground truth tags associated with the user's image from {source} have been provided to you. You should assume that these are correct, but not exhaustive. You can use these ground truth tags to assist in assessing the accuracy of the responses.
# **Don't forget to clearly mark the best response in <best></best> tags at the end of your response, and provide a score for each response between 1 and 10, inside <response_a_score></response_a_score> and <response_b_score></response_b_score> tags.**
# """


# System prompt with the general guidelines for image descriptions/captions.
# PRIMARY_SYSTEM_MESSAGE maps several question types to it ("default-descriptive",
# "booru-tags", "art-critic", "consise-descriptive").
CAPTION_SYSTEM_PROMPT = """
XXXXX

Unless requested otherwise, when you write an image description or caption, you follow these general guidelines:

* The description should be detailed.
* The description must be accurate.
* Include information about both the subject and background, colors, objects, people, clothes, textures, styles, locations of objects in the image, etc.
* Include in your caption whether the image is a photograph, drawing, CGI, etc, and what specific style (for example if it is art, what style of art; if it's a drawing is it done in pencil, digital medium, etc).
"""


# System prompt with the guidelines for writing a stable diffusion prompt for an image.
# Mapped to the "training_prompt" question type in PRIMARY_SYSTEM_MESSAGE.
TRAINING_PROMPT_SYSTEM_PROMPT = """
XXXXX

Unless requested otherwise, when you write a stable diffusion prompt for an image, you follow these general guidelines:
* The prompt must be accurate.
* The prompt should match how real human users write stable diffusion prompts.
* Stable diffusion prompts often (but not always) use short phrases, descriptive tags, keywords, etc separated by commas, to indicate the desired content of the image succinctly.  Full sentences are less common (though not unheard of).
* Stable diffusion prompts often (but not always) use tags from websites like danbooru, e621, rule34, etc.
* Stable diffusion prompts are prompts for text-to-image models. Clarity and effectivenss for the model are prioritized over grammatical perfection or narrative flow. They lead with subject or medium, not narration. Prompts usually begin with the main subject (“portrait of…”, “anime girl…”, “oil painting of…”, "in the style of…", etc) rather than meta phrases like “This image shows…”, "You are looking at...", etc.
* Stable diffusion prompts sometimes (but not always) use slang, abbreviations, and shorthand. For example, "pussy" instead of "vagina", "anthro" instead of "anthropomorphic", "nsfw" instead of "not safe for work", "photo" instead of "photograph", etc.
* Order matters: The order of keywords can influence their impact; terms appearing earlier sometimes carry more weight or influence the foundational composition.
* Art style descriptors are common in prompts, as well as artist references, to emulate the specific styles.
* For photos, prompts often include camera specifications.
* Lighting descriptions are common.
* Since the goal is for the prompt to recreate the reference image, things like watermarks, compression artifacts, and other image flaws should be included in the prompt if they are present in the reference image.
"""


# System prompt for writing a MidJourney prompt that recreates the user's image.
# Mapped to the "midjourney" question type in PRIMARY_SYSTEM_MESSAGE.
MIDJOURNEY_SYSTEM_PROMPT = """
XXXXX

Unless requested otherwise, you write a MidJourney prompt that would best recreate the user's image.
"""


# System prompt mapped to the "questionAnswer" question type in PRIMARY_SYSTEM_MESSAGE.
VQA_SYSTEM_PROMPT = """
XXXXX
"""


# Effectively empty: the string contains only a newline.
# NOTE(review): PRIMARY_SYSTEM_MESSAGE maps "all_tags" to "" rather than to this
# constant — confirm which of the two is intended.
ALL_TAGS_PROMPT = """
"""


# System prompt for confident, direct image descriptions (no hedging words, no mood,
# medium always stated). Mapped to the "straight-forward" question type in
# PRIMARY_SYSTEM_MESSAGE.
STRAIGHTFORWARD_SYSTEM_PROMPT = """
When the user asks for an image description, you should craft a confident, direct set of instructions for describing the image in a way that both visually impaired readers and text-to-image models can understand and recreate. Begin by stating the main subject and medium. Immediately highlight pivotal elements—people, objects, scenery—using definite language without any hedging words.

Focus on clear, concrete details: colors, shapes, textures, and spatial relationships (foreground, background, left, right). Describe how elements interact (“A golden retriever bounding through tall grass” rather than “A dog seems to be in some field”). Keep the tone casual yet precise, avoiding overly technical or flowery language.

Never start with “This image is…”, "The image is of...", etc as writing variations like that such as "This anime illustration..." harms the text-to-image model due to the repetition of starting with "This" or "The" or variations and wastes that token as well.

The image's medium MUST be included somewhere, as it is the most fundamental detail.

Avoid words like "featuring", "depicts", "showcasing", "revealing", "captures", etc unless necessary, as they are common in more formal, overly flowery writing styles that do not assist a text-to-image model.

Do not talk about the mood or feel of the image, as it is subjective and not something that can be recreated by a text-to-image model.

Do not use "likely,", "possibly," "appears," "seems," "suggests," "indicates," "indicating", "suggesting," or variations thereof and similar ambiguous terms. State each observation plainly and confidently.

Note: These restrictions do not apply to quotes, dialogue, or text in the image, which should be transcribed verbatim.

If there are visible compression artifacts, mention them, but do not speculate on the cause or resolution. If the image is blurry, mention that, but do not guess why it is blurry.

If there are watermarks or artist signatures or other parts that identify the source or creator of the image, mention them.

Do not use phrasing like "... is visible" or "... can be seen" as it is redundant (if it's in the description, it's visible), and does not add any information to the description.

Vary the order in which you describe elements in the image to keep the description engaging and informative. Do not repeat the same structure for each description. For example some descriptions may start with the subject, others with the background, others with the overall scene, etc.

Do not mention things that are not in the image, such as "No people are visible" or "There are no watermarks" as it is not helpful to a text-to-image model.

Never mention resolution. Do not guess at unobservable details. Vary your vocabulary to keep the description lively. Above all, make each sentence purposeful—include only essential information that helps someone visualize or reproduce the scene exactly.
"""


# System prompt mapped to the "social-media" question type in PRIMARY_SYSTEM_MESSAGE.
SOCIAL_MEDIA_SYSTEM_PROMPT = """
XXXXX

"""


# System prompt mapped to the "product-listing" question type in PRIMARY_SYSTEM_MESSAGE.
PRODUCT_LISTING_SYSTEM_PROMPT = """
XXXXX

"""


# Maps each question type (the same keys as `question_types`) to a system prompt constant.
# Several caption-like types share CAPTION_SYSTEM_PROMPT.
# NOTE(review): "all_tags" maps to an empty string even though ALL_TAGS_PROMPT exists — confirm this is intended.
PRIMARY_SYSTEM_MESSAGE = {
	"default-descriptive": CAPTION_SYSTEM_PROMPT,
	"training_prompt": TRAINING_PROMPT_SYSTEM_PROMPT,
	"all_tags": "",
	"midjourney": MIDJOURNEY_SYSTEM_PROMPT,
	"straight-forward": STRAIGHTFORWARD_SYSTEM_PROMPT,
	"social-media": SOCIAL_MEDIA_SYSTEM_PROMPT,
	"booru-tags": CAPTION_SYSTEM_PROMPT,
	"product-listing": PRODUCT_LISTING_SYSTEM_PROMPT,
	"art-critic": CAPTION_SYSTEM_PROMPT,
	"consise-descriptive": CAPTION_SYSTEM_PROMPT,
	"questionAnswer": VQA_SYSTEM_PROMPT,
}


# Shared system-prompt preamble. prep_evaluation() strips it and appends the example's own
# system prompt after two newlines, i.e. directly below the trailing "# Additional System Prompt" heading.
# The text is runtime data sent to the judge; edit with care.
ALPHA_SYSTEM_MESSAGE = """
XXXXX

## Core Vision Principles (Apply to all tasks unless overridden below by user query or system prompt)

These principles apply when you are asked to caption, describe, or analyze an image, or when you are asked to generate a prompt for a text-to-image model based on an image.

*   **Clarity & Confidence:** State observations plainly and confidently. Avoid hedging ("likely," "possibly," "appears," "seems," "suggests") and overly formal/flowery language ("featuring," "depicts," "showcasing," "revealing," "captures") unless necessary for quotes or specific styles.
*   **Accuracy:** Descriptions and prompts must be accurate to the image.
*   **Medium & Style:** Always identify the image medium (photograph, drawing, CGI, etc.) and specific style (e.g., oil painting, pencil sketch, anime art, photorealistic).
*   **Text Transcription:** Transcribe quotes, dialogue, or text verbatim.
*   **Contextual Elements:** Include details about background, colors, objects, clothes, textures, lighting, and spatial relationships.
*   **Flaws:** Mention visible flaws like watermarks, compression artifacts, or blurriness if present.
*   **Exclusions:** Do not mention image resolution. Do not describe things *not* in the image (e.g., "No people visible").

## Task-Specific Guidelines

These guidelines only apply in specific contexts unless overridden by user query or system prompt.

### Image Captioning (Default)
*   Follow Core Principles to create a detailed, descriptive caption.
*   Aim for clear prose that paints a picture for the reader.

### Stable Diffusion Prompting
*   The prompt does not need to be complete. The prompt should vary from very simple to very detailed—or even a terse list of keywords or tags. All levels and formats that real users employ are acceptable unless the user asks otherwise.
*   Adapt Core Principles into a prompt suitable for Stable Diffusion.
   *   Medium & Style is optional
   *   Contextual Elements is optional
   *   Details are optional
*   The prompt should match how real human users write stable diffusion prompts.
*   Humans write stable diffusion prompts in very different ways, all of which are valid:
   *   Short tag list: comma separated keywords, descriptive tags, etc.
   *   Comma-separated phrase string: short phrases, descriptive tags, keywords, incomplete sentences, etc separated by commas or other punctuation.
   *   Full sentence caption: full but concise and terse sentences, separated by periods, commas, or other punctuation.
*   Bare tag or keyword dumps are equally valid 'human-style' prompts, especially for niche communities.
*   Any of these styles are fine, including combinations of them, unless the user demands a specific format.
*   Feel free to use slang, shorthand, or explicit language.
*   Do **not** add leading meta phrases like "This image shows..." or "You are looking at..." as they are common in more formal, overly flowery writing styles that do not assist a text-to-image model.
*   Order matters: The order of keywords can influence their impact; terms appearing earlier sometimes carry more weight or influence the foundational composition.
*   The image medium may be included in the prompt, but if the prompt is at a low level of detail, it is not necessary.
*   Art style descriptors are common in prompts, as well as artist references, to emulate the specific styles.
*   For photos, prompts may include camera specifications.
*   Lighting descriptions are common.
*   Since the goal is for the prompt to recreate the reference image, things like watermarks, compression artifacts, and other image flaws should be included in the prompt if they are present in the reference image.

### MidJourney Prompting
*   Adapt Core Principles into a prompt suitable for MidJourney.
*   The prompt should match the style of real MidJourney prompts.

### Straightforward Captioning
*   Follow Core Principles, emphasizing clarity for text-to-image models and visually impaired readers.
*   Start directly with the main subject and medium (e.g., "Oil painting of..."). *Never* start with "This image shows..." or similar phrases.
*   Focus on concrete, objective visual details (colors, shapes, positions). Avoid subjective interpretations (mood, feeling).
*   Maintain a direct, slightly more formal tone than general captioning but avoid the forbidden flowery words.
*   Never start with “This image is…”, "The image is of...", etc as writing variations like that such as "This anime illustration..." harms the text-to-image model due to the repetition of starting with "This" or "The" or variations and wastes that token as well.
*   Do not use phrasing like "... is visible" or "... can be seen" as it is redundant (if it's in the description, it's visible), and does not add any information to the description.
*   Vary your vocabulary to keep the description lively. Above all, make each sentence purposeful—include only essential information that helps someone visualize or reproduce the scene exactly.

### Social Media Captioning
*   Follow Core Principles regarding accuracy and detail where appropriate for the context.
*   Write in an engaging, human-like style suitable for social media.
*   Maximize potential engagement (e.g., posing questions, using relevant hashtags if requested).
*   Adapt the level of explicit detail based on typical social media norms unless instructed otherwise.

### Product Listing
*   Follow Core Principles regarding accuracy and detail where relevant to the product.
*   Write compelling copy focused on highlighting the product shown in the image.
*   Focus on features and benefits visible or implied by the image.

# Additional System Prompt
"""



# Pool of free-form user queries in deliberately varied registers: strict output formats,
# casual/typo-laden requests, role-play, prompt-writing tasks, and NSFW phrasing.
# The consumer of this list is outside this section.
# NOTE(review): one entry contains a `{word_count}` placeholder while another contains literal
# JSON braces, so applying str.format() to every entry would fail on the JSON one — confirm how
# the placeholder is substituted.
# NOTE(review): a few entries end with stray characters (a trailing ”, a trailing <); presumably
# intentional noise — confirm before "fixing", since these strings are runtime data.
CRAZY_QUERIES = [
	"""You are an extremely literal spatial-reasoning bot. Describe the image solely in terms of basic geometric forms and their relative positions. Do not use semantic labels (e.g., “table,” “person”).""",
	"""If a human face is present, output their estimated age range and primary emotion; otherwise respond with the literal text “NO FACE.”""",
	"""Output a YAML list of every object you can identify, including their approximate locations (e.g., top-left, center, bottom-right).""",
	"""Generate an ALT text under 80 characters that conveys the scene’s key elements for screen readers.""",
	"""pls tell me wht material the foreground object is made of and how you know (2 reasons).""",
	"""Act as a detective: list three items in the photo that could serve as forensic evidence.”""",
	"""Generate a list of 10 keywords or tags that accurately describe this image's content and style.""",
	"""ok act like ur a really bored teenager who has to describe this for homework or something idk""",
	"""describe this image as if you were really horny""",
	"""```json
{
"image_summary": {
"primary_subject_category": null,
"setting_type": null, // e.g., indoor, outdoor, abstract
"dominant_mood": null,
"estimated_complexity": null // e.g., simple, moderate, complex
}
}
```
      
Fill this JSON based on the image.""",
	"""Summarize this image in exactly 7 words.""",
	"""pls write a vry formal description, like for a museum label maybe? thx""",
	"""Generate alt text for this image, keeping accessibility guidelines in mind.""",
	"""Can you create a short narrative or backstory inspired by whatever this is showing?""",
	"""Ignore the main subject(s) and describe only the background or environment.""",
	"""Just give me a bulleted list of the key visual components.""",
	"""What questions does this image raise in your "mind"?""",
	"""Write one sentence describing the foreground, one for the middle ground, and one for the background.""",
	"""pls give me a description, but use lots of adjectives!! make it super descriptive thx :)""",
	"""Summarize the image in exactly six words.""",
	"""“first thought best thought” — jot down your raw, unfiltered first impressions of this picture in {word_count} words or less.""",
	"""Create a Dungeons & Dragons scene description the DM would read, based on this picture.""",
	"""craft an Instagram caption that would maximize engagement.""",
	"""Which classic film still could this image most resemble? Give one title and a 2-sentence rationale.""",
	"""Give me a crisp, one-sentence caption for this pic, then restate it as a cheesy movie tagline.""",
	"""Create a Twitter caption under 140 characters, including one trending hashtag and one niche emoji.""",
	"""Write a four-sentence newsroom photo caption, adhering to Associated Press style guidelines.""",
	"""Describe the hair color and hairstyle of every character present, please keep it brief.""",
	"""describe it but use only words with 5 letters or less. hard mode lol<""",
	"""Could you suggest a title for this image?""",
	"""List any artists, photographers, or creators who might be associated with this image based on its style or content.""",
	"""Summarize this quick.""",
	"""From what point of view is this image presented? (e.g. eye-level, bird's eye, low angle, etc.)""",
	"""Describe this image using only single adjectives, separated by commas. Give me at least ten.""",
	"""Imagine this is a frame from a movie. What might have happened just before this moment, and what might happen next?""",
	"""info on image plz""",
	"""if this image were on a porn website what would the caption be?""",
	"""You are an expert on the CivitAI dataset, which contains the API responses for all of the images on that website. Write the API response for this image as if it were in that dataset. Include fields such as the diffusion model used, the prompt used to generate the image, the negative prompt used, and any other relevant fields. Respond in JSON format.""",
	"""Rate the quality of this image on a scale from 1 to 10, with 10 being the highest quality. Provide a brief explanation for your rating.""",
	"""Write me a short, super casual caption for Instagram that doesn’t use more than 12 words.""",
	"""Write a twitter post that would go viral if this image were posted on Twitter. Maximum of 140 characters.""",
	"""Write a reddit title within 18 words that would get this image the most upvotes if posted on Reddit.""",
	"""Imagine this image is from the CoCo dataset. Write the associated caption as if it were in that dataset.""",
	"""With one sentence per line describe this image.""",
	"""Write an SDXL prompt for this image that would recreate it as closely as possible. Write it like the average human writes prompts for SDXL.""",
	"""Output a PonyDiffusionXL v6 prompt that would recreate this image as closely as possible. Write it like the average human writes prompts for PonyXL""",
	"""Break this image down into a JSON structure that covers all the key elements and details.""",
	"""Output python code where each object in the image is instantiated and various attributes of the object are set based on the image.""",
]

# Groups of three example captions; each group is written in one distinctive voice
# (plain descriptive, social media, haiku-like, noir, camera-spec, children's story, ...).
# The consumer of this list is outside this section.
# NOTE(review): the first eight groups are tuples and the remaining ones are lists. Both index
# and iterate the same way, but confirm no consumer depends on the container type
# (hashing, in-place shuffling) before normalizing.
RANDOM_EXAMPLE_SETS = [
	("Close-up of a red ladybug resting on the edge of a green leaf, dew droplets scattered around.",
		"Wide aerial view of a winding river cutting through autumn forest, morning fog hovering low.",
		"Black-and-white portrait of an elderly smiling softly, deep wrinkles framing his eyes."),
	("Sunset flexing harder than my gym buddy. 🌅💪 #NoFilterButLotsOfFeelings",
		"Cat mid-yawn or plotting galactic takeover? You decide. 🐈‍⬛😼",
		"When your coffee spills but the latte art still slaps. ☕🎨 #SkillOrSorcery"),
	("Cranes skim mirror ponds / rice fields breathe a silver hush / dawn bows to their wings",
		"Desert moonrise—quiet / dunes trade secrets with cold stars / solitude glows blue",
		"Old pier creaks once more / gulls stitch salt into the wind / tide erases threads"),
	("Rain painted the alley chrome, and her neon umbrella was the only confession worth hearing.",
		"The city’s skyline smoked like a bad habit; somewhere in that haze, trouble lit its next cigarette.",
		"He wore his grin the way a cardsharp palms an ace—too smooth to be clean."),
	("ISO 100 │ f/8 │ 1/250 s │ 35 mm—high-contrast shot of basalt columns under midday sun; strong leading lines create geometric tension.",
		"ISO 1600 │ f/2 │ 1/60 s │ 50 mm—low-light café interior, warm tungsten cast; shallow depth isolates barista’s latte art.",
		"ISO 400 │ f/11 │ 8 s │ 14 mm—nighttime long exposure at seafront pier; smooth water surface, starburst streetlamps."),
	("The squirrel in the blue waistcoat balanced a hazelnut crown upon his head and declared it Adventure Day.",
		"Deep inside the tulip forest, Luna the caterpillar practiced her very first butterfly curtsy.",
		"When the moon forgot to rise, the fireflies stitched a silver blanket so the night wouldn’t feel naked."),
	("Behold yon orchard bathed in amber light; the plums, like rubies, tempt the hand of dusk.",
		"Across the storm-toss’d sea a lantern sways, a lone star guiding hearts that seek the shore.",
		"Soft zephyrs court the rose in fragrant bloom, whilst petals blush beneath the wooing breeze."),
	("Neon-spill ∆ city-skin; drones hum lullabies while rain pixelates the street.",
		"Heartbeat sync—signal lost—umbrella holograms stutter over chrome koi ponds.",
		"Headlights carve data-scars into midnight asphalt; graffiti AIs rewrite themselves on loop."),
    [
      "🌄🦌🌲✨",
      "🚴‍♀️💨🌧️🏁",
      "🍜🔥😋🍥"
    ],
    [
      "Today at 0900 UTC, HorizonTech unveiled its flagship solar drone, capturing a sunrise test flight above the Mojave Desert to demonstrate zero-emission aviation.",
      "The attached image showcases our new micro-data-center rack, engineered for 40 % lower power draw while exceeding Tier-IV redundancy standards.",
      "In response to coastal resiliency goals, AquaGrid’s prototype tidal turbine—pictured mid-deployment—successfully reached 97 % of projected output during phase-two trials."
    ],
    [
      "Figure 1: Confocal microscopy reveals chloroplast migration toward the adaxial leaf surface under 500 µmol m⁻² s⁻¹ blue-light stimulus (n = 30, p < 0.01).",
      "Plate B illustrates crystalline domains of the polymer after annealing at 180 °C for 15 min, supporting a 12 % increase in tensile strength versus control.",
      "Spectrogram indicates humpback whale vocalization shift to higher frequencies in shipping lanes; peak energy observed at 580 Hz (CI 95 %)."
    ],
    [
      "Silver train, lonely track.",
      "Tea steam meets winter glass.",
      "First snow on forgotten swings."
    ],
    [
      "Pan-roast tomatoes until skins blister like this—deep scarlet, lightly charred, juices just starting to seep.",
      "Dough should look glossy yet pull clean from the bowl; see strands stretching in the photo—window-pane test passed.",
      "Finish with a butter baste: watch the steak edge foam and brown to hazelnut, exactly as pictured."
    ],
    [
      "The city folds into origami swans while traffic lights bleed marmalade over the asphalt.",
      "Clock-tower shadows drip upward, stitching daylight into the moon’s empty pockets.",
      "Umbrellas bloom from puddles, petals of rain whispering forgotten passwords to the wind."
    ],
]

In [None]:
T = TypeVar('T', bound=Union[EvalSample, EvalSampleMulti])
def prep_evaluation(example: T) -> T:
	"""
	Return a copy of `example` with every field the judge needs filled in.

	Steps, each skipped when the field is already set:
	  1. load the image data URL via `example.get_image()`;
	  2. generate the candidate responses (a pair for EvalSample, n=10 for EvalSampleMulti);
	  3. set `judge_system` from PROMPT / MULTI_PROMPT;
	  4. render `judge_user` from USER_MESSAGE / MULTI_USER_MESSAGE.

	NOTE(review): step 4 passes `source=source`, which is not a local of this function and is
	therefore resolved from the enclosing (notebook-global) scope — confirm it is defined
	before this function runs.
	"""
	# 1. Image
	if example.image_dataurl is None:
		example = example.get_image()

	# 2. Candidate responses
	if isinstance(example, EvalSample) and (example.response_a is None or example.response_b is None):
		example = generate_responses(example)
	elif isinstance(example, EvalSampleMulti) and example.responses is None:
		example = generate_responses_multi(example, n=10)

	# model_copy keeps the concrete type, so this flag stays valid for the rest of the function.
	is_pairwise = isinstance(example, EvalSample)

	# 3. Judge system prompt
	if example.judge_system is None:
		judge_prompt = PROMPT if is_pairwise else MULTI_PROMPT
		example = example.model_copy(update={"judge_system": judge_prompt.strip()})

	# 4. Judge user message
	if example.judge_user is not None:
		return example

	# Shared preamble followed by the example's own system prompt; the per-question-type
	# PRIMARY_SYSTEM_MESSAGE lookup is intentionally not used here.
	full_system_prompt = ALPHA_SYSTEM_MESSAGE.strip() + "\n\n" + example.system.strip()
	template = USER_MESSAGE if is_pairwise else MULTI_USER_MESSAGE

	if is_pairwise:
		assert example.response_a is not None and example.response_b is not None, "Missing response_a or response_b"
		candidates = [example.response_a, example.response_b]
	else:
		assert example.responses is not None, "Missing responses"
		candidates = example.responses

	def render(position: int, text: str) -> str:
		# Labels run 'a', 'b', 'c', ... in list order; word count is whitespace-delimited.
		tag = chr(ord("a") + position)
		n_words = len(text.split())
		return f"<response_{tag}>{text}</response_{tag}>\n<response_{tag}_word_count>{n_words}</response_{tag}_word_count>"

	rendered_responses = "\n".join(render(position, text) for position, text in enumerate(candidates))

	judge_user = template.format(
		system_prompt=full_system_prompt,
		user_query=example.question.strip(),
		responses=rendered_responses,
		source=source,
		ground_truth_reference=("" if example.ground_truth_knowledge is None else example.ground_truth_knowledge),
	).strip()

	return example.model_copy(update={"judge_user": judge_user})


# Verdict tag, e.g. "<best>Response A</best>". The closing ">" is not required by the pattern.
RE_BEST_TAG = re.compile(r"(?i)<best>\s*response\s+([ab])\s*</best")

# One score pattern per response label. Both use the same leniency as RE_BEST_TAG
# (closing ">" not required) so that A and B scores are parsed identically.
RE_SCORE_TAGS = {
	label: re.compile(rf"(?i)<response_{label}_score>\s*([0-9]+(?:\.[0-9]+)?)\s*</response_{label}_score")
	for label in ("a", "b")
}


def _parse_judge_score(evaluation: str, label: str) -> float | None:
	"""Return the score for response `label` rescaled to [0, 1], or None if absent or out of range."""
	m = RE_SCORE_TAGS[label].search(evaluation)
	if m is None:
		# Scores are optional in the judge output.
		return None

	score = float(m.group(1))
	if score < PROMPT_SCORE_MIN or score > PROMPT_SCORE_MAX:
		print(f"Warning: <response_{label}_score> tag out of range: {score}")
		return None

	return (score - PROMPT_SCORE_MIN) / (PROMPT_SCORE_MAX - PROMPT_SCORE_MIN)


def parse_judge_response(evaluation: str, example: EvalSample) -> EvalSample:
	"""
	Extract the pairwise verdict and optional scores from a judge's raw output.

	Args:
		evaluation: Full text returned by the judge model.
		example: The sample being judged.

	Returns:
		A copy of `example` with `judge_winner` ("a" or "b") and
		`judge_response_a_score` / `judge_response_b_score` set. Each score is rescaled from
		[PROMPT_SCORE_MIN, PROMPT_SCORE_MAX] to [0, 1], or None when missing or out of range.

	Raises:
		RuntimeError: If the output contains no <best> tag, or more than one.
	"""
	winners = RE_BEST_TAG.findall(evaluation)
	if len(winners) == 0:
		raise RuntimeError(f"No <best> tag found in response: {evaluation}")
	if len(winners) > 1:
		raise RuntimeError(f"Multiple <best> tags found in response: {evaluation}")

	# The pattern only captures a/b (case-insensitively), so lowering yields "a" or "b".
	best = winners[0].lower()

	return example.model_copy(update={
		"judge_winner": best,
		"judge_response_a_score": _parse_judge_score(evaluation, "a"),
		"judge_response_b_score": _parse_judge_score(evaluation, "b"),
	})


def parse_judge_response_multi(evaluation: str, example: EvalSampleMulti) -> EvalSampleMulti:
	"""
	Extract the full ranking from a judge's raw output.

	The ranking is read from a single-line `<ranking>...</ranking>` tag holding a
	comma-separated list of letters, either bare ("c, a, b") or prefixed
	("Response C, Response A, Response B"). Letter 'a' is response index 0.

	Args:
		evaluation: Full text returned by the judge model.
		example: The sample being judged; `example.responses` must be set.

	Returns:
		A copy of `example` with `judge_rankings` set to the response indices in ranked order.

	Raises:
		AssertionError: If `example.responses` is None or no <ranking> tag is found.
		RuntimeError: If an entry is malformed, out of range, duplicated, or the ranking does
			not list every response exactly once.
	"""
	assert example.responses is not None, f"Missing responses for {example.filehash}"

	m = re.search(r"<ranking>\s*(.*)\s*</ranking", evaluation, re.IGNORECASE)
	assert m is not None, f"No <ranking> tag found in response: {evaluation}"
	ranking_str = m.group(1).strip().lower()

	# If the word "response" appears anywhere, every entry is required to carry the prefix.
	has_prefix = 'response' in ranking_str
	rankings: list[int] = []
	for entry in ranking_str.split(","):
		letter = entry.strip()
		if has_prefix:
			if not letter.startswith("response"):
				raise RuntimeError(f"Invalid response letter in ranking: {ranking_str}")
			letter = letter[len("response"):].strip()
		if len(letter) != 1:
			raise RuntimeError(f"Invalid response letter in ranking: {ranking_str}")

		idx = ord(letter) - ord("a")
		if idx < 0 or idx >= len(example.responses):
			raise RuntimeError(f"Invalid response letter in ranking: {ranking_str}")
		# Compare indices, not letters: `rankings` holds ints, so testing the letter string
		# against it could never detect a repeated entry.
		if idx in rankings:
			raise RuntimeError(f"Duplicate response letter in ranking: {ranking_str}")
		rankings.append(idx)

	if len(rankings) != len(example.responses):
		raise RuntimeError(f"Invalid number of responses in ranking: {ranking_str}")

	return example.model_copy(update={"judge_rankings": rankings})


# Safety settings attached to requests for "google/gemini*" models (see run_openai_model):
# every listed harm category gets the "BLOCK_NONE" threshold.
SAFETY_SETTINGS_OFF = [
	{"category": harm_category, "threshold": "BLOCK_NONE"}
	for harm_category in (
		"HARM_CATEGORY_HARASSMENT",
		"HARM_CATEGORY_HATE_SPEECH",
		"HARM_CATEGORY_SEXUALLY_EXPLICIT",
		"HARM_CATEGORY_DANGEROUS_CONTENT",
	)
]


def openrouter_completion(**kwargs) -> str:
	"""
	POST a chat-completion request to OpenRouter and return the message content.

	Args:
		**kwargs: The request body, serialized to JSON as-is (model, messages, ...).

	Returns:
		The `content` of the single returned choice's message.

	Raises:
		requests.HTTPError: If the HTTP status indicates failure.
		RuntimeError: If the response does not contain exactly one non-null choice with a
			non-null message and content.
	"""
	auth_headers = {
		"Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
	}
	payload = json.dumps(kwargs)

	http_response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=auth_headers, data=payload)
	http_response.raise_for_status()
	result = http_response.json()

	# Drill down one level at a time so the error shows the innermost object that was reached.
	if 'choices' not in result or len(result['choices']) != 1 or result['choices'][0] is None:
		raise RuntimeError(f"Did not get response from OpenRouter: {result}")
	choice = result['choices'][0]

	if 'message' not in choice or choice['message'] is None:
		raise RuntimeError(f"Did not get message from OpenRouter: {choice}")
	message = choice['message']

	if 'content' not in message or message['content'] is None:
		raise RuntimeError(f"Did not get content from OpenRouter: {message}")

	return message['content']


T = TypeVar('T', bound=Union[EvalSample, EvalSampleMulti])
def run_openai_model(example: T, model: str, reasoning_effort: str | None, temperature: float | None, use_openrouter: bool, flex: bool = False, gemini_caching: bool = False) -> T:
	"""
	Send one prepared example to a judge model and return the example with the verdict attached.

	Args:
		example: A prepared sample (`judge_system`, `judge_user` and `image_dataurl` must be set).
		model: Judge model identifier.
		reasoning_effort: Forwarded as `reasoning_effort` when not None.
		temperature: Forwarded as `temperature` when not None.
		use_openrouter: Route through openrouter_completion() instead of the OpenAI client.
		flex: When true, request `service_tier="flex"`.
		gemini_caching: On the OpenRouter path, send the system prompt as a text part tagged
			with an ephemeral `cache_control` marker.

	Returns:
		A copy of `example` with the parsed verdict, `judge_model` and `judge_reasoning` set.

	Raises:
		RuntimeError: If the model returns no usable content or the verdict cannot be parsed.
	"""
	assert example.judge_system is not None and example.judge_user is not None and example.image_dataurl is not None, "Missing judge system, user message, or image data URL"

	# Optional request parameters, only included when requested.
	extra_args = {}
	if reasoning_effort is not None:
		extra_args["reasoning_effort"] = reasoning_effort
	if temperature is not None:
		extra_args["temperature"] = temperature
	if model.startswith("google/gemini"):
		extra_args["safety_settings"] = SAFETY_SETTINGS_OFF
	if flex:
		extra_args["service_tier"] = "flex"

	# The system text and user turn (prompt text + image) are the same on both transport paths.
	judge_system = example.judge_system.strip()
	user_turn = {
		"role": "user",
		"content": [
			{"type": "text", "text": example.judge_user.strip()},
			{"type": "image_url", "image_url": {"url": example.image_dataurl}},
		],
	}

	if use_openrouter:
		if gemini_caching:
			system_content = [{"type": "text", "text": judge_system, "cache_control": {"type": "ephemeral"}}]
		else:
			system_content = judge_system

		evaluation = openrouter_completion(
			model=model,
			messages=[{"role": "system", "content": system_content}, user_turn],
			**extra_args,
		)
	else:
		client = openai.Client()
		response = client.chat.completions.create(
			model=model,
			messages=[{"role": "system", "content": judge_system}, user_turn],
			**extra_args,
		)

		if len(response.choices) != 1 or response.choices[0].message.content is None:
			raise RuntimeError(f"Did not get response from Judge AI: {response}")

		# Only this path strips the returned text.
		evaluation = response.choices[0].message.content.strip()

	# Rankings for multi-response samples, a/b winner otherwise.
	if isinstance(example, EvalSampleMulti):
		judged = parse_judge_response_multi(evaluation, example)
	else:
		judged = parse_judge_response(evaluation, example)

	return judged.model_copy(update={"judge_model": model, "judge_reasoning": evaluation})


T = TypeVar('T', bound=Union[EvalSample, EvalSampleMulti])
def get_run_model_request(example: T, model: str, reasoning_effort: str | None, temperature: float | None) -> tuple[dict, T]:
	"""
	Build one batch-file task for a prepared example.

	Args:
		example: A prepared, persisted sample (`judge_system`, `judge_user`, `image_dataurl`
			and `id` must be set).
		model: Judge model identifier placed in the request body.
		reasoning_effort: Added to the body as `reasoning_effort` when not None.
		temperature: Added to the body as `temperature` when not None.

	Returns:
		`(task, updated_example)`: `task` is a dict with `custom_id`, `method`, `url` and
		`body` for a POST to /v1/chat/completions; `updated_example` is a copy of `example`
		with `judge_model` set and `in_progress` set to the current Unix time in whole seconds.
	"""
	assert example.judge_system is not None and example.judge_user is not None and example.image_dataurl is not None, "Missing judge system, user message, or image data URL"
	assert example.id is not None, "Missing example ID"

	# NOTE(review): both branches produce the same string in this copy of the file; confirm
	# whether the two sample kinds are meant to use different custom_id prefixes.
	custom_id = f"XXXXX-{example.id}" if isinstance(example, EvalSample) else f"XXXXX-{example.id}"

	system_turn = {"role": "system", "content": example.judge_system.strip()}
	user_turn = {
		"role": "user",
		"content": [
			{"type": "text", "text": example.judge_user.strip()},
			{"type": "image_url", "image_url": {"url": example.image_dataurl}},
		],
	}

	body = {"model": model, "messages": [system_turn, user_turn]}
	# Optional parameters are only sent when explicitly requested.
	if reasoning_effort is not None:
		body["reasoning_effort"] = reasoning_effort
	if temperature is not None:
		body["temperature"] = temperature

	task = {
		"custom_id": custom_id,
		"method": "POST",
		"url": "/v1/chat/completions",
		"body": body,
	}

	marked = example.model_copy(update={"judge_model": model, "in_progress": int(time.time())})
	return task, marked

## Start Here
Run all cells above to initialize the notebook

In [None]:
# Add ranked examples for the new model
# Collect every distinct (image, system prompt, question, question type) combination that
# appears in the pairwise-preference table, then insert one EvalSampleMulti per combination.
unique_examples = set()
# Never populated while the filter below stays commented out, so the set difference
# further down is currently a no-op and every unique combination is inserted.
existing_examples = set()

for example in tqdm(list_db_preferences()):
	unique_examples.add((example.filehash, example.system, example.question, example.question_type))
	#if example.response_a_model is None or (example.response_a_model == CURRENT_RESPONSE_MODEL and example.response_b_model == CURRENT_RESPONSE_MODEL):
	#	existing_examples.add((example.filehash, example.system, example.question, example.question_type))

with db_conn() as (conn, cur):
	n_added = 0
	for filehash, system, question, question_type in tqdm(list(unique_examples - existing_examples)):

		new_example = EvalSampleMulti(
			filehash=filehash,
			system=system,
			question=question,
			question_type=question_type,
			ground_truth_knowledge=build_ground_truth_knowledge(filehash, question)
		)
		new_example.add_to_database(cur)
		n_added += 1
	
	# Single commit after all inserts: an exception part-way through leaves nothing committed.
	conn.commit()

print(f"Added {n_added} new examples to database.")

## Hammered Examples
These take different approaches to extracting head-to-head preferences from the ranked examples.

In [None]:
# Run the first hammered-example builder; the return value is discarded (assigned to `_`
# so the notebook does not echo it).
_ = build_hammered_examples()

In [None]:
# Run the second hammered-example builder; the return value is discarded.
_ = build_hammered_examples2()

In [None]:
# Run the third hammered-example builder; the return value is discarded.
_ = build_hammered_examples3()

In [None]:
# NOTE(review): 1000 is presumably the number of ranked examples to build — confirm against
# build_ranked_examples' signature.
build_ranked_examples(1000)

In [None]:
# NOTE(review): 28000 is presumably the number of pairwise examples to build — confirm against
# build_examples' signature.
build_examples(28000)

## Manual Run
This code lets me run through the process of evaluating a single example manually.

In [None]:
# Load both pairwise and ranked samples, then keep only those that are neither finished
# nor currently marked in progress.
examples = list_db_preferences() + list_db_rankings()
examples = [e for e in examples if not e.is_done]
examples = [e for e in examples if e.in_progress is None]
print(f"Found {len(examples)} examples to run")

In [None]:
# Pick one pending example at random and fill in everything the judge needs.
example = random.choice(examples)
example = prep_evaluation(example)
# The part of the data URL after the first comma is base64-decoded into the image bytes.
image = Image.open(io.BytesIO(base64.b64decode(example.image_dataurl.split(",")[1])))
# Resize for preview only so the longer side becomes 512 px; `example` itself is untouched.
scale = 512 / max(image.size)
image = image.resize((int(image.size[0] * scale), int(image.size[1] * scale)), Image.LANCZOS)
print(f"Filehash: {example.filehash.hex()}")
# `display` is not imported in this file — presumably the notebook-provided builtin.
display(image)
print(f"User: {example.judge_user}")

In [None]:
# Persist the prepared (not yet judged) example so the generated fields are saved.
print(example.id)
with db_conn() as (conn, cur):
	example.update_in_database(cur)
	conn.commit()

In [None]:
# Judge the current example with exactly one model; the commented lines are alternative
# judges kept for quick switching.
# NOTE(review): the two `winner, reasoning = ...` lines use a call shape that does not match
# run_openai_model's current signature — they look stale.
#evaluated_example = run_openai_model(example, model="gpt-4.1", reasoning_effort=None, temperature=0.5, use_openrouter=False)
#winner, reasoning = run_openrouter_model(system, user, image_dataurl, model="anthropic/claude-3.7-sonnet:beta", reasoning_effort=None)
#evaluated_example = run_openai_model(example, model="google/gemini-2.5-pro-preview-03-25", reasoning_effort=None, temperature=0.4, use_openrouter=True)
#winner, reasoning = run_openai_model(system, user, image_dataurl, temperature=None, model='o3', reasoning_effort='medium')
evaluated_example = run_openai_model(example, model="o4-mini", reasoning_effort="medium", temperature=None, use_openrouter=False, flex=False)
#evaluated_example = run_openai_model(example, model="o3", reasoning_effort="medium", temperature=None, use_openrouter=False, flex=True)
#evaluated_example = run_openai_model(example, model="qwen/qwen2.5-vl-32b-instruct:free", reasoning_effort=None, temperature=0.6, use_openrouter=True)

print(f"Reasoning: {evaluated_example.judge_reasoning}")
# Pairwise samples carry a winner and optional scores; ranked samples carry a ranking.
if isinstance(evaluated_example, EvalSample):
	print(f"Response A Score: {evaluated_example.judge_response_a_score}")
	print(f"Response B Score: {evaluated_example.judge_response_b_score}")
	print(f"Winner: {evaluated_example.judge_winner}")
else:
	print(f"Rankings: {evaluated_example.judge_rankings}")

In [None]:
# Save the preference
# Writes the judged example (verdict, judge model, reasoning) back to the database.
with db_conn() as (conn, cur):
	evaluated_example.update_in_database(cur)
	conn.commit()

In [None]:
### ⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️ ###
###### If an unacceptable image is found, run this to delete it from the database
# Destructive: removes EVERY row for the current `example`'s image hash from both the
# pairwise-preference and the ranking tables, not just the one example on screen.
with db_conn() as (conn, cur):
	cur.execute("DELETE FROM alignment_preferences_ai WHERE image_hash = %s", (example.filehash,))
	deleted = cur.rowcount
	cur.execute("DELETE FROM alignment_rankings_ai WHERE image_hash = %s", (example.filehash,))
	deleted += cur.rowcount
	# Both deletes are committed together.
	conn.commit()
	print(f"Deleted {deleted} examples from database.")
####### DO NOT RUN OTHERWISE

## Parallel Preferences

In [None]:
# Load every sample that is neither finished nor marked in progress.
examples = list_db_preferences() + list_db_rankings()
examples = [e for e in examples if not e.is_done]
examples = [e for e in examples if e.in_progress is None]
random.shuffle(examples)
print(f"Loaded {len(examples)} examples to process")

# Process at most 256 randomly chosen examples per run of this cell.
work = random.sample(examples, min(256, len(examples)))
#work = [prep_evaluation(example) for example in tqdm(work, desc="Preparing examples")]

def handle_job(example: EvalSample | EvalSampleMulti) -> EvalSample | EvalSampleMulti | None:
	"""
	Prepare one example and have it judged; intended to run on a worker thread.

	Preparation errors propagate to the caller (they surface from `future.result()`).
	Errors from the judge call are printed with a traceback and turned into a None result.
	"""
	prepared = prep_evaluation(example)

	try:
		# Exactly one judge is active; the commented lines are alternatives kept for quick switching.
		judged = run_openai_model(prepared, model="google/gemini-2.5-pro-preview-03-25", reasoning_effort=None, temperature=0.4, use_openrouter=True, gemini_caching=True)
		#judged = run_openai_model(prepared, model="o4-mini", reasoning_effort='medium', temperature=None, use_openrouter=False, flex=False)
		#judged = run_openai_model(prepared, model="o3", reasoning_effort='medium', temperature=None, use_openrouter=False, flex=False)
		#judged = run_openai_model(prepared, model="gpt-4.1", reasoning_effort=None, temperature=0.5, use_openrouter=False)
		assert isinstance(judged, (EvalSample, EvalSampleMulti)), f"Expected EvalSample or EvalSampleMulti, got {type(judged)}"
	except Exception as e:
		print(f"Error from run_openai_model: {e}")
		print(format_exc())
		return None

	return judged


# Fan the work out over 16 threads. Results are consumed as they complete, and all database
# writes happen here on the submitting thread using the single connection opened below.
with ThreadPoolExecutor(max_workers=16) as executor, tqdm(total=len(work)) as pbar, db_conn() as (conn, cur):
	futures = [executor.submit(handle_job, example) for example in work]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			# Raised by handle_job outside its own try block (i.e. during preparation).
			print(f"Error: {e}")
			continue

		# handle_job returns None when the judge call failed; nothing to save.
		if result is None:
			continue

		assert isinstance(result, EvalSample) or isinstance(result, EvalSampleMulti), f"Expected EvalSample or EvalSampleMulti, got {type(result)}"
		assert result.judge_model is not None, f"Missing judge model in result: {result.filehash.hex()}"
		assert result.judge_reasoning is not None, f"Missing judge reasoning in result: {result.filehash.hex()}"

		# Commit per result so finished work survives a later failure or interruption.
		result.update_in_database(cur)
		conn.commit()

## Batched Preferences

In [None]:
# Load every sample that is neither finished nor marked in progress; the commented lines
# restrict the run to only rankings or only pairwise preferences.
all_examples = list_db_preferences() + list_db_rankings()
#all_examples = list_db_rankings()
#all_examples = list_db_preferences()
all_examples = [e for e in all_examples if not e.is_done]
all_examples = [e for e in all_examples if e.in_progress is None]
print(f"Loaded {len(all_examples)} examples to process")

In [None]:
# Prep all examples in parallel
# Prepared copies are written to the database; `all_examples` itself is not replaced, so
# later cells still hold the unprepared objects loaded earlier.
def handle_prep_job(example: EvalSample | EvalSampleMulti) -> EvalSample | EvalSampleMulti:
	"""Worker-thread entry point: prepare one example and return the filled-in copy."""
	return prep_evaluation(example)


with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(all_examples)) as pbar, db_conn() as (conn, cur):
	futures = [executor.submit(handle_prep_job, example) for example in all_examples]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			# A failed preparation is reported and skipped; the remaining futures continue.
			print(f"Error: {e}")
			continue

		assert isinstance(result, EvalSample) or isinstance(result, EvalSampleMulti), f"Expected EvalSample or EvalSampleMulti, got {type(result)}"
 
		# Database writes happen on this thread only; commit per example.
		result.update_in_database(cur)
		conn.commit()

In [None]:
def send_batch(batch_data: bytes, max_retries: int | None = None, retry_delay: float = 5) -> str:
	"""Upload a JSONL payload to OpenAI and start a batch job for it.

	The upload and the job creation are retried on any error. Once the upload
	has succeeded it is reused by later retries, so a failing
	``batches.create`` call no longer uploads the same payload again.

	Args:
		batch_data: The batch input file contents (one JSON request per line).
		max_retries: Maximum number of retries after the first attempt.
			``None`` (the default) retries forever, as before.
		retry_delay: Seconds to sleep between attempts.

	Returns:
		The id of the created batch job.

	Raises:
		Exception: The last API error, once ``max_retries`` is exhausted.
	"""
	# Constructing the client fails only on configuration problems (e.g. a
	# missing API key), which retrying cannot fix, so it sits outside the loop.
	client = openai.Client()
	batch_file = None
	failures = 0

	while True:
		try:
			if batch_file is None:
				batch_file = client.files.create(file=io.BytesIO(batch_data), purpose="batch")
			batch_job = client.batches.create(
				input_file_id=batch_file.id,
				endpoint="/v1/chat/completions",
				completion_window="24h",
			)
		except Exception as e:
			failures += 1
			print(f"Error sending batch: {e}")
			if max_retries is not None and failures > max_retries:
				raise
			time.sleep(retry_delay)
			continue

		print(f"Batch job created: {batch_job.id}")
		return batch_job.id


print(f"Number of examples: {len(all_examples)}")
random.shuffle(all_examples)
work = all_examples#[:1024]

# One pending JSONL buffer per judge model; each example is assigned to a
# randomly chosen model from this dict.
current_batch = {
	"o4-mini": b"",
	"gpt-4.1": b"",
	#"o3": b"",
}

# Keyword arguments handed to get_run_model_request for each supported judge
judge_request_settings = {
	'o4-mini': {'reasoning_effort': 'medium', 'temperature': None},
	'gpt-4.1': {'reasoning_effort': None, 'temperature': 0.5},
	'o3': {'reasoning_effort': 'medium', 'temperature': None},
}

# A buffer is flushed before it would grow past this many bytes
max_batch_bytes = 90 * 1000 * 1000


with db_conn() as (conn, cur):
	for example in tqdm(work):
		try:
			example = prep_evaluation(example)
		except Exception as e:
			print(f"Error preparing example: {e}")
			continue

		# Turn into a task
		model = random.choice(list(current_batch.keys()))
		settings = judge_request_settings.get(model)
		if settings is None:
			raise ValueError(f"Unknown model: {model}")
		task, example = get_run_model_request(example, model=model, **settings)

		# Persist the updated example before its request is queued
		example.update_in_database(cur)
		conn.commit()

		# Write to the batch
		line = json.dumps(task).encode('utf-8') + b"\n"
		pending = current_batch[model]
		if len(pending) + len(line) > max_batch_bytes:
			# Batch is full, send it
			send_batch(pending)
			pending = b""
		current_batch[model] = pending + line

# Send the vestiges
for batch_data in current_batch.values():
	if batch_data:
		send_batch(batch_data)

## Download Batches

In [None]:
# Download the output file of every finished batch that is not yet stored in
# alignment_batch_results. Only the first 400 listed batches are examined.
client = openai.Client()
with db_conn() as (conn, cur):
	cur.execute("SELECT batch_id FROM alignment_batch_results")
	completed_batch_ids = {row[0] for row in cur}

	for i, batch in tqdm(enumerate(client.batches.list())):
		if i >= 400:
			break

		if batch.status not in ("completed", "expired", "cancelled"):
			print(f"Batch {batch.id} not completed: {batch.status}")
			continue

		# Nothing to download, or already stored
		if batch.output_file_id is None or batch.id in completed_batch_ids:
			continue

		try:
			result = client.files.content(batch.output_file_id).content
		except Exception as e:
			print(f"Error fetching batch {batch.id}: {e}")
		else:
			cur.execute("INSERT INTO alignment_batch_results (batch_id, batch_data) VALUES (%s, %s) ON CONFLICT (batch_id) DO NOTHING", (batch.id, result))
			conn.commit()
			print(f"Fetched batch {batch.id} with {len(result)} bytes")

In [None]:
# Manually cancel one batch job by id, using the `client` created in the
# download cell. The id shown here is a placeholder.
client.batches.cancel("XXXXX")

## Process Batches

In [None]:
# Copy the judge output of every downloaded-but-unprocessed batch onto the
# matching preference/ranking row, then mark the batch as processed.
with db_conn() as (conn, cur):
	cur.execute("SELECT batch_id, batch_data FROM alignment_batch_results WHERE processed = FALSE")
	unprocessed = cur.fetchall()

	for batch_id, batch_data in tqdm(unprocessed):
		# Split the raw bytes rather than the decoded text: str.splitlines()
		# also breaks on Unicode separators such as U+2028, which may appear
		# unescaped inside a JSON string and would cut a record in half.
		entries = [json.loads(raw_line) for raw_line in batch_data.splitlines() if raw_line.strip()]

		for line in entries:
			# Tolerate a missing/null `response` or `body` instead of raising.
			response_obj = line.get('response') or {}
			body = response_obj.get('body') or {}
			choices = body.get('choices') or []
			if len(choices) != 1:
				print(f"Warning: invalid response in batch {batch_id}: {line}")
				continue

			content = (choices[0].get('message') or {}).get('content')
			if content is None:
				# No text to store for this request
				print(f"Warning: empty response content in batch {batch_id}: {line}")
				continue
			response = content.strip()

			# The custom_id prefix selects the record type.
			# NOTE(review): both prefixes read identically in this copy, which
			# would make the second branch unreachable - confirm they differ.
			custom_id = line['custom_id']
			if custom_id.startswith('XXXXX-'):
				sample_cls = EvalSample
				custom_id = int(custom_id[len('XXXXX-'):])
			elif custom_id.startswith('XXXXX-'):
				sample_cls = EvalSampleMulti
				custom_id = int(custom_id[len('XXXXX-'):])
			else:
				continue

			# Find the example in the database
			example = sample_cls.get_from_database(conn, custom_id)
			if example is None:
				print(f"Warning: no example found for batch {batch_id}, custom_id {custom_id}")
				continue

			# Add the response to the example
			example = example.model_copy(update={"judge_reasoning": response})
			example.update_in_database(cur)

		cur.execute("UPDATE alignment_batch_results SET processed = TRUE WHERE batch_id = %s", (batch_id,))
		# Commit once per batch: a batch is applied in full or not at all, and
		# an error in a later batch no longer discards the finished ones.
		conn.commit()
		print(f"Processed batch {batch_id} with {len(entries)} entries")


## Parse Judge Responses

In [None]:
# Parse stored judge reasoning into a winner (preferences) or rankings
# (rankings). Parse failures are collected and logged to a text file.
examples = list_db_preferences() + list_db_rankings()
n_count = 0
errors = []

with db_conn() as (conn, cur), open("judge-parsing-errors.txt", "w") as error_file:
	for example in tqdm(examples):
		if example.judge_reasoning is None:
			continue

		is_pair = isinstance(example, EvalSample)
		is_multi = isinstance(example, EvalSampleMulti)

		# Skip anything that already has a parsed verdict
		if (is_pair and example.judge_winner is not None) or (is_multi and example.judge_rankings is not None):
			continue

		try:
			if is_pair:
				example = parse_judge_response(example.judge_reasoning, example)
			elif is_multi:
				example = parse_judge_response_multi(example.judge_reasoning, example)
			else:
				raise RuntimeError(f"Unknown example type: {type(example)}")
		except Exception as e:
			errors.append((example.id, e))
			separator = "=" * 80
			error_file.write(f"{separator}\nError parsing example {example.id}: {e}\n{separator}\n\n")
			continue

		# No longer in progress
		example = example.model_copy(update={"in_progress": None})

		# Update the database
		example.update_in_database(cur)
		n_count += 1

	# Single commit for the whole pass
	conn.commit()

print(f"Processed {n_count} examples")
print(f"Errors: {len(errors)}")
print("Error messages written to judge-parsing-errors.txt")

## Clean Batches
Once all batches are no longer in progress on the OpenAI side, and we've downloaded and processed them all, some preferences/rankings might still be left in progress, due to errors. This clears their state so they can be tried again.

In [None]:
# Each entry is (SQL statement, report message); the message is formatted with
# the number of rows the statement touched.
reset_statements = [
	(
		"UPDATE alignment_preferences_ai SET in_progress = NULL WHERE in_progress IS NOT NULL",
		"Reset in_progress for {} preferences",
	),
	(
		"UPDATE alignment_rankings_ai SET in_progress = NULL WHERE in_progress IS NOT NULL",
		"Reset in_progress for {} rankings",
	),
	(
		"UPDATE alignment_preferences_ai SET judge_reasoning = NULL WHERE judge_reasoning IS NOT NULL AND judge_winner IS NULL",
		"Reset judge_reasoning for {} preferences without a winner",
	),
	(
		"UPDATE alignment_rankings_ai SET judge_reasoning = NULL WHERE judge_reasoning IS NOT NULL AND judge_rankings IS NULL",
		"Reset judge_reasoning for {} rankings without rankings",
	),
]

with db_conn() as (conn, cur):
	for statement, message in reset_statements:
		cur.execute(statement)
		print(message.format(cur.rowcount))
	conn.commit()

## Measure Progress

In [None]:
# Keep only judged preferences that have both scores and whose two responses
# were both produced by the current response model.
all_examples = [
	e for e in list_db_preferences()
	if e.judge_winner is not None
	and e.judge_response_a_score is not None
	and e.judge_response_b_score is not None
	and e.response_a_model == CURRENT_RESPONSE_MODEL
	and e.response_b_model == CURRENT_RESPONSE_MODEL
]
#all_examples = [e for e in all_examples if e.response_a_model == "8gjfxjdm" and e.response_b_model == "8gjfxjdm"]
print(f"{len(all_examples)} examples with scores")

In [None]:
# Classify every scored preference as valid or rejected and print per
# question-type statistics.
#   by_question_type: question type -> list of (a_score, b_score)
#   valids:           question type -> winner scores of valid examples
#   rejects:          question type -> rejection reason per rejected example
#   invalids:         question type -> the rejected examples themselves
#   valids_by_key:    (filehash, question) of every valid example
by_question_type = defaultdict(list)
valids = defaultdict(list)
rejects = defaultdict(list)
invalids = defaultdict(list)
valids_by_key = set()

for example in tqdm(all_examples):
	assert example.response_a is not None and example.response_b is not None, "Missing response_a or response_b"
	assert example.judge_response_a_score is not None and example.judge_response_b_score is not None, "Missing judge response scores"
	by_question_type[example.question_type].append((example.judge_response_a_score, example.judge_response_b_score))

	abnormal_response_a = is_repper(example.response_a)
	abnormal_response_b = is_repper(example.response_b)

	# First matching rule wins: abnormal winner, then close scores, then a
	# winner that itself scored below 0.5.
	reject_reason = None
	if example.judge_winner == "a" and abnormal_response_a:
		reject_reason = "abnormal_winner"
	elif example.judge_winner == "b" and abnormal_response_b:
		reject_reason = "abnormal_winner"
	elif abs(example.judge_response_a_score - example.judge_response_b_score) < 0.2:
		reject_reason = "close_scores"
	elif example.judge_winner == "a" and example.judge_response_a_score < 0.5:
		reject_reason = "winner_low_score"
	elif example.judge_winner == "b" and example.judge_response_b_score < 0.5:
		reject_reason = "winner_low_score"

	if reject_reason is not None:
		rejects[example.question_type].append(reject_reason)
		invalids[example.question_type].append(example)
		continue

	valids[example.question_type].append(example.judge_response_a_score if example.judge_winner == "a" else example.judge_response_b_score)
	valids_by_key.add((example.filehash, example.question))

total_valid = sum(len(v) for v in valids.values())

print("Question type statistics:")
for question_type, score_pairs in by_question_type.items():
	print(f"{question_type}")
	scores = [score for pair in score_pairs for score in pair]
	n_examples = len(score_pairs)
	valid_scores = valids[question_type]
	n_valid = len(valid_scores)
	# Guard the division: there may be no valid examples at all
	valid_share = n_valid / total_valid if total_valid else math.nan

	print(f"  {n_examples} examples ({valid_share:.2%} vs {question_types[question_type]:.2%} target)")
	print(f"  {n_valid} valid examples ({n_valid / n_examples:.2%})")
	print(f"  Mean score: {sum(scores) / len(scores):.4f}")
	print(f"  Min score: {min(scores):.4f}")
	print(f"  Max score: {max(scores):.4f}")
	if valid_scores:
		print(f"  Valid scores: {sum(valid_scores) / n_valid:.4f} ({min(valid_scores):.4f} - {max(valid_scores):.4f})")
	else:
		# min()/max() raise on an empty list, so report the gap explicitly
		print("  Valid scores: n/a (no valid examples)")

	for reject_reason, count in Counter(rejects[question_type]).items():
		print(f"  {reject_reason}: {count} ({count / n_examples:.2%})")

print(f"Total Valid: {total_valid}")

In [None]:
# ────────────────────────────────────────────────────────────────────────────────
# Pass 1 ─ gather raw data
# ────────────────────────────────────────────────────────────────────────────────
by_question_type   = defaultdict(list)   # (a_score, b_score)
valids             = defaultdict(list)   # winner score only
rejects            = defaultdict(list)   # list of textual reasons
invalid_examples   = defaultdict(list)   # the examples themselves (if needed)

for ex in tqdm(all_examples, desc="Scanning examples"):
    assert ex.response_a and ex.response_b,          "Missing response text"
    assert ex.judge_response_a_score is not None and ex.judge_response_b_score is not None, "Missing scores"

    by_question_type[ex.question_type].append(
        (ex.judge_response_a_score, ex.judge_response_b_score)
    )

    abnormal_a = is_repper(ex.response_a)
    abnormal_b = is_repper(ex.response_b)

    # ---- rejection checks (first matching rule wins) -------------------------
    reason = None
    if ex.judge_winner == "a" and abnormal_a:
        reason = "abnormal_winner"
    elif ex.judge_winner == "b" and abnormal_b:
        reason = "abnormal_winner"
    elif abs(ex.judge_response_a_score - ex.judge_response_b_score) < 0.20:
        reason = "close_scores"
    elif ex.judge_winner == "a" and ex.judge_response_a_score < 0.50:
        reason = "winner_low_score"
    elif ex.judge_winner == "b" and ex.judge_response_b_score < 0.50:
        reason = "winner_low_score"

    if reason:
        rejects[ex.question_type].append(reason)
        invalid_examples[ex.question_type].append(ex)
        continue

    # ---- valid example -------------------------------------------------------
    winner_score = (
        ex.judge_response_a_score if ex.judge_winner == "a" else ex.judge_response_b_score
    )
    valids[ex.question_type].append(winner_score)

# ────────────────────────────────────────────────────────────────────────────────
# Pass 2 ─ build tidy table
# ────────────────────────────────────────────────────────────────────────────────
rows = []
for qtype, pairs in by_question_type.items():
    flat_scores = list(itertools.chain.from_iterable(pairs))
    n_examples  = len(flat_scores) // 2
    n_valid     = len(valids[qtype])

    data = dict(
        question_type = qtype,
        n_examples    = n_examples,
        n_valid       = n_valid,
        pct_valid     = n_valid / n_examples if n_examples else math.nan,
        mean_all      = sum(flat_scores) / len(flat_scores),
        min_all       = min(flat_scores),
        max_all       = max(flat_scores),
        mean_valid    = (sum(valids[qtype]) / n_valid) if n_valid else math.nan,
        min_valid     = min(valids[qtype]) if n_valid else math.nan,
        max_valid     = max(valids[qtype]) if n_valid else math.nan,
    )

    # add reject-reason counts as extra columns (helps later if you want a bar chart)
    for reason, cnt in Counter(rejects[qtype]).items():
        data[f"rej_{reason}"] = cnt
    rows.append(data)

summary_df = (
    pd.DataFrame(rows)
      .set_index("question_type")
      .sort_values("question_type")
)

# Zero-fill only the rejection counters, where a missing value means "none".
# The *_valid statistics keep their NaN so that "no valid examples" is not
# displayed as a score of 0.
rej_cols = [c for c in summary_df.columns if c.startswith("rej_")]
if rej_cols:
    summary_df[rej_cols] = summary_df[rej_cols].fillna(0).astype(int)

# nicer formatting for notebook display
pd.options.display.float_format = "{:,.4f}".format
display(summary_df[
    ["n_examples", "n_valid", "pct_valid", "mean_all",
     "min_all", "max_all", "mean_valid", "min_valid", "max_valid"]
])

print(f"\nTOTAL VALID: {summary_df.n_valid.sum():,}")

# ────────────────────────────────────────────────────────────────────────────────
# Pass 3 ─ visualisations
# ────────────────────────────────────────────────────────────────────────────────
num_types   = len(by_question_type)
ncols       = 3
nrows       = math.ceil(num_types / ncols)
fig, axes   = plt.subplots(nrows, ncols, figsize=(ncols * 5, nrows * 4), sharex=True)

all_axes = axes.flatten()
for ax, (qtype, pairs) in zip(all_axes, by_question_type.items()):
    flat_scores  = list(itertools.chain.from_iterable(pairs))
    valid_scores = valids[qtype]

    # full distribution
    ax.hist(flat_scores, bins=20, alpha=0.4, label="all")
    # valid only
    if valid_scores:
        ax.hist(valid_scores, bins=20, alpha=0.7, label="valid")
        ax.axvline(summary_df.loc[qtype, "mean_valid"], color="red", ls="--", lw=1)

    ax.set_title(qtype, fontsize=10)
    ax.set_xlim(0, 1)
    ax.set_ylim(0)            # autorescale high end
    ax.set_xlabel("score")
    ax.set_ylabel("count")
    ax.legend(fontsize=8)     # the histogram labels are only shown via a legend

# tidy up empty subplots
for k in range(len(by_question_type), len(all_axes)):
    fig.delaxes(all_axes[k])

fig.suptitle("Score distributions by question type", y=1.02, fontsize=14)
fig.tight_layout()
plt.show()

# ────────────────────────────────────────────────────────────────────────────────
# OPTIONAL: stacked-bar of reject reasons
# (kept as comments: a bare string literal at the end of a cell would be
# echoed as the cell's output)
# ────────────────────────────────────────────────────────────────────────────────
# rej_cols = [c for c in summary_df.columns if c.startswith("rej_")]
# if rej_cols:
#     summary_df[rej_cols].plot(kind="bar", stacked=True, figsize=(12, 4))
#     plt.title("Rejection reasons by question type")
#     plt.ylabel("count")
#     plt.tight_layout()
#     plt.show()


In [None]:
# Choose at most one rejected "questionAnswer" example per image to re-run as a
# multi-response ranking. (filehash, question) pairs that already have a valid
# preference are skipped.
to_renew = {}

renew_candidates = invalids["questionAnswer"]
for example in tqdm(random.sample(renew_candidates, len(renew_candidates))):
	already_queued = example.filehash in to_renew
	already_valid = (example.filehash, example.question) in valids_by_key
	if already_queued or already_valid:
		continue

	to_renew[example.filehash] = EvalSampleMulti(
		filehash=example.filehash,
		system=example.system,
		question=example.question,
		question_type=example.question_type,
		ground_truth_knowledge=build_ground_truth_knowledge(example.filehash, example.question),
	)

print(f"{len(to_renew)} examples to renew")

In [None]:
def pool_build_responses(example: EvalSampleMulti) -> EvalSampleMulti | None:
	"""Generate a shuffled pool of ten distinct responses for one example.

	Responses are generated up to 100 times and split by ``is_repper`` into two
	sets. Generation stops early once there are at least ten distinct responses
	with at least three in each set. The final pool takes up to three
	"reppers", fills up with randomly chosen non-reppers, and tops up with
	further reppers if still short.

	Returns:
		A copy of the example with ``responses`` and ``responses_model`` set,
		or ``None`` if fewer than ten distinct responses were produced.
	"""
	pool_size = 10
	min_per_kind = 3
	max_generations = 100

	example = example.get_image()

	reppers = set()
	non_reppers = set()

	for _ in range(max_generations):
		have_enough = len(reppers) + len(non_reppers) >= pool_size
		if have_enough and len(reppers) >= min_per_kind and len(non_reppers) >= min_per_kind:
			break

		response = generate_response(example)
		bucket = reppers if is_repper(response) else non_reppers
		bucket.add(response)

	if len(reppers) + len(non_reppers) < pool_size:
		print(f"Warning: not enough responses for {example.filehash.hex()}")
		return None

	repper_list = list(reppers)
	non_repper_list = list(non_reppers)

	# Start from up to three reppers, then fill with non-reppers
	responses = repper_list[:min_per_kind]
	if non_repper_list:
		n_take = min(len(non_repper_list), pool_size - len(responses))
		responses = responses + random.sample(non_repper_list, n_take)

	# Still short: top up from the reppers not used yet
	if len(responses) < pool_size:
		responses = responses + random.sample(repper_list[min_per_kind:], pool_size - len(responses))

	assert len(responses) == pool_size
	random.shuffle(responses)

	return example.model_copy(update={"responses": responses, "responses_model": CURRENT_RESPONSE_MODEL})


work = list(to_renew.values()) #[:256]

# Build the response pools concurrently; every completed example is inserted
# and committed from the main thread.
with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(work)) as pbar, db_conn() as (conn, cur):
	futures = [executor.submit(pool_build_responses, example) for example in work]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			print(f"Error: {e}")
		else:
			# None means too few distinct responses were generated
			if result is not None:
				assert isinstance(result, EvalSampleMulti), f"Expected EvalSampleMulti, got {type(result)}"
				assert result.responses is not None, f"Missing responses in result: {result.filehash.hex()}"
				assert result.responses_model is not None, f"Missing responses model in result: {result.filehash.hex()}"

				result.add_to_database(cur)
				conn.commit()

In [None]:
# Retry invalid preferences
by_pref_key = defaultdict(list)
by_pref_key_is_valid = set()
existing_pairs = set()
existing_responses = defaultdict(set)

for example in tqdm(all_examples):
	assert example.response_a is not None and example.response_b is not None, "Missing response_a or response_b"
	assert example.judge_response_a_score is not None and example.judge_response_b_score is not None, "Missing judge response scores"

	existing_pairs.add((example.filehash, example.system, example.question, example.response_a, example.response_b))

	pref_key = (example.filehash, example.system, example.question, example.question_type)

	by_pref_key[pref_key].append((example.response_a, example.judge_response_a_score))
	by_pref_key[pref_key].append((example.response_b, example.judge_response_b_score))
	existing_responses[pref_key].add(example.response_a)
	existing_responses[pref_key].add(example.response_b)

	abnormal_response_a = is_repper(example.response_a)
	abnormal_response_b = is_repper(example.response_b)

	if example.judge_winner == "a" and abnormal_response_a:
		continue
	elif example.judge_winner == "b" and abnormal_response_b:
		continue

	if abs(example.judge_response_a_score - example.judge_response_b_score) < 0.2:
		continue

	if example.judge_winner == "a" and example.judge_response_a_score < 0.5:
		continue
	elif example.judge_winner == "b" and example.judge_response_b_score < 0.5:
		continue

	by_pref_key_is_valid.add(pref_key)


needs_reroll = []
new_examples = []

for pref_key, responses in tqdm(by_pref_key.items()):
	if pref_key in by_pref_key_is_valid:
		continue

	sorted_responses = sorted(responses, key=lambda x: x[1], reverse=True)
	#assert sorted_responses[0][1] > sorted_responses[-1][1]

	best_responses = [(r, s) for r, s in sorted_responses if not is_repper(r) and s > 0.5]
	if len(best_responses) == 0:
		needs_reroll.append(pref_key)
		continue

	best_response, best_score = best_responses[0]
	worst_response, worst_score = sorted_responses[-1]

	if abs(best_score - worst_score) < 0.2 or best_score < worst_score:
		needs_reroll.append(pref_key)
		continue

	# Looks like we have a valid pair to try
	if (pref_key[0], pref_key[1], pref_key[2], best_response, worst_response) in existing_pairs or (pref_key[0], pref_key[1], pref_key[2], worst_response, best_response) in existing_pairs:
		# Already tried it, so spin again
		needs_reroll.append(pref_key)
		continue

	response_pair = [best_response, worst_response]
	random.shuffle(response_pair)

	new_examples.append(EvalSample(
		filehash=pref_key[0],
		system=pref_key[1],
		question=pref_key[2],
		question_type=pref_key[3],
		task_type='overall',
		response_a=response_pair[0],
		response_b=response_pair[1],
		ground_truth_knowledge=build_ground_truth_knowledge(pref_key[0], pref_key[2]),
	))

print(f"Found {len(new_examples)} new examples to try")
print(f"Found {len(needs_reroll)} examples to reroll")
print(f"Found {len(by_pref_key_is_valid)} valid examples")
print(f"Out of {len(by_pref_key)} keys")

In [None]:
# Per question type: number of preference keys, and how many of them already
# have a valid preference.
cnts = defaultdict(int)
cnts_v = defaultdict(int)

for pref_key in by_pref_key:
	question_type = pref_key[3]
	cnts[question_type] += 1
	if pref_key in by_pref_key_is_valid:
		cnts_v[question_type] += 1

for question_type, count in cnts.items():
	n_valid_keys = cnts_v[question_type]
	print(f"{question_type}: {count} total, {n_valid_keys} valid ({n_valid_keys / count:.2%})")

In [None]:
# Insert the pairs built in the retry cell. Each row is committed on its own,
# so rows inserted before a failure are kept.
with db_conn() as (conn, cur):
	for example in new_examples:
		example.add_to_database(cur)
		conn.commit()

In [None]:
def handle_new_example_responses(example: EvalSample, max_attempts: int = 100) -> EvalSample | None:
	"""Generate two fresh responses for an example that needs a re-roll.

	A response is rejected when it was already judged for the same
	(filehash, system, question, question_type) key, or when it equals the
	first response kept in this call, since an identical pair cannot be
	ranked.

	Args:
		example: The example to generate responses for.
		max_attempts: Upper bound on generation calls. This stops a model that
			keeps repeating itself from spinning forever.

	Returns:
		A copy of the example with both responses and their model set, or
		``None`` if two new distinct responses were not obtained in time.
	"""
	example = example.get_image()

	pref_key = (example.filehash, example.system, example.question, example.question_type)
	# .get() avoids inserting into the shared defaultdict from a worker thread
	already_tried = existing_responses.get(pref_key, set())

	responses = []

	for _ in range(max_attempts):
		response = generate_response(example)
		if response in already_tried:
			print("Already tried response, spinning again")
			continue
		if response in responses:
			# Same text as the first new response; try again
			continue

		responses.append(response)
		if len(responses) == 2:
			break
	else:
		print(f"Warning: could not generate two new responses for {example.filehash.hex()}")
		return None

	return example.model_copy(update={"response_a": responses[0], "response_b": responses[1], "response_a_model": CURRENT_RESPONSE_MODEL, "response_b_model": CURRENT_RESPONSE_MODEL})

# One blank preference per key that needs a re-roll; the responses are filled
# in by handle_new_example_responses on the pool below.
work = [
	EvalSample(
		filehash=pref_key[0],
		system=pref_key[1],
		question=pref_key[2],
		question_type=pref_key[3],
		task_type='overall',
		ground_truth_knowledge=build_ground_truth_knowledge(pref_key[0], pref_key[2]),
	)
	for pref_key in tqdm(needs_reroll)
]

with ThreadPoolExecutor(max_workers=8) as executor, tqdm(total=len(work)) as pbar, db_conn() as (conn, cur):
	futures = [executor.submit(handle_new_example_responses, example) for example in work]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			print(f"Error: {e}")
		else:
			if result is not None:
				assert isinstance(result, EvalSample), f"Expected EvalSample, got {type(result)}"
				assert result.response_a is not None and result.response_b is not None, f"Missing responses in result: {result.filehash.hex()}"
				assert result.response_a_model is not None and result.response_b_model is not None, f"Missing response models in result: {result.filehash.hex()}"

				result.add_to_database(cur)
				conn.commit()

## Evaluate DPO Progress

In [None]:
# Hold-out set: 1000 random API images that no stored preference or ranking uses.
remaining_api_images = set(API_IMAGES_TO_USE)
remaining_api_images.difference_update(
	example.filehash for example in tqdm(list_db_preferences() + list_db_rankings())
)
eval_filehashes = random.sample(list(remaining_api_images), 1000)

In [None]:
# Fetch each evaluation image and encode it as a base64 data URL, with the MIME
# type sniffed from the image bytes. Order matches eval_filehashes.
eval_dataurls = []

for filehash in tqdm(eval_filehashes):
	image_data = api.read_image(filehash)
	image_mime = magic.from_buffer(image_data, mime=True)
	encoded_image = base64.b64encode(image_data).decode('utf-8')
	eval_dataurls.append(f"data:{image_mime};base64,{encoded_image}")

In [None]:
def eval_generate_response(idx: int, system: str, question: str, image_dataurl: str, temperature: float) -> tuple[int, str]:
	"""Ask the locally served model one question about one image.

	Args:
		idx: Caller-side index, returned unchanged so results can be matched
			up after out-of-order completion.
		system: System prompt.
		question: User question text.
		image_dataurl: The image as a ``data:`` URL.
		temperature: Sampling temperature.

	Returns:
		``(idx, response_text)`` with the response text stripped.
	"""
	user_content = [
		{"type": "text", "text": question},
		{"type": "image_url", "image_url": {"url": image_dataurl}},
	]
	messages = [
		{"role": "system", "content": system},
		{"role": "user", "content": user_content},
	]

	#client = openai.Client(base_url="http://localhost:5053/v1", api_key="token-abc123")
	client = openai.Client(base_url="http://localhost:5052/v1", api_key="token-abc123")
	completion = client.chat.completions.create(
		#model="mb3500zp",
		model="5i5xmxdx",
		messages=messages,
		temperature=temperature,
		top_p=0.9,
		max_tokens=512,
	)

	choices = completion.choices
	assert len(choices) == 1, f"Expected 1 responses, got {len(choices)}"
	content = choices[0].message.content
	assert content is not None, "Response content is None"

	return idx, content.strip()


system = "You are JoyCaption, a helpful AI assistant with vision capabilities."
#question = "Respond as a real, human stable-diffusion user would."
question = "Output a description which could generate this image using Stable Diffusion. Write like the average human."


# One job per evaluation image: (index, system, question, data URL, temperature)
work = [(i, system, question, image_dataurl, 0.6) for i, image_dataurl in enumerate(eval_dataurls)]
responses = []


with ThreadPoolExecutor(max_workers=4) as executor, tqdm(total=len(work)) as pbar:
	futures = [executor.submit(eval_generate_response, *args) for args in work]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			print(f"Error: {e}")
		else:
			# The returned index maps the response back to its image hash
			idx, response = result
			response = response.strip()
			responses.append({
				"filehash": eval_filehashes[idx],
				"response": response,
			})

In [None]:
# Collect the generated responses that is_repper flags and dump them to a text
# file, each followed by a separator line.
failures = [entry['response'] for entry in responses if is_repper(entry['response'])]
print(f"Failed responses: {len(failures)}")
with open("repper_responses.txt", "w") as f:
	separator = "=" * 80 + "\n"
	for response in failures:
		f.write(response + "\n")
		f.write(separator)

In [None]:
# NOTE(review): this cell repeats the previous cell line for line.
failures = [entry['response'] for entry in responses if is_repper(entry['response'])]
print(f"Failed responses: {len(failures)}")
with open("repper_responses.txt", "w") as f:
	for response in failures:
		f.write(f"{response}\n")
		f.write("=" * 80 + "\n")

In [None]:
# Count responses in which more than 10 comma-separated pieces repeat an
# earlier piece (compared lower-cased, without trimming whitespace).
# NOTE(review): dpo_model_responses is not defined in the surrounding cells -
# confirm where it comes from.
n_failures = sum(
	1
	for response in dpo_model_responses
	if len(pieces := response["response"].lower().split(",")) - len(set(pieces)) > 10
)

print(f"Number of failures: {n_failures}")

In [None]:
# Save the evaluation image hashes as hex, one per line.
with open('tmp-eval-hashes.txt', 'w') as f:
	f.writelines(f"{filehash.hex()}\n" for filehash in eval_filehashes)

## Evaluate Accuracy of Various Judges

In [None]:
# Load the existing preferences and pick a reproducible 512-item subset.
# A fresh connection is opened with db_conn(): the global `conn` left behind by
# an earlier `with db_conn()` block has already been closed by that context
# manager.
with db_conn() as (conn, cur):
	cur.execute("SELECT image_hash, messages, response_a, response_b, winner, task_type FROM alignment_preferences_ai")
	existing_results = []
	for filehash, messages, response_a, response_b, winner, task_type in cur:
		assert winner in {"a", "b"}, f"Invalid winner: {winner}"
		messages = json.loads(messages)
		# Each row is expected to hold exactly one system and one user message
		assert len(messages) == 2 and messages[0]["role"] == 'system' and messages[1]["role"] == 'user'
		system = messages[0]["content"]
		question = messages[1]["content"]
		existing_results.append({
			"filehash": bytes(filehash),
			"system": system,
			"question": question,
			"response_a": response_a,
			"response_b": response_b,
			"winner": winner,
			"task_type": task_type,
		})

print(set(r['task_type'] for r in existing_results))

# Sort into a stable order first so the seeded shuffle gives the same subset
# whatever order the database returned the rows in.
existing_results.sort(key=lambda x: (x["filehash"].hex(), x["response_a"], x["response_b"], x["winner"]))
random.seed(69)
random.shuffle(existing_results)
existing_subset = existing_results[:512]

In [None]:
# Prep evaluations
for result in tqdm(existing_subset):
	# Attach the image as a data URL (MIME type sniffed from the bytes)
	image_data = api.read_image(result["filehash"])
	image_mime = magic.from_buffer(image_data, mime=True)
	encoded_image = base64.b64encode(image_data).decode('utf-8')
	result["image_dataurl"] = f"data:{image_mime};base64,{encoded_image}"

	# Shared preamble, then the task-specific message, then the example's own system prompt
	system_message = "\n\n".join([
		ALPHA_SYSTEM_MESSAGE.strip(),
		PRIMARY_SYSTEM_MESSAGE[result['task_type']].strip(),
		result['system'].strip(),
	])

	ground_truth_tags = None
	source = None

	if result['task_type'] != 'all_tags':
		user_message = USER_MESSAGE
	else:
		# Later entries override earlier ones when several names appear in the question
		question_lower = result['question'].lower()
		for candidate in ('danbooru', 'e621', 'rule34'):
			if candidate in question_lower:
				source = candidate

		assert source is not None, f"Unknown source in question: {result['question']}"
		ground_truth_tags = image_to_tags(result['filehash'], source)

		user_message = USER_MESSAGE_ALL_TAGS

	# Swap response_a and response_b so we can see if the result is indeterminate
	response_a, response_b = result['response_b'], result['response_a']
	result['expected_winner'] = 'a' if result['winner'] == 'b' else 'b'

	user_message = user_message.format(
		system_prompt=system_message,
		user_query=result['question'].strip(),
		response_a=response_a,
		response_a_word_count=len(response_a.split()),
		response_b=response_b,
		response_b_word_count=len(response_b.split()),
		ground_truth_tags=ground_truth_tags,
		source=source,
	).strip()

	result['judge_system'] = PROMPT.strip()
	result['judge_user'] = user_message

In [None]:
def eval_pool_run_model(result, model):
	"""Run one judge model on a prepared result dict.

	On success the verdict is stored in ``result`` under
	``'<model>:judge_winner'`` and ``'<model>:judge_reasoning'``. For an unknown
	model, or if the judge call raises, the problem is printed and ``result``
	is returned without those keys.

	Returns:
		The same ``result`` dict that was passed in.
	"""
	# short name -> (backend, API model id, reasoning effort, temperature)
	judge_configs = {
		'gpt-4.1': ('openai', 'gpt-4.1', None, 0.5),
		'gemini-2.5-pro': ('openrouter', "google/gemini-2.5-pro-preview-03-25", None, 0.5),
		'o3': ('openai', 'o3', 'medium', None),
		'o4-mini': ('openai', 'o4-mini', 'medium', None),
		'qwen2.5-vl-32b': ('openrouter', "qwen/qwen2.5-vl-32b-instruct:free", None, 0.4),
	}

	config = judge_configs.get(model)
	if config is None:
		print(f"Unknown model: {model}")
		return result

	backend, api_model, reasoning_effort, temperature = config
	try:
		runner = run_openai_model if backend == 'openai' else run_openrouter_model
		winner, reasoning = runner(
			result['judge_system'],
			result['judge_user'],
			result['image_dataurl'],
			model=api_model,
			reasoning_effort=reasoning_effort,
			temperature=temperature,
		)
	except Exception as e:
		print(f"Error from run_openai_model: {e}")
		print(format_exc())
		return result

	result[f'{model}:judge_winner'] = winner
	result[f'{model}:judge_reasoning'] = reasoning

	return result


#model = 'gpt-4.1'
#model = 'gemini-2.5-pro'
#model = 'o4-mini'
model = 'o3'
#model = 'qwen2.5-vl-32b'

# Only judge entries that do not have a verdict from this model yet
winner_key = f'{model}:judge_winner'
work = [result for result in existing_subset if winner_key not in result]
#random.shuffle(work)
#work = work[:32]

with ThreadPoolExecutor(max_workers=12) as executor, tqdm(total=len(work)) as pbar:
	futures = [executor.submit(eval_pool_run_model, result, model) for result in work]
	for future in as_completed(futures):
		pbar.update(1)
		try:
			result = future.result()
		except Exception as e:
			print(f"Error: {e}")
			continue

		if isinstance(result, str):
			print(f"Error: {result}")
		elif winner_key not in result:
			# The worker returned the dict without recording a verdict
			print(f"Failure on: {result['filehash'].hex()}")


# for result in tqdm(work):
# 	image = Image.open(io.BytesIO(base64.b64decode(result['image_dataurl'].split(",")[1])))
# 	scale = 512 / max(image.size)
# 	image = image.resize((int(image.size[0] * scale), int(image.size[1] * scale)), Image.LANCZOS)
# 	display(image)

# 	result = eval_pool_run_model(result, model)

# 	print(result['judge_system'])
# 	print(result['judge_user'])
# 	print(result[f'{model}:judge_reasoning'])
# 	print("WINNER:", result[f'{model}:judge_winner'])
# 	print("EXPECTED WINNER:", result['expected_winner'])

# 	break

In [None]:
# Snapshot existing_subset to disk as JSON: the binary filehash is stored as hex
# and the (large, re-derivable) image data URL is dropped.
serializable_subset = []
for entry in existing_subset:
	record = {}
	for key, value in entry.items():
		record[key] = value.hex() if key == 'filehash' else value
	del record['image_dataurl']
	serializable_subset.append(record)

Path("tmp-eval-accuracy.json").write_text(json.dumps(serializable_subset, indent=2))

In [None]:
# Reload the snapshot written to tmp-eval-accuracy.json and rebuild the fields
# that were dropped or re-encoded for JSON: filehash goes back to bytes and the
# image data URL is regenerated from the image store.
deserialized_subset = json.loads(Path("tmp-eval-accuracy.json").read_text())
for result in deserialized_subset:
	result['filehash'] = bytes.fromhex(result['filehash'])
	image_data = api.read_image(result['filehash'])
	# Sniff the real MIME type instead of assuming JPEG, so PNG/WebP/etc. images get a correct data URL.
	image_mime = magic.from_buffer(image_data, mime=True)
	result['image_dataurl'] = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"
existing_subset = deserialized_subset

In [None]:
# Measure accuracy
# For every judge that left a '<judge>:judge_winner' key on an entry, count how
# often its verdict equals the entry's 'expected_winner'.
n_correct = defaultdict(int)
n_total = defaultdict(int)

for result in existing_subset:
	models = {key.split(':')[0] for key in result.keys() if key.endswith(':judge_winner')}
	expected_winner = result['expected_winner']

	for model in models:
		n_total[model] += 1
		if result[f'{model}:judge_winner'] == expected_winner:
			n_correct[model] += 1

print("Accuracy:")
for model in n_total:
	accuracy = n_correct[model] / n_total[model]
	print(f"{model}: {accuracy:.2%} ({n_correct[model]}/{n_total[model]})")

In [None]:
# Filter down to only entries that o3 agrees on and then measure accuracy
filtered_subset = [
	result
	for result in existing_subset
	if 'o3:judge_winner' in result and result['o3:judge_winner'] == result['expected_winner']
]
print(f"Filtered subset size: {len(filtered_subset)}")

# Same per-judge tally as the unfiltered measurement, restricted to filtered_subset.
n_correct = defaultdict(int)
n_total = defaultdict(int)

for result in filtered_subset:
	models = {key.split(':')[0] for key in result.keys() if key.endswith(':judge_winner')}
	expected_winner = result['expected_winner']

	for model in models:
		n_total[model] += 1
		if result[f'{model}:judge_winner'] == expected_winner:
			n_correct[model] += 1

print("Accuracy:")
for model in n_total:
	accuracy = n_correct[model] / n_total[model]
	print(f"{model}: {accuracy:.2%} ({n_correct[model]}/{n_total[model]})")

## Build Training Dataset

In [None]:
# Hub repository the finished train/test dataset is pushed to.
DATASET_PATH = "fancyfeast/joy-captioning-alignment-20250507a"
# Number of examples held out for the test split.
TEST_SIZE = 512
# A winning response must have at least this judge score to be used.
MIN_WINNER_SCORE = 0.5
# Pairs whose judge scores differ by less than this are discarded.
MIN_SCORE_DIFF = 0.2


# Column schema of the pushed dataset; one row is one winner/loser response pair for an image.
dataset_features = datasets.Features({
	'filehash': datasets.Value('binary'),
	'system': datasets.Value('string'),
	'question': datasets.Value('string'),
	'question_type': datasets.Value('string'),
	'response_w': datasets.Value('string'),
	'response_l': datasets.Value('string'),
	'response_w_score': datasets.Value('float32'),
	'response_l_score': datasets.Value('float32'),
})


@dataclasses.dataclass(frozen=True)
class DatasetEntry:
	"""One preference pair: a prompt plus the winning (`_w`) and losing (`_l`) response with their judge scores."""
	filehash: bytes
	system: str
	question: str
	question_type: str
	response_w: str
	response_l: str
	response_w_score: float
	response_l_score: float

	def to_json_dict(self) -> dict:
		"""Return the fields as a dict with `filehash` hex-encoded so the result is JSON serialisable."""
		return {**dataclasses.asdict(self), 'filehash': self.filehash.hex()}

	def to_dataset(self) -> dict:
		"""Return the fields as a plain dict (raw `filehash` bytes), in declaration order, matching the dataset columns."""
		return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}


# Candidate training pairs, grouped by image so that one pair per image can be picked later.
examples_by_filehash: dict[bytes, list[DatasetEntry]] = defaultdict(list)

# Find all valid examples in the database
for example in tqdm(list_db_preferences(), desc="Loading examples from database"):
	# Must be from the current response model
	if example.response_a_model != CURRENT_RESPONSE_MODEL or example.response_b_model != CURRENT_RESPONSE_MODEL:
		continue

	# Must have responses
	if example.response_a is None or example.response_b is None or example.judge_winner is None:
		continue

	# Must have scores
	if example.judge_response_a_score is None or example.judge_response_b_score is None:
		continue

	# Gap between scores must be significant
	if abs(example.judge_response_a_score - example.judge_response_b_score) < MIN_SCORE_DIFF:
		continue

	# Resolve winner/loser once so the checks and the stored entry cannot disagree.
	if example.judge_winner == 'a':
		response_w, response_l = example.response_a, example.response_b
		score_w, score_l = example.judge_response_a_score, example.judge_response_b_score
	elif example.judge_winner == 'b':
		response_w, response_l = example.response_b, example.response_a
		score_w, score_l = example.judge_response_b_score, example.judge_response_a_score
	else:
		# Any other verdict names no winner. Previously such rows skipped the winner checks
		# below and were recorded as a win for response B; they are not usable as a pair.
		continue

	# Winner must have a reasonable score and cannot be a repper
	if score_w < MIN_WINNER_SCORE or is_repper(response_w):
		continue

	examples_by_filehash[example.filehash].append(DatasetEntry(
		filehash=example.filehash,
		system=example.system,
		question=example.question,
		question_type=example.question_type,
		response_w=response_w,
		response_l=response_l,
		response_w_score=score_w,
		response_l_score=score_l,
	))

# Now filter down to the best example per filehash
all_examples: list[DatasetEntry] = []

for examples in tqdm(examples_by_filehash.values(), desc="Filtering examples"):
	# Order each group by winner score, highest first (in place)
	examples.sort(key=lambda ex: ex.response_w_score, reverse=True)

	# Only pairs that share the top winner score stay in the running
	top_score = examples[0].response_w_score
	candidates = [ex for ex in examples if ex.response_w_score >= top_score]

	# Pairs whose loser is a repper take priority whenever there are any
	repper_losers = [ex for ex in candidates if is_repper(ex.response_l)]
	if repper_losers:
		candidates = repper_losers

	# Of what is left, take the pair with the widest score margin (first one on ties)
	all_examples.append(max(candidates, key=lambda ex: abs(ex.response_w_score - ex.response_l_score)))

# Collect stats
# Per question type: winner scores, winner/loser score gaps, and how many losers are reppers.
winner_scores = defaultdict(list)
score_diffs = defaultdict(list)
loser_reppers = defaultdict(int)

for example in all_examples:
	question_type = example.question_type
	score_diffs[question_type].append(abs(example.response_w_score - example.response_l_score))
	winner_scores[question_type].append(example.response_w_score)
	if is_repper(example.response_l):
		loser_reppers[question_type] += 1

print(f"Total examples: {len(all_examples)}")
print("Stats by question type:")
for question_type, type_winner_scores in winner_scores.items():
	type_score_diffs = score_diffs[question_type]
	count = len(type_winner_scores)
	avg_winner_score = sum(type_winner_scores) / count
	max_winner_score = max(type_winner_scores)
	min_winner_score = min(type_winner_scores)
	avg_score_diff = sum(type_score_diffs) / len(type_score_diffs)
	max_score_diff = max(type_score_diffs)
	min_score_diff = min(type_score_diffs)
	n_loser_reppers = loser_reppers[question_type]
	print(f"{question_type}: count={count}, avg_winner_score={avg_winner_score:.4f}, max_winner_score={max_winner_score:.4f}, min_winner_score={min_winner_score:.4f}, avg_score_diff={avg_score_diff:.4f}, max_score_diff={max_score_diff:.4f}, min_score_diff={min_score_diff:.4f}, n_loser_reppers={n_loser_reppers} ({n_loser_reppers / count:.4%})")

# Build dataset
# One row per selected pair, typed with the dataset_features schema.
dataset = datasets.Dataset.from_list([example.to_dataset() for example in all_examples], features=dataset_features)

# Split
# Shuffled split with a fixed seed; TEST_SIZE rows go to the test split.
dataset = dataset.train_test_split(test_size=TEST_SIZE, shuffle=True, seed=69)

# Uploads both splits to DATASET_PATH as a private repository.
print("Pushing to hub")
dataset.push_to_hub(DATASET_PATH, private=True)

## Glitch Filter Development

In [None]:
import re
from collections import Counter

# ----------------------------------------------------------------------
#  Utility --------------------------------------------------------------
# ----------------------------------------------------------------------
def _longest_internal_repeat(s: str) -> int:
    """
    Crude check for “aaa… / ab_ab_ab…” patterns inside ONE very-long token.
    Returns length of the largest substring that appears ≥ 4 contiguous times.
    """
    max_len = len(s) // 4                       # need 4× to be interesting
    for size in range(1, max_len + 1):
        chunk = s[:size]
        if chunk * (len(s) // size) in s:
            return size * (len(s) // size)
    return 0


# ----------------------------------------------------------------------
#  Main detector --------------------------------------------------------
# ----------------------------------------------------------------------
def is_abnormally_repetitive(text: str) -> bool:
    """
    Conservative detector for useless repetition / tag-spam.

    Returns True only when repetition is very likely harmful.
    Several lightweight heuristics are combined; thresholds are tuned
    to avoid false-positives on ordinary prose or short enumerations.

    Args:
        text: The response text to inspect.

    Returns:
        True as soon as any heuristic fires.  False otherwise, and always
        False for texts with fewer than 30 word tokens.
    """

    tokens = [t.lower() for t in re.split(r"\W+", text) if t]
    n = len(tokens)
    if n < 30:                                     # tiny blocks → never flag
        return False

    # --------------------------------------------------- 1
    # ≥10 adjacent pairs of identical tokens (counted over the whole text)
    if sum(tokens[i] == tokens[i - 1] for i in range(1, n)) >= 10:
        return True

    # --------------------------------------------------- 2
    # Heavy duplicate ratio on “content” words (len ≥4)
    content = [t for t in tokens if len(t) >= 4]
    if len(content) >= 50:
        dup_count  = len(content) - len(set(content))
        dup_ratio  = dup_count / len(content)
        if dup_ratio > 0.55 and dup_count >= 100:      # both conditions
            return True

    # --------------------------------------------------- 3
    # Repeated 3-gram loops (“water ocean waves …”)
    tris = [' '.join(tokens[i:i + 3]) for i in range(n - 2)]
    if tris:
        _, freq = Counter(tris).most_common(1)[0]
        if freq >= 10 and freq / len(tris) > 0.20:
            return True

    # --------------------------------------------------- 4
    # Colon-style tag duplication  (e.g.   meta:icon_set … )
    # The word tokens above are split on every non-word character, so they can
    # never contain ':'.  Namespaced tags are therefore matched on the raw
    # text; filtering `tokens` for ':' always produced an empty list.
    colon_tags = [m.lower() for m in re.findall(r"\w+:\w+", text)]
    if len(colon_tags) >= 30:
        tag_dup = len(colon_tags) - len(set(colon_tags))
        if tag_dup >= 20 and tag_dup / len(colon_tags) > 0.30:
            return True

    # --------------------------------------------------- 5A
    #   Common-prefix flood  (tokens differ only by added suffix words)
    if n >= 50:
        for k in (15, 20, 25):                       # test several slice sizes
            counts = Counter(t[:k] for t in tokens)
            top_slice, freq = counts.most_common(1)[0]
            uniq_with_slice = {t for t in tokens if t.startswith(top_slice)}
            if freq / n > 0.18 and len(uniq_with_slice) >= 15:
                return True

    # --------------------------------------------------- 5B
    #   Internal repetition inside ONE very long token
    for tok in tokens:
        if len(tok) > 80 and _longest_internal_repeat(tok) / len(tok) > 0.65:
            return True

    # --------------------------------------------------- 5C
    #   Numeric-suffix families (“foo_1 … foo_40”)
    num_suffix = re.compile(r'^(.+?)_(\d{1,3})$')
    buckets: dict[str, set[int]] = {}
    for t in tokens:
        m = num_suffix.match(t)
        if m:
            buckets.setdefault(m.group(1), set()).add(int(m.group(2)))
    if any(len(nums) >= 15 for nums in buckets.values()):
        return True

    return False






# Sort a random sample of stored responses into two text files (flagged / not flagged by
# is_abnormally_repetitive) so the detector can be reviewed by eye.
# A fresh connection is opened through db_conn(): every earlier `with db_conn() as (conn, cur)`
# cell closes the global `conn` on exit, so it cannot be reused here.
with db_conn() as (conn, cur):
	# Rows that have no responses yet hold NULL and would break the flattening below.
	cur.execute("SELECT responses FROM alignment_rankings_ai WHERE responses IS NOT NULL")
	all_responses = list(itertools.chain.from_iterable(row[0] for row in cur))

# random.sample raises ValueError when asked for more items than exist.
sample_size = min(760, len(all_responses))

with open('tmp-is-repper.txt', 'w', encoding='utf-8') as f, open('tmp-is-not-repper.txt', 'w', encoding='utf-8') as f2:
	for response in random.sample(all_responses, sample_size):
		if is_abnormally_repetitive(response):
			f.write(f"{response}\n------------------\n\n")
		else:
			f2.write(f"{response}\n------------------\n\n")


In [None]:
# Show how many individual responses were loaded from the database.
len(all_responses)

In [None]:
# Manual spot check: run the detector on one ordinary, non-repetitive caption.
is_abnormally_repetitive("""This photograph captures an elderly man engaged in wood carving in a rustic workshop. The man, who has short gray hair and a beard, is focused on his work, holding a small wooden carving in his hands. He is dressed in a short-sleeved, white and gray striped shirt and blue jeans. The workshop is filled with various woodworking tools and materials, including a large wooden block on the table in front of him, a chisel in his right hand, and a small wooden piece in his left. 

The background reveals a wooden wall with a window on the right side, allowing natural light to illuminate the workspace. Hanging tools and wooden pieces are visible on the left side of the image. The floor is covered in sawdust, indicating active use. The man is standing near the center of the image, slightly to the right, with his back to a wooden door that is partially open, revealing a green, sunlit garden outside. 

The overall atmosphere of the photograph is warm and industrious, highlighting the man's dedication to his craft. The textures of the wood, the tools, and the man's clothing are clearly defined, adding to the authenticity of the scene. The image conveys a sense of tradition and craftsmanship in a serene, rural setting.""")

## Evaluate Model Performance
Compare the performance of two different models against each other. This is done by taking the latest test set (which should not have been included in the training set for any previous model) and generating a response for each model. The responses are then compared using the usual method to determine the win-rate of each model.

In [None]:
#ds = datasets.load_dataset("fancyfeast/joy-captioning-alignment-20250428a", split="test")
#ds = datasets.load_dataset("fancyfeast/joy-captioning-alignment-20250501a", split="test")
ds = datasets.load_dataset("fancyfeast/joy-captioning-alignment-20250507a", split="test")
# Two OpenAI-compatible endpoints, one per model under comparison.
client_a = openai.Client(base_url="http://localhost:5052/v1", api_key="token-abc123")
client_a_model = "5i5xmxdx"
client_b = openai.Client(base_url="http://localhost:5053/v1", api_key="token-abc123")
#client_b_model = "8gjfxjdm"
#client_b_model = "kiqxrbng"
client_b_model = "mb3500zp"


def _generate_response(client, model_name, example, image_dataurl):
	"""
	Ask one model to answer `example` about the image and return the stripped text.

	Both models are queried through this helper so they always receive the same
	messages and the same sampling settings.

	Args:
		client: OpenAI-compatible client pointing at the model's server.
		model_name: Model identifier to request from that server.
		example: Dataset row; its 'system' and 'question' fields form the prompt.
		image_dataurl: The image encoded as a data URL.

	Returns:
		The response text, or "" when the server returns no message content.
	"""
	completion = client.chat.completions.create(
		model=model_name,
		messages=[
			{
				"role": "system",
				"content": example['system'],
			},
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": example['question'],
					},
					{
						"type": "image_url",
						"image_url": {
							"url": image_dataurl,
						},
					},
				]
			},
		],
		temperature=0.6,
		top_p=0.9,
		max_tokens=512,
	)
	# `content` is optional in the chat completion schema; calling .strip() on None would abort the whole run.
	return (completion.choices[0].message.content or "").strip()


all_examples = []

for example in tqdm(ds):
	image_data = api.read_image(example['filehash'])
	image_mime = magic.from_buffer(image_data, mime=True)
	image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"

	response_a = _generate_response(client_a, client_a_model, example, image_dataurl)
	response_b = _generate_response(client_b, client_b_model, example, image_dataurl)

	# Randomize the order of the responses to prevent bias
	if random.random() < 0.5:
		responses = [response_a, response_b]
		response_models = [client_a_model, client_b_model]
	else:
		responses = [response_b, response_a]
		response_models = [client_b_model, client_a_model]
	
	all_examples.append(EvalSample(
		filehash=example['filehash'],
		system=example['system'],
		question=example['question'],
		question_type=example['question_type'],
		task_type='overall',
		response_a=responses[0],
		response_b=responses[1],
		response_a_model=response_models[0],
		response_b_model=response_models[1],
		ground_truth_knowledge=build_ground_truth_knowledge(example['filehash'], example['question']),
	))

In [None]:
# Store every generated comparison sample, committing once after all of them have been added.
with db_conn() as (conn, cur):
	for example in tqdm(all_examples):
		example.add_to_database(cur)
	conn.commit()


In [None]:
# At this point you need to go have all the preferences judged using Parallel Preferences or Batched Preferences
# This assumes that client_b_model is a new model that we haven't processed before, so all preferences in the database that include it must be from our evaluation.
all_examples = [
	e for e in list_db_preferences()
	if client_b_model in (e.response_a_model, e.response_b_model)
]
print(f"Total examples: {len(all_examples)}")

# Per-model tallies: wins, judge scores, and number of repper responses.
winner_counts = defaultdict(int)
scores = defaultdict(list)
repper_cnt = defaultdict(int)

with open('tmp-is-repper.txt', 'w') as f:
	for example in all_examples:
		assert example.judge_winner is not None and example.judge_response_a_score is not None and example.judge_response_b_score is not None, f"Missing winner or scores in example: {example.id}"
		assert example.response_a is not None and example.response_b is not None, f"Missing responses in example: {example.id}"

		# Any verdict other than 'a' is counted as a win for response B.
		winning_model = example.response_a_model if example.judge_winner == 'a' else example.response_b_model
		winner_counts[winning_model] += 1

		scores[example.response_a_model].append(example.judge_response_a_score)
		scores[example.response_b_model].append(example.judge_response_b_score)

		# Repper responses are counted per model and logged to the file for inspection.
		if is_repper(example.response_a):
			repper_cnt[example.response_a_model] += 1
			f.write(f"Repper response A ({example.response_a_model}): {example.response_a}\n------------------\n\n")
		if is_repper(example.response_b):
			repper_cnt[example.response_b_model] += 1
			f.write(f"Repper response B ({example.response_b_model}): {example.response_b}\n------------------\n\n")

print("Winner counts:")
for model, count in winner_counts.items():
	print(f"{model}: {count}")

print("Scores:")
for model, score_list in scores.items():
	avg_score = sum(score_list) / len(score_list)
	print(f"{model}: avg_score={avg_score:.4f}, count={len(score_list)}")

print("Repper counts:")
n_examples = len(all_examples)
for model, count in repper_cnt.items():
	print(f"{model}: {count} ({count / n_examples:.4%})")

## Seed Next Model

In [None]:
# Hashes of the images in this dataset's test split; later cells use the set to keep those images out of newly seeded examples.
# NOTE(review): this loads the 20250428a dataset while the evaluation cell uses 20250507a — confirm which test split is intended here.
ds = datasets.load_dataset("fancyfeast/joy-captioning-alignment-20250428a", split="test")
test_filehashes = set(ds['filehash'])

In [None]:
# Clear out old examples with no judgements
n_deleted = 0
with db_conn() as (conn, cur):
	for example in tqdm(list_db_rankings()):
		# Anything that has already been judged is kept.
		if example.judge_reasoning is not None:
			continue

		# An unjudged row is dropped when it has no responses, is not a
		# questionAnswer example, or belongs to an image in the test split.
		should_delete = (
			example.responses is None
			or example.question_type != 'questionAnswer'
			or example.filehash in test_filehashes
		)
		if not should_delete:
			continue

		cur.execute("DELETE FROM alignment_rankings_ai WHERE id = %s", (example.id,))
		n_deleted += 1
	
	conn.commit()
	print(f"Deleted {n_deleted} examples")

In [None]:
# Add all of our human VQA examples
# Every image that carries both a vqa_category and a questionAnswer attribute is queued as an
# EvalSampleMulti row, unless the image is in the test split or already has a ranking for the
# current response model.
existing_filehashes = set()
existing_filehashes.update(test_filehashes)

for example in tqdm(list_db_rankings()):
	if example.responses_model == CURRENT_RESPONSE_MODEL:
		existing_filehashes.add(example.filehash)

results = api.search("EXISTS(vqa_category) AND EXISTS(questionAnswer)", ["hash", "attributes"])
assert isinstance(results, list)

n_added = 0
with db_conn() as (conn, cur):
	for result in tqdm(results):
		assert isinstance(result, tag_machine_api.SearchResultImage) and result.hash is not None and result.attributes is not None
		if result.hash in existing_filehashes:
			continue

		sources = result.attributes.get('source', [])
		# Single-element unpacking: raises ValueError unless the attribute holds exactly one value.
		vqa_category, = result.attributes['vqa_category'].keys()
		questionAnswer, = result.attributes['questionAnswer'].keys()
		# The questionAnswer value is a JSON document with 'question' and 'answer' fields.
		questionAnswer = json.loads(questionAnswer)
		vqa_category = vqa_category.strip()
		question = questionAnswer['question'].strip()
		answer = questionAnswer['answer'].strip()

		# Skip incomplete items and anything in a benchmark category.
		if vqa_category == '' or question == '' or answer == '' or 'benchmark' in vqa_category.lower():
			continue

		if 'bra_sizing' in sources and result.hash.hex()[0] != '0':
			# Filter the bra related questions, so they don't overload the dataset.
			# using the hash knocks it down by 1/16th, deterministically
			continue

		# An inline <system>...</system> span overrides the random system prompt and is cut out of the question.
		# NOTE(review): question.index("</system>") raises ValueError when the closing tag is missing.
		if "<system>" in question:
			i = question.index("<system>")
			j = question.index("</system>")
			system_message = question[i + len("<system>"):j]
			question = question[:i] + question[j + len("</system>"):]
		else:
			system_message = random.choice(system_prompts)
		
		question = question.strip()

		example = EvalSampleMulti(
			filehash=result.hash,
			system=system_message,
			question=question,
			question_type='questionAnswer',
			ground_truth_knowledge=build_ground_truth_knowledge(result.hash, question),
			responses_model=CURRENT_RESPONSE_MODEL,  # So we can track it
		)
		example.add_to_database(cur)
		# Remember the hash so the same image is not added twice within this run.
		existing_filehashes.add(result.hash)
		n_added += 1
	
	conn.commit()

print(f"Added {n_added} examples")

In [None]:
# List all existing rankings
n = 32000
existing_types = defaultdict(int)
existing_filehashes = set(test_filehashes)
for example in list_db_rankings():
	existing_filehashes.add(example.filehash)
	if example.responses_model == CURRENT_RESPONSE_MODEL:
		existing_types[example.question_type] += 1
existing_filehashes.update(pref.filehash for pref in list_db_preferences())

# Figure out what VQA examples we have left and what API images we can use
remaining_vqa = [e for e in vqa_examples if e.filehash not in existing_filehashes]
remaining_api_images = list(set(API_IMAGES_TO_USE) - existing_filehashes)

print(f"# remaining VQA examples: {len(remaining_vqa)}")
print(f"# remaining API images: {len(remaining_api_images)}")

random.shuffle(remaining_vqa)
random.shuffle(remaining_api_images)

# Quota per question type out of n, then how many are still missing for the current model.
quotas = {k: int(v * n) for k, v in question_types.items()}
print({k: (quota, existing_types[k]) for k, quota in quotas.items()})

target_counts = {k: max(0, quota - existing_types[k]) for k, quota in quotas.items()}
print(target_counts)
examples = []

for question_type, target_count in tqdm(target_counts.items()):
	for _ in range(target_count):
		if question_type == 'questionAnswer':
			# VQA examples come ready-made; stop once the pool is empty.
			if not remaining_vqa:
				break

			examples.append(remaining_vqa.pop())
			continue

		# Every other type needs an unused image; stop once the pool is empty.
		if not remaining_api_images:
			break

		filehash = remaining_api_images.pop()
		system = random.choice(system_prompts).strip()
		question = get_random_prompt(question_type, filehash)
		# HACK BECAUSE WE FORGOT TO ADD EXTENSIONS
		n_extensions = random.randint(1, 3)
		extensions = [random.choice([" ", "\n", "\n\n"]) + x for x in random.sample(PROMPT_EXTENSIONS, n_extensions)]
		question = question + ''.join(extensions)
		###
		examples.append(EvalSampleMulti(
			filehash=filehash,
			system=system,
			question=question,
			question_type=question_type,
			ground_truth_knowledge=build_ground_truth_knowledge(filehash, question),
			responses_model=CURRENT_RESPONSE_MODEL,  # So we can track it
		))

print(f"Generated {len(examples)} examples to add to the database")

In [None]:
# Now insert all the new examples into the database
# `examples` is rebound to whatever add_to_database returns for each entry; everything is committed in one transaction.
with db_conn() as (conn, cur):
	examples = [example.add_to_database(cur) for example in tqdm(examples)]
	conn.commit()

In [None]:
# Prompt templates for tag-augmented captioning requests. They are filled with str.format
# using {tag_string}, {word_count} and {length}.
# NOTE(review): the spelling and spacing mistakes inside the templates look deliberate
# (imitating real user input) — confirm before "fixing" any of them.
questions = [
	"{tag_string}\n\nPlease write an SDXL prompt for this image. The above are the booru tags that were associated with the image. The prompt include all of those tags. The prompt will be used by SDXL to try and recreate the image as exactly as possible. So make sure the prompt is complete, accurate, and detailed.",
	"{tag_string}\n\nI need an SDXL-like prompt.",
	"{tag_string}\n\nIncorporate as many of these tags into your c aption as possible while staying under {word_count} words and writing without describing mood and just being straightrfoward.",
	"Concise straightforward caption using some of the tags below. {word_count} words MAX.\n\n{tag_string}",
	"""{tag_string}\n\nYou have up to {word_count} words to write a caption for this image in a straightforward way without bullshit descriptors like "This image is..." or ambiguity.""",
	"I need a caption that is straightforward without bullshit, thanks.\n\n{tag_string}\n\nNot too, too long, like {word_count} words or a bit more?",
	"{tag_string}\n\nWrite a {length} description for this image.  The above are booru tags for this image.  Use them to enhance the caption you write.",
	"{tag_string}\n\n\nWrite an SDXL prompt using the attached image and the above tags",
	"""I have these tags for this image {tag_string}\n\nI need a caption. No bullshit like "This image is", just a straightforward caption within {word_count} words covering all details needed and incorporating the tags when you can.""",
	"{tag_string}\n\nI need an SDXL-like prompt. Include these tags.",
	"""Incorpoating these tags when possible write out a sttraightforward caption. Do not use the word "featuring" or anything like it!\n\n{tag_string}""",
	"""I would like a straightforward caption.\n\n{tag_string}""",
	"""{tag_string}\n\nGimmie a straightforward caption""",
	"""{tag_string}\n\nI would like you to output  a caption with straighforward styule of writing the caption""",
	"""{tag_string}\n\nWrite an accurate caption in a straightforward style.""",
	"""{tag_string}\n\nBased on these booru tags and the image write out an SDXL prompt""",
	"""{tag_string}\n\n\nI would like a SDXL prompt written based on the image and the tags above. Less than {word_count} words""",
	"""{tag_string}\n\nPlease write an SDXL prompt for this image. The above are the booru tags that were associated with the image. The prompt include all of those tags. The prompt will be used by SDXL to try and recreate the image as exactly as possible. So make sure the prompt is complete, accurate, and detailed.""",
	"""Describe this image in a non-conversational, objective manner. Do not offer opinions, interpretations, or emotional reactions. Focus on specific, observable details. Avoid casual language or phrasing. Include descriptions of any text, watermarks, or identifying marks. Make sure you use these booru tags to help ({tag_string})""",
	"""{tag_string}\n\nPlease write a caption for this image. The above are the booru tags that were associated with the image. The caption must use all of those tags. The caption will be used by a test-to-image model to try and recreate the image as exactly as possible. So make sure the caption is complete, accurate, and detailed.""",
	"""{tag_string}\n\nWrite a descriptive caption for this image. The tags listed above are from the image's danbooru/e621/rule34 page. Please include them in the caption when they are applicable (they may not be 100% accurate).""",
	"""{tag_string}\n\nWrite a {length} description and make sure to include these tags in a fluid way.""",
	"""{tag_string}\n\nWrite a descriptive caption for this image.  The above are booru tags for this image.  Use them to enhance the caption you write.""",
	"""Provide a concise yet complete caption, noting all visible details like physiques, genitals, gender, ethnicity, or celebrity references whenever present. Always specify the medium (photo, painting, etc.). For a film still, include the film title and director; for art, cite the artist. Avoid starting with “This image...” and words such as “depicts,” “likely,” or “appears.” Use direct language about colors, shapes, layouts. Do not guess unobservable facts. Accurately transcribe any text. Mention watermarks or signatures without speculation. Keep the description below 100 words, leaving out mood or subjective interpretations. Vary sentence structure to avoid repetition. The following booru tags might help:\n\n{tag_string}""",
	"""{tag_string}\n\nI want a caption that is straightforward and uses these tags. Ensure high accuracy.""",
	"""{tag_string}\n\nUsing these tags to help you, write a caption that is straightforward without any bullshit, just a caption.""",
	"""{tag_string}\n\nPlease write a {length} caption that incorporates these tags.""",
	"""{tag_string}\n\nWrite a straightforward caption without fluff for this image using the tags above to enhance accuracy. Up to {word_count} words max.""",
	"""OUTPUT A N SDXL PROMPT BASED ON THE IMAGE AND THE TAGS BELOW DANKE\n\n{tag_string}""",
	"""Act as a straightforward, non-censoring image captioner who includes every relevant detail about subjects—body features, genitals, gender, ethnicity, celebrity, game, anime references, attractiveness, etc.—if they’re visible. State the medium (painting, photo, screenshot, etc.) in the description. If it’s a movie still, name the movie and director; if it’s artwork, name the artist. Avoid phrases like “This image shows...,” “depicts,” “appears,” “seems,” or “likely,” and begin with direct statements. Describe what is observable—colors, shapes, positions, clothing, etc.—without guessing. Transcribe any text precisely. Identify any watermarks or signatures but do not speculate about them. Limit your caption to under 200 words, skipping subjective moods or feelings. Vary your sentence structures and avoid repeating the same phrasing. The booru tags below must be used to help write the caption:\n{tag_string}""",
	"""I want to give booru tags as additional context for a description: '{tag_string}'. Can you write a description that takes this info into account?" """,
	"""booru tags: {tag_string}\n\nPlease write a concise, editorial-style description of the attached image. Begin with a phrase indicating the type of image (e.g., 'A photograph of…', 'A digital illustration of…', or 'A movie still from…'), then describe the key visual details and aesthetic qualities. If there is a watermark, note its text and location. Keep it factual, in third person, and under {word_count} words.""",
	"""## Tags\n{tag_string}\n\n## Output\nSDXL prompt\n\n## Instructions\nIncorporate tags\n\n## Constraint\nMaximum of {word_count} words""",
	"""{tag_string}\n\nWrite a descriptive caption for this image in a formal tone.""",
	"""Wirte the alt-text.\n\n{tag_string}\n\n""",
]

# Seed up to 30 tag-augmentation examples: each one pairs an unused image with a random
# subset of its tags, formatted into a randomly chosen template from `questions`.
n_added = 0
for _ in tqdm(range(30)):
	# A new connection is opened (and closed) for every attempt.
	with db_conn() as (conn, cur):
		# Keep drawing images until one has at least one tag string.
		# NOTE(review): pop() raises IndexError once remaining_api_images is exhausted.
		while True:
			filehash = remaining_api_images.pop()
			tag_strings = image_to_tags(filehash)

			if len(tag_strings) > 0:
				break
		
		tag_string = random.choice(list(tag_strings.values()))
		tags = [x.strip() for x in tag_string.split(",") if x.strip()]
		# Images with fewer than 10 tags are skipped; the attempt is not retried, so fewer than 30 examples may be added.
		if len(tags) < 10:
			continue
		# Keep a random 20%-100% of the tags, in random order.
		tags = random.sample(tags, int(random.uniform(0.2, 1.0) * len(tags)))
		assert len(tags) > 0, f"No tags found for image {filehash.hex()}"
		tag_string = ", ".join(tags)

		# Word limit between 20 and 300, rounded down to a multiple of 10.
		n_words = random.randint(20, 300)
		n_words = (n_words // 10) * 10
		length = random.choice(['very short', 'short', 'medium-length', 'long', 'very long'])
		name = random.choice(NAMES)
		assert isinstance(name, str)

		# str.format ignores keyword arguments that a template does not reference.
		# NOTE(review): none of the templates in `questions` contains {name}, so `name` appears to be unused here.
		question = random.choice(questions).format(
			tag_string=tag_string,
			word_count=n_words,
			length=length,
			name=name,
		)

		system_message = random.choice(system_prompts)

		example = EvalSampleMulti(
			filehash=filehash,
			system=system_message,
			question=question,
			question_type='questionAnswer',
			ground_truth_knowledge=build_ground_truth_knowledge(filehash, question),
			responses_model=CURRENT_RESPONSE_MODEL,  # So we can track it
		)

		# Each example is committed on its own.
		example.add_to_database(cur)
		conn.commit()
		n_added += 1

		# image_data = api.read_image(example.filehash)
		# image = Image.open(io.BytesIO(image_data))
		# scale = 512 / max(image.size)
		# image = image.resize((int(image.size[0] * scale), int(image.size[1] * scale)), Image.LANCZOS)
		# display(image)
		# print(example.system)
		# print(example.question)

print(f"Added {n_added} examples")

In [None]:
# Print the question of every stored question/answer pair whose vqa_category
# mentions "tag_augmentation", separated by "###" lines.
results = api.search("EXISTS(vqa_category) AND EXISTS(questionAnswer)", ["attributes"])
assert isinstance(results, list)
for hit in results:
	assert isinstance(hit, tag_machine_api.SearchResultImage) and hit.attributes is not None
	# Each attribute is expected to hold exactly one key; unpacking enforces that
	(raw_category,) = hit.attributes['vqa_category'].keys()
	(raw_qa,) = hit.attributes['questionAnswer'].keys()
	qa_payload = json.loads(raw_qa)
	category = raw_category.strip()
	question_text = qa_payload['question'].strip()
	answer_text = qa_payload['answer'].strip()

	# Only complete entries in a tag_augmentation category are shown
	is_complete = category != '' and question_text != '' and answer_text != ''
	if is_complete and 'tag_augmentation' in category.lower():
		print(question_text)
		print("###")

## Evaluate Tag Augmentation Performance

In [None]:
# Hashes of every image already returned by list_db_rankings() or
# list_db_preferences(); those images are excluded from the candidate pool.
existing_filehashes = {row.filehash for row in list_db_rankings() + list_db_preferences()}

# Whatever is left of the API image set, in random order
remaining_api_images = list(set(API_IMAGES_TO_USE).difference(existing_filehashes))
random.shuffle(remaining_api_images)

In [None]:
# User-prompt templates for the tag-augmentation evaluation. Each entry is a
# str.format template; the sampling loop below fills in {tag_string},
# {word_count} and {length} (templates only reference the fields they need).
# NOTE(review): the spelling mistakes and odd phrasing look like deliberate
# imitations of real user input -- confirm before "fixing" any of them, since
# these strings are sent to the model verbatim.
questions = [
	"{tag_string}\n\nPlease write an SDXL prompt for this image. The above are the booru tags that were associated with the image. The prompt include all of those tags. The prompt will be used by SDXL to try and recreate the image as exactly as possible. So make sure the prompt is complete, accurate, and detailed.",
	"{tag_string}\n\nI need an SDXL-like prompt.",
	"{tag_string}\n\nIncorporate as many of these tags into your c aption as possible while staying under {word_count} words and writing without describing mood and just being straightrfoward.",
	"Concise straightforward caption using some of the tags below. {word_count} words MAX.\n\n{tag_string}",
	"""{tag_string}\n\nYou have up to {word_count} words to write a caption for this image in a straightforward way without bullshit descriptors like "This image is..." or ambiguity.""",
	"I need a caption that is straightforward without bullshit, thanks.\n\n{tag_string}\n\nNot too, too long, like {word_count} words or a bit more?",
	"{tag_string}\n\nWrite a {length} description for this image.  The above are booru tags for this image.  Use them to enhance the caption you write.",
	"{tag_string}\n\n\nWrite an SDXL prompt using the attached image and the above tags",
	"""I have these tags for this image {tag_string}\n\nI need a caption. No bullshit like "This image is", just a straightforward caption within {word_count} words covering all details needed and incorporating the tags when you can.""",
	"{tag_string}\n\nI need an SDXL-like prompt. Include these tags.",
	"""Incorpoating these tags when possible write out a sttraightforward caption. Do not use the word "featuring" or anything like it!\n\n{tag_string}""",
	"""I would like a straightforward caption.\n\n{tag_string}""",
	"""{tag_string}\n\nGimmie a straightforward caption""",
	"""{tag_string}\n\nI would like you to output  a caption with straighforward styule of writing the caption""",
	"""{tag_string}\n\nWrite an accurate caption in a straightforward style.""",
	"""{tag_string}\n\nBased on these booru tags and the image write out an SDXL prompt""",
	"""{tag_string}\n\n\nI would like a SDXL prompt written based on the image and the tags above. Less than {word_count} words""",
	"""{tag_string}\n\nPlease write an SDXL prompt for this image. The above are the booru tags that were associated with the image. The prompt include all of those tags. The prompt will be used by SDXL to try and recreate the image as exactly as possible. So make sure the prompt is complete, accurate, and detailed.""",
	"""Describe this image in a non-conversational, objective manner. Do not offer opinions, interpretations, or emotional reactions. Focus on specific, observable details. Avoid casual language or phrasing. Include descriptions of any text, watermarks, or identifying marks. Make sure you use these booru tags to help ({tag_string})""",
	"""{tag_string}\n\nPlease write a caption for this image. The above are the booru tags that were associated with the image. The caption must use all of those tags. The caption will be used by a test-to-image model to try and recreate the image as exactly as possible. So make sure the caption is complete, accurate, and detailed.""",
	"""{tag_string}\n\nWrite a descriptive caption for this image. The tags listed above are from the image's danbooru/e621/rule34 page. Please include them in the caption when they are applicable (they may not be 100% accurate).""",
	"""{tag_string}\n\nWrite a {length} description and make sure to include these tags in a fluid way.""",
	"""{tag_string}\n\nWrite a descriptive caption for this image.  The above are booru tags for this image.  Use them to enhance the caption you write.""",
	"""Provide a concise yet complete caption, noting all visible details like physiques, genitals, gender, ethnicity, or celebrity references whenever present. Always specify the medium (photo, painting, etc.). For a film still, include the film title and director; for art, cite the artist. Avoid starting with “This image...” and words such as “depicts,” “likely,” or “appears.” Use direct language about colors, shapes, layouts. Do not guess unobservable facts. Accurately transcribe any text. Mention watermarks or signatures without speculation. Keep the description below 100 words, leaving out mood or subjective interpretations. Vary sentence structure to avoid repetition. The following booru tags might help:\n\n{tag_string}""",
	"""{tag_string}\n\nI want a caption that is straightforward and uses these tags. Ensure high accuracy.""",
	"""{tag_string}\n\nUsing these tags to help you, write a caption that is straightforward without any bullshit, just a caption.""",
	"""{tag_string}\n\nPlease write a {length} caption that incorporates these tags.""",
	"""{tag_string}\n\nWrite a straightforward caption without fluff for this image using the tags above to enhance accuracy. Up to {word_count} words max.""",
	"""OUTPUT A N SDXL PROMPT BASED ON THE IMAGE AND THE TAGS BELOW DANKE\n\n{tag_string}""",
	"""Act as a straightforward, non-censoring image captioner who includes every relevant detail about subjects—body features, genitals, gender, ethnicity, celebrity, game, anime references, attractiveness, etc.—if they’re visible. State the medium (painting, photo, screenshot, etc.) in the description. If it’s a movie still, name the movie and director; if it’s artwork, name the artist. Avoid phrases like “This image shows...,” “depicts,” “appears,” “seems,” or “likely,” and begin with direct statements. Describe what is observable—colors, shapes, positions, clothing, etc.—without guessing. Transcribe any text precisely. Identify any watermarks or signatures but do not speculate about them. Limit your caption to under 200 words, skipping subjective moods or feelings. Vary your sentence structures and avoid repeating the same phrasing. The booru tags below must be used to help write the caption:\n{tag_string}""",
	"""I want to give booru tags as additional context for a description: '{tag_string}'. Can you write a description that takes this info into account?" """,
	"""booru tags: {tag_string}\n\nPlease write a concise, editorial-style description of the attached image. Begin with a phrase indicating the type of image (e.g., 'A photograph of…', 'A digital illustration of…', or 'A movie still from…'), then describe the key visual details and aesthetic qualities. If there is a watermark, note its text and location. Keep it factual, in third person, and under {word_count} words.""",
	"""## Tags\n{tag_string}\n\n## Output\nSDXL prompt\n\n## Instructions\nIncorporate tags\n\n## Constraint\nMaximum of {word_count} words""",
	"""{tag_string}\n\nWrite a descriptive caption for this image in a formal tone.""",
	"""Wirte the alt-text.\n\n{tag_string}\n\n""",
]

# Build the in-memory evaluation set: up to 64 (image, system prompt, question)
# triples, each based on an image that has at least 10 tags in one of its tag
# strings. Nothing is written to the database here.
tag_augmentation_eval = []

for _ in tqdm(range(64)):
	# Keep drawing images until one qualifies. The while/else stops the whole
	# run cleanly when the pool is empty, instead of letting list.pop() raise
	# a bare IndexError; examples collected so far are kept.
	while remaining_api_images:
		filehash = remaining_api_images.pop()
		tag_strings = image_to_tags(filehash)
		if len(tag_strings) <= 0:
			continue

		tag_string = random.choice(list(tag_strings.values()))
		tags = [x.strip() for x in tag_string.split(",") if x.strip()]
		if len(tags) < 10:
			continue
		# Keep a random 20%-100% subset of the tags (at least 2, since len >= 10)
		tags = random.sample(tags, int(random.uniform(0.2, 1.0) * len(tags)))
		assert len(tags) > 0, f"No tags found for image {filehash.hex()}"
		tag_string = ", ".join(tags)
		break
	else:
		print(f"Image pool exhausted after {len(tag_augmentation_eval)} examples; stopping early")
		break

	# Random word budget, rounded down to a multiple of 10
	n_words = random.randint(20, 300)
	n_words = (n_words // 10) * 10
	length = random.choice(['very short', 'short', 'medium-length', 'long', 'very long'])
	name = random.choice(NAMES)
	assert isinstance(name, str)

	# str.format ignores keyword arguments a template does not reference,
	# so every template can be fed the full set of fields.
	question = random.choice(questions).format(
		tag_string=tag_string,
		word_count=n_words,
		length=length,
		name=name,
	)

	system_message = random.choice(system_prompts)

	tag_augmentation_eval.append({
		"filehash": filehash,
		"system": system_message,
		"question": question,
	})

In [None]:
# Generate the candidate model's answer for every evaluation example by sending
# the system prompt, the question and the image (as a base64 data URL) to the
# OpenAI-compatible endpoint at localhost:5053. The answer text is stored
# under example['response'].
client_b = openai.Client(base_url="http://localhost:5053/v1", api_key="token-abc123")
client_b_model = "mb3500zp"

for example in tqdm(tag_augmentation_eval):
	image_data = api.read_image(example['filehash'])
	image_mime = magic.from_buffer(image_data, mime=True)
	image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"
	completion = client_b.chat.completions.create(
		model=client_b_model,
		messages=[
			{
				"role": "system",
				"content": example['system'],
			},
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": example['question'],
					},
					{
						"type": "image_url",
						"image_url": {
							"url": image_dataurl,
						},
					},
				]
			},
		],
		temperature=0.6,
		top_p=0.9,
		max_tokens=512,
	)
	# message.content is optional in the chat completions response; treat a
	# missing body as an empty answer rather than crashing on None.strip().
	response_b = (completion.choices[0].message.content or "").strip()
	example['response'] = response_b

In [None]:
# System prompt for the LLM judge. It asks for a free-form evaluation followed
# by three machine-parsable 1-10 scores wrapped in <prompt_adherence_score>,
# <tag_integration_score> and <accuracy_score> tags. The text is runtime data:
# any edit changes what the judge is asked to do.
judge_system_prompt = """
You are an expert AI Response Evaluator. Your task is to meticulously analyze an AI-generated response (`Response`) provided in response to a specific `User Query` and a `User Image`. These responses were generated based on a specific `Original System Prompt` that defined the AI's persona, constraints, and goals.

**Your Goal:** Determine objectively how well `Response` performs according to the evaluation criteria outlined below. Provide a clear judgment and a detailed, reasoned justification for your choices.

**Input You Will Receive:**

1.  `Original System Prompt`: The instructions the AI models were given to generate their responses. Pay close attention to persona, tone, constraints, required format, and specific tasks mentioned here.
2.  `User Query`: The specific question or instruction from the user.
3.  `User Image`: An image provided by the user, which may be relevant to the query.
4.  `Response`: One of the AI-generated responses.

**Evaluation Criteria:**

1.  **Adherence to Original System Prompt:**
    *   Did the response follow ALL instructions, constraints, formatting requirements, persona, and tone specified in the `Original System Prompt`?
    *   How well did the response embody the defined persona or role?
2.  **Addressing the User Query:**
    *   Did the response directly, accurately, and completely answer the `User Query`?
    *   Is the response relevant to the user's explicit and implicit needs?
3.  **Image Integration:**
    *   If an image was provided and relevant, did the response appropriately acknowledge, analyze, or utilize the image content as necessitated by the `User Query` and `Original System Prompt`?
4.  **Helpfulness and Usefulness:**
    *   How helpful and practical is the response for the user? Does it provide value?
5.  **Accuracy and Factual Correctness:**
    *   Is the information presented accurate and free from errors? (Acknowledge if you cannot verify).
6.  **Clarity, Conciseness, and Structure:**
    *   Is the response well-organized, easy to understand, and appropriately concise? Is it free from unnecessary jargon or rambling?

**Your Task Steps:**

1.  **Understand the Context:** Thoroughly review the `Original System Prompt`, `User Query`, and `User Image`. Understand the *expected* output.
2.  **Analyze Response:** Evaluate `Response` against all relevant criteria listed above. Note its strengths and weaknesses.
3.  **Score Based on Prompt Adherence:** Assign a score from 1 to 10 based on how well the response adheres to the `Original System Prompt` and addresses the `User Query`. A score of 1 means the response is completely useless, and a score of 10 means the response is perfect.
4.  **Score Based on Tag Integration:** If the `User Query` included a list of tags to incorporate in the response, evaluate how well the response integrated those tags. Assign a score from 1 to 10 based on the quality of tag integration. A score of 1 means no tags were integrated, and a score of 10 means all relevant tags were integrated perfectly.
5.  **Score Based on Accuracy:** Assign a score from 1 to 10 based on the accuracy of the response. A score of 1 means the response is completely inaccurate, and a score of 10 means the response is completely accurate.

**Note:**

*   If the user query sets a maximum word count for the response, the response should be within that limit.
*   If the user query asks for the response to be "very short", "short", "medium-length", "long", or "very long" then treat that as an approximate word count limit where very short is ~20 words, short is ~40 words, medium-length is ~60 words, long is ~100 words, and very long is ~200 words.
*   If the user includes a list of tags to incorporate into the response, then the AI is expected to skillfully weave those tags into the response in a natural way. The AI should not just list the tags or use them in a way that feels forced or unnatural. The tags do not need to be used in the same order. The tags do not need to be integrated verbatim; synonyms or related terms are acceptable as long as the meaning is preserved. The AI should also not use the word "tags" in the response.

**Output Format:**

Structure your evaluation clearly. You might use headings like:

*   **Context Summary:** (Briefly summarize the task set by the prompt/query)
*   **Analysis of Response:** (Strengths/Weaknesses against criteria)
*   **Scoring:** (Direct point-by-point evaluation scores)
*   **Justification:** (Detailed reasoning for the judgment)

After your evaluation, at the end of your response, always write a machine parsable output that includes:

* Based on your evaluation, the prompt adherence score (1-10) between <prompt_adherence_score> and </prompt_adherence_score>.
* Based on your evaluation, the tag integration score (1-10) between <tag_integration_score> and </tag_integration_score>.
* Based on your evaluation, the accuracy score (1-10) between <accuracy_score> and </accuracy_score>.
"""




# str.format template for the judge's user turn. Placeholders: {system_prompt},
# {user_query} and {response}.
# NOTE(review): {response} is the only field not wrapped in its own XML-style
# tags -- confirm that is intentional before changing it, since altering the
# template changes what the judge sees and makes scores incomparable with
# earlier runs.
judge_user_message = """
<original_system_prompt>{system_prompt}</original_system_prompt>
<user_query>{user_query}</user_query>
{response}

---

**Now, please perform the evaluation based on the instructions provided in your system prompt. Don't forget to clearly mark the scores for the response in <prompt_adherence_score></prompt_adherence_score>, <tag_integration_score></tag_integration_score>, and <accuracy_score></accuracy_score> tags.**
"""

# Ask the judge model to grade every example that has a candidate response but
# no verdict yet. The cell is resumable: examples that already carry a
# 'judge_response' are skipped, and examples that fail here are left without
# one so a re-run retries them.
client = openai.Client()

for example in tqdm(tag_augmentation_eval):
	if 'judge_response' in example:
		continue

	# The response cell may have been interrupted; skip rather than KeyError
	if 'response' not in example:
		print(f"Skipping example {example['filehash'].hex()}: no model response to judge")
		continue

	system_prompt = ALPHA_SYSTEM_MESSAGE + "\n" + example['system']

	image_data = api.read_image(example['filehash'])
	image_mime = magic.from_buffer(image_data, mime=True)
	image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"

	completion = client.chat.completions.create(
		model="o4-mini",
		reasoning_effort="medium",
		messages=[
			{
				"role": "system",
				"content": judge_system_prompt.strip(),
			},
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": judge_user_message.format(system_prompt=system_prompt.strip(), user_query=example['question'].strip(), response=example['response']).strip(),
					},
					{
						"type": "image_url",
						"image_url": {
							"url": image_dataurl,
						},
					}
				]
			},
		],
		#temperature=0.5,
	)
	# message.content is optional in the chat completions response; do not
	# record a verdict when the judge returned no text.
	judge_response = completion.choices[0].message.content
	if judge_response is None:
		print(f"Judge returned no content for example {example['filehash'].hex()}; will retry on re-run")
		continue
	judge_response = judge_response.strip()
	example['judge_response'] = judge_response
	#break

In [None]:
import html

import markdown

# Render every judged example (image, prompts, model response, judge verdict)
# into a single self-contained HTML file for manual review.
report_parts = ['<html><head><meta charset="utf-8"></head><body>\n']

for example in tag_augmentation_eval:
	if 'judge_response' not in example:
		continue
	judge_response = example['judge_response']
	# Insert a blank line between a "...:" line and a following bullet so the
	# markdown parser recognises the list.
	judge_response = re.sub(r'(:\n)([-+*])', r'\1\n\2', judge_response)
	judge_response = markdown.markdown(judge_response, extensions=["extra", "nl2br"])

	image_data = api.read_image(example['filehash'])
	image_mime = magic.from_buffer(image_data, mime=True)
	image_dataurl = f"data:{image_mime};base64,{base64.b64encode(image_data).decode('utf-8')}"
	report_parts.append(f'<img src="{image_dataurl}" width="512"><br>\n')
	# Prompts, tags and model output are plain text that may contain <, > or &;
	# escape them so they are displayed literally instead of being parsed as
	# markup. pre-wrap keeps their line breaks visible.
	report_parts.append(f"<p style=\"white-space: pre-wrap\"><strong>System:</strong> {html.escape(example['system'])}</p>\n")
	report_parts.append(f"<p style=\"white-space: pre-wrap\"><strong>Question:</strong> {html.escape(example['question'])}</p>\n")
	report_parts.append(f"<p style=\"white-space: pre-wrap\"><strong>Response:</strong> {html.escape(example['response'])}</p>\n")
	# The rendered markdown contains block elements (<p>, <ul>, ...), which
	# are not valid inside <p>, so it goes in a <div>.
	report_parts.append(f"<div><strong>Judge Response:</strong> {judge_response}</div>\n")
	report_parts.append("<hr>\n")
report_parts.append("</body></html>\n")

result = "".join(report_parts)
# Explicit UTF-8 so non-ASCII text (curly quotes, dashes) is written the same
# way on every platform and matches the declared charset.
with open("tag_augmentation_eval.html", "w", encoding="utf-8") as f:
	f.write(result)

In [None]:
import statistics

# Extract the three 1-10 scores from every judge verdict and print summary
# statistics per score. An example only counts if all three scores parse, so
# the three lists always stay the same length.
scores = defaultdict(list)
score_names = ('prompt_adherence', 'tag_integration', 'accuracy')
for i, example in enumerate(tag_augmentation_eval):
	if 'judge_response' not in example:
		print(f"Missing judge response for example {i}")
		continue
	judge_response = example['judge_response']

	parsed_scores = {}
	for score_name in score_names:
		# Tolerate whitespace or newlines the judge may put inside the tags
		match = re.search(rf'<{score_name}_score>\s*(\d+)\s*</{score_name}_score>', judge_response)
		if match is None:
			print(f"Error parsing judge response for example {i}: no <{score_name}_score> value found")
			break
		parsed_scores[score_name] = int(match.group(1))
	else:
		# Reached only when no score was missing
		for score_name, value in parsed_scores.items():
			scores[score_name].append(value)

for key, values in scores.items():
	mean = sum(values) / len(values)
	# statistics.median averages the two middle values for even-sized samples;
	# indexing the sorted list at len // 2 would report the upper one instead.
	median = statistics.median(values)
	min_value = min(values)
	max_value = max(values)
	print(f"{key}: mean={mean:.2f}, median={median}, min={min_value}, max={max_value}")