In [34]:
from martian_apart_hack_sdk import exceptions, judge_specs, martian_client, utils
from martian_apart_hack_sdk.models import judge_evaluation, llm_models, router_constraints
import pickle
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
import uuid
import time

from openai.types.chat import (
    chat_completion,
    chat_completion_message,
)

In [35]:
# Build the Martian API client from the settings returned by utils.load_config().
config = utils.load_config()
client = martian_client.MartianClient(api_url=config.api_url, api_key=config.api_key)

In [36]:
# Judge identifiers. The order is significant: evaluate_single_judge uses
# JUDGE_IDS.index(...) to decide which slot of a score list a judge fills.
JUDGE_IDS = [
    f'{topic}-judge'
    for topic in (
        'harmlessness',
        'privacy',
        'factual-accuracy',
        'prompt-faithfulness-relevance',
        'calibration-uncertainty',
        'bias-fairness',
        'reasoning-consistency',
        'discourse-coherence',
        'conciseness-redundancy',
        'style-formatting',
    )
]

In [37]:
# Fetch every judge object once, keyed by its ID (one client.judges.get call per ID).
JUDGES = {judge_id: client.judges.get(judge_id=judge_id) for judge_id in JUDGE_IDS}

In [109]:
# Sanity check: number of judge objects held in JUDGES.
len(JUDGES)

10

In [39]:
def get_score(query, answer, judge):
    """
    Score one (query, answer) pair with a single judge.

    The answer is wrapped in a synthetic chat-completion response with
    hard-coded metadata (id, created, model) so it can be passed to
    ``client.judges.evaluate`` together with a matching request.

    Args:
        query: Text sent as the user message of the request.
        answer: Text used as the assistant message of the response.
        judge: Judge object forwarded unchanged to ``client.judges.evaluate``.

    Returns:
        The ``score`` attribute of the evaluation result.
    """
    # NOTE(review): the request names GPT_4O_MINI while the synthetic response
    # is labelled "gpt-4o" — confirm whether the mismatch is intentional.
    request_payload = {
        "model": llm_models.GPT_4O_MINI,
        "messages": [{"role": "user", "content": query}],
    }

    assistant_message = chat_completion_message.ChatCompletionMessage(
        role="assistant",
        content=answer,
    )
    only_choice = chat_completion.Choice(
        finish_reason="stop",
        index=0,
        message=assistant_message,
    )
    synthetic_response = chat_completion.ChatCompletion(
        id="123",
        choices=[only_choice],
        created=0,
        model="gpt-4o",
        object="chat.completion",
        service_tier=None,
    )

    return client.judges.evaluate(
        judge,
        completion_request=request_payload,
        completion_response=synthetic_response,
    ).score

## Load data

In [107]:
# Load the pickled training data into `df`.
# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
with open('training_data.pkl', 'rb') as f:
    df = pickle.load(f)

In [104]:
# Remove the 'scores' entry from `df` in place.
# NOTE(review): not idempotent — re-running without reloading `df` fails because the key is already gone.
del df['scores']

In [108]:
# Write the current `df` to 'post_annotation.pkl'.
# NOTE(review): the recorded execution counts (delete=104, load=107, save=108) suggest this
# save ran on a freshly reloaded `df`, i.e. with 'scores' still present — confirm.
with open('post_annotation.pkl', 'wb') as f:
    pickle.dump(df, f)


In [40]:
# Load the pickled dataset at `df_path` into `df`.
# NOTE(review): `df` is also assigned by the 'training_data.pkl' cell; which object is live
# depends on the order the cells were executed.
df_path = '9000.pkl'

with open(df_path, 'rb') as f:
    df = pickle.load(f)


In [41]:
def evaluate_single_judge(args: Tuple[str, str, str, dict]) -> Tuple[int, float]:
    """
    Score one question/answer pair with one judge.

    Takes a single packed tuple so it can be used directly with
    ``executor.map``.

    Args:
        args: Tuple of (question, answer, judge_id, judges_dict), where
            judges_dict maps judge_id to the judge object.
    Returns:
        Tuple of (position of judge_id in JUDGE_IDS, score from get_score).
    """
    prompt, reply, judge_key, judge_lookup = args
    # Resolve the slot first so an unknown ID fails before any scoring call is made.
    position = JUDGE_IDS.index(judge_key)
    return position, get_score(prompt, reply, judge_lookup[judge_key])

In [42]:
def evaluate_with_backoff(args, max_retries: int = 5, initial_delay: float = 1.0):
    """
    Call evaluate_single_judge, retrying with exponential back-off on failure.

    Args:
        args: Tuple of (question, answer, judge_id, JUDGES), passed through
            unchanged to evaluate_single_judge.
        max_retries: Total number of attempts; values of 1 or less mean a
            single attempt with no retry.
        initial_delay: Seconds to sleep after the first failure; the sleep
            doubles after every further failure.

    Returns:
        Whatever evaluate_single_judge returns on the first successful attempt.

    Raises:
        Exception: whatever the final attempt raises, propagated unchanged.
    """
    pause = initial_delay
    guarded_attempts = max_retries - 1

    # Every attempt except the last swallows the error, waits, and tries again.
    while guarded_attempts > 0:
        try:
            return evaluate_single_judge(args)
        except Exception:
            time.sleep(pause)
            pause *= 2
            guarded_attempts -= 1

    # Final attempt: let any exception reach the caller.
    return evaluate_single_judge(args)

In [43]:
def evaluate_pair_parallel(question: str, answer: str, max_workers: int = None) -> List[float]:
    """
    Score one question/answer pair with every judge in JUDGE_IDS concurrently.

    Each judge call goes through evaluate_with_backoff, and a fresh
    ThreadPoolExecutor is created for every invocation.

    Args:
        question: The query text
        answer: The response text
        max_workers: Passed to ThreadPoolExecutor (None = executor default)

    Returns:
        List of scores, one slot per entry of JUDGE_IDS; each score is placed
        at the index returned alongside it.
    """
    jobs = [(question, answer, judge_id, JUDGES) for judge_id in JUDGE_IDS]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        indexed_scores = list(pool.map(evaluate_with_backoff, jobs))

    # Slots default to 0.0 and are overwritten by each (index, score) pair.
    ordered = [0.0] * len(JUDGE_IDS)
    for slot, value in indexed_scores:
        ordered[slot] = value
    return ordered

In [44]:
def eval_row(args):
    """
    Score one dataset row with all judges.

    Args:
        args: Tuple of (idx, question, answer).
    Returns:
        Tuple of (idx, scores), where scores comes from evaluate_pair_parallel.
    """
    row_idx, prompt, reply = args
    row_scores = evaluate_pair_parallel(prompt, reply)
    return row_idx, row_scores

In [47]:
def label_rows(df, checkpoint_every: int = 100, checkpoint_prefix: str = 'progress_2'):
    """
    Score every row of ``df`` with all judges, checkpointing progress to disk.

    Rows are evaluated concurrently through ``eval_row``. After every
    ``checkpoint_every`` completed rows the whole results list (unfinished
    rows are still ``None``) is pickled to
    ``{checkpoint_prefix}_{batch}.pkl``, with ``batch`` counting up from 0.
    A final checkpoint is written when the row count is not a multiple of
    ``checkpoint_every``, so the tail of the run is not lost.

    Args:
        df: Object supporting ``len()`` and ``iterrows()``, whose rows can be
            indexed with "instruction" and "answer".
        checkpoint_every: Number of completed rows between checkpoints.
        checkpoint_prefix: Filename prefix for the checkpoint pickles.

    Returns:
        List with one entry per row, in ``iterrows()`` order; each entry is
        the score list returned by ``eval_row`` for that row. In a notebook,
        assign the result (or end the call with ``;``) to avoid echoing it.

    Raises:
        ValueError: if ``checkpoint_every`` is less than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be a positive integer")

    results = [None] * len(df)
    # Use the positional index (not the frame's own index) as the results slot.
    tasks = [
        (i, row["instruction"], row["answer"])
        for i, (_, row) in enumerate(df.iterrows())
    ]

    def _save_checkpoint(batch_number):
        # Persist everything gathered so far under the next batch number.
        with open(f'{checkpoint_prefix}_{batch_number}.pkl', 'wb') as f:
            pickle.dump(results, f)

    completed = 0
    batch = 0

    # Parallelize across rows
    with ThreadPoolExecutor() as executor:
        for i, scores in executor.map(eval_row, tasks):
            results[i] = scores
            completed += 1

            if completed % checkpoint_every == 0:
                print(f"Completed {completed} rows! Saving progress...")
                _save_checkpoint(batch)
                batch += 1

    # Rows finished after the last periodic checkpoint would otherwise never be saved.
    if completed % checkpoint_every != 0:
        _save_checkpoint(batch)

    return results

In [57]:
# Run the labelling job.
# NOTE(review): `other` is not defined in any visible cell — confirm where it comes from.
label_rows(other)

Completed 100 rows! Saving progress...
Completed 200 rows! Saving progress...
Completed 300 rows! Saving progress...
Completed 400 rows! Saving progress...
Completed 500 rows! Saving progress...
Completed 600 rows! Saving progress...
Completed 700 rows! Saving progress...
Completed 800 rows! Saving progress...
Completed 900 rows! Saving progress...
Completed 1000 rows! Saving progress...
Completed 1100 rows! Saving progress...
Completed 1200 rows! Saving progress...
Completed 1300 rows! Saving progress...
Completed 1400 rows! Saving progress...
Completed 1500 rows! Saving progress...
Completed 1600 rows! Saving progress...
Completed 1700 rows! Saving progress...
Completed 1800 rows! Saving progress...
Completed 1900 rows! Saving progress...
Completed 2000 rows! Saving progress...
Completed 2100 rows! Saving progress...
Completed 2200 rows! Saving progress...
Completed 2300 rows! Saving progress...
Completed 2400 rows! Saving progress...
Completed 2500 rows! Saving progress...
Completed