In [1]:
from martian_apart_hack_sdk import exceptions, judge_specs, martian_client, utils
from martian_apart_hack_sdk.models import judge_evaluation, llm_models, router_constraints
import pickle
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
import uuid
import time

from openai.types.chat import (
    chat_completion,
    chat_completion_message,
)

In [2]:
# Load the SDK configuration (it supplies api_url / api_key) and build the
# Martian client that the judge lookups and evaluations below go through.
config = utils.load_config()
client = martian_client.MartianClient(api_url=config.api_url, api_key=config.api_key)

In [3]:
# Ids of the judges to run. The position of an id in this list is also the
# position of that judge's score in each per-row score list.
JUDGE_IDS = [
    "harmlessness-judge",
    "privacy-judge",
    "factual-accuracy-judge",
    "prompt-faithfulness-relevance-judge",
    "calibration-uncertainty-judge",
    "bias-fairness-judge",
    "reasoning-consistency-judge",
    "discourse-coherence-judge",
    "conciseness-redundancy-judge",
    "style-formatting-judge",
]

In [4]:
# Fetch each judge once up front, keyed by its id, so every evaluation reuses
# the same judge objects instead of looking them up again.
JUDGES = {judge_id: client.judges.get(judge_id=judge_id) for judge_id in JUDGE_IDS}

In [5]:
def get_score(query, answer, judge):
    """Score one (query, answer) pair with a single judge.

    The pair is packaged as a chat exchange: a one-message user request plus
    a hand-built assistant completion carrying ``answer``. Both are handed to
    ``client.judges.evaluate``.

    Args:
        query: Text used as the content of the user message.
        answer: Text used as the content of the assistant message.
        judge: Judge object passed through to ``client.judges.evaluate``.

    Returns:
        The ``score`` attribute of the evaluation result.
    """
    request_payload = {
        "model": llm_models.GPT_4O_MINI,
        "messages": [{"role": "user", "content": query}],
    }

    assistant_message = chat_completion_message.ChatCompletionMessage(
        role="assistant",
        content=answer,
    )
    sole_choice = chat_completion.Choice(
        finish_reason="stop",
        index=0,
        message=assistant_message,
    )
    # id / created / model are hard-coded values wrapped around the supplied answer.
    # NOTE(review): the request names GPT_4O_MINI while the response says
    # "gpt-4o" - confirm whether the judge cares about this mismatch.
    synthetic_response = chat_completion.ChatCompletion(
        id="123",
        choices=[sole_choice],
        created=0,
        model="gpt-4o",
        object="chat.completion",
        service_tier=None,
    )

    evaluation = client.judges.evaluate(
        judge,
        completion_request=request_payload,
        completion_response=synthetic_response,
    )
    return evaluation.score

## Load data

In [6]:
# Pickled input data; the cells below read its "instruction" and "answer"
# columns via df.iterrows().
df_path = '9000.pkl'

# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open(df_path, 'rb') as pickle_file:
    df = pickle.load(pickle_file)


In [7]:
def evaluate_single_judge(args: Tuple[str, str, str, dict]) -> Tuple[int, float]:
    """
    Score one question/answer pair with one judge.

    The inputs arrive packed in a single tuple so the function can be mapped
    directly over a list of work items by an executor.

    Args:
        args: ``(question, answer, judge_id, judges)``, where ``judges`` is
            indexed by judge id to obtain the judge to run.
    Returns:
        ``(judge_index, score)``, where ``judge_index`` is the position of
        ``judge_id`` in ``JUDGE_IDS``.
    Raises:
        ValueError: if ``judge_id`` does not appear in ``JUDGE_IDS``.
    """
    question, answer, judge_id, judges = args
    position = JUDGE_IDS.index(judge_id)
    return position, get_score(question, answer, judges[judge_id])

In [8]:
def evaluate_with_backoff(args, max_retries: int = 5, initial_delay: float = 1.0):
    """
    Call evaluate_single_judge, retrying with exponential back-off on failure.

    Args:
        args: Tuple of (question, answer, judge_id, JUDGES), forwarded unchanged.
        max_retries: Total number of attempts (not additional retries). The
            exception from the final attempt propagates to the caller.
        initial_delay: Seconds to sleep after the first failure; the sleep
            doubles after every subsequent failure.

    Returns:
        Whatever evaluate_single_judge returns on the first successful attempt.
    """
    final_attempt = max_retries - 1
    pause = initial_delay
    for attempt in range(max_retries):
        try:
            outcome = evaluate_single_judge(args)
        except Exception:
            if attempt == final_attempt:
                # Out of attempts: let the caller see the real error.
                raise
            time.sleep(pause)
            pause *= 2
        else:
            return outcome
    # Only reached when max_retries <= 0 (the loop body never ran):
    # make one unguarded attempt.
    return evaluate_single_judge(args)

In [9]:
def evaluate_pair_parallel(question: str, answer: str, max_workers: int = None) -> List[float]:
    """
    Run every judge in ``JUDGE_IDS`` on one question/answer pair concurrently.

    Each judge call is submitted to a ``ThreadPoolExecutor`` and goes through
    ``evaluate_with_backoff``.

    Args:
        question: The query text
        answer: The response text
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` keeps the
            executor's own default.

    Returns:
        One score per judge, ordered like ``JUDGE_IDS``. A slot that receives
        no result keeps the initial value ``0.0``.
    """
    work_items = [(question, answer, jid, JUDGES) for jid in JUDGE_IDS]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Workers return (judge_index, score) pairs; collect them by index.
        score_by_position = dict(pool.map(evaluate_with_backoff, work_items))

    return [score_by_position.get(position, 0.0) for position in range(len(JUDGE_IDS))]

In [10]:
def eval_row(args):
    """Score a single row with all judges.

    Args:
        args: ``(idx, question, answer)`` tuple; ``idx`` is handed back
            untouched so the caller can match the result to its row.

    Returns:
        ``(idx, scores)``, where ``scores`` is the list returned by
        ``evaluate_pair_parallel``.
    """
    row_position, question, answer = args
    judge_scores = evaluate_pair_parallel(question, answer)
    return row_position, judge_scores

In [None]:
# Write a checkpoint every CHECKPOINT_EVERY completed rows.
CHECKPOINT_EVERY = 100


def _save_checkpoint(partial_results, batch_number):
    """Pickle the whole results list (unfinished rows are still None) to progress_<batch_number>.pkl."""
    with open(f'progress_{batch_number}.pkl', 'wb') as f:
        pickle.dump(partial_results, f)


# Prepare argument list
completed = 0
batch = 0

# One slot per row; a slot stays None until that row has been scored.
results = [None] * len(df)
tasks = [
    (i, row["instruction"], row["answer"])
    for i, (_, row) in enumerate(df.iterrows())
]

# Parallelize across rows. executor.map yields results in task order, so
# `completed` always counts a contiguous prefix of the rows.
try:
    with ThreadPoolExecutor() as executor:
        for i, scores in executor.map(eval_row, tasks):
            results[i] = scores
            completed += 1

            # Periodic checkpoint so a long run can be resumed or inspected.
            if completed % CHECKPOINT_EVERY == 0:
                print(f"Completed {completed} rows! Saving progress...")
                _save_checkpoint(results, batch)
                batch += 1
finally:
    # Rows finished since the last full batch would otherwise never reach
    # disk: either the row count is not a multiple of CHECKPOINT_EVERY, or
    # the run was interrupted / a row failed after exhausting its retries.
    if completed % CHECKPOINT_EVERY != 0:
        print(f"Completed {completed} rows! Saving final progress...")
        _save_checkpoint(results, batch)
        batch += 1

Completed 100 rows! Saving progress...
Completed 200 rows! Saving progress...
Completed 300 rows! Saving progress...
Completed 400 rows! Saving progress...
Completed 500 rows! Saving progress...
Completed 600 rows! Saving progress...
Completed 700 rows! Saving progress...
Completed 800 rows! Saving progress...
Completed 900 rows! Saving progress...
Completed 1000 rows! Saving progress...
Completed 1100 rows! Saving progress...
Completed 1200 rows! Saving progress...
Completed 1300 rows! Saving progress...
Completed 1400 rows! Saving progress...
Completed 1500 rows! Saving progress...
Completed 1600 rows! Saving progress...
Completed 1700 rows! Saving progress...
Completed 1800 rows! Saving progress...
Completed 1900 rows! Saving progress...
Completed 2000 rows! Saving progress...
Completed 2100 rows! Saving progress...
Completed 2200 rows! Saving progress...
Completed 2300 rows! Saving progress...
Completed 2400 rows! Saving progress...
Completed 2500 rows! Saving progress...
Completed

In [12]:
# Reload the checkpoint written as batch 40 (i.e. after 4,100 completed rows);
# this replaces the in-memory `results` list.
# NOTE: pickle.load can execute arbitrary code - only load files you produced.
with open('progress_40.pkl', 'rb') as checkpoint_file:
    results = pickle.load(checkpoint_file)


In [14]:
# Sanity check: the results list is created with one slot per DataFrame row.
len(results)

9000

In [15]:
# Row count of the input frame; it must equal len(results) for the column assignment below.
len(df)

9000

In [16]:
# Attach the per-row score lists positionally as a new column. This mutates df
# in place; rows that were never scored carry None.
df['scores'] = results

In [17]:
# Inspect the column: scored rows hold a list with one score per judge, unscored rows hold None.
df['scores']

uuid
8d76e6c7-c0d4-4a85-8b04-a0487951b9c4           [4, 3.8, 1.5, 2, 1.5, 3.5, 3, 3.2, 2.5, 3]
45845924-5386-428d-bf13-10b66ea048b4       [4, 4, 2.7, 2.8, 1.5, 3.5, 2.5, 3.5, 2.5, 2.5]
2b1db340-dd68-495d-bcf5-43f5479dfe10       [3.5, 4, 3.5, 2, 2.5, 3.8, 3.5, 3.5, 2.5, 2.5]
ea344eb0-64bc-4266-b970-0ebdacd2d661             [2, 3, 1, 0.9, 0.5, 2, 0.8, 1, 0.9, 0.9]
32758846-9750-4eac-8702-8c4dcf215409    [3.5, 3.8, 3.4, 3.8, 3.5, 3.5, 3.5, 3.5, 2.8, ...
                                                              ...                        
24eb39a7-ca81-4ce5-a54c-fe52e79ad0af                                                 None
92897240-2314-4ba4-8b5e-547599284272                                                 None
546a94e9-f1f0-4864-86d7-bb4605a3d28e                                                 None
3efcb49d-5840-467f-b991-a30894d59628                                                 None
e48486fe-b8ca-4066-b4ce-c856f747a17c                                                 None
Name:

In [21]:
# Keep only the rows that actually received scores (drops the None rows).
subset = df.loc[df['scores'].notna()]

In [26]:
# Persist only the scored rows. Like '9000.pkl', the file is named after its
# row count, and `subset` is the frame with 4,100 rows; the full `df` still
# has 9,000 rows, most of them with scores == None.
with open('4100.pkl', 'wb') as f:
    pickle.dump(subset, f)

In [24]:
# Number of rows that have scores.
len(subset)

4100

In [None]:
# Display the frame, including the newly attached 'scores' column.
df