In [None]:
from tqdm import tqdm
import json
from llmclient import LLMClient, get_llm_response
#import pandas as pd

def retrieve(prompt: list, llm: LLMClient, is_reasoning=False, has_think=False, max_tries: int = 6):
    """ Retrieve a response from an LLM and parse it into a dict with an integer "Label".
    Failed attempts (empty reply, invalid JSON, missing/non-integer "Label", or an
    error raised by the call itself) are retried. Useful in APIs where there are rate limitations.
    ---
    Params:
    prompt: list: a list of inputs (i.e., [{"role": "user", "content": ...}])
    llm: the LLMClient object
    is_reasoning: is it a reasoning model? -- note, GPT-5 does _not_ qualify for this because it does not return its CoT
    has_think: does it have a </think> token? Only consulted when is_reasoning is True.
    max_tries: maximum number of attempts before giving up (the default, 6, matches the previous hard-coded limit).
    ---
    Returns:
    (response, failed): response is the parsed JSON object with response["Label"] cast to int.
    If every attempt failed, response is the fallback {"Label": 0} and failed is True.
    """
    for _ in range(max_tries):
        try:
            raw = get_llm_response(llm, prompt)
            # Drop Markdown code fences so that the remaining text can be parsed as JSON.
            raw = raw.replace("```json", "").replace("```", "")
            if raw == "":
                continue
            if is_reasoning:
                if has_think:
                    # Keep only what follows the last </think>, i.e. the final answer.
                    raw = raw.split("</think>")[-1].strip()
                else:
                    # No delimiter: take the text from the last "{" onwards as the answer object.
                    raw = "{" + raw.split("{")[-1].strip()
            parsed = json.loads(raw.strip())
            parsed["Label"] = int(parsed["Label"])
            return parsed, False
        except Exception:
            # Deliberately best-effort: any failure counts as one attempt.
            # `Exception` (not a bare `except`) so that KeyboardInterrupt / SystemExit
            # still stop a long-running evaluation loop.
            continue
    return {"Label": 0}, True


In [None]:
def get_eval_prompt(question: str, exemplars: list):
    """ Build the chat prompt used for evaluation.
    The result is: one system message, then one user/assistant message pair per
    exemplar (few-shot examples), then the user message with the actual question.
    ---
    Params:
    question: str: the full query (a [IMPLY] b [AND] c ... including the query)
    exemplars: list: a list of exemplars, each a dict with the keys "Question" and
               "Response". May be an empty list (or None) for zero-shot prompting.
               The list is not modified.
    ---
    Returns:
    list: messages of the form {"role": ..., "content": ...}
    """

    # NOTE: this is a plain string (not an f-string / .format template), so the
    # doubled braces below are sent to the model literally.
    system_prompt = """You are evaluating a subset of first-order logic. 
In this subset, conjunctions are given by [AND], implications by [IMPLY], and separations between clauses as [PERIOD]
You will be given Facts, and Rules. Based on these, determine the truth value of the Query.
Your final answer should be 0 (if the Query is false) or 1 (if true).

Give your answer in JSON with the following schema:
{{
"Label" (int): The label from the criterion. Only use the numbers 0 or 1.
}}
Only use the key "Label".
"""
    # Build the few-shot turns in a separate list: the previous code reset the
    # `exemplars` argument to [] before the loop, so no exemplar was ever used.
    shots = []
    for e in exemplars or []:
        t = e["Question"]
        # str() so that integer labels work as well as string labels.
        label = str(e["Response"])
        shots.append({"role": "user", "content": f"Question: {t}"})
        shots.append({"role": "assistant", "content": '{"Label": ' + label + '}'})
    user_prompt = f"Question: {question}"
    prompt = [{"role": "system", "content": system_prompt}]
    prompt += shots
    prompt += [{"role": "user", "content": user_prompt}]
    return prompt

In [None]:
def _load_json(path):
    """Read a UTF-8 JSON file and close the file handle afterwards."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Which dataset to evaluate. Previously both pairs of files were loaded and the
# second pair silently replaced the first; the default below keeps that outcome.
#   True  -> "NL" files (presumably natural language -- confirm)
#   False -> "Non-NL" (gibberish) files
USE_NL = False

if USE_NL:
    # NL
    dataset = _load_json("model_responses_final.json")
    exemplars = _load_json("exemplars.json")[:5]
else:
    # Non-NL
    dataset = _load_json("test_gibberish.json")
    exemplars = _load_json("exemplars_gibberish.json")[:5]

In [None]:
# Request parameters per model. The pairing is inferred from the order of the
# assignments this table replaces (each model used the `params` that followed it)
# -- confirm before switching models.
MODEL_CONFIGS = {
    "gpt-5-reasoning": {"max_tokens": 6000},
    "anthropic-claude-opus-4-1": {"max_tokens": 6000},
    "qwen-3-17b": {"max_completion_tokens": 4500},
    "qwen-25-vl7b": {"max_tokens": 128},
    "gpt-41-shortco-2025-04-14": {"max_tokens": 128},
}

# The model to evaluate in this run.
MODEL = "gpt-41-shortco-2025-04-14"
params = dict(MODEL_CONFIGS[MODEL])

llm = LLMClient(params, MODEL)
model_name = MODEL

is_zero_shot = True
outputs = dataset # For checkpointing; same list object as `dataset`, entries are updated in place
if is_zero_shot:
    exemplars = []
    mname = model_name + "_zero_shot"
else:
    mname = model_name + "_five_shot"

fails = 0
optimise_tokens = "gpt-5" in MODEL.lower() or "claude" in MODEL.lower() # Some LLMs are too slow
is_reasoning = False # Claude, Qwen-3 (not GPT-5)
has_think = False # Does it have a </think> output?

for entry in tqdm(outputs):
    query = entry["Question"]
    prompt = get_eval_prompt(query, exemplars)

    if optimise_tokens:
        # Token budget grows with the depth of the problem.
        # NOTE(review): this only rebinds the name `params`; `llm` was built above
        # from the earlier dict, so the budget chosen here does not appear to reach
        # the client. Confirm how LLMClient expects per-request limits to be set
        # (and whether "max_completion_tokens" is the right key for every model).
        depth = entry['Depth']
        if depth <= 18:
            params = {"max_completion_tokens": 5000}
        elif depth <= 25:
            params = {"max_completion_tokens": 7000}
        else:
            params = {"max_completion_tokens": 9000}
    response, failed = retrieve(prompt, llm, is_reasoning=is_reasoning, has_think=has_think)
    if failed: fails += 1
    entry.setdefault("Scores", {})[mname] = response["Label"]

    # We checkpoint in case of issues: one JSON object per line. The trailing
    # newline is required -- without it the objects run together on a single
    # line and the file cannot be read back line by line.
    # The file is opened in append mode, so entries from earlier runs accumulate;
    # remove tmp_scores.json before starting a fresh run.
    with open("tmp_scores.json", "a", encoding="utf-8") as f:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")

# Number of failed retrievals and their percentage (0.0 for an empty dataset).
print(fails, round(fails*100/len(outputs), 2) if outputs else 0.0)

In [None]:
def _read_json_records(path):
    """Read every JSON value stored back-to-back in a checkpoint file.

    Accepts one value per line as well as values written with no separator at
    all (e.g. ``{...}{...}``); whitespace and blank lines between values are skipped.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    decoder = json.JSONDecoder()
    records = []
    pos = 0
    while pos < len(text):
        if text[pos].isspace():
            pos += 1
            continue
        # raw_decode parses one value starting at `pos` and returns where it ended.
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)
    return records


# Consolidate the per-entry checkpoint into a single JSON array.
outputs = _read_json_records("tmp_scores.json")

with open("model_responses.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f, ensure_ascii=False)