# Prompt improved via OpenRouter

In [1]:
import os
import requests
from datasets import load_dataset
from evaluate import load as load_metric
from tqdm import tqdm
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
LETTERS = ["A", "B", "C", "D", "E"]
letter_map = {l: i + 1 for i, l in enumerate(LETTERS)}

def encode_labels(labels):
    return [letter_map.get(l, -1) for l in labels]

def build_prompt(example: dict) -> str:
    question = example["question"].strip().rstrip("?.!")
    choices = example["choices"]["text"]
    lines = [f"{question}?", "Please select the best answer from the options below:"]
    lines += [f"{l}. {t}" for l, t in zip(LETTERS, choices)]
    return "\n".join(lines)

def extract_letter(raw_answer: str, example: dict) -> str:
    raw_answer = raw_answer.strip().upper()
    for ch in raw_answer:
        if ch in LETTERS:
            return ch
    for idx, choice in enumerate(example["choices"]["text"]):
        if choice.lower() in raw_answer.lower():
            return LETTERS[idx]
    return "?"

In [3]:
with open('/home/ananasclassic/.secret/orak', 'r', encoding='utf-8') as f:
    API_KEY = f.read().strip()
client = OpenAI(base_url='https://openrouter.ai/api/v1', api_key=API_KEY)

def openrouter_response(prompt: str) -> str:
    completion = client.chat.completions.create(extra_body={}, model='meta-llama/llama-3.3-70b-instruct', messages=[{'role': 'user', 'content': prompt}])
    return completion.choices[0].message.content

def ask_llm(prompt: str) -> str:
    return openrouter_response(prompt)

In [4]:

EXPAND_MODEL   = "deepseek/deepseek-r1-0528-qwen3-8b"
COMPRESS_MODEL = "deepseek/deepseek-r1-0528-qwen3-8b"

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=API_KEY,
)

def expand_prompt(original_prompt: str) -> str:
    system = {
        "role": "system",
        "content": (
            "You are an expert prompt architect. Your job is to take a USER prompt and "
            "rewrite it to make it as detailed, unambiguous, and "
            "structured as possible. Add the clarifying questions you would ask an LLM, "
            "break the task down into subtasks, include style/format cues, evaluation criteria, and any useful context that can be gleaned (but never "
            "change the original intent)."
            "In response to a user prompt, write only an improved version of their prompt without any aditional text."
        )
    }
    user = {
        "role": "user",
        "content": original_prompt
    }
    resp = client.chat.completions.create(
        model=EXPAND_MODEL,
        messages=[system, user],
        temperature=0.3,
        max_tokens=2500,
        extra_body={
            "provider": {
                "order": ["parasail/bf16", "novita"],
            }
        }
    )
    return resp.choices[0].message.content.strip()

def compress_prompt(detailed_prompt: str) -> str:
    system = {
        "role": "system",
        "content": (
            "You are a prompt condenser. Your job is to distill the key information from a prompt and create a new prompt that is concise while retaining all "
            "critical instructions, variables, constraints, and evaluation criteria. "
            "Remove redundancy, make lists compact, and prefer numbered steps instead of long prose."
            "When a user prompts you, write only a condensed version of their prompt without any aditional text."
        )
    }
    user = {
        "role": "user",
        "content": detailed_prompt
    }
    resp = client.chat.completions.create(
        model=COMPRESS_MODEL,
        messages=[system, user],
        temperature=0.1,
        max_tokens=2500,
        extra_body={
            "provider": {
                "order": ["parasail/bf16", "novita"],
            }
        }
    )
    return resp.choices[0].message.content.strip()

def improve(original_prompt: str) -> str:
    detailed  = expand_prompt(original_prompt)
    improved  = compress_prompt(detailed)
    return improved + "\nYour answer must begin with the letter (A–E) corresponding to the correct option. This is important for the testing system."


In [5]:
N = -1
ds = load_dataset('commonsense_qa', 'default', split='validation')
metric = load_metric('accuracy')

subset = ds if N == -1 else ds.select(range(N))

def handle(example):
    prompt = build_prompt(example)
    if 'improve' in globals() and callable(improve):
        prompt = improve(prompt)
    raw = ask_llm(prompt)
    letter = extract_letter(raw, example)
    return letter, example['answerKey']

preds, refs = [], []
with ThreadPoolExecutor() as ex:
    futures = [ex.submit(handle, exm) for exm in subset]
    for f in tqdm(as_completed(futures), total=len(subset), desc='Evaluating'):
        letter, ref = f.result()
        preds.append(letter)
        refs.append(ref)

preds = encode_labels(preds)
refs = encode_labels(refs)
result = metric.compute(predictions=preds, references=refs)
print(f'Accuracy: {result["accuracy"]:.3%}')

Evaluating: 100%|██████████| 1221/1221 [15:24<00:00,  1.32it/s]

Accuracy: 68.141%



