In [1]:
from datasets import load_dataset
from coolprompt import PromptTuner
from langchain_ollama import ChatOllama

# Two Ollama-served chat models: the 3B and 1.5B instruct variants of Qwen2.5.
system_model = ChatOllama(model="qwen2.5:3b-instruct")
target_model = ChatOllama(model="qwen2.5:1.5b-instruct")

# PromptTuner receives each model under the parameter matching its role.
tuner = PromptTuner(
    system_model=system_model,
    target_model=target_model,
)


# Number of leading training examples used for the runs below.
NUM_EXAMPLES = 40

# SAMSum: each record pairs a "dialogue" with a reference "summary".
samsum = load_dataset("knkarthick/samsum")

# Slice the rows first, then pick the columns. Selecting a column first and
# slicing afterwards can materialise the entire column just to keep its head;
# slicing rows reads only the first NUM_EXAMPLES records, and does so once.
train_head = samsum["train"][:NUM_EXAMPLES]
dataset = train_head["dialogue"]  # model inputs
targets = train_head["summary"]  # reference outputs, aligned with `dataset`

# Four built-in criterion templates are available for the "llm_as_judge" metric.
# Pass the criterion name in lowercase, e.g. llm_as_judge_criteria="relevance".

# Accuracy: The answer is factually correct and contains no mistakes.
# It uses the right terms and numbers and does not introduce wrong information.

# Coherence: The answer is well structured and easy to follow.
# Ideas are in a logical order and there are no contradictions.

# Fluency: The answer is written in natural, correct language.
# Grammar, vocabulary and phrasing are clear and do not make the text hard to read.

# Relevance: The answer directly responds to the request and stays on topic.
# It does not add unnecessary or unrelated details.

# Example 1: a single built-in criterion ("relevance") for the judge metric.
result = tuner.run(
    # Task and data.
    task="generation",
    start_prompt="Summarize the text",
    dataset=dataset,
    target=targets,
    # Optimization method.
    method="hype",
    # Evaluation metric and its criterion.
    metric="llm_as_judge",
    llm_as_judge_criteria="relevance",
    # Remaining run options.
    generate_num_samples=20,
    verbose=2,
)

# Example 2: several criteria can be passed as a list; the final score is
# the mean of the individual criterion scores.

result2 = tuner.run(
    # Task and data.
    task="generation",
    start_prompt="Summarize the text",
    dataset=dataset,
    target=targets,
    # Optimization method.
    method="hype",
    # Evaluation metric and its criteria.
    metric="llm_as_judge",
    llm_as_judge_criteria=["relevance", "coherence"],
    # Remaining run options.
    generate_num_samples=20,
    verbose=2,
)



# Example 3: custom criteria for the LLM-as-judge metric.
# `custom` maps a criterion name to the prompt template used for it.

# The indentation inside the literal is part of the string itself.
# Placeholders used: {metric_ceil}, {request}, {response}
# (presumably filled in by the library at scoring time — TODO confirm).
creativity_template = """You will be given a response to a question.
    Rate the creativity on a scale from 1 to {metric_ceil}.
    Return ONLY a single number.

    Source: {request}
    Response: {response}

    Creativity score (number only):"""

custom = {"creativity": creativity_template}

# Run with the custom "creativity" criterion; its template is supplied
# through llm_as_judge_custom_templates.
result3 = tuner.run(
    # Task and data.
    task="generation",
    # NOTE(review): the start prompt is empty here, whereas the earlier runs
    # use "Summarize the text" — confirm this is intentional.
    start_prompt="",
    dataset=dataset,
    target=targets,
    # Optimization method.
    method="hype",
    # Evaluation metric, criterion name and the template backing it.
    metric="llm_as_judge",
    llm_as_judge_criteria=["creativity"],
    llm_as_judge_custom_templates=custom,
    # Remaining run options.
    generate_num_samples=20,
    verbose=2,
)

[2025-11-15 15:30:30,345] [INFO] [assistant.__init__] - Validating the target model
[2025-11-15 15:30:30,347] [INFO] [assistant.__init__] - Validating the system model
[2025-11-15 15:30:30,348] [INFO] [assistant.__init__] - PromptTuner successfully initialized
[2025-11-15 15:30:30,347] [INFO] [assistant.__init__] - Validating the system model
[2025-11-15 15:30:30,348] [INFO] [assistant.__init__] - PromptTuner successfully initialized
[2025-11-15 15:30:33,142] [INFO] [assistant.run] - Validating args for PromptTuner running
[2025-11-15 15:30:33,143] [INFO] [evaluator.__init__] - Evaluator successfully initialized with llm_as_judge metric
[2025-11-15 15:30:33,142] [INFO] [assistant.run] - Validating args for PromptTuner running
[2025-11-15 15:30:33,143] [INFO] [evaluator.__init__] - Evaluator successfully initialized with llm_as_judge metric
[2025-11-15 15:30:43,336] [INFO] [assistant.run] - === Starting Prompt Optimization ===
[2025-11-15 15:30:43,337] [INFO] [assistant.run] - Method: h