In [1]:
# Setup
import os
from dotenv import load_dotenv
import together

load_dotenv()
together.api_key = os.getenv("TOGETHER_API_KEY")
MODEL_1 = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_2 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
# MODEL_2 = "Qwen-QwQ-32B"

In [2]:
# Generate model response
def generate_response(prompt, model):
    try:
        response = together.Complete.create(
            prompt=prompt,
            model=model,
            max_tokens=512,
            temperature=0.7
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        return f"[Error in generate_response]: {e}"

In [3]:
# Evaluation function using LLM as judge
def prompteffectiveness(user_prompt, question):
    prompteff = f"""
You are an expert evaluator tasked with analyzing a USER PROMPT and a corresponding QUESTION using the following 7 dimensions. Your goal is to critically assess the prompt's quality and effectiveness based on these dimensions, scoring each dimension qualitatively and quantitatively, and then provide a detailed summary.

You will evaluate the prompt and question across **7 dimensions**, rating each as:
- "Good": Strong performance in this aspect
- "Average": Moderate or partial success
- "Bad": Weakness or failure
- "N/A": Not applicable to this prompt/response

**Scoring Criteria**: A number between 0 and 10, where:
  - **Good**: 8-10 points (8=solid good, 9=very good, 10=excellent)
  - **Average**: 4-7 points (4-5=below average, 6-7=above average)
  - **Bad**: 0-3 points (0=complete failure, 1-2=poor, 3=weak)
  - **N/A**: No score assigned (excluded from overall average calculation)
  
Then, provide a **Summary** including:
**OverallScore: A number between 0 and 10 (rounded to two decimal places), calculated as the average of all dimension scores
  - Sum all numerical scores from applicable dimensions
  - Divide by number of applicable dimensions (excluding N/A ratings)
  - Round to 2 decimal place
**ApplicableDimensions: The number of evaluation dimensions applied (typically 7)
**PromptEffectiveness: 
  - Assign based on OverallScore:  
    - "Effective" if OverallScore ≥ 7.5  
    - "Partially Effective" if 4 ≤ OverallScore < 7.5  
    - "Ineffective" if OverallScore < 4
**Explanation: 
  - Concisely justify your ratings, referencing specific dimension scores,  
  - Include a brief analysis of prompt strengths and weaknesses,  
  - Note any relevant failure or success tags.

--- 

Use the following dimensions to guide your evaluation:
### 1. Purpose & Persona
**Definition**: 
- “Purpose” clarifies why the prompt exists (e.g., to summarize, to translate, to analyze).
- “Persona” defines for whom or from whose point of view the LLM should answer (e.g., “as a financial analyst,” “as a children’s book author”).
**Key Indicators**:
  - Does the User Prompt explicitly state its goal?
  - Does it assign a clear persona/role to the model?
**Ratings**:
  - Good(8-10): Both goal and persona are unambiguous.
  (Example: “Summarize the following research paper as if you were a science journalist.”)
  - Average(4-7): Either goal or persona is stated but one is vague.
  (Example: “Write a summary. You’re a journalist.” No domain specified.)
  - Bad(0-3): Neither purpose nor persona appears.
  (Example: “Tell me about X.”)
  - N/A: Explaining purpose/persona is unnecessary (e.g., a prompt that simply asks for a dictionary definition).

---

### 2. Requirements & Restrictions
**Definition**: Explicit instructions about what must and must not be included in the answer (e.g., “Limit to 200 words,” “Do not mention sensitive data,” “Use bullet points only”).
**Applicability**: If there are no constraints needed for a given task (e.g., “What is 2 + 2?”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Are there clear “must‐include” or “must‐avoid” guidelines?
**Ratings**:
  - Good(8-10): Explicit, unambiguous constraints (e.g., “No pronouns; only third‐person narrative,” “Include three illustrative examples”).
  - Average(4-7): Some constraints are present, but others are implied or incomplete (e.g., “Be concise” without a length target).
  - Bad(0-3): No requirements or restrictions at all, leaving the model free to wander.
  - N/A: The prompt’s nature makes constraints unnecessary (e.g., a simple “List the days of the week”).

---

### 3. Examples (Few‐Shot / Zero‐Shot)
**Definition**: Whether the prompt provides explicit example inputs and outputs to guide the model (e.g., zero‐shot, one‐shot, or few‐shot formatting).
**Applicability**: If the task does not benefit from example inputs/outputs (for instance, simple factual queries), mark as N/A with a brief rationale.
**Key Indicators**:
  - Are there sample question/answer pairs included (e.g., “Example: Q: … A: …”) that align with the intended task format?
**Ratings**:
  - Good(8-10): Supplies clear, directly relevant examples that demonstrate exactly how to structure inputs and expected outputs.
  - Average(4-7): Includes examples that are only partially aligned with the task or are too generic to serve as effective guidance.
  - Bad(0-3): No examples are provided or requested, even though examples would significantly clarify format or expectations.
  - N/A: Examples aren’t needed (e.g., a prompt asking, “What is the capital of France?”).

---

### 4. Context & Background
**Definition**: Additional information about domain, audience, or relevant facts that the LLM must know to answer properly.
**Applicability**: If the task requires no extra context (e.g., “Define photosynthesis”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt supply enough domain context?
  (Example: “The following text is from a 19th‐century medical journal.”)
  - Are audience considerations given?
  (Example: “Explain this to a high‐school student.”)
**Ratings**:
  - Good(8-10): Detailed context and audience description aligned with the task.
  - Average(4-7): Some context, but missing critical details, forcing assumptions.
  - Bad(0-3): No context; model is left guessing domain or audience.
  - N/A: Task is self‐contained and needs no additional context.

---

### 5. Instruction Structure
**Definition**: The explicit format of instructions: a single directive, multi‐part numbered steps, or a choice (“answer in bullet points vs. essay”).
**Applicability**: If structure is inherently trivial (e.g., “What is 5 × 7?”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Is the format clear (e.g., “Step 1: … Step 2: …”)?
  - Does it specify whether the answer should be direct, stepwise, multi‐sectioned, etc.?
**Ratings**:
  - Good(8-10): Well‐organized structure that matches the complexity (e.g., multi‐part instructions for multi‐stage tasks).
  - Average(4-7):  Some structure but potentially inconsistent or too generic (“Answer in two parts”).
  - Bad(0-3): No structural guidance—just a vague “Respond about X.”
  - N/A: No structure needed because the task is extremely simple.

---

### 6. Style & Sensitivity
**Definition**: Tone and register instructions (formal, friendly, technical), disclaimers (“I am not a lawyer”), and bias‐avoidance guidance.
**Applicability**: If style or sensitivity is irrelevant (e.g., “Calculate the area of a circle with radius 3cm.”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt ask for a specific tone (“use clinical tone,” “avoid gender bias”)?
  - Are disclaimers or sensitivity notices included when necessary?
**Ratings**:
  - Good(8-10): Explicit style/tone and sensitivity cues.
  - Average(4-7):  Partial style guidance (e.g., only “be professional” without elaboration).
  - Bad(0-3):  No style or sensitivity guidance, even when sensitive content is expected.
  - N/A: Style considerations are irrelevant for the given prompt.
  
---
  
### 7. Prompt Conciseness
**Definition**: “Prompt Conciseness” refers to how efficiently the prompt communicates its intent, constraints, and expectations-using the fewest necessary words without sacrificing clarity or precision. A concise prompt avoids redundancy, filler words, and overly complex sentence structures while still being fully interpretable by the model. 
**Applicability**: If style or sensitivity is irrelevant (e.g., “Calculate the area of a circle with radius 3cm.”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt clearly convey all required instructions in a compact form?
  - Are there unnecessary qualifiers, repetitions, or verbose phrasings?
**Ratings**:
  - Good(8-10): The prompt is compact and clearly communicates intent, requirements, and role without unnecessary elaboration.
  - Average(4-7):  The prompt conveys the main idea but includes minor redundancies or could be made more direct without losing clarity. 
  - Bad(0-3):  The prompt is overly wordy, vague, or includes irrelevant information that obscures the main task. 
  - N/A: Prompt Conciseness are irrelevant for the given prompt.

---

### Core Evaluation Questions:

1. Did the **USER PROMPT** satisfy all critical expectations and requirements posed by the **QUESTION**?
2. If the prompt failed to meet expectations, was this due to the question itself being poorly constructed, ambiguous, or lacking proper scaffolding?
3. Alternatively, if the prompt was shallow, incomplete, or ineffective despite the question being well-constructed, then the prompt is ineffective because its limitations were exposed.
  
---

### Explanation Instructions:

Provide a detailed explanation justifying the overall evaluation score (0 to 10):

- Describe how the prompt performed against each applicable dimension.
- Clearly highlight strengths and weaknesses.
- Identify specific failure tags if the score is below 9, explaining why those failures occurred.
- Reference all relevant dimensions and failure reasons explicitly.
- For strong responses (score 9–10), emphasize the well-handled dimensions and why they were effective.

This explanation should provide a clear rationale that helps prompt engineers and developers understand what worked, what didn’t, and how to improve.

---

Return your evaluation as JSON using this exact format:
{{
  "Ratings": {{
    "Purpose & Persona": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Requirements & Restrictions": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Examples (Few‐Shot / Zero‐Shot)": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Context & Background": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Instruction Structure": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Style & Sensitivity": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Prompt Conciseness": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}}
  }},
  "Summary": {{
    "OverallScore": <calculated average of all non-null or non-N/A scores>,
    "ApplicableDimensions": <count of dimensions with non-null scores>,
    "PromptEffectiveness": "Effective" | "Partially Effective" | "Ineffective",
    "Explanation": "concise yet detailed explanation referencing dimension ratings, strengths, weaknesses, and failure tags"
  }}
}}


USER PROMPT:
{user_prompt}

QUESTION:
{question}
"""

    try:
        evaluation = together.Complete.create(
            prompt=prompteff,
            model=MODEL_2,
            max_tokens=512,
            temperature=0.3
        )
        return evaluation['choices'][0]['text'].strip()
    except Exception as e:
        return f"[Error in judge_response]: {e}"

In [4]:
# Test with 3-4 real-world examples
examples = [
    {
        "prompt": "Explain how climate change affects marine ecosystems."
    }
    # {
    #     "prompt": """You are a licensed neurologist. Explain the difference between a transient ischemic attack (TIA) and an ischemic stroke. 
    #                Use technical language suitable for a graduate medical textbook. Then, present your explanation as a markdown-formatted table comparing symptoms, duration, causes, and treatments. Keep the entire response under 200 words.""",
    # },
    # {
    #     "prompt": """You are a senior economist. First, summarize the current global inflation trends using only publicly available 2023 data.
    #                 Then, identify 2 major causes of inflation by region (e.g., Europe vs. Asia). 
    #                 If there is not enough public data on a region, state 'Insufficient data.' Avoid speculating or guessing.
    #                 Conclude with a 3-bullet policy recommendation tailored for central banks.
    #                 """,
    # },
    # {
    #     "prompt": """Below are 2 examples of how to write an engaging and humorous tweet about programming. Follow the same style and write a third one.

    #             Example 1: Debugging is like being the detective in a crime movie where you are also the murderer.

    #             Example 2: My code doesn’t work and I have no idea why. My code works and I have no idea why. The circle of life.

    #             Your turn:""",
    # },
    # {
    #     "prompt": """
    #                 You are an AI safety researcher. A new model shows signs of deceptive behavior. 

    #                 First, generate three hypotheses that might explain this behavior (e.g., misaligned reward function, training leakage, adversarial prompt attack).
    #                 Then evaluate the pros and cons of each.
    #                 Finally, recommend the most plausible cause and justify your choice with a step-by-step explanation.
    #                 """,
    # },
    # {
    #     "prompt":  """
    #                 Act as a contract lawyer. Review the following clause and highlight three potential legal risks in bullet points. 
    #                 Then, rewrite the clause in clearer, legally robust language. Keep your rewrite under 80 words.

    #                 Clause: “The service provider is not liable for any consequences, intended or unintended, regardless of jurisdiction, unless otherwise mentioned.”
    #                 """,
    # }
   
]

for i, ex in enumerate(examples):
    print(f"\n=== EXAMPLE {i+1} ===")
    print("Prompt:", ex["prompt"])
    model_output = generate_response(ex["prompt"], MODEL_1)
    print("\nModel Response:", model_output)
    evaluation = judge_response(ex["prompt"], model_output)
    print("\nJudgment:", evaluation)


=== EXAMPLE 1 ===
Prompt: Explain how climate change affects marine ecosystems.


  response = together.Complete.create(



Model Response: Climate change affects marine ecosystems in several ways. First, ocean temperatures are rising, which can lead to coral bleaching, where corals expel the algae living in their tissues, causing them to turn completely white. This makes it difficult for corals to survive, as they rely on the algae for food and protection.

Second, ocean acidification is occurring as the ocean absorbs more carbon dioxide from the atmosphere. This causes the pH of the ocean to decrease, making it more acidic. This can harm shell-building animals, such as oysters, clams, sea urchins, and corals, as it becomes more difficult for them to build and maintain their shells and skeletons.

Third, sea level rise is causing coastal habitats, such as marshes and mangroves, to be inundated with saltwater. This can lead to the death of these important habitats, which provide nurseries for many marine species and help to protect coastlines from storms and erosion.

Fourth, changes in ocean currents and 

NameError: name 'judge_response' is not defined

In [None]:
# Evaluation function using LLM as judge
def judge_response(user_prompt, model_response):
    judge_prompt = f"""
You are an expert LLM evaluator. Your task is to critically analyze a model's response to a user-provided prompt and determine whether the prompt was effective at eliciting a high-quality, aligned, and robust response from the model.

You will evaluate the prompt and model response across **8 dimensions**, rating each as:
- "Good": Strong performance in this aspect
- "Average": Moderate or partial success
- "Bad": Weakness or failure
- "N/A": Not applicable to this prompt/response

Then, provide a **Summary** including:
**OverallScore: A number between 0 and 10 (rounded to two decimal places), calculated as the average of all dimension scores
  - Sum all numerical scores from applicable dimensions
  - Divide by number of applicable dimensions (excluding N/A ratings)
  - Round to 2 decimal place
**ApplicableDimensions: The number of evaluation dimensions applied (typically 14)
**PromptEffectiveness: "Effective", "Partially Effective", or "Ineffective"
**Score**: A number between 0 and 10, where:
  - **Good**: 8-10 points (8=solid good, 9=very good, 10=excellent)
  - **Average**: 4-7 points (4-5=below average, 6-7=above average)
  - **Bad**: 0-3 points (0=complete failure, 1-2=poor, 3=weak)
  - **N/A**: No score assigned (excluded from overall average calculation)
**Explanation: Concise justification referencing specific dimension scores, including a brief analysis of strengths and weaknesses. For scores below 7.0, identify primary failure tags and explain their impact on overall effectiveness.
**FailureTags (if applicable): A list of failure modes such as "Omission", "Ambiguity", "Conflict", "Overconstraint", "Shallow Reasoning", "Domain Leak", "Fabrication Risk", etc., based on dimensions scoring below 6.
**StrengthAreas: List of dimension names that received scores ≥ 8.0
**ImprovementAreas: List of dimension names that received scores < 6.0

--- 

Use the following dimensions to guide your evaluation:

---

### 1. Nested / Multi‐Step Instructions
**Definition**: The prompt requires the model to perform multiple subtasks in a specified order (e.g., “Step 1: …, Step 2: …, Step 3: …”).
**Key Indicators**:
  - Are there explicit numbered or indented steps?
  - Does one instruction logically precede another?
**Ratings**:
  - Good(8-10): Clearly numbered or indented steps with unambiguous ordering.
  - Average(4-7): Multiple instructions exist but ordering is implied rather than explicit (e.g., “First do X then do Y” without numbering).
  - Bad(0-3): No multi‐step requirement; all tasks are lumped into a single instruction.
  - N/A: The prompt does not require multiple subtasks (e.g., a straightforward factual query).

---

### 2. Conflicting Instructions
**Definition**: The prompt imposes two or more constraints that cannot all be satisfied simultaneously (e.g., “Write a summary in under 50 words but include five examples”).
**Key Indicators**:
  - Do two (or more) requirements directly contradict each other?
  - Is the prompt forcing the model to choose or partially satisfy conflicting directives?
**Ratings**:
  - Good(8-10): Clear, intentional conflict that tests the model’s ability to recognize and address the contradiction.
  - Average(4-7): Minor or ambiguous conflict that may or may not contradict in practice.
  - Bad(0-3): No conflicting constraints or conflicts that are accidental and trivial.
  - N/A: The prompt has no conflicting requirements (e.g., simply “List all prime numbers under 100”).

---

### 3. Inter‐Dependent Constraints
**Definition**: One instruction becomes active only if another condition is met (e.g., “If the text contains more than ten technical terms, define each; otherwise, just list them”).
**Key Indicators**:
  - Is there an explicit “if‐then” or “only when” dependency?
  - Does satisfying the second instruction depend on the first instruction’s outcome?
**Ratings**:
  - Good(8-10): Unambiguous, correctly nested conditions with clear triggers and actions.
  - Average(4-7): Conditional instruction exists but is loosely defined or lacks full context.
  - Bad(0-3): No inter‐dependent constraints or conditions are too vague for the model to follow.
  - N/A: No conditional instructions present (e.g., “Translate this sentence into Spanish”).

---

### 4. Edge‐Case Handling
**Definition**: The prompt instructs the model to explicitly admit when required information is missing or when an edge case arises (e.g., “If the input lacks dates, respond ‘Insufficient data’”).
**Key Indicators**:
  - Does the prompt say “If X is absent or unclear, do not guess”?
  - Are instructions provided for how to behave if required data is missing?
**Ratings**:
  - Good(8-10): Explicitly states how to handle missing or ambiguous input.
  - Average(4-7): Encourages accuracy but does not specify what to do if something’s missing.
  - Bad(0-3): No edge‐case guidance; model is expected to fill gaps arbitrarily.
  - N/A: The prompt’s context guarantees all information is present (e.g., “Compute the sum of the following five numbers”).

---

### 5. Ambiguity Resolution
**Definition**: The prompt introduces ambiguous terms or references and instructs the model to clarify or handle them (e.g., “The word ‘bank’ could mean financial institution or riverbank—state both interpretations”).
**Key Indicators**:
  - Are ambiguous words or phrases flagged?
  - Does the prompt ask “If ambiguous, explain interpretations”?
**Ratings**:
  - Good(8-10): Clearly marks ambiguous elements and instructs the model how to resolve them.
  - Average(4-7): Ambiguity exists but no explicit instruction on how to handle it.
  - Bad(0-3): Either no ambiguity or ambiguity that is not addressed by the prompt.
  - N/A: No ambiguity in the prompt (e.g., “List the first ten Fibonacci numbers”).

---

### 6. Domain Fusion
**Definition**: The prompt fuses two or more specialized domains into one task (e.g., “Analyze the legal contract’s economic impact using statistical models”).
**Key Indicators**:
  - Are at least two distinct fields explicitly mentioned?
  - Does the prompt require coherent integration of knowledge from both domains?
**Ratings**:
  - Good(8-10): Explicitly names and requires integration of both domains.
  - Average(4-7): Mentions two fields, but one is peripheral or not fully integrated.
  - Bad(0-3): Only one domain addressed or domains listed without requiring fusion.
  - N/A: Task focuses on a single domain (e.g., “Explain basic thermodynamics”).

---

### 7. Multi‐Source/Modal Analysis
**Definition**: The model must reason over multiple distinct inputs (e.g., two text passages, text plus described visuals) and synthesize across them.
**Key Indicators**:
  - Are there at least two distinct “sources” described?
  - Does the prompt instruct the model to integrate information across those sources?
**Ratings**:
  - Good(8-10): Clearly delineates sources and instructs how to synthesize them.
  - Average(4-7): Multiple inputs exist but integration instructions are implied rather than explicit.
  - Bad(0-3): Only one source used or no synthesis requirement.
  - N/A: Task provides a single input (e.g., “Summarize this paragraph”).

---

### 8. Hypothetical / Counterfactual Reasoning
**Definition**: The prompt poses a scenario contrary to known facts or purely hypothetical (e.g., “If gravity were inverted, describe consequences for river flow”).
**Key Indicators**:
  - Does it present an “as if” scenario explicitly defying reality?
  - Are you instructed to treat that scenario as true and reason within it?
**Ratings**:
  - Good(8-10): Clear statement of the counterfactual and instructions on how to explore implications.
  - Average(4-7): Hypothetical is given but lacks guidance on depth or scope.
  - Bad(0-3): Either no hypothetical component or trivial (“Imagine a purple cat”).
  - N/A: No hypothetical scenario—task uses real‐world facts only.

---

Evaluate the prompt and the model response using these questions:

1. Did the model satisfy all critical expectations of the prompt?
2. If the model failed, was it because the prompt was poorly constructed, overly ambiguous, or lacked proper scaffolding?
3. Alternatively, if the model output was shallow or incorrect despite a strong prompt, then the prompt was effective because it exposed model limitations.
4. Does It Break Nested / Multi-Step Instructions?
  Failure Modes:
  - Omission: The model skips one or more required steps.
  - Wrong Order: Completes steps out of sequence.
  - Incomplete Detail: Does not give enough detail for a subtask labeled “explain in detail.”
  Good / Average / Bad Criteria:
  - Good(8-10): Executes all steps in correct order, with appropriate detail.
  - Average(4-7): Attempts each step but has minor ordering or detail errors.
  - Bad(0-3): Omits steps entirely or confuses the ordering.
5. Does It Break Conflicting Instructions?
  Failure Modes:
  - Blind Obedience: Tries to satisfy both contradictory instructions fully (and thus fails both).
  - Undisclosed Choice: Picks one constraint without acknowledging the conflict.
  - Refusal Without Explanation: Says “I can’t” but does not explain why.
  Good / Average / Bad Criteria:
  - Good(8-10): Explicitly recognizes the conflict, explains trade-offs, and chooses or partially satisfies with justification.
  - Average(4-7): Acknowledges conflict but gives shallow or incomplete justification.
  - Bad(0-3): Fails to notice conflict or gives a nonsensical response.
6. Does It Break Inter-Dependent Constraints?
  Failure Modes:
  - Misconditional: Applies the second constraint unconditionally or fails to check the first.
  - Partial Compliance: Applies conditions incorrectly.
  Good / Average / Bad Criteria:
  - Good(8-10): Checks the first condition, then applies the second exactly as instructed.
  - Average(4-7): Checks condition but misinterprets threshold or does only partial.
  - Bad(0-3): Ignores dependency entirely or applies condition incorrectly.
7. Does It Break Edge-Case Handling?
  Failure Modes:
  - Fabrication: Hallucinates missing data instead of admitting it’s missing.
  - Incorrect Catch: Says “insufficient data” when data actually exists.
  Good / Average / Bad Criteria:
  - Good(8-10): Precisely follows instructions—if data absent, admits it; if present, uses it.
  - Average(4-7): Tries to follow but sometimes incorrectly identifies a present/absent case.
  - Bad(0-3): Always guesses, never admits missing information, or never uses available data.
8. Does It Break Ambiguity Resolution?
  Failure Modes:
  - Single Interpretation: Chooses one meaning without acknowledging alternatives.
  - Ignores Request: Fails to clarify ambiguous term.
  Good / Average / Bad Criteria:
  - Good(8-10): Lists all reasonable interpretations, then justifies which one fits context.
  - Average(4-7): Mentions ambiguity but does not fully explore both interpretations.
  - Bad(0-3): Ignores ambiguity and picks an interpretation arbitrarily.
9. Does It Break Domain Fusion?
  Failure Modes:
  - Domain Drop: Only addresses one domain, ignoring the other.
  - Shallow Fusion: Mentions the second domain only superficially.
  Good / Average / Bad Criteria:
  - Good(8-10): Integrates both domains deeply and coherently.
  - Average(4-7): Partially integrates second domain, but one domain dominates.
  - Bad(0-3): One domain is entirely missing from the response.
10. Does It Break Multi-Source/Modal Analysis?
  Failure Modes:
  - Single-Source Focus: Uses only one input, ignoring others.
  - Surface Integration: Mentions multiple inputs but does not synthesize.
  Good / Average / Bad Criteria:
  - Good(8-10): Synthesizes data from all sources; draws connections across them.
  - Average(4-7): Uses multiple inputs but analysis lacks depth or coherence.
  - Bad(0-3): Ignores all but one source, or parrots without synthesis.
11. Does It Break Hypothetical / Counterfactual Reasoning?
  Failure Modes:
  - Reality Check: Refuses or reaffirms real-world facts instead of engaging in the scenario.
  - Shallow Answer: Acknowledges hypothetical but does not explore implications.
  Good / Average / Bad Criteria:
  - Good(8-10): Treats the hypothetical as true, reasoning logically through its consequences.
  - Average(4-7): Engages somewhat but overlooks certain implications or reverts to real-world assumptions.
  - Bad(0-3): Refuses or sidesteps the hypothetical entirely.
  
---

Generate Explanation of Evaluation (0–10) with Failure Tags and Dimension-Based Analysis
You are an evaluator assessing how well a model handled a complex prompt. Based on the score (from 0 to 10), write a detailed explanation that:
1. Explains Why the Response Got This Score
  - Describe how the model handled or failed each applicable complexity dimension.
  - Clearly justify where the response was strong, and where it broke—i.e., failed to satisfy intended challenges.
  - If the score is below 9, mention specific failure tags and explain why they apply.

2. Use These Dimensions and Failure Modes
Below are the 8 core dimensions you must consider. For each one, if it broke, use the relevant failure mode(s) and incorporate the corresponding failure tag(s) in your explanation.

2.1. Nested / Multi-Step Instructions
Failure Modes:
  - Omission: Skips required step(s)
  - Wrong Order: Steps executed out of sequence
  - Incomplete Detail: Step lacks required elaboration
Failure Tag: Missed nested instruction

2.2. Conflicting Instructions
Failure Modes:
  - Blind Obedience: Tries to satisfy both conflicts fully
  - Undisclosed Choice: Picks one side silently
  - Refusal Without Explanation: Says “can’t” with no reason
Failure Tag: Conflict evasion


2.3. Inter-Dependent Constraints
Failure Modes:
  - Misconditional: Fails to apply logic in correct order
  - Partial Compliance: Misinterprets conditions
Failure Tag: Conditional failure

2.4. Edge-Case Handling
Failure Modes:
  - Fabrication: Hallucinates data instead of admitting absence
  - Incorrect Catch: Says data is missing when it's present
Failure Tag: Edge case miss

2.5. Ambiguity Resolution
Failure Modes:
  - Single Interpretation: Picks one meaning without context
  - Ignores Request: Fails to clarify ambiguous terms
Failure Tag: Ambiguity unresolved

2.6. Domain Fusion
Failure Modes:
  - Domain Drop: Ignores one domain entirely
  - Shallow Fusion: Surface-level integration
Failure Tag: Domain omission

2.7. Multi-Source / Modal Analysis
Failure Modes:
  - Single-Source Focus: Ignores other inputs
  - Surface Integration: Uses but doesn't connect sources
Failure Tag: Source integration failure

2.8. Hypothetical / Counterfactual Reasoning
Failure Modes:
  - Reality Check: Rejects scenario, defaults to facts
  - Shallow Answer: Ignoled implication depth
Failure Tag: Shallow hypothetical

3. Format Your Explanation Like This
  - Reference all relevant dimensions and failure tags.
  - If the response is strong (score 9–10), describe which dimensions were handled well and why.
Goal
The explanation should provide a clear rationale for the evaluation score, tied to concrete dimension-based reasoning. It must help prompt engineers or model developers understand what failed, why it failed, and how it can improve.

---

Return your evaluation as JSON using this exact format:
{{
  "Ratings": {{
    "Nested / Multi‐Step Instructions": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Conflicting Instructions": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Inter‐Dependent Constraints": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Edge‐Case Handling": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Ambiguity Resolution": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Domain Fusion": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Multi‐Source/Modal Analysis": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}},
    "Hypothetical / Counterfactual Reasoning": {{"Qualitative": "Good/Average/Bad/N/A", "Score": 0-10 or null}}
  }},
  "Summary": {{
    "OverallScore": <calculated average of all non-null scores>,
    "ApplicableDimensions": <count of dimensions with non-null scores>,
    "PromptEffectiveness": "Effective" | "Partially Effective" | "Ineffective",
    "Score": <0–10>,
    "Explanation": "The model failed to follow the nested instructions completely, skipping step 2 and reversing the order of steps 3 and 4, leading to a 'Missed nested instruction' failure. It also ignored a conditional requirement, triggering a 'Missed constraint' tag. While the model did recognize the ambiguous term, its resolution was shallow, earning a score of 5 for incomplete ambiguity handling. The response also lacked integration across both required domains, qualifying as 'Shallow reasoning'. These issues affected the Nested Instruction, Constraint Handling, Ambiguity Resolution, and Domain Fusion dimensions. The overall score of 5 reflects multiple breakdowns with some partial effort.",
    "FailureTags": ["Omission", "Ambiguity", "Misconditional", "Conflict", "Oversimplification", ...],
    "StrengthAreas": ["List dimensions rated 8+ if any"],
    "ImprovementAreas": ["List dimensions rated below 6 if any"]
  }}
}}


USER PROMPT:
{user_prompt}

MODEL RESPONSE:
{model_response}
"""

    try:
        evaluation = together.Complete.create(
            prompt=judge_prompt,
            model=MODEL_2,
            max_tokens=512,
            temperature=0.3
        )
        return evaluation['choices'][0]['text'].strip()
    except Exception as e:
        return f"[Error in judge_response]: {e}"

In [None]:
def evaluate(user_prompt, question, model, model_response):

    # Call the check_complexity_criteria function
    # print("\n Prompt:", user_prompt)
    # print("\nModel Response:", model_response)
    result = check_complexity_criteria(user_prompt, question, MODEL_2)
    try:
        json_str = re.search(r"\{.*\}", result, re.DOTALL).group()
        parsed_result = json.loads(json_str)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", result)
        return

    print("\n=== Complexity check Result ===")
    print(json.dumps(parsed_result, indent=4))
    
    score1 = parsed_result.get("Score")
    dimensions = parsed_result.get("Dimensions in prompt")

    print("\nScore1:", score1)
    print("Dimensions present in prompt:", dimensions)
        
    # Call the judge_response function
    evaluation = judge_response(user_prompt, model_response, len(dimensions))
    # cleaned = evaluation.replace("{{", "{").replace("}}", "}")
    try:
        json_str1 = re.search(r"\{.*\}", evaluation, re.DOTALL).group()
        parsed_result1 = json.loads(json_str1)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", evaluation)
        return
    # def extract_last_json_block(text):
    #     stack = []
    #     start = None
    #     last_valid_json = None

    #     for i, char in enumerate(text):
    #         if char == '{':
    #             if not stack:
    #                 start = i
    #             stack.append('{')
    #         elif char == '}':
    #             if stack:
    #                 stack.pop()
    #                 if not stack and start is not None:
    #                     candidate = text[start:i+1]
    #                     try:
    #                         json.loads(candidate)
    #                         last_valid_json = candidate
    #                     except json.JSONDecodeError:
    #                         continue
    #     return last_valid_json
    # json_str1 = extract_last_json_block(cleaned)
    # if json_str1:
    #     parsed_result1 = json.loads(json_str1)
    # else:
    #     print("Failed to extract JSON.")
    #     print("Raw result:\n", cleaned)
    #     return


    print("\n=== Complexity Evaluation Result ===")
    print(json.dumps(parsed_result1, indent=4))
    
    score2 = parsed_result1.get("OverallScore")
    print("\nScore2:", score2)
    
    if score1 is not None and score2 is not None:
        net_score = (score1 + score2) / 2
        print("\nNet_score:", net_score)
        if net_score <= 0.5:
            print("\nPrompt is not challenging the model.")
        else:
            print("\nPrompt is challenging the model.")
    else:
        print("\nCannot compute net score: one or both scores are missing.")
      
    # Call the prompteffectiveness function    
    result2 = prompteffectiveness(user_prompt, question)
    try:
        json_str3 = re.search(r"\{.*\}", result2, re.DOTALL).group()
        parsed_result3 = json.loads(json_str3)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", result2)
        return

    # Print result summary
    print("\n=== Effectiveness check Result ===")
    print(json.dumps(parsed_result3, indent=4))  
    
    ans1 = parsed_result3.get("OverallScore") 
    print("\nPromptEffectivenessScore:", ans1) 
    ans2 = parsed_result3.get("PromptEffectiveness") 
    print("\nPromptEffectiveness:", ans2) 
    

            
       