# Usage
1. Config your Google API key in the secrets
2. Run the first two cells to initialize the environment
3. Run either the third or fourth cell to initialize the personas and the evaluation criteria
4. Run the remaining cells to execute the evaluation and visualize the results

In [None]:
!pip install -q -r https://raw.githubusercontent.com/AI-Agents-Prompts-to-Multi-Agent-Sys/Quantitative-Eval/master/requirements.txt

In [1]:
import os
import json
import operator
import re
from typing import TypedDict, Annotated, List

import pandas as pd
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.constants import END, START
from langgraph.graph import StateGraph
from tqdm import tqdm
from tenacity import retry, stop_after_attempt

# Load environment variables (GOOGLE_API_KEY should be set either in .env file or in the secrets)
try:
    from google.colab import userdata
    os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
except ImportError:
    load_dotenv()
except KeyError:
    raise KeyError("Please set the GOOGLE_API_KEY in your secrets.")

# LLM config
# At here you can change the model, tweak its parameters, or even use different LLM provider
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=0.7)

Use One of the following two cells to initialize the personas and the evaluation criteria.

In [None]:
# Initialize personas and criteria by a llm
# To be implemented

In [2]:
# Initialize personas and criteria manually

# What to evaluate
EVALUATION_SUBJECT = "band"

# List of items to evaluate
ITEMS = [
    "The Beatles", "Led Zeppelin", "Pink Floyd", "Queen", "The Rolling Stones",
    "Metallica", "Megadeth", "Black Sabbath", "Iron Maiden", "Tool"
]

# Persona definitions
# Each persona has generates a call to the LLM, so please be aware of the rate limits of your LLM provider
# For Gemini, the rate limit is 10 calls per minute for 2.5 flash models and 5 calls per minute for 2.5 pro models.
PERSONAS = {
    "metalhead": "You're in your 30s, a lifelong metal fan. You value power, aggression, instrumental mastery, and complexity. You dismiss pop and overproduced music as shallow.",
    "popstar": "You're in your 20s, immersed in social media culture. You love global accessibility, emotional resonance, and catchy choruses. You believe great bands bring joy and unity.",
    "boomer": "You're in your 70s. You grew up during the golden age of rock and believe greatness is rooted in legacy, songwriting, and timeless appeal. Newer music feels synthetic to you.",
    "genz": "You're a teenager, online-native, and value diversity, identity, and innovation in music. You're drawn to bands that say something real or break genre rules.",
    "indie": "You're in your 30s, an art-school type who craves authenticity, emotion, and underground cool. You dislike commercial polish and love expressive weirdness.",
}

# Criteria for evaluation
CRITERIA = {
    "Musical Innovation": "Pioneering ideas, new sounds, genre blending.",
    "Cultural Impact": "Broader societal influence, pop culture penetration.",
    "Lyrical or Thematic Depth": "Narrative richness, philosophical weight, relatability.",
    "Technical Proficiency": "Musical complexity, virtuosity, performance execution.",
    "Live Performance Strength": "Energy, presence, crowd connection on stage.",
    "Legacy & Longevity": "Enduring influence across generations and artists."
}

# Instructions/background information for the personas
PERSONA_ROLE = "music critic"
INSTRUCTION = "You have been asked to evaluate the greatness of 21 historically significant bands across genres including rock, metal, pop, and progressive."


# Prompt template
def make_prompt(persona_description):
    return f"""You are a {PERSONA_ROLE} with the following worldview:

{persona_description}

{INSTRUCTION}

Evaluate each {EVALUATION_SUBJECT} based on the following {len(CRITERIA)} criteria, scoring from 1 (low) to 5 (high):

{"".join(f"{key}: {value}{chr(10)}" for key, value in CRITERIA.items())}
Here are the {EVALUATION_SUBJECT}s to evaluate:
{chr(10).join('- ' + item for item in ITEMS)}

Please respond ONLY in the following strict JSON format:

```json
{{
  "ratings": [
    {{
      "item": "the corresponding {EVALUATION_SUBJECT} name here"{"".join(f',{chr(10)}      "{criteria}": int' for criteria in CRITERIA)}
    }},
    // ...More {EVALUATION_SUBJECT} evaluations here
  ],
  "justification": "Your paragraph explaining the ratings here.",
  "ranking": ["{EVALUATION_SUBJECT}1", "{EVALUATION_SUBJECT}2", ..., "{EVALUATION_SUBJECT}{len(ITEMS)}"]
}}
```

- The ratings list must include all {len(ITEMS)} {EVALUATION_SUBJECT}s.
- The ranking list must be in your personal order (1st to {len(ITEMS)}st).
- Do not include any commentary outside the JSON block.
"""

print("Example prompt:")
print(make_prompt(list(PERSONAS.values())[0]))


Example prompt:
You are a music critic with the following worldview:

You're in your 30s, a lifelong metal fan. You value power, aggression, instrumental mastery, and complexity. You dismiss pop and overproduced music as shallow.

You have been asked to evaluate the greatness of 21 historically significant bands across genres including rock, metal, pop, and progressive.

Evaluate each band based on the following 6 criteria, scoring from 1 (low) to 5 (high):

Musical Innovation: Pioneering ideas, new sounds, genre blending.
Cultural Impact: Broader societal influence, pop culture penetration.
Lyrical or Thematic Depth: Narrative richness, philosophical weight, relatability.
Technical Proficiency: Musical complexity, virtuosity, performance execution.
Live Performance Strength: Energy, presence, crowd connection on stage.
Legacy & Longevity: Enduring influence across generations and artists.

Here are the bands to evaluate:
- The Beatles
- Led Zeppelin
- Pink Floyd
- Queen
- The Rolling 

In [3]:
from tenacity import retry, stop_after_attempt, wait_fixed, before_sleep_log
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Clean LLM output
def clean_json_string(text: str) -> str:
    cleaned = re.sub(r"```(?:json)?", "", text)
    return cleaned.replace("```", "").strip()

# Parse JSON
def parse_json_response(response):
    response_cleaned = clean_json_string(response)
    data = json.loads(response_cleaned)
    ratings = data["ratings"]
    justification = data["justification"]
    ranking = data["ranking"]

    ranking_column = []
    for i, item in enumerate(ranking):
        ranking_column += [{"item": item, "rank": i + 1}]

    print(ratings)
    df = pd.DataFrame(ratings)
    df = pd.merge(df, pd.DataFrame(ranking_column), on="item", how="left")
    print(df)
    df.columns = [EVALUATION_SUBJECT] + list(CRITERIA.keys()) + ["Rank"]
    return df, justification

# Get llm response
@retry(stop=stop_after_attempt(3),
       before_sleep=before_sleep_log(logger, logging.INFO))
async def get_llm_response(prompt):
    response = await llm.ainvoke(prompt)
    return parse_json_response(response.content)

# State definition
class Vote(TypedDict):
    df: pd.DataFrame
    justification: str
    persona: str

class State(TypedDict):
    votes: Annotated[List[Vote], operator.add]

# Initialize progress bar
try:
    pbar.close()
except NameError:
    pass

pbar = tqdm("Evaluating bands with personas", total=len(PERSONAS), unit="persona")

# Agent node
def make_agent_node(persona_key):
    async def node(state):
        persona = PERSONAS[persona_key]
        prompt = make_prompt(persona)
        df, justification = await get_llm_response(prompt)

        state['votes'] = [{
                "df": df,
                "justification": justification,
                "persona": persona_key,
        }]

        pbar.update(1)
        return state
    return node

# Graph build
agent_keys = list(PERSONAS.keys())

graph = StateGraph(State)
for agent in agent_keys:
    graph.add_node(agent, make_agent_node(agent))

# Graph edges
for agent in agent_keys:
    graph.add_edge(START, agent)
graph.add_edge([agent for agent in agent_keys], END)

# Run
compiled = graph.compile()
result = await compiled.ainvoke({
    "votes": [],
})


 20%|██        | 1/5 [00:22<01:30, 22.61s/persona]

[{'item': 'The Beatles', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 4, 'Technical Proficiency': 3, 'Live Performance Strength': 3, 'Legacy & Longevity': 5}, {'item': 'Led Zeppelin', 'Musical Innovation': 4, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 4, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Pink Floyd', 'Musical Innovation': 4, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 5, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Queen', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 4, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'The Rolling Stones', 'Musical Innovation': 3, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 3, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Metallica', 'Musical Innovation': 3, 'Cultu

 40%|████      | 2/5 [00:31<00:42, 14.29s/persona]

[{'item': 'The Beatles', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 4, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Led Zeppelin', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 5, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Pink Floyd', 'Musical Innovation': 5, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 5, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Queen', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 4, 'Technical Proficiency': 5, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'The Rolling Stones', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 3, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Metallica', 'Musical Innovation': 5, 'Cultu

 60%|██████    | 3/5 [00:31<00:15,  7.91s/persona]

[{'item': 'The Beatles', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 2, 'Technical Proficiency': 2, 'Live Performance Strength': 3, 'Legacy & Longevity': 4}, {'item': 'Led Zeppelin', 'Musical Innovation': 4, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 4, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Pink Floyd', 'Musical Innovation': 5, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 5, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Queen', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 4, 'Live Performance Strength': 5, 'Legacy & Longevity': 4}, {'item': 'The Rolling Stones', 'Musical Innovation': 3, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 3, 'Live Performance Strength': 4, 'Legacy & Longevity': 4}, {'item': 'Metallica', 'Musical Innovation': 5, 'Cultu

 80%|████████  | 4/5 [00:34<00:06,  6.01s/persona]

[{'item': 'The Beatles', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 3, 'Live Performance Strength': 2, 'Legacy & Longevity': 5}, {'item': 'Led Zeppelin', 'Musical Innovation': 4, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 5, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Pink Floyd', 'Musical Innovation': 5, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 5, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Queen', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 4, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'The Rolling Stones', 'Musical Innovation': 2, 'Cultural Impact': 4, 'Lyrical or Thematic Depth': 2, 'Technical Proficiency': 3, 'Live Performance Strength': 3, 'Legacy & Longevity': 4}, {'item': 'Metallica', 'Musical Innovation': 4, 'Cultu

100%|██████████| 5/5 [00:39<00:00,  5.83s/persona]

[{'item': 'The Beatles', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 3, 'Live Performance Strength': 3, 'Legacy & Longevity': 5}, {'item': 'Led Zeppelin', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 4, 'Technical Proficiency': 5, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Pink Floyd', 'Musical Innovation': 5, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 5, 'Technical Proficiency': 4, 'Live Performance Strength': 4, 'Legacy & Longevity': 5}, {'item': 'Queen', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 3, 'Technical Proficiency': 5, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'The Rolling Stones', 'Musical Innovation': 4, 'Cultural Impact': 5, 'Lyrical or Thematic Depth': 4, 'Technical Proficiency': 3, 'Live Performance Strength': 5, 'Legacy & Longevity': 5}, {'item': 'Metallica', 'Musical Innovation': 4, 'Cultu

In [None]:
for agent in result:
    if agent.endswith("_ranking"):
        persona = agent.replace("_ranking", "")
        ranking = result[agent]
        print(f"🎤 {persona.upper()} Top 5: {ranking[:5]}")


In [None]:
import matplotlib.pyplot as plt

result["final_scores"].plot(kind="bar", figsize=(14, 6), legend=False)
plt.title("Final Rank Scores (Total Points)")
plt.ylabel("Score")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns

# Assemble the heatmap DataFrame
rank_matrix = []
for agent in PERSONAS:
    ranking = result[f"{agent}_ranking"]
    rank_row = {band: i + 1 for i, band in enumerate(ranking)}
    rank_matrix.append(rank_row)

rank_df = pd.DataFrame(rank_matrix, index=PERSONAS.keys())
plt.figure(figsize=(14, 6))
sns.heatmap(rank_df, cmap="coolwarm", annot=True, fmt="d", cbar_kws={"label": "Rank (lower is better)"})
plt.title("Band Rankings by Persona")
plt.xlabel("Band")
plt.ylabel("Persona")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import numpy as np

# Compute standard deviation of rankings across agents
band_ranks = {band: [] for band in ITEMS}
for agent in PERSONAS:
    for i, band in enumerate(result[f"{agent}_ranking"]):
        band_ranks[band].append(i + 1)

stats = {
    band: {
        "mean_rank": np.mean(ranks),
        "std_dev": np.std(ranks),
        "min_rank": min(ranks),
        "max_rank": max(ranks)
    }
    for band, ranks in band_ranks.items()
}

rank_stats_df = pd.DataFrame(stats).T.sort_values("std_dev", ascending=False)
rank_stats_df


In [None]:
df = result["summary_df"]
df.head()


In [None]:
for col in df.columns:
    print(f"\n🔹 Top 5 bands by {col.upper()}:")
    display(df.sort_values(col, ascending=False).head(5))


In [None]:
df["std_dev"] = df.std(axis=1)
df["mean"] = df.mean(axis=1)
df_consistency = df.sort_values("std_dev")
df_consistency[["mean", "std_dev"]]


In [None]:
import matplotlib.pyplot as plt
import numpy as np

top3 = df.mean(axis=1).sort_values(ascending=False).head(3).index
categories = list(df.columns[:-2])  # exclude mean/std_dev

for band in top3:
    values = df.loc[band, categories].values.flatten().tolist()
    values += values[:1]  # repeat first value to close the radar loop

    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    angles += angles[:1]

    plt.figure(figsize=(6, 6))
    ax = plt.subplot(111, polar=True)
    ax.plot(angles, values, linewidth=2, label=band)
    ax.fill(angles, values, alpha=0.3)
    ax.set_thetagrids(np.degrees(angles[:-1]), categories)
    ax.set_title(f"{band} Score Profile")
    plt.legend()
    plt.show()


In [None]:
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(df[categories].corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Between Evaluation Criteria")
plt.show()


In [None]:
df["mean"] = df.mean(axis=1)
df_sorted = df.sort_values("mean", ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=df_sorted["mean"], y=df_sorted.index)
plt.title("🏆 Average Score per Band (All Criteria)")
plt.xlabel("Average Score")
plt.ylabel("Band")
plt.xlim(1, 3)
plt.show()


In [None]:
top_bands = df_sorted.head(10)
top_bands.iloc[:, :6].plot(kind='barh', stacked=True, figsize=(12, 7), colormap='tab20c')
plt.title("🎼 Top 10 Bands by Score Breakdown")
plt.xlabel("Total Score")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
