In [None]:
import re

import pandas as pd
import plotly.graph_objects as go

In [None]:
# Location of the leaderboard CSV that the rest of the notebook works from.
LEADERBOARD_CSV_URL = (
    "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/leaderboards/european_all.csv"
)

# header=1: column names are taken from the second line of the file; the line
# before it is skipped.
df = pd.read_csv(LEADERBOARD_CSV_URL, header=1)

# Preview the first two rows as the cell output.
df.head(2)

In [None]:
# Columns whose name ends in "_version".
version_columns = [name for name in df.columns if name.endswith("_version")]

# The matching column names with "_version" removed.
versioned_columns = [name.replace("_version", "") for name in version_columns]

# Further columns that are dropped by name before the numeric conversion.
metadata_columns = [
    "generative_type",
    "parameters",
    "vocabulary_size",
    "context",
    "commercial",
    "merge",
    "rank",
]

# Everything that must be removed from the raw frame, in the order collected.
columns_to_drop = version_columns + versioned_columns + metadata_columns

# Turn the raw leaderboard into an all-float frame indexed by model name.
clean_df = (
    # Keep only the text before the first "@@" in every string cell.
    df.map(lambda x: x.split("@@")[0] if isinstance(x, str) else x)
    # Replace an HTML element by its inner text (e.g. a link by its label).
    .map(lambda x: re.sub(r"<.*?>(.*?)</.*>", r"\1", x) if isinstance(x, str) else x)
    # Remove a leading "gemini/" or "xai/" prefix.
    .map(lambda x: re.sub(r"^(gemini|xai)/", "", x) if isinstance(x, str) else x)
    # Remove a parenthesised annotation together with the spaces around it.
    # The parentheses must be escaped: the unescaped pattern " *.+.+ *" matches
    # any string of two or more characters and blanked every cell, which made
    # the float conversion below fail.
    # NOTE(review): this also removes any "(few-shot)" marker, so filters that
    # later look for "few-shot" in the model name may never match - confirm.
    .map(lambda x: re.sub(r" *\(.+\) *", "", x) if isinstance(x, str) else x)
    # Turn "-" placeholders into missing values so dropna() discards those rows.
    .map(lambda x: None if x == "-" else x)
    .drop(columns=columns_to_drop)
    .dropna()
    .set_index("model")
    # Every remaining cell is converted to float.
    .map(float)
)

# Rows of clean_df whose model name does not mention "few-shot".
zero_shot_df = (
    # Bring "model" back as a column so the string filters below can see it.
    clean_df.reset_index()
    # Blank out every string cell containing "few-shot"; dropna() below then
    # removes the whole row.
    .map(lambda x: None if isinstance(x, str) and "few-shot" in x else x)
    # Strip a parenthesised suffix and the spaces before it. The parentheses are
    # escaped on purpose: the previous pattern contained a literal U+2217
    # character in their place and therefore could not match.
    .map(lambda x: re.sub(r" *\(.*\)", "", x) if isinstance(x, str) else x)
    .dropna()
    .set_index("model")
)

# Rows of clean_df whose model name mentions "few-shot".
few_shot_df = (
    # Bring "model" back as a column so the string filters below can see it.
    clean_df.reset_index()
    # Blank out every string cell that lacks "few-shot"; dropna() below then
    # removes the whole row.
    .map(lambda x: None if isinstance(x, str) and "few-shot" not in x else x)
    # Strip a parenthesised suffix and the spaces before it. The parentheses are
    # escaped on purpose: the previous pattern contained a literal U+2217
    # character in their place and therefore could not match.
    .map(lambda x: re.sub(r" *\(.*\)", "", x) if isinstance(x, str) else x)
    # Keep only the part of the name after the last "/".
    .map(lambda x: re.sub(r"^.*/", "", x) if isinstance(x, str) else x)
    .dropna()
    .set_index("model")
)

In [None]:
def plot_models(
    df: pd.DataFrame,
    models: list[str],
    title: str,
    max_score: float,
    colours: list[str] | None,
) -> None:
    """Create a spider plot of a list of models."""
    fig = go.Figure()

    if colours is None:
        colours = [None] * models

    for model, colour in zip(models, colours):
        model_scores = df.loc[model, :].tolist()
        trace = go.Scatterpolar(
            r=model_scores,
            theta=[x.capitalize() for x in clean_df.columns],
            name=model,
            fill="toself",
            line=dict(color=colour),
        )
        fig.add_trace(trace)

    fig.update_layout(
        polar=dict(radialaxis=dict(range=[max_score, 1])),
        showlegend=True,
        title=title.strip() + " (smaller is better)",
        width=800,
        height=500,
    )
    fig.show(config=dict(toImageButtonOptions=dict(scale=6)))

In [None]:
# Model name -> line colour for the "large-sized" comparison, in plotting order.
large_model_colours = {
    "gpt-5-2025-08-07": "dodgerblue",
    "o3-2025-04-16": "lightgreen",
    "gpt-5-2025-08-07@minimal": "magenta",
    "gpt-4o-2024-05-13": "orange",
}

plot_models(
    title="Zero-shot Performance of Large-sized Proprietary LLMs",
    df=zero_shot_df,
    models=list(large_model_colours),
    colours=list(large_model_colours.values()),
    max_score=3,
)

In [None]:
# Model name -> line colour for the "medium-sized" comparison, in plotting order.
medium_model_colours = {
    "gpt-5-mini-2025-08-07": "dodgerblue",
    "gpt-4o-2024-05-13": "lightgreen",
    "gpt-5-mini-2025-08-07@minimal": "magenta",
    "gpt-4o-mini-2024-07-18": "orange",
}

plot_models(
    title="Zero-shot Performance of Medium-sized Proprietary LLMs",
    df=zero_shot_df,
    models=list(medium_model_colours),
    colours=list(medium_model_colours.values()),
    max_score=3.5,
)

In [None]:
# Model name -> line colour for the "small-sized" comparison, in plotting order.
small_model_colours = {
    "gpt-5-nano-2025-08-07": "dodgerblue",
    "gpt-4o-mini-2024-07-18": "lightgreen",
    "gpt-5-nano-2025-08-07@minimal": "magenta",
}

plot_models(
    title="Zero-shot Performance of Small-sized Proprietary LLMs",
    df=zero_shot_df,
    models=list(small_model_colours),
    colours=list(small_model_colours.values()),
    max_score=5,
)