In [None]:
import re

import pandas as pd
import plotly.graph_objects as go

In [None]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a dataframe."""
    columns_to_drop = [col for col in df.columns if col.endswith("_version")]
    columns_to_drop += [col.replace("_version", "") for col in columns_to_drop]
    columns_to_drop += [
        "generative_type",
        "parameters",
        "vocabulary_size",
        "context",
        "commercial",
        "merge",
        "rank",
    ]

    new_df = df.copy()
    new_df.model = new_df.model.map(lambda x: re.sub(r"<.*?>(.*?)</.*>", r"\1", x)).map(
        lambda x: re.sub(r"^(gemini|xai)/", "", x)
    )
    new_df = (
        new_df.set_index("model")
        .drop(columns=columns_to_drop)
        .map(lambda x: x.split("@@")[0] if isinstance(x, str) else x)
        .map(lambda x: None if x == "-" else x)
        .astype(float)
    )
    return new_df

In [None]:
def plot_models(
    df: pd.DataFrame,
    models: list[str],
    title: str,
    max_score: float,
    colours: list[str] | None,
) -> None:
    """Create a spider plot of a list of models."""
    fig = go.Figure()

    if colours is None:
        colours = [None] * models

    for model, colour in zip(models, colours):
        model_scores = df.loc[model, :].tolist()
        trace = go.Scatterpolar(
            r=model_scores,
            theta=[x.capitalize() for x in df.columns],
            name=model,
            fill="toself",
            line=dict(color=colour),
            opacity=0.8,
        )
        fig.add_trace(trace)

    fig.update_layout(
        polar=dict(radialaxis=dict(range=[max_score, 1])),
        showlegend=True,
        title=title.strip(),
        width=800,
        height=500,
    )
    fig.show(config=dict(toImageButtonOptions=dict(scale=6)))

In [None]:
# Named colours for the plotted traces. The plotting cell slices this list to
# the number of models, so colours are assigned in this order.
COLOURS = ["dodgerblue", "lightgreen", "orange", "pink", "red"]

In [None]:
# Download two EuroEval leaderboard CSVs from GitHub and clean each with
# `clean_df`. `header=1` makes the second line of the file the header row, so
# the first line is skipped.
df_all = clean_df(
    df=pd.read_csv(
        "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/leaderboards/european_all.csv",
        header=1,
    )
)

# Same loading and cleaning, applied to the `european_nlu.csv` leaderboard.
df_nlu = clean_df(
    df=pd.read_csv(
        "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/leaderboards/european_nlu.csv",
        header=1,
    )
)

# Split `df_all` by the text of the model name. `DataFrame.map` visits every
# cell, but the lambdas only act on strings; `clean_df` casts all non-index
# columns to float, so that means the `model` column restored by
# `reset_index()`.

# Models whose name contains "zero-shot": every other name is replaced by None,
# then everything from the first " (" onwards is removed from the names.
# NOTE(review): unlike `few_shot_df` below there is no `.dropna()` here, so the
# rows whose name was blanked stay in the frame under a missing index label —
# confirm this is intended.
zero_shot_df = (
    df_all.reset_index()
    .map(lambda x: None if isinstance(x, str) and "zero-shot" not in x else x)
    .map(lambda x: re.sub(r" \(.*", "", x) if isinstance(x, str) else x)
    .set_index("model")
)

# The complement: names containing "zero-shot" are blanked and `.dropna()`
# removes those rows. `.dropna()` is called without `subset`, so it also
# removes any model that has a missing value in any score column.
few_shot_df = (
    df_all.reset_index()
    .map(lambda x: None if isinstance(x, str) and "zero-shot" in x else x)
    .map(lambda x: re.sub(r" \(.*", "", x) if isinstance(x, str) else x)
    .dropna()
    .set_index("model")
)

In [None]:
# Models to compare. Each name is looked up as an index label of
# `zero_shot_df`, whose names have had everything from the first " (" onwards
# removed. Commented-out entries are excluded from the plot.
models = [
    # "gemini-3-pro-preview",
    # "claude-sonnet-4-5-20250929#thinking",
    "gpt-5-2025-08-07",
    "gpt-5.2-2025-12-11",
]
# One colour per model, taken from the start of `COLOURS`. `max_score` becomes
# the first value of the radial axis range set inside `plot_models`.
plot_models(
    df=zero_shot_df,
    models=models,
    colours=COLOURS[: len(models)],
    title="Zero-shot Performance of GPT-5.x",
    max_score=3,
)