In [None]:
import re

import pandas as pd
import plotly.graph_objects as go

In [None]:
# Load the leaderboard CSV straight from GitHub. `header=1` makes pandas take the
# column names from the second line of the file; the line above it is skipped.
df = pd.read_csv(
    filepath_or_buffer=(
        "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/leaderboards/european_all.csv"
    ),
    header=1,
)
df.head(2)

In [None]:
# Columns to discard: every "*_version" column, the column of the same name
# without that suffix, and the model-metadata columns listed below. Everything
# that is left (apart from "model") is cast to float further down.
columns_to_drop = [col for col in df.columns if col.endswith("_version")]
columns_to_drop += [col.replace("_version", "") for col in columns_to_drop]
columns_to_drop += [
    "generative_type",
    "parameters",
    "vocabulary_size",
    "context",
    "commercial",
    "merge",
    "rank",
]

# Work on a copy so the frame displayed by the loading cell stays untouched.
clean_df = df.copy()

# Model names may be wrapped in an HTML tag: keep only the tag's inner text, then
# strip a leading "gemini/" or "xai/" prefix.
clean_df["model"] = (
    clean_df["model"]
    .map(lambda name: re.sub(r"<.*?>(.*?)</.*>", r"\1", name))
    .map(lambda name: re.sub(r"^(gemini|xai)/", "", name))
)

clean_df = (
    clean_df.set_index("model")
    .drop(columns=columns_to_drop)
    # String cells keep only the part before the first "@@".
    .map(lambda x: x.split("@@")[0] if isinstance(x, str) else x)
    # "-" marks a missing value.
    .map(lambda x: None if x == "-" else x)
    .astype(float)
)

# After `reset_index()` the only string cells are the model names (all other
# columns were cast to float above), so the string-only lambdas below act on the
# "model" column alone.

# Zero-shot models: names that contain "zero-shot".
zero_shot_df = (
    clean_df.reset_index()
    .map(lambda x: None if isinstance(x, str) and "zero-shot" not in x else x)
    # Drop the rows whose name was blanked out above; without this they would
    # remain in the frame under a NaN index. Rows with missing scores are kept.
    # NOTE(review): `few_shot_df` drops rows with any missing value -- confirm
    # whether zero-shot rows with missing scores should be dropped as well.
    .dropna(subset=["model"])
    # Remove everything from the first " (" onwards ...
    .map(lambda x: re.sub(r" \(.*", "", x) if isinstance(x, str) else x)
    # ... and everything up to and including the last "/".
    .map(lambda x: re.sub(r"^.*/", "", x) if isinstance(x, str) else x)
    .set_index("model")
)

# Few-shot models: names that do not contain "zero-shot".
few_shot_df = (
    clean_df.reset_index()
    .map(lambda x: None if isinstance(x, str) and "zero-shot" in x else x)
    .map(lambda x: re.sub(r" \(.*", "", x) if isinstance(x, str) else x)
    .map(lambda x: re.sub(r"^.*/", "", x) if isinstance(x, str) else x)
    # Drops both the blanked-out zero-shot rows and rows with any missing score.
    .dropna()
    .set_index("model")
)

In [None]:
def plot_models(
    df: pd.DataFrame,
    models: list[str],
    title: str,
    max_score: float,
    colours: list[str] | None,
) -> None:
    """Create a spider plot of a list of models."""
    fig = go.Figure()

    if colours is None:
        colours = [None] * models

    for model, colour in zip(models, colours):
        model_scores = df.loc[model, :].tolist()
        trace = go.Scatterpolar(
            r=model_scores,
            theta=[x.capitalize() for x in clean_df.columns],
            name=model,
            fill="toself",
            line=dict(color=colour),
        )
        fig.add_trace(trace)

    fig.update_layout(
        polar=dict(radialaxis=dict(range=[max_score, 1])),
        showlegend=True,
        title=title.strip() + " (smaller is better)",
        width=800,
        height=500,
    )
    fig.show(config=dict(toImageButtonOptions=dict(scale=6)))

In [None]:
# Three instruction-tuned models from the few-shot frame, one colour per model.
plot_models(
    title="Few-shot Performance of 8B-sized instruction-tuned LLMs",
    df=few_shot_df,
    max_score=4.5,
    models=[
        "Llama-3.1-8B-Instruct",
        "Apertus-8B-Instruct-2509",
        "Mistral-7B-Instruct-v0.1",
    ],
    colours=["dodgerblue", "lightgreen", "orange"],
)

In [None]:
# Three base models from the few-shot frame, one colour per model.
plot_models(
    title="Few-shot Performance of 8B-sized base LLMs",
    df=few_shot_df,
    max_score=4.5,
    models=[
        "Llama-3.1-8B",
        "Apertus-8B-2509",
        "Mistral-7B-v0.1",
    ],
    colours=["dodgerblue", "lightgreen", "orange"],
)

In [None]:
# Three instruction-tuned/chat models from the few-shot frame, one colour per model.
plot_models(
    title="Few-shot Performance of 70B-sized instruction-tuned LLMs",
    df=few_shot_df,
    max_score=4.5,
    models=[
        "Llama-3.3-70B-Instruct",
        "Apertus-70B-Instruct-2509",
        "Llama-2-70b-chat-hf",
    ],
    colours=["dodgerblue", "lightgreen", "orange"],
)

In [None]:
# Three base models from the few-shot frame, one colour per model.
plot_models(
    title="Few-shot Performance of 70B-sized base LLMs",
    df=few_shot_df,
    max_score=4.5,
    models=[
        "Llama-3.1-70B",
        "Apertus-70B-2509",
        "Llama-2-70b-hf",
    ],
    colours=["dodgerblue", "lightgreen", "orange"],
)

In [None]:
# The two Apertus instruction-tuned models from the few-shot frame.
plot_models(
    title="Few-shot Performance of Apertus instruction-tuned LLMs",
    df=few_shot_df,
    max_score=4.5,
    models=[
        "Apertus-70B-Instruct-2509",
        "Apertus-8B-Instruct-2509",
    ],
    colours=["dodgerblue", "lightgreen"],
)