In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns



In [2]:
from matplotlib.ticker import MultipleLocator

In [3]:
# Load the clustering evaluation results from a CSV in the working directory.
# The preview below shows columns ratio, features, clustering and the
# ARI / NMI / Homogeneity / Completeness scores.
results_df = pd.read_csv('results.csv')

In [4]:
# Preview the first rows of the loaded table.
results_df.head()

Unnamed: 0,ratio,features,clustering,ARI,NMI,Homogeneity,Completeness
0,0.5,CL,K‑means,0.543655,0.681641,0.634774,0.735982
1,0.5,CL,Leiden,0.471764,0.702372,0.810282,0.619826
2,0.5,Trad,K‑means,0.516741,0.58258,0.560988,0.6059
3,0.5,Trad,Leiden,0.418836,0.637351,0.730787,0.565099
4,0.5,Trad (PCA),K‑means,0.443228,0.545381,0.534441,0.556779


In [5]:
# Relabel the feature sets in results_df: 'Trad' becomes 'Eng' and
# 'Trad (PCA)' becomes 'Eng (PCA)'. Any other label is left unchanged.
feature_relabels = {
    'Trad': 'Eng',
    'Trad (PCA)': 'Eng (PCA)',
}
results_df['features'] = results_df['features'].replace(feature_relabels)

In [6]:
# Remove every row whose features label is 'Eng (PCA)'.
# .copy() makes the filtered result an independent DataFrame; without it the
# column assignments performed on results_df in later cells operate on a
# derived slice and pandas emits SettingWithCopyWarning.
results_df = results_df[results_df['features'] != 'Eng (PCA)'].copy()

In [7]:
# Score columns that get reshaped into long format below.
metrics = ["ARI", "NMI", "Homogeneity", "Completeness"]


def _unwrap_scalar(value):
    """Turn a 1-element list / ndarray / Series into a float; pass anything else through."""
    if not isinstance(value, (list, np.ndarray, pd.Series)):
        return value
    return float(np.asarray(value).squeeze())


# Flatten any 1-element containers stored in the score columns (in place).
for metric_col in metrics:
    results_df[metric_col] = results_df[metric_col].apply(_unwrap_scalar)

# Long format: one row per (ratio, features, clustering, metric) with the
# metric name in "score_name" and its value in "score_value"; both numeric
# columns are cast to float.
long_df = (
    results_df
    .melt(
        id_vars=["ratio", "features", "clustering"],
        value_vars=metrics,
        var_name="score_name",
        value_name="score_value",
    )
    .astype({"ratio": float, "score_value": float})
)

In [14]:
# Figure: one row of panels per clustering method and one column per metric.
# Each panel plots the score against the imbalance ratio, one line per
# feature set. The figure is written to "figure.pdf" and then closed.

# Line colour per feature set; labels missing from this map are drawn in grey.
cmap = {
    'CL': "blue",
    'Eng': "red",
}

# --- ensure numeric columns ----------------------------------------------
# Values that cannot be parsed as numbers become NaN (errors="coerce").
long_df["ratio"]       = pd.to_numeric(long_df["ratio"],       errors="coerce")
long_df["score_value"] = pd.to_numeric(long_df["score_value"], errors="coerce")

# --- force metric order --------------------------------------------------
metric_order = ["ARI", "NMI", "Homogeneity", "Completeness"]
# only keep those metrics actually present
present_metrics = set(long_df["score_name"].unique())
score_names = [m for m in metric_order if m in present_metrics]

clusterings = long_df["clustering"].unique()
n_rows, n_cols = len(clusterings), len(score_names)
if n_rows == 0 or n_cols == 0:
    # plt.subplots cannot build an empty grid; fail with a clear message.
    raise ValueError("long_df contains no clustering/metric combination to plot")

# --- make the figure -----------------------------------------------------
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(6, 3),
    sharex=True,     # every panel shares the same x-axis
    sharey=False,    # each panel keeps its own y-axis range
    squeeze=False,   # always return a 2-D (n_rows, n_cols) array, even for
                     # a single row or column, so axes[i, j] is always valid
)

# --- plotting loop -------------------------------------------------------
for i, clust in enumerate(clusterings):
    for j, score in enumerate(score_names):
        ax = axes[i, j]
        subset = long_df.query(
            "clustering == @clust and score_name == @score"
        )
        for feat, d in subset.groupby("features"):
            # Plot in increasing ratio order so the connecting line never
            # doubles back when the input rows are not already sorted.
            d = d.sort_values("ratio", kind="stable")
            ax.plot(
                d["ratio"],
                d["score_value"],
                marker="o",
                label=feat,
                color=cmap.get(feat, "grey"),
                linewidth=0.8,
                markersize=3,
            )

        # titles & labels only on outer edges:
        if i == 0:
            ax.set_title(score, fontsize=8, pad=4)
        if j == 0:
            ax.set_ylabel(clust, fontsize=8)
        if i == n_rows - 1:
            ax.set_xlabel("Imbalance ratio", fontsize=8)

        # major y ticks every 0.1
        ax.yaxis.set_major_locator(MultipleLocator(0.1))
        ax.tick_params(labelsize=7)

        # no background grid
        ax.grid(False)

# --- compact legend inside the top-right subplot --------------------------
# Use the last column rather than a fixed index so the legend still works
# when fewer than four metrics are present.
legend_ax = axes[0, -1]
handles, labels = legend_ax.get_legend_handles_labels()
legend_ax.legend(
    handles, labels,
    loc="upper right",
    fontsize=6,
    frameon=False,
)

# --- layout & export -----------------------------------------------------
fig.tight_layout()
fig.savefig(
    "figure.pdf",
    format="pdf",
    dpi=300,
    bbox_inches="tight"
)
plt.close(fig)
