### Visualization

In [5]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
def create_action_barcharts(
    df,
    output_folder="../charts_barcharts",
    exclude_action="wikidata",
    metrics=['J(A1-A2)', 'J(A1-A34)', 'J(A4-A1|3)', 'J(A3-A4)'],
    exclude_llms=None,
):
    """
    Save one grouped bar chart per (dataset, action) pair as a PNG file.

    X-axis: one group per metric column.
    Each bar inside a group: the value of that metric for one row (LLM).
    The y-axis is fixed to [0, 1] and the legend is placed below the plot
    in at most six columns.

    Args:
        df: DataFrame with 'dataset', 'action' and 'llm' columns plus one
            column per entry of `metrics`.
        output_folder: directory the PNG files are written to (created if
            it does not exist).
        exclude_action: rows whose 'action' equals this value are dropped.
        metrics: names of the columns to plot, in x-axis order.
        exclude_llms: optional collection of 'llm' values to drop.
    """
    # Honour the caller's metric selection; the copy guarantees the shared
    # default list is never handed out or modified.
    metrics = list(metrics)
    sns.set(style="whitegrid")

    os.makedirs(output_folder, exist_ok=True)
    df = df[df['action'] != exclude_action]

    # Exclude LLMs if provided
    if exclude_llms is not None:
        df = df[~df['llm'].isin(exclude_llms)]

    # Group positions depend only on the number of metrics, so compute them once.
    x = np.arange(len(metrics))

    for dataset in df['dataset'].unique():
        df_dataset = df[df['dataset'] == dataset]

        for action in df_dataset['action'].unique():
            df_action = df_dataset[df_dataset['action'] == action]
            llms = df_action['llm'].tolist()
            # One row of bar heights per table row. Indexing by row position
            # (instead of re-filtering by LLM name) keeps every height array
            # at len(metrics), even if an LLM name occurs more than once.
            heights = df_action[metrics].to_numpy()

            n_models = len(llms)
            width = 0.8 / n_models  # each metric group spans 0.8 axis units

            # Use a larger color palette to minimize repeats
            colors = sns.color_palette("tab20", n_colors=n_models)

            fig, ax = plt.subplots(figsize=(14, 5))
            for i, llm in enumerate(llms):
                ax.bar(x + i * width, heights[i], width=width, color=colors[i],
                       label=llm, edgecolor='black')

            # Center the tick labels under each group of bars
            total_width = width * n_models
            ax.set_xticks(x + total_width / 2 - width / 2)
            ax.set_xticklabels(metrics)

            ax.set_ylim(0, 1)
            ax.set_ylabel("Jaccard Metric Value")
            ax.set_title(f"J-Metrics per LLM ({dataset} - {action})")

            # Legend below the chart in multiple columns (max 6)
            n_cols = min(6, n_models)
            ax.legend(title="LLM", loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=n_cols)

            fig.tight_layout()
            filename = os.path.join(output_folder, f"barchart_{dataset}_{action}.png")
            fig.savefig(filename, bbox_inches='tight')
            # Close explicitly so figures do not accumulate across the loop.
            plt.close(fig)
            print(f"Saved: {filename}")


In [None]:
def plot_pvalue_heatmaps_by_dataset_action(
    csv_file: str,
    cmap: str = "Reds_r",
    exclude_actions=("wikidata",),
    save: bool = True,
    out_dir: str = "../charts/action_pvalue_heatmaps",
    exclude_llms=None
):
    """
    Generate one heatmap per (dataset, action) pair.
      - Excludes rows with action in `exclude_actions`.
      - Colors only p <= 0.05 (intensity increases as p -> 0).
      - Cells with p > 0.05 (or missing) are masked.
      - Annotates only the significant cells (<= 0.05); values <= 0.001
        are written as "≈0", the rest with three decimals.
      - Pairs without any significant value are skipped with a message.
    Args:
        csv_file: path to CSV with 'dataset', 'action' and 'llm' columns and
            p-value columns whose names start with 'p('.
        cmap: a reversed sequential colormap string (e.g., "Reds_r","Blues_r","viridis_r").
        exclude_actions: actions to drop (default includes "wikidata"); a
            single action name or None is accepted as well.
        save: if True, save PNGs into out_dir; otherwise show them.
        out_dir: directory where PNGs are saved when save=True.
        exclude_llms: optional collection of 'llm' values to drop.
    Raises:
        ValueError: if the CSV has no column starting with 'p('.
    """
    def _annotation(val):
        # Text shown inside one heatmap cell.
        if pd.isna(val) or val > 0.05:
            return ""            # non-significant
        if val <= 0.001:
            return "≈0"          # very close to zero
        return f"{val:.3f}"      # keep 3 decimals

    # Series.isin() rejects bare strings and None, so normalise them first.
    if exclude_actions is None:
        exclude_actions = ()
    elif isinstance(exclude_actions, str):
        exclude_actions = (exclude_actions,)

    df = pd.read_csv(csv_file)

    keep = ~df["action"].isin(exclude_actions)
    if exclude_llms is not None:
        keep &= ~df["llm"].isin(exclude_llms)
    # Explicit copy: the column assignment below must not target a view of
    # the frame returned by read_csv (avoids SettingWithCopyWarning).
    df = df[keep].copy()

    # identify p-value columns
    pval_cols = [c for c in df.columns if str(c).startswith("p(")]
    if not pval_cols:
        raise ValueError("No p-value columns found (columns starting with 'p(').")

    # coerce to numeric; unparsable entries become NaN and are masked later
    df[pval_cols] = df[pval_cols].apply(pd.to_numeric, errors="coerce")

    if save:
        os.makedirs(out_dir, exist_ok=True)

    for dataset in df["dataset"].unique():
        in_dataset = df[df["dataset"] == dataset]
        for action in in_dataset["action"].unique():
            subset = in_dataset[in_dataset["action"] == action]
            # A NaN action never compares equal to itself, leaving nothing to plot.
            if subset.empty:
                continue

            # prepare matrix: rows = llm, cols = p-value features
            heat = subset.set_index("llm")[pval_cols]

            # mask non-significant (p > 0.05) and NaNs
            mask = (heat > 0.05) | heat.isna()

            # if there are no significant values, skip
            if mask.to_numpy().all():
                print(f"No significant p-values (<=0.05) for {dataset} | {action} — skipping.")
                continue

            # Build the labels as a separate object-dtype frame. Writing
            # strings into a float frame cell by cell is a deprecated upcast
            # in recent pandas and fails on duplicate 'llm' labels.
            annot = pd.DataFrame(
                [[_annotation(val) for val in row] for row in heat.to_numpy()],
                index=heat.index,
                columns=heat.columns,
                dtype=object,
            )

            # plot size based on matrix dimensions
            figsize = (max(6, 1 * heat.shape[1] + 2), max(3, 0.4 * heat.shape[0] + 1))
            plt.figure(figsize=figsize)

            sns.heatmap(
                heat,
                mask=mask,
                annot=annot,
                fmt="",               # use provided annot strings
                cmap=cmap,            # reversed cmap so lower p -> more intense color
                vmin=0.0,
                vmax=0.05,            # map color scale to [0, 0.05]
                cbar_kws={"label": "p-value (only p ≤ 0.05 shown)"},
                linewidths=0.5,
                linecolor="lightgray",
                square=False,
            )

            plt.title(f"P-values (p ≤ 0.05 colored) — {dataset} | {action}", fontsize=10)
            plt.xticks(rotation=45, ha="right")
            plt.yticks(rotation=0)
            plt.tight_layout()

            if save:
                fname = f"{dataset}_{action}_pvalues.png".replace(" ", "_")
                plt.savefig(os.path.join(out_dir, fname), dpi=300)
                plt.close()
            else:
                plt.show()

def plot_square_heatmaps(csv_file, out_dir="heatmaps", cmap="Reds_r", exclude_llms=None):
    """
    Save one LLM-by-LLM p-value heatmap per (predicate, dataset, action) group.

    Rows are the values of the 'llm' column; columns are every CSV column
    located after 'llm'. Only cells with p <= 0.05 are colored and
    annotated ("≈0" for values <= 0.001, three decimals otherwise); groups
    without any such cell are skipped with a message. Each PNG is written to
    `<out_dir>/<predicate>/<dataset>_<action>_pvalues.png`.

    Args:
        csv_file: path to a CSV with 'predicate', 'dataset', 'action' and
            'llm' columns followed by the per-LLM p-value columns.
        out_dir: root directory for the PNG files (created if missing).
        cmap: colormap name passed to seaborn.
        exclude_llms: optional collection of LLM names removed from both
            the rows and the columns.

    Raises:
        ValueError: if the CSV has no 'llm' column.
    """
    def _annotation(val):
        # Text shown inside one heatmap cell.
        if pd.isna(val) or val > 0.05:
            return ""
        if val <= 0.001:
            return "≈0"
        return f"{val:.3f}"

    df = pd.read_csv(csv_file)

    # Identify LLM columns (everything after the 'llm' column)
    columns = df.columns.tolist()
    if "llm" not in columns:
        raise ValueError("No 'llm' column found in the CSV file.")
    llm_cols = columns[columns.index("llm") + 1:]

    if exclude_llms is not None:
        df = df[~df["llm"].isin(exclude_llms)]
        llm_cols = [c for c in llm_cols if c not in exclude_llms]

    os.makedirs(out_dir, exist_ok=True)

    # Group by predicate → dataset → action
    for (predicate, dataset, action), subset in df.groupby(["predicate", "dataset", "action"]):
        if subset.empty:
            continue

        # Build the matrix: rows = llm, cols = llm_cols, forced to numeric
        mat = subset.set_index("llm")[llm_cols].apply(pd.to_numeric, errors="coerce")

        # Mask: non-significant values (p > 0.05) and NaNs
        mask = (mat > 0.05) | mat.isna()

        # Skip if no significant values
        if mask.to_numpy().all():
            print(f"Skipping {predicate} | {dataset} | {action}: no significant values.")
            continue

        # Build the labels as a separate object-dtype frame. Writing strings
        # into a float frame cell by cell is a deprecated upcast in recent
        # pandas and fails on duplicate 'llm' labels.
        annot = pd.DataFrame(
            [[_annotation(val) for val in row] for row in mat.to_numpy()],
            index=mat.index,
            columns=mat.columns,
            dtype=object,
        )

        # Figure grows with the number of LLM columns and rows
        figsize = (max(6, 0.8 * len(llm_cols)), max(6, 0.8 * len(subset)))
        plt.figure(figsize=figsize)

        sns.heatmap(
            mat,
            mask=mask,
            annot=annot,
            fmt="",
            cmap=cmap,
            vmin=0.0, vmax=0.05,
            cbar_kws={"label": "p-value (only p ≤ 0.05 shown)"},
            linewidths=0.5, linecolor="lightgray"
        )

        plt.title(f"{predicate} | {dataset} | {action}", fontsize=10)
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.tight_layout()

        # Save inside predicate folder; str() keeps non-string group keys usable as a path part
        pred_dir = os.path.join(out_dir, str(predicate))
        os.makedirs(pred_dir, exist_ok=True)

        fname = f"{dataset}_{action}_pvalues.png".replace(" ", "_")
        target = os.path.join(pred_dir, fname)
        plt.savefig(target, dpi=300)
        plt.close()

        print(f"Saved: {target}")



In [None]:
import os
import sys

# Folder holding the exported result tables, relative to the notebook.
# NOTE(review): the last cell of this notebook reads the same file name from
# "../output/" — confirm which relative location is the right one.
folder = "../../output/"

# Load the per-(dataset, action, llm) summary table and display it.
df_summery = pd.read_csv(os.path.join(folder, "summary_2025-09-10_22-14.csv"))
df_summery

Unnamed: 0,dataset,action,llm,?A1=A2,?A1=A3+A4,?A1>A3,?A1>A4,?A3∅A4,?A4=A1|3,?A1=A1*,...,?A1=A3+A4(+),?A1=A3+A4(-),J(1-34)+,J(1-34)-,p(A1=A2),p(A1=A3+A4),p(A1>A3),p(A1>A4),p(A3∅A4),p(A4=A1|3)
0,qawiki,classification,deepseek-chat,0.9800,0.8467,0.9000,0.9333,0.5600,0.4867,0.5133,...,,,,,0.0000,0.0000,0.0000,0.0000,0.9890,0.0000
1,qawiki,classification,deepseek-r1:1.5b,0.0000,0.0000,1.0000,0.0000,0.9600,0.0000,0.0000,...,0.000000,0.000000,0.006335,0.013392,1.0000,1.0000,0.0000,1.0000,1.0000,1.0000
2,qawiki,classification,deepseek-r1:70b,0.0200,0.0067,1.0000,0.1400,0.7333,0.0067,0.0000,...,0.006803,0.000000,0.286773,0.138900,0.1250,0.5000,0.0000,0.0000,1.0000,0.5000
3,qawiki,classification,deepseek-reasoner,0.7267,0.8533,0.9133,0.8933,0.4533,0.4000,0.3867,...,,,,,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000
4,qawiki,classification,gemini-2.0-flash,0.8667,0.8867,1.0000,0.9533,0.3533,0.3133,0.4667,...,0.885135,1.000000,0.949498,1.000000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,overall,zero-shot,gpt-oss:20b,0.2422,0.1533,0.4244,0.4000,0.8489,0.1844,,...,0.160377,0.038462,0.412821,0.308892,0.5220,0.5262,0.5183,0.5188,0.5262,0.5238
210,overall,zero-shot,grok-3-mini,0.3089,0.2644,0.5533,0.4467,0.8956,0.2644,,...,0.271071,0.000000,0.591413,0.298882,0.5209,0.5219,0.5191,0.5182,0.5323,0.5217
211,overall,zero-shot,llama3.1:8b,0.0511,0.0089,0.1378,0.0911,0.6844,0.0044,,...,,,,,0.5434,0.5982,0.5279,0.5332,0.5206,0.6367
212,overall,zero-shot,llama3.3:70b,0.2733,0.1000,0.4067,0.2600,0.5733,0.0533,,...,,,,,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000


In [43]:
# Metric columns of interest and the seaborn style used for the charts.
metrics = ['J(A1-A2)', 'J(A1-A34)', 'J(A4-A1|3)', 'J(A3-A4)']
sns.set(style="whitegrid")

# Drop the models that are left out of the charts, then list the datasets
# that remain in the summary table.
exclude_llms = ["deepseek-r1:1.5b", "deepseek-r1:70b"]
if exclude_llms is not None:
    df_summery = df_summery.loc[~df_summery['llm'].isin(exclude_llms)]

datasets = df_summery['dataset'].unique()
datasets

array(['qawiki', 'spinach', 'synthetic', 'overall'], dtype=object)

In [None]:
# Models left out of every chart produced by this cell.
EXCLUDED_LLMS = ["deepseek-r1:1.5b", "deepseek-r1:70b"]

# Bar charts are drawn from the summary table loaded earlier in this notebook
# (`df_summery`); a bare `df` is not defined anywhere at notebook level, so
# using it fails on a fresh kernel.
# NOTE(review): presumably the summary table carries the J(...) metric
# columns listed below — confirm against the CSV header.
create_action_barcharts(
    df_summery,
    output_folder="../charts",
    exclude_action="wikidata",
    metrics=['J(A1-A2)', 'J(A1-A34)', 'J(A4-A1|3)', 'J(A3-A4)'],
    exclude_llms=EXCLUDED_LLMS,
)

# NOTE(review): the summary is loaded from "../../output/" earlier in this
# notebook but from "../output/" here — confirm which relative path is right.
plot_pvalue_heatmaps_by_dataset_action(
    "../output/summary_2025-09-10_22-14.csv",
    exclude_llms=EXCLUDED_LLMS,
)

plot_square_heatmaps(
    "../output/p_value_matrices_2025-09-10_22-14.csv",
    out_dir="../charts/llms_pvalue_heatmaps",
    exclude_llms=EXCLUDED_LLMS,
)