# SynCABEL: Results Analysis

This notebook presents a comprehensive analysis of the results obtained with the **SynCABEL** (Synthetic Contextualized Augmentation for Biomedical Entity Linking) method.

In [None]:
# Import required libraries
import altair as alt
import pandas as pd
import polars as pl

# Switch Altair to the "vegafusion" data transformer (needs the optional
# `vegafusion` package installed in the kernel's environment).
# NOTE(review): presumably enabled so the full results table can be charted
# without hitting Altair's default row limit — confirm.
alt.data_transformers.enable("vegafusion")

## 1. Load Experimental Results

Load the results from different experimental configurations and baseline comparisons.

In [None]:
# Core metric computation
def _compute_metrics(df):
    tp = df["success"].sum()
    fn = df["fail"].sum()

    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return {
        "Accuracy": recall,
        "Success": int(tp),
        "Fail": int(fn),
        "Support": int(tp + fn),
    }

# Grouped metric computation (e.g. by category)
def _compute_grouped_metrics(df, group_col):
    """Compute the same metrics as ``_compute_metrics``, once per group.

    Args:
        df: polars DataFrame with numeric ``success`` and ``fail`` columns.
        group_col: name of the column to group by.

    Returns:
        polars DataFrame with one row per distinct ``group_col`` value and the
        columns "Type" (the group value), "Accuracy", "Success", "Fail" and
        "Support". Groups are listed in order of first appearance.
    """
    support = pl.col("Success") + pl.col("Fail")
    grouped = (
        # maintain_order makes the row order reproducible between runs.
        df.group_by(group_col, maintain_order=True)
        .agg(
            pl.sum("success").alias("Success"),
            pl.sum("fail").alias("Fail"),
        )
        .with_columns(
            support.alias("Support"),
            # 0 / 0 yields NaN (not null) in polars, so fill_null() would let
            # it through; guard explicitly so a zero-support group reports 0.0,
            # matching _compute_metrics.
            pl.when(support > 0)
            .then(pl.col("Success") / support)
            .otherwise(0.0)
            .alias("Accuracy"),
        )
    )
    return grouped.select(
        pl.col(group_col).alias("Type"), "Accuracy", "Success", "Fail", "Support"
    )

# Main function
def compute_metrics(result: pl.DataFrame, model, dataset):
    """Build the metrics table for one (model, dataset) raw result frame.

    Args:
        result: polars DataFrame with ``success`` / ``fail`` columns, boolean
            ``multi_word_mention`` and ``direct_match`` columns, and a
            ``category`` column.
        model: value stored in the "model" column of every returned row.
        dataset: value stored in the "dataset" column of every returned row.

    Returns:
        pandas DataFrame with one row per summary slice ("Single Word",
        "Multiple Word", "Direct Match", "Undirect Match", "Overall") followed
        by one row per category. The generation-specific columns
        (max_length, n_beams, constrained_inference, training_stop) are set to
        None and "data_augmentation" to "No augmentation".
    """
    is_multi_word = result["multi_word_mention"]
    is_direct_match = result["direct_match"]

    # Global and conditional metrics, keyed by the label stored in "Type".
    metrics_results = {
        "Single Word": _compute_metrics(result.filter(~is_multi_word)),
        "Multiple Word": _compute_metrics(result.filter(is_multi_word)),
        "Direct Match": _compute_metrics(result.filter(is_direct_match)),
        "Undirect Match": _compute_metrics(result.filter(~is_direct_match)),
        "Overall": _compute_metrics(result),
    }

    # from_dict already produces a pandas frame, so no polars round trip is needed.
    summary_df = pd.DataFrame.from_dict(metrics_results, orient="index").reset_index(names="Type")

    # Compute metrics per category
    per_category_df = _compute_grouped_metrics(result, "category").to_pandas()

    # ignore_index avoids duplicate row labels from stacking two 0-based frames.
    result_df = pd.concat([summary_df, per_category_df], ignore_index=True)
    result_df["model"] = model
    result_df["dataset"] = dataset
    result_df["max_length"] = None
    result_df["n_beams"] = None
    result_df["constrained_inference"] = None
    result_df["training_stop"] = None
    result_df["data_augmentation"] = "No augmentation"
    return result_df

In [None]:
# Load experimental results
def load_results(
    results_dir: str = "data/results",
    models=("coder-all", "SapBERT", "scispacy"),
    datasets=("emea", "medline", "MM"),
):
    """
    Load the pre-computed results and append metrics for the baseline models.

    Reads ``{results_dir}/gen_results.pkl`` (a pickled pandas DataFrame) and,
    for every model/dataset pair, ``{results_dir}/{model}/{dataset}_all_results.parquet``,
    which is turned into a metrics table with ``compute_metrics``.

    Args:
        results_dir: directory containing the pickle and the per-model folders.
        models: baseline model folder names to load.
        datasets: dataset file prefixes to load for each model.

    Returns:
        pandas DataFrame: the pickled frame followed by one metrics table per
        (model, dataset) pair, in model-major order.
    """
    # NOTE: unpickling can execute arbitrary code — only load a gen_results.pkl
    # that comes from a trusted source.
    gen_results_df = pd.read_pickle(f"{results_dir}/gen_results.pkl")
    baseline_dfs = [
        compute_metrics(
            pl.read_parquet(f"{results_dir}/{model}/{dataset}_all_results.parquet"),
            model,
            dataset,
        )
        for model in models
        for dataset in datasets
    ]
    return pd.concat([gen_results_df, *baseline_dfs])

# Load all results
results = load_results()
# len(results) counts metric rows (one per Type value within each configuration),
# not configurations, so the message reports rows.
print(f"\n📈 Loaded {len(results)} result rows")

## 2. Performance Comparison

Compare the performance of baseline methods against SynCABEL across all evaluation datasets.

In [None]:
# Overall rows only, restricted to the two augmentation settings being compared.
filter_df = results.loc[
    results["Type"].eq("Overall")
    & results["data_augmentation"].isin(["No augmentation", "LLM augmented complete"])
]

## 3. Visualization of Results

Create visualizations comparing baseline and SynCABEL performance.

In [None]:
# Display label per (model, augmentation) pair. Pairs without an entry below
# keep the raw "<model>-<data_augmentation>" string.
results["Model"] = (results["model"] + "-" + results["data_augmentation"]).replace({
    "coder-all-No augmentation": "CODER",
    "SapBERT-No augmentation": "SapBERT",
    # Previously mapped to "SapBERT" (copy-paste slip), which merged two
    # different models under one label.
    "mt5-large-No augmentation": "mT5-large",
})


In [None]:
# Keep the two augmentation settings being compared and cast the generation
# parameters to strings. astype() returns a new frame, so `results` is left
# untouched and no column is assigned on a slice (no SettingWithCopyWarning).
all_results_df = results.loc[
    results["data_augmentation"].isin(["LLM augmented complete", "No augmentation"])
].astype({"max_length": str, "n_beams": str})

# Dropdown-bound selections used to filter the chart.

# "Type" dropdown: starts on the first option in sorted order.
type_options = sorted(all_results_df["Type"].unique().tolist())
dropdown_type = alt.binding_select(name="Semantic Type: ", options=type_options)
type_selector = alt.selection_point(
    value=str(type_options[0]),
    bind=dropdown_type,
    fields=["Type"],
)

# "dataset" dropdown: starts on the second option in sorted order.
dataset_options = sorted(all_results_df["dataset"].unique().tolist())
dropdown_dataset = alt.binding_select(name="Dataset: ", options=dataset_options)
dataset_selector = alt.selection_point(
    value=dataset_options[1],
    bind=dropdown_dataset,
    fields=["dataset"],
)


# Bar chart of the best accuracy per (model, data_augmentation) for the Type
# and dataset picked in the dropdowns.
# Built from all_results_df (the filtered, string-cast frame the dropdown
# options come from) rather than the raw `results` frame.
viz = alt.Chart(all_results_df).add_params(  # add_params replaces the deprecated add_selection
    type_selector,
    dataset_selector,
).transform_filter(
    type_selector & dataset_selector
).transform_joinaggregate(
    # Highest accuracy among all rows of the same model / augmentation ...
    max_accuracy='max(Accuracy)',
    groupby=['model', 'data_augmentation']
).transform_filter(
    # ... and keep only the row(s) that reach it.
    'datum.Accuracy === datum.max_accuracy'
).mark_bar().encode(
    x=alt.X("model:N", title="Model"),
    y=alt.Y("Accuracy:Q", title="Recall", scale=alt.Scale(domain=[0, 1])).stack(None),
    # One facet column (and one colour) per augmentation setting
    column=alt.Column('data_augmentation:N', title="Data Augmentation"),
    color=alt.Color('data_augmentation:N', legend=None),
    tooltip=["dataset", "Type", "model", "Accuracy", "Success", "Fail", "Support",
             "max_length", "n_beams", "constrained_inference", "data_augmentation"]
).properties(
    width=400,
    height=300
).interactive()

viz