# NMD-Region AlphaFold Amino Acid Composition Analysis

This notebook computes amino acid (AA)-level and physicochemical property-level composition changes between wild-type (WT) and variant NMD-region AlphaFold-predicted structures. It generates summary metrics, significance tests, and visualizations for the Minus1, Plus1, and Nonsense variant categories.

## Setup and Configuration

Import required libraries and define the amino acid list for analysis.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patheffects as pe
from matplotlib.patches import Patch
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import re

# One-letter codes of the 20 standard amino acids. The helpers in this notebook
# use this list to validate detected per-AA columns and as the default x-axis
# order of the per-AA plots.
AA_LIST = list("ACDEFGHIKLMNPQRSTVWY")

## Data Loading and Preprocessing

Load the NMD-region composition table and deduplicate entries using the `key` column to ensure unique variants for downstream analysis.

In [None]:
# Read the WT/Var NMD-region composition table from disk
df_original = pd.read_csv("WT_var_NMD_4c_AD.csv")
print(f"Loaded: {df_original.shape[0]} rows, {df_original.shape[1]} columns")

# Keep one row per 'key' (first occurrence wins); without a 'key' column the
# table is passed through unchanged.
if "key" not in df_original.columns:
    print("Warning: 'key' column not found, skipping deduplication")
    df_clean = df_original.copy()
else:
    print("Deduplicating using 'key' column...")
    df_clean = df_original.drop_duplicates(subset="key", keep="first").copy()
    print(f"After deduplication: {df_clean.shape[0]} rows")

# Working copy used by every downstream analysis cell
df_NMD_unique = df_clean.copy()

## Helper Functions

Define utility functions for:
- Column detection and resolution for amino acids and properties
- Computing amino acid-level metrics (Δ composition and log₂ fold-change)
- Computing property-level metrics (grouped AA classes)
- Statistical significance testing and visualization utilities

In [None]:
def _resolve_prop_cols(df, props, metric="diff", context="NMD"):
    """
    Resolve property names to the DataFrame columns holding the requested metric.

    Each entry of `props` can be either a full column name or a base name
    like 'Aromatic'. Base names are resolved based on metric/context:
      - diff   -> '{prop}_Diff_{ctx}'
      - log2fc -> '{prop}_log2FC_{ctx}'
    where ctx is 'NMD' if `context` contains "NMD" (case-insensitive), else 'FL'.
    Any `metric` other than "diff" (case-insensitive) is treated as log2FC.

    Resolution order per entry:
      1. the entry itself, if it is already a column of `df`;
      2. the exact candidate name '{prop}_{suffix}_{ctx}';
      3. a case-insensitive match of that same candidate name.
    Entries that cannot be resolved are silently dropped.

    Returns a list of existing column names, in the order of `props`.
    """
    ctx = "NMD" if "NMD" in context.upper() else "FL"
    suffix = "Diff" if metric.lower() == "diff" else "log2FC"

    cols = []
    for p in props:
        if p in df.columns:
            cols.append(p)  # already a full column name
            continue
        cand = f"{p}_{suffix}_{ctx}"
        if cand in df.columns:
            cols.append(cand)
            continue
        # Case-insensitive fallback, e.g. 'aromatic_diff_nmd'. The pattern is
        # restricted to the requested suffix so that asking for "diff" can
        # never resolve to a log2FC column (or vice versa).
        regex = re.compile(
            rf"^{re.escape(p)}_{re.escape(suffix)}_{ctx}$", re.IGNORECASE
        )
        match = [c for c in df.columns if regex.match(c)]
        if match:
            cols.append(match[0])
    # Every appended name was taken from df.columns, so no further filtering is needed.
    return cols

def _detect_aa_cols(df, region="nmd"):
    """
    Find the per-amino-acid WT and variant input columns of `df` for a region.

    Recognised column layouts (matched case-insensitively):
      - aa_AA_X_WT_nmd / aa_AA_X_vars_nmd
      - aa_X_WT_FL / aa_X_var_FL
    Columns whose name contains "diff" are never used as inputs.

    When several columns match the same amino acid and kind, the one whose
    trailing text scores highest wins. The score is the tuple
    (contains a token of the requested region, contains a token of the other
    region), compared as ordinary tuples; ties keep the first column seen.
    Any `region` other than "nmd" is handled as full length.

    Prints the detected amino acids and returns
    {"WT": {aa: column}, "Var": {aa: column}}.
    """
    region_key = (region or "").lower()
    nmd_tokens = ("nmd",)
    fl_tokens = ("fl", "full", "full_length")
    if region_key == "nmd":
        preferred, other = nmd_tokens, fl_tokens
    else:  # "fl"
        preferred, other = fl_tokens, nmd_tokens

    pattern = re.compile(r"^aa(?:_aa)?_([A-Z])_(WT|var|vars)(.*)$", flags=re.IGNORECASE)

    # best[kind][aa] -> (score, column name) of the best candidate seen so far
    best = {"wt": {}, "var": {}}

    for col in df.columns:
        if "diff" in col.lower():  # derived diffs are outputs, not inputs
            continue

        hit = pattern.match(col)
        if hit is None:
            continue

        letter = hit.group(1).upper()
        if letter not in AA_LIST:
            continue

        # anything that is not WT is a variant column (var/vars)
        bucket = best["wt"] if hit.group(2).lower() == "wt" else best["var"]
        tail = hit.group(3).lower()
        score = (
            any(tok in tail for tok in preferred),
            any(tok in tail for tok in other),
        )

        if letter not in bucket or bucket[letter][0] < score:
            bucket[letter] = (score, col)

    WT = {aa: col for aa, (_, col) in best["wt"].items()}
    VAR = {aa: col for aa, (_, col) in best["var"].items()}

    print(f"[detect] Found WT AAs ({len(WT)}): {sorted(WT.keys())}")
    print(f"[detect] Found Var AAs ({len(VAR)}): {sorted(VAR.keys())}")
    return {"WT": WT, "Var": VAR}

In [None]:
def add_property_level_metrics_fillna(
    df,
    aa_ctx="nmd",          # per-AA suffix: "nmd" or "fl"
    prop_ctx="NMD",        # property suffix for outputs: "NMD" or "FL"
    epsilon_pp=0.1,        # absolute floor in percentage points
    smooth_alpha=0.02,     # proportional smoothing as fraction of WT (e.g., 0.02 = 2%)
    overwrite=True,
    groups=None
):
    """
    Aggregate per-AA percent composition into property-group metrics.

    For every property P (a named list of amino acids) the columns
    'aa_AA_<X>_WT_<aa_ctx>' and 'aa_AA_<X>_vars_<aa_ctx>' of its members are
    summed row-wise, with NaN counted as 0 (AA absent), and these columns are
    written into `df`:
        P_WT_<prop_ctx>      summed WT composition
        P_Var_<prop_ctx>     summed variant composition
        P_Diff_<prop_ctx>    Var - WT
        P_ratio_<aa_ctx>     (Var + alpha*WT + eps) / (WT + alpha*WT + eps)
        P_log2FC_<prop_ctx>  log2 of the ratio above

    With overwrite=False a property is skipped when all five of its output
    columns already exist. `groups` maps property name -> list of one-letter
    AA codes; None selects the six built-in classes.

    `df` is modified in place and also returned.
    """
    if groups is None:
        groups = {
            "Nonpolar_Aliphatic": ["A", "V", "L", "I", "M"],
            "Aromatic":           ["F", "W", "Y"],
            "Polar_Uncharged":    ["S", "T", "N", "Q", "C"],
            "Positively_Charged": ["K", "R", "H"],
            "Negatively_Charged": ["D", "E"],
            "Special_Cases":      ["G", "P"],
        }

    for prop, members in groups.items():
        name_wt = f"{prop}_WT_{prop_ctx}"
        name_var = f"{prop}_Var_{prop_ctx}"
        name_diff = f"{prop}_Diff_{prop_ctx}"
        name_ratio = f"{prop}_ratio_{aa_ctx}"
        name_l2fc = f"{prop}_log2FC_{prop_ctx}"

        if not overwrite:
            outputs = (name_wt, name_var, name_diff, name_ratio, name_l2fc)
            if all(name in df.columns for name in outputs):
                continue

        # Row-wise group totals; a missing value means the AA contributes 0%
        wt_inputs = [f"aa_AA_{a}_WT_{aa_ctx}" for a in members]
        var_inputs = [f"aa_AA_{a}_vars_{aa_ctx}" for a in members]
        wt_total = df[wt_inputs].fillna(0).sum(axis=1).astype(float)
        var_total = df[var_inputs].fillna(0).sum(axis=1).astype(float)

        df[name_wt] = wt_total
        df[name_var] = var_total
        df[name_diff] = var_total - wt_total

        # Both numerator and denominator get the same WT-proportional and
        # absolute pseudo-counts, which keeps the ratio finite when WT is 0.
        wt_smoothed = wt_total + smooth_alpha * wt_total + epsilon_pp
        var_smoothed = var_total + smooth_alpha * wt_total + epsilon_pp
        fold = var_smoothed / wt_smoothed

        df[name_ratio] = fold
        df[name_l2fc] = np.log2(fold)

    return df

def add_aa_level_metrics(
    df,
    region="nmd",          # "nmd" or "fl"
    context_label=None,    # "NMD" or "FL" (auto if None)
    epsilon_pp=0.1,        # absolute floor in percentage points
    smooth_alpha=0.02,     # proportional floor (fraction of WT)
    keep_ratio=False,      # optionally store X_ratio_<region>
    overwrite=True
):
    """
    Add per-amino-acid change metrics to `df` (in place) and return it.

    The WT and variant input columns are located with _detect_aa_cols. For
    each amino acid X that has both, the following are written:
      X_Diff_<CTX>   = Var - WT   (percent composition)
      X_log2FC_<CTX> = log2( (Var + alpha*WT + eps) / (WT + alpha*WT + eps) )
      X_ratio_<region>  the smoothed ratio itself, only when keep_ratio=True
    NaN inputs count as 0 (absence). <CTX> is `context_label`, defaulting to
    "NMD" for region "nmd" and "FL" otherwise.

    With overwrite=False an amino acid is skipped when its requested output
    columns already exist. Amino acids lacking a WT or Var column are skipped
    and reported in a single warning line.
    """
    region = region.lower()
    if context_label is None:
        context_label = "FL" if region != "nmd" else "NMD"

    detected = _detect_aa_cols(df, region=region)
    wt_lookup, var_lookup = detected["WT"], detected["Var"]

    skipped = []
    for aa in AA_LIST:
        wt_name = wt_lookup.get(aa)
        var_name = var_lookup.get(aa)
        if not (wt_name and var_name):
            skipped.append(aa)
            continue

        out_diff = f"{aa}_Diff_{context_label}"
        out_ratio = f"{aa}_ratio_{region}"
        out_l2fc = f"{aa}_log2FC_{context_label}"

        if not overwrite:
            required = [out_diff, out_l2fc]
            if keep_ratio:
                required.append(out_ratio)
            if all(name in df.columns for name in required):
                continue

        wt_pct = df[wt_name].astype(float).fillna(0.0)
        var_pct = df[var_name].astype(float).fillna(0.0)

        # change in percent composition
        df[out_diff] = var_pct - wt_pct

        # identical pseudo-counts on both sides keep the ratio finite at WT == 0
        wt_smoothed = wt_pct + smooth_alpha * wt_pct + epsilon_pp
        var_smoothed = var_pct + smooth_alpha * wt_pct + epsilon_pp
        fold = var_smoothed / wt_smoothed
        if keep_ratio:
            df[out_ratio] = fold
        df[out_l2fc] = np.log2(fold)

    if skipped:
        print(f"[warn] Skipped AAs with missing WT/Var columns: {skipped}")

    return df

## Compute Amino Acid and Property Metrics

Derive Var−WT differences and log₂ fold-changes for each amino acid and for physicochemical property groups in the NMD region.

In [None]:
# AA-level composition metrics (creates *_Diff_NMD and *_log2FC_NMD)
# keep_ratio=False: the intermediate smoothed ratio columns (*_ratio_nmd) are not stored.
df_NMD_unique = add_aa_level_metrics(
    df_NMD_unique,
    region="nmd",
    context_label="NMD",
    epsilon_pp=0.1,
    smooth_alpha=0.02,
    keep_ratio=False
)

# Property-level metrics from AA composition
# Uses the same smoothing constants as the AA-level call above and writes
# <Property>_WT_NMD, _Var_NMD, _Diff_NMD, _ratio_nmd and _log2FC_NMD columns.
df_NMD_unique = add_property_level_metrics_fillna(
    df_NMD_unique,
    aa_ctx="nmd",
    prop_ctx="NMD",
    epsilon_pp=0.1,
    smooth_alpha=0.02,
    overwrite=True
)

# Preview the first rows, now including the derived metric columns
df_NMD_unique.head()

## Plotting Configuration and Utilities

Define color schemes, property groups, and utility functions for visualization.

In [None]:
# Base colour per variant family. The plotting functions use it for the
# Candidate boxes and a lightened version of it for the Control boxes.
FAMILY_COLOR = {'Minus1':'#4472C4', 'Plus1':'#C65911', 'Nonsense':'#70AD47'}

def lighten_color(color, amount=0.55):
    """
    Blend `color` with white and return the result as an (r, g, b) tuple.

    Each channel becomes (1 - amount) + amount * channel, so amount=1 returns
    the colour unchanged and amount=0 returns pure white. `color` is anything
    accepted by matplotlib.colors.to_rgb.
    """
    white_share = 1 - amount
    return tuple(white_share + amount * channel for channel in mcolors.to_rgb(color))

# Base names of the physicochemical property groups to plot. They match the
# default group names in add_property_level_metrics_fillna and are resolved to
# '<name>_Diff_NMD' / '<name>_log2FC_NMD' columns by _resolve_prop_cols.
props_nmd = [
    "Aromatic",
    "Nonpolar_Aliphatic",
    "Polar_Uncharged",
    "Positively_Charged",
    "Negatively_Charged",
    "Special_Cases"
]

In [None]:
def _parse_aa_label(col):
    """
    Return the one-letter amino acid code encoded in a column name.

    Tried in order:
      1. the letter after an 'aa_AA_' marker anywhere in the name
         ('aa_AA_A_log2FC_NMD' -> 'A');
      2. a single capital letter followed by '_' at the start of the name
         ('A_log2FC_NMD', 'A_Diff_FL' -> 'A');
      3. the first capital letter found anywhere in the name.
    If the name has no capital letter at all, it is returned unchanged.
    """
    for finder, pattern in (
        (re.search, r'aa_AA_([A-Z])_'),
        (re.match, r'^([A-Z])_'),
    ):
        hit = finder(pattern, col)
        if hit is not None:
            return hit.group(1)

    # fallback: first capital letter token
    first_capital = re.search(r'[A-Z]', col)
    if first_capital is None:
        return col
    return first_capital.group(0)

def stars(p):
    """
    Convert a p-value into a significance label.

    Returns "***" for p < 0.001, "**" for p < 0.01, "*" for p < 0.05 and
    "ns" otherwise (which includes NaN, since every comparison is then false).
    """
    for cutoff, label in ((1e-3, "***"), (1e-2, "**"), (5e-2, "*")):
        if p < cutoff:
            return label
    return "ns"

def _add_sig_stars_clear(
    ax, melted, order_x,
    show_ns=False,
    y_quantile=None,        # None => global max; else global quantile (e.g., 0.98)
    pad_frac=0.05,          # vertical gap above anchor (fraction of axis span)
    top_extra=0.06,         # extra headroom above stars (fraction of axis span)
    fontsize=13, color="black"
):
    """
    Write significance labels above each x category at one shared height.

    For every label in `order_x`, the "Candidate" and "Control" values of
    `melted` (columns "AA", "status", "value") are compared with a two-sided
    Mann-Whitney U test and the result is drawn as stars at x = position in
    `order_x`. All labels share the same y: the global maximum of
    melted["value"] (or its `y_quantile` quantile) plus `pad_frac` of the
    current y-axis span. Categories lacking either group are skipped, and
    "ns" is only drawn when `show_ns` is true.

    If anything was drawn, the top y-limit is raised (never lowered) so that
    `top_extra` of the original span remains above the labels.
    """
    y_bottom, y_top = ax.get_ylim()
    span = (y_top - y_bottom) if y_top > y_bottom else 1.0

    # ---- one global anchor for all categories ----
    values = melted["value"]
    if y_quantile is None:
        anchor = np.nanmax(values)
    else:
        anchor = np.nanquantile(values, y_quantile)
    y_star = anchor + pad_frac * span
    needed_top = y_star + top_extra * span

    halo = [pe.withStroke(linewidth=3, foreground="white")]
    drew_any = False

    for xpos, label in enumerate(order_x):
        rows = melted[melted["AA"] == label]
        candidate = rows.loc[rows["status"] == "Candidate", "value"].dropna().to_numpy()
        control = rows.loc[rows["status"] == "Control", "value"].dropna().to_numpy()
        if candidate.size == 0 or control.size == 0:
            continue

        p_value = mannwhitneyu(candidate, control, alternative="two-sided").pvalue
        mark = stars(p_value)
        if mark == "ns" and not show_ns:
            continue

        annotation = ax.text(xpos, y_star, mark, ha="center", va="bottom",
                             fontsize=fontsize, fontweight="bold", color=color, zorder=10)
        # white outline keeps the stars legible on top of outlier markers
        annotation.set_path_effects(halo)
        drew_any = True

    if drew_any and needed_top > y_top:
        ax.set_ylim(y_bottom, needed_top)

def add_clip_note(ax, values, q=None, pad=None, *, loc=(0.015, 0.035)):
    """
    Add a small 'Showing central N% of data' note to the axes.

    - ax: axes to annotate
    - values: 1D array/Series of the plotted y-values (NaN/inf allowed)
    - q: (low, high) quantile pair used for zooming, e.g. (0.1, 0.9);
         if None, nothing is drawn
    - pad: accepted for call-site compatibility; not used by the note
    - loc: axes coords (x, y) of the note's bottom-left corner (0..1);
           the default places it near the bottom-left of the axes

    Nothing is drawn either when `values` holds no finite number. N is
    (q[1] - q[0]) * 100, rounded to an integer for display.
    """
    if q is None:
        return

    # Only annotate when there is actual (finite) data behind the zoom.
    finite = pd.Series(values, dtype=float).replace([np.inf, -np.inf], np.nan).dropna()
    if finite.empty:
        return

    central_pct = (q[1] - q[0]) * 100.0
    note = f"Showing central {central_pct:.0f}% of data"

    ax.text(
        loc[0], loc[1], note,
        transform=ax.transAxes, ha="left", va="bottom",
        fontsize=9, color="0.25",
        bbox=dict(boxstyle="round,pad=0.25", fc="white", ec="0.8", alpha=0.9)
    )

## Visualization Functions

Functions for creating boxplot visualizations of physicochemical properties and individual amino acids.

In [None]:
def plot_properties_one_panel(
    df,
    prop_cols,                   # list like ["Aromatic", "Nonpolar_Aliphatic", ...] OR full col names
    category="Minus1",           # "Minus1" | "Plus1" | "Nonsense"
    context_label="NMD region",  # or "Full length"
    savepath=None,
    order_props=None,
    metric="diff",               # "diff" or "log2fc"
    clip_quantiles=None,          # e.g., (0.01, 0.99) to winsorize per-property
    inset=None
):
    """
    Draw Candidate-vs-Control boxplots of property-group metrics for one variant family.

    Parameters:
      - df: table with a 'source' column plus the property metric columns.
      - prop_cols: property base names or full column names; resolved with
        _resolve_prop_cols for the given `metric` and context.
      - category: family to plot. Rows are assigned to a family and to
        Candidate/Control from the text of 'source' (minus1 / plus1 / snv,
        with a '_control' suffix marking controls).
      - context_label: used in the title; it selects the NMD columns when it
        contains "NMD" (case-insensitive), otherwise the FL columns.
      - savepath: if given, the figure is saved there at 300 dpi.
      - order_props: x-axis order of the tidied property names (underscores
        replaced by spaces); alphabetical when None.
      - metric: "diff" or "log2fc"; also controls axis label, title and the
        zero baseline (drawn for log2fc only).
      - clip_quantiles: optional (low, high) quantiles; values are clipped to
        that range per property for display only.
      - inset: optional dict enabling a zoomed inset. Recognised keys:
        'width', 'height', 'loc', 'borderpad', and for the y-range one of
        'limits' (ylo, yhi), 'q' (quantile pair, with optional 'pad'), or
        'range' (total span centred on 0, default 2.0).

    Each property is annotated with the significance stars, group sizes
    (Candidate/Control) and p-value of a two-sided Mann-Whitney U test that
    is computed on the unclipped values.

    Returns (fig, ax), or (None, None) after printing a message when no
    column could be resolved or no row belongs to `category`.
    """
    # --- pick columns for the requested metric ---
    use_cols = _resolve_prop_cols(df, prop_cols, metric=metric,
                                  context="NMD" if "NMD" in context_label.upper() else "FL")
    if not use_cols:
        print("No matching columns found for the requested metric/props.")
        return None, None

    long = df[use_cols + ["source"]].copy()

    # map source → group & family
    # The '*_control' replacements run first so that the shorter family
    # patterns below cannot break up a control label.
    g = long["source"].astype(str)
    g = (g.str.replace("minus1_control","Minus1_Control",case=False,regex=False)
           .str.replace("plus1_control","Plus1_Control",case=False,regex=False)
           .str.replace("snv_control","Nonsense_Control",case=False,regex=False)
           .str.replace("minus1","Minus1",case=False,regex=False)
           .str.replace("plus1","Plus1",case=False,regex=False)
           .str.replace("snv", "Nonsense",case=False,regex=False)
        )
    long["group"]  = g
    long["family"] = long["group"].str.replace("_Control","",regex=False)
    long["status"] = np.where(long["group"].str.endswith("_Control"), "Control", "Candidate")

    # keep the chosen family
    long = long[long["family"] == category]
    if long.empty:
        print(f"No rows for category={category}.")
        return None, None

    # melt
    melted = long.melt(id_vars=["group","family","status","source"],
                       var_name="prop_col", value_name="value")

    # tidy property names (all replacements are literal, not regex)
    melted["property"] = (melted["prop_col"]
                          .str.replace("_Diff_NMD","",regex=False)
                          .str.replace("_Diff_FL","", regex=False)
                          .str.replace("_log2FC_NMD","",regex=False)
                          .str.replace("_log2FC_FL","", regex=False)
                          .str.replace("_"," ", regex=False))
    melted["value"] = pd.to_numeric(melted["value"], errors="coerce")

    # drop all-NaN properties
    good_props = melted.groupby("property")["value"].apply(lambda s: s.notna().any())
    melted = melted[melted["property"].isin(good_props[good_props].index)]

    if order_props is None:
        order_props = sorted(melted["property"].unique(), key=lambda s: s.lower())

    # ===== COMPUTE STATS ON FULL UNCLIPPED DATA FIRST =====
    # Significance labels come from the module-level `stars` helper so that
    # this panel uses the same thresholds as the per-AA panel.
    stats_per_prop = {}
    for prop in order_props:
        sub = melted[melted["property"] == prop]
        a = sub.loc[sub["status"]=="Candidate","value"].dropna().to_numpy()
        b = sub.loc[sub["status"]=="Control","value"].dropna().to_numpy()
        if a.size and b.size:
            p = mannwhitneyu(a, b, alternative="two-sided").pvalue
            stats_per_prop[prop] = {
                'p': float(p),
                'n_a': int(a.size),
                'n_b': int(b.size),
                'stars': stars(p)
            }
        else:
            # one group is empty: no test possible, nothing will be annotated
            stats_per_prop[prop] = {
                'p': np.nan,
                'n_a': int(a.size),
                'n_b': int(b.size),
                'stars': ""
            }

    # ===== NOW APPLY OPTIONAL WINSORIZING FOR VISUALIZATION =====
    if clip_quantiles is not None:
        qlo, qhi = clip_quantiles
        melted["value"] = (melted.groupby("property")["value"]
                           .transform(lambda s: s.clip(s.quantile(qlo), s.quantile(qhi))))

    # --- plotting ---
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.grid(True, axis="y", alpha=0.3, linestyle="--"); ax.set_axisbelow(True)

    base = FAMILY_COLOR.get(category, "#777")
    pal = {"Candidate": base, "Control": lighten_color(base)}

    sns.boxplot(
        data=melted, x="property", y="value",
        hue="status", hue_order=["Candidate","Control"],
        order=order_props, palette=pal, width=0.65, dodge=True, ax=ax, legend=False
    )

    # ===== USE PRE-COMPUTED STATS =====
    # All annotation boxes share one height just above the (possibly clipped) data.
    y_max = float(np.nanmax(melted["value"]))
    rng = y_max - float(np.nanmin(melted["value"]))
    annot_pad = 0.12 * (rng if rng > 0 else 1.0)
    y_annot = y_max + annot_pad

    for i, prop in enumerate(order_props):
        s = stats_per_prop[prop]
        if not np.isfinite(s['p']):
            continue

        # Color-code significance
        if s['stars'] == "***":
            box_color = "#ffcccc"  # Light red for highly significant
        elif s['stars'] == "**":
            box_color = "#ffe6cc"  # Light orange for very significant
        elif s['stars'] == "*":
            box_color = "#fff2cc"  # Light yellow for significant
        else:
            box_color = "#f0f0f0"  # Light gray for non-significant

        ax.text(i, y_annot,
                f"{s['stars']}\n"
                f"n={s['n_a']}/{s['n_b']}\n"
                f"p={s['p']:.3g}",
                ha="center", va="bottom",
                bbox=dict(boxstyle="round,pad=0.3", facecolor=box_color,
                     edgecolor="0.5", alpha=0.9, linewidth=0.8),
                fontsize=14,
                fontweight="bold" if s['stars'] != "ns" else "normal")

    # y label / baseline
    if metric.lower() == "log2fc":
        ax.axhline(0, ls="--", lw=1, color="0.3")
        ylab = "log₂ fold-change (Var / WT)"
        title_metric = "log₂(Var/WT)"
    else:
        ylab = "Δ AA composition (Var − WT, %)"
        title_metric = "Var − WT"

    # ----- optional zoom inset -----
    if inset:
        # where/size of inset
        width     = inset.get("width", "36%")
        height    = inset.get("height","36%")
        loc       = inset.get("loc", "upper right")
        borderpad = inset.get("borderpad", 0.8)

        axins = inset_axes(ax, width=width, height=height, loc=loc, borderpad=borderpad)

        # re-plot the same boxes inside the inset (no legend)
        sns.boxplot(
            data=melted, x="property", y="value",
            hue="status", hue_order=["Candidate","Control"],
            order=order_props, palette=pal, width=0.65, dodge=True,
            ax=axins, legend=False
        )

        # choose inset y-limits
        if "limits" in inset:
            ylo, yhi = inset["limits"]
        elif "q" in inset:                       # e.g., q=(0.25, 0.75)
            qlo, qhi = melted["value"].quantile(inset["q"])
            inset_pad = inset.get("pad", 0.5)    # add small margin
            ylo, yhi = qlo - inset_pad, qhi + inset_pad
        else:                                    # default: centered tight around 0
            r = inset.get("range", 2.0)          # total range (log2 units)
            ylo, yhi = -r/2, r/2

        axins.set_ylim(ylo, yhi)
        axins.axhline(0, ls="--", lw=1, color="0.3", zorder=0)

        # de-clutter inset
        axins.set_title("Zoom", fontsize=16, pad=2)
        axins.set_xlabel(""); axins.set_ylabel("")
        plt.setp(axins.get_xticklabels(), rotation=30, ha="right", fontsize=14)
        plt.setp(axins.get_yticklabels(), fontsize=8)

    # headroom for annotation - scale based on data range
    ymin, ymax = ax.get_ylim()
    data_range = ymax - ymin
    headroom = 0.25 * data_range  # Add 25% extra space for annotations
    ax.set_ylim(ymin, ymax + headroom)

    ax.set_xlabel("Physicochemical Property", fontsize=16, fontweight="bold")
    ax.set_ylabel(ylab, fontsize=16, fontweight="bold")
    ax.set_title(f"{category}: AA Properties — Candidate vs Control ({context_label}) [{title_metric}]",
                 fontsize=16, fontweight="bold", pad=10)
    # Restyle the existing tick labels in place (same approach as the inset
    # above) instead of re-setting them through set_xticklabels.
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    handles = [Patch(facecolor=pal["Candidate"], edgecolor="k", label="Candidate"),
               Patch(facecolor=pal["Control"],   edgecolor="k", label="Control")]
    ax.legend(handles=handles, title="Group", loc="upper left", bbox_to_anchor=(1.01, 1.0),fontsize=12, title_fontsize=13)

    plt.tight_layout()
    if savepath:
        fig.savefig(savepath, dpi=300, bbox_inches="tight")
    return fig, ax

In [None]:
def plot_aa_panel(
    df,
    aa_cols,                        # ["A_log2FC_NMD", "C_log2FC_NMD", ...] or ["A_Diff_NMD", ...]
    category="Minus1",              # "Minus1" | "Plus1" | "Nonsense"
    context_label="NMD region",
    metric="log2fc",                # "log2fc" or "diff" (labels/baseline only)
    savepath=None,
    order_aa=None,                  # custom x order
    # appearance / zoom
    figsize=(16, 7),
    hide_fliers=True,
    ylim=None,                      # set to e.g. (-2,2); if None, optional quantile zoom used
    q=None, pad=0.25,               # e.g., q=(0.1, 0.9) + pad for central zoom
    clip_whiskers=True,
    # stars
    show_ns=False,
    star_quantile=None,             # None => anchor at global max; else e.g. 0.98
    star_pad_frac=0.05,
    star_top_extra=0.06,
    star_fontsize=13
):
    """
    Draw Candidate-vs-Control boxplots for individual amino acids of one variant family.

    Parameters:
      - df: table with a 'source' column plus the per-AA metric columns.
      - aa_cols: per-AA metric columns to plot. Columns missing from `df`
        are skipped with a warning; if none of them exists a KeyError is raised.
      - category: family to plot. Rows are assigned to a family and to
        Candidate/Control from the text of 'source' (minus1 / plus1 / snv,
        with a '_control' suffix marking controls).
      - context_label: text shown in the title.
      - metric: "log2fc" or "diff"; only affects labels and the zero baseline
        (drawn for log2fc).
      - savepath: if given, the figure is saved there at 300 dpi.
      - order_aa: x-axis order; defaults to AA_LIST order. Amino acids without
        data are dropped either way.
      - figsize, hide_fliers: figure size; hide_fliers draws outlier markers
        with size 0.
      - ylim: explicit y-limits. If None and `q` is given, the axis is zoomed
        to the q-quantile window of all values widened by `pad`, and a note
        about the zoom is added.
      - clip_whiskers: explicitly clip lines/artists/collections to the axes.
      - show_ns, star_quantile, star_pad_frac, star_top_extra, star_fontsize:
        forwarded to _add_sig_stars_clear, which draws the per-AA
        significance stars at one shared height.

    Returns (fig, ax).

    Raises:
      - KeyError if 'source' or every requested AA column is missing.
      - ValueError if no row belongs to `category`.
    """
    # ---------- tidy to long ----------
    # add_aa_level_metrics skips amino acids that lack input columns, so a
    # requested metric column may legitimately be absent: plot what exists.
    missing_cols = [c for c in aa_cols if c not in df.columns]
    if missing_cols:
        if len(missing_cols) == len(aa_cols):
            raise KeyError(f"None of the requested AA columns are in df: {missing_cols}")
        print(f"[warn] Skipping AA columns missing from df: {missing_cols}")
        aa_cols = [c for c in aa_cols if c in df.columns]

    long = df[list(aa_cols) + ["source"]].copy()

    # The '*_control' replacements run before the bare family names so that
    # control labels are recognised as a whole.
    g = long["source"].astype(str)
    g = (g.str.replace("snv_control","Nonsense_Control",case=False,regex=False)
           .str.replace("snv","Nonsense",case=False,regex=False)
           .str.replace("minus1_control","Minus1_Control",case=False,regex=False)
           .str.replace("plus1_control","Plus1_Control",case=False,regex=False)
           .str.replace("minus1","Minus1",case=False,regex=False)
           .str.replace("plus1","Plus1",case=False,regex=False))
    long["group"]  = g
    long["family"] = long["group"].str.replace("_Control","",regex=False)
    long["status"] = np.where(long["group"].str.endswith("_Control"), "Control", "Candidate")

    long = long[long["family"] == category]
    if long.empty:
        raise ValueError(f"No rows for category={category}")

    melted = long.melt(id_vars=["group","family","status","source"],
                       var_name="aa_col", value_name="value")
    melted["AA"] = melted["aa_col"].map(_parse_aa_label)
    melted["value"] = pd.to_numeric(melted["value"], errors="coerce")

    # keep AAs that have any data
    good = melted.groupby("AA")["value"].apply(lambda s: s.notna().any())
    melted = melted[melted["AA"].isin(good[good].index)]

    # x order
    available = set(melted["AA"].unique())
    if order_aa is None:
        order_aa = [aa for aa in AA_LIST if aa in available]
    else:
        order_aa = [aa for aa in order_aa if aa in available]

    # ---------- plot ----------
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(True, axis="y", alpha=0.3, linestyle="--"); ax.set_axisbelow(True)

    base = FAMILY_COLOR.get(category, "#777")
    pal = {"Candidate": base, "Control": lighten_color(base)}

    sns.boxplot(
        data=melted, x="AA", y="value",
        hue="status", hue_order=["Candidate","Control"],
        order=order_aa, palette=pal, width=0.6, dodge=True, ax=ax,
        legend=False, fliersize=0 if hide_fliers else 3, linewidth=1.0
    )

    # zoom handling
    if ylim is not None:
        ax.set_ylim(*ylim)
    elif q is not None:
        qlo, qhi = melted["value"].quantile(q)
        ax.set_ylim(qlo - pad, qhi + pad)
        add_clip_note(ax, melted["value"], q=q, pad=pad)

    # clip whiskers/outliers to axes if requested
    if clip_whiskers:
        for artist in (ax.lines + ax.artists + ax.collections):
            try:
                artist.set_clip_on(True)
                artist.set_clip_path(ax.patch)
            except (AttributeError, TypeError):
                # best effort: leave an artist that cannot be clipped as it is
                continue

    # labels/titles
    ax.set_xlabel("Amino Acid", fontsize=12, fontweight="bold")
    ylabel = "log₂ fold-change (Var / WT)" if metric.lower()=="log2fc" else "Δ AA composition (Var − WT, %)"
    ax.set_ylabel(ylabel, fontsize=12, fontweight="bold")
    title_metric = "[log₂(Var/WT)]" if metric.lower()=="log2fc" else "[Var − WT]"
    # This panel shows individual amino acids, not property groups.
    ax.set_title(f"{category}: Individual Amino Acids — Candidate vs Control ({context_label}) {title_metric}",
                 fontsize=16, fontweight="bold", pad=6)

    if metric.lower() == "log2fc":
        ax.axhline(0, ls="--", lw=1, color="0.35", zorder=0)

    # ticks & legend
    plt.setp(ax.get_xticklabels(), rotation=0, ha="center")
    handles = [Patch(facecolor=pal["Candidate"], edgecolor="k", label="Candidate"),
               Patch(facecolor=pal["Control"],   edgecolor="k", label="Control")]
    ax.legend(handles=handles, title="Group", loc="upper left", bbox_to_anchor=(1.01, 1.0))

    # significance stars (uniform height); may raise the top y-limit
    _add_sig_stars_clear(
        ax=ax, melted=melted, order_x=order_aa,
        show_ns=show_ns, y_quantile=star_quantile,
        pad_frac=star_pad_frac, top_extra=star_top_extra,
        fontsize=star_fontsize, color="black"
    )

    plt.tight_layout()
    if savepath:
        fig.savefig(savepath, dpi=300, bbox_inches="tight")
    return fig, ax

## Physicochemical Property Group Visualizations

Generate boxplots showing changes in physicochemical property groups for each variant category.

In [None]:
# Generate property plots for each variant category
# metric="diff" plots Var − WT per property group; each figure is saved as
# "<category>_AAprops_diff.png" and then displayed.
for cat in ["Minus1", "Plus1", "Nonsense"]:
    plot_properties_one_panel(
        df_NMD_unique, 
        props_nmd,
        category=cat, 
        context_label="NMD region",
        metric="diff", 
        savepath=f"{cat}_AAprops_diff.png"
    )
    plt.show()

## Individual Amino Acid Visualizations

Create detailed boxplots for all 20 individual amino acids showing composition differences in the NMD region for Minus1 and Plus1 variants.

In [None]:
# Per-AA difference columns (Var − WT) for all 20 amino acids
aa_diff_nmd = [f"{aa}_Diff_NMD" for aa in AA_LIST]

# ===== MINUS1 - FULL RANGE =====
# Full range: outliers shown, no y-limits and no quantile zoom.
print("\nGenerating Minus1 full range plot...")
fig_m1_full, ax_m1_full = plot_aa_panel(
    df_NMD_unique,
    aa_cols=aa_diff_nmd,
    category="Minus1",
    context_label="NMD region",
    metric="diff",
    hide_fliers=False,
    ylim=None,
    q=None,
    clip_whiskers=False,
    show_ns=False,
    star_quantile=None,      # Anchor stars at the global maximum
    star_pad_frac=0.08,      # Larger gap above the anchor than the 0.05 default
    star_top_extra=0.15,     # Larger headroom above the stars than the 0.06 default
    star_fontsize=13,
    figsize=(18, 7),
    savepath="AD_Minus1_individualAA_Diff_NMD.png"
)
plt.show()


# ===== PLUS1 - FULL RANGE =====
# Same settings as Minus1 except for the star placement parameters.
print("\nGenerating Plus1 full range plot...")
fig_p1_full, ax_p1_full = plot_aa_panel(
    df_NMD_unique,
    aa_cols=aa_diff_nmd,
    category="Plus1",
    context_label="NMD region",
    metric="diff",
    hide_fliers=False,
    ylim=None,
    q=None,
    clip_whiskers=False,
    show_ns=False,
    star_quantile=0.99999,   # Anchor stars at the 99.999th percentile instead of the maximum
    star_pad_frac=0.0,       # No gap: stars start directly at the anchor
    star_top_extra=0.10,     # Headroom above the stars (default is 0.06)
    star_fontsize=13,
    figsize=(18, 7),
    savepath="AD_Plus1_individualAA_Diff_NMD.png"
)
plt.show()