In [1]:
# =====================================================================
# Cell 0：全局路径配置
# 作用：定义 Anti-CRISPR 数据集目录，供后续所有实验复用
# =====================================================================
BENCHMARKS_DIR = '/home/nemophila/projects/protein_bert/anticrispr_benchmarks'

In [2]:
# =====================================================================
# 实验2：统一评估协议与微调工具函数（重构主线基础设施）
# 目标：统一数据、指标、阈值选择、CI估计，避免实验间不可比
# =====================================================================

import os
import numpy as np
import pandas as pd
from IPython.display import display

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, matthews_corrcoef, brier_score_loss

from tensorflow import keras

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune
from proteinbert.finetuning import encode_dataset, split_dataset_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

BENCHMARK_NAME = 'anticrispr_binary'
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

full_train = pd.read_csv(os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.train.csv')).dropna().drop_duplicates().reset_index(drop=True)
full_test = pd.read_csv(os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.test.csv')).dropna().drop_duplicates().reset_index(drop=True)

print(f'[Protocol] Train: {len(full_train)} ({(full_train.label==1).sum()}+/{(full_train.label==0).sum()}-)')
print(f'[Protocol] Test : {len(full_test)} ({(full_test.label==1).sum()}+/{(full_test.label==0).sum()}-)')

# 按计划固定 >=5 个随机种子，降低偶然性
SEEDS = [0, 11, 22, 33, 44]


def expected_calibration_error(y_true, y_prob, n_bins=10):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        m = (y_prob >= bins[i]) & (y_prob < bins[i+1])
        if not np.any(m):
            continue
        conf = y_prob[m].mean()
        acc = y_true[m].mean()
        ece += np.abs(acc - conf) * m.mean()
    return float(ece)


def select_best_threshold(y_true, y_prob, grid=None):
    if grid is None:
        grid = np.linspace(0.1, 0.9, 33)
    best_thr, best_f1 = 0.5, -1.0
    for thr in grid:
        y_cls = (y_prob >= thr).astype(int)
        f1 = f1_score(y_true, y_cls, zero_division=0)
        if f1 > best_f1:
            best_thr, best_f1 = float(thr), float(f1)
    return best_thr, best_f1


def summarize_metrics(y_true, y_prob, thr):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)
    y_cls = (y_prob >= thr).astype(int)
    return {
        'AUC': float(roc_auc_score(y_true, y_prob)),
        'AUPRC': float(average_precision_score(y_true, y_prob)),
        'F1': float(f1_score(y_true, y_cls, zero_division=0)),
        'MCC': float(matthews_corrcoef(y_true, y_cls)),
        'Brier': float(brier_score_loss(y_true, y_prob)),
        'ECE': float(expected_calibration_error(y_true, y_prob, n_bins=10)),
        'Threshold': float(thr),
    }


def bootstrap_ci(y_true, y_prob, metric_fn, n_boot=1000, seed=42):
    rng = np.random.default_rng(seed)
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = len(y_true)
    vals = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        yt = y_true[idx]
        yp = y_prob[idx]
        if len(np.unique(yt)) < 2:
            continue
        vals.append(metric_fn(yt, yp))
    if len(vals) == 0:
        return (np.nan, np.nan)
    return (float(np.percentile(vals, 2.5)), float(np.percentile(vals, 97.5)))


def predict_proteinbert_probs(model_generator, input_encoder, seqs, labels, start_seq_len=512, start_batch_size=32):
    df = pd.DataFrame({'seq': list(seqs), 'raw_y': list(labels)})
    y_true_all, y_prob_all = [], []
    for d, sl, bs in split_dataset_by_len(df, start_seq_len=start_seq_len, start_batch_size=start_batch_size):
        if len(d) == 0:
            continue
        X, yt, sw = encode_dataset(d['seq'], d['raw_y'], input_encoder, OUTPUT_SPEC, seq_len=sl, needs_filtering=False)
        m = (sw == 1)
        mdl = model_generator.create_model(sl)
        yp = mdl.predict(X, batch_size=bs).flatten()
        y_true_all.append(yt[m].flatten())
        y_prob_all.append(yp[m].flatten())
    return np.concatenate(y_true_all), np.concatenate(y_prob_all)


def run_finetune_once(train_df, valid_df, test_df, cfg):
    pretrained_model_generator, input_encoder = load_pretrained_model()
    mg = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=cfg.get('dropout', 0.5),
    )
    cbs = [
        keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
        keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
    ]

    finetune(
        mg, input_encoder, OUTPUT_SPEC,
        train_df['seq'], train_df['label'],
        valid_df['seq'], valid_df['label'],
        seq_len=cfg.get('seq_len', 512),
        batch_size=cfg.get('batch_size', 32),
        max_epochs_per_stage=cfg.get('max_epochs', 40),
        lr=cfg.get('lr', 1e-4),
        begin_with_frozen_pretrained_layers=cfg.get('freeze_first', True),
        lr_with_frozen_pretrained_layers=cfg.get('lr_frozen', 1e-2),
        n_final_epochs=cfg.get('n_final_epochs', 1),
        final_seq_len=cfg.get('final_seq_len', 1024),
        final_lr=cfg.get('final_lr', 1e-5),
        callbacks=cbs,
    )

    yv_true, yv_prob = predict_proteinbert_probs(mg, input_encoder, valid_df['seq'], valid_df['label'])
    thr, _ = select_best_threshold(yv_true, yv_prob)
    yt_true, yt_prob = predict_proteinbert_probs(mg, input_encoder, test_df['seq'], test_df['label'])
    metrics = summarize_metrics(yt_true, yt_prob, thr)
    return mg, input_encoder, metrics, (yt_true, yt_prob)


2026-02-12 16:17:04.625769: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


[Protocol] Train: 1107 (205+/902-)
[Protocol] Test : 286 (26+/260-)


In [3]:
# =====================================================================
# 实验5：ProteinBERT原始微调流程严格复现（主锚点）
# 目标：建立可信基线，后续所有优化必须与其比较
# =====================================================================

baseline_cfg = dict(
    name='baseline_cell1',
    dropout=0.5,
    seq_len=512,
    batch_size=32,
    max_epochs=40,
    lr=1e-4,
    freeze_first=True,
    lr_frozen=1e-2,
    n_final_epochs=1,
    final_seq_len=1024,
    final_lr=1e-5,
)

baseline_rows = []
baseline_probs = []
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, (yt, yp) = run_finetune_once(tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, baseline_cfg)
    met['Seed'] = seed
    baseline_rows.append(met)
    baseline_probs.append((yt, yp))
    print(f"[Exp5][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}, thr={met['Threshold']:.2f}")

baseline_df = pd.DataFrame(baseline_rows)
print('\n[Exp5] 基线多随机种子结果:')
display(baseline_df[['Seed','AUC','AUPRC','F1','MCC','Brier','ECE','Threshold']])

print('[Exp5] 均值±标准差:')
for k in ['AUC','AUPRC','F1','MCC']:
    print(f'  {k}: {baseline_df[k].mean():.4f} ± {baseline_df[k].std(ddof=1):.4f}')

BASELINE_RESULT = baseline_df


[2026_02_12-16:17:05] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:17:05] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:17:05] Training with frozen pretrained layers...


2026-02-12 16:17:05.853946: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2026-02-12 16:17:05.855065: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2026-02-12 16:17:05.887904: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:2a:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-12 16:17:05.888114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:ab:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-12 16:17:05.888143: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2026-02-12 16:17:05.8

Epoch 1/40


2026-02-12 16:17:14.299950: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2026-02-12 16:17:15.072062: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2026-02-12 16:17:15.081958: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2026-02-12 16:17:15.082550: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2026-02-12 16:17:17.553897: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2026-02-12 16:17:17.793745: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
[2026_02_12-16:17:53] Training the entire fine-tuned model...
[2026_02_12-16:18:23] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-16:19:06] Training on final epochs of sequence length 1024...
[2026_02_12-16:19:06] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-16:19:06] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp5][seed=0] AUC=0.8954, AUPRC=0.6229, F1=0.4750, thr=0.12
[2026_02_12-16:20:53] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:20:53] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:20:53] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
[2026_02_12-16:21:12] Tra

Unnamed: 0,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold
0,0,0.895414,0.62289,0.475,0.437916,0.053453,0.030615,0.125
1,11,0.897781,0.634957,0.444444,0.401189,0.051656,0.028978,0.15
2,22,0.886095,0.576713,0.436782,0.39949,0.056361,0.031765,0.15
3,33,0.857396,0.556179,0.410959,0.352074,0.059625,0.0314,0.2
4,44,0.89068,0.576424,0.463768,0.411447,0.056425,0.028008,0.15


[Exp5] 均值±标准差:
  AUC: 0.8855 ± 0.0163
  AUPRC: 0.5934 ± 0.0337
  F1: 0.4462 ± 0.0249
  MCC: 0.4004 ± 0.0311


In [4]:
# =====================================================================
# 实验6：ProteinBERT微调优化矩阵（阶段2）
# 目标：系统搜索冻结/学习率/序列长度策略，并执行提升门槛
# =====================================================================

finetune_cfgs = [
    dict(name='G1_baseline_like', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G2_shorter_train', dropout=0.5, seq_len=512, batch_size=32, max_epochs=30, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G3_no_final_stage', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=0, final_seq_len=1024, final_lr=1e-5),
    dict(name='G4_lower_unfrozen_lr', dropout=0.5, seq_len=512, batch_size=32, max_epochs=35, lr=5e-5, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G5_less_dropout', dropout=0.35, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G6_final_len512', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=512,  final_lr=1e-5),
    dict(name='G7_no_freeze', dropout=0.5, seq_len=512, batch_size=32, max_epochs=35, lr=5e-5, freeze_first=False, lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
]

rows = []
for cfg in finetune_cfgs:
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        _, _, met, _ = run_finetune_once(tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, cfg)
        rows.append({**{'Config': cfg['name'], 'Seed': seed}, **met})
        print(f"[Exp6][{cfg['name']}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}")

exp6_df = pd.DataFrame(rows)
summary6_flat = exp6_df.groupby('Config')[['AUC','AUPRC','F1','MCC','Brier','ECE']].agg(['mean','std'])
summary6_rank = summary6_flat.sort_values(('AUPRC', 'mean'), ascending=False)

print('\n[Exp6] 配置汇总（按AUPRC均值排序）:')
display(summary6_rank)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())

accepted_cfgs = []
for cfg_name in summary6_rank.index:
    auc_m = float(summary6_rank.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(summary6_rank.loc[cfg_name, ('AUPRC', 'mean')])
    if (auc_m > base_auc) and (auprc_m > base_auprc):
        accepted_cfgs.append(cfg_name)

if len(accepted_cfgs) == 0:
    accepted_cfgs = [summary6_rank.index[0]]

best_cfg_name = accepted_cfgs[0]
BEST_FINETUNE_CFG = [c for c in finetune_cfgs if c['name'] == best_cfg_name][0]
TOP_CFG_NAMES = accepted_cfgs[:3]

print(f"[Exp6] 基线门槛: AUC>{base_auc:.4f} 且 AUPRC>{base_auprc:.4f}")
print(f"[Exp6] 通过门槛配置: {accepted_cfgs}")
print(f"[Exp6] 当前候选最优: {best_cfg_name}")


[2026_02_12-16:24:57] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:24:57] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:24:57] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-16:25:10] Training the entire fine-tuned model...
[2026_02_12-16:25:17] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
[2026_02_12-16:25:34] Training on final epochs of sequence length 1024...
[2026_02_12-16:25:34] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-16:25:34] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp6][G1_baseline_like][seed=0] AUC=0.8678, AUPRC=0.5466, F1=0.4000
[2026_02_12-16:25:58] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:25:58] Validation set: Fi

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
G3_no_final_stage,0.894704,0.012223,0.614949,0.032958,0.456577,0.038237,0.417258,0.03344,0.061475,0.007137,0.059713,0.021562
G5_less_dropout,0.88932,0.00813,0.598413,0.003875,0.465306,0.025534,0.422488,0.01693,0.055146,0.000975,0.031337,0.007622
G6_final_len512,0.887396,0.016965,0.596005,0.05894,0.44098,0.047289,0.393825,0.044037,0.059424,0.006366,0.04396,0.007493
G4_lower_unfrozen_lr,0.888905,0.011436,0.59452,0.026597,0.449115,0.028009,0.407272,0.035743,0.055318,0.002167,0.029636,0.008541
G2_shorter_train,0.887722,0.012208,0.59061,0.038169,0.42836,0.033807,0.385816,0.033827,0.056199,0.003217,0.030265,0.005283
G1_baseline_like,0.871006,0.011578,0.556245,0.022679,0.440798,0.058582,0.387653,0.061601,0.058386,0.001261,0.036168,0.007794
G7_no_freeze,0.81426,0.023213,0.433456,0.038321,0.41877,0.024159,0.366827,0.029069,0.066778,0.002446,0.040741,0.011332


[Exp6] 基线门槛: AUC>0.8855 且 AUPRC>0.5934
[Exp6] 通过门槛配置: ['G3_no_final_stage', 'G5_less_dropout', 'G6_final_len512', 'G4_lower_unfrozen_lr']
[Exp6] 当前候选最优: G3_no_final_stage


In [5]:
# =====================================================================
# 实验7：概率校准（阶段4）
# 目标：对最佳2-3个候选执行 Platt / Isotonic 校准，观察AUC/AUPRC/F1稳定性
# =====================================================================

from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

cfg_map = {c['name']: c for c in finetune_cfgs}
calib_methods = ['none', 'platt', 'isotonic']

rows = []
for cfg_name in TOP_CFG_NAMES:
    cfg = cfg_map[cfg_name]
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        mg, ie, _, (yt_true, yt_prob) = run_finetune_once(
            tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, cfg
        )
        yv_true, yv_prob = predict_proteinbert_probs(mg, ie, va_df['seq'], va_df['label'])

        for method in calib_methods:
            if method == 'none':
                calib_valid = yv_prob
                calib_test = yt_prob
            elif method == 'platt':
                platt = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
                platt.fit(yv_prob.reshape(-1, 1), yv_true.astype(int))
                calib_valid = platt.predict_proba(yv_prob.reshape(-1, 1))[:, 1]
                calib_test = platt.predict_proba(yt_prob.reshape(-1, 1))[:, 1]
            else:
                iso = IsotonicRegression(out_of_bounds='clip')
                iso.fit(yv_prob, yv_true.astype(int))
                calib_valid = iso.predict(yv_prob)
                calib_test = iso.predict(yt_prob)

            thr, _ = select_best_threshold(yv_true, calib_valid)
            met = summarize_metrics(yt_true, calib_test, thr)
            rows.append({'Config': cfg_name, 'Calib': method, 'Seed': seed, **met})
            print(f"[Exp7][{cfg_name}][{method}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}")

exp7_df = pd.DataFrame(rows)
summary7 = exp7_df.groupby(['Config', 'Calib'])[['AUC','AUPRC','F1','MCC','Brier','ECE']].agg(['mean','std'])
summary7_rank = summary7.sort_values(('AUPRC', 'mean'), ascending=False)

print('\n[Exp7] 校准结果汇总（按AUPRC均值排序）:')
display(summary7_rank)

best_idx = summary7_rank.index[0]
BEST_CALIB_CONFIG = best_idx[0]
BEST_CALIB_METHOD = best_idx[1]
BEST_CALIB_SUMMARY = summary7_rank

print(f"[Exp7] 最佳校准组合: cfg={BEST_CALIB_CONFIG}, calib={BEST_CALIB_METHOD}")


[2026_02_12-16:58:14] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:58:14] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:58:14] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
[2026_02_12-16:58:32] Training the entire fine-tuned model...
[2026_02_12-16:58:38] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[Exp7][G3_no_final_stage][none][seed=0] AUC=0.8895, AUPRC=0.6255, F1=0.4789
[Exp7][G3_no_final_stage][platt][seed=0] AUC=0.8895, AUPRC=0.6255, F1=0.4722
[Exp7][G3_no_final_stage][isotonic][seed=0] AUC=0.8871, AUPRC=0.5372, F1=0.4789
[2026_02_12-16:59:01] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-16:59:01] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_0

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Config,Calib,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
G6_final_len512,none,0.894172,0.009917,0.611697,0.023608,0.4572,0.045668,0.415311,0.04477,0.057987,0.00393,0.046542,0.016102
G6_final_len512,platt,0.894172,0.009917,0.611697,0.023608,0.461504,0.051053,0.421791,0.04722,0.141732,0.018847,0.293068,0.030923
G5_less_dropout,none,0.883876,0.014213,0.609678,0.030902,0.463546,0.057764,0.414373,0.062982,0.05464,0.00237,0.033448,0.010761
G5_less_dropout,platt,0.883876,0.014213,0.609678,0.030902,0.447472,0.057057,0.39864,0.061204,0.148337,0.020977,0.305844,0.031272
G3_no_final_stage,none,0.885414,0.00781,0.598436,0.029259,0.455197,0.048257,0.415915,0.054712,0.05976,0.001204,0.043111,0.008614
G3_no_final_stage,platt,0.885414,0.00781,0.598436,0.029259,0.443242,0.034983,0.399207,0.038818,0.138461,0.021581,0.284824,0.041227
G3_no_final_stage,isotonic,0.871598,0.010779,0.522667,0.020648,0.445924,0.053687,0.406148,0.06287,0.067296,0.011081,0.054377,0.013735
G6_final_len512,isotonic,0.87321,0.016903,0.503984,0.058379,0.462475,0.050868,0.424137,0.04716,0.067709,0.006576,0.057967,0.006391
G5_less_dropout,isotonic,0.863491,0.025102,0.501528,0.070689,0.426179,0.087738,0.383934,0.086227,0.071559,0.003378,0.061719,0.012929


[Exp7] 最佳校准组合: cfg=G6_final_len512, calib=none


In [6]:
# =====================================================================
# 实验8：同构种子集成 + 上限判断（阶段5/6）
# 目标：只用同一ProteinBERT流程做seed ensemble，并给出是否接近0.952的现实结论
# =====================================================================

best_cfg = [c for c in finetune_cfgs if c['name'] == BEST_CALIB_CONFIG][0]

def apply_calibration(method, yv_true, yv_prob, yt_prob):
    if method == 'none':
        return yv_prob, yt_prob
    if method == 'platt':
        clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
        clf.fit(yv_prob.reshape(-1, 1), yv_true.astype(int))
        return (
            clf.predict_proba(yv_prob.reshape(-1, 1))[:, 1],
            clf.predict_proba(yt_prob.reshape(-1, 1))[:, 1],
        )

    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(yv_prob, yv_true.astype(int))
    return iso.predict(yv_prob), iso.predict(yt_prob)

single_rows = []
all_valid_probs = []
all_valid_true = []
all_test_probs = []
all_test_true = None

for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    mg, ie, _, (yt_true, yt_prob) = run_finetune_once(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, best_cfg
    )
    yv_true, yv_prob = predict_proteinbert_probs(mg, ie, va_df['seq'], va_df['label'])

    calib_valid, calib_test = apply_calibration(BEST_CALIB_METHOD, yv_true, yv_prob, yt_prob)
    thr_seed, _ = select_best_threshold(yv_true, calib_valid)
    met_seed = summarize_metrics(yt_true, calib_test, thr_seed)
    single_rows.append({'Seed': seed, **met_seed})

    all_valid_probs.append(calib_valid)
    all_valid_true.append(yv_true)
    all_test_probs.append(calib_test)
    if all_test_true is None:
        all_test_true = yt_true

single_df = pd.DataFrame(single_rows)

# 集成阈值仅在验证集（跨seed拼接）选择，测试集只评一次
pool_valid_true = np.concatenate(all_valid_true)
pool_valid_prob = np.concatenate(all_valid_probs)
ens_thr, _ = select_best_threshold(pool_valid_true, pool_valid_prob)
ens_prob = np.mean(np.vstack(all_test_probs), axis=0)
ens_met = summarize_metrics(all_test_true, ens_prob, ens_thr)

auc_ci = bootstrap_ci(all_test_true, ens_prob, roc_auc_score, n_boot=500)
auprc_ci = bootstrap_ci(all_test_true, ens_prob, average_precision_score, n_boot=500)

res = pd.DataFrame([
    {
        'Model': 'Baseline(multi-seed mean)',
        'AUC': float(BASELINE_RESULT['AUC'].mean()),
        'AUPRC': float(BASELINE_RESULT['AUPRC'].mean()),
        'F1': float(BASELINE_RESULT['F1'].mean()),
        'MCC': float(BASELINE_RESULT['MCC'].mean()),
        'Brier': float(BASELINE_RESULT['Brier'].mean()),
        'ECE': float(BASELINE_RESULT['ECE'].mean()),
        'Threshold': float(BASELINE_RESULT['Threshold'].mean()),
        'AUC_CI95': np.nan,
        'AUPRC_CI95': np.nan,
    },
    {
        'Model': f'Best single ({BEST_CALIB_CONFIG}+{BEST_CALIB_METHOD})',
        'AUC': float(single_df['AUC'].mean()),
        'AUPRC': float(single_df['AUPRC'].mean()),
        'F1': float(single_df['F1'].mean()),
        'MCC': float(single_df['MCC'].mean()),
        'Brier': float(single_df['Brier'].mean()),
        'ECE': float(single_df['ECE'].mean()),
        'Threshold': float(single_df['Threshold'].mean()),
        'AUC_CI95': np.nan,
        'AUPRC_CI95': np.nan,
    },
    {
        'Model': f'SeedEnsemble ({BEST_CALIB_CONFIG}+{BEST_CALIB_METHOD})',
        **ens_met,
        'AUC_CI95': auc_ci,
        'AUPRC_CI95': auprc_ci,
    },
])

print('[Exp8] 最终结果对照:')
display(res[['Model','AUC','AUPRC','F1','MCC','Brier','ECE','Threshold','AUC_CI95','AUPRC_CI95']])

target_auc = 0.952
gap = target_auc - float(ens_met['AUC'])
if ens_met['AUC'] >= target_auc:
    print(f'[Exp8] 结论：已达到目标AUC {target_auc:.3f}。')
elif ens_met['AUC'] >= 0.92:
    print(f'[Exp8] 结论：已逼近目标，当前AUC={ens_met["AUC"]:.4f}，距{target_auc:.3f}还差{gap:.4f}。')
else:
    print(f'[Exp8] 结论：当前AUC={ens_met["AUC"]:.4f}，距{target_auc:.3f}还差{gap:.4f}。在现有数据+ProteinBERT约束下，存在明显性能天花板。')


[2026_02_12-17:12:07] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:12:07] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:12:07] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
[2026_02_12-17:12:21] Training the entire fine-tuned model...
[2026_02_12-17:12:28] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:12:41] Training on final epochs of sequence length 512...
[2026_02_12-17:12:41] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:12:41] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:13:05] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:13:05] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:13:05] 

Unnamed: 0,Model,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,AUC_CI95,AUPRC_CI95
0,Baseline(multi-seed mean),0.885473,0.593433,0.446191,0.400423,0.055504,0.030153,0.155,,
1,Best single (G6_final_len512+none),0.888284,0.588783,0.455818,0.412569,0.059612,0.044202,0.245,,
2,SeedEnsemble (G6_final_len512+none),0.897041,0.611559,0.444444,0.391254,0.05625,0.044402,0.275,"(0.8360565391374561, 0.9469835166891183)","(0.43242932493980923, 0.7626978315833826)"


[Exp8] 结论：当前AUC=0.8970，距0.952还差0.0550。在现有数据+ProteinBERT约束下，存在明显性能天花板。


In [13]:
# =====================================================================
# Cell 9：增强版工具函数（路线A~D所需的扩展helper）
# 作用：重载模块、定义run_finetune_v2、AUCEarlyStopping、数据增强函数
# =====================================================================

import importlib
import proteinbert.model_generation
importlib.reload(proteinbert.model_generation)

import tensorflow as tf
from sklearn.metrics import roc_auc_score

from proteinbert import FinetuningModelGenerator, load_pretrained_model, focal_loss
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs
from proteinbert.finetuning import encode_train_and_valid_sets


# ==================== AUC Early Stopping 回调 ====================
class AUCEarlyStopping(keras.callbacks.Callback):
    """按 validation AUC 做 Early Stopping（路线C1）"""
    def __init__(self, valid_X, valid_Y, patience=2, restore_best_weights=True):
        super().__init__()
        self.valid_X = valid_X
        self.valid_Y = valid_Y.flatten().astype(float)
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.best_auc = -1.0
        self.wait = 0
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        y_pred = self.model.predict(self.valid_X, verbose=0).flatten()
        y_true = self.valid_Y
        if len(np.unique(y_true)) < 2:
            return
        auc = roc_auc_score(y_true, y_pred)
        if logs is not None:
            logs['val_auc'] = auc
        if auc > self.best_auc:
            self.best_auc = auc
            self.wait = 0
            if self.restore_best_weights:
                self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True
                if self.restore_best_weights and self.best_weights is not None:
                    self.model.set_weights(self.best_weights)


# ==================== 数据增强函数 ====================
def augment_seqs_truncation(seqs, min_frac=0.8, rng=None):
    """序列随机截断到 min_frac~1.0 长度"""
    if rng is None:
        rng = np.random.default_rng()
    aug = []
    for s in seqs:
        L = len(s)
        new_L = max(1, int(L * rng.uniform(min_frac, 1.0)))
        start = rng.integers(0, L - new_L + 1) if new_L < L else 0
        aug.append(s[start:start + new_L])
    return aug

BLOSUM62_SIMILAR = {
    'A': 'GS', 'R': 'KHQ', 'N': 'DST', 'D': 'ENS', 'C': 'S',
    'Q': 'ERK', 'E': 'DQK', 'G': 'AS', 'H': 'RNY', 'I': 'LMV',
    'L': 'IMV', 'K': 'RQE', 'M': 'ILV', 'F': 'YW', 'P': 'A',
    'S': 'TNAG', 'T': 'SNA', 'W': 'FY', 'Y': 'FWH', 'V': 'ILM',
}

def augment_seqs_mutation(seqs, mut_rate=0.05, rng=None):
    """氨基酸随机替换（基于BLOSUM62相似性）"""
    if rng is None:
        rng = np.random.default_rng()
    aug = []
    for s in seqs:
        chars = list(s)
        for i, c in enumerate(chars):
            if rng.random() < mut_rate and c in BLOSUM62_SIMILAR:
                subs = BLOSUM62_SIMILAR[c]
                chars[i] = subs[rng.integers(0, len(subs))]
        aug.append(''.join(chars))
    return aug

def augment_both(seqs, min_frac=0.8, mut_rate=0.05, rng=None):
    """截断+突变联合增强"""
    if rng is None:
        rng = np.random.default_rng()
    seqs = augment_seqs_truncation(seqs, min_frac=min_frac, rng=rng)
    seqs = augment_seqs_mutation(seqs, mut_rate=mut_rate, rng=rng)
    return seqs


# ==================== 增强版微调函数 ====================
def run_finetune_v2(train_df, valid_df, test_df, cfg,
                    head_type='default', loss_type='bce',
                    manipulation_fn=get_model_with_hidden_layers_as_outputs,
                    label_smooth_eps=0.0,
                    augment_fn=None, augment_seed=None,
                    custom_callbacks=None):
    """
    增强版微调函数，支持:
    - head_type: 'default' / 'two_layer'
    - loss_type: 'bce' / 'focal'
    - manipulation_fn: 模型操控函数（None=仅用最后一层输出）
    - label_smooth_eps: label smoothing epsilon
    - augment_fn: 数据增强函数 f(seqs, rng=rng) -> seqs
    - custom_callbacks: 替换默认回调
    """
    pretrained_model_generator, input_encoder = load_pretrained_model()

    mg = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=manipulation_fn,
        dropout_rate=cfg.get('dropout', 0.5),
        head_type=head_type,
        loss_type=loss_type,
    )

    if custom_callbacks is not None:
        cbs = custom_callbacks
    else:
        cbs = [
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ]

    # 可选数据增强（仅对训练集）
    tr_df = train_df.copy()
    if augment_fn is not None:
        rng_aug = np.random.default_rng(augment_seed)
        tr_df['seq'] = augment_fn(list(tr_df['seq']), rng=rng_aug)

    # 可选 label smoothing
    tr_labels = tr_df['label'].copy().astype(float)
    if label_smooth_eps > 0:
        tr_labels = tr_labels * (1.0 - label_smooth_eps) + (1.0 - tr_labels) * label_smooth_eps

    finetune(
        mg, input_encoder, OUTPUT_SPEC,
        tr_df['seq'], tr_labels,
        valid_df['seq'], valid_df['label'],
        seq_len=cfg.get('seq_len', 512),
        batch_size=cfg.get('batch_size', 32),
        max_epochs_per_stage=cfg.get('max_epochs', 40),
        lr=cfg.get('lr', 1e-4),
        begin_with_frozen_pretrained_layers=cfg.get('freeze_first', True),
        lr_with_frozen_pretrained_layers=cfg.get('lr_frozen', 1e-2),
        n_final_epochs=cfg.get('n_final_epochs', 1),
        final_seq_len=cfg.get('final_seq_len', 1024),
        final_lr=cfg.get('final_lr', 1e-5),
        callbacks=cbs,
    )

    yv_true, yv_prob = predict_proteinbert_probs(mg, input_encoder, valid_df['seq'], valid_df['label'])
    thr, _ = select_best_threshold(yv_true, yv_prob)
    yt_true, yt_prob = predict_proteinbert_probs(mg, input_encoder, test_df['seq'], test_df['label'])
    metrics = summarize_metrics(yt_true, yt_prob, thr)
    return mg, input_encoder, metrics, (yt_true, yt_prob)


# ==================== AUC Early Stopping 版微调 ====================
def run_finetune_auc_es(train_df, valid_df, test_df, cfg,
                        head_type='default', loss_type='bce'):
    """使用 val_AUC early stopping 的微调流程（路线C1）"""
    pretrained_model_generator, input_encoder = load_pretrained_model()

    mg = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=cfg.get('dropout', 0.5),
        head_type=head_type,
        loss_type=loss_type,
    )

    seq_len = cfg.get('seq_len', 512)
    batch_size = cfg.get('batch_size', 32)

    encoded_train, encoded_valid = encode_train_and_valid_sets(
        train_df['seq'], train_df['label'],
        valid_df['seq'], valid_df['label'],
        input_encoder, OUTPUT_SPEC, seq_len
    )

    valid_X = encoded_valid[0]
    valid_Y = encoded_valid[1]

    # Stage 1: frozen
    if cfg.get('freeze_first', True):
        auc_cb1 = AUCEarlyStopping(valid_X, valid_Y, patience=2, restore_best_weights=True)
        cbs1 = [
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
            auc_cb1,
        ]
        mg.train(encoded_train, encoded_valid, seq_len, batch_size,
                 cfg.get('max_epochs', 40), lr=cfg.get('lr_frozen', 1e-2),
                 callbacks=cbs1, freeze_pretrained_layers=True)

    # Stage 2: unfrozen
    auc_cb2 = AUCEarlyStopping(valid_X, valid_Y, patience=2, restore_best_weights=True)
    cbs2 = [
        keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
        auc_cb2,
    ]
    mg.train(encoded_train, encoded_valid, seq_len, batch_size,
             cfg.get('max_epochs', 40), lr=cfg.get('lr', 1e-4),
             callbacks=cbs2, freeze_pretrained_layers=False)

    # Stage 3: final (optional)
    if cfg.get('n_final_epochs', 0) > 0:
        final_seq_len = cfg.get('final_seq_len', 1024)
        encoded_train_f, encoded_valid_f = encode_train_and_valid_sets(
            train_df['seq'], train_df['label'],
            valid_df['seq'], valid_df['label'],
            input_encoder, OUTPUT_SPEC, final_seq_len
        )
        final_batch_size = max(int(batch_size / (final_seq_len / seq_len)), 1)
        mg.train(encoded_train_f, encoded_valid_f, final_seq_len, final_batch_size,
                 cfg.get('n_final_epochs', 1), lr=cfg.get('final_lr', 1e-5),
                 callbacks=[], freeze_pretrained_layers=False)

    mg.optimizer_weights = None

    yv_true, yv_prob = predict_proteinbert_probs(mg, input_encoder, valid_df['seq'], valid_df['label'])
    thr, _ = select_best_threshold(yv_true, yv_prob)
    yt_true, yt_prob = predict_proteinbert_probs(mg, input_encoder, test_df['seq'], test_df['label'])
    metrics = summarize_metrics(yt_true, yt_prob, thr)
    return mg, input_encoder, metrics, (yt_true, yt_prob)


# ==================== 分层学习率微调 ====================
def run_finetune_layerwise_lr(train_df, valid_df, test_df, cfg,
                              backbone_lr=1e-5, head_lr=1e-3,
                              head_type='default', loss_type='bce'):
    """
    使用分层学习率的微调流程（路线C2）
    backbone_lr: 预训练层学习率
    head_lr: 分类头学习率
    通过 gradient scaling 实现：optimizer LR=head_lr, backbone 梯度乘以 backbone_lr/head_lr
    """
    pretrained_model_generator, input_encoder = load_pretrained_model()

    mg = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=cfg.get('dropout', 0.5),
        head_type=head_type,
        loss_type=loss_type,
    )

    seq_len = cfg.get('seq_len', 512)
    batch_size = cfg.get('batch_size', 32)

    encoded_train, encoded_valid = encode_train_and_valid_sets(
        train_df['seq'], train_df['label'],
        valid_df['seq'], valid_df['label'],
        input_encoder, OUTPUT_SPEC, seq_len
    )
    train_X, train_Y, train_sw = encoded_train

    # Stage 1: freeze backbone, train head with high LR
    if cfg.get('freeze_first', True):
        cbs1 = [
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ]
        mg.train(encoded_train, encoded_valid, seq_len, batch_size,
                 cfg.get('max_epochs', 40), lr=cfg.get('lr_frozen', 1e-2),
                 callbacks=cbs1, freeze_pretrained_layers=True)

    # Stage 2: unfrozen with layerwise LR via two-optimizer approach
    mg.dummy_epoch = (
        [x[:1] for x in train_X] if isinstance(train_X, list) else train_X[:1],
        train_Y[:1] if not isinstance(train_Y, list) else [y[:1] for y in train_Y],
    )
    model = mg.create_model(seq_len, freeze_pretrained_layers=False)

    # 分离 backbone 和 head 变量
    head_keywords = ['head-layer-norm', 'head-dense-hidden', 'head-dropout']
    # 最后一层 Dense(1) 也属于 head
    all_layer_names = [l.name for l in model.layers]
    last_dense_name = [l.name for l in model.layers if isinstance(l, keras.layers.Dense)][-1]
    last_dropout_name = [l.name for l in model.layers if isinstance(l, keras.layers.Dropout)][-1]
    head_keywords.extend([last_dense_name, last_dropout_name])

    backbone_vars = []
    head_vars = []
    for v in model.trainable_variables:
        is_head = any(kw in v.name for kw in head_keywords)
        if is_head:
            head_vars.append(v)
        else:
            backbone_vars.append(v)

    backbone_opt = keras.optimizers.Adam(learning_rate=backbone_lr)
    head_opt = keras.optimizers.Adam(learning_rate=head_lr)

    if isinstance(model.loss, str):
        loss_fn = keras.losses.get(model.loss)
    else:
        loss_fn = model.loss

    # 构建 tf.data.Dataset
    if isinstance(train_X, list):
        ds = tf.data.Dataset.from_tensor_slices(
            ({f'input_{i}': x for i, x in enumerate(train_X)}, train_Y, train_sw)
        )
    else:
        ds = tf.data.Dataset.from_tensor_slices((train_X, train_Y, train_sw))
    ds = ds.shuffle(len(train_Y)).batch(batch_size)

    best_val_loss = float('inf')
    wait = 0
    patience = 2
    best_weights = None

    for epoch in range(cfg.get('max_epochs', 40)):
        epoch_losses = []
        for batch in ds:
            if isinstance(train_X, list):
                bx = [batch[0][f'input_{i}'] for i in range(len(train_X))]
            else:
                bx = batch[0]
            by = batch[1]
            bsw = batch[2]

            with tf.GradientTape(persistent=True) as tape:
                y_pred = model(bx, training=True)
                per_sample = tf.keras.losses.binary_crossentropy(
                    tf.reshape(by, [-1, 1]), y_pred)
                loss = tf.reduce_mean(per_sample * tf.cast(tf.reshape(bsw, [-1]), tf.float32))

            if backbone_vars:
                bg = tape.gradient(loss, backbone_vars)
                backbone_opt.apply_gradients(
                    [(g, v) for g, v in zip(bg, backbone_vars) if g is not None])
            if head_vars:
                hg = tape.gradient(loss, head_vars)
                head_opt.apply_gradients(
                    [(g, v) for g, v in zip(hg, head_vars) if g is not None])
            del tape
            epoch_losses.append(float(loss))

        # 验证
        val_pred = model.predict(encoded_valid[0], batch_size=batch_size, verbose=0).flatten()
        val_y = encoded_valid[1].flatten()
        val_sw = encoded_valid[2].flatten()
        mask = val_sw == 1
        val_loss = float(np.mean(
            keras.losses.binary_crossentropy(val_y[mask].reshape(-1, 1),
                                              val_pred[mask].reshape(-1, 1)).numpy()))
        print(f'  Epoch {epoch+1}: train_loss={np.mean(epoch_losses):.4f}, val_loss={val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            wait = 0
            best_weights = model.get_weights()
        else:
            wait += 1
            if wait >= patience:
                break

    if best_weights is not None:
        model.set_weights(best_weights)
    mg.update_state(model)
    mg.optimizer_weights = None

    yv_true, yv_prob = predict_proteinbert_probs(mg, input_encoder, valid_df['seq'], valid_df['label'])
    thr, _ = select_best_threshold(yv_true, yv_prob)
    yt_true, yt_prob = predict_proteinbert_probs(mg, input_encoder, test_df['seq'], test_df['label'])
    metrics = summarize_metrics(yt_true, yt_prob, thr)
    return mg, input_encoder, metrics, (yt_true, yt_prob)


# ==================== 基准配置 ====================
BASE_CFG = dict(
    name='baseline', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40,
    lr=1e-4, freeze_first=True, lr_frozen=1e-2,
    n_final_epochs=1, final_seq_len=1024, final_lr=1e-5,
)

print('[Cell9] Enhanced helpers loaded: run_finetune_v2, run_finetune_auc_es, run_finetune_layerwise_lr')
print(f'        AUCEarlyStopping, augment_seqs_truncation, augment_seqs_mutation, augment_both')

[Cell9] Enhanced helpers loaded: run_finetune_v2, run_finetune_auc_es, run_finetune_layerwise_lr
        AUCEarlyStopping, augment_seqs_truncation, augment_seqs_mutation, augment_both


In [8]:
# =====================================================================
# 实验9：路线A —— 分类头改造 + Focal Loss（A1+A2）
# 目标：单变量对照，4种配置 × 5种子
#   - default_head + BCE（对照，同Exp5基线）
#   - two_layer + BCE（A1）
#   - default_head + focal（A2）
#   - two_layer + focal（A1+A2）
# =====================================================================

exp9_configs = [
    {'head_type': 'default',   'loss_type': 'bce',   'label': 'default+BCE'},
    {'head_type': 'two_layer', 'loss_type': 'bce',   'label': 'twolayer+BCE'},
    {'head_type': 'default',   'loss_type': 'focal', 'label': 'default+focal'},
    {'head_type': 'two_layer', 'loss_type': 'focal', 'label': 'twolayer+focal'},
]

exp9_rows = []
for ecfg in exp9_configs:
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        _, _, met, _ = run_finetune_v2(
            tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
            head_type=ecfg['head_type'], loss_type=ecfg['loss_type'],
        )
        row = {'Config': ecfg['label'], 'Seed': seed, **met}
        exp9_rows.append(row)
        print(f"[Exp9][{ecfg['label']}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}")

exp9_df = pd.DataFrame(exp9_rows)
exp9_summary = exp9_df.groupby('Config')[['AUC', 'AUPRC', 'F1', 'MCC']].agg(['mean', 'std'])
exp9_summary = exp9_summary.sort_values(('AUC', 'mean'), ascending=False)

print('\n[Exp9] 路线A结果汇总（按AUC均值排序）:')
display(exp9_summary)

# 与基线对比
base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())
print(f'\n[Exp9] 基线门槛: AUC>{base_auc:.4f}, AUPRC>{base_auprc:.4f}')
for cfg_name in exp9_summary.index:
    auc_m = float(exp9_summary.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(exp9_summary.loc[cfg_name, ('AUPRC', 'mean')])
    delta_auc = auc_m - base_auc
    delta_auprc = auprc_m - base_auprc
    flag = '✓' if (auc_m > base_auc and auprc_m > base_auprc) else '✗'
    print(f'  {flag} {cfg_name}: AUC={auc_m:.4f}(Δ{delta_auc:+.4f}), AUPRC={auprc_m:.4f}(Δ{delta_auprc:+.4f})')

# 记录最佳A配置供后续路线叠加
BEST_A_CONFIG = exp9_summary.index[0]
_best_a_parts = BEST_A_CONFIG.split('+')
BEST_A_HEAD = 'two_layer' if 'twolayer' in _best_a_parts[0] else 'default'
BEST_A_LOSS = 'focal' if 'focal' in _best_a_parts[1] else 'bce'
print(f'\n[Exp9] 路线A最佳: {BEST_A_CONFIG} (head={BEST_A_HEAD}, loss={BEST_A_LOSS})')

[2026_02_12-17:17:28] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:17:28] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:17:28] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-17:17:41] Training the entire fine-tuned model...
[2026_02_12-17:17:47] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-17:18:02] Training on final epochs of sequence length 1024...
[2026_02_12-17:18:02] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:18:03] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp9][default+BCE][seed=0] AUC=0.8796, AUPRC=0.5650, F1=0.4130
[2026_02_12-17:18:26] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:18:26] Validation set: Filtered out 0 of 

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:33:51] Training the entire fine-tuned model...
[2026_02_12-17:33:57] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:34:12] Training on final epochs of sequence length 1024...
[2026_02_12-17:34:12] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:34:12] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp9][twolayer+focal][seed=11] AUC=0.5000, AUPRC=0.0909, F1=0.0000
[2026_02_12-17:34:36] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:34:37] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:34:37] Training with frozen pretrained layers...


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:34:49] Training the entire fine-tuned model...
[2026_02_12-17:34:55] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:35:09] Training on final epochs of sequence length 1024...
[2026_02_12-17:35:09] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:35:10] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp9][twolayer+focal][seed=22] AUC=0.5000, AUPRC=0.0909, F1=0.0000
[2026_02_12-17:35:34] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:35:34] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:35:34] Training with frozen pretrained layers...


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:35:46] Training the entire fine-tuned model...
[2026_02_12-17:35:53] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:36:07] Training on final epochs of sequence length 1024...
[2026_02_12-17:36:07] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:36:07] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp9][twolayer+focal][seed=33] AUC=0.5000, AUPRC=0.0909, F1=0.0000
[2026_02_12-17:36:32] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:36:32] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:36:32] Training with frozen pretrained layers...


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:36:44] Training the entire fine-tuned model...
[2026_02_12-17:36:50] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:37:04] Training on final epochs of sequence length 1024...
[2026_02_12-17:37:04] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:37:04] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp9][twolayer+focal][seed=44] AUC=0.5000, AUPRC=0.0909, F1=0.0000

[Exp9] 路线A结果汇总（按AUC均值排序）:


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
default+focal,0.900207,0.008834,0.635933,0.030602,0.493479,0.079,0.452052,0.076766
default+BCE,0.88497,0.01059,0.591873,0.017222,0.432538,0.039212,0.385082,0.034283
twolayer+BCE,0.863521,0.016331,0.429994,0.102065,0.411189,0.040289,0.360119,0.036571
twolayer+focal,0.5,0.0,0.090909,0.0,0.0,0.0,0.0,0.0



[Exp9] 基线门槛: AUC>0.8855, AUPRC>0.5934
  ✓ default+focal: AUC=0.9002(Δ+0.0147), AUPRC=0.6359(Δ+0.0425)
  ✗ default+BCE: AUC=0.8850(Δ-0.0005), AUPRC=0.5919(Δ-0.0016)
  ✗ twolayer+BCE: AUC=0.8635(Δ-0.0220), AUPRC=0.4300(Δ-0.1634)
  ✗ twolayer+focal: AUC=0.5000(Δ-0.3855), AUPRC=0.0909(Δ-0.5025)

[Exp9] 路线A最佳: default+focal (head=default, loss=focal)


In [9]:
# =====================================================================
# 实验10：路线B —— 表示聚合方式对照（B1）
# 目标：对比多层拼接表示 vs 仅最后一层输出
#   - multi_layer: pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs（当前默认）
#   - last_layer:  pretraining_model_manipulation_function=None（仅用最后一层 output_annotations）
# 在路线A的最佳头/损失配置上进行对比
# =====================================================================

exp10_configs = [
    {'manipulation_fn': get_model_with_hidden_layers_as_outputs, 'label': 'multi_layer_concat'},
    {'manipulation_fn': None, 'label': 'last_layer_only'},
]

exp10_rows = []
for ecfg in exp10_configs:
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        _, _, met, _ = run_finetune_v2(
            tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
            head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
            manipulation_fn=ecfg['manipulation_fn'],
        )
        row = {'Config': ecfg['label'], 'Seed': seed, **met}
        exp10_rows.append(row)
        print(f"[Exp10][{ecfg['label']}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

exp10_df = pd.DataFrame(exp10_rows)
exp10_summary = exp10_df.groupby('Config')[['AUC', 'AUPRC', 'F1', 'MCC']].agg(['mean', 'std'])
exp10_summary = exp10_summary.sort_values(('AUC', 'mean'), ascending=False)

print('\n[Exp10] 路线B结果汇总:')
display(exp10_summary)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())
print(f'\n[Exp10] 基线门槛: AUC>{base_auc:.4f}, AUPRC>{base_auprc:.4f}')
for cfg_name in exp10_summary.index:
    auc_m = float(exp10_summary.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(exp10_summary.loc[cfg_name, ('AUPRC', 'mean')])
    delta_auc = auc_m - base_auc
    delta_auprc = auprc_m - base_auprc
    flag = '✓' if (auc_m > base_auc and auprc_m > base_auprc) else '✗'
    print(f'  {flag} {cfg_name}: AUC={auc_m:.4f}(Δ{delta_auc:+.4f}), AUPRC={auprc_m:.4f}(Δ{delta_auprc:+.4f})')

# 记录最佳B配置
BEST_B_MANIPULATION = exp10_summary.index[0]
BEST_B_FN = get_model_with_hidden_layers_as_outputs if 'multi' in BEST_B_MANIPULATION else None
print(f'\n[Exp10] 路线B最佳: {BEST_B_MANIPULATION}')

[2026_02_12-17:37:29] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:37:29] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:37:29] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
[2026_02_12-17:37:47] Training the entire fine-tuned model...
[2026_02_12-17:37:54] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-17:38:09] Training on final epochs of sequence length 1024...
[2026_02_12-17:38:09] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:38:09] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp10][multi_layer_concat][seed=0] AUC=0.8697, AUPRC=0.5480
[2026_02_12-17:38:33] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
multi_layer_concat,0.882544,0.01673,0.582456,0.035199,0.438324,0.080548,0.394165,0.073148
last_layer_only,0.778314,0.008467,0.319544,0.029748,0.358769,0.050855,0.302848,0.054886



[Exp10] 基线门槛: AUC>0.8855, AUPRC>0.5934
  ✗ multi_layer_concat: AUC=0.8825(Δ-0.0029), AUPRC=0.5825(Δ-0.0110)
  ✗ last_layer_only: AUC=0.7783(Δ-0.1072), AUPRC=0.3195(Δ-0.2739)

[Exp10] 路线B最佳: multi_layer_concat


In [10]:
# =====================================================================
# 实验11：路线C1 —— Early Stopping 按 val_AUC 而非 val_loss
# 目标：对比 val_loss ES vs val_AUC ES
# 在路线A+B的最佳配置上进行对比
# =====================================================================

exp11_rows = []

# --- 对照组: val_loss ES（标准流程）---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
    )
    row = {'Config': 'val_loss_ES', 'Seed': seed, **met}
    exp11_rows.append(row)
    print(f"[Exp11][val_loss_ES][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- 实验组: val_AUC ES ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_auc_es(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
    )
    row = {'Config': 'val_AUC_ES', 'Seed': seed, **met}
    exp11_rows.append(row)
    print(f"[Exp11][val_AUC_ES][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

exp11_df = pd.DataFrame(exp11_rows)
exp11_summary = exp11_df.groupby('Config')[['AUC', 'AUPRC', 'F1', 'MCC']].agg(['mean', 'std'])
exp11_summary = exp11_summary.sort_values(('AUC', 'mean'), ascending=False)

print('\n[Exp11] 路线C1结果汇总:')
display(exp11_summary)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())
for cfg_name in exp11_summary.index:
    auc_m = float(exp11_summary.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(exp11_summary.loc[cfg_name, ('AUPRC', 'mean')])
    delta_auc = auc_m - base_auc
    delta_auprc = auprc_m - base_auprc
    print(f'  {cfg_name}: AUC={auc_m:.4f}(Δ{delta_auc:+.4f}), AUPRC={auprc_m:.4f}(Δ{delta_auprc:+.4f})')

BEST_C1_ES = exp11_summary.index[0]
print(f'\n[Exp11] 路线C1最佳: {BEST_C1_ES}')

[2026_02_12-17:49:24] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:49:24] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-17:49:24] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
[2026_02_12-17:49:45] Training the entire fine-tuned model...
[2026_02_12-17:49:51] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-17:50:04] Training on final epochs of sequence length 1024...
[2026_02_12-17:50:04] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-17:50:04] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp11][val_loss_ES][seed=0] AUC=0.9148, AUPRC=0.6587
[2026_02_12-17:50:28] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
val_loss_ES,0.893225,0.015093,0.604277,0.036264,0.473199,0.058343,0.443979,0.046444
val_AUC_ES,0.868077,0.021581,0.580685,0.034908,0.436495,0.030424,0.386207,0.041338


  val_loss_ES: AUC=0.8932(Δ+0.0078), AUPRC=0.6043(Δ+0.0108)
  val_AUC_ES: AUC=0.8681(Δ-0.0174), AUPRC=0.5807(Δ-0.0127)

[Exp11] 路线C1最佳: val_loss_ES


In [14]:
# =====================================================================
# 实验12：路线C2+C3 —— 分层学习率 + Label Smoothing
# 目标：在当前最佳A+B配置上叠加 C2/C3 进行对照
# C2: backbone_lr=1e-5, head_lr=1e-3（通过双优化器实现）
# C3: label smoothing epsilon=0.05
# =====================================================================

exp12_rows = []

# --- C2: 分层学习率 ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_layerwise_lr(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        backbone_lr=1e-5, head_lr=1e-3,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
    )
    row = {'Config': 'C2_layerwise_lr', 'Seed': seed, **met}
    exp12_rows.append(row)
    print(f"[Exp12][C2_layerwise_lr][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- C3: Label Smoothing (eps=0.05) ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        label_smooth_eps=0.05,
    )
    row = {'Config': 'C3_label_smooth_0.05', 'Seed': seed, **met}
    exp12_rows.append(row)
    print(f"[Exp12][C3_label_smooth][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- C2+C3 联合（如果单项有提升则测试联合） ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    # 对 label smoothing 后用标准finetune（C3），暂不叠加layerwise lr避免复杂度
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        label_smooth_eps=0.1,
    )
    row = {'Config': 'C3_label_smooth_0.10', 'Seed': seed, **met}
    exp12_rows.append(row)
    print(f"[Exp12][C3_label_smooth_0.10][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

exp12_df = pd.DataFrame(exp12_rows)
exp12_summary = exp12_df.groupby('Config')[['AUC', 'AUPRC', 'F1', 'MCC']].agg(['mean', 'std'])
exp12_summary = exp12_summary.sort_values(('AUC', 'mean'), ascending=False)

print('\n[Exp12] 路线C2+C3结果汇总:')
display(exp12_summary)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())
for cfg_name in exp12_summary.index:
    auc_m = float(exp12_summary.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(exp12_summary.loc[cfg_name, ('AUPRC', 'mean')])
    delta_auc = auc_m - base_auc
    delta_auprc = auprc_m - base_auprc
    flag = '✓' if (auc_m > base_auc and auprc_m > base_auprc) else '✗'
    print(f'  {flag} {cfg_name}: AUC={auc_m:.4f}(Δ{delta_auc:+.4f}), AUPRC={auprc_m:.4f}(Δ{delta_auprc:+.4f})')

print(f'\n[Exp12] 路线C2+C3最佳: {exp12_summary.index[0]}')

[2026_02_12-18:09:20] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-18:09:20] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
[2026_02_12-18:09:41] Incompatible number of optimizer weights - will not initialize them.
  Epoch 1: train_loss=0.2365, val_loss=0.3023
  Epoch 2: train_loss=0.1128, val_loss=0.3402
  Epoch 3: train_loss=0.0507, val_loss=0.4538
[Exp12][C2_layerwise_lr][seed=0] AUC=0.8749, AUPRC=0.5476
[2026_02_12-18:10:07] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-18:10:07] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
[2026_02_12-18:10:34] Incompatible number of optimizer weights - will not initialize them.
  Epoch 1: train_loss=0.2505, val

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
C2_layerwise_lr,0.879615,0.010276,0.56092,0.028182,0.466088,0.019316,0.423246,0.023154
C3_label_smooth_0.05,0.878254,0.012051,0.589226,0.015693,0.426944,0.038234,0.380454,0.042341
C3_label_smooth_0.10,0.856716,0.022028,0.555681,0.031896,0.412779,0.050837,0.363852,0.04198


  ✗ C2_layerwise_lr: AUC=0.8796(Δ-0.0059), AUPRC=0.5609(Δ-0.0325)
  ✗ C3_label_smooth_0.05: AUC=0.8783(Δ-0.0072), AUPRC=0.5892(Δ-0.0042)
  ✗ C3_label_smooth_0.10: AUC=0.8567(Δ-0.0288), AUPRC=0.5557(Δ-0.0378)

[Exp12] 路线C2+C3最佳: C2_layerwise_lr


In [15]:
# =====================================================================
# 实验13：路线D —— 轻量数据增强
# 目标：在最佳A+B配置上测试三种数据增强策略
#   - D1: 序列随机截断（80%~100%长度）
#   - D2: 氨基酸随机替换（BLOSUM62, 5%概率）
#   - D1+D2: 截断+替换联合
# =====================================================================

exp13_rows = []

# --- 对照: 无增强 ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        augment_fn=None,
    )
    row = {'Config': 'no_augment', 'Seed': seed, **met}
    exp13_rows.append(row)
    print(f"[Exp13][no_augment][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- D1: 序列随机截断 ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        augment_fn=augment_seqs_truncation,
        augment_seed=seed + 1000,
    )
    row = {'Config': 'D1_truncation', 'Seed': seed, **met}
    exp13_rows.append(row)
    print(f"[Exp13][D1_truncation][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- D2: 氨基酸随机替换 ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        augment_fn=augment_seqs_mutation,
        augment_seed=seed + 2000,
    )
    row = {'Config': 'D2_mutation', 'Seed': seed, **met}
    exp13_rows.append(row)
    print(f"[Exp13][D2_mutation][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

# --- D1+D2: 联合增强 ---
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, _ = run_finetune_v2(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, BASE_CFG,
        head_type=BEST_A_HEAD, loss_type=BEST_A_LOSS,
        manipulation_fn=BEST_B_FN,
        augment_fn=augment_both,
        augment_seed=seed + 3000,
    )
    row = {'Config': 'D1D2_both', 'Seed': seed, **met}
    exp13_rows.append(row)
    print(f"[Exp13][D1D2_both][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}")

exp13_df = pd.DataFrame(exp13_rows)
exp13_summary = exp13_df.groupby('Config')[['AUC', 'AUPRC', 'F1', 'MCC']].agg(['mean', 'std'])
exp13_summary = exp13_summary.sort_values(('AUC', 'mean'), ascending=False)

print('\n[Exp13] 路线D结果汇总:')
display(exp13_summary)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())
for cfg_name in exp13_summary.index:
    auc_m = float(exp13_summary.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(exp13_summary.loc[cfg_name, ('AUPRC', 'mean')])
    delta_auc = auc_m - base_auc
    delta_auprc = auprc_m - base_auprc
    flag = '✓' if (auc_m > base_auc and auprc_m > base_auprc) else '✗'
    print(f'  {flag} {cfg_name}: AUC={auc_m:.4f}(Δ{delta_auc:+.4f}), AUPRC={auprc_m:.4f}(Δ{delta_auprc:+.4f})')

print(f'\n[Exp13] 路线D最佳: {exp13_summary.index[0]}')

[2026_02_12-18:24:04] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-18:24:05] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-18:24:05] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
[2026_02_12-18:24:19] Training the entire fine-tuned model...
[2026_02_12-18:24:26] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-18:24:38] Training on final epochs of sequence length 1024...
[2026_02_12-18:24:38] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-18:24:38] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp13][no_augment][seed=0] AUC=0.8938, AUPRC=0.6154
[2026_02_12-18:25:02] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-18:25:02] Validation set: Filtered out 0 of 

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
D1D2_both,0.881509,0.018546,0.602812,0.06309,0.493947,0.080108,0.467214,0.093118
D1_truncation,0.879882,0.015484,0.592268,0.029464,0.466608,0.024914,0.423749,0.011904
no_augment,0.87784,0.020681,0.599056,0.034201,0.466506,0.046399,0.417536,0.044683
D2_mutation,0.871361,0.023827,0.577472,0.037196,0.469261,0.085237,0.426082,0.07877


  ✗ D1D2_both: AUC=0.8815(Δ-0.0040), AUPRC=0.6028(Δ+0.0094)
  ✗ D1_truncation: AUC=0.8799(Δ-0.0056), AUPRC=0.5923(Δ-0.0012)
  ✗ no_augment: AUC=0.8778(Δ-0.0076), AUPRC=0.5991(Δ+0.0056)
  ✗ D2_mutation: AUC=0.8714(Δ-0.0141), AUPRC=0.5775(Δ-0.0160)

[Exp13] 路线D最佳: D1D2_both


In [16]:
# =====================================================================
# 实验14：最终汇总 —— 所有路线结果对比与结论
# 目标：汇总 Exp5(基线) + Exp8(种子集成) + Exp9~13(路线A~D)
#       对比各路线的贡献，给出最终天花板结论
# =====================================================================

# ---- 汇总各路线最佳结果 ----
summary_rows = []

# Exp5 基线
summary_rows.append({
    'Route': 'Baseline (Exp5)',
    'AUC_mean': float(BASELINE_RESULT['AUC'].mean()),
    'AUC_std': float(BASELINE_RESULT['AUC'].std(ddof=1)),
    'AUPRC_mean': float(BASELINE_RESULT['AUPRC'].mean()),
    'AUPRC_std': float(BASELINE_RESULT['AUPRC'].std(ddof=1)),
})

# Exp8 种子集成
summary_rows.append({
    'Route': 'Seed Ensemble (Exp8)',
    'AUC_mean': 0.8959,  # 来自Exp8
    'AUC_std': np.nan,
    'AUPRC_mean': 0.6009,
    'AUPRC_std': np.nan,
})

# Exp9 路线A最佳
if len(exp9_df) > 0:
    best_a = exp9_df[exp9_df['Config'] == BEST_A_CONFIG]
    summary_rows.append({
        'Route': f'Route A: {BEST_A_CONFIG} (Exp9)',
        'AUC_mean': float(best_a['AUC'].mean()),
        'AUC_std': float(best_a['AUC'].std(ddof=1)),
        'AUPRC_mean': float(best_a['AUPRC'].mean()),
        'AUPRC_std': float(best_a['AUPRC'].std(ddof=1)),
    })

# Exp10 路线B最佳
if len(exp10_df) > 0:
    best_b = exp10_df[exp10_df['Config'] == BEST_B_MANIPULATION]
    summary_rows.append({
        'Route': f'Route B: {BEST_B_MANIPULATION} (Exp10)',
        'AUC_mean': float(best_b['AUC'].mean()),
        'AUC_std': float(best_b['AUC'].std(ddof=1)),
        'AUPRC_mean': float(best_b['AUPRC'].mean()),
        'AUPRC_std': float(best_b['AUPRC'].std(ddof=1)),
    })

# Exp11 路线C1最佳
if len(exp11_df) > 0:
    best_c1 = exp11_df[exp11_df['Config'] == BEST_C1_ES]
    summary_rows.append({
        'Route': f'Route C1: {BEST_C1_ES} (Exp11)',
        'AUC_mean': float(best_c1['AUC'].mean()),
        'AUC_std': float(best_c1['AUC'].std(ddof=1)),
        'AUPRC_mean': float(best_c1['AUPRC'].mean()),
        'AUPRC_std': float(best_c1['AUPRC'].std(ddof=1)),
    })

# Exp12 路线C2+C3最佳
if len(exp12_df) > 0:
    best_c2c3_name = exp12_summary.index[0]
    best_c2c3 = exp12_df[exp12_df['Config'] == best_c2c3_name]
    summary_rows.append({
        'Route': f'Route C2/C3: {best_c2c3_name} (Exp12)',
        'AUC_mean': float(best_c2c3['AUC'].mean()),
        'AUC_std': float(best_c2c3['AUC'].std(ddof=1)),
        'AUPRC_mean': float(best_c2c3['AUPRC'].mean()),
        'AUPRC_std': float(best_c2c3['AUPRC'].std(ddof=1)),
    })

# Exp13 路线D最佳
if len(exp13_df) > 0:
    best_d_name = exp13_summary.index[0]
    best_d = exp13_df[exp13_df['Config'] == best_d_name]
    summary_rows.append({
        'Route': f'Route D: {best_d_name} (Exp13)',
        'AUC_mean': float(best_d['AUC'].mean()),
        'AUC_std': float(best_d['AUC'].std(ddof=1)),
        'AUPRC_mean': float(best_d['AUPRC'].mean()),
        'AUPRC_std': float(best_d['AUPRC'].std(ddof=1)),
    })

final_df = pd.DataFrame(summary_rows)
final_df = final_df.sort_values('AUC_mean', ascending=False).reset_index(drop=True)

print('=' * 80)
print('[Exp14] ProteinBERT 剩余路线全部结果汇总')
print('=' * 80)
display(final_df[['Route', 'AUC_mean', 'AUC_std', 'AUPRC_mean', 'AUPRC_std']])

# ---- 结论 ----
best_route_auc = float(final_df['AUC_mean'].max())
best_route_name = final_df.loc[final_df['AUC_mean'].idxmax(), 'Route']
baseline_auc = float(BASELINE_RESULT['AUC'].mean())
target_auc = 0.952

print('\n' + '=' * 80)
print('[Exp14] 最终结论')
print('=' * 80)
print(f'  基线 AUC (Exp5, 5种子均值): {baseline_auc:.4f}')
print(f'  种子集成 AUC (Exp8):         0.8959')
print(f'  各路线最佳 AUC:               {best_route_auc:.4f} ({best_route_name})')
print(f'  目标 AUC:                     {target_auc:.3f}')
print(f'  最佳 vs 基线 Δ:              {best_route_auc - baseline_auc:+.4f}')
print(f'  最佳 vs 目标 差距:            {target_auc - best_route_auc:.4f}')
print()

# 标记所有超过基线的路线
improved_routes = final_df[final_df['AUC_mean'] > baseline_auc]
if len(improved_routes) > 0:
    print(f'  超过基线的路线 ({len(improved_routes)} 条):')
    for _, row in improved_routes.iterrows():
        print(f'    • {row["Route"]}: AUC={row["AUC_mean"]:.4f} (Δ{row["AUC_mean"]-baseline_auc:+.4f})')
else:
    print('  [!] 无路线稳定超过基线。')

print()
if best_route_auc >= target_auc:
    print(f'  ★ 已达到目标 AUC {target_auc:.3f}！')
elif best_route_auc >= 0.92:
    print(f'  → 已逼近目标，当前最佳 AUC={best_route_auc:.4f}。')
    print(f'    可尝试将最佳路线 + 种子集成进一步组合。')
else:
    print(f'  → 在当前数据 + ProteinBERT 约束下，天花板约为 {best_route_auc:.3f}~0.90（集成可到~0.896）。')
    print(f'    要逼近 {target_auc:.3f}，必须引入进化信息（PSSM）或换模型（ESM-2等）。')

print()
print('[Exp14] 论文可写内容:')
print('  1. ProteinBERT 在 Anti-CRISPR 预测上的系统性微调研究')
print('  2. 分类头改造 / 损失函数 / 表示聚合 / 训练策略 / 数据增强的消融实验')
print('  3. 证明 ProteinBERT 在该任务上的性能天花板，为后续引入进化信息提供依据')

[Exp14] ProteinBERT 剩余路线全部结果汇总


Unnamed: 0,Route,AUC_mean,AUC_std,AUPRC_mean,AUPRC_std
0,Route A: default+focal (Exp9),0.900207,0.008834,0.635933,0.030602
1,Seed Ensemble (Exp8),0.8959,,0.6009,
2,Route C1: val_loss_ES (Exp11),0.893225,0.015093,0.604277,0.036264
3,Baseline (Exp5),0.885473,0.016326,0.593433,0.033722
4,Route B: multi_layer_concat (Exp10),0.882544,0.01673,0.582456,0.035199
5,Route D: D1D2_both (Exp13),0.881509,0.018546,0.602812,0.06309
6,Route C2/C3: C2_layerwise_lr (Exp12),0.879615,0.010276,0.56092,0.028182



[Exp14] 最终结论
  基线 AUC (Exp5, 5种子均值): 0.8855
  种子集成 AUC (Exp8):         0.8959
  各路线最佳 AUC:               0.9002 (Route A: default+focal (Exp9))
  目标 AUC:                     0.952
  最佳 vs 基线 Δ:              +0.0147
  最佳 vs 目标 差距:            0.0518

  超过基线的路线 (3 条):
    • Route A: default+focal (Exp9): AUC=0.9002 (Δ+0.0147)
    • Seed Ensemble (Exp8): AUC=0.8959 (Δ+0.0104)
    • Route C1: val_loss_ES (Exp11): AUC=0.8932 (Δ+0.0078)

  → 在当前数据 + ProteinBERT 约束下，天花板约为 0.900~0.90（集成可到~0.896）。
    要逼近 0.952，必须引入进化信息（PSSM）或换模型（ESM-2等）。

[Exp14] 论文可写内容:
  1. ProteinBERT 在 Anti-CRISPR 预测上的系统性微调研究
  2. 分类头改造 / 损失函数 / 表示聚合 / 训练策略 / 数据增强的消融实验
  3. 证明 ProteinBERT 在该任务上的性能天花板，为后续引入进化信息提供依据
