In [1]:
# =====================================================================
# Cell 0：全局路径配置
# 作用：定义 Anti-CRISPR 数据集目录，供后续所有实验复用
# =====================================================================
BENCHMARKS_DIR = '/home/nemophila/projects/protein_bert/anticrispr_benchmarks'

In [2]:
# =====================================================================
# Cell 1：ProteinBERT 基线实验（原始微调流程）
# 作用：作为对照基线，评估不加额外融合策略时的性能
# =====================================================================
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import pandas as pd
from IPython.display import display

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# ===================== 1. 修改基准名称（对应你的数据集前缀） =====================
BENCHMARK_NAME = 'anticrispr_binary'  # 替换原signalP_binary为你的数据集前缀

# A local (non-global) binary output
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]  # 你的数据集也是二分类（0/1），无需修改
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

# ===================== 2. 定义你的数据集根目录（核心修改） =====================
# 替换原BENCHMARKS_DIR，指向你的anticrispr_benchmarks文件夹绝对路径
BENCHMARKS_DIR = '/home/nemophila/projects/protein_bert/anticrispr_benchmarks'

# Loading the dataset
# ===================== 3. 加载你自己的训练/测试集（路径适配） =====================
# 加载训练集（你的anticrispr_binary.train.csv）
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
# 从训练集中拆分验证集（和原逻辑一致，按标签分层拆分）
train_set, valid_set = train_test_split(train_set, stratify = train_set['label'], test_size = 0.1, random_state = 0)

# 加载测试集（你的anticrispr_binary.test.csv）
test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

# 打印数据集大小（验证是否加载成功）
print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

# ===================== 以下部分无需修改（模型训练/评估逻辑通用） =====================
# Loading the pre-trained model and fine-tuning it on the loaded dataset
pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs gives the model output access to the hidden layers (on top of the output)
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
]

finetune(model_generator, input_encoder, OUTPUT_SPEC, train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'], \
        seq_len = 512, batch_size = 32, max_epochs_per_stage = 40, lr = 1e-04, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = 1e-05, callbacks = training_callbacks)

# Evaluating the performance on the test-set
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_set['seq'], test_set['label'], \
        start_seq_len = 512, start_batch_size = 32)

print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)

2026-02-12 12:25:21.955062: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2026-02-12 12:25:21.955095: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


996 training set records, 111 validation set records, 286 test set records.
[2026_02_12-12:25:23] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:25:23] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:25:23] Training with frozen pretrained layers...


2026-02-12 12:25:23.535198: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2026-02-12 12:25:23.536420: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2026-02-12 12:25:23.557098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:ab:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-12 12:25:23.557191: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2026-02-12 12:25:23.557237: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or

Epoch 1/40
Epoch 2/40

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 4/40

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 5/40

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2026_02_12-12:26:11] Training the entire fine-tuned model...
[2026_02_12-12:26:17] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40

Epoch 00003: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40

Epoch 00004: ReduceLROnPlateau reducing learning rate to 1e-05.
[2026_02_12-12:27:43] Training on final epochs of sequence length 1024...
[2026_02_12-12:27:43] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-12:27:43] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,286,0.89142
All,286,0.89142


Confusion matrix:


Unnamed: 0,0,1
0,257,3
1,16,10


In [3]:
# =====================================================================
# 实验2：统一评估协议与微调工具函数（重构主线基础设施）
# 目标：统一数据、指标、阈值选择、CI估计，避免实验间不可比
# =====================================================================

import os
import numpy as np
import pandas as pd
from IPython.display import display

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, matthews_corrcoef, brier_score_loss

from tensorflow import keras

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune
from proteinbert.finetuning import encode_dataset, split_dataset_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

BENCHMARK_NAME = 'anticrispr_binary'
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

full_train = pd.read_csv(os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.train.csv')).dropna().drop_duplicates().reset_index(drop=True)
full_test = pd.read_csv(os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.test.csv')).dropna().drop_duplicates().reset_index(drop=True)

print(f'[Protocol] Train: {len(full_train)} ({(full_train.label==1).sum()}+/{(full_train.label==0).sum()}-)')
print(f'[Protocol] Test : {len(full_test)} ({(full_test.label==1).sum()}+/{(full_test.label==0).sum()}-)')

# 按计划固定 >=5 个随机种子，降低偶然性
SEEDS = [0, 11, 22, 33, 44]


def expected_calibration_error(y_true, y_prob, n_bins=10):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        m = (y_prob >= bins[i]) & (y_prob < bins[i+1])
        if not np.any(m):
            continue
        conf = y_prob[m].mean()
        acc = y_true[m].mean()
        ece += np.abs(acc - conf) * m.mean()
    return float(ece)


def select_best_threshold(y_true, y_prob, grid=None):
    if grid is None:
        grid = np.linspace(0.1, 0.9, 33)
    best_thr, best_f1 = 0.5, -1.0
    for thr in grid:
        y_cls = (y_prob >= thr).astype(int)
        f1 = f1_score(y_true, y_cls, zero_division=0)
        if f1 > best_f1:
            best_thr, best_f1 = float(thr), float(f1)
    return best_thr, best_f1


def summarize_metrics(y_true, y_prob, thr):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)
    y_cls = (y_prob >= thr).astype(int)
    return {
        'AUC': float(roc_auc_score(y_true, y_prob)),
        'AUPRC': float(average_precision_score(y_true, y_prob)),
        'F1': float(f1_score(y_true, y_cls, zero_division=0)),
        'MCC': float(matthews_corrcoef(y_true, y_cls)),
        'Brier': float(brier_score_loss(y_true, y_prob)),
        'ECE': float(expected_calibration_error(y_true, y_prob, n_bins=10)),
        'Threshold': float(thr),
    }


def bootstrap_ci(y_true, y_prob, metric_fn, n_boot=1000, seed=42):
    rng = np.random.default_rng(seed)
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = len(y_true)
    vals = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        yt = y_true[idx]
        yp = y_prob[idx]
        if len(np.unique(yt)) < 2:
            continue
        vals.append(metric_fn(yt, yp))
    if len(vals) == 0:
        return (np.nan, np.nan)
    return (float(np.percentile(vals, 2.5)), float(np.percentile(vals, 97.5)))


def predict_proteinbert_probs(model_generator, input_encoder, seqs, labels, start_seq_len=512, start_batch_size=32):
    df = pd.DataFrame({'seq': list(seqs), 'raw_y': list(labels)})
    y_true_all, y_prob_all = [], []
    for d, sl, bs in split_dataset_by_len(df, start_seq_len=start_seq_len, start_batch_size=start_batch_size):
        if len(d) == 0:
            continue
        X, yt, sw = encode_dataset(d['seq'], d['raw_y'], input_encoder, OUTPUT_SPEC, seq_len=sl, needs_filtering=False)
        m = (sw == 1)
        mdl = model_generator.create_model(sl)
        yp = mdl.predict(X, batch_size=bs).flatten()
        y_true_all.append(yt[m].flatten())
        y_prob_all.append(yp[m].flatten())
    return np.concatenate(y_true_all), np.concatenate(y_prob_all)


def run_finetune_once(train_df, valid_df, test_df, cfg):
    pretrained_model_generator, input_encoder = load_pretrained_model()
    mg = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=cfg.get('dropout', 0.5),
    )
    cbs = [
        keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-5, verbose=0),
        keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
    ]

    finetune(
        mg, input_encoder, OUTPUT_SPEC,
        train_df['seq'], train_df['label'],
        valid_df['seq'], valid_df['label'],
        seq_len=cfg.get('seq_len', 512),
        batch_size=cfg.get('batch_size', 32),
        max_epochs_per_stage=cfg.get('max_epochs', 40),
        lr=cfg.get('lr', 1e-4),
        begin_with_frozen_pretrained_layers=cfg.get('freeze_first', True),
        lr_with_frozen_pretrained_layers=cfg.get('lr_frozen', 1e-2),
        n_final_epochs=cfg.get('n_final_epochs', 1),
        final_seq_len=cfg.get('final_seq_len', 1024),
        final_lr=cfg.get('final_lr', 1e-5),
        callbacks=cbs,
    )

    yv_true, yv_prob = predict_proteinbert_probs(mg, input_encoder, valid_df['seq'], valid_df['label'])
    thr, _ = select_best_threshold(yv_true, yv_prob)
    yt_true, yt_prob = predict_proteinbert_probs(mg, input_encoder, test_df['seq'], test_df['label'])
    metrics = summarize_metrics(yt_true, yt_prob, thr)
    return mg, input_encoder, metrics, (yt_true, yt_prob)


[Protocol] Train: 1107 (205+/902-)
[Protocol] Test : 286 (26+/260-)


In [4]:
# =====================================================================
# 实验3：旧特征工程分支已移除
# 说明：按 ProteinBERT 冲刺计划，仅保留“微调 -> 校准 -> 同构集成”主线
# =====================================================================

print('[Exp3] 已删除旧的特征审计分支（历史实验cell）。')


[Exp3] 已删除旧的特征审计分支（历史实验cell）。


In [5]:
# =====================================================================
# 实验4：旧融合相关分支已移除
# 说明：按计划不使用手工特征融合与复杂 stacking
# =====================================================================

print('[Exp4] 已删除旧的特征重构/融合分支（历史实验cell）。')


[Exp4] 已删除旧的特征重构/融合分支（历史实验cell）。


In [6]:
# =====================================================================
# 实验5：ProteinBERT原始微调流程严格复现（主锚点）
# 目标：建立可信基线，后续所有优化必须与其比较
# =====================================================================

baseline_cfg = dict(
    name='baseline_cell1',
    dropout=0.5,
    seq_len=512,
    batch_size=32,
    max_epochs=40,
    lr=1e-4,
    freeze_first=True,
    lr_frozen=1e-2,
    n_final_epochs=1,
    final_seq_len=1024,
    final_lr=1e-5,
)

baseline_rows = []
baseline_probs = []
for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    _, _, met, (yt, yp) = run_finetune_once(tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, baseline_cfg)
    met['Seed'] = seed
    baseline_rows.append(met)
    baseline_probs.append((yt, yp))
    print(f"[Exp5][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}, thr={met['Threshold']:.2f}")

baseline_df = pd.DataFrame(baseline_rows)
print('\n[Exp5] 基线多随机种子结果:')
display(baseline_df[['Seed','AUC','AUPRC','F1','MCC','Brier','ECE','Threshold']])

print('[Exp5] 均值±标准差:')
for k in ['AUC','AUPRC','F1','MCC']:
    print(f'  {k}: {baseline_df[k].mean():.4f} ± {baseline_df[k].std(ddof=1):.4f}')

BASELINE_RESULT = baseline_df


[2026_02_12-12:28:37] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:28:37] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:28:37] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
[2026_02_12-12:29:30] Training the entire fine-tuned model...
[2026_02_12-12:29:35] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-12:30:56] Training on final epochs of sequence length 1024...
[2026_02_12-12:30:56] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-12:30:56] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp5][seed=0] AUC=0.8809, AUPRC=0.5894, F1=0.3958, thr=0.10
[2026_02_12-12:31:48] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:31:48] Vali

Unnamed: 0,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold
0,0,0.880917,0.58937,0.395833,0.357469,0.056421,0.03115,0.1
1,11,0.888166,0.567347,0.488889,0.452864,0.058994,0.036331,0.375
2,22,0.883432,0.575615,0.428571,0.37081,0.056735,0.029448,0.15
3,33,0.867751,0.589122,0.357143,0.323103,0.058063,0.042527,0.125
4,44,0.88432,0.587653,0.470588,0.439933,0.055433,0.017079,0.1


[Exp5] 均值±标准差:
  AUC: 0.8809 ± 0.0078
  AUPRC: 0.5818 ± 0.0099
  F1: 0.4282 ± 0.0538
  MCC: 0.3888 ± 0.0555


In [7]:
# =====================================================================
# 实验6：ProteinBERT微调优化矩阵（阶段2）
# 目标：系统搜索冻结/学习率/序列长度策略，并执行提升门槛
# =====================================================================

finetune_cfgs = [
    dict(name='G1_baseline_like', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G2_shorter_train', dropout=0.5, seq_len=512, batch_size=32, max_epochs=30, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G3_no_final_stage', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=0, final_seq_len=1024, final_lr=1e-5),
    dict(name='G4_lower_unfrozen_lr', dropout=0.5, seq_len=512, batch_size=32, max_epochs=35, lr=5e-5, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G5_less_dropout', dropout=0.35, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
    dict(name='G6_final_len512', dropout=0.5, seq_len=512, batch_size=32, max_epochs=40, lr=1e-4, freeze_first=True,  lr_frozen=1e-2, n_final_epochs=1, final_seq_len=512,  final_lr=1e-5),
    dict(name='G7_no_freeze', dropout=0.5, seq_len=512, batch_size=32, max_epochs=35, lr=5e-5, freeze_first=False, lr_frozen=1e-2, n_final_epochs=1, final_seq_len=1024, final_lr=1e-5),
]

rows = []
for cfg in finetune_cfgs:
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        _, _, met, _ = run_finetune_once(tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, cfg)
        rows.append({**{'Config': cfg['name'], 'Seed': seed}, **met})
        print(f"[Exp6][{cfg['name']}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}")

exp6_df = pd.DataFrame(rows)
summary6_flat = exp6_df.groupby('Config')[['AUC','AUPRC','F1','MCC','Brier','ECE']].agg(['mean','std'])
summary6_rank = summary6_flat.sort_values(('AUPRC', 'mean'), ascending=False)

print('\n[Exp6] 配置汇总（按AUPRC均值排序）:')
display(summary6_rank)

base_auc = float(BASELINE_RESULT['AUC'].mean())
base_auprc = float(BASELINE_RESULT['AUPRC'].mean())

accepted_cfgs = []
for cfg_name in summary6_rank.index:
    auc_m = float(summary6_rank.loc[cfg_name, ('AUC', 'mean')])
    auprc_m = float(summary6_rank.loc[cfg_name, ('AUPRC', 'mean')])
    if (auc_m > base_auc) and (auprc_m > base_auprc):
        accepted_cfgs.append(cfg_name)

if len(accepted_cfgs) == 0:
    accepted_cfgs = [summary6_rank.index[0]]

best_cfg_name = accepted_cfgs[0]
BEST_FINETUNE_CFG = [c for c in finetune_cfgs if c['name'] == best_cfg_name][0]
TOP_CFG_NAMES = accepted_cfgs[:3]

print(f"[Exp6] 基线门槛: AUC>{base_auc:.4f} 且 AUPRC>{base_auprc:.4f}")
print(f"[Exp6] 通过门槛配置: {accepted_cfgs}")
print(f"[Exp6] 当前候选最优: {best_cfg_name}")


[2026_02_12-12:44:22] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:44:22] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:44:22] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
[2026_02_12-12:45:11] Training the entire fine-tuned model...
[2026_02_12-12:45:16] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-12:46:15] Training on final epochs of sequence length 1024...
[2026_02_12-12:46:15] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-12:46:15] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[Exp6][G1_baseline_like][seed=0] AUC=0.8870, AUPRC=0.5926, F1=0.4595
[2026_02_12-12:47:08] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-12:47:08] Validat

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Config,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
G3_no_final_stage,0.890325,0.014937,0.616031,0.041537,0.441999,0.038072,0.408854,0.038001,0.061706,0.003975,0.064582,0.01224
G1_baseline_like,0.891834,0.010782,0.605903,0.030011,0.459245,0.05118,0.424317,0.046906,0.054627,0.00191,0.026364,0.005923
G5_less_dropout,0.891272,0.01171,0.60301,0.011123,0.462408,0.052523,0.419006,0.04818,0.054965,0.000801,0.028183,0.008401
G4_lower_unfrozen_lr,0.885178,0.010407,0.599872,0.026931,0.464697,0.056607,0.41895,0.045371,0.055539,0.002357,0.029476,0.008106
G2_shorter_train,0.88284,0.014274,0.58806,0.034734,0.47237,0.054602,0.42553,0.055952,0.056027,0.002511,0.034296,0.010432
G6_final_len512,0.883935,0.023477,0.586385,0.043816,0.461105,0.063541,0.416464,0.062308,0.059555,0.006016,0.040916,0.011098
G7_no_freeze,0.825178,0.042472,0.473645,0.060857,0.416582,0.05355,0.362359,0.056201,0.06581,0.003756,0.037198,0.006874


[Exp6] 基线门槛: AUC>0.8809 且 AUPRC>0.5818
[Exp6] 通过门槛配置: ['G3_no_final_stage', 'G1_baseline_like', 'G5_less_dropout', 'G4_lower_unfrozen_lr', 'G2_shorter_train', 'G6_final_len512']
[Exp6] 当前候选最优: G3_no_final_stage


In [8]:
# =====================================================================
# 实验7：概率校准（阶段4）
# 目标：对最佳2-3个候选执行 Platt / Isotonic 校准，观察AUC/AUPRC/F1稳定性
# =====================================================================

from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

cfg_map = {c['name']: c for c in finetune_cfgs}
calib_methods = ['none', 'platt', 'isotonic']

rows = []
for cfg_name in TOP_CFG_NAMES:
    cfg = cfg_map[cfg_name]
    for seed in SEEDS:
        tr_df, va_df = train_test_split(
            full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
        )
        mg, ie, _, (yt_true, yt_prob) = run_finetune_once(
            tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, cfg
        )
        yv_true, yv_prob = predict_proteinbert_probs(mg, ie, va_df['seq'], va_df['label'])

        for method in calib_methods:
            if method == 'none':
                calib_valid = yv_prob
                calib_test = yt_prob
            elif method == 'platt':
                platt = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
                platt.fit(yv_prob.reshape(-1, 1), yv_true.astype(int))
                calib_valid = platt.predict_proba(yv_prob.reshape(-1, 1))[:, 1]
                calib_test = platt.predict_proba(yt_prob.reshape(-1, 1))[:, 1]
            else:
                iso = IsotonicRegression(out_of_bounds='clip')
                iso.fit(yv_prob, yv_true.astype(int))
                calib_valid = iso.predict(yv_prob)
                calib_test = iso.predict(yt_prob)

            thr, _ = select_best_threshold(yv_true, calib_valid)
            met = summarize_metrics(yt_true, calib_test, thr)
            rows.append({'Config': cfg_name, 'Calib': method, 'Seed': seed, **met})
            print(f"[Exp7][{cfg_name}][{method}][seed={seed}] AUC={met['AUC']:.4f}, AUPRC={met['AUPRC']:.4f}, F1={met['F1']:.4f}")

exp7_df = pd.DataFrame(rows)
summary7 = exp7_df.groupby(['Config', 'Calib'])[['AUC','AUPRC','F1','MCC','Brier','ECE']].agg(['mean','std'])
summary7_rank = summary7.sort_values(('AUPRC', 'mean'), ascending=False)

print('\n[Exp7] 校准结果汇总（按AUPRC均值排序）:')
display(summary7_rank)

best_idx = summary7_rank.index[0]
BEST_CALIB_CONFIG = best_idx[0]
BEST_CALIB_METHOD = best_idx[1]
BEST_CALIB_SUMMARY = summary7_rank

print(f"[Exp7] 最佳校准组合: cfg={BEST_CALIB_CONFIG}, calib={BEST_CALIB_METHOD}")


[2026_02_12-14:26:48] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-14:26:48] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-14:26:48] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
[2026_02_12-14:27:35] Training the entire fine-tuned model...
[2026_02_12-14:27:40] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[Exp7][G3_no_final_stage][none][seed=0] AUC=0.8862, AUPRC=0.6163, F1=0.5217
[Exp7][G3_no_final_stage][platt][seed=0] AUC=0.8862, AUPRC=0.6163, F1=0.4839
[Exp7][G3_no_final_stage][isotonic][seed=0] AUC=0.8888, AUPRC=0.5807, F1=0.5079
[2026_02_12-14:28:47] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-14:28:47] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-14:28:47] Training with froz

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Config,Calib,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
G5_less_dropout,none,0.890799,0.013493,0.599071,0.015265,0.443654,0.076655,0.403499,0.073259,0.055554,0.00219,0.027581,0.007526
G5_less_dropout,platt,0.890799,0.013493,0.599071,0.015265,0.432657,0.055713,0.390972,0.051436,0.147471,0.016247,0.302451,0.027724
G3_no_final_stage,none,0.882367,0.011163,0.589878,0.038292,0.451395,0.061941,0.407201,0.06198,0.062586,0.008494,0.052371,0.010012
G3_no_final_stage,platt,0.882367,0.011163,0.589878,0.038292,0.433174,0.044164,0.397477,0.030728,0.140135,0.018265,0.286485,0.031194
G1_baseline_like,none,0.882101,0.013524,0.583824,0.031382,0.444939,0.03901,0.393941,0.042079,0.056667,0.003271,0.026243,0.009001
G1_baseline_like,platt,0.882101,0.013524,0.583824,0.031382,0.452534,0.050456,0.400794,0.053729,0.148975,0.016763,0.302825,0.025253
G3_no_final_stage,isotonic,0.873033,0.020775,0.532675,0.068018,0.449367,0.057167,0.403663,0.055256,0.068062,0.006491,0.063038,0.009169
G5_less_dropout,isotonic,0.881124,0.012931,0.528485,0.029905,0.435554,0.066677,0.393867,0.061348,0.067247,0.005726,0.063804,0.014645
G1_baseline_like,isotonic,0.867322,0.031059,0.485454,0.085427,0.440929,0.058411,0.39008,0.058853,0.074384,0.011676,0.071693,0.019054


[Exp7] 最佳校准组合: cfg=G5_less_dropout, calib=none


In [9]:
# =====================================================================
# 实验8：同构种子集成 + 上限判断（阶段5/6）
# 目标：只用同一ProteinBERT流程做seed ensemble，并给出是否接近0.952的现实结论
# =====================================================================

best_cfg = [c for c in finetune_cfgs if c['name'] == BEST_CALIB_CONFIG][0]

def apply_calibration(method, yv_true, yv_prob, yt_prob):
    if method == 'none':
        return yv_prob, yt_prob
    if method == 'platt':
        clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
        clf.fit(yv_prob.reshape(-1, 1), yv_true.astype(int))
        return (
            clf.predict_proba(yv_prob.reshape(-1, 1))[:, 1],
            clf.predict_proba(yt_prob.reshape(-1, 1))[:, 1],
        )

    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(yv_prob, yv_true.astype(int))
    return iso.predict(yv_prob), iso.predict(yt_prob)

single_rows = []
all_valid_probs = []
all_valid_true = []
all_test_probs = []
all_test_true = None

for seed in SEEDS:
    tr_df, va_df = train_test_split(
        full_train, test_size=0.1, stratify=full_train['label'], random_state=seed
    )
    mg, ie, _, (yt_true, yt_prob) = run_finetune_once(
        tr_df.reset_index(drop=True), va_df.reset_index(drop=True), full_test, best_cfg
    )
    yv_true, yv_prob = predict_proteinbert_probs(mg, ie, va_df['seq'], va_df['label'])

    calib_valid, calib_test = apply_calibration(BEST_CALIB_METHOD, yv_true, yv_prob, yt_prob)
    thr_seed, _ = select_best_threshold(yv_true, calib_valid)
    met_seed = summarize_metrics(yt_true, calib_test, thr_seed)
    single_rows.append({'Seed': seed, **met_seed})

    all_valid_probs.append(calib_valid)
    all_valid_true.append(yv_true)
    all_test_probs.append(calib_test)
    if all_test_true is None:
        all_test_true = yt_true

single_df = pd.DataFrame(single_rows)

# 集成阈值仅在验证集（跨seed拼接）选择，测试集只评一次
pool_valid_true = np.concatenate(all_valid_true)
pool_valid_prob = np.concatenate(all_valid_probs)
ens_thr, _ = select_best_threshold(pool_valid_true, pool_valid_prob)
ens_prob = np.mean(np.vstack(all_test_probs), axis=0)
ens_met = summarize_metrics(all_test_true, ens_prob, ens_thr)

auc_ci = bootstrap_ci(all_test_true, ens_prob, roc_auc_score, n_boot=500)
auprc_ci = bootstrap_ci(all_test_true, ens_prob, average_precision_score, n_boot=500)

res = pd.DataFrame([
    {
        'Model': 'Baseline(multi-seed mean)',
        'AUC': float(BASELINE_RESULT['AUC'].mean()),
        'AUPRC': float(BASELINE_RESULT['AUPRC'].mean()),
        'F1': float(BASELINE_RESULT['F1'].mean()),
        'MCC': float(BASELINE_RESULT['MCC'].mean()),
        'Brier': float(BASELINE_RESULT['Brier'].mean()),
        'ECE': float(BASELINE_RESULT['ECE'].mean()),
        'Threshold': float(BASELINE_RESULT['Threshold'].mean()),
        'AUC_CI95': np.nan,
        'AUPRC_CI95': np.nan,
    },
    {
        'Model': f'Best single ({BEST_CALIB_CONFIG}+{BEST_CALIB_METHOD})',
        'AUC': float(single_df['AUC'].mean()),
        'AUPRC': float(single_df['AUPRC'].mean()),
        'F1': float(single_df['F1'].mean()),
        'MCC': float(single_df['MCC'].mean()),
        'Brier': float(single_df['Brier'].mean()),
        'ECE': float(single_df['ECE'].mean()),
        'Threshold': float(single_df['Threshold'].mean()),
        'AUC_CI95': np.nan,
        'AUPRC_CI95': np.nan,
    },
    {
        'Model': f'SeedEnsemble ({BEST_CALIB_CONFIG}+{BEST_CALIB_METHOD})',
        **ens_met,
        'AUC_CI95': auc_ci,
        'AUPRC_CI95': auprc_ci,
    },
])

print('[Exp8] 最终结果对照:')
display(res[['Model','AUC','AUPRC','F1','MCC','Brier','ECE','Threshold','AUC_CI95','AUPRC_CI95']])

target_auc = 0.952
gap = target_auc - float(ens_met['AUC'])
if ens_met['AUC'] >= target_auc:
    print(f'[Exp8] 结论：已达到目标AUC {target_auc:.3f}。')
elif ens_met['AUC'] >= 0.92:
    print(f'[Exp8] 结论：已逼近目标，当前AUC={ens_met["AUC"]:.4f}，距{target_auc:.3f}还差{gap:.4f}。')
else:
    print(f'[Exp8] 结论：当前AUC={ens_met["AUC"]:.4f}，距{target_auc:.3f}还差{gap:.4f}。在现有数据+ProteinBERT约束下，存在明显性能天花板。')


[2026_02_12-15:07:43] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-15:07:43] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-15:07:43] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
[2026_02_12-15:08:12] Training the entire fine-tuned model...
[2026_02_12-15:08:17] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
[2026_02_12-15:09:14] Training on final epochs of sequence length 1024...
[2026_02_12-15:09:14] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 1022.
[2026_02_12-15:09:15] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 1022.
[2026_02_12-15:10:08] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_12-15:10:08] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_12-15:10:08] Training

Unnamed: 0,Model,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,AUC_CI95,AUPRC_CI95
0,Baseline(multi-seed mean),0.880917,0.581821,0.428205,0.388836,0.057129,0.031307,0.17,,
1,Best single (G5_less_dropout+none),0.881095,0.57291,0.438102,0.393739,0.05783,0.032222,0.175,,
2,SeedEnsemble (G5_less_dropout+none),0.895858,0.600935,0.451613,0.428139,0.054762,0.035908,0.125,"(0.8353628851254846, 0.9427035583103766)","(0.42386574714055747, 0.74843040541238)"


[Exp8] 结论：当前AUC=0.8959，距0.952还差0.0561。在现有数据+ProteinBERT约束下，存在明显性能天花板。
