# Anti-CRISPR Full Experiment Plan Demo

本 notebook 按实验编号拆分为独立 cell，覆盖 Stage0~Stage4 的完整实验清单。

- 每个编号实验对应一个独立 cell（名称与编号写在 cell 顶部注释中）
- 单独包含 RPSSM(110) 在两条线上的额外对照：
  - PSSM-only：`Ablation_RPSSM_110`
  - ProteinBERT+PSSM 融合：`Fusion_PSSM110`
- 流程环环相扣：先做 Stage1（ProteinBERT 本体 head/超参对比）选出最佳个体，再做 Stage2/Stage4 融合对照。


In [1]:
# Setup: imports, paths, constants
import os
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, brier_score_loss, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

from proteinbert import (
    OutputSpec,
    OutputType,
    FinetuningModelGenerator,
    FusionTrainConfig,
    attach_pssm_features,
    finetune,
    load_anticrispr_with_ids,
    load_feature_cache,
    load_pretrained_model,
    run_finetune_with_pssm,
)
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

PROJECT_ROOT = '/home/nemophila/projects/protein_bert'
BENCHMARKS_DIR = f'{PROJECT_ROOT}/anticrispr_benchmarks'
WORK_ROOT = f'{PROJECT_ROOT}/pssm_work'
FEATURE_VARIANTS = ['110', '310', '710', '1110']
SEEDS = [0, 11, 22, 33, 44]

# 结果收集容器：key=(Exp, Seed) -> row dict
RESULTS = {}

def record_result(exp: str, seed: int, metrics: Dict[str, float], extra: Dict[str, object] = None):
    row = {'Exp': exp, 'Seed': seed, **metrics}
    if extra:
        row.update(extra)
    RESULTS[(exp, int(seed))] = row

def to_results_df() -> pd.DataFrame:
    if len(RESULTS) == 0:
        return pd.DataFrame(columns=['Exp','Seed','AUC','AUPRC','F1','MCC','Brier','ECE','Threshold'])
    df = pd.DataFrame(list(RESULTS.values()))
    return df.sort_values(['Seed', 'Exp']).reset_index(drop=True)


2026-02-13 17:37:32.453643: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Shared helpers: metrics, thresholding, data loading, baseline/fusion runners
def expected_calibration_error(y_true, y_prob, n_bins=10):
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ids = np.digitize(y_prob, bins) - 1
    ece = 0.0
    n = len(y_true)
    for b in range(n_bins):
        m = ids == b
        if np.any(m):
            conf = float(np.mean(y_prob[m]))
            acc = float(np.mean(y_true[m]))
            ece += (np.sum(m) / n) * abs(acc - conf)
    return float(ece)

def evaluate_binary(y_true, y_prob, thr=0.5):
    y_cls = (y_prob >= thr).astype(int)
    return {
        'AUC': float(roc_auc_score(y_true, y_prob)),
        'AUPRC': float(average_precision_score(y_true, y_prob)),
        'F1': float(f1_score(y_true, y_cls)),
        'MCC': float(matthews_corrcoef(y_true, y_cls)),
        'Brier': float(brier_score_loss(y_true, y_prob)),
        'ECE': float(expected_calibration_error(y_true, y_prob, n_bins=10)),
        'Threshold': float(thr),
    }

def find_best_thr(y_true, y_prob):
    best_thr, best_f1 = 0.5, -1.0
    for thr in np.linspace(0.05, 0.95, 19):
        cur = f1_score(y_true, (y_prob >= thr).astype(int))
        if cur > best_f1:
            best_f1, best_thr = cur, float(thr)
    return best_thr

def load_feature_set(dim: str):
    parquet_path = f'{WORK_ROOT}/features/pssm_features_{dim}.parquet'
    csv_path = f'{WORK_ROOT}/features/pssm_features_{dim}.csv'
    cache_path = parquet_path if os.path.exists(parquet_path) else csv_path
    feat_df, feat_cols = load_feature_cache(cache_path)
    train, test = load_anticrispr_with_ids(BENCHMARKS_DIR, benchmark_name='anticrispr_binary')
    train = attach_pssm_features(train, feat_df, feat_cols)
    test = attach_pssm_features(test, feat_df, feat_cols)
    return train, test, feat_cols

def run_baseline_config(seed: int, head_type: str, dropout_rate: float, lr: float, max_epochs_per_stage: int = 8):
    train_df, test_df, _ = FEATURE_DATA['310']
    sub_train, sub_valid = train_test_split(
        train_df[['seq','label']], test_size=0.1, stratify=train_df['label'], random_state=seed
    )
    output_type = OutputType(False, 'binary')
    output_spec = OutputSpec(output_type, [0, 1])
    pmg, enc = load_pretrained_model(
        local_model_dump_dir=f'{PROJECT_ROOT}/proteinbert_models',
        download_model_dump_if_not_exists=True,
        validate_downloading=False,
    )
    mg = FinetuningModelGenerator(
        pmg,
        output_spec=output_spec,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=dropout_rate,
        head_type=head_type,
        loss_type='bce',
        lr=lr,
    )
    finetune(
        mg,
        enc,
        output_spec,
        sub_train['seq'],
        sub_train['label'],
        sub_valid['seq'],
        sub_valid['label'],
        seq_len=512,
        batch_size=8,
        max_epochs_per_stage=max_epochs_per_stage,
        begin_with_frozen_pretrained_layers=True,
        n_final_epochs=0,
    )
    model = mg.create_model(512)
    X_valid = enc.encode_X(sub_valid['seq'].tolist(), 512)
    valid_prob = model.predict(X_valid, batch_size=8, verbose=0).reshape(-1)
    thr = find_best_thr(sub_valid['label'].to_numpy(), valid_prob)
    X_test = enc.encode_X(test_df['seq'].tolist(), 512)
    test_prob = model.predict(X_test, batch_size=8, verbose=0).reshape(-1)
    return evaluate_binary(test_df['label'].to_numpy(), test_prob, thr=thr)

def run_pssm_only(seed: int, dim: str, top_k: int = None):
    train_df, test_df, feat_cols = FEATURE_DATA[dim]
    sub_train, sub_valid = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=seed)
    X_tr = sub_train[feat_cols].to_numpy(dtype=np.float32)
    X_va = sub_valid[feat_cols].to_numpy(dtype=np.float32)
    X_te = test_df[feat_cols].to_numpy(dtype=np.float32)
    y_tr = sub_train['label'].astype(int).to_numpy()
    y_va = sub_valid['label'].astype(int).to_numpy()
    y_te = test_df['label'].astype(int).to_numpy()
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_te)
    if top_k is not None:
        selector = SelectKBest(score_func=f_classif, k=min(top_k, X_tr.shape[1]))
        X_tr = selector.fit_transform(X_tr, y_tr)
        X_va = selector.transform(X_va)
        X_te = selector.transform(X_te)
    clf = LogisticRegression(max_iter=2000, solver='liblinear', random_state=seed)
    clf.fit(X_tr, y_tr)
    va_prob = clf.predict_proba(X_va)[:, 1]
    thr = find_best_thr(y_va, va_prob)
    te_prob = clf.predict_proba(X_te)[:, 1]
    return evaluate_binary(y_te, te_prob, thr=thr)

def run_fusion(seed: int, dim: str, cfg: FusionTrainConfig):
    train_df, test_df, feat_cols = FEATURE_DATA[dim]
    pmg, enc = load_pretrained_model(
        local_model_dump_dir=f'{PROJECT_ROOT}/proteinbert_models',
        download_model_dump_if_not_exists=True,
        validate_downloading=False,
    )
    return run_finetune_with_pssm(pmg, enc, train_df, test_df, feat_cols, seed=seed, cfg=cfg)

def summarize_by_exp(df: pd.DataFrame):
    if df.empty:
        return df
    return df.groupby('Exp')[['AUC','AUPRC','F1','MCC','Brier','ECE']].agg(['mean','std']).sort_values(('AUPRC','mean'), ascending=False)

# 载入 110/310/710/1110 四套特征（110 取 310 前 110 维）
FEATURE_DATA = {}
FEATURE_DATA['310'] = load_feature_set('310')
FEATURE_DATA['710'] = load_feature_set('710')
FEATURE_DATA['1110'] = load_feature_set('1110')
train_310, test_310, cols_310 = FEATURE_DATA['310']
FEATURE_DATA['110'] = (train_310.copy(), test_310.copy(), cols_310[:110])
print('Loaded feature dimensions:', {k: len(v[2]) for k, v in FEATURE_DATA.items()})


Loaded feature dimensions: {'310': 310, '710': 710, '1110': 1110, '110': 110}


In [3]:
# Exp 0.1 数据与样本对齐检查
for dim in ['110','310','710','1110']:
    tr, te, cols = FEATURE_DATA[dim]
    print(f'dim={dim}: train={tr.shape}, test={te.shape}, feat_dim={len(cols)}')
    print('  sample_id unique (train/test):', tr['sample_id'].nunique(), te['sample_id'].nunique())


dim=110: train=(1107, 313), test=(286, 313), feat_dim=110
  sample_id unique (train/test): 1107 286
dim=310: train=(1107, 313), test=(286, 313), feat_dim=310
  sample_id unique (train/test): 1107 286
dim=710: train=(1107, 713), test=(286, 713), feat_dim=710
  sample_id unique (train/test): 1107 286
dim=1110: train=(1107, 1113), test=(286, 1113), feat_dim=1110
  sample_id unique (train/test): 1107 286


In [4]:
# Exp 0.2 评估协议检查（指标、阈值、排序主指标）
metric_cols = ['AUC','AUPRC','F1','MCC','Brier','ECE','Threshold']
print('metrics:', metric_cols)
print('selection in this plan: prioritize AUPRC mean, and inspect MCC/ECE as tie-breakers')


metrics: ['AUC', 'AUPRC', 'F1', 'MCC', 'Brier', 'ECE', 'Threshold']
selection in this plan: prioritize AUPRC mean, and inspect MCC/ECE as tie-breakers


In [5]:
# Exp 0.3 输出结构与保存路径检查
print('result file target:', f'{WORK_ROOT}/features/full_plan_exp_results.csv')
print('summary file target:', f'{WORK_ROOT}/features/full_plan_exp_summary.csv')
print('result rows currently in memory:', len(RESULTS))


result file target: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_results.csv
summary file target: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_summary.csv
result rows currently in memory: 0


In [6]:
# Exp 1.1 Baseline_head_default
for seed in SEEDS:
    print(f'Exp1.1 seed={seed}')
    m = run_baseline_config(seed=seed, head_type='default', dropout_rate=0.4, lr=2e-5)
    record_result('Baseline_head_default', seed, m, extra={'Stage': '1', 'head_type': 'default'})
to_results_df().query("Exp == 'Baseline_head_default'")


Exp1.1 seed=0
[2026_02_13-17:37:34] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:37:34] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:37:34] Training with frozen pretrained layers...


2026-02-13 17:37:34.708573: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2026-02-13 17:37:34.709599: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2026-02-13 17:37:34.736944: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:2a:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-13 17:37:34.737089: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:ab:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-13 17:37:34.737110: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2026-02-13 17:37:34.7

Epoch 1/8


2026-02-13 17:37:41.779609: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2026-02-13 17:37:42.240245: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2026-02-13 17:37:42.247806: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2026-02-13 17:37:42.248450: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2026-02-13 17:37:44.285156: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2026-02-13 17:37:44.478613: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:38:27] Training the entire fine-tuned model...
[2026_02_13-17:38:48] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Exp1.1 seed=11
[2026_02_13-17:40:10] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:40:10] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:40:10] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:40:42] Training the entire fine-tuned model...
[2026_02_13-17:40:48] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Exp1.1 seed=22
[2026_02_13-17:41:43] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceed

Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type
0,Baseline_head_default,0,0.81287,0.526331,0.433333,0.37238,0.072586,0.060235,0.4,1,default
1,Baseline_head_default,11,0.783432,0.448844,0.38961,0.329296,0.085973,0.075089,0.3,1,default
2,Baseline_head_default,22,0.786095,0.447901,0.357143,0.29424,0.077597,0.066423,0.2,1,default
3,Baseline_head_default,33,0.813166,0.45975,0.4,0.342959,0.083882,0.074316,0.6,1,default
4,Baseline_head_default,44,0.802071,0.415521,0.361111,0.291936,0.073604,0.045049,0.3,1,default


In [7]:
# Exp 1.2 Baseline_head_two_layer
for seed in SEEDS:
    print(f'Exp1.2 seed={seed}')
    m = run_baseline_config(seed=seed, head_type='two_layer', dropout_rate=0.4, lr=2e-5)
    record_result('Baseline_head_two_layer', seed, m, extra={'Stage': '1', 'head_type': 'two_layer'})
to_results_df().query("Exp == 'Baseline_head_two_layer'")


Exp1.2 seed=0
[2026_02_13-17:46:16] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:46:16] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:46:16] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:46:47] Training the entire fine-tuned model...
[2026_02_13-17:46:53] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Exp1.2 seed=11
[2026_02_13-17:47:48] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:47:48] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:47:48] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:48:20] Training the entire fine-tun

Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type
1,Baseline_head_two_layer,0,0.897929,0.637807,0.535714,0.487139,0.052474,0.040079,0.25,1,two_layer
3,Baseline_head_two_layer,11,0.902811,0.609123,0.528302,0.480255,0.060129,0.060666,0.4,1,two_layer
5,Baseline_head_two_layer,22,0.878107,0.544454,0.433333,0.37238,0.065439,0.051847,0.3,1,two_layer
7,Baseline_head_two_layer,33,0.881213,0.609083,0.396396,0.379833,0.073061,0.068348,0.1,1,two_layer
9,Baseline_head_two_layer,44,0.884467,0.55496,0.384615,0.323077,0.073809,0.063993,0.55,1,two_layer


In [8]:
# Exp 1.3 Baseline_head_two_layer_lowdrop (dropout_rate=0.3)
for seed in SEEDS:
    print(f'Exp1.3 seed={seed}')
    m = run_baseline_config(seed=seed, head_type='two_layer', dropout_rate=0.3, lr=2e-5)
    record_result('Baseline_head_two_layer_lowdrop', seed, m, extra={'Stage': '1', 'head_type': 'two_layer'})
to_results_df().query("Exp == 'Baseline_head_two_layer_lowdrop'")


Exp1.3 seed=0
[2026_02_13-17:54:02] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:54:02] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:54:02] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:54:33] Training the entire fine-tuned model...
[2026_02_13-17:54:39] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Exp1.3 seed=11
[2026_02_13-17:55:33] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:55:34] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-17:55:34] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-17:56:05] Training the entire fine-tun

Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type
2,Baseline_head_two_layer_lowdrop,0,0.890089,0.599235,0.447761,0.391243,0.070092,0.055788,0.35,1,two_layer
5,Baseline_head_two_layer_lowdrop,11,0.893195,0.591658,0.512821,0.514895,0.076383,0.068628,0.8,1,two_layer
8,Baseline_head_two_layer_lowdrop,22,0.875888,0.576209,0.487805,0.456968,0.062837,0.041439,0.1,1,two_layer
11,Baseline_head_two_layer_lowdrop,33,0.84497,0.567736,0.371134,0.325054,0.063489,0.048359,0.05,1,two_layer
14,Baseline_head_two_layer_lowdrop,44,0.888757,0.596883,0.529412,0.48731,0.062511,0.049631,0.3,1,two_layer


In [9]:
# Exp 1.4 Baseline_head_two_layer_lowlr (lr=1e-5)
for seed in SEEDS:
    print(f'Exp1.4 seed={seed}')
    m = run_baseline_config(seed=seed, head_type='two_layer', dropout_rate=0.4, lr=1e-5)
    record_result('Baseline_head_two_layer_lowlr', seed, m, extra={'Stage': '1', 'head_type': 'two_layer'})
to_results_df().query("Exp == 'Baseline_head_two_layer_lowlr'")


Exp1.4 seed=0
[2026_02_13-18:01:43] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-18:01:43] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-18:01:43] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-18:02:15] Training the entire fine-tuned model...
[2026_02_13-18:02:21] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Exp1.4 seed=11
[2026_02_13-18:03:17] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-18:03:17] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-18:03:17] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-18:03:49] Training the entire fine-tun

Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type
3,Baseline_head_two_layer_lowlr,0,0.877219,0.582587,0.472222,0.42436,0.056973,0.027393,0.2,1,two_layer
7,Baseline_head_two_layer_lowlr,11,0.890385,0.629714,0.438356,0.384895,0.059524,0.046954,0.3,1,two_layer
11,Baseline_head_two_layer_lowlr,22,0.864941,0.571824,0.414634,0.365017,0.059419,0.040285,0.2,1,two_layer
15,Baseline_head_two_layer_lowlr,33,0.861095,0.555948,0.43038,0.381319,0.084513,0.081865,0.35,1,two_layer
19,Baseline_head_two_layer_lowlr,44,0.886686,0.5906,0.451613,0.393333,0.060585,0.047149,0.3,1,two_layer


In [10]:
# Stage 1 汇总并选最佳个体 A / 次优个体 B（用于后续 Stage 2/4）
stage1_df = to_results_df().query("Stage == '1'").copy()
stage1_summary = summarize_by_exp(stage1_df)
display(stage1_summary)
rank_df = stage1_summary.reset_index()
BEST_CONFIG_NAME = rank_df.iloc[0]['Exp']
SECOND_CONFIG_NAME = rank_df.iloc[1]['Exp'] if len(rank_df) > 1 else rank_df.iloc[0]['Exp']
print('BEST_CONFIG_NAME (A):', BEST_CONFIG_NAME)
print('SECOND_CONFIG_NAME (B):', SECOND_CONFIG_NAME)


Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Exp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Baseline_head_two_layer,0.888905,0.010843,0.591086,0.039723,0.455672,0.072013,0.408537,0.072036,0.064982,0.008992,0.056987,0.011225
Baseline_head_two_layer_lowdrop,0.87858,0.019915,0.586344,0.013734,0.469786,0.063145,0.435094,0.076819,0.067062,0.00607,0.052769,0.010226
Baseline_head_two_layer_lowlr,0.876065,0.012914,0.586135,0.027603,0.441441,0.021803,0.389785,0.021892,0.064203,0.01143,0.048729,0.020188
Baseline_head_default,0.799527,0.01423,0.459669,0.040769,0.38824,0.031131,0.326162,0.03398,0.078728,0.006006,0.064222,0.012332


BEST_CONFIG_NAME (A):     Baseline_head_two_layer
Name: 0, dtype: object
SECOND_CONFIG_NAME (B):     Baseline_head_two_layer_lowdrop
Name: 1, dtype: object


In [11]:
# Stage 2 融合配置：Fusion_PSSMxxx 指将 ProteinBERT 全局表示与对应维度 PSSM 特征进行 late-fusion
FUSION_CFG = FusionTrainConfig(
    seq_len=512,
    batch_size=8,
    frozen_epochs=6,
    unfrozen_epochs=12,
    frozen_lr=1e-4,
    unfrozen_lr=2e-5,
    pssm_dropout=0.3,
    global_dropout=0.3,
    pssm_hidden_dim=128,
    global_hidden_dim=128,
    global_bottleneck_dim=64,
    fusion_hidden_dim=128,
    use_hidden_global_concat=True,
)
print('Fusion config ready.')


Fusion config ready.


In [12]:
# Exp 2.1 Fusion_PSSM110（额外加入：仅 RPSSM(110) 融合）
for seed in SEEDS:
    print(f'Exp2.1 seed={seed}')
    m = run_fusion(seed=seed, dim='110', cfg=FUSION_CFG)
    record_result('Fusion_PSSM110', seed, m, extra={'Stage': '2', 'dim': 110, 'best_ref': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM110'")


Exp2.1 seed=0
Exp2.1 seed=11
Exp2.1 seed=22
Exp2.1 seed=33
Exp2.1 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref
4,Fusion_PSSM110,0,0.897929,0.643554,0.553191,0.517163,0.056787,0.043278,0.55,2,,110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
9,Fusion_PSSM110,11,0.886686,0.617604,0.550725,0.513536,0.063103,0.059579,0.35,2,,110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
14,Fusion_PSSM110,22,0.880473,0.659372,0.472222,0.42436,0.060551,0.055715,0.3,2,,110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
19,Fusion_PSSM110,33,0.893639,0.640345,0.487805,0.456968,0.063637,0.049991,0.15,2,,110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
24,Fusion_PSSM110,44,0.889793,0.566851,0.493151,0.450536,0.089696,0.097507,0.5,2,,110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [13]:
# Exp 2.2 Fusion_PSSM310（ProteinBERT + 310维PSSM融合）
for seed in SEEDS:
    print(f'Exp2.2 seed={seed}')
    m = run_fusion(seed=seed, dim='310', cfg=FUSION_CFG)
    record_result('Fusion_PSSM310', seed, m, extra={'Stage': '2', 'dim': 310, 'best_ref': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM310'")


Exp2.2 seed=0
Exp2.2 seed=11
Exp2.2 seed=22
Exp2.2 seed=33
Exp2.2 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref
5,Fusion_PSSM310,0,0.902663,0.671245,0.529412,0.48731,0.056796,0.053417,0.25,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
11,Fusion_PSSM310,11,0.903698,0.568502,0.521739,0.479506,0.066531,0.06271,0.3,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
17,Fusion_PSSM310,22,0.909763,0.661862,0.5625,0.521198,0.067481,0.065166,0.55,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
23,Fusion_PSSM310,33,0.898669,0.599911,0.472222,0.42436,0.06912,0.056419,0.25,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
29,Fusion_PSSM310,44,0.901036,0.605519,0.465116,0.4345,0.055468,0.049872,0.05,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [14]:
# Exp 2.3 Fusion_PSSM710（ProteinBERT + 710维PSSM融合）
for seed in SEEDS:
    print(f'Exp2.3 seed={seed}')
    m = run_fusion(seed=seed, dim='710', cfg=FUSION_CFG)
    record_result('Fusion_PSSM710', seed, m, extra={'Stage': '2', 'dim': 710, 'best_ref': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM710'")


Exp2.3 seed=0
Exp2.3 seed=11
Exp2.3 seed=22
Exp2.3 seed=33
Exp2.3 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref
6,Fusion_PSSM710,0,0.910799,0.628328,0.625,0.593366,0.055124,0.038626,0.5,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
13,Fusion_PSSM710,11,0.913609,0.667711,0.545455,0.498165,0.051487,0.034559,0.25,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
20,Fusion_PSSM710,22,0.90429,0.68347,0.535714,0.487139,0.059565,0.051636,0.45,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
27,Fusion_PSSM710,33,0.897041,0.57011,0.537313,0.495364,0.067843,0.055949,0.3,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
34,Fusion_PSSM710,44,0.908284,0.644682,0.461538,0.437981,0.07047,0.071461,0.2,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [15]:
# Exp 2.4 Fusion_PSSM1110（ProteinBERT + 1110维PSSM融合）
for seed in SEEDS:
    print(f'Exp2.4 seed={seed}')
    m = run_fusion(seed=seed, dim='1110', cfg=FUSION_CFG)
    record_result('Fusion_PSSM1110', seed, m, extra={'Stage': '2', 'dim': 1110, 'best_ref': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM1110'")


Exp2.4 seed=0
Exp2.4 seed=11
Exp2.4 seed=22
Exp2.4 seed=33
Exp2.4 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref
5,Fusion_PSSM1110,0,0.926331,0.705027,0.62069,0.582259,0.056287,0.052527,0.45,2,,1110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
13,Fusion_PSSM1110,11,0.935207,0.643265,0.580645,0.54,0.057036,0.03454,0.3,2,,1110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
21,Fusion_PSSM1110,22,0.945266,0.751913,0.678571,0.645911,0.043013,0.035217,0.2,2,,1110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
29,Fusion_PSSM1110,33,0.914941,0.607432,0.545455,0.503686,0.070745,0.059431,0.45,2,,1110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."
37,Fusion_PSSM1110,44,0.941124,0.692673,0.56,0.534071,0.056165,0.03723,0.2,2,,1110.0,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [16]:
# Exp 3.1 Ablation_RPSSM_110（仅特征、无ProteinBERT）
for seed in SEEDS:
    print(f'Exp3.1 seed={seed}')
    m = run_pssm_only(seed=seed, dim='110')
    record_result('Ablation_RPSSM_110', seed, m, extra={'Stage': '3', 'dim': 110, 'fs': 'none'})
to_results_df().query("Exp == 'Ablation_RPSSM_110'")


Exp3.1 seed=0
Exp3.1 seed=11
Exp3.1 seed=22
Exp3.1 seed=33
Exp3.1 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
0,Ablation_RPSSM_110,0,0.75503,0.276326,0.285714,0.202199,0.107601,0.092869,0.35,3,,110.0,,none
9,Ablation_RPSSM_110,11,0.752515,0.24193,0.290323,0.21,0.116171,0.096755,0.5,3,,110.0,,none
18,Ablation_RPSSM_110,22,0.747929,0.275975,0.3,0.244804,0.108589,0.091246,0.15,3,,110.0,,none
27,Ablation_RPSSM_110,33,0.758432,0.271897,0.289855,0.20727,0.114578,0.087758,0.4,3,,110.0,,none
36,Ablation_RPSSM_110,44,0.746006,0.243972,0.271605,0.185164,0.114592,0.102062,0.35,3,,110.0,,none


In [17]:
# Exp 3.2 Ablation_RPSSM_310（仅特征、无ProteinBERT）
for seed in SEEDS:
    print(f'Exp3.2 seed={seed}')
    m = run_pssm_only(seed=seed, dim='310')
    record_result('Ablation_RPSSM_310', seed, m, extra={'Stage': '3', 'dim': 310, 'fs': 'none'})
to_results_df().query("Exp == 'Ablation_RPSSM_310'")


Exp3.2 seed=0
Exp3.2 seed=11
Exp3.2 seed=22
Exp3.2 seed=33
Exp3.2 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
1,Ablation_RPSSM_310,0,0.774556,0.298744,0.338462,0.2642,0.155594,0.159812,0.75,3,,310.0,,none
11,Ablation_RPSSM_310,11,0.752515,0.265955,0.339623,0.272271,0.145624,0.165334,0.85,3,,310.0,,none
21,Ablation_RPSSM_310,22,0.759615,0.256178,0.25,0.158319,0.156438,0.160834,0.45,3,,310.0,,none
31,Ablation_RPSSM_310,33,0.752959,0.275568,0.282051,0.197806,0.156083,0.170176,0.5,3,,310.0,,none
41,Ablation_RPSSM_310,44,0.785799,0.294399,0.333333,0.259641,0.142755,0.146012,0.8,3,,310.0,,none


In [18]:
# Exp 3.3 Ablation_RPSSM_710（仅特征、无ProteinBERT）
for seed in SEEDS:
    print(f'Exp3.3 seed={seed}')
    m = run_pssm_only(seed=seed, dim='710')
    record_result('Ablation_RPSSM_710', seed, m, extra={'Stage': '3', 'dim': 710, 'fs': 'none'})
to_results_df().query("Exp == 'Ablation_RPSSM_710'")


Exp3.3 seed=0
Exp3.3 seed=11
Exp3.3 seed=22
Exp3.3 seed=33
Exp3.3 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
2,Ablation_RPSSM_710,0,0.811243,0.395224,0.351351,0.28107,0.125969,0.124631,0.45,3,,710.0,,none
13,Ablation_RPSSM_710,11,0.798817,0.38296,0.382979,0.330645,0.136547,0.142521,0.95,3,,710.0,,none
24,Ablation_RPSSM_710,22,0.802663,0.374148,0.3,0.222062,0.136859,0.131975,0.75,3,,710.0,,none
35,Ablation_RPSSM_710,33,0.804142,0.337437,0.333333,0.259641,0.138166,0.142279,0.75,3,,710.0,,none
46,Ablation_RPSSM_710,44,0.831805,0.398308,0.37931,0.312175,0.122777,0.119585,0.8,3,,710.0,,none


In [19]:
# Exp 3.4 Ablation_RPSSM_1110（仅特征、无ProteinBERT）
for seed in SEEDS:
    print(f'Exp3.4 seed={seed}')
    m = run_pssm_only(seed=seed, dim='1110')
    record_result('Ablation_RPSSM_1110', seed, m, extra={'Stage': '3', 'dim': 1110, 'fs': 'none'})
to_results_df().query("Exp == 'Ablation_RPSSM_1110'")


Exp3.4 seed=0
Exp3.4 seed=11
Exp3.4 seed=22
Exp3.4 seed=33
Exp3.4 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
1,Ablation_RPSSM_1110,0,0.875296,0.476072,0.41791,0.356536,0.106996,0.114275,0.5,3,,1110.0,,none
13,Ablation_RPSSM_1110,11,0.857988,0.451199,0.43038,0.381319,0.108893,0.106237,0.4,3,,1110.0,,none
25,Ablation_RPSSM_1110,22,0.864941,0.466223,0.447761,0.391243,0.115857,0.118269,0.65,3,,1110.0,,none
37,Ablation_RPSSM_1110,33,0.856953,0.430368,0.318182,0.268626,0.121021,0.12767,0.95,3,,1110.0,,none
49,Ablation_RPSSM_1110,44,0.893343,0.491126,0.470588,0.418587,0.100599,0.104541,0.55,3,,1110.0,,none


In [20]:
# Exp 3.5 Ablation_RPSSM_310_FS（ANOVA+IFS 近似：在 train 内 SelectKBest）
for seed in SEEDS:
    print(f'Exp3.5 seed={seed}')
    m = run_pssm_only(seed=seed, dim='310', top_k=150)
    record_result('Ablation_RPSSM_310_FS', seed, m, extra={'Stage': '3', 'dim': 310, 'fs': 'SelectKBest_f_classif_top150'})
to_results_df().query("Exp == 'Ablation_RPSSM_310_FS'")


Exp3.5 seed=0
Exp3.5 seed=11
Exp3.5 seed=22
Exp3.5 seed=33
Exp3.5 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
3,Ablation_RPSSM_310_FS,0,0.773225,0.289457,0.28,0.201979,0.097781,0.082583,0.2,3,,310.0,,SelectKBest_f_classif_top150
16,Ablation_RPSSM_310_FS,11,0.758876,0.243776,0.318841,0.2413,0.11149,0.093356,0.4,3,,310.0,,SelectKBest_f_classif_top150
29,Ablation_RPSSM_310_FS,22,0.783876,0.301332,0.318841,0.2413,0.099021,0.080733,0.35,3,,310.0,,SelectKBest_f_classif_top150
42,Ablation_RPSSM_310_FS,33,0.759172,0.254059,0.318841,0.2413,0.115284,0.104807,0.45,3,,310.0,,SelectKBest_f_classif_top150
55,Ablation_RPSSM_310_FS,44,0.757544,0.256145,0.313253,0.238036,0.108353,0.0961,0.3,3,,310.0,,SelectKBest_f_classif_top150


In [21]:
# Exp 3.6 Ablation_RPSSM_710_FS（ANOVA+IFS 近似：在 train 内 SelectKBest）
for seed in SEEDS:
    print(f'Exp3.6 seed={seed}')
    m = run_pssm_only(seed=seed, dim='710', top_k=250)
    record_result('Ablation_RPSSM_710_FS', seed, m, extra={'Stage': '3', 'dim': 710, 'fs': 'SelectKBest_f_classif_top250'})
to_results_df().query("Exp == 'Ablation_RPSSM_710_FS'")


Exp3.6 seed=0
Exp3.6 seed=11
Exp3.6 seed=22
Exp3.6 seed=33
Exp3.6 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
5,Ablation_RPSSM_710_FS,0,0.855473,0.410499,0.392857,0.328368,0.099938,0.095143,0.6,3,,710.0,,SelectKBest_f_classif_top250
19,Ablation_RPSSM_710_FS,11,0.829586,0.362764,0.363636,0.296701,0.108888,0.105205,0.75,3,,710.0,,SelectKBest_f_classif_top250
33,Ablation_RPSSM_710_FS,22,0.857396,0.426873,0.395604,0.350912,0.098726,0.085197,0.1,3,,710.0,,SelectKBest_f_classif_top250
47,Ablation_RPSSM_710_FS,33,0.850148,0.360392,0.353982,0.319643,0.106483,0.106913,0.05,3,,710.0,,SelectKBest_f_classif_top250
61,Ablation_RPSSM_710_FS,44,0.82574,0.383886,0.324324,0.316228,0.090459,0.090867,0.9,3,,710.0,,SelectKBest_f_classif_top250


In [22]:
# Exp 3.7 Ablation_RPSSM_1110_FS（ANOVA+IFS 近似：在 train 内 SelectKBest）
for seed in SEEDS:
    print(f'Exp3.7 seed={seed}')
    m = run_pssm_only(seed=seed, dim='1110', top_k=350)
    record_result('Ablation_RPSSM_1110_FS', seed, m, extra={'Stage': '3', 'dim': 1110, 'fs': 'SelectKBest_f_classif_top350'})
to_results_df().query("Exp == 'Ablation_RPSSM_1110_FS'")


Exp3.7 seed=0
Exp3.7 seed=11
Exp3.7 seed=22
Exp3.7 seed=33
Exp3.7 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs
2,Ablation_RPSSM_1110_FS,0,0.887278,0.518481,0.357143,0.323103,0.078597,0.081621,0.05,3,,1110.0,,SelectKBest_f_classif_top350
17,Ablation_RPSSM_1110_FS,11,0.865828,0.442901,0.48,0.437234,0.091385,0.074149,0.3,3,,1110.0,,SelectKBest_f_classif_top350
32,Ablation_RPSSM_1110_FS,22,0.875,0.475716,0.439024,0.395668,0.097555,0.09526,0.25,3,,1110.0,,SelectKBest_f_classif_top350
47,Ablation_RPSSM_1110_FS,33,0.870858,0.386911,0.39604,0.364499,0.104707,0.100061,0.1,3,,1110.0,,SelectKBest_f_classif_top350
62,Ablation_RPSSM_1110_FS,44,0.867604,0.482359,0.435897,0.387012,0.089212,0.089656,0.35,3,,1110.0,,SelectKBest_f_classif_top350


In [23]:
# Stage 4 交互检查说明：
# 这里的 A/B 分别对应 Stage1 里选出的最佳/次优基线配置名。
# 当前采用同一融合架构（run_finetune_with_pssm）；A/B 主要作为“基线来源标签”，用于检查排序一致性。
print('A (best baseline):', BEST_CONFIG_NAME)
print('B (runner-up baseline):', SECOND_CONFIG_NAME)


A (best baseline):     Baseline_head_two_layer
Name: 0, dtype: object
B (runner-up baseline):     Baseline_head_two_layer_lowdrop
Name: 1, dtype: object


In [24]:
# Exp 4.1 Fusion_PSSM710_A
for seed in SEEDS:
    print(f'Exp4.1 seed={seed}')
    m = run_fusion(seed=seed, dim='710', cfg=FUSION_CFG)
    record_result('Fusion_PSSM710_A', seed, m, extra={'Stage': '4', 'ref_baseline': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM710_A'")


Exp4.1 seed=0
Exp4.1 seed=11
Exp4.1 seed=22
Exp4.1 seed=33
Exp4.1 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline
15,Fusion_PSSM710_A,0,0.914793,0.643208,0.603774,0.563449,0.061485,0.05654,0.55,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
31,Fusion_PSSM710_A,11,0.912574,0.619936,0.595745,0.563792,0.055342,0.045207,0.6,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
47,Fusion_PSSM710_A,22,0.897189,0.557738,0.526316,0.4766,0.070833,0.062913,0.55,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
63,Fusion_PSSM710_A,33,0.913314,0.644501,0.469136,0.432049,0.059676,0.050884,0.15,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
79,Fusion_PSSM710_A,44,0.910947,0.663316,0.566667,0.522699,0.050125,0.036953,0.25,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [25]:
# Exp 4.2 Fusion_PSSM710_B
for seed in SEEDS:
    print(f'Exp4.2 seed={seed}')
    m = run_fusion(seed=seed, dim='710', cfg=FUSION_CFG)
    record_result('Fusion_PSSM710_B', seed, m, extra={'Stage': '4', 'ref_baseline': SECOND_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM710_B'")


Exp4.2 seed=0
Exp4.2 seed=11
Exp4.2 seed=22
Exp4.2 seed=33
Exp4.2 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline
16,Fusion_PSSM710_B,0,0.922633,0.698321,0.692308,0.661538,0.046274,0.044274,0.4,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
33,Fusion_PSSM710_B,11,0.909763,0.65354,0.487805,0.456968,0.056057,0.047992,0.1,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
50,Fusion_PSSM710_B,22,0.889645,0.58604,0.555556,0.50972,0.064013,0.048266,0.55,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
67,Fusion_PSSM710_B,33,0.879882,0.596608,0.475,0.437916,0.072762,0.076032,0.25,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
84,Fusion_PSSM710_B,44,0.909911,0.674519,0.545455,0.503686,0.04813,0.023479,0.2,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."


In [26]:
# Exp 4.3 Fusion_PSSM1110_A
for seed in SEEDS:
    print(f'Exp4.3 seed={seed}')
    m = run_fusion(seed=seed, dim='1110', cfg=FUSION_CFG)
    record_result('Fusion_PSSM1110_A', seed, m, extra={'Stage': '4', 'ref_baseline': BEST_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM1110_A'")


Exp4.3 seed=0
Exp4.3 seed=11
Exp4.3 seed=22
Exp4.3 seed=33
Exp4.3 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline
13,Fusion_PSSM1110_A,0,0.920414,0.679727,0.615385,0.576923,0.060216,0.063651,0.55,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
31,Fusion_PSSM1110_A,11,0.927811,0.624061,0.571429,0.532635,0.059214,0.049211,0.55,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
49,Fusion_PSSM1110_A,22,0.929586,0.720247,0.634921,0.602907,0.046613,0.025568,0.25,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
67,Fusion_PSSM1110_A,33,0.922781,0.621606,0.513514,0.47634,0.054736,0.037041,0.1,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."
85,Fusion_PSSM1110_A,44,0.935059,0.649939,0.505747,0.488565,0.059747,0.039314,0.1,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [27]:
# Exp 4.4 Fusion_PSSM1110_B
for seed in SEEDS:
    print(f'Exp4.4 seed={seed}')
    m = run_fusion(seed=seed, dim='1110', cfg=FUSION_CFG)
    record_result('Fusion_PSSM1110_B', seed, m, extra={'Stage': '4', 'ref_baseline': SECOND_CONFIG_NAME})
to_results_df().query("Exp == 'Fusion_PSSM1110_B'")


Exp4.4 seed=0
Exp4.4 seed=11
Exp4.4 seed=22
Exp4.4 seed=33
Exp4.4 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline
14,Fusion_PSSM1110_B,0,0.934763,0.68741,0.627451,0.591135,0.049619,0.039051,0.4,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
33,Fusion_PSSM1110_B,11,0.920858,0.6735,0.612245,0.57736,0.051576,0.037412,0.4,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
52,Fusion_PSSM1110_B,22,0.918195,0.707798,0.644068,0.60911,0.048288,0.030279,0.3,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
71,Fusion_PSSM1110_B,33,0.930769,0.68604,0.55,0.548322,0.055771,0.037192,0.85,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
90,Fusion_PSSM1110_B,44,0.920266,0.688845,0.523077,0.476849,0.057722,0.049783,0.3,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."


In [28]:
# 汇总：全部实验明细
exp_df = to_results_df()
exp_df


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline
0,Ablation_RPSSM_110,0,0.755030,0.276326,0.285714,0.202199,0.107601,0.092869,0.35,3,,110.0,,none,
1,Ablation_RPSSM_1110,0,0.875296,0.476072,0.417910,0.356536,0.106996,0.114275,0.50,3,,1110.0,,none,
2,Ablation_RPSSM_1110_FS,0,0.887278,0.518481,0.357143,0.323103,0.078597,0.081621,0.05,3,,1110.0,,SelectKBest_f_classif_top350,
3,Ablation_RPSSM_310,0,0.774556,0.298744,0.338462,0.264200,0.155594,0.159812,0.75,3,,310.0,,none,
4,Ablation_RPSSM_310_FS,0,0.773225,0.289457,0.280000,0.201979,0.097781,0.082583,0.20,3,,310.0,,SelectKBest_f_classif_top150,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Fusion_PSSM1110_B,44,0.920266,0.688845,0.523077,0.476849,0.057722,0.049783,0.30,4,,,,,"Baseline_head_two_layer_lowdrop Name: 1, d..."
91,Fusion_PSSM310,44,0.901036,0.605519,0.465116,0.434500,0.055468,0.049872,0.05,2,,310.0,"Baseline_head_two_layer Name: 0, dtype: ob...",,
92,Fusion_PSSM710,44,0.908284,0.644682,0.461538,0.437981,0.070470,0.071461,0.20,2,,710.0,"Baseline_head_two_layer Name: 0, dtype: ob...",,
93,Fusion_PSSM710_A,44,0.910947,0.663316,0.566667,0.522699,0.050125,0.036953,0.25,4,,,,,"Baseline_head_two_layer Name: 0, dtype: ob..."


In [29]:
# 汇总：按 Exp 统计 mean/std，按 AUPRC mean 排序
summary_df = summarize_by_exp(exp_df)
summary_df


Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Exp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Fusion_PSSM1110_B,0.92497,0.007322,0.688719,0.012294,0.591368,0.052178,0.560555,0.051796,0.052595,0.004026,0.038743,0.007032
Fusion_PSSM1110,0.932574,0.012156,0.680062,0.056067,0.597072,0.053639,0.561185,0.055029,0.056649,0.009813,0.043789,0.011436
Fusion_PSSM1110_A,0.92713,0.005775,0.659116,0.041478,0.568199,0.058265,0.535474,0.054705,0.056105,0.00574,0.042957,0.0143
Fusion_PSSM710_B,0.902367,0.017247,0.641806,0.048873,0.551225,0.086297,0.513966,0.08794,0.057447,0.011082,0.048009,0.018715
Fusion_PSSM710,0.906805,0.006444,0.63886,0.043857,0.541004,0.057947,0.502403,0.056402,0.060898,0.008117,0.050446,0.014709
Fusion_PSSM710_A,0.909763,0.007164,0.62574,0.041009,0.552327,0.055531,0.511718,0.057203,0.059492,0.007707,0.050499,0.010028
Fusion_PSSM110,0.889704,0.006661,0.625545,0.03604,0.511419,0.037807,0.472512,0.040987,0.066755,0.013107,0.061214,0.0212
Fusion_PSSM310,0.903166,0.004148,0.621408,0.043686,0.510198,0.040965,0.469375,0.039851,0.063079,0.006426,0.057517,0.006363
Baseline_head_two_layer,0.888905,0.010843,0.591086,0.039723,0.455672,0.072013,0.408537,0.072036,0.064982,0.008992,0.056987,0.011225
Baseline_head_two_layer_lowdrop,0.87858,0.019915,0.586344,0.013734,0.469786,0.063145,0.435094,0.076819,0.067062,0.00607,0.052769,0.010226


In [30]:
# 保存结果到 pssm_work/features/
os.makedirs(f'{WORK_ROOT}/features', exist_ok=True)
res_path = f'{WORK_ROOT}/features/full_plan_exp_results.csv'
sum_path = f'{WORK_ROOT}/features/full_plan_exp_summary.csv'
exp_df.to_csv(res_path, index=False)
summary_df.to_csv(sum_path)
print('saved:', res_path)
print('saved:', sum_path)


saved: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_results.csv
saved: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_summary.csv


## Stage 5: 严格防泄漏的 BERT + ANOVA + IFS(top-k)

本阶段新增实验用于回答：在 ProteinBERT+PSSM 融合中，先做 ANOVA 排序再做 IFS 选 top-k，是否比原始全维融合更好。

**严格防泄漏约束**：
- ANOVA 排序只在每个 seed 的训练子集（sub-train）上拟合；
- IFS 的 k 选择只使用验证子集（sub-valid）表现，不看测试集；
- 测试集仅在最终选定 k 后做一次评估。

In [31]:
# Exp 5.0 辅助函数：严格防泄漏的 ANOVA+IFS 融合训练
# 说明：
# 1) 先按 seed 将训练集划分为 sub-train/sub-valid；
# 2) 仅在 sub-train 上做 ANOVA(F-score) 排序；
# 3) 对 k_candidates 做 IFS：每个 k 都训练一次融合模型，用 sub-valid AUPRC 选最佳 k；
# 4) 返回最佳 k 对应的测试集指标。

from tensorflow import keras as tf_keras

def _split_train_valid_for_seed(train_df, seed):
    return train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=seed)

def _rank_features_by_anova(sub_train_df, feat_cols):
    X = sub_train_df[feat_cols].to_numpy(dtype=np.float32)
    y = sub_train_df['label'].astype(int).to_numpy()
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    scores = np.nan_to_num(selector.scores_, nan=-1e9, posinf=1e9, neginf=-1e9)
    order = np.argsort(scores)[::-1]
    ranked_cols = [feat_cols[i] for i in order]
    return ranked_cols

def _build_late_fusion_model_local(pretrained_model_generator, seq_len, pssm_dim, freeze_pretrained_layers, cfg):
    base_model = pretrained_model_generator.create_model(seq_len, compile=False, init_weights=True)
    if cfg.use_hidden_global_concat:
        base_model = get_model_with_hidden_layers_as_outputs(base_model)
    if freeze_pretrained_layers:
        for layer in base_model.layers:
            layer.trainable = False
    _, global_output = base_model.output

    pssm_input = tf_keras.layers.Input(shape=(pssm_dim,), name='pssm_input')
    global_branch = tf_keras.layers.LayerNormalization(name='global_ln_in')(global_output)
    global_branch = tf_keras.layers.Dense(cfg.global_bottleneck_dim, activation='relu', name='global_bottleneck')(global_branch)
    global_branch = tf_keras.layers.Dropout(cfg.global_dropout, name='global_drop')(global_branch)
    global_branch = tf_keras.layers.Dense(cfg.global_hidden_dim, activation='relu', name='global_dense')(global_branch)
    global_branch = tf_keras.layers.LayerNormalization(name='global_ln_out')(global_branch)

    pssm_branch = tf_keras.layers.LayerNormalization(name='pssm_ln')(pssm_input)
    pssm_branch = tf_keras.layers.Dense(cfg.pssm_hidden_dim, activation='relu', name='pssm_dense')(pssm_branch)
    pssm_branch = tf_keras.layers.Dropout(cfg.pssm_dropout, name='pssm_drop')(pssm_branch)
    pssm_branch = tf_keras.layers.LayerNormalization(name='pssm_ln_out')(pssm_branch)

    fused = tf_keras.layers.Concatenate(name='late_fusion')([global_branch, pssm_branch])
    fused = tf_keras.layers.Dense(cfg.fusion_hidden_dim, activation='relu', name='fusion_dense')(fused)
    fused = tf_keras.layers.Dropout(cfg.pssm_dropout, name='fusion_drop')(fused)
    out = tf_keras.layers.Dense(1, activation='sigmoid', name='output')(fused)
    return tf_keras.models.Model(inputs=base_model.inputs + [pssm_input], outputs=out)

def _encode_inputs(enc, seqs, seq_len, pssm_feats):
    tokenized, annotations = enc.encode_X(seqs, seq_len)
    return [tokenized, annotations, pssm_feats.astype(np.float32)]

def _train_eval_fusion_for_given_k(seed, sub_train_df, sub_valid_df, test_df, selected_cols, cfg):
    x_tr = sub_train_df[selected_cols].to_numpy(dtype=np.float32)
    x_va = sub_valid_df[selected_cols].to_numpy(dtype=np.float32)
    x_te = test_df[selected_cols].to_numpy(dtype=np.float32)

    scaler = StandardScaler()
    x_tr = scaler.fit_transform(x_tr)
    x_va = scaler.transform(x_va)
    x_te = scaler.transform(x_te)

    y_tr = sub_train_df['label'].astype(int).to_numpy()
    y_va = sub_valid_df['label'].astype(int).to_numpy()
    y_te = test_df['label'].astype(int).to_numpy()

    pmg, enc = load_pretrained_model(
        local_model_dump_dir=f'{PROJECT_ROOT}/proteinbert_models',
        download_model_dump_if_not_exists=True,
        validate_downloading=False,
    )

    X_tr = _encode_inputs(enc, sub_train_df['seq'].tolist(), cfg.seq_len, x_tr)
    X_va = _encode_inputs(enc, sub_valid_df['seq'].tolist(), cfg.seq_len, x_va)
    X_te = _encode_inputs(enc, test_df['seq'].tolist(), cfg.seq_len, x_te)

    callbacks = [tf_keras.callbacks.EarlyStopping(monitor='val_loss', patience=cfg.patience, restore_best_weights=True)]

    model = _build_late_fusion_model_local(
        pmg,
        seq_len=cfg.seq_len,
        pssm_dim=len(selected_cols),
        freeze_pretrained_layers=True,
        cfg=cfg,
    )
    model.compile(optimizer=tf_keras.optimizers.Adam(learning_rate=cfg.frozen_lr), loss='binary_crossentropy')
    model.fit(X_tr, y_tr, validation_data=(X_va, y_va), epochs=cfg.frozen_epochs, batch_size=cfg.batch_size, callbacks=callbacks, verbose=0)

    for layer in model.layers:
        layer.trainable = True
    model.compile(optimizer=tf_keras.optimizers.Adam(learning_rate=cfg.unfrozen_lr), loss='binary_crossentropy')
    model.fit(X_tr, y_tr, validation_data=(X_va, y_va), epochs=cfg.unfrozen_epochs, batch_size=cfg.batch_size, callbacks=callbacks, verbose=0)

    va_prob = model.predict(X_va, batch_size=cfg.batch_size, verbose=0).reshape(-1)
    thr = find_best_thr(y_va, va_prob)
    te_prob = model.predict(X_te, batch_size=cfg.batch_size, verbose=0).reshape(-1)

    metrics = evaluate_binary(y_te, te_prob, thr=thr)
    valid_auprc = float(average_precision_score(y_va, va_prob))
    return metrics, valid_auprc

def run_fusion_anova_ifs_no_leak(seed, dim, k_candidates, cfg):
    train_df, test_df, feat_cols = FEATURE_DATA[dim]
    sub_train_df, sub_valid_df = _split_train_valid_for_seed(train_df, seed)

    ranked_cols = _rank_features_by_anova(sub_train_df, feat_cols)
    k_candidates = [k for k in k_candidates if 1 <= k <= len(ranked_cols)]
    if len(k_candidates) == 0:
        raise ValueError(f'No valid k in {k_candidates} for dim={dim}')

    best = None
    for k in k_candidates:
        selected_cols = ranked_cols[:k]
        metrics, valid_auprc = _train_eval_fusion_for_given_k(
            seed=seed,
            sub_train_df=sub_train_df,
            sub_valid_df=sub_valid_df,
            test_df=test_df,
            selected_cols=selected_cols,
            cfg=cfg,
        )
        row = {'k': k, 'valid_AUPRC': valid_auprc, 'metrics': metrics}
        if (best is None) or (row['valid_AUPRC'] > best['valid_AUPRC']):
            best = row

    return best

In [32]:
# Exp 5.1 Fusion_PSSM710_ANOVA_IFS_noLeak
# 含义：ProteinBERT 与 710 维 PSSM 融合前，先在 sub-train 上 ANOVA 排序，再在 sub-valid 上 IFS 选最佳 top-k。
# 非重复点：这是“融合路径 + 严格防泄漏 + 自动选k”，不同于 Stage3 的 PSSM-only FS。

k_candidates_710 = [80, 120, 160, 200, 240, 280, 320]
for seed in SEEDS:
    print(f'Exp5.1 seed={seed}')
    best = run_fusion_anova_ifs_no_leak(seed=seed, dim='710', k_candidates=k_candidates_710, cfg=FUSION_CFG)
    record_result(
        'Fusion_PSSM710_ANOVA_IFS_noLeak',
        seed,
        best['metrics'],
        extra={
            'Stage': '5',
            'dim': 710,
            'fs': 'ANOVA+IFS_noLeak',
            'selected_k': int(best['k']),
            'valid_AUPRC_for_k_select': float(best['valid_AUPRC']),
        },
    )

to_results_df().query("Exp == 'Fusion_PSSM710_ANOVA_IFS_noLeak'")

Exp5.1 seed=0
Exp5.1 seed=11
Exp5.1 seed=22
Exp5.1 seed=33
Exp5.1 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline,selected_k,valid_AUPRC_for_k_select
18,Fusion_PSSM710_ANOVA_IFS_noLeak,0,0.916568,0.655684,0.5,0.491952,0.058395,0.050552,0.75,5,,710.0,,ANOVA+IFS_noLeak,,280.0,0.865886
38,Fusion_PSSM710_ANOVA_IFS_noLeak,11,0.901331,0.667572,0.52459,0.475717,0.050084,0.03412,0.2,5,,710.0,,ANOVA+IFS_noLeak,,240.0,0.884997
58,Fusion_PSSM710_ANOVA_IFS_noLeak,22,0.899112,0.651497,0.638298,0.610422,0.050044,0.031679,0.4,5,,710.0,,ANOVA+IFS_noLeak,,200.0,0.911481
78,Fusion_PSSM710_ANOVA_IFS_noLeak,33,0.912426,0.670072,0.576923,0.534615,0.053528,0.03922,0.5,5,,710.0,,ANOVA+IFS_noLeak,,280.0,0.692528
98,Fusion_PSSM710_ANOVA_IFS_noLeak,44,0.918195,0.665161,0.468085,0.451925,0.055977,0.050678,0.05,5,,710.0,,ANOVA+IFS_noLeak,,160.0,0.798432


In [33]:
# Exp 5.2 Fusion_PSSM1110_ANOVA_IFS_noLeak
# 含义：ProteinBERT 与 1110 维 PSSM 融合前，严格在 train/valid 内完成 ANOVA+IFS 选k，再评估 test。

k_candidates_1110 = [120, 200, 280, 360, 440, 520, 600]
for seed in SEEDS:
    print(f'Exp5.2 seed={seed}')
    best = run_fusion_anova_ifs_no_leak(seed=seed, dim='1110', k_candidates=k_candidates_1110, cfg=FUSION_CFG)
    record_result(
        'Fusion_PSSM1110_ANOVA_IFS_noLeak',
        seed,
        best['metrics'],
        extra={
            'Stage': '5',
            'dim': 1110,
            'fs': 'ANOVA+IFS_noLeak',
            'selected_k': int(best['k']),
            'valid_AUPRC_for_k_select': float(best['valid_AUPRC']),
        },
    )

to_results_df().query("Exp == 'Fusion_PSSM1110_ANOVA_IFS_noLeak'")

Exp5.2 seed=0
Exp5.2 seed=11
Exp5.2 seed=22
Exp5.2 seed=33
Exp5.2 seed=44


Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold,Stage,head_type,dim,best_ref,fs,ref_baseline,selected_k,valid_AUPRC_for_k_select
14,Fusion_PSSM1110_ANOVA_IFS_noLeak,0,0.926627,0.695996,0.595745,0.563792,0.055125,0.049926,0.5,5,,1110.0,,ANOVA+IFS_noLeak,,280.0,0.905339
35,Fusion_PSSM1110_ANOVA_IFS_noLeak,11,0.93358,0.677144,0.521739,0.479506,0.052855,0.044822,0.2,5,,1110.0,,ANOVA+IFS_noLeak,,440.0,0.902198
56,Fusion_PSSM1110_ANOVA_IFS_noLeak,22,0.917899,0.679609,0.566667,0.522699,0.057001,0.041833,0.25,5,,1110.0,,ANOVA+IFS_noLeak,,520.0,0.907458
77,Fusion_PSSM1110_ANOVA_IFS_noLeak,33,0.914497,0.585459,0.527778,0.490572,0.057734,0.037198,0.15,5,,1110.0,,ANOVA+IFS_noLeak,,600.0,0.743731
98,Fusion_PSSM1110_ANOVA_IFS_noLeak,44,0.919231,0.716719,0.518519,0.493771,0.053211,0.045954,0.15,5,,1110.0,,ANOVA+IFS_noLeak,,600.0,0.843009


In [34]:
# Exp 5.3 融合 FS 新增实验对比（有意义且不重复）
# 对比项：
# - 原始融合：Fusion_PSSM710 / Fusion_PSSM1110
# - 严格防泄漏融合FS：Fusion_PSSM710_ANOVA_IFS_noLeak / Fusion_PSSM1110_ANOVA_IFS_noLeak

cmp_df = to_results_df().copy()
focus_exps = [
    'Fusion_PSSM710',
    'Fusion_PSSM1110',
    'Fusion_PSSM710_ANOVA_IFS_noLeak',
    'Fusion_PSSM1110_ANOVA_IFS_noLeak',
]
cmp_focus = cmp_df[cmp_df['Exp'].isin(focus_exps)].copy()
cmp_summary = summarize_by_exp(cmp_focus)
cmp_summary

Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Exp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Fusion_PSSM1110,0.932574,0.012156,0.680062,0.056067,0.597072,0.053639,0.561185,0.055029,0.056649,0.009813,0.043789,0.011436
Fusion_PSSM1110_ANOVA_IFS_noLeak,0.922367,0.007675,0.670986,0.050359,0.546089,0.033829,0.510068,0.034006,0.055185,0.002187,0.043946,0.004758
Fusion_PSSM710_ANOVA_IFS_noLeak,0.909527,0.008786,0.661997,0.008007,0.541579,0.067124,0.512926,0.062292,0.053606,0.003663,0.04125,0.008972
Fusion_PSSM710,0.906805,0.006444,0.63886,0.043857,0.541004,0.057947,0.502403,0.056402,0.060898,0.008117,0.050446,0.014709


In [35]:
# Exp 5.4 保存包含 Stage5 的新结果文件（不覆盖原 full_plan 文件）
# 为便于回溯，另存一套带 noLeak FS 的结果。

exp_df_stage5 = to_results_df()
summary_df_stage5 = summarize_by_exp(exp_df_stage5)

os.makedirs(f'{WORK_ROOT}/features', exist_ok=True)
res_path_5 = f'{WORK_ROOT}/features/full_plan_exp_results_with_stage5_noleak.csv'
sum_path_5 = f'{WORK_ROOT}/features/full_plan_exp_summary_with_stage5_noleak.csv'
exp_df_stage5.to_csv(res_path_5, index=False)
summary_df_stage5.to_csv(sum_path_5)
print('saved:', res_path_5)
print('saved:', sum_path_5)

saved: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_results_with_stage5_noleak.csv
saved: /home/nemophila/projects/protein_bert/pssm_work/features/full_plan_exp_summary_with_stage5_noleak.csv


## Stage 5 结果解读提示

- 若 `Fusion_PSSM710_ANOVA_IFS_noLeak` 的 AUPRC/MCC 同时高于 `Fusion_PSSM710`，说明 710 维下存在有效冗余特征，筛选有益。
- 若 `Fusion_PSSM1110_ANOVA_IFS_noLeak` 优于 `Fusion_PSSM1110`，说明 1110 维的高维噪声对融合有负担。
- 若二者都无提升，说明当前融合头的“软特征选择”已足够，后续可不再投入 ANOVA+IFS。