# Anti-CRISPR Exp15: ProteinBERT + PSSM 融合

**功能**：本 notebook 总览与适用范围（PSSM 流水线 + 融合实验 + 基线/消融对照）。

该 notebook 仅保留与 `proteinbert_pssm_融合计划_47322538.plan.md` 相关内容：
- PSSM 数据生产流水线（UniRef50 + PSI-BLAST）
- RPSSM(110) + PSSM-AC(200) 特征缓存对齐
- Exp15 主实验与关键消融（RPSSM-only）
- 与纯 ProteinBERT 基线同协议、同 seeds 对照

In [11]:
# 功能：导入依赖并设置项目路径、310 维缓存路径与 5 个随机种子
import os
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, brier_score_loss, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import train_test_split

from proteinbert import (
    OutputType,
    OutputSpec,
    FinetuningModelGenerator,
    load_pretrained_model,
    finetune,
    FusionTrainConfig,
    load_anticrispr_with_ids,
    load_feature_cache,
    attach_pssm_features,
    run_finetune_with_pssm,
)
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Root of the project checkout. Defaults to the original author's location,
# but can be redirected with the PROTEINBERT_PROJECT_ROOT environment variable
# so the notebook runs on another machine without editing this cell. An unset
# or empty variable falls back to the default.
PROJECT_ROOT = os.environ.get('PROTEINBERT_PROJECT_ROOT') or '/home/nemophila/projects/protein_bert'
# Directory handed to load_anticrispr_with_ids as the benchmark location.
BENCHMARKS_DIR = f'{PROJECT_ROOT}/anticrispr_benchmarks'
# Working directory of the PSSM pipeline; results are also written under it.
WORK_ROOT = f'{PROJECT_ROOT}/pssm_work'
# PSSM feature cache in two formats; the loading cell uses the Parquet file
# when it exists and the CSV file otherwise.
FEATURE_CACHE_PARQUET = f'{WORK_ROOT}/features/pssm_features_310.parquet'
FEATURE_CACHE_CSV = f'{WORK_ROOT}/features/pssm_features_310.csv'
# One full set of experiments is run per seed.
SEEDS = [0, 11, 22, 33, 44]


## 阶段0-1：PSSM 数据准备（命令级）

**功能**：说明在命令行完成 PSSM 数据准备（下载/建库、生成 FASTA、跑 PSI-BLAST、特征提取、建缓存）的步骤。

首次执行（数据库下载与建库）：

```bash
cd /home/nemophila/projects/protein_bert
conda install -c bioconda blast -y
bash scripts/pssm/00_download_uniref50.sh
mkdir -p /home/nemophila/projects/protein_bert/pssm_work/{fasta,pssm,logs,features}
```

生成 FASTA + 跑 PSI-BLAST + 提特征：

```bash
python scripts/pssm/00_prepare_fasta.py \
  --train-csv /home/nemophila/projects/protein_bert/anticrispr_benchmarks/anticrispr_binary.train.csv \
  --test-csv /home/nemophila/projects/protein_bert/anticrispr_benchmarks/anticrispr_binary.test.csv \
  --work-root /home/nemophila/projects/protein_bert/pssm_work

bash scripts/pssm/01_run_psiblast_batch.sh \
  /home/nemophila/projects/protein_bert/pssm_work/sample_manifest.csv \
  /home/nemophila/projects/protein_bert/blast_db/uniref50 8

bash scripts/pssm/02_retry_failed.sh \
  /home/nemophila/projects/protein_bert/pssm_work/sample_manifest.csv \
  /home/nemophila/projects/protein_bert/pssm_work/logs/failed_ids.txt \
  /home/nemophila/projects/protein_bert/blast_db/uniref50 4

python scripts/pssm/03_extract_rpssm_pssmac.py \
  --manifest-csv /home/nemophila/projects/protein_bert/pssm_work/sample_manifest.csv \
  --work-root /home/nemophila/projects/protein_bert/pssm_work

python scripts/pssm/04_build_feature_cache.py \
  --manifest-csv /home/nemophila/projects/protein_bert/pssm_work/sample_manifest.csv \
  --work-root /home/nemophila/projects/protein_bert/pssm_work
```

In [12]:
# Purpose: load the Anti-CRISPR benchmark splits and attach the cached PSSM
# feature columns to each of them.
train_df, test_df = load_anticrispr_with_ids(BENCHMARKS_DIR, benchmark_name='anticrispr_binary')

# Prefer the Parquet cache; fall back to the CSV export when it is absent.
if os.path.exists(FEATURE_CACHE_PARQUET):
    cache_path = FEATURE_CACHE_PARQUET
else:
    cache_path = FEATURE_CACHE_CSV
feature_df, feature_cols = load_feature_cache(cache_path)

# NOTE(review): the original comment states the join is keyed on sample_id;
# the key is chosen inside attach_pssm_features and is not visible here.
train_df = attach_pssm_features(train_df, feature_df, feature_cols)
test_df = attach_pssm_features(test_df, feature_df, feature_cols)

print('train shape:', train_df.shape)
print('test shape:', test_df.shape)
print('feature dim:', len(feature_cols))


train shape: (1107, 313)
test shape: (286, 313)
feature dim: 310


In [13]:
# 功能：定义二分类评估指标（AUC/AUPRC/F1/MCC/Brier/ECE）及验证集最优阈值搜索
def expected_calibration_error(y_true, y_prob, n_bins=10):
    """Return the expected calibration error (ECE) over equal-width bins.

    Probabilities are grouped into ``n_bins`` equal-width bins on [0, 1]. For
    each non-empty bin the absolute gap between the mean label (accuracy) and
    the mean predicted probability (confidence) is weighted by the fraction of
    samples in the bin, and the weighted gaps are summed.

    Args:
        y_true: Array-like of binary labels (0/1).
        y_prob: Array-like of predicted positive-class probabilities.
        n_bins: Number of equal-width bins.

    Returns:
        The ECE as a Python float; 0.0 for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    # np.digitize puts a probability of exactly 1.0 past the last edge, which
    # would give bin index n_bins and silently drop saturated predictions.
    # Clipping folds them into the last bin (and any value < 0 into the first).
    ids = np.clip(np.digitize(y_prob, bins) - 1, 0, n_bins - 1)

    ece = 0.0
    n = len(y_true)
    for b in range(n_bins):
        m = ids == b
        if np.any(m):
            conf = float(np.mean(y_prob[m]))
            acc = float(np.mean(y_true[m]))
            ece += (np.sum(m) / n) * abs(acc - conf)
    return float(ece)

def evaluate_binary(y_true, y_prob, thr=0.5):
    """Score binary predictions with ranking, threshold and calibration metrics.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted positive-class probabilities; must support ``>=``
            comparison followed by ``.astype`` (e.g. a NumPy array).
        thr: Decision threshold; probabilities ``>= thr`` count as positive.

    Returns:
        Dict with keys ``AUC``, ``AUPRC``, ``F1``, ``MCC``, ``Brier``, ``ECE``
        and ``Threshold``, all Python floats.
    """
    hard_labels = (y_prob >= thr).astype(int)

    metrics = {}
    # Metrics computed on the raw probabilities.
    metrics['AUC'] = float(roc_auc_score(y_true, y_prob))
    metrics['AUPRC'] = float(average_precision_score(y_true, y_prob))
    # Metrics computed on the thresholded class labels.
    metrics['F1'] = float(f1_score(y_true, hard_labels))
    metrics['MCC'] = float(matthews_corrcoef(y_true, hard_labels))
    # Calibration metrics, again on the probabilities.
    metrics['Brier'] = float(brier_score_loss(y_true, y_prob))
    metrics['ECE'] = float(expected_calibration_error(y_true, y_prob, n_bins=10))
    metrics['Threshold'] = float(thr)
    return metrics

def find_best_thr(y_true, y_prob):
    """Return the decision threshold that maximises F1 on the given data.

    Nineteen evenly spaced candidates from 0.05 to 0.95 are scored; when
    several share the best F1, the smallest of them is returned.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted positive-class probabilities (NumPy array).

    Returns:
        The selected threshold as a Python float.
    """
    candidates = np.linspace(0.05, 0.95, 19)
    scores = [f1_score(y_true, (y_prob >= cand).astype(int)) for cand in candidates]
    # np.argmax returns the first maximal index, i.e. the lowest tied threshold.
    return float(candidates[int(np.argmax(scores))])


In [14]:
# 功能：单 seed 下训练并评估纯 ProteinBERT（多层 hidden/global 拼接）基线，在验证集选阈值后在测试集打分
def run_baseline_one_seed(seed):
    """Fine-tune and evaluate the sequence-only ProteinBERT baseline once.

    Reads the module-level ``train_df``, ``test_df`` and ``PROJECT_ROOT``.
    A stratified 10% of ``train_df`` is held out for validation; the decision
    threshold is picked on that hold-out with ``find_best_thr`` and then
    applied to the test set.

    Args:
        seed: Used as ``random_state`` of the train/validation split. Within
            this function it is not passed anywhere else.

    Returns:
        The metrics dict produced by ``evaluate_binary`` on the test set.
    """
    seq_len = 512
    batch_size = 8

    fit_part, valid_part = train_test_split(
        train_df[['seq', 'label']],
        test_size=0.1,
        stratify=train_df['label'],
        random_state=seed,
    )

    # Binary output with labels {0, 1}.
    # NOTE(review): the first OutputType argument is positional in the
    # original; its meaning is defined by the proteinbert package.
    spec = OutputSpec(OutputType(False, 'binary'), [0, 1])

    pretrained_model_generator, input_encoder = load_pretrained_model(
        local_model_dump_dir=f'{PROJECT_ROOT}/proteinbert_models',
        download_model_dump_if_not_exists=True,
        validate_downloading=False,
    )

    generator = FinetuningModelGenerator(
        pretrained_model_generator,
        output_spec=spec,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=0.4,
        head_type='two_layer',
        loss_type='bce',
        lr=2e-5,
    )

    finetune(
        generator,
        input_encoder,
        spec,
        fit_part['seq'],
        fit_part['label'],
        valid_part['seq'],
        valid_part['label'],
        seq_len=seq_len,
        batch_size=batch_size,
        max_epochs_per_stage=8,
        begin_with_frozen_pretrained_layers=True,
        n_final_epochs=0,
    )

    # Build the model from the generator only after fine-tuning has run.
    model = generator.create_model(seq_len)

    def predict_prob(sequences):
        """Encode a sequence column and return flat predicted probabilities."""
        encoded = input_encoder.encode_X(sequences.tolist(), seq_len)
        return model.predict(encoded, batch_size=batch_size, verbose=0).reshape(-1)

    # Threshold is chosen on the validation hold-out, never on the test set.
    valid_prob = predict_prob(valid_part['seq'])
    thr = find_best_thr(valid_part['label'].to_numpy(), valid_prob)

    test_prob = predict_prob(test_df['seq'])
    return evaluate_binary(test_df['label'].to_numpy(), test_prob, thr=thr)


In [15]:
# Purpose: for every seed, run the baseline and the two fusion experiments
# and collect one result row per (experiment, seed) into exp_df. Per the
# original note, the multi-layer representation is first compressed through a
# bottleneck and then fused with the PSSM branch at equal width.
cfg = FusionTrainConfig(
    seq_len=512,
    batch_size=8,
    # Two-stage schedule: epochs and learning rate for each stage.
    frozen_epochs=6,
    unfrozen_epochs=12,
    frozen_lr=1e-4,
    unfrozen_lr=2e-5,
    # Per-branch dropout and layer widths.
    pssm_dropout=0.3,
    global_dropout=0.3,
    pssm_hidden_dim=128,
    global_hidden_dim=128,
    global_bottleneck_dim=64,
    fusion_hidden_dim=128,
    use_hidden_global_concat=True,
)


def _load_fresh_pretrained():
    """Reload the pretrained generator and input encoder from the local model dump."""
    return load_pretrained_model(
        local_model_dump_dir=f'{PROJECT_ROOT}/proteinbert_models',
        download_model_dump_if_not_exists=True,
        validate_downloading=False,
    )


all_rows = []
for seed in SEEDS:
    print(f'Running seed={seed}')

    # 1) Sequence-only baseline.
    base_metrics = run_baseline_one_seed(seed)
    all_rows.append({'Exp': 'Baseline_ProteinBERT', 'Seed': seed, **base_metrics})

    # 2) Ablation using only the first 110 feature columns.
    # NOTE(review): assumes the cache orders the RPSSM columns first - confirm.
    pretrained_model_generator, input_encoder = _load_fresh_pretrained()
    rpssm_metrics = run_finetune_with_pssm(
        pretrained_model_generator,
        input_encoder,
        train_df,
        test_df,
        feature_cols=feature_cols[:110],
        seed=seed,
        cfg=cfg,
    )
    all_rows.append({'Exp': 'Ablation_RPSSM_110', 'Seed': seed, **rpssm_metrics})

    # 3) Main experiment using every cached feature column; the pretrained
    # model is reloaded so each run starts from a fresh generator.
    pretrained_model_generator, input_encoder = _load_fresh_pretrained()
    exp15_metrics = run_finetune_with_pssm(
        pretrained_model_generator,
        input_encoder,
        train_df,
        test_df,
        feature_cols=feature_cols,
        seed=seed,
        cfg=cfg,
    )
    all_rows.append({'Exp': 'Exp15_ProteinBERT_PSSM310', 'Seed': seed, **exp15_metrics})

exp_df = pd.DataFrame(all_rows)
exp_df


Running seed=0
[2026_02_13-13:58:04] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-13:58:04] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-13:58:04] Training with frozen pretrained layers...


2026-02-13 13:58:04.179645: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2026-02-13 13:58:04.181404: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2026-02-13 13:58:04.214052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:2a:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-13 13:58:04.214226: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:ab:00.0 name: NVIDIA L40S computeCapability: 8.9
coreClock: 2.52GHz coreCount: 142 deviceMemorySize: 44.53GiB deviceMemoryBandwidth: 804.75GiB/s
2026-02-13 13:58:04.214250: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2026-02-13 13:58:04.2

Epoch 1/8


2026-02-13 13:58:12.561163: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2026-02-13 13:58:13.297685: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2026-02-13 13:58:13.309636: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2026-02-13 13:58:13.310437: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2026-02-13 13:58:15.810656: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2026-02-13 13:58:16.029332: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-13:59:11] Training the entire fine-tuned model...
[2026_02_13-13:59:39] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Running seed=11
[2026_02_13-14:03:48] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_02_13-14:03:48] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_02_13-14:03:48] Training with frozen pretrained layers...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[2026_02_13-14:04:24] Training the entire fine-tuned model...
[2026_02_13-14:04:31] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Running seed=22
[2026_02_13-14:08:01] Training set: Filtered out 0 of 996 (0.0%) records of lengths exce

Unnamed: 0,Exp,Seed,AUC,AUPRC,F1,MCC,Brier,ECE,Threshold
0,Baseline_ProteinBERT,0,0.870562,0.59603,0.416667,0.358148,0.057541,0.05379,0.15
1,Ablation_RPSSM_110,0,0.885059,0.606118,0.45283,0.397061,0.060402,0.043639,0.4
2,Exp15_ProteinBERT_PSSM310,0,0.886982,0.615714,0.477612,0.42595,0.072465,0.066497,0.4
3,Baseline_ProteinBERT,11,0.895858,0.622127,0.528302,0.480255,0.067429,0.065583,0.55
4,Ablation_RPSSM_110,11,0.908432,0.691046,0.595745,0.563792,0.050634,0.045323,0.45
5,Exp15_ProteinBERT_PSSM310,11,0.928994,0.72619,0.638298,0.610422,0.048686,0.038688,0.55
6,Baseline_ProteinBERT,22,0.903402,0.633933,0.467532,0.424618,0.053819,0.052904,0.15
7,Ablation_RPSSM_110,22,0.904142,0.675248,0.5,0.44754,0.057499,0.051433,0.4
8,Exp15_ProteinBERT_PSSM310,22,0.883284,0.613421,0.533333,0.485119,0.057954,0.048066,0.35
9,Baseline_ProteinBERT,33,0.886834,0.609703,0.423529,0.379818,0.072381,0.07693,0.2


In [16]:
# Purpose: aggregate each metric per experiment (mean and standard deviation
# across seeds) and rank the experiments by mean AUPRC, best first.
metric_cols = ['AUC', 'AUPRC', 'F1', 'MCC', 'Brier', 'ECE']
summary = (
    exp_df.groupby('Exp')[metric_cols]
    .agg(['mean', 'std'])
    .sort_values(('AUPRC', 'mean'), ascending=False)
)
summary


Unnamed: 0_level_0,AUC,AUC,AUPRC,AUPRC,F1,F1,MCC,MCC,Brier,Brier,ECE,ECE
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Exp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Ablation_RPSSM_110,0.897692,0.009604,0.657032,0.032155,0.525416,0.077424,0.486134,0.095735,0.055556,0.003648,0.048144,0.00345
Exp15_ProteinBERT_PSSM310,0.90142,0.018612,0.648604,0.045897,0.545978,0.064561,0.507445,0.070373,0.058311,0.008693,0.047319,0.011341
Baseline_ProteinBERT,0.886598,0.013516,0.604456,0.028341,0.453693,0.046073,0.404309,0.048904,0.062082,0.00761,0.059683,0.011431


In [17]:
# Purpose: write the per-run results and the aggregated summary to CSV files.
output_dir = f'{WORK_ROOT}/features'
os.makedirs(output_dir, exist_ok=True)

report_path = f'{output_dir}/exp15_results.csv'
summary_path = f'{output_dir}/exp15_summary.csv'

# The per-run table drops its positional index; the summary keeps its index,
# which holds the experiment names.
exp_df.to_csv(report_path, index=False)
summary.to_csv(summary_path)

print('saved:', report_path)
print('saved:', summary_path)


saved: /home/nemophila/projects/protein_bert/pssm_work/features/exp15_results.csv
saved: /home/nemophila/projects/protein_bert/pssm_work/features/exp15_summary.csv
