In [3]:
# Directory that holds the benchmark CSV files; the loading code joins it with
# '<BENCHMARK_NAME>.train.csv' and '<BENCHMARK_NAME>.test.csv'.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
BENCHMARKS_DIR = '/home/nemophila/projects/protein_bert/anticrispr_benchmarks'

In [4]:
import os
# Expose only CUDA device "1" to this process. The variable is set here, ahead of
# the tensorflow import below, so it is already in the environment when the
# framework is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import pandas as pd
from IPython.display import display

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# ===================== 1. Benchmark name (the dataset's file-name prefix) =====================
BENCHMARK_NAME = 'anticrispr_binary'  # replaces the original 'signalP_binary' prefix

# Binary output type.
# NOTE(review): the first OutputType argument (False) presumably selects a global,
# one-label-per-sequence output rather than a per-residue one - confirm against
# proteinbert's OutputType signature.
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]  # this dataset is also a two-class (0/1) problem, so no change is needed
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

# ===================== 2. Dataset root directory =====================
# Absolute path of the anticrispr_benchmarks folder that holds the CSV files.
BENCHMARKS_DIR = '/home/nemophila/projects/protein_bert/anticrispr_benchmarks'

# ===================== 3. Load the train / test splits =====================
# Columns the rest of the notebook reads from every split.
REQUIRED_COLUMNS = ('seq', 'label')


def load_benchmark_csv(csv_path):
    """Read one benchmark CSV and return its cleaned records.

    Rows containing any missing value and exact duplicate rows are dropped.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A pandas DataFrame with the cleaned records.

    Raises:
        ValueError: if the file lacks a 'seq' or 'label' column. Failing here
            gives a clearer message than a KeyError raised later during
            splitting, training or evaluation.
    """
    records = pd.read_csv(csv_path)
    missing = [column for column in REQUIRED_COLUMNS if column not in records.columns]
    if missing:
        raise ValueError('%s is missing required column(s): %s' % (csv_path, ', '.join(missing)))
    return records.dropna().drop_duplicates()


# Training file (<BENCHMARK_NAME>.train.csv) and test file (<BENCHMARK_NAME>.test.csv).
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
train_valid_set = load_benchmark_csv(train_set_file_path)
test_set = load_benchmark_csv(test_set_file_path)

# Sequences shared between the training file and the test file would leak
# information into the evaluation, so report them instead of staying silent.
n_leaked = int(test_set['seq'].isin(train_valid_set['seq']).sum())
if n_leaked > 0:
    print(f'WARNING: {n_leaked} test set sequences also appear in the training data; test metrics may be optimistic.')

# Carve a validation set out of the training records, stratified by label;
# the fixed random_state keeps the split identical across runs.
train_set, valid_set = train_test_split(train_valid_set, stratify = train_valid_set['label'], test_size = 0.1, random_state = 0)

# Print the split sizes as a quick check that loading succeeded.
print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

# ===================== Nothing below needs changing (generic training / evaluation logic) =====================
# Obtain the pre-trained model generator together with its input encoder.
pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs gives the model output access to the
# hidden layers (on top of the output).
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

# Callbacks handed to every training stage: a plateau-based learning-rate
# reduction and early stopping that restores the best weights.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1)
early_stopper = keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True)
training_callbacks = [lr_scheduler, early_stopper]

# Hyper-parameters of the fine-tuning run, gathered in one place.
finetune_settings = dict(
    seq_len = 512,
    batch_size = 32,
    max_epochs_per_stage = 40,
    lr = 1e-04,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = 1e-05,
    callbacks = training_callbacks,
)

# Fine-tune on the training records, monitoring the validation records.
finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    train_set['seq'], train_set['label'],
    valid_set['seq'], valid_set['label'],
    **finetune_settings,
)

# Score the fine-tuned model on the held-out test set.
results, confusion_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_set['seq'], test_set['label'],
    start_seq_len = 512, start_batch_size = 32,
)

# Show each result table underneath its heading.
for heading, table in (('Test-set performance:', results), ('Confusion matrix:', confusion_matrix)):
    print(heading)
    display(table)

996 training set records, 111 validation set records, 286 test set records.
[2026_01_31-16:39:14] Training set: Filtered out 0 of 996 (0.0%) records of lengths exceeding 510.
[2026_01_31-16:39:14] Validation set: Filtered out 0 of 111 (0.0%) records of lengths exceeding 510.
[2026_01_31-16:39:14] Training with frozen pretrained layers...
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 6/40

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2026_01_31-16:39:27] Training the entire fine-tuned model...
[2026_01_31-16:39:32] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40

Epoch 00002: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40

Epoch 00003: ReduceLROnPlateau reducing learning rate to 1e-05.
[2026_01_31-16:39:41] Training on final epochs of sequence length 1024...
[2026_01_31-16:39:41] Trai

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,286,0.89571
All,286,0.89571


Confusion matrix:


Unnamed: 0,0,1
0,257,3
1,16,10
