In [1]:
"""
Train a new model.
"""
from __future__ import annotations

import time
from sklearn.model_selection import KFold, StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.utils.data import IterableDataset, dataloader
from multiprocessing.reduction import ForkingPickler
from sklearn.metrics import average_precision_score as average_precision
from tqdm import tqdm
from typing import Callable, NamedTuple, Optional
from collections import OrderedDict
import sys
import numpy as np
import argparse
import pandas as pd
import torch.optim as optim
from torch.optim import Optimizer
from src.models.mvsf import ModelAffinity
from src.utils import *
from multiprocessing.reduction import ForkingPickler
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch.cuda.amp import GradScaler, autocast
from torcheval.metrics.functional import r2_score

# 解决使用multiprocessing模块时由于Tensor对象内部实现机制导致的序列化错误
default_collate_func = dataloader.default_collate
def default_collate_override(batch):
    dataloader._use_shared_memory = False
    return default_collate_func(batch)
setattr(dataloader, 'default_collate', default_collate_override)
for t in torch._storage_classes:
    if sys.version_info[0] == 2:
        if t in ForkingPickler.dispatch:
            del ForkingPickler.dispatch[t]
    else:
        if t in ForkingPickler._extra_reducers:
            del ForkingPickler._extra_reducers[t]

class TrainArguments(NamedTuple):
    cmd: str
    device: int
    train: str
    test: str
    no_augment: bool
    augment_weight: float
    weight_module1: float
    weight_module2: float
    num_epochs: int
    batch_size: int
    weight_decay: float
    lr: float
    kfolds: int
    outfile: Optional[str]
    save_prefix: Optional[str]
    checkpoint: Optional[str]
    seed: Optional[int]
    func: Callable[[TrainArguments], None]

def add_args(parser):
    data_grp = parser.add_argument_group("Data")
    contact_grp = parser.add_argument_group("Contact Module")
    train_grp = parser.add_argument_group("Training")
    misc_grp = parser.add_argument_group("Output and Device")

    # Data
    data_grp.add_argument("--train", default="datasets/pairs_sabdab.csv", help="list of training pairs")
    data_grp.add_argument("--test", default="datasets/pairs_benchmark.csv", help="list of validation/testing pairs")
    data_grp.add_argument("--seq-path", default="datasets/seq_natural.fasta")
    data_grp.add_argument("--feature-path", default="datasets/seq_natural_embedding.csv")
    data_grp.add_argument("--no-augment", default=True, help="data is automatically augmented by adding (B A) for all pairs (A B). Set this flag to not augment data",)
    data_grp.add_argument("--augment-weight", type=float, default=0.5, help="weight of augment data",)

    # Model
    contact_grp.add_argument("--weight-module1", type=float, default=1, help="weight of module1",)
    contact_grp.add_argument("--weight-module2", type=float, default=1, help="weight of module1",)

    # Training
    train_grp.add_argument("--num-epochs", type=int, default=30, help="number of epochs",)
    train_grp.add_argument("--batch-size", type=int, default=16, help="minibatch size (default: 16)",)
    train_grp.add_argument("--weight-decay", type=float, default=0.00001, help="L2 regularization /0.0001",)  # 正则化项的设置
    train_grp.add_argument("--lr", type=float, default=0.00001, help="learning rate",)
    train_grp.add_argument("--kfolds", type=int, default=10)
    train_grp.add_argument("--cross-validate", default=True, help="cross validate",)

    # Output and Device
    misc_grp.add_argument("-o", "--outfile", help="output file path (default: stdout)")
    misc_grp.add_argument("--save-prefix", help="path prefix for saving models")
    misc_grp.add_argument("-d", "--device", type=int, required=True, help="compute device to use")
    misc_grp.add_argument("--checkpoint", help="checkpoint model to start training from")
    misc_grp.add_argument("--seed", help="Set random seed", type=int)
    return parser

def predict_affinity(model, Lchain, Hchain, antigen, embedding_tensor, aaindex_feature, use_cuda):
    b = len(Hchain)
    lchain_embeddings = []
    hchain_embeddings = []
    ag_embeddings = []

    lchain_aaindex = []
    hchain_aaindex = []
    ag_aaindex = []

    for i in range(b):
        lchain_embedding = embedding_tensor[Lchain[i]]
        hchain_embedding = embedding_tensor[Hchain[i]]
        ag_embedding = embedding_tensor[antigen[i]]

        lchain_aaindex.append(aaindex_feature[Lchain[i]])
        hchain_aaindex.append(aaindex_feature[Hchain[i]])
        ag_aaindex.append(aaindex_feature[antigen[i]])

        lchain_embeddings.append(lchain_embedding)
        hchain_embeddings.append(hchain_embedding)
        ag_embeddings.append(ag_embedding)

    if use_cuda:
        lchain_embeddings = torch.stack(lchain_embeddings, 0).cuda()
        hchain_embeddings = torch.stack(hchain_embeddings, 0).cuda()
        ag_embeddings = torch.stack(ag_embeddings, 0).cuda()

        lchain_aaindex = torch.stack(lchain_aaindex, 0).cuda()
        hchain_aaindex = torch.stack(hchain_aaindex, 0).cuda()
        ag_aaindex = torch.stack(ag_aaindex, 0).cuda()



    ph = model.predict(lchain_aaindex, hchain_aaindex, ag_aaindex, lchain_embeddings, hchain_embeddings, ag_embeddings)
    return ph

def model_eval(model, test_iterator, embedding_tensors, aaindex_feature, write, weight1, weight2, use_cuda):

    p_hat = []
    true_y = []

    for lchain, hchain, antigen, y in test_iterator:

        ph = predict_affinity(model, lchain, hchain, antigen, embedding_tensors, aaindex_feature, use_cuda)
        p_hat.append(ph)
        true_y.append(y)

    y = torch.cat(true_y, 0)

    p_hat = torch.cat(p_hat, 0)
    if use_cuda:
        y.cuda()
        p_hat = torch.Tensor([x.cuda() for x in p_hat])
        p_hat.cuda()
    criterion = nn.MSELoss()
    loss = criterion(p_hat.float(), y.float())

    with torch.no_grad():
        p_hat = p_hat.float()
        y = y.float()
        max_val = 16.9138
        min_val = 5.0400
        p_hat = (p_hat * (max_val - min_val)) + min_val

        # if write:
        #     with open('pred_skempi.csv', 'a') as f:
        #         for i in range(len(y)):
        #             f.write(str(y[i].item()) + ',' + str(p_hat[i].item()) + '\n')

        rmse = torch.sqrt(torch.mean((y - p_hat) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat)).item()
        r_2 = r2_score(y, p_hat).item()
        p = pearsonr(y, p_hat).item()

    return loss, rmse, mae, r_2, p

def train_model(args, output):
    # Create data sets
    batch_size = args.batch_size
    use_cuda = (args.device > -1) and torch.cuda.is_available()  # True
    train_fi = args.train
    train_df = pd.read_csv(train_fi)
    test_fi = args.test
    test_df = pd.read_csv(test_fi)


    # Train the model
    lr = args.lr
    # wd = args.weight_decay  # 0.0001
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    digits = int(np.floor(np.log10(num_epochs))) + 1
    save_prefix = args.save_prefix
    weight1 = args.weight_module1
    weight2 = args.weight_module2


    log(f'Using save prefix "{save_prefix}"', file=output)
    log(f"Training with SAM: lr={lr}", file=output)
    log(f"\tnum_epochs: {num_epochs}", file=output)
    log(f"\tbatch_size: {batch_size}", file=output)
    log(f"\tmodule 1 weight: {weight1}", file=output)
    log(f"\tmodule 2 weight: {weight2}", file=output)
    output.flush()


    if(args.cross_validate):
    # ===============================================cross validation=================================================
        k_folds = args.kfolds
        kfold = KFold(n_splits=k_folds, shuffle=False)
        for fold, (train_ids, test_ids) in enumerate(kfold.split(train_df)):
            print(f'******************************** FOLD {fold} ******************************')
            log(f'******************************** FOLD {fold} ******************************', file=output)
            train_df_fold = train_df.iloc[train_ids]
            test_df_fold = train_df.iloc[test_ids]
            train_df_fold = train_df_fold.reset_index(drop=True)
            test_df_fold = test_df_fold.reset_index(drop=True)

            train_df_fold.columns = ["light", "heavy", "antigen", "delta_g"]
            train_l_fold = train_df_fold["light"]
            train_h_fold = train_df_fold["heavy"]
            train_ag_fold = train_df_fold["antigen"]
            train_y_fold = torch.from_numpy(train_df_fold["delta_g"].values)
            train_y_fold = -train_y_fold

            max_val = 16.05654
            min_val = 5.0400
            train_y_fold = (train_y_fold - min_val) / (max_val - min_val)

            test_df_fold.columns = ["light", "heavy", "antigen", "delta_g"]
            test_l_fold = test_df_fold["light"]
            test_h_fold = test_df_fold["heavy"]
            test_ag_fold = test_df_fold["antigen"]
            test_y_fold = torch.from_numpy(test_df_fold["delta_g"].values)
            test_y_fold = -test_y_fold

            train_dataset_fold = PairedDataset(train_l_fold, train_h_fold, train_ag_fold, train_y_fold)
            train_iterator_fold = torch.utils.data.DataLoader(
                train_dataset_fold,
                batch_size=batch_size,
                collate_fn=collate_paired_sequences,
                shuffle=True,
                pin_memory=False,
                drop_last=False,
                # num_workers=2,
            )
            log(f"Loaded {len(train_l_fold)} training pairs", file=output)
            output.flush()

            test_dataset_fold = PairedDataset(test_l_fold, test_h_fold, test_ag_fold, test_y_fold)
            test_iterator_fold = torch.utils.data.DataLoader(
                test_dataset_fold,
                batch_size=batch_size,
                collate_fn=collate_paired_sequences,
                shuffle=False,
                pin_memory=False,
                drop_last=False,
                # num_workers=2,
            )

            all_proteins = set(train_l_fold).union(train_h_fold).union(train_ag_fold) \
                .union(test_l_fold).union(test_h_fold).union(test_ag_fold)
            fastaPath = args.seq_path
            embeddingPath = args.feature_path
            embeddings = embed_dict(fastaPath, embeddingPath)
            log("Embedded successfully...", file=output)
            aaindex_feature = seq_aaindex_dict(all_proteins, fastaPath)

            model = ModelAffinity(batch_size, use_cuda)
            model.use_cuda = use_cuda  # default is False
            if use_cuda:
                model.cuda()
            params = [p for p in model.parameters() if p.requires_grad]
            base_optimizer = optim.SGD
            optimizer = SAM(params, base_optimizer, lr=lr, weight_decay=args.weight_decay)

            batch_report_fmt = ("[{}/{}] training {:.1%}: Loss={:.6}, MSE={:.6}, MAE={:.6}")
            epoch_report_fmt = (
                "-----------------------------------Finished Epoch {}/{}: Loss={:.6}, RMSE={:.6}, MAE={:.6}, r_2={:.6}, p={:.6}")

            N = len(train_iterator_fold) * batch_size


            for epoch in range(num_epochs):
                if epoch == 10:
                    optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / 10
                print("lr:", optimizer.param_groups[0]['lr'])
                model.train()
                n = 0
                loss_accum = 0
                # acc_accum = 0
                mse_accum = 0
                mae_accum = 0
                optimizer.zero_grad()
                all_y = []
                all_p_hat = []
                for (lchain, hchain, antigen, y) in train_iterator_fold:

                    phat = predict_affinity(
                        model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda=use_cuda)
                    phat = phat.float().view(-1)

                    if use_cuda:
                        y = y.cuda()
                    # y = Variable(y)
                    y = y.float()

                    criterion = nn.MSELoss()
                    b = len(y)
                    loss = criterion(phat, y)
                    loss.requires_grad_(True)
                    loss.backward()
                    # scaler.scale(loss).backward()
                    if use_cuda:
                        y = y.cpu()
                        phat = phat.cpu()
                    all_y.append(y)
                    all_p_hat.append(phat)

                    with torch.no_grad():
                        phat = phat.float()
                        y = y.float()
                        mse = torch.mean((y - phat) ** 2).item()
                        mae = torch.mean(torch.abs(y - phat)).item()
                    n += b
                    delta = b * (loss.item() - loss_accum)
                    loss_accum += delta / n
                    delta = b * (mse - mse_accum)
                    mse_accum += delta / n
                    delta = b * (mae - mae_accum)
                    mae_accum += delta / n
                    report = (n - b) // 100 < n // 100

                    optimizer.step()

                    if report:
                        tokens = [epoch + 1, num_epochs, n / N, loss_accum, mse_accum, mae_accum, ]
                        log(batch_report_fmt.format(*tokens), file=output)
                        output.flush()

                model.eval()
                with torch.no_grad():
                    if epoch+1 == 30:
                        write = True
                    else:
                        write = False
                    (inter_loss, inter_rmse, inter_mae, inter_r_2, inter_p,) = model_eval(
                        model, test_iterator_fold, embeddings, aaindex_feature, write, weight1, weight2, use_cuda=use_cuda)

                    tokens = [epoch + 1, num_epochs, inter_loss, inter_mae, inter_rmse, inter_r_2, inter_p, ]

                    # scheduler.step(inter_mse)
                    log(epoch_report_fmt.format(*tokens), file=output)
                    output.flush()

                    # Save the model (every epoch)
                    # if save_prefix is not None:
                    #     save_path = (save_prefix + "_epoch" + str(epoch + 1).zfill(digits) + ".pth")
                    #     log(f"Saving model to {save_path}", file=output)
                    #     model.cpu()
                    #     torch.save(model, save_path)
                    #     if use_cuda:
                    #         model.cuda()

                    # update learning rate
                    # scheduler.step()

                # output.flush()
            # break
    else:
        num_samples = len(train_df)
        train_df.columns = ["light", "heavy", "antigen", "delta_g"]
        train_l = train_df["light"]
        train_h = train_df["heavy"]
        train_ag = train_df["antigen"]
        train_y = torch.from_numpy(train_df["delta_g"].values)
        train_y = -train_y
        train_y = NormalizeData(train_y)

        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
        test_l = test_df["light"]
        test_h = test_df["heavy"]
        test_ag = test_df["antigen"]
        test_y = torch.from_numpy(test_df["delta_g"].values)
        test_y = -test_y

        train_dataset = PairedDataset(train_l, train_h, train_ag, train_y)
        train_iterator = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            collate_fn=collate_paired_sequences,
            shuffle=True,
            pin_memory=False,
            drop_last=True,
            # num_workers=4,
        )
        log(f"Loaded {len(train_l)} training pairs", file=output)
        output.flush()

        test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
        test_iterator = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=batch_size,
            collate_fn=collate_paired_sequences,
            shuffle=False,
            pin_memory=False,
            drop_last=True,
            # num_workers=4,
        )

        log(f"Loaded {len(test_l)} test pairs", file=output)
        log("Loading embeddings...", file=output)
        output.flush()

        all_proteins = set(train_l).union(train_h).union(train_ag).union(test_l).union(test_h).union(test_ag)

        fastaPath = args.seq_path
        embeddingPath = args.feature_path
        embeddings = embed_dict(fastaPath, embeddingPath)
        log("embeded successfully...", file=output)
        aaindex_feature = seq_aaindex_dict(all_proteins, fastaPath)

        model = ModelAffinity(batch_size, use_cuda)
        if use_cuda:
            model.cuda()

        params = [p for p in model.parameters() if p.requires_grad]
        base_optimizer = optim.Adam
        optimizer = SAM(params, base_optimizer, lr=lr)
        log(f'Using save prefix "{save_prefix}"', file=output)
        log(f"Training with SAM: lr={lr}", file=output)
        log(f"\tnum_epochs: {num_epochs}", file=output)
        log(f"\tbatch_size: {batch_size}", file=output)
        log(f"\tmodule 1 weight: {weight1}", file=output)
        log(f"\tmodule 2 weight: {weight2}", file=output)
        output.flush()

        batch_report_fmt = ("[{}/{}] training {:.1%}: Loss={:.6}, MSE={:.6}, MAE={:.6}")
        epoch_report_fmt = (
            "-----------------------------------Finished Epoch {}/{}: Loss={:.6}, RMSE={:.6}, MAE={:.6}, r_2={:.6}, p={:.6}")

        N = len(train_iterator) * batch_size
        for epoch in range(num_epochs):

            model.train()
            n = 0
            loss_accum = 0
            mse_accum = 0
            mae_accum = 0
            all_y = []
            all_p_hat = []
            optimizer.zero_grad()
            for (lchain, hchain, antigen, y) in train_iterator:
                phat = predict_affinity(
                    model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda=use_cuda)
                phat = phat.float().view(-1)

                if use_cuda:
                    y = y.cuda()
                # y = Variable(y)
                y = y.float()

                criterion = nn.MSELoss()
                b = len(y)
                loss = criterion(phat, y)
                loss.requires_grad_(True)
                loss.backward()
                # scaler.scale(loss).backward()
                if use_cuda:
                    y = y.cpu()
                    phat = phat.cpu()

                all_y.append(y)
                all_p_hat.append(phat)

                with torch.no_grad():
                    phat = phat.float()
                    y = y.float()
                    mse = torch.mean((y - phat) ** 2).item()
                    mae = torch.mean(torch.abs(y - phat)).item()
                n += b
                delta = b * (loss.item() - loss_accum)
                loss_accum += delta / n
                delta = b * (mse - mse_accum)
                mse_accum += delta / n
                delta = b * (mae - mae_accum)
                mae_accum += delta / n
                report = (n - b) // 100 < n // 100

                optimizer.step()
                if report:
                    tokens = [epoch + 1, num_epochs, n / N, loss_accum, mse_accum, mae_accum, ]
                    log(batch_report_fmt.format(*tokens), file=output)
                    output.flush()

            model.eval()
            with torch.no_grad():

                (inter_loss, inter_rmse, inter_mae, inter_r_2, inter_p,) = model_eval(
                    model, test_iterator, embeddings, aaindex_feature, weight1, weight2, use_cuda=use_cuda)
                tokens = [epoch + 1, num_epochs, inter_loss, inter_mae, inter_rmse, inter_r_2, inter_p, ]
                # scheduler.step(inter_mse)
                log(epoch_report_fmt.format(*tokens), file=output)
                output.flush()

                # Save the model (every epoch)
                # if save_prefix is not None:
                #     save_path = (save_prefix + "_epoch" + str(epoch + 1).zfill(digits) + ".pth")
                #     log(f"Saving model to {save_path}", file=output)
                #     model.cpu()
                #     torch.save(model, save_path)
                #     if use_cuda:
                #         model.cuda()

                # update learning rate
                # scheduler.step()

    # Save the model (final)
    # if save_prefix is not None:
    #     save_path = save_prefix + "_final.pth"
    #     log(f"Saving final model to {save_path}", file=output)
    #     model.cpu()
    #     torch.save(model, save_path)
    #     if use_cuda:
    #         model.cuda()

def main(args):
    output = args.outfile
    if output is None:
        output = sys.stdout
    else:
        output = open(output, "w")

    log(f'Called as: {" ".join(sys.argv)}', file=output, print_also=True)

    # Set the device
    device = args.device
    use_cuda = (device > -1) and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(device)
        log(
            f"Using CUDA device {device} - {torch.cuda.get_device_name(device)}",
            file=output,
            print_also=True,
        )
    else:
        log("Using CPU", file=output, print_also=True)
        device = "cpu"

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
    train_model(args, output)

    output.close()

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# 确保目录存在
os.makedirs("my_datasets", exist_ok=True)

# 设置文件路径
input_file = "my_datasets/pairs_benchmark.csv"
output_file_80 = "my_datasets/pairs_benchmark_1.csv"
output_file_20 = "my_datasets/pairs_benchmark_2.csv"

# 读取原始CSV文件
print(f"读取文件: {input_file}")
try:
    df = pd.read_csv(input_file)
    print(f"成功读取数据，共 {len(df)} 行")
except Exception as e:
    print(f"读取文件时出错: {str(e)}")
    exit(1)

# 显示原始数据的前几行
print("\n原始数据预览:")
print(df.head())

# 随机分割数据 (80% / 20%)
df_80, df_20 = train_test_split(df, test_size=0.2, random_state=42)

# 输出分割后的数据统计
print("\n数据分割统计:")
print(f"原始数据总行数: {len(df)}")
print(f"80%子集行数: {len(df_80)} ({len(df_80)/len(df)*100:.2f}%)")
print(f"20%子集行数: {len(df_20)} ({len(df_20)/len(df)*100:.2f}%)")

# 保存分割后的数据
df_80.to_csv(output_file_80, index=False)
df_20.to_csv(output_file_20, index=False)

print(f"\n已将80%的数据保存至: {output_file_80}")
print(f"已将20%的数据保存至: {output_file_20}")

# 验证保存的文件
print("\n验证文件大小:")
file_size_original = os.path.getsize(input_file)
file_size_80 = os.path.getsize(output_file_80)
file_size_20 = os.path.getsize(output_file_20)

print(f"原始文件大小: {file_size_original/1024:.2f} KB")
print(f"80%文件大小: {file_size_80/1024:.2f} KB ({file_size_80/file_size_original*100:.2f}%)")
print(f"20%文件大小: {file_size_20/1024:.2f} KB ({file_size_20/file_size_original*100:.2f}%)")

# 检查两个子集是否有重叠
overlap_count = pd.merge(df_80, df_20, how='inner').shape[0]
if overlap_count > 0:
    print(f"\n警告: 两个子集之间存在 {overlap_count} 行重叠数据!")
else:
    print("\n验证成功: 两个子集之间没有重叠数据")

读取文件: my_datasets/pairs_benchmark.csv
成功读取数据，共 2424 行

原始数据预览:
        light       heavy         antigen  delta_g
0  1AHW_light  1AHW_heavy  1AHW_antigen_1   -10.90
1  1AHW_light  1AHW_heavy  1AHW_antigen_2    -7.53
2  1AHW_light  1AHW_heavy  1AHW_antigen_3   -10.46
3  1AHW_light  1AHW_heavy  1AHW_antigen_4   -11.86
4  1AHW_light  1AHW_heavy  1AHW_antigen_5   -10.20

数据分割统计:
原始数据总行数: 2424
80%子集行数: 1939 (79.99%)
20%子集行数: 485 (20.01%)

已将80%的数据保存至: my_datasets/pairs_benchmark_1.csv
已将20%的数据保存至: my_datasets/pairs_benchmark_2.csv

验证文件大小:
原始文件大小: 116.18 KB
80%文件大小: 92.85 KB (79.92%)
20%文件大小: 23.36 KB (20.11%)

验证成功: 两个子集之间没有重叠数据


In [1]:
# 导入必要的库
from __future__ import annotations
import time
from sklearn.model_selection import KFold, StratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.utils.data import IterableDataset, dataloader
from multiprocessing.reduction import ForkingPickler
from sklearn.metrics import average_precision_score as average_precision
from tqdm import tqdm
from typing import Callable, NamedTuple, Optional
from collections import OrderedDict
import sys
import os
import numpy as np
import pandas as pd
import torch.optim as optim
from torch.optim import Optimizer
from src.models.mvsf import ModelAffinity
from src.utils import *
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torch.cuda.amp import GradScaler, autocast
from torcheval.metrics.functional import r2_score

# 解决使用multiprocessing模块时由于Tensor对象内部实现机制导致的序列化错误
default_collate_func = dataloader.default_collate
def default_collate_override(batch):
    dataloader._use_shared_memory = False
    return default_collate_func(batch)
setattr(dataloader, 'default_collate', default_collate_override)
for t in torch._storage_classes:
    if sys.version_info[0] == 2:
        if t in ForkingPickler.dispatch:
            del ForkingPickler.dispatch[t]
    else:
        if t in ForkingPickler._extra_reducers:
            del ForkingPickler._extra_reducers[t]

# 替换TrainArguments和add_args为简单的类
class Args:
    # 默认参数值
    device = 0  # 使用CUDA设备编号，-1表示使用CPU
    train = "my_datasets/pairs_benchmark_1.csv"  # 训练数据路径
    test = "my_datasets/pairs_benchmark_2.csv"  # 测试数据路径
    seq_path = "my_datasets/seq_natural.fasta"  # 序列路径
    feature_path = "my_datasets/seq_natural_embedding.csv"  # 特征路径
    no_augment = True  # 不增强数据
    augment_weight = 0.5  # 增强数据权重
    weight_module1 = 1.0  # 模块1权重
    weight_module2 = 1.0  # 模块2权重
    num_epochs = 30  # 训练轮数
    batch_size = 16  # 批大小
    weight_decay = 0.00001  # L2正则化系数
    lr = 0.00001  # 学习率
    kfolds = 10  # 交叉验证折数
    cross_validate = True  # 是否使用交叉验证
    outfile = "output.log"  # 输出文件路径
    save_prefix = "saved_models/model"  # 保存模型的路径前缀
    checkpoint = None  # 加载检查点模型
    seed = 42  # 随机种子

# 创建参数实例
args = Args()

# 确保保存路径的目录存在
os.makedirs(os.path.dirname(args.save_prefix), exist_ok=True)

# 日志输出函数
def log(message, file=None, print_also=True):
    if print_also:
        print(message)
    if file and file != sys.stdout:
        file.write(message + "\n")
        file.flush()

# 预测亲和力函数
def predict_affinity(model, Lchain, Hchain, antigen, embedding_tensor, aaindex_feature, use_cuda):
    b = len(Hchain)
    lchain_embeddings = []
    hchain_embeddings = []
    ag_embeddings = []

    lchain_aaindex = []
    hchain_aaindex = []
    ag_aaindex = []

    for i in range(b):
        lchain_embedding = embedding_tensor[Lchain[i]]
        hchain_embedding = embedding_tensor[Hchain[i]]
        ag_embedding = embedding_tensor[antigen[i]]

        lchain_aaindex.append(aaindex_feature[Lchain[i]])
        hchain_aaindex.append(aaindex_feature[Hchain[i]])
        ag_aaindex.append(aaindex_feature[antigen[i]])

        lchain_embeddings.append(lchain_embedding)
        hchain_embeddings.append(hchain_embedding)
        ag_embeddings.append(ag_embedding)

    if use_cuda:
        lchain_embeddings = torch.stack(lchain_embeddings, 0).cuda()
        hchain_embeddings = torch.stack(hchain_embeddings, 0).cuda()
        ag_embeddings = torch.stack(ag_embeddings, 0).cuda()

        lchain_aaindex = torch.stack(lchain_aaindex, 0).cuda()
        hchain_aaindex = torch.stack(hchain_aaindex, 0).cuda()
        ag_aaindex = torch.stack(ag_aaindex, 0).cuda()

    ph = model.predict(lchain_aaindex, hchain_aaindex, ag_aaindex, lchain_embeddings, hchain_embeddings, ag_embeddings)
    return ph

# 模型评估函数
def model_eval(model, test_iterator, embedding_tensors, aaindex_feature, write, weight1, weight2, use_cuda):
    p_hat = []
    true_y = []

    for lchain, hchain, antigen, y in test_iterator:
        ph = predict_affinity(model, lchain, hchain, antigen, embedding_tensors, aaindex_feature, use_cuda)
        p_hat.append(ph)
        true_y.append(y)

    y = torch.cat(true_y, 0)

    p_hat = torch.cat(p_hat, 0)
    if use_cuda:
        y.cuda()
        p_hat = torch.Tensor([x.cuda() for x in p_hat])
        p_hat.cuda()
    criterion = nn.MSELoss()
    loss = criterion(p_hat.float(), y.float())

    with torch.no_grad():
        p_hat = p_hat.float()
        y = y.float()
        max_val = 16.9138
        min_val = 5.0400
        p_hat = (p_hat * (max_val - min_val)) + min_val

        # if write:
        #     with open('pred_skempi.csv', 'a') as f:
        #         for i in range(len(y)):
        #             f.write(str(y[i].item()) + ',' + str(p_hat[i].item()) + '\n')

        rmse = torch.sqrt(torch.mean((y - p_hat) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat)).item()
        r_2 = r2_score(y, p_hat).item()
        p = pearsonr(y, p_hat).item()

    return loss, rmse, mae, r_2, p

# 训练模型函数
def train_model(args, output):
    # 创建数据集
    batch_size = args.batch_size
    use_cuda = (args.device > -1) and torch.cuda.is_available()
    train_fi = args.train
    train_df = pd.read_csv(train_fi)
    test_fi = args.test
    test_df = pd.read_csv(test_fi)

    # 训练模型参数
    lr = args.lr
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    digits = int(np.floor(np.log10(num_epochs))) + 1
    save_prefix = args.save_prefix
    weight1 = args.weight_module1
    weight2 = args.weight_module2

    # 打印参数信息
    log(f'Using save prefix "{save_prefix}"', file=output)
    log(f"Training with SAM: lr={lr}", file=output)
    log(f"\tnum_epochs: {num_epochs}", file=output)
    log(f"\tbatch_size: {batch_size}", file=output)
    log(f"\tmodule 1 weight: {weight1}", file=output)
    log(f"\tmodule 2 weight: {weight2}", file=output)
    if output != sys.stdout:
        output.flush()

    if(args.cross_validate):
    # ===============================================cross validation=================================================
        k_folds = args.kfolds
        kfold = KFold(n_splits=k_folds, shuffle=False)
        for fold, (train_ids, test_ids) in enumerate(kfold.split(train_df)):
            print(f'******************************** FOLD {fold} ******************************')
            log(f'******************************** FOLD {fold} ******************************', file=output)
            train_df_fold = train_df.iloc[train_ids]
            test_df_fold = train_df.iloc[test_ids]
            train_df_fold = train_df_fold.reset_index(drop=True)
            test_df_fold = test_df_fold.reset_index(drop=True)

            train_df_fold.columns = ["light", "heavy", "antigen", "delta_g"]
            train_l_fold = train_df_fold["light"]
            train_h_fold = train_df_fold["heavy"]
            train_ag_fold = train_df_fold["antigen"]
            train_y_fold = torch.from_numpy(train_df_fold["delta_g"].values)
            train_y_fold = -train_y_fold

            max_val = 16.05654
            min_val = 5.0400
            train_y_fold = (train_y_fold - min_val) / (max_val - min_val)

            test_df_fold.columns = ["light", "heavy", "antigen", "delta_g"]
            test_l_fold = test_df_fold["light"]
            test_h_fold = test_df_fold["heavy"]
            test_ag_fold = test_df_fold["antigen"]
            test_y_fold = torch.from_numpy(test_df_fold["delta_g"].values)
            test_y_fold = -test_y_fold

            train_dataset_fold = PairedDataset(train_l_fold, train_h_fold, train_ag_fold, train_y_fold)
            train_iterator_fold = torch.utils.data.DataLoader(
                train_dataset_fold,
                batch_size=batch_size,
                collate_fn=collate_paired_sequences,
                shuffle=True,
                pin_memory=False,
                drop_last=False,
                # num_workers=2,
            )
            log(f"Loaded {len(train_l_fold)} training pairs", file=output)
            if output != sys.stdout:
                output.flush()

            test_dataset_fold = PairedDataset(test_l_fold, test_h_fold, test_ag_fold, test_y_fold)
            test_iterator_fold = torch.utils.data.DataLoader(
                test_dataset_fold,
                batch_size=batch_size,
                collate_fn=collate_paired_sequences,
                shuffle=False,
                pin_memory=False,
                drop_last=False,
                # num_workers=2,
            )

            all_proteins = set(train_l_fold).union(train_h_fold).union(train_ag_fold) \
                .union(test_l_fold).union(test_h_fold).union(test_ag_fold)
            fastaPath = args.seq_path
            embeddingPath = args.feature_path
            embeddings = embed_dict(fastaPath, embeddingPath)
            log("Embedded successfully...", file=output)
            aaindex_feature = seq_aaindex_dict(all_proteins, fastaPath)

            model = ModelAffinity(batch_size, use_cuda)
            model.use_cuda = use_cuda  # default is False
            if use_cuda:
                model.cuda()
            params = [p for p in model.parameters() if p.requires_grad]
            base_optimizer = optim.SGD
            optimizer = SAM(params, base_optimizer, lr=lr, weight_decay=args.weight_decay)

            batch_report_fmt = ("[{}/{}] training {:.1%}: Loss={:.6}, MSE={:.6}, MAE={:.6}")
            epoch_report_fmt = (
                "-----------------------------------Finished Epoch {}/{}: Loss={:.6}, RMSE={:.6}, MAE={:.6}, r_2={:.6}, p={:.6}")

            N = len(train_iterator_fold) * batch_size

            for epoch in range(num_epochs):
                if epoch == 10:
                    optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / 10
                print("lr:", optimizer.param_groups[0]['lr'])
                model.train()
                n = 0
                loss_accum = 0
                # acc_accum = 0
                mse_accum = 0
                mae_accum = 0
                optimizer.zero_grad()
                all_y = []
                all_p_hat = []
                for (lchain, hchain, antigen, y) in train_iterator_fold:

                    phat = predict_affinity(
                        model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda=use_cuda)
                    phat = phat.float().view(-1)

                    if use_cuda:
                        y = y.cuda()
                    # y = Variable(y)
                    y = y.float()

                    criterion = nn.MSELoss()
                    b = len(y)
                    loss = criterion(phat, y)
                    loss.requires_grad_(True)
                    loss.backward()
                    # scaler.scale(loss).backward()
                    if use_cuda:
                        y = y.cpu()
                        phat = phat.cpu()
                    all_y.append(y)
                    all_p_hat.append(phat)

                    with torch.no_grad():
                        phat = phat.float()
                        y = y.float()
                        mse = torch.mean((y - phat) ** 2).item()
                        mae = torch.mean(torch.abs(y - phat)).item()
                    n += b
                    delta = b * (loss.item() - loss_accum)
                    loss_accum += delta / n
                    delta = b * (mse - mse_accum)
                    mse_accum += delta / n
                    delta = b * (mae - mae_accum)
                    mae_accum += delta / n
                    report = (n - b) // 100 < n // 100

                    optimizer.step()

                    if report:
                        tokens = [epoch + 1, num_epochs, n / N, loss_accum, mse_accum, mae_accum, ]
                        log(batch_report_fmt.format(*tokens), file=output)
                        if output != sys.stdout:
                            output.flush()

                model.eval()
                with torch.no_grad():
                    if epoch+1 == 30:
                        write = True
                    else:
                        write = False
                    (inter_loss, inter_rmse, inter_mae, inter_r_2, inter_p,) = model_eval(
                        model, test_iterator_fold, embeddings, aaindex_feature, write, weight1, weight2, use_cuda=use_cuda)

                    tokens = [epoch + 1, num_epochs, inter_loss, inter_mae, inter_rmse, inter_r_2, inter_p, ]

                    # scheduler.step(inter_mse)
                    log(epoch_report_fmt.format(*tokens), file=output)
                    if output != sys.stdout:
                        output.flush()
                    '''
                    # 保存模型 (每个epoch)
                    if save_prefix is not None:
                        save_path = (save_prefix + f"_fold{fold}_epoch" + str(epoch + 1).zfill(digits) + ".pth")
                        log(f"Saving model to {save_path}", file=output)
                        model.cpu()
                        torch.save(model.state_dict(), save_path)
                        if use_cuda:
                            model.cuda()
                    '''
            # 保存最终模型
            if save_prefix is not None:
                save_path = save_prefix + f"_fold{fold}_final.pth"
                log(f"Saving final model to {save_path}", file=output)
                model.cpu()
                torch.save(model.state_dict(), save_path)
                if use_cuda:
                    model.cuda()
    else:
        num_samples = len(train_df)
        train_df.columns = ["light", "heavy", "antigen", "delta_g"]
        train_l = train_df["light"]
        train_h = train_df["heavy"]
        train_ag = train_df["antigen"]
        train_y = torch.from_numpy(train_df["delta_g"].values)
        train_y = -train_y
        train_y = NormalizeData(train_y)

        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
        test_l = test_df["light"]
        test_h = test_df["heavy"]
        test_ag = test_df["antigen"]
        test_y = torch.from_numpy(test_df["delta_g"].values)
        test_y = -test_y

        train_dataset = PairedDataset(train_l, train_h, train_ag, train_y)
        train_iterator = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            collate_fn=collate_paired_sequences,
            shuffle=True,
            pin_memory=False,
            drop_last=True,
            # num_workers=4,
        )
        log(f"Loaded {len(train_l)} training pairs", file=output)
        if output != sys.stdout:
            output.flush()

        test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
        test_iterator = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=batch_size,
            collate_fn=collate_paired_sequences,
            shuffle=False,
            pin_memory=False,
            drop_last=True,
            # num_workers=4,
        )

        log(f"Loaded {len(test_l)} test pairs", file=output)
        log("Loading embeddings...", file=output)
        if output != sys.stdout:
            output.flush()

        all_proteins = set(train_l).union(train_h).union(train_ag).union(test_l).union(test_h).union(test_ag)

        fastaPath = args.seq_path
        embeddingPath = args.feature_path
        embeddings = embed_dict(fastaPath, embeddingPath)
        log("embeded successfully...", file=output)
        aaindex_feature = seq_aaindex_dict(all_proteins, fastaPath)

        model = ModelAffinity(batch_size, use_cuda)
        if use_cuda:
            model.cuda()

        params = [p for p in model.parameters() if p.requires_grad]
        base_optimizer = optim.Adam
        optimizer = SAM(params, base_optimizer, lr=lr)
        log(f'Using save prefix "{save_prefix}"', file=output)
        log(f"Training with SAM: lr={lr}", file=output)
        log(f"\tnum_epochs: {num_epochs}", file=output)
        log(f"\tbatch_size: {batch_size}", file=output)
        log(f"\tmodule 1 weight: {weight1}", file=output)
        log(f"\tmodule 2 weight: {weight2}", file=output)
        if output != sys.stdout:
            output.flush()

        batch_report_fmt = ("[{}/{}] training {:.1%}: Loss={:.6}, MSE={:.6}, MAE={:.6}")
        epoch_report_fmt = (
            "-----------------------------------Finished Epoch {}/{}: Loss={:.6}, RMSE={:.6}, MAE={:.6}, r_2={:.6}, p={:.6}")

        N = len(train_iterator) * batch_size
        for epoch in range(num_epochs):

            model.train()
            n = 0
            loss_accum = 0
            mse_accum = 0
            mae_accum = 0
            all_y = []
            all_p_hat = []
            optimizer.zero_grad()
            for (lchain, hchain, antigen, y) in train_iterator:
                phat = predict_affinity(
                    model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda=use_cuda)
                phat = phat.float().view(-1)

                if use_cuda:
                    y = y.cuda()
                # y = Variable(y)
                y = y.float()

                criterion = nn.MSELoss()
                b = len(y)
                loss = criterion(phat, y)
                loss.requires_grad_(True)
                loss.backward()
                # scaler.scale(loss).backward()
                if use_cuda:
                    y = y.cpu()
                    phat = phat.cpu()

                all_y.append(y)
                all_p_hat.append(phat)

                with torch.no_grad():
                    phat = phat.float()
                    y = y.float()
                    mse = torch.mean((y - phat) ** 2).item()
                    mae = torch.mean(torch.abs(y - phat)).item()
                n += b
                delta = b * (loss.item() - loss_accum)
                loss_accum += delta / n
                delta = b * (mse - mse_accum)
                mse_accum += delta / n
                delta = b * (mae - mae_accum)
                mae_accum += delta / n
                report = (n - b) // 100 < n // 100

                optimizer.step()
                if report:
                    tokens = [epoch + 1, num_epochs, n / N, loss_accum, mse_accum, mae_accum, ]
                    log(batch_report_fmt.format(*tokens), file=output)
                    if output != sys.stdout:
                        output.flush()

            model.eval()
            with torch.no_grad():

                (inter_loss, inter_rmse, inter_mae, inter_r_2, inter_p,) = model_eval(
                    model, test_iterator, embeddings, aaindex_feature, False, weight1, weight2, use_cuda=use_cuda)
                tokens = [epoch + 1, num_epochs, inter_loss, inter_mae, inter_rmse, inter_r_2, inter_p, ]
                # scheduler.step(inter_mse)
                log(epoch_report_fmt.format(*tokens), file=output)
                if output != sys.stdout:
                    output.flush()
                '''
                # 保存模型 (每个epoch)
                if save_prefix is not None:
                    save_path = (save_prefix + "_epoch" + str(epoch + 1).zfill(digits) + ".pth")
                    log(f"Saving model to {save_path}", file=output)
                    model.cpu()
                    torch.save(model.state_dict(), save_path)
                    if use_cuda:
                        model.cuda()
                '''
        # 保存最终模型
        if save_prefix is not None:
            save_path = save_prefix + "_final.pth"
            log(f"Saving final model to {save_path}", file=output)
            model.cpu()
            torch.save(model.state_dict(), save_path)
            if use_cuda:
                model.cuda()

# 初始化输出
output = args.outfile
if output is None:
    output = sys.stdout
else:
    output = open(output, "w")

# 打印开始信息
log("Starting training with parameters:", file=output, print_also=True)
for key, value in vars(args).items():
    log(f"  {key}: {value}", file=output, print_also=True)

# 设置设备
device = args.device
use_cuda = (device > -1) and torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(device)
    log(
        f"Using CUDA device {device} - {torch.cuda.get_device_name(device)}",
        file=output,
        print_also=True,
    )
else:
    log("Using CPU", file=output, print_also=True)
    device = "cpu"

# 设置随机种子
if args.seed is not None:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

# 开始训练
train_model(args, output)

# 关闭输出文件
if output != sys.stdout:
    output.close()

Starting training with parameters:
Using CUDA device 0 - NVIDIA L20
Using save prefix "saved_models/model"
Training with SAM: lr=1e-05
	num_epochs: 30
	batch_size: 16
	module 1 weight: 1.0
	module 2 weight: 1.0
******************************** FOLD 0 ******************************
******************************** FOLD 0 ******************************
Loaded 1745 training pairs
Embedded successfully...
lr: 1e-05
[1/30] training 6.4%: Loss=0.0480409, MSE=0.0480409, MAE=0.168816
[1/30] training 11.8%: Loss=0.0483327, MSE=0.0483327, MAE=0.172164
[1/30] training 17.3%: Loss=0.0499472, MSE=0.0499472, MAE=0.173657
[1/30] training 22.7%: Loss=0.0475479, MSE=0.0475479, MAE=0.169036
[1/30] training 29.1%: Loss=0.0448904, MSE=0.0448904, MAE=0.164549
[1/30] training 34.5%: Loss=0.0431689, MSE=0.0431689, MAE=0.161731
[1/30] training 40.0%: Loss=0.0427187, MSE=0.0427187, MAE=0.160904
[1/30] training 45.5%: Loss=0.042431, MSE=0.042431, MAE=0.160464
[1/30] training 51.8%: Loss=0.0422203, MSE=0.0422203

In [13]:
import pandas as pd
import os

# 确保目录存在
os.makedirs('datasets', exist_ok=True)

# 读取TSV文件
file_path = "/root/private_data/ckx/affinity_contract/MVSF-AB/my_datasets/final_dataset_train_no_du.tsv"
df = pd.read_csv(file_path, sep='\t')

print(f"成功读取TSV文件：{file_path}")
print(f"文件包含 {len(df)} 条记录和 {len(df.columns)} 列")
print("列名:", df.columns.tolist())

# 创建一个字典来存储所有唯一的序列
sequences = {}
pair_data = []

# 处理每个数据点
for i, row in df.iterrows():
    pdb_id = row['pdb_id']
    variant_id = i+1  # 为每个变体分配一个唯一ID
    
    # 创建ID
    heavy_id = f"{pdb_id}_heavy"
    light_id = f"{pdb_id}_light"
    antigen_id = f"{pdb_id}_antigen_{variant_id}"
    
    # 存储序列
    sequences[heavy_id] = row['heavy_sequence']
    sequences[light_id] = row['light_sequence']
    sequences[antigen_id] = row['antigen_sequence']
    
    # 存储配对信息
    pair_data.append({
        'light': light_id,
        'heavy': heavy_id,
        'antigen': antigen_id,
        'delta_g': float(row['delta_g'])
    })

# 创建CSV文件
pair_df = pd.DataFrame(pair_data)
pair_df.to_csv('my_datasets/pairs_benchmark.csv', index=False)
print(f"已创建 datasets/pairs_benchmark.csv，包含 {len(pair_data)} 条记录")

# 创建FASTA文件
with open('my_datasets/seq_natural.fasta', 'w') as f:
    for seq_id, sequence in sequences.items():
        f.write(f">{seq_id}\n{sequence}\n")
print(f"已创建 datasets/seq_natural.fasta，包含 {len(sequences)} 个序列")

# 显示文件的前几行内容
print("\n===== pairs_benchmark.csv 内容预览 =====")
print(pair_df.head().to_string())

print("\n===== seq_natural.fasta 内容预览 =====")
with open('datasets/seq_natural.fasta', 'r') as f:
    for i, line in enumerate(f):
        print(line.strip())
        if i > 10:  # 只打印前几行
            print("...")
            break

成功读取TSV文件：/root/private_data/ckx/affinity_contract/MVSF-AB/my_datasets/final_dataset_train_no_du.tsv
文件包含 2424 条记录和 5 列
列名: ['pdb_id', 'heavy_sequence', 'light_sequence', 'antigen_sequence', 'delta_g']
已创建 datasets/pairs_benchmark.csv，包含 2424 条记录
已创建 datasets/seq_natural.fasta，包含 3654 个序列

===== pairs_benchmark.csv 内容预览 =====
        light       heavy         antigen  delta_g
0  1AHW_light  1AHW_heavy  1AHW_antigen_1   -10.90
1  1AHW_light  1AHW_heavy  1AHW_antigen_2    -7.53
2  1AHW_light  1AHW_heavy  1AHW_antigen_3   -10.46
3  1AHW_light  1AHW_heavy  1AHW_antigen_4   -11.86
4  1AHW_light  1AHW_heavy  1AHW_antigen_5   -10.20

===== seq_natural.fasta 内容预览 =====
>1AHW_heavy
DIKMTQSPSSMYASLGERVTITCKASQDIRKYLNWYQQKPWKSPKTLIYYATSLADGVPSRFSGSGSGQDYSLTISSLESDDTATYYCLQHGESPYTFGGGTKLEINRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNEC
>1AHW_light
EIQLQQSGAELVRPGALVKLSCKASGFNIKDYYMHWVKQRPEQGLEWIGLIDPENGNTIYDPKFQGKASITADTSSNTAYLQLSSLTSEDT

In [20]:
import csv
import torch
from protein_bert_pytorch import ProteinBERT, PretrainingWrapper
import pandas as pd
import torch
import numpy as np
from tape import TAPETokenizer,ProteinBertModel
from tqdm import tqdm
import os
from Bio import SeqIO

torch.cuda.set_device(0)

def get_feature(_list):
    # load model
    model = ProteinBertModel.from_pretrained('bert-base')
    torch.save(model, 'pretrain_bert.models')
    device = torch.device('cuda')
    # model = torch.load('../cmap_final/src/models/pretrain_bert.models')
    # model = ProteinBertModel.from_pretrained('./bert-base-chinese')
    model = model.to(device)
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    tokenizer = TAPETokenizer(vocab='iupac')  # iupac是TAPE模型的词汇表，UniRep模型使用unirep。
    feature = []
    for seq in tqdm(_list):      # 进度条
        token_ids = torch.tensor([tokenizer.encode(seq)])
        output = model(token_ids.to(device))
        pooled_output = output[1]
        feature.append(pooled_output[0].tolist())
    _df = pd.DataFrame(np.array(feature))
    return _df

def get_feature2():
    model = ProteinBERT(
        num_tokens=21,
        num_annotation=8943,
        dim=512,
        dim_global=256,
        depth=6,
        narrow_conv_kernel=9,
        wide_conv_kernel=9,
        wide_conv_dilation=5,
        attn_heads=8,
        attn_dim_head=64
    )

    seq = torch.randint(0, 21, (2, 2048))
    mask = torch.ones(2, 2048).bool()
    annotation = torch.randint(0, 1, (2, 8943)).float()

    seq_logits, annotation_logits = model(seq, annotation, mask=mask)

# 修改的parse函数 - 适用于Jupyter Notebook
def parse_fasta(fasta_file):
    """
    解析FASTA文件，提取序列名称和序列
    
    参数:
        fasta_file: FASTA文件路径
        
    返回:
        names: 序列名称列表
        sequences: 序列列表
    """
    names = []
    sequences = []
    
    for record in SeqIO.parse(fasta_file, "fasta"):
        names.append(record.name)
        sequences.append(str(record.seq))
    
    return names, sequences

# 主程序代码
fastaPath = './my_datasets/seq_natural.fasta'
outputPath = './my_datasets/seq_natural_embedding.csv'

# 使用修改后的函数解析FASTA文件
names, sequence = parse_fasta(fastaPath)

# 处理序列
new_sequence = []
for seq in sequence:
    seq = seq.replace('_', '')
    seq = seq.replace('J', '')
    new_sequence.append(seq)

# 将序列名和处理后的序列写入CSV
rows = zip(names, new_sequence)
with open(outputPath, 'w') as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)

# 获取特征向量
df = get_feature(new_sequence)

# 保存特征向量
df.to_csv(outputPath, index=False)

100%|██████████| 567/567 [00:00<00:00, 1417264.82B/s]
100%|██████████| 370264230/370264230 [01:25<00:00, 4311005.28B/s] 
  state_dict = torch.load(resolved_archive_file, map_location='cpu')
  token_ids = torch.tensor([tokenizer.encode(seq)])
100%|██████████| 3654/3654 [00:23<00:00, 157.70it/s]


# mydataset

In [3]:
import torch
import pandas as pd
import numpy as np
from src.models.mvsf import ModelAffinity
from src.utils import *
import os
from tqdm import tqdm

def test_model(model_path, test_data_path, seq_path, feature_path, batch_size=16, device=0):
    """
    加载训练好的模型并在测试数据集上进行测试
    
    Args:
        model_path: 模型文件路径
        test_data_path: 测试数据CSV文件路径
        seq_path: 序列FASTA文件
        feature_path: 特征embedding文件路径
        batch_size: 批量大小
        device: 使用的GPU设备，-1表示CPU
    
    Returns:
        评估指标：loss, rmse, mae, r2, p
    """
    # 设置设备
    use_cuda = (device > -1) and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(device)
        print(f"使用CUDA设备 {device} - {torch.cuda.get_device_name(device)}")
    else:
        print("使用CPU")
        device = "cpu"
    
    # 创建必要的目录
    os.makedirs("datasets", exist_ok=True)
    
    # 加载测试数据
    test_df = pd.read_csv(test_data_path)
    if len(test_df.columns) == 4 and 'light' in test_df.columns:
        # 已经是正确格式
        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
    else:
        # 需要转换格式
        print("转换TSV数据为所需的CSV格式...")
        pair_data = []
        
        # 处理每个数据点
        for i, row in test_df.iterrows():
            try:
                pdb_id = row['pdb_id']
                
                # 创建ID
                heavy_id = f"{pdb_id}_heavy"
                light_id = f"{pdb_id}_light"
                antigen_id = f"{pdb_id}_antigen_{i+1}"
                
                # 存储配对信息
                pair_data.append({
                    'light': light_id,
                    'heavy': heavy_id,
                    'antigen': antigen_id,
                    'delta_g': float(row['delta_g'])
                })
            except KeyError:
                print(f"警告: 行 {i} 缺少必要字段，已跳过")
                continue
        
        test_df = pd.DataFrame(pair_data)
    
    # 提取数据
    test_l = test_df["light"]
    test_h = test_df["heavy"]
    test_ag = test_df["antigen"]
    test_y = torch.from_numpy(test_df["delta_g"].values)
    test_y = -test_y  # 注意这里取了负值，与训练时保持一致
    
    # 创建测试数据集和数据加载器
    test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
    test_iterator = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences,
        shuffle=False,
        pin_memory=False,
        drop_last=False,
    )
    print(f"加载了 {len(test_l)} 对测试数据")
    
    # 加载所有蛋白质序列和特征
    print("加载序列和特征...")
    all_proteins = set(test_l).union(test_h).union(test_ag)
    embeddings = embed_dict(seq_path, feature_path)
    print("成功加载嵌入向量")
    aaindex_feature = seq_aaindex_dict(all_proteins, seq_path)
    
    # 创建模型实例并加载权重
    model = ModelAffinity(batch_size, use_cuda)
    model.use_cuda = use_cuda
    if use_cuda:
        model.cuda()
    
    # 加载训练好的模型参数
    print(f"加载模型: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))
    
    # 设置为评估模式
    model.eval()
    
    # 测试模型
    print("开始测试...")
    with torch.no_grad():
        p_hat = []
        true_y = []
        
        for lchain, hchain, antigen, y in tqdm(test_iterator, desc="处理样本"):
            try:
                ph = predict_affinity(model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda)
                p_hat.append(ph)
                true_y.append(y)
            except Exception as e:
                print(f"处理样本时出错: {str(e)}")
                continue
        
        if len(p_hat) == 0:
            print("没有成功的预测，请检查数据准备过程")
            return None
            
        y = torch.cat(true_y, 0)
        p_hat = torch.cat(p_hat, 0)
        
        # 确保形状一致 - 修复维度不匹配问题
        y = y.view(-1)          # 确保是一维张量
        p_hat = p_hat.view(-1)  # 确保是一维张量
        
        if use_cuda:
            y = y.cuda()
            p_hat = p_hat.cuda()
        
        p_hat = p_hat.float()
        y = y.float()
        
        # 计算原始损失
        criterion = nn.MSELoss()
        loss = criterion(p_hat, y).item()
        
        # 转换回原始刻度
        max_val = 16.9138
        min_val = 5.0400
        p_hat_original = (p_hat * (max_val - min_val)) + min_val
        
        # 计算评估指标
        rmse = torch.sqrt(torch.mean((y - p_hat_original) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat_original)).item()
        
        # 使用numpy计算R2分数和Pearson相关系数，避免形状问题
        from sklearn.metrics import r2_score as sklearn_r2
        import scipy.stats as stats
        
        y_np = y.cpu().numpy()
        p_hat_np = p_hat_original.cpu().numpy()
        
        r_2 = sklearn_r2(y_np, p_hat_np)
        p, _ = stats.pearsonr(y_np, p_hat_np)
        
        # 保存结果到CSV
        results = pd.DataFrame({
            'True_Value': y_np,
            'Predicted_Value': p_hat_np
        })
        results.to_csv('test_results.csv', index=False)
        
        # 打印评估结果
        print("\n评估结果:")
        print(f"Loss: {loss:.6f}")
        print(f"RMSE: {rmse:.6f}")
        print(f"MAE: {mae:.6f}")
        print(f"R²: {r_2:.6f}")
        print(f"Pearson相关系数: {p:.6f}")
        
        return loss, rmse, mae, r_2, p

# 运行测试
test_model("saved_models/model_fold1_final.pth", 
           "my_datasets/pairs_benchmark_2.csv", 
           "my_datasets/seq_natural.fasta", 
           "my_datasets/seq_natural_embedding.csv")

使用CUDA设备 0 - NVIDIA L20
加载了 485 对测试数据
加载序列和特征...
成功加载嵌入向量


  model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))


加载模型: saved_models/model_fold1_final.pth
开始测试...


处理样本: 100%|██████████| 31/31 [00:00<00:00, 148.07it/s]


评估结果:
Loss: 113.544724
RMSE: 1.729647
MAE: 1.274623
R²: 0.449870
Pearson相关系数: 0.672221





(113.54472351074219,
 1.729647159576416,
 1.2746225595474243,
 0.4498702883720398,
 0.6722210802142126)

# sabdab

In [5]:
import torch
import pandas as pd
import numpy as np
from src.models.mvsf import ModelAffinity
from src.utils import *
import os
from tqdm import tqdm

def test_model(model_path, test_data_path, seq_path, feature_path, batch_size=16, device=0):
    """
    加载训练好的模型并在测试数据集上进行测试
    
    Args:
        model_path: 模型文件路径
        test_data_path: 测试数据CSV文件路径
        seq_path: 序列FASTA文件
        feature_path: 特征embedding文件路径
        batch_size: 批量大小
        device: 使用的GPU设备，-1表示CPU
    
    Returns:
        评估指标：loss, rmse, mae, r2, p
    """
    # 设置设备
    use_cuda = (device > -1) and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(device)
        print(f"使用CUDA设备 {device} - {torch.cuda.get_device_name(device)}")
    else:
        print("使用CPU")
        device = "cpu"
    
    # 创建必要的目录
    os.makedirs("datasets", exist_ok=True)
    
    # 加载测试数据
    test_df = pd.read_csv(test_data_path)
    if len(test_df.columns) == 4 and 'light' in test_df.columns:
        # 已经是正确格式
        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
    else:
        # 需要转换格式
        print("转换TSV数据为所需的CSV格式...")
        pair_data = []
        
        # 处理每个数据点
        for i, row in test_df.iterrows():
            try:
                pdb_id = row['pdb_id']
                
                # 创建ID
                heavy_id = f"{pdb_id}_heavy"
                light_id = f"{pdb_id}_light"
                antigen_id = f"{pdb_id}_antigen_{i+1}"
                
                # 存储配对信息
                pair_data.append({
                    'light': light_id,
                    'heavy': heavy_id,
                    'antigen': antigen_id,
                    'delta_g': float(row['delta_g'])
                })
            except KeyError:
                print(f"警告: 行 {i} 缺少必要字段，已跳过")
                continue
        
        test_df = pd.DataFrame(pair_data)
    
    # 提取数据
    test_l = test_df["light"]
    test_h = test_df["heavy"]
    test_ag = test_df["antigen"]
    test_y = torch.from_numpy(test_df["delta_g"].values)
    test_y = -test_y  # 注意这里取了负值，与训练时保持一致
    
    # 创建测试数据集和数据加载器
    test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
    test_iterator = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences,
        shuffle=False,
        pin_memory=False,
        drop_last=False,
    )
    print(f"加载了 {len(test_l)} 对测试数据")
    
    # 加载所有蛋白质序列和特征
    print("加载序列和特征...")
    all_proteins = set(test_l).union(test_h).union(test_ag)
    embeddings = embed_dict(seq_path, feature_path)
    print("成功加载嵌入向量")
    aaindex_feature = seq_aaindex_dict(all_proteins, seq_path)
    
    # 创建模型实例并加载权重
    model = ModelAffinity(batch_size, use_cuda)
    model.use_cuda = use_cuda
    if use_cuda:
        model.cuda()
    
    # 加载训练好的模型参数
    print(f"加载模型: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))
    
    # 设置为评估模式
    model.eval()
    
    # 测试模型
    print("开始测试...")
    with torch.no_grad():
        p_hat = []
        true_y = []
        
        for lchain, hchain, antigen, y in tqdm(test_iterator, desc="处理样本"):
            try:
                ph = predict_affinity(model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda)
                p_hat.append(ph)
                true_y.append(y)
            except Exception as e:
                print(f"处理样本时出错: {str(e)}")
                continue
        
        if len(p_hat) == 0:
            print("没有成功的预测，请检查数据准备过程")
            return None
            
        y = torch.cat(true_y, 0)
        p_hat = torch.cat(p_hat, 0)
        
        # 确保形状一致 - 修复维度不匹配问题
        y = y.view(-1)          # 确保是一维张量
        p_hat = p_hat.view(-1)  # 确保是一维张量
        
        if use_cuda:
            y = y.cuda()
            p_hat = p_hat.cuda()
        
        p_hat = p_hat.float()
        y = y.float()
        
        # 计算原始损失
        criterion = nn.MSELoss()
        loss = criterion(p_hat, y).item()
        
        # 转换回原始刻度
        max_val = 16.9138
        min_val = 5.0400
        p_hat_original = (p_hat * (max_val - min_val)) + min_val
        
        # 计算评估指标
        rmse = torch.sqrt(torch.mean((y - p_hat_original) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat_original)).item()
        
        # 使用numpy计算R2分数和Pearson相关系数，避免形状问题
        from sklearn.metrics import r2_score as sklearn_r2
        import scipy.stats as stats
        
        y_np = y.cpu().numpy()
        p_hat_np = p_hat_original.cpu().numpy()
        
        r_2 = sklearn_r2(y_np, p_hat_np)
        p, _ = stats.pearsonr(y_np, p_hat_np)
        
        # 保存结果到CSV
        results = pd.DataFrame({
            'True_Value': y_np,
            'Predicted_Value': p_hat_np
        })
        results.to_csv('test_results.csv', index=False)
        
        # 打印评估结果
        print("\n评估结果:")
        print(f"Loss: {loss:.6f}")
        print(f"RMSE: {rmse:.6f}")
        print(f"MAE: {mae:.6f}")
        print(f"R²: {r_2:.6f}")
        print(f"Pearson相关系数: {p:.6f}")
        
        return loss, rmse, mae, r_2, p

# 运行测试
test_model("saved_models/model_fold1_final.pth", 
           "datasets/pairs_sabdab.csv", 
           "datasets/seq_natural.fasta", 
           "datasets/seq_natural_embedding.csv")

使用CUDA设备 0 - NVIDIA L20
加载了 578 对测试数据
加载序列和特征...
成功加载嵌入向量


  model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))


加载模型: saved_models/model_fold1_final.pth
开始测试...


处理样本: 100%|██████████| 37/37 [00:00<00:00, 148.84it/s]


评估结果:
Loss: 113.773117
RMSE: 1.602451
MAE: 1.213400
R²: 0.430709
Pearson相关系数: 0.670533





(113.77311706542969,
 1.602450966835022,
 1.2134004831314087,
 0.4307090640068054,
 0.6705332453548047)

# skempi

In [6]:
import torch
import pandas as pd
import numpy as np
from src.models.mvsf import ModelAffinity
from src.utils import *
import os
from tqdm import tqdm

def test_model(model_path, test_data_path, seq_path, feature_path, batch_size=16, device=0):
    """
    加载训练好的模型并在测试数据集上进行测试
    
    Args:
        model_path: 模型文件路径
        test_data_path: 测试数据CSV文件路径
        seq_path: 序列FASTA文件
        feature_path: 特征embedding文件路径
        batch_size: 批量大小
        device: 使用的GPU设备，-1表示CPU
    
    Returns:
        评估指标：loss, rmse, mae, r2, p
    """
    # 设置设备
    use_cuda = (device > -1) and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(device)
        print(f"使用CUDA设备 {device} - {torch.cuda.get_device_name(device)}")
    else:
        print("使用CPU")
        device = "cpu"
    
    # 创建必要的目录
    os.makedirs("datasets", exist_ok=True)
    
    # 加载测试数据
    test_df = pd.read_csv(test_data_path)
    if len(test_df.columns) == 4 and 'light' in test_df.columns:
        # 已经是正确格式
        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
    else:
        # 需要转换格式
        print("转换TSV数据为所需的CSV格式...")
        pair_data = []
        
        # 处理每个数据点
        for i, row in test_df.iterrows():
            try:
                pdb_id = row['pdb_id']
                
                # 创建ID
                heavy_id = f"{pdb_id}_heavy"
                light_id = f"{pdb_id}_light"
                antigen_id = f"{pdb_id}_antigen_{i+1}"
                
                # 存储配对信息
                pair_data.append({
                    'light': light_id,
                    'heavy': heavy_id,
                    'antigen': antigen_id,
                    'delta_g': float(row['delta_g'])
                })
            except KeyError:
                print(f"警告: 行 {i} 缺少必要字段，已跳过")
                continue
        
        test_df = pd.DataFrame(pair_data)
    
    # 提取数据
    test_l = test_df["light"]
    test_h = test_df["heavy"]
    test_ag = test_df["antigen"]
    test_y = torch.from_numpy(test_df["delta_g"].values)
    test_y = -test_y  # 注意这里取了负值，与训练时保持一致
    
    # 创建测试数据集和数据加载器
    test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
    test_iterator = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences,
        shuffle=False,
        pin_memory=False,
        drop_last=False,
    )
    print(f"加载了 {len(test_l)} 对测试数据")
    
    # 加载所有蛋白质序列和特征
    print("加载序列和特征...")
    all_proteins = set(test_l).union(test_h).union(test_ag)
    embeddings = embed_dict(seq_path, feature_path)
    print("成功加载嵌入向量")
    aaindex_feature = seq_aaindex_dict(all_proteins, seq_path)
    
    # 创建模型实例并加载权重
    model = ModelAffinity(batch_size, use_cuda)
    model.use_cuda = use_cuda
    if use_cuda:
        model.cuda()
    
    # 加载训练好的模型参数
    print(f"加载模型: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))
    
    # 设置为评估模式
    model.eval()
    
    # 测试模型
    print("开始测试...")
    with torch.no_grad():
        p_hat = []
        true_y = []
        
        for lchain, hchain, antigen, y in tqdm(test_iterator, desc="处理样本"):
            try:
                ph = predict_affinity(model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda)
                p_hat.append(ph)
                true_y.append(y)
            except Exception as e:
                print(f"处理样本时出错: {str(e)}")
                continue
        
        if len(p_hat) == 0:
            print("没有成功的预测，请检查数据准备过程")
            return None
            
        y = torch.cat(true_y, 0)
        p_hat = torch.cat(p_hat, 0)
        
        # 确保形状一致 - 修复维度不匹配问题
        y = y.view(-1)          # 确保是一维张量
        p_hat = p_hat.view(-1)  # 确保是一维张量
        
        if use_cuda:
            y = y.cuda()
            p_hat = p_hat.cuda()
        
        p_hat = p_hat.float()
        y = y.float()
        
        # 计算原始损失
        criterion = nn.MSELoss()
        loss = criterion(p_hat, y).item()
        
        # 转换回原始刻度
        max_val = 16.9138
        min_val = 5.0400
        p_hat_original = (p_hat * (max_val - min_val)) + min_val
        
        # 计算评估指标
        rmse = torch.sqrt(torch.mean((y - p_hat_original) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat_original)).item()
        
        # 使用numpy计算R2分数和Pearson相关系数，避免形状问题
        from sklearn.metrics import r2_score as sklearn_r2
        import scipy.stats as stats
        
        y_np = y.cpu().numpy()
        p_hat_np = p_hat_original.cpu().numpy()
        
        r_2 = sklearn_r2(y_np, p_hat_np)
        p, _ = stats.pearsonr(y_np, p_hat_np)
        
        # 保存结果到CSV
        results = pd.DataFrame({
            'True_Value': y_np,
            'Predicted_Value': p_hat_np
        })
        results.to_csv('test_results.csv', index=False)
        
        # 打印评估结果
        print("\n评估结果:")
        print(f"Loss: {loss:.6f}")
        print(f"RMSE: {rmse:.6f}")
        print(f"MAE: {mae:.6f}")
        print(f"R²: {r_2:.6f}")
        print(f"Pearson相关系数: {p:.6f}")
        
        return loss, rmse, mae, r_2, p

# 运行测试
test_model("saved_models/model_fold1_final.pth", 
           "datasets/pairs_skempi.csv", 
           "datasets/seq.fasta", 
           "datasets/embedding.csv")

使用CUDA设备 0 - NVIDIA L20
加载了 387 对测试数据
加载序列和特征...
成功加载嵌入向量


  model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))


加载模型: saved_models/model_fold1_final.pth
开始测试...


处理样本: 100%|██████████| 25/25 [00:00<00:00, 138.25it/s]



评估结果:
Loss: 119.294930
RMSE: 1.880481
MAE: 1.354302
R²: 0.114168
Pearson相关系数: 0.371876


(119.29492950439453,
 1.8804807662963867,
 1.3543022871017456,
 0.11416751146316528,
 0.37187572717454687)

# abbind

In [7]:
import torch
import pandas as pd
import numpy as np
from src.models.mvsf import ModelAffinity
from src.utils import *
import os
from tqdm import tqdm

def test_model(model_path, test_data_path, seq_path, feature_path, batch_size=16, device=0):
    """
    加载训练好的模型并在测试数据集上进行测试
    
    Args:
        model_path: 模型文件路径
        test_data_path: 测试数据CSV文件路径
        seq_path: 序列FASTA文件
        feature_path: 特征embedding文件路径
        batch_size: 批量大小
        device: 使用的GPU设备，-1表示CPU
    
    Returns:
        评估指标：loss, rmse, mae, r2, p
    """
    # 设置设备
    use_cuda = (device > -1) and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(device)
        print(f"使用CUDA设备 {device} - {torch.cuda.get_device_name(device)}")
    else:
        print("使用CPU")
        device = "cpu"
    
    # 创建必要的目录
    os.makedirs("datasets", exist_ok=True)
    
    # 加载测试数据
    test_df = pd.read_csv(test_data_path)
    if len(test_df.columns) == 4 and 'light' in test_df.columns:
        # 已经是正确格式
        test_df.columns = ["light", "heavy", "antigen", "delta_g"]
    else:
        # 需要转换格式
        print("转换TSV数据为所需的CSV格式...")
        pair_data = []
        
        # 处理每个数据点
        for i, row in test_df.iterrows():
            try:
                pdb_id = row['pdb_id']
                
                # 创建ID
                heavy_id = f"{pdb_id}_heavy"
                light_id = f"{pdb_id}_light"
                antigen_id = f"{pdb_id}_antigen_{i+1}"
                
                # 存储配对信息
                pair_data.append({
                    'light': light_id,
                    'heavy': heavy_id,
                    'antigen': antigen_id,
                    'delta_g': float(row['delta_g'])
                })
            except KeyError:
                print(f"警告: 行 {i} 缺少必要字段，已跳过")
                continue
        
        test_df = pd.DataFrame(pair_data)
    
    # 提取数据
    test_l = test_df["light"]
    test_h = test_df["heavy"]
    test_ag = test_df["antigen"]
    test_y = torch.from_numpy(test_df["delta_g"].values)
    test_y = -test_y  # 注意这里取了负值，与训练时保持一致
    
    # 创建测试数据集和数据加载器
    test_dataset = PairedDataset(test_l, test_h, test_ag, test_y)
    test_iterator = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences,
        shuffle=False,
        pin_memory=False,
        drop_last=False,
    )
    print(f"加载了 {len(test_l)} 对测试数据")
    
    # 加载所有蛋白质序列和特征
    print("加载序列和特征...")
    all_proteins = set(test_l).union(test_h).union(test_ag)
    embeddings = embed_dict(seq_path, feature_path)
    print("成功加载嵌入向量")
    aaindex_feature = seq_aaindex_dict(all_proteins, seq_path)
    
    # 创建模型实例并加载权重
    model = ModelAffinity(batch_size, use_cuda)
    model.use_cuda = use_cuda
    if use_cuda:
        model.cuda()
    
    # 加载训练好的模型参数
    print(f"加载模型: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))
    
    # 设置为评估模式
    model.eval()
    
    # 测试模型
    print("开始测试...")
    with torch.no_grad():
        p_hat = []
        true_y = []
        
        for lchain, hchain, antigen, y in tqdm(test_iterator, desc="处理样本"):
            try:
                ph = predict_affinity(model, lchain, hchain, antigen, embeddings, aaindex_feature, use_cuda)
                p_hat.append(ph)
                true_y.append(y)
            except Exception as e:
                print(f"处理样本时出错: {str(e)}")
                continue
        
        if len(p_hat) == 0:
            print("没有成功的预测，请检查数据准备过程")
            return None
            
        y = torch.cat(true_y, 0)
        p_hat = torch.cat(p_hat, 0)
        
        # 确保形状一致 - 修复维度不匹配问题
        y = y.view(-1)          # 确保是一维张量
        p_hat = p_hat.view(-1)  # 确保是一维张量
        
        if use_cuda:
            y = y.cuda()
            p_hat = p_hat.cuda()
        
        p_hat = p_hat.float()
        y = y.float()
        
        # 计算原始损失
        criterion = nn.MSELoss()
        loss = criterion(p_hat, y).item()
        
        # 转换回原始刻度
        max_val = 16.9138
        min_val = 5.0400
        p_hat_original = (p_hat * (max_val - min_val)) + min_val
        
        # 计算评估指标
        rmse = torch.sqrt(torch.mean((y - p_hat_original) ** 2)).item()
        mae = torch.mean(torch.abs(y - p_hat_original)).item()
        
        # 使用numpy计算R2分数和Pearson相关系数，避免形状问题
        from sklearn.metrics import r2_score as sklearn_r2
        import scipy.stats as stats
        
        y_np = y.cpu().numpy()
        p_hat_np = p_hat_original.cpu().numpy()
        
        r_2 = sklearn_r2(y_np, p_hat_np)
        p, _ = stats.pearsonr(y_np, p_hat_np)
        
        # 保存结果到CSV
        results = pd.DataFrame({
            'True_Value': y_np,
            'Predicted_Value': p_hat_np
        })
        results.to_csv('test_results.csv', index=False)
        
        # 打印评估结果
        print("\n评估结果:")
        print(f"Loss: {loss:.6f}")
        print(f"RMSE: {rmse:.6f}")
        print(f"MAE: {mae:.6f}")
        print(f"R²: {r_2:.6f}")
        print(f"Pearson相关系数: {p:.6f}")
        
        return loss, rmse, mae, r_2, p

# 运行测试
test_model("saved_models/model_fold1_final.pth", 
           "datasets/pairs_abbind.csv", 
           "datasets/seq.fasta", 
           "datasets/embedding.csv")

使用CUDA设备 0 - NVIDIA L20
加载了 1089 对测试数据
加载序列和特征...
成功加载嵌入向量


  model.load_state_dict(torch.load(model_path, map_location='cuda' if use_cuda else 'cpu'))


加载模型: saved_models/model_fold1_final.pth
开始测试...


处理样本: 100%|██████████| 69/69 [00:00<00:00, 145.90it/s]


评估结果:
Loss: 91.431992
RMSE: 2.252096
MAE: 1.755535
R²: 0.336023
Pearson相关系数: 0.593236





(91.43199157714844,
 2.252095937728882,
 1.7555351257324219,
 0.3360234498977661,
 0.5932355253485596)