In [1]:
import tqdm
import pandas as pd
import numpy as np
import scipy.stats
import h5py
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [2]:
def load_data_from_h5_to_dataframe(h5_file_path):
    """
    Load peptide encodings, MHC-II encodings and labels from an HDF5 file into a DataFrame.

    The datasets ``peptide_encodings``, ``mhcii_encodings`` and ``labels`` are read
    fully into memory. Any encoding array that is three-dimensional is reduced to two
    dimensions by averaging over axis 1 (the original comment describes that axis as
    the sequence-length dimension).

    :param h5_file_path: path of the HDF5 file to read
    :return: DataFrame with the columns ``peptide_encoding``, ``mhcii_encoding`` and ``label``
    """
    def _pool_if_3d(encodings):
        # Only rank-3 arrays are averaged; every other rank is returned untouched.
        return np.mean(encodings, axis=1) if encodings.ndim == 3 else encodings

    with h5py.File(h5_file_path, 'r') as h5f:
        # Show which datasets the file offers before reading any of them.
        print(f"Datasets in HDF5 file: {list(h5f.keys())}")

        # ``[:]`` copies each dataset into memory so it stays usable after the file closes.
        peptide_raw = h5f['peptide_encodings'][:]
        mhcii_raw = h5f['mhcii_encodings'][:]
        label_values = h5f['labels'][:]

    # Shapes are reported as stored in the file, i.e. before any averaging.
    print(f"Peptide Encodings Shape: {peptide_raw.shape}")
    print(f"MHCII Encodings Shape: {mhcii_raw.shape}")
    print(f"Labels Shape: {label_values.shape}")

    # One array per row for the encodings; the label array is used as-is.
    columns = {}
    columns['peptide_encoding'] = list(_pool_if_3d(peptide_raw))
    columns['mhcii_encoding'] = list(_pool_if_3d(mhcii_raw))
    columns['label'] = label_values
    return pd.DataFrame(columns)



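HDF5 文件的大致内容结构（与上面 load_data_from_h5_to_dataframe 读取的数据集名称一致）：
/ (Root)
    peptide_encodings (Dataset)
        Shape: (N, D) 或 (N, L, D)  # N 是样本数，D 是编码维度；若为三维，读取时会沿 axis=1 取平均
        Type: float32  # 数据类型为 32 位浮点数
    mhcii_encodings (Dataset)
        Shape: (N, D) 或 (N, L, D)  # N 是样本数，D 是编码维度；若为三维，读取时会沿 axis=1 取平均
        Type: float32  # 数据类型为 32 位浮点数
    labels (Dataset)
        Shape: (N,)  # N 是样本数，与编码数据集中的样本数对应
        Type: float32  # 标签数据类型为 32 位浮点数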


In [None]:
# Load the dataset into the module-level DataFrame ``data_df``.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
BA_file = "D:\\OneDrive\\我要毕业\\new_representation_data\\APAAC.h5" # replace with the path to your HDF5 file
data_df = load_data_from_h5_to_dataframe(BA_file)

In [4]:
def _score_predictions(X, y, model):
    '''predicts X with an already fit model and scores the predictions against y.

    Returns a tuple (mse, mae, corr) where corr is whatever scipy.stats.pearsonr
    returns for (y, predictions).
    '''
    y_hat = model.predict(X)
    return (
        mean_squared_error(y, y_hat),
        mean_absolute_error(y, y_hat),
        scipy.stats.pearsonr(y, y_hat),
    )

def calc_train_error(X_train, y_train, model):
    '''returns (mse, mae, corr) measured on the training data for an already fit model.'''
    return _score_predictions(X_train, y_train, model)

def calc_validation_error(X_test, y_test, model):
    '''returns (mse, mae, corr) measured on held-out data for an already fit model.'''
    return _score_predictions(X_test, y_test, model)

def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''fits model on the training data, then returns in-sample and out-of-sample metrics.

    The result is ordered as
    (train_mse, val_mse, train_mae, val_mae, train_corr, val_corr).
    '''
    model.fit(X_train, y_train)
    in_sample = calc_train_error(X_train, y_train, model)
    out_of_sample = calc_validation_error(X_test, y_test, model)
    # Interleave the two triples so each metric's train value precedes its validation value.
    return (
        in_sample[0], out_of_sample[0],
        in_sample[1], out_of_sample[1],
        in_sample[2], out_of_sample[2],
    )

In [5]:
def report_results(
    train_mse_error_list,
    validation_mse_error_list,
    train_mae_error_list,
    validation_mae_error_list,
    train_corr_list,
    validation_corr_list,
    train_corr_pval_list,
    validation_corr_pval_list,
):
    """Summarise per-fold metrics into a one-row summary table and a per-fold table.

    MSE and MAE values are multiplied by 100 in both tables. Correlations and
    their p-values are reported unscaled in both tables, so the per-fold values
    are directly comparable with the summary row. Previously the per-fold table
    multiplied correlations and p-values by 100 while the summary did not.

    :param train_mse_error_list: per-fold training MSE values
    :param validation_mse_error_list: per-fold validation MSE values
    :param train_mae_error_list: per-fold training MAE values
    :param validation_mae_error_list: per-fold validation MAE values
    :param train_corr_list: per-fold training correlation coefficients
    :param validation_corr_list: per-fold validation correlation coefficients
    :param train_corr_pval_list: per-fold p-values of the training correlations
    :param validation_corr_pval_list: per-fold p-values of the validation correlations
    :return: ``(result_df, result_detail_df)``; ``result_df`` has a single row
        (index 0) with means / standard deviations rounded to 4 decimals,
        ``result_detail_df`` has one row per fold
    """

    def _scaled_mean(values):
        # Mean of an error metric, scaled by 100 and rounded for display.
        return round(np.mean(values) * 100, 4)

    def _scaled_std(values):
        # Standard deviation of an error metric, scaled by 100 and rounded for display.
        return round(np.std(values) * 100, 4)

    def _plain_mean(values):
        # Mean of a correlation or p-value; these are never rescaled.
        return round(np.mean(values), 4)

    result_df = pd.DataFrame(
        {
            "train_mse_error": _scaled_mean(train_mse_error_list),
            "train_mse_std": _scaled_std(train_mse_error_list),
            "val_mse_error": _scaled_mean(validation_mse_error_list),
            "val_mse_std": _scaled_std(validation_mse_error_list),
            "train_mae_error": _scaled_mean(train_mae_error_list),
            "train_mae_std": _scaled_std(train_mae_error_list),
            "val_mae_error": _scaled_mean(validation_mae_error_list),
            "val_mae_std": _scaled_std(validation_mae_error_list),
            "train_corr": _plain_mean(train_corr_list),
            "train_corr_pval": _plain_mean(train_corr_pval_list),
            "validation_corr": _plain_mean(validation_corr_list),
            "validation_corr_pval": _plain_mean(validation_corr_pval_list),
        },
        index=[0],
    )

    result_detail_df = pd.DataFrame(
        {
            # Error metrics use the same x100 scale as the summary row.
            "train_mse_errors": list(np.multiply(train_mse_error_list, 100)),
            "val_mse_errors": list(np.multiply(validation_mse_error_list, 100)),
            "train_mae_errors": list(np.multiply(train_mae_error_list, 100)),
            "val_mae_errors": list(np.multiply(validation_mae_error_list, 100)),
            # Correlations and p-values stay unscaled, matching the summary row.
            "train_corrs": list(np.asarray(train_corr_list, dtype=float)),
            "train_corr_pvals": list(np.asarray(train_corr_pval_list, dtype=float)),
            "validation_corr": list(np.asarray(validation_corr_list, dtype=float)),
            "validation_corr_pval": list(np.asarray(validation_corr_pval_list, dtype=float)),
        },
        index=range(len(train_mse_error_list)),
    )
    return result_df, result_detail_df


In [6]:
def _build_feature_matrix(frame):
    """Assemble the 2-D float feature matrix for regression from ``frame``.

    The ``embedding`` column is used when it exists. Otherwise, if both
    ``peptide_encoding`` and ``mhcii_encoding`` are present, their per-sample
    vectors are concatenated along the feature axis.
    NOTE(review): concatenation is an assumption about how the two encodings
    are meant to be combined — confirm.

    :param frame: DataFrame whose cells in the feature column(s) hold one vector per sample
    :return: numpy array of dtype float with one row per sample
    :raises KeyError: if neither column layout is available
    """
    if 'embedding' in frame.columns:
        return np.array(frame['embedding'].tolist(), dtype=float)

    pair_columns = ['peptide_encoding', 'mhcii_encoding']
    if all(column in frame.columns for column in pair_columns):
        parts = [np.array(frame[column].tolist(), dtype=float) for column in pair_columns]
        return np.concatenate(parts, axis=1)

    raise KeyError(
        "expected an 'embedding' column or both 'peptide_encoding' and "
        f"'mhcii_encoding' columns, found {list(frame.columns)}"
    )


def predictAffinityWithModel(regressor_model, n_splits=10, frame=None):
    """Evaluate ``regressor_model`` with shuffled K-fold cross-validation.

    For every fold the model is fit on the training split and scored on both
    splits through ``calc_metrics``; the per-fold metrics are then handed to
    ``report_results``.

    :param regressor_model: estimator exposing ``fit`` and ``predict``. The same
        instance is refit on every fold, so it is left fitted on the last fold.
        NOTE(review): this relies on ``fit`` discarding state from the previous
        fold — confirm for estimators configured to continue training.
    :param n_splits: number of folds (default 10, the previously hard-coded value)
    :param frame: DataFrame holding the features and a ``label`` column; when
        omitted, the module-level ``data_df`` is used as before
    :return: ``(result_df, result_detail_df)`` as produced by ``report_results``
    :raises KeyError: if the feature columns cannot be found in the DataFrame
    """
    if frame is None:
        # Backward-compatible default: fall back to the notebook-level DataFrame.
        frame = data_df

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    train_mse_error_list = []
    validation_mse_error_list = []
    train_mae_error_list = []
    validation_mae_error_list = []
    train_corr_list = []
    validation_corr_list = []
    train_corr_pval_list = []
    validation_corr_pval_list = []

    data = _build_feature_matrix(frame)
    target = np.array(frame['label'].tolist(), dtype=float)

    # Manual tqdm bar: it only advances through the explicit update() call below.
    with tqdm.tqdm(total=n_splits, desc="K-Fold Progress", unit="fold") as pbar:
        for train_index, val_index in kf.split(data, target):

            # split data
            X_train, X_val = data[train_index], data[val_index]
            y_train, y_val = target[train_index], target[val_index]

            # fit on this fold's training split and score both splits
            (
                train_mse_error,
                val_mse_error,
                train_mae_error,
                val_mae_error,
                train_corr,
                val_corr,
            ) = calc_metrics(X_train, y_train, X_val, y_val, regressor_model)

            # append to appropriate list
            train_mse_error_list.append(train_mse_error)
            validation_mse_error_list.append(val_mse_error)

            train_mae_error_list.append(train_mae_error)
            validation_mae_error_list.append(val_mae_error)

            # index 0 is the correlation coefficient, index 1 its p-value
            train_corr_list.append(train_corr[0])
            validation_corr_list.append(val_corr[0])

            train_corr_pval_list.append(train_corr[1])
            validation_corr_pval_list.append(val_corr[1])

            # one fold finished
            pbar.update(1)

    return report_results(
        train_mse_error_list,
        validation_mse_error_list,
        train_mae_error_list,
        validation_mae_error_list,
        train_corr_list,
        validation_corr_list,
        train_corr_pval_list,
        validation_corr_pval_list,
    )

In [7]:
# Multi-layer perceptron regressor evaluated with K-fold cross-validation below.
model = MLPRegressor(
    hidden_layer_sizes=(256, 64),  # two hidden layers with 256 and 64 units
    activation='relu',             # ReLU activation
    solver='adam',                 # Adam optimizer
    max_iter=100,                  # at most 100 training iterations
    random_state=42,               # fixed seed so repeated runs give the same result
    verbose=True                   # print the training loss for every iteration
)

result_df, result_detail_df = predictAffinityWithModel(model)

K-Fold Progress:   0%|          | 0/10 [00:00<?, ?fold/s]

Iteration 1, loss = 0.03144493
Iteration 2, loss = 0.02810487
Iteration 3, loss = 0.02683461
Iteration 4, loss = 0.02599480
Iteration 5, loss = 0.02529420
Iteration 6, loss = 0.02472152
Iteration 7, loss = 0.02433612
Iteration 8, loss = 0.02396042
Iteration 9, loss = 0.02368381
Iteration 10, loss = 0.02340641
Iteration 11, loss = 0.02319870
Iteration 12, loss = 0.02301860
Iteration 13, loss = 0.02272572
Iteration 14, loss = 0.02255641
Iteration 15, loss = 0.02235680
Iteration 16, loss = 0.02222214
Iteration 17, loss = 0.02205446
Iteration 18, loss = 0.02195190
Iteration 19, loss = 0.02187787
Iteration 20, loss = 0.02168048
Iteration 21, loss = 0.02160088
Iteration 22, loss = 0.02153291
Iteration 23, loss = 0.02143141
Iteration 24, loss = 0.02138966
Iteration 25, loss = 0.02127558
Iteration 26, loss = 0.02127073
Iteration 27, loss = 0.02106488
Iteration 28, loss = 0.02106195
Iteration 29, loss = 0.02098541
Iteration 30, loss = 0.02096866
Iteration 31, loss = 0.02087014
Iteration 32, los

In [41]:
# Save the summary and per-fold result DataFrames to CSV files (without the index column).
# NOTE(review): these are absolute, machine-specific paths and the target directory
# must already exist — adjust before running elsewhere.
result_df_file = "D:\\OneDrive\\results\\APAAC_result.csv"
result_detail_df_file = "D:\\OneDrive\\results\\APAAC_result_detail.csv"
result_df.to_csv(result_df_file, index=False)
result_detail_df.to_csv(result_detail_df_file, index=False)