In [3]:
# !pip install torch
# !pip install torchmetrics
# !pip install wandb
# !pip install pytorch_lightning
# !pip install cafaeval
import pandas as pd
import pdb
import numpy as np
from tqdm import tqdm
import time
import math
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from torch.utils.data import Dataset
import torch;
from torch import nn
import torch.nn.functional as F
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
import wandb
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [12]:
# 1.数据预处理，使用T5（ProtTrans）、ems2、protein与训练模型生成embedding;
# 2.配置读取embedding文件 & 构建数据集;

# 查看最终生成评估文件的模版格式
# sub = pd.read_csv("../input/cafa-5-protein-function-prediction/sample_submission.tsv", sep= "\t", header = None)
# sub.columns = ["The Protein ID", "The GO ID", "Predictedin Protein"]
# print(sub.head(5))
MAIN_DIR = "../input/cafa-5-protein-function-prediction"

class config:
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test (Targets)/testsuperset.fasta"
    train_sequences_path = MAIN_DIR  + "/Train/train_sequences.fasta"
    
    num_labels = 500
    n_epochs = 20
    batch_size = 128
    lr = 0.001 
    # for Mac
    device = torch.device('mps') 
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
    print(device)
   
# Directories for the different embedding vectors : 
embeds_map = {
    "T5" : "cafa-5-t5-embeddings-numpy",
    "ProtBERT" : "cafa-5-protbert-embeddings-numpy",
    "EMS2" : "cafa-5-ems2-embeddings-numpy"
}

# Length of the different embedding vectors :
embeds_dim = {
    "T5" : 1024,
    "ProtBERT" : 1024,
    "EMS2" : 1280
}


class ProteinSequenceDataset(Dataset): 
#     def __init__(self, datatype, embeddings_sources):
#     super(ProteinSequenceDataset).__init__()
#     self.datatype = datatype

#     # 加载嵌入并进行拼接
#     embeddings_list = []
#     for embeddings_source in embeddings_sources:
#         embeds_path = f"../input/{embeds_map[embeddings_source]}/{datatype}_embeddings.npy"
#         embeds = np.load(embeds_path)
#         embeddings_list.append(embeds)

#     # 调整嵌入向量的大小，使它们在拼接维度上具有相同的大小
#     max_size = max(embed.shape[0] for embed in embeddings_list)
#     for i, embed in enumerate(embeddings_list):
#         if embed.shape[0] < max_size:
#             pad_size = max_size - embed.shape[0]
#             embeddings_list[i] = np.pad(embed, ((0, pad_size), (0, 0)), mode='constant', constant_values=0)

#     # 使用numpy的hstack函数进行水平拼接
#     combined_embeds = np.hstack(embeddings_list)

#     # 加载IDs
#     ids_path = f"../input/{list(embeds_map.values())[0]}/{datatype}_ids.npy"  # 假设所有ID文件相同
#     ids = np.load(ids_path)

#     # 创建包含所有嵌入和ID的DataFrame
#     embeds_list = [combined_embeds[i, :] for i in range(combined_embeds.shape[0])]
#     self.df = pd.DataFrame(data={"EntryID": ids, "embed": embeds_list})
    
#     if datatype == "train":
#         np_labels = np.load(f"../input/train-targets-top{config.num_labels}/train_targets_top{config.num_labels}.npy")
#         df_labels = pd.DataFrame(self.df['EntryID'])
#         df_labels['labels_vect'] = [row for row in np_labels]
#         self.df = self.df.merge(df_labels, on="EntryID")

    
    def __init__(self, datatype, embeddings_source):
        super(ProteinSequenceDataset).__init__()
        self.datatype = datatype
        embeds = np.load("../input/"+embeds_map[embeddings_source]+"/"+datatype+"_embeddings.npy")
        ids = np.load("../input/"+embeds_map[embeddings_source]+"/"+datatype+"_ids.npy")

        embeds_list = []
        for l in range(embeds.shape[0]):
            embeds_list.append(embeds[l,:])
        self.df = pd.DataFrame(data={"EntryID": ids, "embed" : embeds_list})
        
        if datatype=="train":
            np_labels = np.load(
                "../input/train-targets-top"+str(config.num_labels)+ \
                "/train_targets_top"+str(config.num_labels)+".npy")
            df_labels = pd.DataFrame(self.df['EntryID'])
            df_labels['labels_vect']=[row for row in np_labels]
            self.df = self.df.merge(df_labels, on="EntryID")
            
    def __len__(self):
        return len(self.df)
    
    
    def __getitem__(self, index): 
        embed = torch.tensor(self.df.iloc[index]["embed"] , dtype = torch.float32)
        if self.datatype=="train":
            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype = torch.float32)
            return embed, targets
        if self.datatype=="test":
            id = self.df.iloc[index]["EntryID"]
            return embed, id

mps


In [13]:
# 3.定义分类模型 - MLP


###############################################################
########## >>>>> Baseline <<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(torch.nn.Module): # member function 
    
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         self.linear1 = torch.nn.Linear(input_dim, 1012)
#         self.activation1 = torch.nn.Tanh()
#         self.linear2 = torch.nn.Linear(1012, 864)
#         self.activation2 = torch.nn.Tanh()
#         self.linear3 = torch.nn.Linear(864, 712)
#         self.activation3 = torch.nn.Tanh()
#         self.linear4 = torch.nn.Linear(712, num_classes)
      

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.linear3(x)
#         x = self.activation3(x)
#         x = self.linear4(x)
#         return x

###############################################################
########## >>>>> 调整隐藏层神经元数量 2048<<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 增加第一个隐藏层的神经元数量到2048，逐层减少
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.activation1 = nn.Tanh()
#         self.linear2 = nn.Linear(2048, 1024)
#         self.activation2 = nn.Tanh()
#         self.linear3 = nn.Linear(1024, 768)
#         self.activation3 = nn.Tanh()
#         self.linear4 = nn.Linear(768, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.linear3(x)
#         x = self.activation3(x)
#         x = self.linear4(x)
#         return x

###############################################################
########## >>>>> 调修改激活函数 ReLU<<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 使用ReLU激活函数
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.activation1 = nn.ReLU()
#         self.linear2 = nn.Linear(2048, 1024)
#         self.activation2 = nn.ReLU()
#         self.linear3 = nn.Linear(1024, 768)
#         self.activation3 = nn.ReLU()
#         self.linear4 = nn.Linear(768, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.linear3(x)
#         x = self.activation3(x)
#         x = self.linear4(x)
#         return x
        
###############################################################
########## >>>>> 添加正则化层 Dropout(0.5)<<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 添加Dropout和Batch Normalization
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.bn1 = nn.BatchNorm1d(2048)
#         self.activation1 = nn.ReLU()
#         self.dropout1 = nn.Dropout(0.5)
#         self.linear2 = nn.Linear(2048, 1024)
#         self.bn2 = nn.BatchNorm1d(1024)
#         self.activation2 = nn.ReLU()
#         self.dropout2 = nn.Dropout(0.5)
#         self.linear3 = nn.Linear(1024, 768)
#         self.bn3 = nn.BatchNorm1d(768)
#         self.activation3 = nn.ReLU()
#         self.dropout3 = nn.Dropout(0.5)
#         self.linear4 = nn.Linear(768, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.bn1(x)
#         x = self.activation1(x)
#         x = self.dropout1(x)
#         x = self.linear2(x)
#         x = self.bn2(x)
#         x = self.activation2(x)
#         x = self.dropout2(x)
#         x = self.linear3(x)
#         x = self.bn3(x)
#         x = self.activation3(x)
#         x = self.dropout3(x)
#         x = self.linear4(x)
#         return x

###############################################################
########## >>>>> 增加层数 5层<<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 增加层数
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.activation1 = nn.ReLU()
#         self.linear2 = nn.Linear(2048, 1536)
#         self.activation2 = nn.ReLU()
#         self.linear3 = nn.Linear(1536, 1024)
#         self.activation3 = nn.ReLU()
#         self.linear4 = nn.Linear(1024, 768)
#         self.activation4 = nn.ReLU()
#         self.linear5 = nn.Linear(768, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.linear3(x)
#         x = self.activation3(x)
#         x = self.linear4(x)
#         x = self.activation4(x)
#         x = self.linear5(x)
#         return x

###############################################################
########## >>>>> 减少层数 3层<<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 减少层数
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.activation1 = nn.ReLU()
#         self.linear2 = nn.Linear(2048, 1024)
#         self.activation2 = nn.ReLU()
#         self.linear3 = nn.Linear(1024, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.linear3(x)
#         return x


###############################################################
########## >>>>> 尝试增加注意力机制 <<<<<<  ##########
###############################################################
# class AttentionModule(nn.Module):
#     def __init__(self, feature_dim):
#         super(AttentionModule, self).__init__()
#         self.attention_weights = nn.Sequential(
#             nn.Linear(feature_dim, feature_dim),
#             nn.ReLU(),
#             nn.Linear(feature_dim, 1),
#             nn.Softmax(dim=1)
#         )

#     def forward(self, x):
#         weights = self.attention_weights(x)  # 计算每个特征的重要性
#         return x * weights  # 将重要性权重应用到特征上

# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.activation1 = nn.ReLU()
#         self.attention1 = AttentionModule(2048)  # 注意力层加在第一层后
#         self.linear2 = nn.Linear(2048, 1024)
#         self.activation2 = nn.ReLU()
#         self.attention2 = AttentionModule(1024)  # 注意力层加在第二层后
#         self.linear3 = nn.Linear(1024, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.activation1(x)
#         x = self.attention1(x)  # 应用注意力机制
#         x = self.linear2(x)
#         x = self.activation2(x)
#         x = self.attention2(x)  # 应用注意力机制
#         x = self.linear3(x)
#         return x

###############################################################
########## >>>>> 尝试增加多头注意力机制 <<<<<<  ##########
###############################################################
# class MultiHeadAttention(nn.Module):
#     def __init__(self, feature_dim, num_heads):
#         super(MultiHeadAttention, self).__init__()
#         self.num_heads = num_heads
#         self.feature_dim = feature_dim
#         self.dim_per_head = feature_dim // num_heads

#         assert self.dim_per_head * num_heads == feature_dim, "Feature dimension must be divisible by number of heads"

#         self.linear_layers = nn.ModuleList([nn.Linear(feature_dim, feature_dim) for _ in range(3)])
#         self.output_linear = nn.Linear(feature_dim, feature_dim)
#         self.dropout = nn.Dropout(0.0)

#     def forward(self, x):
#         batch_size = x.size(0)

#         query, key, value = [l(x).view(batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2) for l in self.linear_layers]

#         scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.dim_per_head)
#         attention = F.softmax(scores, dim=-1)
#         attention = self.dropout(attention)

#         weighted_value = torch.matmul(attention, value)
#         weighted_value = weighted_value.transpose(1, 2).contiguous().view(batch_size, -1, self.feature_dim)

#         return self.output_linear(weighted_value)
    
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.dropout1 = nn.Dropout(0.0)  # 增加Dropout
#         self.attention1 = MultiHeadAttention(2048, 16)
#         self.linear2 = nn.Linear(2048, 1024)
#         self.dropout2 = nn.Dropout(0.0)  # 增加Dropout
#         self.attention2 = MultiHeadAttention(1024, 16)
#         self.linear3 = nn.Linear(1024, num_classes)

#     def forward(self, x):
#         x = F.relu(self.linear1(x))
#         x = self.dropout1(x)  # 应用Dropout
#         x = self.attention1(x)
#         x = F.relu(self.linear2(x))
#         x = self.dropout2(x)  # 应用Dropout
#         x = self.attention2(x)
#         x = self.linear3(x)
#         x = x.squeeze()
#         return x


###############################################################
########## >>>>> 神经元4096、10层、增加残差网络 <<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes, layers=10, features=4096):
#         super(MultiLayerPerceptron, self).__init__()
#         self.layers = nn.ModuleList()
#         self.batch_norms = nn.ModuleList()

#         self.layers.append(nn.Linear(input_dim, features))
#         self.batch_norms.append(nn.BatchNorm1d(features))

#         if input_dim != features:
#             self.residual_adapt = nn.Linear(input_dim, features)
#         else:
#             self.residual_adapt = None

#         # Intermediate layers
#         for _ in range(1, layers - 1):
#             self.layers.append(nn.Linear(features, features))
#             self.batch_norms.append(nn.BatchNorm1d(features))

#         # Final layer
#         self.layers.append(nn.Linear(features, num_classes))

#     def forward(self, x):
#         initial_input = x  # Save initial input for residual connection adaptation

#         x = F.relu(self.batch_norms[0](self.layers[0](x)))

#         if self.residual_adapt:
#             residual = self.residual_adapt(initial_input)  # Adapt the initial input dimension
#         else:
#             residual = initial_input

#         for i in range(1, len(self.layers) - 1):
#             y = F.relu(self.batch_norms[i](self.layers[i](x)))
#             x = y + residual  # Apply residual connection
#             residual = x  # Update residual to the latest x

#         x = self.layers[-1](x)
#         return x

###############################################################
########## >>>>> 3层、Dropout(0.5) <<<<<<  ##########
###############################################################
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 减少层数
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.bn1 = nn.BatchNorm1d(2048)
#         self.activation1 = nn.ReLU()
#         self.dropout1 = nn.Dropout(0.5)
#         self.linear2 = nn.Linear(2048, 1024)
#         self.bn2 = nn.BatchNorm1d(1024)
#         self.activation2 = nn.ReLU()
#         self.dropout2 = nn.Dropout(0.5)
#         self.linear3 = nn.Linear(1024, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.bn1(x)
#         x = self.activation1(x)
#         x = self.dropout1(x)
#         x = self.linear2(x)
#         x = self.bn2(x)
#         x = self.activation2(x)
#         x = self.dropout2(x)
#         x = self.linear3(x)
#         return x
    
class MultiLayerPerceptron(nn.Module):
    def __init__(self,input_dim,num_classes):
        super().__init__()
        
        self.activation = nn.PReLU()
        
        self.bn1 = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, 800)
        self.ln1 = nn.LayerNorm(800, elementwise_affine=True)
        
        self.bn2 = nn.BatchNorm1d(800)
        self.fc2 = nn.Linear(800, 600)
        self.ln2 = nn.LayerNorm(600, elementwise_affine=True)
        
        self.bn3 = nn.BatchNorm1d(600)
        self.fc3 = nn.Linear(600, 400)
        self.ln3 = nn.LayerNorm(400, elementwise_affine=True)
        
        self.bn4 = nn.BatchNorm1d(1200)
        self.fc4 = nn.Linear(1200, num_classes)
        self.ln4 = nn.LayerNorm(num_classes, elementwise_affine=True)
        
        self.sigm = nn.Sigmoid()
    def forward(self,inputs):

        fc1_out = self.bn1(inputs)
        fc1_out = self.ln1(self.fc1(inputs))
        fc1_out = self.activation(fc1_out)
        
        x = self.bn2(fc1_out)
        
        x = self.ln2(self.fc2(x))
        x = self.activation(x)
        
        x = self.bn3(x)
        
        x = self.ln3(self.fc3(x))
        x = self.activation(x)
        
        x = torch.cat([x, fc1_out], axis = -1)
        
        x = self.bn4(x)
        
        x = self.ln4(self.fc4(x))
        out = self.sigm(x)
        return out

In [14]:
# 3.定义分类模型 - 1DCNN

###############################################################
########## >>>>> baseline <<<<<<  ##########
###############################################################
class CNN1D(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNN1D, self).__init__()
        # (batch_size, channels, embed_size)  
        # Multi-dialted Layer
        #  某一层：1x3 conv dilate1
        #  某一层：1x3 conv dilate1 + 1x3 conv dilat2 + 1x7 conv dilate1  + 1x7 conv dilate2 
        #  最直观的改进！！同一层有不同的感受野！！！
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 3, embed_size)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 3, embed_size/2 = 512)
        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 8, embed_size/2 = 512)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 8, embed_size/4 = 256)
        self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        x = x.reshape(x.shape[0], 1, x.shape[1])
        x = self.pool1(nn.functional.relu(self.conv1(x)))
        x = self.pool2(nn.functional.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

###############################################################
########## >>>>> 第二层 扩大感受野 <<<<<<  ##########
###############################################################
# class CNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(CNN1D, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1)
#         self.bn1 = nn.BatchNorm1d(3)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.conv2a = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1)
#         self.bn2a = nn.BatchNorm1d(8)
#         self.conv2b = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=5, dilation=1, padding=2)
#         self.bn2b = nn.BatchNorm1d(8)
#         self.conv2c = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=2, padding=2)
#         self.bn2c = nn.BatchNorm1d(8)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=128)
#         self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

#     def forward(self, x):
#         x = x.reshape(x.shape[0], 1, x.shape[1])
#         x = self.pool1(F.relu(self.bn1(self.conv1(x))))
#         x = F.relu(self.bn2a(self.conv2a(x))) + F.relu(self.bn2b(self.conv2b(x))) + F.relu(self.bn2c(self.conv2c(x)))
#         x = self.pool2(x)
#         x = torch.flatten(x, 1)
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x


###############################################################
########## >>>>> 增加注意力机制 <<<<<<  ##########
###############################################################
# class AttentionModule(nn.Module):
#     def __init__(self, channels):
#         super(AttentionModule, self).__init__()
#         self.attention_weights = nn.Sequential(
#             nn.Conv1d(channels, channels, kernel_size=3, padding=1),  # 可以是更复杂的结构
#             nn.ReLU(),
#             nn.BatchNorm1d(channels),
#             nn.Conv1d(channels, 1, kernel_size=3, padding=1),
#             nn.Sigmoid()
#         )

#     def forward(self, x):
#         weights = self.attention_weights(x)
#         return x * weights

# class CNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(CNN1D, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
#         self.attention1 = AttentionModule(3)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
#         self.attention2 = AttentionModule(8)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.fc1 = nn.Linear(in_features=int(8 * input_dim / 4), out_features=128)
#         self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

#     def forward(self, x):
#         x = x.reshape(x.shape[0], 1, x.shape[1])
#         x = self.attention1(self.conv1(x))
#         x = self.pool1(F.relu(x))
#         x = self.attention2(self.conv2(x))
#         x = self.pool2(F.relu(x))
#         x = torch.flatten(x, 1)
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x

In [15]:
# 4.模型训练

model_type = "convolutional"
def train_model(embeddings_source, model_type = model_type, train_size=0.9):
    
    train_dataset = ProteinSequenceDataset(datatype = "train", embeddings_source = embeddings_source)
    train_set, val_set = random_split(train_dataset, lengths = [int(len(train_dataset)*train_size), len(train_dataset)-int(len(train_dataset)*train_size)])
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=True)

    if model_type == "linear":
        model = MultiLayerPerceptron(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
    if model_type == "convolutional":
        model = CNN1D(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)

        
    optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
    CrossEntropy = torch.nn.CrossEntropyLoss()
    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
    n_epochs = config.n_epochs


    print("BEGIN TRAINING...")
    train_loss_history=[]
    val_loss_history=[]
    
    train_f1score_history=[]
    val_f1score_history=[]
    for epoch in range(n_epochs):
        print("EPOCH ", epoch+1)
        ## TRAIN PHASE :
        losses = []
        scores = []
        for embed, targets in tqdm(train_dataloader):
            embed, targets = embed.to(config.device), targets.to(config.device)
            optimizer.zero_grad()
            preds = model(embed)
            loss= CrossEntropy(preds, targets)
            score=f1_score(preds, targets)
            losses.append(loss.item()) 
            scores.append(score.item())
            loss.backward()
            optimizer.step()
        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average TRAIN Loss : ", avg_loss)
        print("Running Average TRAIN F1-Score : ", avg_score)
        train_loss_history.append(avg_loss)
        train_f1score_history.append(avg_score)
        
        ## VALIDATION PHASE : 
        losses = []
        scores = []
        for embed, targets in val_dataloader:
            embed, targets = embed.to(config.device), targets.to(config.device)
            preds = model(embed)
            loss= CrossEntropy(preds, targets)
            score=f1_score(preds, targets)
            losses.append(loss.item())
            scores.append(score.item())

        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average VAL Loss : ", avg_loss)
        print("Running Average VAL F1-Score : ", avg_score)
        val_loss_history.append(avg_loss)
        val_f1score_history.append(avg_score)
        
        scheduler.step(avg_loss)
        print("\n")
        
    print("TRAINING FINISHED")
    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])
    print("FINAL TRAINING MAX F1 SCORE : ", max(train_f1score_history))
    print("FINAL VALIDATION MAX F1 SCORE : ", max(val_f1score_history))

    losses_history = {"train" : train_loss_history, "val" : val_loss_history}
    scores_history = {"train" : train_f1score_history, "val" : val_f1score_history}
    
    return model, losses_history, scores_history


# 训练模型
t5_model, t5_losses, t5_scores = train_model(embeddings_source="T5",model_type="linear")
# protbert_model, protbert_losses, protbert_scores = train_model(embeddings_source="ProtBERT",model_type="linear")
# ems2_model, ems2_losses, ems2_scores = train_model(embeddings_source="EMS2",model_type="convolutional")

# plt.figure(figsize = (10, 4))
# #plt.plot(ems2_losses["val"], label = "EMS2")
# plt.plot(t5_losses["val"], label = "T5")
# #plt.plot(protbert_losses["val"], label = "ProtBERT") 
# plt.title("Validation Losses for # Vector Embeddings")
# plt.xlabel("Epochs")
# plt.ylabel("Average Loss")
# plt.legend()
# plt.show()

# plt.figure(figsize = (10, 4))
# #plt.plot(ems2_scores["val"], label = "EMS2")
# plt.plot(t5_scores["val"], label = "T5")
# #plt.plot(protbert_scores["val"], label = "ProtBERT")
# plt.title("Validation F1-Scores for # Vector Embeddings")
# plt.xlabel("Epochs")
# plt.ylabel("Average F1-Score")
# plt.legend()
# plt.show()

BEGIN TRAINING...
EPOCH  1


100%|███████████████████████████████████████| 1001/1001 [00:15<00:00, 66.08it/s]


Running Average TRAIN Loss :  148.11298829954225
Running Average TRAIN F1-Score :  0.18063766602333728
Running Average VAL Loss :  147.3607474054609
Running Average VAL F1-Score :  0.2115955469198525


EPOCH  2


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.64it/s]


Running Average TRAIN Loss :  145.64049745725467
Running Average TRAIN F1-Score :  0.224618124050813
Running Average VAL Loss :  145.84388262884957
Running Average VAL F1-Score :  0.23656268923410348


EPOCH  3


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.43it/s]


Running Average TRAIN Loss :  144.6687542499005
Running Average TRAIN F1-Score :  0.24698475315854265
Running Average VAL Loss :  145.61247335161482
Running Average VAL F1-Score :  0.2509148513366069


EPOCH  4


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.31it/s]


Running Average TRAIN Loss :  144.04660862666387
Running Average TRAIN F1-Score :  0.26306359560696874
Running Average VAL Loss :  144.64057963235038
Running Average VAL F1-Score :  0.26017869829333257


EPOCH  5


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.59it/s]


Running Average TRAIN Loss :  143.62687552773156
Running Average TRAIN F1-Score :  0.2764860439401764
Running Average VAL Loss :  144.6617626462664
Running Average VAL F1-Score :  0.26680476364812683


EPOCH  6


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.43it/s]


Running Average TRAIN Loss :  143.29244340406908
Running Average TRAIN F1-Score :  0.286810508260241
Running Average VAL Loss :  144.97399650301253
Running Average VAL F1-Score :  0.27583765438092606


EPOCH  7


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.40it/s]


Running Average TRAIN Loss :  142.77964465649097
Running Average TRAIN F1-Score :  0.3066443005850265
Running Average VAL Loss :  145.0096802030291
Running Average VAL F1-Score :  0.28551822820944445


EPOCH  8


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.18it/s]


Running Average TRAIN Loss :  142.5869755854497
Running Average TRAIN F1-Score :  0.3159130912769091
Running Average VAL Loss :  144.4905960219247
Running Average VAL F1-Score :  0.28764244967273306


EPOCH  9


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.48it/s]


Running Average TRAIN Loss :  142.49004821320037
Running Average TRAIN F1-Score :  0.3205737567448116
Running Average VAL Loss :  145.09972967420305
Running Average VAL F1-Score :  0.2906730242871812


EPOCH  10


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.35it/s]


Running Average TRAIN Loss :  142.411641520101
Running Average TRAIN F1-Score :  0.3251487248308294
Running Average VAL Loss :  144.40837580817086
Running Average VAL F1-Score :  0.2911611261910626


EPOCH  11


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.08it/s]


Running Average TRAIN Loss :  142.34831311724164
Running Average TRAIN F1-Score :  0.32797124122942123
Running Average VAL Loss :  144.8645897592817
Running Average VAL F1-Score :  0.29322261549532413


EPOCH  12


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.34it/s]


Running Average TRAIN Loss :  142.28521574555816
Running Average TRAIN F1-Score :  0.3315605051391251
Running Average VAL Loss :  144.313166482108
Running Average VAL F1-Score :  0.2925274439954332


EPOCH  13


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.45it/s]


Running Average TRAIN Loss :  142.20899585338978
Running Average TRAIN F1-Score :  0.3350775029573526
Running Average VAL Loss :  144.32341289520264
Running Average VAL F1-Score :  0.2945912509624447


EPOCH  14


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.19it/s]


Running Average TRAIN Loss :  142.12568899301382
Running Average TRAIN F1-Score :  0.3377521744855753
Running Average VAL Loss :  144.17862789971488
Running Average VAL F1-Score :  0.293881769957287


EPOCH  15


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.10it/s]


Running Average TRAIN Loss :  142.08073331545162
Running Average TRAIN F1-Score :  0.34018241927459403
Running Average VAL Loss :  144.4891800880432
Running Average VAL F1-Score :  0.29564752536160605


EPOCH  16


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.60it/s]


Running Average TRAIN Loss :  142.01919361308856
Running Average TRAIN F1-Score :  0.3427799721578737
Running Average VAL Loss :  144.45540741511755
Running Average VAL F1-Score :  0.29692351658429417


EPOCH  17


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.53it/s]


Running Average TRAIN Loss :  141.8925131036566
Running Average TRAIN F1-Score :  0.34698231043396416
Running Average VAL Loss :  144.81054060799735
Running Average VAL F1-Score :  0.29672463797032833


EPOCH  18


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.29it/s]


Running Average TRAIN Loss :  141.87232024567228
Running Average TRAIN F1-Score :  0.3481591928761441
Running Average VAL Loss :  144.62272753034318
Running Average VAL F1-Score :  0.29825897780912264


EPOCH  19


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.29it/s]


Running Average TRAIN Loss :  141.86771812781944
Running Average TRAIN F1-Score :  0.34881879755488404
Running Average VAL Loss :  144.2957844734192
Running Average VAL F1-Score :  0.29718715072210344


EPOCH  20


100%|███████████████████████████████████████| 1001/1001 [00:14<00:00, 67.67it/s]


Running Average TRAIN Loss :  141.94673776388407
Running Average TRAIN F1-Score :  0.34855694123557757
Running Average VAL Loss :  144.46160418646676
Running Average VAL F1-Score :  0.296888583738889


TRAINING FINISHED
FINAL TRAINING SCORE :  0.34855694123557757
FINAL VALIDATION SCORE :  0.296888583738889
FINAL TRAINING MAX F1 SCORE :  0.34881879755488404
FINAL VALIDATION MAX F1 SCORE :  0.29825897780912264


In [21]:
# 4.模型预测

def predict(embeddings_source):
    
    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_source = embeddings_source)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    
    if embeddings_source == "T5":
        model = t5_model
    if embeddings_source == "ProtBERT":
        model = protbert_model
    if embeddings_source == "EMS2":
        model = ems2_model
        
    model.eval()
    
    labels = pd.read_csv(config.train_labels_path, sep = "\t")
    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms[:config.num_labels].index.values
    print("GENERATE PREDICTION FOR TEST SET...")

    ids_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
    go_terms_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
    confs_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=np.float32)

    for i, (embed, id) in tqdm(enumerate(test_dataloader)):
        embed = embed.to(config.device)
        confs_[i*config.num_labels:(i+1)*config.num_labels] = torch.nn.functional.sigmoid(model(embed)).squeeze().detach().cpu().numpy()
        ids_[i*config.num_labels:(i+1)*config.num_labels] = id[0]
        go_terms_[i*config.num_labels:(i+1)*config.num_labels] = labels_names

    submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})
    print("PREDICTIONS DONE")
    return submission_df


# submission_df = predict("T5")
# submission_df.to_csv('submission-t5.tsv', sep='\t', header=False, index=False)
# submission_df = predict("ProtBERT")
# submission_df.to_csv('submission-protbert.tsv', sep='\t', header=False, index=False)
submission_df = predict("EMS2")
submission_df.to_csv('submission-ems2.tsv', sep='\t', header=False, index=False)
print(submission_df)

GENERATE PREDICTION FOR TEST SET...


141864it [02:14, 1054.67it/s]


PREDICTIONS DONE
              Id     GO term  Confidence
0         Q9ZSA8  GO:0005575    0.932083
1         Q9ZSA8  GO:0008150    0.928279
2         Q9ZSA8  GO:0110165    0.930866
3         Q9ZSA8  GO:0003674    0.915635
4         Q9ZSA8  GO:0005622    0.912028
...          ...         ...         ...
70931995  P0AG74  GO:0044403    0.188066
70931996  P0AG74  GO:0042803    0.180563
70931997  P0AG74  GO:0016829    0.162771
70931998  P0AG74  GO:0090596    0.171527
70931999  P0AG74  GO:0051640    0.190207

[70932000 rows x 3 columns]


In [3]:
###############################################################
########## >>>>> ENSEMBLING WITH TOP PUBLIC SUBMISSION <<<<<<  ##########
###############################################################
submission_best_public2 = pd.read_csv('../input/cafa-5-053818-pred/submission (3).tsv',
    sep='\t', header=None, names=['Id2', 'GO term2', 'Confidence2'])


submissions = submission_best_public2.merge(submission_df, left_on=['Id2', 'GO term2'], 
                                                  right_on=['Id', 'GO term'], how='outer')

submissions.drop(['Id', 'GO term'], axis=1, inplace=True)
submissions['confidence_combined'] = submissions.apply(lambda row: row['Confidence2'] if not np.isnan(row['Confidence2']) else row['Confidence'], axis=1)

submissions[['Id2', 'GO term2', 'confidence_combined']].to_csv('submission.tsv', sep='\t', header=False, index=False)

NameError: name 'submission_df' is not defined