In [3]:
import pandas as pd
import pdb
import numpy as np
from tqdm import tqdm
import time
import math
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from torch.utils.data import Dataset
import torch;
from torch import nn
import torch.nn.functional as F
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
import wandb
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# 1.数据预处理，使用T5（ProtTrans）、ems2、protein与训练模型生成embedding;
# 2.配置读取embedding文件 & 构建数据集;
MAIN_DIR = "../input/cafa-5-protein-function-prediction"

class config:
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test (Targets)/testsuperset.fasta"
    train_sequences_path = MAIN_DIR  + "/Train/train_sequences.fasta"
    
    num_labels = 500
    n_epochs = 10
    batch_size = 128
    lr = 0.001 
    # for Mac
    device = torch.device('mps') 
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
    print(device)
   
# Directories for the different embedding vectors : 
embeds_map = {
    "T5" : "cafa-5-t5-embeddings-numpy",
    "ProtBERT" : "cafa-5-protbert-embeddings-numpy",
    "EMS2" : "cafa-5-ems2-embeddings-numpy"
}

# Length of the different embedding vectors :
embeds_dim = {
    "T5" : 1024,
    "ProtBERT" : 1024,
    "EMS2" : 1280
}

class ProteinSequenceDataset(Dataset):
    
#     def __init__(self, datatype, embeddings_sources):
#         super(ProteinSequenceDataset).__init__()
#         self.datatype = datatype

#         # 加载嵌入并进行拼接
#         embeddings_list = []
#         for embeddings_source in embeddings_sources:
#             embeds_path = f"../input/{embeds_map[embeddings_source]}/{datatype}_embeddings.npy"
#             embeds = np.load(embeds_path)
#             print(f"{embeddings_source} embeddings shape: {embeds.shape}")
#             embeddings_list.append(embeds)

#         # 使用numpy的hstack函数进行水平拼接
#         combined_embeds = np.hstack(embeddings_list)

#         # 加载IDs
#         ids_path = f"../input/{list(embeds_map.values())[0]}/{datatype}_ids.npy"  # 假设所有ID文件相同
#         ids = np.load(ids_path)

#         # 创建包含所有嵌入和ID的DataFrame
#         embeds_list = [combined_embeds[i, :] for i in range(combined_embeds.shape[0])]
#         self.df = pd.DataFrame(data={"EntryID": ids, "embed": embeds_list})
        
#         if datatype == "train":
#             np_labels = np.load(f"../input/train-targets-top{config.num_labels}/train_targets_top{config.num_labels}.npy")
#             df_labels = pd.DataFrame(self.df['EntryID'])
#             df_labels['labels_vect'] = [row for row in np_labels]
#             self.df = self.df.merge(df_labels, on="EntryID")
    def __init__(self, datatype, embeddings_sources):
        super(ProteinSequenceDataset).__init__()
        self.datatype = datatype

        # 加载嵌入并进行拼接
        embeddings_list = []
        for embeddings_source in embeddings_sources:
            embeds_path = f"../input/{embeds_map[embeddings_source]}/{datatype}_embeddings.npy"
            embeds = np.load(embeds_path)
            embeddings_list.append(embeds)

        # 调整嵌入向量的大小，使它们在拼接维度上具有相同的大小
        max_size = max(embed.shape[0] for embed in embeddings_list)
        for i, embed in enumerate(embeddings_list):
            if embed.shape[0] < max_size:
                pad_size = max_size - embed.shape[0]
                embeddings_list[i] = np.pad(embed, ((0, pad_size), (0, 0)), mode='constant', constant_values=0)

        # 使用numpy的hstack函数进行水平拼接
        combined_embeds = np.hstack(embeddings_list)

        # 加载IDs
        ids_path = f"../input/{list(embeds_map.values())[0]}/{datatype}_ids.npy"  # 假设所有ID文件相同
        ids = np.load(ids_path)

        # 创建包含所有嵌入和ID的DataFrame
        embeds_list = [combined_embeds[i, :] for i in range(combined_embeds.shape[0])]
        self.df = pd.DataFrame(data={"EntryID": ids, "embed": embeds_list})

        if datatype == "train":
            np_labels = np.load(f"../input/train-targets-top{config.num_labels}/train_targets_top{config.num_labels}.npy")
            df_labels = pd.DataFrame(self.df['EntryID'])
            df_labels['labels_vect'] = [row for row in np_labels]
            self.df = self.df.merge(df_labels, on="EntryID")

    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
        if self.datatype == "train":
            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
            return embed, targets
        if self.datatype == "test":
            id = self.df.iloc[index]["EntryID"]
            return embed, id


# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(MultiLayerPerceptron, self).__init__()

#         # 减少层数
#         self.linear1 = nn.Linear(input_dim, 2048)
#         self.bn1 = nn.BatchNorm1d(2048)
#         self.activation1 = nn.ReLU()
#         self.dropout1 = nn.Dropout(0.5)
#         self.linear2 = nn.Linear(2048, 1024)
#         self.bn2 = nn.BatchNorm1d(1024)
#         self.activation2 = nn.ReLU()
#         self.dropout2 = nn.Dropout(0.5)
#         self.linear3 = nn.Linear(1024, num_classes)

#     def forward(self, x):
#         x = self.linear1(x)
#         x = self.bn1(x)
#         x = self.activation1(x)
#         x = self.dropout1(x)
#         x = self.linear2(x)
#         x = self.bn2(x)
#         x = self.activation2(x)
#         x = self.dropout2(x)
#         x = self.linear3(x)
#         return x

class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MultiLayerPerceptron, self).__init__()

        # 减少层数
        self.linear1 = nn.Linear(input_dim, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.activation1 = nn.GELU()
        self.dropout1 = nn.Dropout(0.5)
        self.linear2 = nn.Linear(2048, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.activation2 = nn.GELU()
        self.dropout2 = nn.Dropout(0.5)
        self.linear3 = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = self.linear1(x)
        x = self.bn1(x)
        x = self.activation1(x)
        x = self.dropout1(x)
        x = self.linear2(x)
        x = self.bn2(x)
        x = self.activation2(x)
        x = self.dropout2(x)
        x = self.linear3(x)
        return x
    
# class CNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(CNN1D, self).__init__()
#         # (batch_size, channels, embed_size)  
#         # Multi-dialted Layer
#         #  某一层：1x3 conv dilate1
#         #  某一层：1x3 conv dilate1 + 1x3 conv dilat2 + 1x7 conv dilate1  + 1x7 conv dilate2 
#         #  最直观的改进！！同一层有不同的感受野！！！
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
#         # (batch_size, 3, embed_size)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
#         # (batch_size, 3, embed_size/2 = 512)
#         self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
#         # (batch_size, 8, embed_size/2 = 512)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
#         # (batch_size, 8, embed_size/4 = 256)
#         self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=128)
#         self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

#     def forward(self, x):
#         x = x.reshape(x.shape[0], 1, x.shape[1])
#         x = self.pool1(nn.functional.relu(self.conv1(x)))
#         x = self.pool2(nn.functional.relu(self.conv2(x)))
#         x = torch.flatten(x, 1)
#         x = nn.functional.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x


###############################################################
########## >>>>> 第二层 扩大感受野 <<<<<<  ##########
###############################################################
class CNN1D(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1)
        self.bn1 = nn.BatchNorm1d(3)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.conv2a = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1)
        self.bn2a = nn.BatchNorm1d(8)
        self.conv2b = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=5, dilation=1, padding=2)
        self.bn2b = nn.BatchNorm1d(8)
        self.conv2c = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=2, padding=2)
        self.bn2c = nn.BatchNorm1d(8)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
        self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        x = x.reshape(x.shape[0], 1, x.shape[1])
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = F.relu(self.bn2a(self.conv2a(x))) + F.relu(self.bn2b(self.conv2b(x))) + F.relu(self.bn2c(self.conv2c(x)))
        x = self.pool2(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    
###############################################################
########## >>>>> 增加注意力机制 <<<<<<  ##########
###############################################################
# class AttentionModule(nn.Module):
#     def __init__(self, channels):
#         super(AttentionModule, self).__init__()
#         self.attention_weights = nn.Sequential(
#             nn.Conv1d(channels, channels, kernel_size=3, padding=1),  # 可以是更复杂的结构
#             nn.ReLU(),
#             nn.BatchNorm1d(channels),
#             nn.Conv1d(channels, 1, kernel_size=3, padding=1),
#             nn.Sigmoid()
#         )

#     def forward(self, x):
#         weights = self.attention_weights(x)
#         return x * weights

# class CNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(CNN1D, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
#         self.attention1 = AttentionModule(3)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
#         self.attention2 = AttentionModule(8)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.fc1 = nn.Linear(in_features=int(8 * input_dim / 4), out_features=128)
#         self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

#     def forward(self, x):
#         x = x.reshape(x.shape[0], 1, x.shape[1])
#         x = self.attention1(self.conv1(x))
#         x = self.pool1(F.relu(x))
#         x = self.attention2(self.conv2(x))
#         x = self.pool2(F.relu(x))
#         x = torch.flatten(x, 1)
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x


# class Attention(nn.Module):
#     def __init__(self, input_dim, hidden_dim):
#         super(Attention, self).__init__()
#         self.linear1 = nn.Linear(input_dim, hidden_dim)
#         self.linear2 = nn.Linear(hidden_dim, 1)

#     def forward(self, x):
#         x = F.relu(self.linear1(x))
#         x = self.linear2(x)
#         alpha = F.softmax(x, dim=1)
#         return alpha

# class CNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes, attention_hidden_dim=64):
#         super(CNN1D, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=5, padding=2)
#         self.bn1 = nn.BatchNorm1d(64)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

#         self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
#         self.bn2 = nn.BatchNorm1d(128)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

#         self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
#         self.bn3 = nn.BatchNorm1d(256)
#         self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

#         self.attention = Attention(256, attention_hidden_dim)

#         self.fc1 = nn.Linear(in_features=256, out_features=512)
#         self.fc2 = nn.Linear(in_features=512, out_features=num_classes)

#     def forward(self, x):
#         x = x.unsqueeze(1)
#         x = self.pool1(F.relu(self.bn1(self.conv1(x))))
#         x = self.pool2(F.relu(self.bn2(self.conv2(x))))
#         x = self.pool3(F.relu(self.bn3(self.conv3(x))))
#         alpha = self.attention(x.permute(0, 2, 1))  # Attention weights
#         x = torch.bmm(alpha.permute(0, 2, 1), x).squeeze()  # Weighted sum
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x


    

# class OptimizedCNN1D(nn.Module):
#     def __init__(self, input_dim, num_classes):
#         super(OptimizedCNN1D, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=5, padding=2)
#         self.bn1 = nn.BatchNorm1d(64)
#         self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
#         self.bn2 = nn.BatchNorm1d(128)
#         self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
#         self.bn3 = nn.BatchNorm1d(256)
#         self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)
        
#         self.fc1 = nn.Linear(in_features=256 * (input_dim // 8), out_features=512)
#         self.fc2 = nn.Linear(in_features=512, out_features=num_classes)

#     def forward(self, x):
#         x = x.unsqueeze(1)  # Add channel dimension
#         x = self.pool1(F.relu(self.bn1(self.conv1(x))))
#         x = self.pool2(F.relu(self.bn2(self.conv2(x))))
#         x = self.pool3(F.relu(self.bn3(self.conv3(x))))
#         x = x.view(x.size(0), -1)  # Flatten
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x


    
model_type = "convolutional"
def train_model(embeddings_source, model_type = model_type, train_size=0.9):

    train_dataset = ProteinSequenceDataset(datatype="train", embeddings_sources=embeddings_source)
    train_set, val_set = random_split(train_dataset, lengths = [int(len(train_dataset)*train_size), len(train_dataset)-int(len(train_dataset)*train_size)])
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=True)

    total_input_dim = sum(embeds_dim[src] for src in embeddings_sources)
    model = MultiLayerPerceptron(input_dim=total_input_dim, num_classes=config.num_labels).to(config.device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
    CrossEntropy = torch.nn.CrossEntropyLoss()
    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
    n_epochs = config.n_epochs


    print("BEGIN TRAINING...")
    train_loss_history=[]
    val_loss_history=[]
    
    train_f1score_history=[]
    val_f1score_history=[]
    for epoch in range(n_epochs):
        print("EPOCH ", epoch+1)
        ## TRAIN PHASE :
        losses = []
        scores = []
        for embed, targets in tqdm(train_dataloader):
            embed, targets = embed.to(config.device), targets.to(config.device)
            optimizer.zero_grad()
            preds = model(embed)
            loss= CrossEntropy(preds, targets)
            score=f1_score(preds, targets)
            losses.append(loss.item()) 
            scores.append(score.item())
            loss.backward()
            optimizer.step()
        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average TRAIN Loss : ", avg_loss)
        print("Running Average TRAIN F1-Score : ", avg_score)
        train_loss_history.append(avg_loss)
        train_f1score_history.append(avg_score)
        
        ## VALIDATION PHASE : 
        losses = []
        scores = []
        for embed, targets in val_dataloader:
            embed, targets = embed.to(config.device), targets.to(config.device)
            preds = model(embed)
            loss= CrossEntropy(preds, targets)
            score=f1_score(preds, targets)
            losses.append(loss.item())
            scores.append(score.item())

        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average VAL Loss : ", avg_loss)
        print("Running Average VAL F1-Score : ", avg_score)
        val_loss_history.append(avg_loss)
        val_f1score_history.append(avg_score)
        
        scheduler.step(avg_loss)
        print("\n")
        
    print("TRAINING FINISHED")
    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])
    print("FINAL TRAINING MAX F1 SCORE : ", max(train_f1score_history))
    print("FINAL VALIDATION MAX F1 SCORE : ", max(val_f1score_history))

    losses_history = {"train" : train_loss_history, "val" : val_loss_history}
    scores_history = {"train" : train_f1score_history, "val" : val_f1score_history}
    
    return model, losses_history, scores_history


# 训练模型
embeddings_sources = ["T5", "ProtBERT", "EMS2"]
t5_model, t5_losses, t5_scores = train_model(embeddings_source = embeddings_sources,model_type="linear")
# t5_model, t5_losses, t5_scores = train_model(embeddings_source = embeddings_sources,model_type="convolutional")

# # 指定保存路径
# model_path = "t5_model.pth"

# # 保存模型
# torch.save(t5_model.state_dict(), model_path)

# print(f"Model saved to {model_path}")

# 4.模型预测

def predict(embeddings_sources):
    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_sources=embeddings_sources)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
    
    model = t5_model
    model.eval()
    
    labels = pd.read_csv(config.train_labels_path, sep = "\t")
    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms[:config.num_labels].index.values
    print("GENERATE PREDICTION FOR TEST SET...")

    ids_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
    go_terms_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
    confs_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=np.float32)

    for i, (embed, id) in tqdm(enumerate(test_dataloader)):
        embed = embed.to(config.device)
        confs_[i*config.num_labels:(i+1)*config.num_labels] = torch.nn.functional.sigmoid(model(embed)).squeeze().detach().cpu().numpy()
        ids_[i*config.num_labels:(i+1)*config.num_labels] = id[0]
        go_terms_[i*config.num_labels:(i+1)*config.num_labels] = labels_names

    submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})
    print("PREDICTIONS DONE")
    return submission_df
submission_df = predict(["T5", "ProtBERT", "EMS2"])
submission_df.to_csv('submission.tsv', sep='\t', header=False, index=False)


mps
BEGIN TRAINING...
EPOCH  1


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.39it/s]


Running Average TRAIN Loss :  133.52577859609872
Running Average TRAIN F1-Score :  0.16652627472396378
Running Average VAL Loss :  130.06730086462838
Running Average VAL F1-Score :  0.18029049171933106


EPOCH  2


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 58.58it/s]


Running Average TRAIN Loss :  130.43866453828153
Running Average TRAIN F1-Score :  0.18519467959454008
Running Average VAL Loss :  128.55258069719588
Running Average VAL F1-Score :  0.18270099382581456


EPOCH  3


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.53it/s]


Running Average TRAIN Loss :  129.3222176916711
Running Average TRAIN F1-Score :  0.18816738566020866
Running Average VAL Loss :  128.22666393007552
Running Average VAL F1-Score :  0.18348183296620846


EPOCH  4


100%|███████████████████████████████████████| 1001/1001 [00:18<00:00, 54.60it/s]


Running Average TRAIN Loss :  128.55858844524616
Running Average TRAIN F1-Score :  0.18859252501855958
Running Average VAL Loss :  127.8585694858006
Running Average VAL F1-Score :  0.1810517647702779


EPOCH  5


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.34it/s]


Running Average TRAIN Loss :  127.88731513085304
Running Average TRAIN F1-Score :  0.1869535091427061
Running Average VAL Loss :  127.98071098327637
Running Average VAL F1-Score :  0.17886019085666963


EPOCH  6


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 57.86it/s]


Running Average TRAIN Loss :  127.34622472101873
Running Average TRAIN F1-Score :  0.1847792199769101
Running Average VAL Loss :  127.45680856704712
Running Average VAL F1-Score :  0.1758612064378602


EPOCH  7


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 57.31it/s]


Running Average TRAIN Loss :  126.81494780091734
Running Average TRAIN F1-Score :  0.18301985278115288
Running Average VAL Loss :  128.3714177267892
Running Average VAL F1-Score :  0.17309936614973204


EPOCH  8


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.45it/s]


Running Average TRAIN Loss :  126.34226155924154
Running Average TRAIN F1-Score :  0.181399076216883
Running Average VAL Loss :  127.6075541632516
Running Average VAL F1-Score :  0.17029622962166155


EPOCH  9


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.84it/s]


Running Average TRAIN Loss :  124.89644815633585
Running Average TRAIN F1-Score :  0.18421163075751476
Running Average VAL Loss :  127.03648349217006
Running Average VAL F1-Score :  0.1728574221155473


EPOCH  10


100%|███████████████████████████████████████| 1001/1001 [00:17<00:00, 56.85it/s]


Running Average TRAIN Loss :  124.36522939036061
Running Average TRAIN F1-Score :  0.18668044159760128
Running Average VAL Loss :  126.93043967655727
Running Average VAL F1-Score :  0.17365932052156755


TRAINING FINISHED
FINAL TRAINING SCORE :  0.18668044159760128
FINAL VALIDATION SCORE :  0.17365932052156755
FINAL TRAINING MAX F1 SCORE :  0.18859252501855958
FINAL VALIDATION MAX F1 SCORE :  0.18348183296620846
GENERATE PREDICTION FOR TEST SET...


141865it [02:50, 830.12it/s]


PREDICTIONS DONE


In [None]:
#预测结果融合
test_pred_df_foldseek = pd.read_csv('/kaggle/input/foldseek-cafa/foldseek_submission.tsv',
    sep='\t', header=None, names=[1, 2, 3])
test_pred_df_foldseek = test_pred_df_foldseek[test_pred_df_foldseek[3] > 0.6]

submission_best_public = pd.read_csv('/kaggle/input/cafa5-tuning-merge-datasets/submission.tsv',
    sep='\t', header=None, names=['Id', 'GO term', 'Confidence'])

submissions_merged = submission_best_public.merge(test_pred_df_foldseek, left_on=['Id', 'GO term'], 
                                                  right_on=[1, 2], how='outer')
submissions_merged.drop([1, 2], axis=1, inplace=True)
submissions_merged['confidence_combined'] = submissions_merged.apply(lambda row: row['Confidence'] if not np.isnan(row['Confidence']) else row[3], axis=1)
submissions_merged[['Id', 'GO term', 'confidence_combined']].to_csv('submission.tsv',
    sep='\t', header=False, index=False)



final_submission.loc[final_submission.confidence_combined<0.22,"confidence_combined"]=0
final_submission.loc[final_submission.confidence_combined>0.8,"confidence_combined"]=1
final_submission.confidence_combined.hist(range=[0, 1], bins=100)
final_submission.to_csv('submission.tsv', sep='\t', header=False, index=False)