In [1]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
from transformers import EsmTokenizer, EsmModel
import torch
import torch.nn as nn
import obonet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 文件路径
home_dir = os.environ.get('HOME')
directory_path = os.path.join(home_dir, 'Data', 'protein', 'cafa-5-protein-function-prediction')

# 读取并合并序列和注释数据
sequences_dict = {}
for seq_record in SeqIO.parse(os.path.join(directory_path, 'Train', 'train_sequences.fasta'), "fasta"):
    sequences_dict[seq_record.id] = str(seq_record.seq)
sequences = pd.DataFrame(list(sequences_dict.items()), columns=['EntryID', 'Sequence'])
sequences_annotations = pd.read_csv(os.path.join(directory_path, 'Train', 'train_taxonomy.tsv'), sep='\t')
train = pd.merge(sequences, sequences_annotations, on='EntryID')

In [3]:
# 读取GO本体文件并构建网络图
def read_obo(file_path):
    return obonet.read_obo(file_path)

# 提取指定命名空间的term并返回DataFrame
def extract_terms(graph, namespaces, max_terms=5):
    terms_list = []
    count = 0
    for node, data in graph.nodes(data=True):
        if 'namespace' in data and data['namespace'] in namespaces:
            terms_list.append({
                'ID': node,
                'Name': data['name'],
                'Namespace': data['namespace']
            })
            count += 1
            if count >= max_terms:
                break
    return pd.DataFrame(terms_list)

In [4]:
# 读取OBO文件并提取术语
par = os.path.join(directory_path, 'Train', 'go-basic.obo')
graph = read_obo(par)
namespaces = {"molecular_function", "biological_process", "cellular_component"}
obo_info = extract_terms(graph, namespaces, max_terms=5)

# 读入train_terms.tsv并合并
sequences_terms = pd.read_csv(os.path.join(directory_path, 'Train', 'train_terms.tsv'), sep='\t')
obo_info2 = pd.merge(obo_info, sequences_terms, on='term')
train_total = pd.merge(train, obo_info2, on='EntryID').iloc[:, :-1]

KeyError: 'term'

In [None]:
# 处理前5个蛋白质序列
subset_train_total = train_total.head(5)

# 加载预训练的ESM模型和tokenizer
model_name = "facebook/esm-1b"
tokenizer = EsmTokenizer.from_pretrained(model_name)
model = EsmModel.from_pretrained(model_name)

def get_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state

subset_train_total['Embedding'] = subset_train_total['Sequence'].apply(get_embedding)

# 打印ESM嵌入
print(subset_train_total[['EntryID', 'Sequence', 'Embedding']])

In [None]:
# 定义MLP模型
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
# 假设EL Embedding的维度为256
embedding_dim = 256

# 提取GO术语并生成EL Embeddings
def extract_terms_and_generate_embeddings(graph, namespaces, embedding_dim, max_terms=5):
    terms = []
    embeddings = []
    count = 0
    for node, data in graph.nodes(data=True):
        if 'namespace' in data and data['namespace'] in namespaces:
            terms.append(node)
            embeddings.append(np.random.randn(embedding_dim))  # 示例中用随机向量表示
            count += 1
            if count >= max_terms:
                break
    return terms, np.array(embeddings)

terms, embeddings = extract_terms_and_generate_embeddings(graph, namespaces, embedding_dim, max_terms=5)

# 将GO术语和嵌入表示存储在DataFrame中
terms_df = pd.DataFrame({'term': terms, 'EL_Embedding': list(embeddings)})

# 打印部分结果
print(terms_df.head())

In [None]:
# 创建MLP模型实例
input_dim = 1280  # ESM2 Embedding的维度
output_dim = embedding_dim  # EL Embedding的维度
mlp = MLP(input_dim, output_dim)

# 投射蛋白质ESM Embedding
def project_embedding(esm_embedding):
    esm_embedding_tensor = torch.tensor(esm_embedding).float()
    projected_embedding = mlp(esm_embedding_tensor)
    return projected_embedding.detach().numpy()

# 对每个蛋白质序列的ESM Embedding进行投射
subset_train_total['Projected_Embedding'] = subset_train_total['Embedding'].apply(lambda x: project_embedding(x.mean(dim=1).squeeze().numpy()))

# 打印部分结果
print(subset_train_total.head())

# 合并EL Embeddings与蛋白质的Projected Embedding
final_df = pd.merge(subset_train_total, terms_df, on='term')

# 打印最终的结果
print(final_df.head())