In [None]:
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import torch

# Inference is run on the CPU.
device = torch.device("cpu")

# Load the tokenizer and the model from the same local checkpoint directory.
checkpoint_dir = '/autodl-fs/data/ESM2-3B'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModel.from_pretrained(checkpoint_dir)

# Put the model in evaluation mode and place it on the chosen device.
model.eval()
model.to(device)

# Read the input CSV; the 'Protein_ID' and 'Sequence' columns are used below.
df = pd.read_csv('/autodl-fs/data/realdata1.csv')

# Extract identifiers and sequences as plain lists, in row order.
protein_names = df['Protein_ID'].tolist()
sequences = df['Sequence'].tolist()

# Process the sequences in batches to bound memory use.
batch_size = 1  # one sequence at a time keeps CPU memory usage low
all_representations = []

for i in range(0, len(sequences), batch_size):
    batch_sequences = sequences[i:i + batch_size]

    # Tokenize the batch; with padding=True shorter sequences are padded to
    # the longest sequence in the batch.
    encodings = tokenizer(batch_sequences, return_tensors='pt', padding=True, truncation=True)

    # Move every input tensor to the inference device.
    encodings = {key: val.to(device) for key, val in encodings.items()}

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**encodings)

        # Mean-pool the last hidden layer over real tokens only. A plain
        # .mean(dim=1) would also average over padding positions, which skews
        # the embedding of every shorter sequence once batch_size > 1. With an
        # all-ones mask (e.g. batch_size == 1) this equals the plain mean.
        # NOTE: any special tokens the tokenizer adds are still included in
        # the average, as in the original computation.
        hidden = outputs.last_hidden_state
        mask = encodings['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        sequence_representations = (hidden * mask).sum(dim=1) / token_counts  # [batch_size, hidden_size]

    # Keep the pooled features for this batch.
    all_representations.append(sequence_representations.cpu())

# Concatenate the per-batch features into one [num_sequences, hidden_size] tensor.
if all_representations:
    all_representations = torch.cat(all_representations, dim=0)
else:
    # torch.cat raises on an empty list; produce an empty feature table instead.
    all_representations = torch.empty((0, model.config.hidden_size))

# Convert the feature tensor to a NumPy array.
sequence_representations_np = all_representations.numpy()

# Build a DataFrame with one row per protein, indexed by protein ID.
output_df = pd.DataFrame(sequence_representations_np, index=protein_names)

# Write the features to CSV. The path is defined once so that the file written
# and the path reported below cannot drift apart (the message previously named
# realdata3.csv while the file was written to realdata4.csv).
output_path = '/autodl-fs/data/realdata4.csv'
output_df.to_csv(output_path)

print(f"特征已经保存为 {output_path} 文件")
