# In [None]:
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import torch

# Compute one fixed-size embedding per protein sequence with a locally stored
# Hugging Face checkpoint and write the embeddings to a CSV file whose index
# is the protein ID.

# Model weights and tokenizer are loaded from the same local directory.
model_dir = '/autodl-fs/data/ESM2-3B'
model = AutoModel.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Inference only: put the model in evaluation mode.
model.eval()

device = torch.device("cpu")
model.to(device)

# The input table must provide a 'Sequence' and a 'Protein_ID' column.
df = pd.read_csv('../Data/data1.csv')

sequences = df['Sequence'].tolist()
protein_names = df['Protein_ID'].tolist()

batch_size = 1
all_representations = []

for i in range(0, len(sequences), batch_size):
    batch_sequences = sequences[i:i + batch_size]

    encodings = tokenizer(batch_sequences, return_tensors='pt', padding=True, truncation=True)
    encodings = {key: val.to(device) for key, val in encodings.items()}

    # Gradients are never needed here; skipping them saves memory.
    with torch.no_grad():
        outputs = model(**encodings)

    # Average over real tokens only. A plain mean over dim=1 would also average
    # the padding positions that padding=True adds to shorter sequences, which
    # distorts the embeddings as soon as batch_size > 1. With batch_size == 1
    # no padding is added, so this equals the plain mean.
    hidden_states = outputs.last_hidden_state
    token_mask = encodings['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    sequence_representations = (hidden_states * token_mask).sum(dim=1) / token_counts  # [batch_size, hidden_size]

    all_representations.append(sequence_representations.cpu())

# Stack the per-batch results into a single [num_sequences, hidden_size] tensor.
all_representations = torch.cat(all_representations, dim=0)

sequence_representations_np = all_representations.numpy()

# One row per protein, labelled by its ID; one column per embedding dimension.
output_df = pd.DataFrame(sequence_representations_np, index=protein_names)

output_path = '/autodl-fs/data/realdata4.csv'
output_df.to_csv(output_path)

# Report the path that was actually written (the old message named a different file).
print(f"特征已经保存为 {output_path} 文件")
