In [1]:
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError

from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import torch

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes (slowly) on machines without CUDA instead of
# raising when the model is moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ESMC.from_pretrained("esmc_600m").to(device)

def embed_sequence(model, sequence):
    """Run a single sequence through ``model`` and return its logits output.

    Args:
        model: Object exposing ``encode`` and ``logits`` methods.
        sequence: Sequence value handed to ``ESMProtein(sequence=...)``.

    Returns:
        The value returned by ``model.logits`` for the encoded protein, with
        sequence logits, embeddings and hidden states all requested.
    """
    encoded = model.encode(ESMProtein(sequence=sequence))
    logits_config = LogitsConfig(
        sequence=True,
        return_embeddings=True,
        return_hidden_states=True,
    )
    return model.logits(encoded, logits_config)

def batch_embed(model, inputs, max_workers=1):
    """Embed every sequence in ``inputs`` and return the results in input order.

    Each sequence is submitted to a thread pool as an ``embed_sequence`` call.
    A failure on one sequence does not abort the batch: the exception is
    replaced by an ``ESMProteinError(500, <message>)`` placeholder at the same
    position, so ``results[i]`` always corresponds to the i-th input.

    Args:
        model: Model forwarded unchanged to ``embed_sequence``.
        inputs: Iterable of sequences (e.g. a list or a pandas Series).
        max_workers: Number of worker threads. Defaults to 1, which matches
            the previously hard-coded value (one sequence at a time).

    Returns:
        list: One entry per input, either the ``embed_sequence`` result or an
        ``ESMProteinError`` when that sequence raised.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front; reading the futures back in submission
        # order keeps the results aligned with `inputs` whatever the finish order.
        futures = [
            executor.submit(embed_sequence, model, sequence) for sequence in inputs
        ]
        results = []
        for future in futures:
            try:
                results.append(future.result())
            except Exception as exc:
                # Keep going; callers must check for ESMProteinError entries.
                results.append(ESMProteinError(500, str(exc)))

    return results

In [3]:
# Load the input table; its "AA" column is what gets embedded further down.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

In [4]:
# Embed every entry of the "AA" column; results come back in row order, with
# an ESMProteinError placeholder wherever a sequence failed.
outputs = batch_embed(model=model, inputs=df["AA"])

In [7]:
# batch_embed swaps any failed sequence for an ESMProteinError placeholder
# rather than a model output. Stop here with the offending positions instead
# of failing on the attribute access below with an opaque AttributeError.
failed_positions = [
    position
    for position, output in enumerate(outputs)
    if isinstance(output, ESMProteinError)
]
if failed_positions:
    raise RuntimeError(
        f"{len(failed_positions)} sequence(s) failed to embed; "
        f"first failing positions in `outputs`: {failed_positions[:10]}"
    )

seq_esmc_feat = [output.embeddings for output in outputs]

In [11]:
# Drop the first and last position along dim 1 of every embedding
# (presumably the special start/end tokens added during encoding; confirm).
# Built from `outputs` rather than from `seq_esmc_feat` itself, so re-running
# this cell does not trim two more positions each time.
seq_esmc_feat = [output.embeddings[:, 1:-1, :] for output in outputs]

In [16]:
# Persist the list of per-sequence embeddings to disk.
torch.save(obj=seq_esmc_feat, f="seq_esmc_feat.pt")