In [1]:
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig
import torch
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Smoke test: push one hard-coded sequence through ESM-C and print the raw
# logits and embeddings it returns.
protein = ESMProtein(sequence="MKSIGVVRKVDELGRIVMPIELRRALDIAIKDSIEFFVDGDKIILKKYKPHGVCLMTGEITSENKEYGNGKITLSPEGAQLLLEEIQAALKE")

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# cell no longer fails inside .to("cuda") on a machine without CUDA.
client = ESMC.from_pretrained("esmc_300m").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Tokenize the sequence, then ask for sequence logits plus embeddings.
protein_tensor = client.encode(protein)
logits_output = client.logits(
   protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
)
print(logits_output.logits, logits_output.embeddings)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 102927.71it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ForwardTrackData(sequence=tensor([[[-36.5000, -36.5000, -36.5000,  ..., -36.5000, -36.5000, -36.5000],
         [-39.2500, -39.0000, -39.2500,  ..., -39.2500, -39.0000, -39.0000],
         [-31.0000, -31.0000, -31.0000,  ..., -31.0000, -31.0000, -31.0000],
         ...,
         [-35.7500, -35.7500, -35.7500,  ..., -35.7500, -35.7500, -35.7500],
         [-36.0000, -36.0000, -36.0000,  ..., -36.0000, -36.2500, -36.0000],
         [-32.5000, -32.5000, -32.5000,  ..., -32.5000, -32.5000, -32.5000]]],
       device='cuda:0', dtype=torch.bfloat16), structure=None, secondary_structure=None, sasa=None, function=None) tensor([[[-0.0020,  0.0091,  0.0063,  ...,  0.0116, -0.0080, -0.0049],
         [-0.0277,  0.0271,  0.0293,  ...,  0.0253,  0.0121,  0.0194],
         [ 0.0055,  0.0543, -0.0269,  ...,  0.0392, -0.0025,  0.0052],
         ...,
         [-0.0119, -0.0165, -0.0182,  ...,  0.0134, -0.0242,  0.0235],
         [-0.0354,  0.0206, -0.0019,  ...,  0.0205, -0.0475, -0.0042],
         [-0

In [3]:
# Average the embeddings over their second-to-last axis and drop any size-1
# axes (presumably the per-token axis and the batch axis — TODO confirm).
test = logits_output.embeddings.mean(dim=-2).squeeze()

In [5]:
# Read the v3 training CSV from ../dataset (relative to the working directory)
# and print the frame's column / dtype / non-null summary.
protein_df = pd.read_csv('../dataset/train_set_v3.csv')
protein_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34993 entries, 0 to 34992
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                34993 non-null  object
 1   TF name                34993 non-null  object
 2   TF sequence            34993 non-null  object
 3   binding site sequence  34993 non-null  object
 4   label                  34993 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [6]:
# Build one pooled embedding per row of protein_df (in row order), stack them
# into a single tensor and save it to 'training_set_tf_embedding_v3.pt'.
all_embeddings = []

# Placeholder column kept from the original cell; nothing below fills it in.
protein_df['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(protein_df)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(protein_df['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    all_embeddings.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor = torch.stack(all_embeddings)

# Save the embeddings
torch.save(all_embeddings_tensor, 'training_set_tf_embedding_v3.pt')

print(f"All {len(all_embeddings)} embeddings processed and saved to training_set_tf_embedding_v3.pt")

Processed 100/34993 proteins
Processed 200/34993 proteins
Processed 300/34993 proteins
Processed 400/34993 proteins
Processed 500/34993 proteins
Processed 600/34993 proteins
Processed 700/34993 proteins
Processed 800/34993 proteins
Processed 900/34993 proteins
Processed 1000/34993 proteins
Processed 1100/34993 proteins
Processed 1200/34993 proteins
Processed 1300/34993 proteins
Processed 1400/34993 proteins
Processed 1500/34993 proteins
Processed 1600/34993 proteins
Processed 1700/34993 proteins
Processed 1800/34993 proteins
Processed 1900/34993 proteins
Processed 2000/34993 proteins
Processed 2100/34993 proteins
Processed 2200/34993 proteins
Processed 2300/34993 proteins
Processed 2400/34993 proteins
Processed 2500/34993 proteins
Processed 2600/34993 proteins
Processed 2700/34993 proteins
Processed 2800/34993 proteins
Processed 2900/34993 proteins
Processed 3000/34993 proteins
Processed 3100/34993 proteins
Processed 3200/34993 proteins
Processed 3300/34993 proteins
Processed 3400/3499

In [6]:
# Read the v3 test CSV from ../dataset (relative to the working directory)
# and print the frame's column / dtype / non-null summary.
test_df = pd.read_csv('../dataset/test_set_v3.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3890 entries, 0 to 3889
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                3890 non-null   object
 1   TF name                3890 non-null   object
 2   TF sequence            3890 non-null   object
 3   binding site sequence  3890 non-null   object
 4   label                  3890 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 152.1+ KB


In [7]:
# Build one pooled embedding per row of test_df (in row order), stack them
# into a single tensor and save it to 'test_set_tf_embedding_v3.pt'.
all_embeddings_test = []

# Placeholder column kept from the original cell; nothing below fills it in.
test_df['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(test_df)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(test_df['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    all_embeddings_test.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor_test = torch.stack(all_embeddings_test)

# Save the embeddings
torch.save(all_embeddings_tensor_test, 'test_set_tf_embedding_v3.pt')

print(f"All {len(all_embeddings_test)} embeddings processed and saved to test_set_tf_embedding_v3.pt")

Processed 100/3890 proteins
Processed 200/3890 proteins
Processed 300/3890 proteins
Processed 400/3890 proteins
Processed 500/3890 proteins
Processed 600/3890 proteins
Processed 700/3890 proteins
Processed 800/3890 proteins
Processed 900/3890 proteins
Processed 1000/3890 proteins
Processed 1100/3890 proteins
Processed 1200/3890 proteins
Processed 1300/3890 proteins
Processed 1400/3890 proteins
Processed 1500/3890 proteins
Processed 1600/3890 proteins
Processed 1700/3890 proteins
Processed 1800/3890 proteins
Processed 1900/3890 proteins
Processed 2000/3890 proteins
Processed 2100/3890 proteins
Processed 2200/3890 proteins
Processed 2300/3890 proteins
Processed 2400/3890 proteins
Processed 2500/3890 proteins
Processed 2600/3890 proteins
Processed 2700/3890 proteins
Processed 2800/3890 proteins
Processed 2900/3890 proteins
Processed 3000/3890 proteins
Processed 3100/3890 proteins
Processed 3200/3890 proteins
Processed 3300/3890 proteins
Processed 3400/3890 proteins
Processed 3500/3890 pro

In [6]:
# Read the v4 training and test CSVs (the "with_negatives" files) from
# ../dataset, relative to the working directory.
train_df_v4 = pd.read_csv('../dataset/training_dataset_with_negatives_v4.csv')
test_df_v4 = pd.read_csv('../dataset/test_dataset_with_negatives_v4.csv')

In [7]:
# Build one pooled embedding per row of train_df_v4 (in row order), stack them
# into a single tensor and save it to 'training_set_tf_embedding_v4.pt'.
train_embeddings = []

# Placeholder column kept from the original cell; nothing below fills it in.
train_df_v4['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(train_df_v4)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(train_df_v4['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    train_embeddings.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor = torch.stack(train_embeddings)

# Save the embeddings
torch.save(all_embeddings_tensor, 'training_set_tf_embedding_v4.pt')

print(f"All {len(train_embeddings)} embeddings processed and saved to training_set_tf_embedding_v4.pt")

Processed 100/35715 proteins
Processed 200/35715 proteins
Processed 300/35715 proteins
Processed 400/35715 proteins
Processed 500/35715 proteins
Processed 600/35715 proteins
Processed 700/35715 proteins
Processed 800/35715 proteins
Processed 900/35715 proteins
Processed 1000/35715 proteins
Processed 1100/35715 proteins
Processed 1200/35715 proteins
Processed 1300/35715 proteins
Processed 1400/35715 proteins
Processed 1500/35715 proteins
Processed 1600/35715 proteins
Processed 1700/35715 proteins
Processed 1800/35715 proteins
Processed 1900/35715 proteins
Processed 2000/35715 proteins
Processed 2100/35715 proteins
Processed 2200/35715 proteins
Processed 2300/35715 proteins
Processed 2400/35715 proteins
Processed 2500/35715 proteins
Processed 2600/35715 proteins
Processed 2700/35715 proteins
Processed 2800/35715 proteins
Processed 2900/35715 proteins
Processed 3000/35715 proteins
Processed 3100/35715 proteins
Processed 3200/35715 proteins
Processed 3300/35715 proteins
Processed 3400/3571

In [8]:
# Build one pooled embedding per row of test_df_v4 (in row order), stack them
# into a single tensor and save it to 'test_set_tf_embedding_v4.pt'.
all_embeddings_test = []

# Placeholder column kept from the original cell; nothing below fills it in.
test_df_v4['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(test_df_v4)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(test_df_v4['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    all_embeddings_test.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor_test = torch.stack(all_embeddings_test)

# Save the embeddings
torch.save(all_embeddings_tensor_test, 'test_set_tf_embedding_v4.pt')

print(f"All {len(all_embeddings_test)} embeddings processed and saved to test_set_tf_embedding_v4.pt")

Processed 100/2175 proteins
Processed 200/2175 proteins
Processed 300/2175 proteins
Processed 400/2175 proteins
Processed 500/2175 proteins
Processed 600/2175 proteins
Processed 700/2175 proteins
Processed 800/2175 proteins
Processed 900/2175 proteins
Processed 1000/2175 proteins
Processed 1100/2175 proteins
Processed 1200/2175 proteins
Processed 1300/2175 proteins
Processed 1400/2175 proteins
Processed 1500/2175 proteins
Processed 1600/2175 proteins
Processed 1700/2175 proteins
Processed 1800/2175 proteins
Processed 1900/2175 proteins
Processed 2000/2175 proteins
Processed 2100/2175 proteins
All 2175 embeddings processed and saved to test_set_tf_embedding_v4.pt


In [4]:
# Read the "6070" training and testing CSVs (the "with_negatives" files) from
# ../dataset, relative to the working directory.
train_df_6070 = pd.read_csv('../dataset/training_dataset_with_negatives_6070.csv')
test_df_6070 = pd.read_csv('../dataset/testing_dataset_with_negatives_6070.csv')

In [5]:
# Build one pooled embedding per row of train_df_6070 (in row order), stack
# them into a single tensor and save it to 'training_set_tf_embedding_6070.pt'.
train_embeddings = []

# Placeholder column kept from the original cell; nothing below fills it in.
train_df_6070['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(train_df_6070)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(train_df_6070['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    train_embeddings.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor = torch.stack(train_embeddings)

# Save the embeddings
torch.save(all_embeddings_tensor, 'training_set_tf_embedding_6070.pt')

print(f"All {len(train_embeddings)} embeddings processed and saved to training_set_tf_embedding_6070.pt")

Processed 100/11043 proteins
Processed 200/11043 proteins
Processed 300/11043 proteins
Processed 400/11043 proteins
Processed 500/11043 proteins
Processed 600/11043 proteins
Processed 700/11043 proteins
Processed 800/11043 proteins
Processed 900/11043 proteins
Processed 1000/11043 proteins
Processed 1100/11043 proteins
Processed 1200/11043 proteins
Processed 1300/11043 proteins
Processed 1400/11043 proteins
Processed 1500/11043 proteins
Processed 1600/11043 proteins
Processed 1700/11043 proteins
Processed 1800/11043 proteins
Processed 1900/11043 proteins
Processed 2000/11043 proteins
Processed 2100/11043 proteins
Processed 2200/11043 proteins
Processed 2300/11043 proteins
Processed 2400/11043 proteins
Processed 2500/11043 proteins
Processed 2600/11043 proteins
Processed 2700/11043 proteins
Processed 2800/11043 proteins
Processed 2900/11043 proteins
Processed 3000/11043 proteins
Processed 3100/11043 proteins
Processed 3200/11043 proteins
Processed 3300/11043 proteins
Processed 3400/1104

In [6]:
# Build one pooled embedding per row of test_df_6070 (in row order), stack
# them into a single tensor and save it to 'test_set_tf_embedding_6070.pt'.
all_embeddings_test = []

# Placeholder column kept from the original cell; nothing below fills it in.
test_df_6070['embedding'] = None

# Cache keyed by sequence string: rows that repeat a TF sequence reuse the
# embedding from its first occurrence instead of re-running the model.
embedding_by_sequence = {}
total_rows = len(test_df_6070)

# Iterate the column directly and count rows positionally, so the progress
# message does not depend on the DataFrame's index labels.
for row_number, tf_sequence in enumerate(test_df_6070['TF sequence'], start=1):
    embedding = embedding_by_sequence.get(tf_sequence)
    if embedding is None:
        # Wrap the raw sequence and tokenize it for the model
        protein = ESMProtein(sequence=tf_sequence)
        protein_tensor = client.encode(protein)

        # Request embeddings alongside the sequence logits
        logits_output = client.logits(
            protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
        )

        # Average over the second-to-last axis, then drop size-1 axes
        # (presumably tokens and batch respectively — TODO confirm)
        embedding = torch.mean(logits_output.embeddings, dim=-2).squeeze()
        embedding_by_sequence[tf_sequence] = embedding

    all_embeddings_test.append(embedding)

    # Print progress every 100 proteins
    if row_number % 100 == 0:
        print(f"Processed {row_number}/{total_rows} proteins")

# torch.stack copies, so cached tensors shared between rows are safe here.
# NOTE(review): the saved tensor stays on whatever device the model produced
# it on; loading elsewhere may need torch.load(..., map_location=...).
all_embeddings_tensor_test = torch.stack(all_embeddings_test)

# Save the embeddings
torch.save(all_embeddings_tensor_test, 'test_set_tf_embedding_6070.pt')

print(f"All {len(all_embeddings_test)} embeddings processed and saved to test_set_tf_embedding_6070.pt")

Processed 100/1237 proteins
Processed 200/1237 proteins
Processed 300/1237 proteins
Processed 400/1237 proteins
Processed 500/1237 proteins
Processed 600/1237 proteins
Processed 700/1237 proteins
Processed 800/1237 proteins
Processed 900/1237 proteins
Processed 1000/1237 proteins
Processed 1100/1237 proteins
Processed 1200/1237 proteins
All 1237 embeddings processed and saved to test_set_tf_embedding_6070.pt
