In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hub identifier shared by the tokenizer and the model weights.
MODEL_NAME = "zhihan1996/DNABERT-2-117M"

# NOTE(review): trust_remote_code=True is passed to both loaders, which presumably
# lets transformers run code shipped with the hub repo — consider pinning a revision.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['bert.pooler.dense.

In [3]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Put the model on the chosen device
model = model.to(device)

# Tokenize one sample sequence and place every resulting tensor on the model's device
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = {
    name: tensor.to(device)
    for name, tensor in tokenizer(dna, return_tensors='pt').items()
}

# Inference only: skip gradient tracking to save memory
with torch.no_grad():
    hidden_states = model(**inputs)[0]  # [1, sequence_length, 768]

# Max-pool over the token axis of the single sequence in the batch
embedding_max = torch.max(hidden_states[0], dim=0).values
print(embedding_max.shape)  # expect to be 768

# Keep a CPU copy for any further processing off the GPU
embedding_max_cpu = embedding_max.cpu()

Using device: cuda
torch.Size([768])


In [5]:
import pandas as pd
import numpy as np

# Locations of the v3 train / test splits, relative to this notebook
train_csv_path = '../dataset/train_set_v3.csv'
test_csv_path = '../dataset/test_set_v3.csv'

# Read each split into its own DataFrame
training_set = pd.read_csv(train_csv_path)
test_set = pd.read_csv(test_csv_path)

In [4]:
# Summarise the training split: row count, column names, non-null counts and dtypes.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34993 entries, 0 to 34992
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                34993 non-null  object
 1   TF name                34993 non-null  object
 2   TF sequence            34993 non-null  object
 3   binding site sequence  34993 non-null  object
 4   label                  34993 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [6]:
# Summarise the test split: row count, column names, non-null counts and dtypes.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3890 entries, 0 to 3889
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                3890 non-null   object
 1   TF name                3890 non-null   object
 2   TF sequence            3890 non-null   object
 3   binding site sequence  3890 non-null   object
 4   label                  3890 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 152.1+ KB


In [5]:
import torch
from tqdm import tqdm

# Function to generate a pooled embedding for a single DNA sequence
def generate_embedding(sequence, tokenizer, model, device, pooling="max"):
    """Embed one DNA sequence and pool its token states into a single vector.

    Args:
        sequence: DNA string to embed.
        tokenizer: Callable invoked as ``tokenizer(sequence, return_tensors='pt')``
            that returns a mapping of input names to tensors.
        model: Callable invoked as ``model(**inputs)``; element ``[0]`` of its
            result is taken as the hidden states, expected to be shaped
            [1, sequence_length, hidden_size].
        device: Device the tokenized inputs are moved to before the forward
            pass; the model is assumed to already live there.
        pooling: How to reduce over the token axis: ``"max"`` (default, the
            original behaviour) or ``"mean"``.

    Returns:
        A 1-D tensor of length hidden_size. It is not moved off the device the
        model produced it on; callers move it to CPU themselves if needed.

    Raises:
        ValueError: If ``pooling`` is neither ``"max"`` nor ``"mean"``.
    """
    # Reject an unknown pooling mode before paying for tokenization / forward pass.
    if pooling not in ("max", "mean"):
        raise ValueError(f"Unsupported pooling {pooling!r}; expected 'max' or 'mean'")

    inputs = tokenizer(sequence, return_tensors='pt')
    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward pass without gradient tracking (inference only)
    with torch.no_grad():
        hidden_states = model(**inputs)[0]  # [1, sequence_length, hidden_size]

    # Batch size is 1, so index 0 selects the per-token states of this sequence.
    token_states = hidden_states[0]

    if pooling == "mean":
        return torch.mean(token_states, dim=0)

    # torch.max along a dim returns (values, indices); keep only the values.
    embedding_max = torch.max(token_states, dim=0)[0]
    return embedding_max

In [8]:

# Collect one pooled embedding per row of training_set, in row order
embeddings_list = []

# Only the 'binding site sequence' column is needed, so iterate over it directly
# rather than building a full row Series per step with iterrows().
print(f"Generating embeddings for {len(training_set)} sequences...")
for sequence in tqdm(training_set['binding site sequence'], total=len(training_set)):
    embedding = generate_embedding(sequence, tokenizer, model, device)
    embeddings_list.append(embedding.cpu())  # Store on CPU to save GPU memory

# Save the list of embeddings (one tensor per row) to disk
output_path = 'training_set_DNA_embedding_v3.pt'
torch.save(embeddings_list, output_path)
print(f"Embeddings saved to {output_path}")

# Optional: Print some statistics
print(f"Total embeddings generated: {len(embeddings_list)}")

Generating embeddings for 34993 sequences...


  0%|          | 0/34993 [00:00<?, ?it/s]

100%|██████████| 34993/34993 [02:06<00:00, 276.30it/s]


Embeddings saved to training_set_DNA_embedding_v3.pt
Total embeddings generated: 34993


In [9]:
# Collect one pooled embedding per row of test_set, in row order
embeddings_list_test = []

# Only the 'binding site sequence' column is needed, so iterate over it directly
# rather than building a full row Series per step with iterrows().
print(f"Generating embeddings for {len(test_set)} sequences...")
for sequence in tqdm(test_set['binding site sequence'], total=len(test_set)):
    embedding = generate_embedding(sequence, tokenizer, model, device)
    embeddings_list_test.append(embedding.cpu())  # Store on CPU to save GPU memory

# Save the list of embeddings (one tensor per row) to disk
output_path = 'test_set_DNA_embedding_v3.pt'
torch.save(embeddings_list_test, output_path)
print(f"Embeddings saved to {output_path}")

# Optional: Print some statistics
print(f"Total embeddings generated: {len(embeddings_list_test)}")

Generating embeddings for 3890 sequences...


100%|██████████| 3890/3890 [00:13<00:00, 278.15it/s]

Embeddings saved to test_set_DNA_embedding_v3.pt
Total embeddings generated: 3890





In [6]:
import pandas as pd

# Load the v4 training split (the one that includes negative examples)
train_dna = pd.read_csv('../dataset/training_dataset_with_negatives_v4.csv')

# Collect one pooled embedding per row of train_dna, in row order
embeddings_list = []

# Only the 'binding site sequence' column is needed, so iterate over it directly
# rather than building a full row Series per step with iterrows().
print(f"Generating embeddings for {len(train_dna)} sequences...")
for sequence in tqdm(train_dna['binding site sequence'], total=len(train_dna)):
    embedding = generate_embedding(sequence, tokenizer, model, device)
    embeddings_list.append(embedding.cpu())  # Store on CPU to save GPU memory

# Save the list of embeddings (one tensor per row) to disk
output_path = 'training_set_DNA_embedding_v4.pt'
torch.save(embeddings_list, output_path)
print(f"Embeddings saved to {output_path}")

# Optional: Print some statistics
print(f"Total embeddings generated: {len(embeddings_list)}")

Generating embeddings for 35715 sequences...


100%|██████████| 35715/35715 [01:53<00:00, 313.77it/s]


Embeddings saved to training_set_DNA_embedding_v4.pt
Total embeddings generated: 35715


In [7]:
# Load the v4 test split (the one that includes negative examples)
dna_test = pd.read_csv('../dataset/test_dataset_with_negatives_v4.csv')

# Collect one pooled embedding per row of dna_test, in row order
embeddings_list_test = []

# Only the 'binding site sequence' column is needed, so iterate over it directly
# rather than building a full row Series per step with iterrows().
print(f"Generating embeddings for {len(dna_test)} sequences...")
for sequence in tqdm(dna_test['binding site sequence'], total=len(dna_test)):
    embedding = generate_embedding(sequence, tokenizer, model, device)
    embeddings_list_test.append(embedding.cpu())  # Store on CPU to save GPU memory

# Save the list of embeddings (one tensor per row) to disk
output_path = 'test_set_DNA_embedding_v4.pt'
torch.save(embeddings_list_test, output_path)
print(f"Embeddings saved to {output_path}")

# Optional: Print some statistics
print(f"Total embeddings generated: {len(embeddings_list_test)}")

Generating embeddings for 2175 sequences...


100%|██████████| 2175/2175 [00:06<00:00, 316.37it/s]


Embeddings saved to test_set_DNA_embedding_v4.pt
Total embeddings generated: 2175
