Simple usage:

In [5]:
from SynCodonLM import CodonEmbeddings

# The species is chosen through the token type ID; 67 selects E. coli.
species_id = 67
seq = 'ATGTCCACCGGGCGGTGA'
model = CodonEmbeddings()

# A single pooled vector for the whole sequence.
mean_embedding = model.get_mean_embedding(seq, species_token_type=species_id)
print(mean_embedding.shape)

# The full model output: handle it like a typical Hugging Face output object,
# where the last entry of `hidden_states` holds the final-layer embeddings.
raw_embedding_final_layer = model.get_raw_embeddings(seq, species_token_type=species_id)
final_layer_hidden = raw_embedding_final_layer.hidden_states[-1]
print(final_layer_hidden.shape)

torch.Size([768])
torch.Size([1, 8, 768])


Manual Usage
Prepare Sequence

In [None]:
from SynCodonLM import clean_split_sequence

# Coding sequence as one contiguous nucleotide string.
raw_seq = 'ATGTCCACCGGGCGGTGA'
# The helper returns the codons separated by spaces: 'ATG TCC ACC GGG CGG TGA'
seq = clean_split_sequence(raw_seq)

Load Model & Tokenizer from Hugging Face

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
import torch

# Hugging Face Hub repository that hosts the tokenizer, config and weights.
MODEL_ID = "jheuschkel/SynCodonLM"

config = AutoConfig.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, config=config)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

If there are networking issues, you can manually download the model files from Hugging Face and place them in a `./SynCodonLM` directory (relative to this notebook), then load them from that local path:

In [None]:
# Offline alternative: uncomment to load from a local copy in ./SynCodonLM.
# Uses the same classes that the loading cell above imports from transformers.
# tokenizer = AutoTokenizer.from_pretrained("./SynCodonLM", trust_remote_code=True)
# config = AutoConfig.from_pretrained("./SynCodonLM", trust_remote_code=True)
# model = AutoModelForMaskedLM.from_pretrained("./SynCodonLM", trust_remote_code=True, config=config)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

Tokenize Input Sequences, Set Token Type ID Based on Species ID found here

In [None]:
# Species selector passed to the model as the token type ID (67 is E. coli).
token_type_id = 67

encoded = tokenizer(seq, return_tensors="pt")
inputs = encoded.to(device)
# Replace the tokenizer's token_type_ids so every position carries the species ID.
inputs['token_type_ids'] = torch.full_like(inputs['input_ids'], fill_value=token_type_id)


Gather Model Outputs

In [None]:
# Inference only: run the forward pass without autograd so that no graph is
# built and the returned tensors do not track gradients.
# output_hidden_states=True makes `outputs.hidden_states` available.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

Get Mean Embedding from Final Layer

In [None]:
# Last entry of hidden_states; a different index selects a different layer
# (the original note gives the range as 0-11).
# NOTE(review): Hugging Face models usually place the embedding output at
# index 0, ahead of the per-layer outputs - confirm the valid range for this model.
embedding = outputs.hidden_states[-1]
# Average over the token axis, then drop the leading batch axis when it has size 1.
token_mean = embedding.mean(dim=1)
mean_embedding = token_mean.squeeze(0)

You Can Also View Language Head Output

In [None]:
# Language-model head output, shape: [batch_size, sequence_length, vocab_size].
# Hugging Face output objects also accept dictionary-style access by field name.
logits = outputs["logits"]

Usage With Batches

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
import torch
from SynCodonLM import clean_split_sequence

tokenizer = AutoTokenizer.from_pretrained("jheuschkel/SynCodonLM")
config = AutoConfig.from_pretrained("jheuschkel/SynCodonLM")
model = AutoModelForMaskedLM.from_pretrained("jheuschkel/SynCodonLM", config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of sequences
seqs = [
    'ATGTCCACCGGGCGGTGA',
    'ATGCGTACCGGGTAGTGA',
    'ATGTTTACCGGGTGGTGA'
]

# List of token type ids (species): one entry per sequence, in the same order
species_token_type_ids = [
    67,   # E. coli
    394,  # C. griseus
    317   # H. sapiens
]

# Prepare list (split each sequence into space-separated codons)
seqs = [clean_split_sequence(s) for s in seqs]

# Tokenize batch with padding
inputs = tokenizer(seqs, return_tensors="pt", padding=True).to(device)

batch_size, seq_len = inputs['input_ids'].shape

# Fail early on a mismatch instead of silently leaving rows at species id 0
# (too few ids) or hitting an out-of-range index (too many ids).
if len(species_token_type_ids) != batch_size:
    raise ValueError(
        f"Expected {batch_size} species token type ids (one per sequence), "
        f"got {len(species_token_type_ids)}"
    )

# Create token_type_ids directly on the target device: each row is filled
# with the species id of the corresponding sequence.
species_ids = torch.tensor(species_token_type_ids, dtype=torch.long, device=device)
token_type_ids = species_ids.unsqueeze(1).repeat(1, seq_len)

# Add to inputs
inputs['token_type_ids'] = token_type_ids

# Run model (inference only, so autograd is disabled). Hidden states are
# requested so that embeddings can be read from `outputs.hidden_states`,
# alongside `outputs.logits`.
# NOTE: when sequences differ in length, padded positions should be excluded
# (e.g. with inputs['attention_mask']) before averaging hidden states.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)