In [1]:
import torch
import pandas as pd

import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, BigBirdForMaskedLM

from CodonTransformer.CodonPrediction import predict_dna_sequence
from CodonTransformer.CodonJupyter import (
    UserContainer,
    display_protein_input,
    display_organism_dropdown,
    format_model_output,
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer").to(DEVICE)

**Optimizing a Single Sequence**
-------------------------------------
1. Run the next code cell and input only your protein sequence and organism

2. Run the code cell after it to optimize the sequence and display it.

Protein sequences should end with "*" or "_" or an amino acid.

In [3]:
# Sample: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG, Homo sapiens
user = UserContainer()
display_protein_input(user)
display_organism_dropdown(user)

VBox(children=(HTML(value='<b style="font-size:20px;">Enter Protein Sequence:</b><div style="height:18px;"></d…

HTML(value='\n        <style>\n            .widget-textarea > textarea {\n                font-size: 12px;\n  …

VBox(children=(HTML(value='<b style="font-size:20px;">Select Organism:</b><div style="height:10px;"></div>'), …

In [4]:
output = predict_dna_sequence(
    protein=user.protein,
    organism=user.organism,
    device=DEVICE,
    tokenizer_object=tokenizer,
    model_object=model,
    attention_type="original_full",
)

print(format_model_output(output))

-----------------------------
|          Organism         |
-----------------------------
Homo sapiens

-----------------------------
|       Input Protein       |
-----------------------------
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG

-----------------------------
|      Processed Input      |
-----------------------------
M_UNK A_UNK L_UNK W_UNK M_UNK R_UNK L_UNK L_UNK P_UNK L_UNK L_UNK A_UNK L_UNK L_UNK A_UNK L_UNK W_UNK G_UNK P_UNK D_UNK P_UNK A_UNK A_UNK A_UNK F_UNK V_UNK N_UNK Q_UNK H_UNK L_UNK C_UNK G_UNK S_UNK H_UNK L_UNK V_UNK E_UNK A_UNK L_UNK Y_UNK L_UNK V_UNK C_UNK G_UNK E_UNK R_UNK G_UNK F_UNK F_UNK Y_UNK T_UNK P_UNK K_UNK T_UNK R_UNK R_UNK E_UNK A_UNK E_UNK D_UNK L_UNK Q_UNK V_UNK G_UNK Q_UNK V_UNK E_UNK L_UNK G_UNK G_UNK __UNK

-----------------------------
|       Predicted DNA       |
-----------------------------
ATGGCCCTGTGGATGAGGCTGCTGCCCCTGCTGGCCCTGCTGGCCCTGTGGGGGCCTGACCCAGCTGCCGCCTTTGTGAACCAGCACCTGTGTGGCAGCCACCTGGTGGAGGCCCTGTACCTGGTGT

**Optimizing Multiple Sequences**
-------------------------------------
1. Create a CSV file that has columns 'protein_sequence' and 'organism'.
   You can have other columns in any order.

2. Replace the _dataset_path_ below with the actual path to your CSV file.

3. Run the next code cells to optimize and save the predicted DNA sequences.

In [5]:
# Update with the actual path to your dataset
dataset_path = "demo/sample_dataset.csv"
output_path = "demo/sample_predictions.csv"

dataset = pd.read_csv(dataset_path, index_col=0)
dataset["predicted_dna"] = None
dataset.head()

Unnamed: 0,protein_sequence,organism,predicted_dna
0,MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...,Escherichia coli general,
1,MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...,Escherichia coli general,
2,MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...,Homo sapiens,
3,MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...,Homo sapiens,
4,MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...,Arabidopsis thaliana,


In [6]:
for index, data in tqdm(
    dataset.iterrows(),
    desc=f"CodonTransformer Predicting",
    unit=" Sequences",
    total=dataset.shape[0],
):

    predicted_dna = predict_dna_sequence(
        protein=data["protein_sequence"],
        organism=data["organism"],
        device=DEVICE,
        tokenizer_object=tokenizer,
        model_object=model,
    )
    dataset.loc[index, "predicted_dna"] = predicted_dna

dataset.to_csv(output_path)
dataset.head()

CodonTransformer Predicting: 100%|██████████| 5/5 [00:00<00:00, 17.00 Sequences/s]


Unnamed: 0,protein_sequence,organism,predicted_dna
0,MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...,Escherichia coli general,DNASequencePrediction(organism='Escherichia co...
1,MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...,Escherichia coli general,DNASequencePrediction(organism='Escherichia co...
2,MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...,Homo sapiens,"DNASequencePrediction(organism='Homo sapiens',..."
3,MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...,Homo sapiens,"DNASequencePrediction(organism='Homo sapiens',..."
4,MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...,Arabidopsis thaliana,DNASequencePrediction(organism='Arabidopsis th...
