In [1]:
import warnings
from tqdm import tqdm

import pandas as pd
import torch
from transformers import AutoTokenizer, BigBirdForMaskedLM

from CodonTransformer.CodonJupyter import (
    UserContainer,
    display_organism_dropdown,
    display_protein_input,
    format_model_output,
)
from CodonTransformer.CodonPrediction import predict_dna_sequence

# Suppress all Python warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the pretrained CodonTransformer tokenizer and BigBird masked-LM weights,
# then move the model onto the device selected above.
tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
model = model.to(DEVICE)

**Optimizing a Single Sequence**
-------------------------------------
1. Run the next code cell and input only your protein sequence and organism

2. Run the code cell after it to optimize the sequence and display it.

Protein sequences should end with "*" or "_" or an amino acid.

In [None]:
# Collect the protein sequence and target organism from the user.
# Sample: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG, Homo sapiens
# `user` is read by the prediction cell below through `user.protein` and `user.organism`.
user = UserContainer()
# NOTE(review): presumably each call renders an input widget that writes into `user` —
# confirm against CodonTransformer.CodonJupyter.
display_protein_input(user)
display_organism_dropdown(user)

In [None]:
# Predict an optimized DNA sequence for the protein and organism entered above.
# deterministic=True is used here; a temperature can be set for non deterministic prediction.
prediction_kwargs = dict(
    protein=user.protein,
    organism=user.organism,
    device=DEVICE,
    tokenizer=tokenizer,
    model=model,
    attention_type="original_full",
    deterministic=True,
)
output = predict_dna_sequence(**prediction_kwargs)

# Format the prediction for reading, then show it.
formatted_output = format_model_output(output)
print(formatted_output)

**Optimizing Multiple Sequences**
-------------------------------------
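1. Create a CSV file that has columns 'protein_sequence' and 'organism'.
   The file is read with `index_col=0`, so its first column is used as the row
   index and must not be one of these two columns (put an ID column first).
   You can have other columns in any order.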

2. Replace the _dataset_path_ below with the actual path to your CSV file.

3. Run the next code cells to optimize and save the predicted DNA sequences.

In [None]:
# Point these at your own input CSV and at the file the predictions should be written to
dataset_path = "demo/sample_dataset.csv"
output_path = "demo/sample_predictions.csv"

# The first CSV column becomes the row index; an empty "predicted_dna" column
# is added to hold the predictions.
dataset = pd.read_csv(dataset_path, index_col=0).assign(predicted_dna=None)
dataset.head()

In [None]:
# Predict an optimized DNA sequence for every row of `dataset` and store it
# in the "predicted_dna" column, showing a progress bar while iterating.
for index, data in tqdm(
    dataset.iterrows(),
    desc="CodonTransformer Predicting",
    unit=" Sequences",
    total=dataset.shape[0],
):
    # Use the same `tokenizer=` / `model=` keywords as the single-sequence
    # call so both cells target the same predict_dna_sequence signature.
    outputs = predict_dna_sequence(
        protein=data["protein_sequence"],
        organism=data["organism"],
        device=DEVICE,
        tokenizer=tokenizer,
        model=model,
    )
    # Written row by row so partial results remain in `dataset` if the loop is interrupted.
    dataset.loc[index, "predicted_dna"] = outputs.predicted_dna

# Save the predictions to disk, then preview the first rows.
dataset.to_csv(output_path)
dataset.head()