In [None]:
import torch
import pandas as pd

import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, BigBirdForMaskedLM

from CodonTransformer.CodonPrediction import predict_dna_sequence
from CodonTransformer.CodonUtils import load_python_object_from_disk, load_pkl_from_url
from CodonTransformer.CodonJupyter import (
    UserContainer,
    display_protein_sequence_input,
    display_organism_dropdown,
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ORGANISM2ID is the lookup used below to turn an organism name into the id
# passed to the model. It is read from the local copy when that file is
# present and fetched from the repository URL otherwise, so only one of the
# two sources has to be reachable.
# NOTE(review): both sources appear to be pickle files; unpickling can execute
# arbitrary code, so only point these at locations you trust.
ORGANISM_REFERENCE_URL = "https://github.com/Adibvafa/CodonTransformer/blob/main/src/organism2id.pkl?raw=true"
ORGANISM_REFERENCE = "src/organism2id.pkl"
try:
    ORGANISM2ID = load_python_object_from_disk(ORGANISM_REFERENCE)
except FileNotFoundError:
    # Assumes the helper surfaces a missing file as FileNotFoundError
    # (i.e. it opens the path directly) -- TODO confirm in CodonUtils.
    ORGANISM2ID = load_pkl_from_url(ORGANISM_REFERENCE_URL)

In [None]:
# Fetch the pretrained CodonTransformer tokenizer and masked-LM weights,
# then place the model on the device selected in the setup cell.
tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
model = model.to(DEVICE)

**Optimizing a Single Sequence**
-------------------------------------
1. Run the next code cell and input only your protein sequence and organism

2. Run the code cell after it to optimize the sequence and display it.

A protein sequence may end with a stop symbol ("*" or "_") or with an amino acid.

In [None]:
# Sample: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG, Homo sapiens
# `user` is handed to both input helpers below; the prediction cell later reads
# user.protein_sequence, user.organism and user.organism_id from it.
user = UserContainer()
# Presumably renders a text input for the protein sequence -- confirm in CodonJupyter.
display_protein_sequence_input(user)
# Presumably renders a dropdown of the organisms in ORGANISM2ID -- confirm in CodonJupyter.
display_organism_dropdown(ORGANISM2ID, user)

In [None]:
# Optimize the sequence entered above for the selected organism and keep the
# result on the same container that holds the inputs.
user.predicted_dna = predict_dna_sequence(
    protein=user.protein_sequence,
    organism_id=user.organism_id,
    device=DEVICE,
    tokenizer_object=tokenizer,
    model_object=model,
    attention_type="original_full",
)

# Report: one banner-titled section per value, sections separated by two
# blank lines, with a trailing newline after the last section.
print(
    "\n\n\n".join(
        f"{'-' * 5}|{title}|{'-' * 5}\n{value}"
        for title, value in (
            ("    Organism     ", user.organism),
            ("  Predicted DNA  ", user.predicted_dna),
            ("  Input Protein  ", user.protein_sequence),
        )
    )
    + "\n"
)

**Optimizing Multiple Sequences**
-------------------------------------
1. Create a CSV file that has columns 'protein_sequence' and 'organism'.
   The first column of the file is read as the row index, so put an index/ID column first;
   any other columns may follow in any order.

2. Replace the _dataset_path_ below with the actual path to your CSV file.

3. Run the next code cells to optimize and save the predicted DNA sequences.

In [None]:
# Update with the actual path to your dataset
dataset_path = "demo/sample_dataset.csv"
output_path = "demo/sample_predictions.csv"

# index_col=0: the first CSV column becomes the row index and is therefore
# not available as a data column.
dataset = pd.read_csv(dataset_path, index_col=0)

# Fail here with a clear message rather than with a bare KeyError later in
# the prediction loop, which reads exactly these two columns.
missing_columns = {"protein_sequence", "organism"} - set(dataset.columns)
if missing_columns:
    raise ValueError(
        f"{dataset_path} is missing required column(s) {sorted(missing_columns)}; "
        f"found columns {list(dataset.columns)}. Note that the first CSV column "
        "is read as the index."
    )

# Placeholder column, filled in by the prediction loop.
dataset["predicted_dna"] = None
dataset.head()

In [None]:
# Predict a DNA sequence for every row of `dataset`, store the results in the
# "predicted_dna" column and write the whole frame to `output_path`.
#
# Predictions are collected in a list and assigned to the column in one step.
# The assignment is positional, so it stays correct even when the index read
# from the CSV contains duplicate labels (a label-based
# `dataset.loc[index, ...] = ...` would write to every row sharing that label).
predictions = []
for _, data in tqdm(
    dataset.iterrows(),
    desc="Model Predicting",
    unit=" Sequences",
    total=dataset.shape[0],
):
    organism = data["organism"]
    if organism not in ORGANISM2ID:
        # Same exception type as the plain lookup would raise, but it names
        # the offending value so the CSV row can be found and corrected.
        raise KeyError(
            f"Unknown organism {organism!r} in dataset; "
            "it must match one of the keys of ORGANISM2ID."
        )

    predictions.append(
        predict_dna_sequence(
            protein=data["protein_sequence"],
            organism_id=ORGANISM2ID[organism],
            device=DEVICE,
            tokenizer_object=tokenizer,
            model_object=model,
        )
    )

dataset["predicted_dna"] = predictions

dataset.to_csv(output_path)
dataset.head()