In [1]:
# Clone the ProteinBERT repository
!git clone https://github.com/nadavbra/protein_bert.git

Cloning into 'protein_bert'...
remote: Enumerating objects: 249, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 249 (delta 83), reused 61 (delta 61), pack-reused 152 (from 1)[K
Receiving objects: 100% (249/249), 23.44 MiB | 19.43 MiB/s, done.
Resolving deltas: 100% (124/124), done.


In [2]:
%cd protein_bert

/content/protein_bert


In [3]:
# Initialize and update the submodules
!git submodule init
!git submodule update

Submodule 'proteinbert/shared_utils' (https://github.com/nadavbra/shared_utils.git) registered for path 'proteinbert/shared_utils'
Cloning into '/content/protein_bert/proteinbert/shared_utils'...
Submodule path 'proteinbert/shared_utils': checked out 'dc1c62a1754c51f6d46b7486e4a3e5e62c0570e1'


In [4]:
# Install ProteinBERT and dependencies
!python setup.py install

running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
running bdist_egg
running egg_info
creating protein_bert.egg-info
writing protein_bert.egg-info/PKG-

In [None]:
from proteinbert import load_pretrained_model

model, input_encoder = load_pretrained_model()

In [5]:
!pip install biopython tensorflow

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [8]:
import sys
import numpy as np
from Bio import SeqIO  # For parsing FASTA files

In [9]:
# Add the path to ProteinBERT code
sys.path.append("/content/protein_bert")

# Import modules from ProteinBERT
from proteinbert.existing_model_loading import load_pretrained_model
from proteinbert.tokenization import tokenize_seq, additional_token_to_index
from proteinbert.conv_and_global_attention_model import create_model

# Step 1: Load the pre-trained ProteinBERT model
def load_model(seq_len, vocab_size, n_annotations, model_path):
    model_generator, input_encoder = load_pretrained_model(
        local_model_dump_dir="/content",
        local_model_dump_file_name=model_path
    )
    model = create_model(seq_len=seq_len, vocab_size=vocab_size, n_annotations=n_annotations)
    return model, input_encoder

# Step 2: Tokenize the sequences
def tokenize_sequences(sequences, seq_len):
    tokenized = []
    pad_token = additional_token_to_index["<PAD>"]
    for seq in sequences:
        tokens = tokenize_seq(seq)
        # Ensure all sequences are of length `seq_len` (pad or truncate)
        if len(tokens) > seq_len:
            tokens = tokens[:seq_len]
        else:
            tokens.extend([pad_token] * (seq_len - len(tokens)))
        tokenized.append(tokens)
    return np.array(tokenized)

# Step 3: Predict sequence probabilities
def get_sequence_probabilities(model, tokenized_sequences):
    input_annotations = np.zeros((len(tokenized_sequences), model.input_shape[1][1]))  # Dummy annotations
    predictions = model.predict([tokenized_sequences, input_annotations])
    seq_probabilities = predictions[0]  # Assuming first output corresponds to sequence probabilities
    return seq_probabilities

# Step 4: Calculate naturalness scores
def calculate_naturalness(seq_probs):
    # Sum log probabilities as naturalness score for each sequence
    return [np.sum(np.log(prob + 1e-8)) for prob in seq_probs]  # Add small epsilon for numerical stability

# Main function
def find_most_natural_sequence(fasta_path, model_path, seq_len=512, vocab_size=28, n_annotations=0):
    model, input_encoder = load_model(seq_len, vocab_size, n_annotations, model_path)

    # Parse sequences from FASTA
    sequences = [str(record.seq) for record in SeqIO.parse(fasta_path, "fasta")]

    tokenized_sequences = tokenize_sequences(sequences, seq_len)
    seq_probs = get_sequence_probabilities(model, tokenized_sequences)
    naturalness_scores = calculate_naturalness(seq_probs)

    # Find the sequence with the highest score
    most_natural_sequence_idx = np.argmax(naturalness_scores)
    most_natural_sequence = sequences[most_natural_sequence_idx]
    return most_natural_sequence, naturalness_scores

if __name__ == "__main__":
    # File paths
    fasta_path = "/content/top_10000_lowest_instability_sequences2.fasta"
    model_path = "/content/epoch_92400_sample_23500000.pkl"

    # Parameters
    seq_len = 512  # Adjust based on the model's max sequence length
    vocab_size = 28  # 20 amino acids + additional tokens
    n_annotations = 0  # Assuming no annotations for naturalness scoring

    # Find the most natural sequence
    most_natural_sequence, scores = find_most_natural_sequence(fasta_path, model_path, seq_len, vocab_size, n_annotations)
    print("Most natural sequence:", most_natural_sequence)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 2s/step
Most natural sequence: mmrkksfwfgmltalmlvftmefsvsasaaqpgknvekdyivgfksgvktasikkdiikesggkvdkqfriinagkakldkealkevkndpdvayveedhvghglgqtvpygiplikadkvqaqgfkganvkvavldtgiqashpdlnvvggasfvageayntdgnghgthvagtvaalynttgvlgvapnvslfavkvlnssgngtysgivsgiewvttngmdvinmslggalgstamkqavdhayskgavvvasagnsgssgytntigypakcdsviavgavdsnsnrasfssvgaelevmapgagvystyptntyttlngtsmasphvagtsalilskhpnlsasqvrtrlsstatylgssfsygkglinveaaaq
