<a href="https://colab.research.google.com/github/ErikHartman/bopep/blob/main/bopep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# bopep: identifying peptide binders in large-scale peptidomic data

Bayesian-optimization-guided search for binders in large-scale peptidomic datasets.

Relies on ESM2 for peptide embeddings, ColabFold (which uses AlphaFold 2 multimer) for docking, and PyRosetta for interface energy calculations. A deep ensemble implemented in PyTorch is used as the surrogate model.

Set runtime to GPU (T4).

In [1]:
#@title Installation

import os

print("Fetching bopep")
!git clone https://github.com/ErikHartman/bopep /content/bopep/

print("Installing ColabFold")
!pip install --quiet colabfold
!pip install --quiet biopython

print("Installing esm-fair and fetching ESM model")
!pip install --quiet fair-esm

# Code to fetch ESM model if it doesn't already exist
esm_model_path = "/root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt"
if not os.path.exists(esm_model_path):
    print("Downloading ESM model...")
    import esm
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
else:
    print("ESM model already exists.")

print("Installing PyRosetta")
!pip install pyrosettacolabsetup
import pyrosettacolabsetup
pyrosettacolabsetup.install_pyrosetta(serialization=True, cache_wheel_on_google_drive=False)

print("Installing other necessary packages")
!pip install -r https://raw.githubusercontent.com/ErikHartman/bopep/main/requirements.txt


Fetching bopep
Cloning into '/content/bopep'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 38 (delta 12), reused 22 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (38/38), 26.44 KiB | 13.22 MiB/s, done.
Resolving deltas: 100% (12/12), done.
Installing ColabFold
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.3/360.3 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.0/230.0 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


Installing PyRosetta
Collecting pyrosettacolabsetup
  Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl.metadata (294 bytes)
Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl (4.9 kB)
Installing collected packages: pyrosettacolabsetup
Successfully installed pyrosettacolabsetup-1.0.9

Note that USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE.
See https://github.com/RosettaCommons/rosetta/blob/main/LICENSE.md or email license@uw.edu for details.

Looking for compatible PyRosetta wheel file at google-drive/PyRosetta/colab.bin//wheels.serialization...
Downloading PyRosetta package...

Resolving west.rosettacommons.org (west.rosettacommons.org)... 128.95.160.153, 2607:4000:406::160:153

HTTP request sent, awaiting response... 302 Found
Location: https://west.rosettacommons.org/pyrosetta/release/release/PyRosetta4.Release.python310.ubuntu.cxx11thread.serialization.wheel/pyrosetta-2024.42+release.3366cf78a3-cp310-cp310-linux_x86_64.whl [following]
--2024-11-13

In [2]:
#@title Embedding settings
import pandas as pd

%cd bopep

# Data input
# @markdown Upload your input data file and set the path to the path relative to /bopep.
data_file = "data/test_data.csv" #@param {type:"string"}

if not os.path.exists(data_file):
  raise ValueError("The data file does not exist in the path.")

data = pd.read_csv(data_file)  # Load the CSV file
peptides = data["peptide"].tolist()
# @markdown  ### Filtering options:

# @markdown Set maximum and minimum peptide length
max_length = 30  #@param {type:"slider", min:10, max:60, step:1}
min_length = 5   #@param {type:"slider", min:1, max:30, step:1}

# @markdown Set maximum repeat length for amino acids
max_repeat_length = 5  #@param {type:"slider", min:1, max:15, step:1}

# @markdown  Set maximum allowed fraction of single amino acids
max_single_aa_fraction = 0.73  #@param {type:"slider", min:0, max:1, step:0.01}

# @markdown  Variance kept during PCA reduction
pca_variance = 0.95  #@param {type:"slider", min:0.1, max:1, step:0.01}


/content/bopep
Data file: data/test_data.csv
Max length: 30
Min length: 5
Max repeat length: 5
Max single amino acid fraction: 0.73


In [3]:
#@title Generate embeddings
from src.embeddings.embed import embed
from src.embeddings.utils import filter_peptides
from sklearn.decomposition import PCA
import numpy as np

# Apply the filtering options chosen in the settings cell.
filtered_peptides = filter_peptides(peptides, max_single_aa_fraction, max_repeat_length, min_length, max_length)

# embed() is used below as a mapping of peptide sequence -> embedding vector.
raw_embeddings = embed(filtered_peptides, model_path=esm_model_path)
# Misspelled name kept as an alias in case other cells still reference it.
emeddings = raw_embeddings

if len(raw_embeddings) == 0:
  raise ValueError("No peptide embeddings were generated; relax the filtering options.")

# Default to the unreduced embeddings so `embeddings` is always defined,
# including when pca_variance is set to 1 and the PCA step is skipped.
embeddings = raw_embeddings

if pca_variance < 1:
  peptide_sequences = list(raw_embeddings.keys())
  embedding_array = np.array(list(raw_embeddings.values()))
  # A float n_components in (0, 1) with svd_solver="full" keeps as many components
  # as needed to explain that fraction of variance; honour the user's slider
  # value rather than a hard-coded 0.95.
  pca = PCA(n_components=pca_variance, svd_solver="full")
  embeddings_reduced = pca.fit_transform(embedding_array)
  print(f"Reduced embedding size: {np.shape(embeddings_reduced)} (before PCA: {np.shape(embedding_array)})")
  # Rows of the reduced array are in the same order as peptide_sequences.
  embeddings = dict(zip(peptide_sequences, embeddings_reduced))


  model_data = torch.load(str(model_location), map_location="cpu")
  regression_data = torch.load(regression_location, map_location="cpu")


Loaded model from local path: /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt


Generating embeddings: 100%|██████████| 16/16 [06:22<00:00, 23.89s/it]


In [4]:
#@title Bayesian optimization settings



In [5]:
#@title Initialize PyRosetta

In [6]:
#@title Run bopep!