<a href="https://colab.research.google.com/github/ErikHartman/bopep/blob/main/bopep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# bopep: identifying peptide binders in large-scale peptidomic data

Bayesian-optimization-guided search for binders in large-scale peptidomic datasets.

bopep relies on ESM2 for peptide embeddings, ColabFold (running AlphaFold 2 multimer) for docking, and PyRosetta for interface energy calculations. A deep ensemble, implemented in Torch, is used as the surrogate model.

Before running any cell, set the Colab runtime type to T4 GPU (Runtime → Change runtime type).

In [1]:
#@title Installation

import os

print("Fetching bopep")
!git clone https://github.com/ErikHartman/bopep /content/bopep/

print("Installing necessary packages using pip")
!pip install -r --quiet /content/bopep/requirements.txt

print("Installing ColabFold")
!pip install --quiet colabfold
!pip install --quiet biopython

print("Installing fair-esm")
!pip install --quiet fair-esm

# Code to fetch ESM model if it doesn't already exist
esm_model_path = "/root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt"
if not os.path.exists(esm_model_path):
    print("Downloading ESM model...")
    import esm
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
else:
    print("ESM model already exists.")

print("Installing PyRosetta")
!pip install pyrosettacolabsetup
import pyrosettacolabsetup
pyrosettacolabsetup.install_pyrosetta(serialization=True, cache_wheel_on_google_drive=False)


Fetching bopep
Cloning into '/content/bopep'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 93 (delta 40), reused 59 (delta 16), pack-reused 0 (from 0)[K
Receiving objects: 100% (93/93), 103.63 KiB | 803.00 KiB/s, done.
Resolving deltas: 100% (40/40), done.
Installing other necessary packages
Collecting biopython==1.84 (from -r /content/bopep/requirements.txt (line 1))
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting fair-esm==2.0.0 (from -r /content/bopep/requirements.txt (line 2))
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting numpy==2.0.2 (from -r /content/bopep/requirements.txt (line 4))
  Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.1 M

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


Installing PyRosetta
Collecting pyrosettacolabsetup
  Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl.metadata (294 bytes)
Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl (4.9 kB)
Installing collected packages: pyrosettacolabsetup
Successfully installed pyrosettacolabsetup-1.0.9

Note that USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE.
See https://github.com/RosettaCommons/rosetta/blob/main/LICENSE.md or email license@uw.edu for details.

Looking for compatible PyRosetta wheel file at google-drive/PyRosetta/colab.bin//wheels.serialization...
Downloading PyRosetta package...
--2024-11-13 08:17:50--  https://west.rosettacommons.org/pyrosetta/release/release/PyRosetta4.Release.python310.ubuntu.cxx11thread.serialization.wheel/.latest
Resolving west.rosettacommons.org (west.rosettacommons.org)... 128.95.160.153, 2607:4000:406::160:153
Connecting to west.rosettacommons.org (west.rosettacommons.org)|128.95.160.153|:443... connected.
HTTP request sent, aw

In [2]:
#@title Embedding settings
import pandas as pd

%cd bopep

# Data input
# @markdown Upload your input data file and set the path.
data_file = "/content/bopep/data/test_data.csv" #@param {type:"string"}

if not os.path.exists(data_file):
  raise ValueError("The data file does not exist in the path.")

data = pd.read_csv(data_file)  # Load the CSV file
peptides = data["peptide"].tolist()
# @markdown  ### Filtering options:

# @markdown Set maximum and minimum peptide length
max_length = 30  #@param {type:"slider", min:10, max:60, step:1}
min_length = 5   #@param {type:"slider", min:1, max:30, step:1}

# @markdown Set maximum repeat length for amino acids
max_repeat_length = 5  #@param {type:"slider", min:1, max:15, step:1}

# @markdown  Set maximum allowed fraction of single amino acids
max_single_aa_fraction = 0.73  #@param {type:"slider", min:0, max:1, step:0.01}

# @markdown  Variance kept during PCA reduction
pca_variance = 0.95  #@param {type:"slider", min:0.1, max:1, step:0.01}


/content/bopep


In [3]:
#@title Generate embeddings
from src.embeddings.embed import embed
from src.embeddings.utils import filter_peptides
from sklearn.decomposition import PCA
import numpy as np

# Apply the filtering options chosen in the "Embedding settings" cell, then
# embed the remaining peptides with the ESM checkpoint located at `esm_model_path`.
filtered_peptides = filter_peptides(peptides, max_single_aa_fraction, max_repeat_length, min_length, max_length)
embeddings = embed(filtered_peptides, model_path=esm_model_path)

# `embeddings` is used as a mapping peptide -> embedding vector (keys/values below).
# PCA is skipped when the user asks to keep all variance (pca_variance == 1).
if pca_variance < 1:
  embedding_array = np.array(list(embeddings.values()))
  peptide_sequences = list(embeddings.keys())
  # Use the user-selected variance fraction instead of a hard-coded 0.95.
  # A float n_components in (0, 1) together with svd_solver="full" keeps the
  # smallest number of components explaining at least that fraction of variance.
  pca = PCA(n_components=pca_variance, svd_solver="full")
  embeddings_reduced = pca.fit_transform(embedding_array)
  print(f"Reduced embedding size: {np.shape(embeddings_reduced)} (before PCA: {np.shape(embedding_array)})")
  # Rebuild the mapping with the reduced vectors, preserving the original peptide order.
  embeddings = dict(zip(peptide_sequences, embeddings_reduced))


  model_data = torch.load(str(model_location), map_location="cpu")
  regression_data = torch.load(regression_location, map_location="cpu")


Model moved to GPU.


Generating embeddings: 100%|██████████| 16/16 [00:12<00:00,  1.26it/s]


Reduced embedding size: (1000, 96) (before PCA: (1000, 1280))


In [4]:
#@title Bayesian optimization settings

# Settings cell: it only assigns configuration values (exposed as a Colab form
# through the trailing annotations on each line); no computation happens here.
# NOTE(review): the code that consumes these names is not visible in this cell —
# presumably the "Run bopep!" cell; confirm the names match what it expects.

# Target structure file (PDB format)
target_structure = "/content/bopep/data/4glp.pdb"  #@param {type:"string"}

# @markdown Model settings
num_recycles = 9  #@param {type:"slider", min:1, max:20, step:1}  # Number of recycles for AlphaFold
num_relax = 1  #@param {type:"slider", min:0, max:5, step:1}  # Number of relaxations
num_models = 5  #@param {type:"slider", min:1, max:5, step:1}  # Number of models
num_processes = 2  #@param {type:"slider", min:1, max:8, step:1}  # Number of CPU processes for docking
gpu_ids = ["0"]  #@param {type:"hidden"}  # List of GPU IDs, Colab generally has one GPU

# @markdown Stopping and relaxation parameters
recycle_early_stop_tolerance = 0.3  #@param {type:"slider", min:0, max:1, step:0.1}  # Early stop tolerance
amber = True  #@param {type:"boolean"}  # Whether to use AMBER for relaxation

# @markdown Target binding site (optional)
# NOTE(review): whether these indices are 0- or 1-based, and which residue
# numbering they follow, is not established here — confirm against the consumer.
binding_site_residue_indices = [44, 49, 74, 82, 89, 105]  #@param {type:"raw"}  # Binding site residues

# @markdown Objective weights for Bayesian optimization
# NOTE(review): how the weights are combined (and whether they are normalized)
# is not shown in this cell — verify in the objective implementation.
iptm_score_weight = 1.0  #@param {type:"number"}
interface_sasa_weight = 0.2  #@param {type:"number"}
interface_dG_weight = 0.2  #@param {type:"number"}
rosetta_score_weight = 2  #@param {type:"number"}
interface_delta_hbond_unsat_weight = 0.2  #@param {type:"number"}
packstat_weight = 0.2  #@param {type:"number"}

# @markdown Bayesian Optimization Iterations
n_initial = 100  #@param {type:"slider", min:50, max:1000, step:1}  # Initial number of evaluations
n_exploration_iterations = 100  #@param {type:"slider", min:50, max:1500, step:1}  # Number of exploration iterations
n_exploitation_with_distance_weight = 100  #@param {type:"slider", min:50, max:3000, step:50}  # Exploitation iterations with distance weight
n_exploitation_iterations = 0  #@param {type:"slider", min:0, max:1000, step:1}  # Number of exploitation iterations without distance weight
batch_size = 4  #@param {type:"slider", min:1, max:32, step:1}  # Batch size for optimization
agreeing_models = 0  #@param {type:"slider", min:0, max:10, step:1}  # Number of agreeing models to use
proximity_threshold = 5.0  #@param {type:"slider", min:1, max:20, step:0.5}  # Proximity threshold in Ångstroms
hparam_opt_interval = 50  #@param {type:"slider", min:10, max:200, step:10}  # Hyperparameter optimization interval


In [5]:
#@title Initialize PyRosetta

In [6]:
#@title Run bopep!