<a href="https://colab.research.google.com/github/ErikHartman/bopep/blob/main/bopep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# bopep: identifying peptide binders in large-scale peptidomic data

Bayesian-optimization-guided search for peptide binders in large-scale peptidomic datasets.

bopep relies on ESM-2 for peptide embeddings, ColabFold (running AlphaFold2-Multimer) for docking, and PyRosetta for interface-energy calculations. A deep ensemble implemented in PyTorch serves as the surrogate model.

In [None]:
#@title Installation

import os

print("Fetching bopep")
!git clone https://github.com/ErikHartman/bopep /content/bopep/

print("Installing ColabFold")
!pip install --quiet colabfold
!pip install --quiet biopython

print("Installing esm-fair and fetching ESM model")
!pip install --quiet fair-esm

# Code to fetch ESM model if it doesn't already exist
esm_model_path = "/root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt"
if not os.path.exists(esm_model_path):
    print("Downloading ESM model...")
    import esm
    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
else:
    print("ESM model already exists.")

print("Installing PyRosetta")
!pip install pyrosettacolabsetup
import pyrosettacolabsetup
pyrosettacolabsetup.install_pyrosetta(serialization=True, cache_wheel_on_google_drive=False)

print("Installing other necessary packages")
!pip install -r https://raw.githubusercontent.com/ErikHartman/bopep/main/requirements.txt


Fetching bopep
Cloning into '/content/bopep'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 31 (delta 8), reused 16 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (31/31), 21.04 KiB | 742.00 KiB/s, done.
Resolving deltas: 100% (8/8), done.
Installing ColabFold
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.3/360.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.0/230.0 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


Installing PyRosetta
Collecting pyrosettacolabsetup
  Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl.metadata (294 bytes)
Downloading pyrosettacolabsetup-1.0.9-py3-none-any.whl (4.9 kB)
Installing collected packages: pyrosettacolabsetup
Successfully installed pyrosettacolabsetup-1.0.9

Note that USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE.
See https://github.com/RosettaCommons/rosetta/blob/main/LICENSE.md or email license@uw.edu for details.

Looking for compatible PyRosetta wheel file at google-drive/PyRosetta/colab.bin//wheels.serialization...
Downloading PyRosetta package...

Resolving west.rosettacommons.org (west.rosettacommons.org)... 128.95.160.153, 2607:4000:406::160:153

HTTP request sent, awaiting response... 302 Found
Location: https://west.rosettacommons.org/pyrosetta/release/release/PyRosetta4.Release.python310.ubuntu.cxx11thread.serialization.wheel/pyrosetta-2024.42+release.3366cf78a3-cp310-cp310-linux_x86_64.whl [following]
--2024-11-13

In [4]:
#@title Embedding settings
import pandas as pd

%cd bopep

# Data input
data_file = "data/test_data.csv" #@param {type:"string"}
data = pd.read_csv(data_file)  # Load the CSV file
peptides = data["peptide"].tolist()

# Set maximum and minimum peptide length
max_length = 30  #@param {type:"slider", min:1, max:100, step:1}
min_length = 5   #@param {type:"slider", min:1, max:100, step:1}

# Set maximum repeat length for amino acids
max_repeat_length = 5  #@param {type:"slider", min:1, max:20, step:1}

# Set maximum allowed fraction of single amino acids
max_single_aa_fraction = 0.75  #@param {type:"number"}

# Display settings
print(f"Data file: {data_file}")
print(f"Max length: {max_length}")
print(f"Min length: {min_length}")
print(f"Max repeat length: {max_repeat_length}")
print(f"Max single amino acid fraction: {max_single_aa_fraction}")

[Errno 2] No such file or directory: 'bopep'
/content/bopep
Data file: data/test_data.csv
Max length: 30
Min length: 5
Max repeat length: 5
Max single amino acid fraction: 0.75


In [2]:
#@title Generate embeddings
# Requires the "Installation" cell (defines `esm_model_path`) and the
# "Embedding settings" cell (defines `peptides` and the filter parameters,
# and changes into the bopep checkout so `src` is importable).
from src.embeddings.embed import embed
from src.embeddings.utils import filter_peptides

# Filter the peptide list using the settings chosen in "Embedding settings".
filtered_peptides = filter_peptides(peptides, max_single_aa_fraction, max_repeat_length, min_length, max_length)
# Embed the remaining peptides with the ESM checkpoint located at `esm_model_path`.
embeddings = embed(filtered_peptides, model_path=esm_model_path)
# Backward-compatible alias: the result used to be stored under this misspelled name.
emeddings = embeddings


ModuleNotFoundError: No module named 'tqdm'

In [None]:
#@title Bayesian optimization settings

In [None]:
#@title Initialize PyRosetta

In [None]:
#@title Run bopep!