<a href="https://colab.research.google.com/github/Angelique28/Designing-Protein-Binding-Peptides---CECAM-Workshop/blob/main/notebooks/1_Protein_Setup_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Workshop Notebook 1: Protein Setup & Visualization (15 min)**
In this notebook, we will:
*   Retrieve the MDM2 sequence from UniProt.
*   Fold it live with Boltz-2.
*   Visualize the structure in py3Dmol.
*   Select the binding domain and the p53 pocket.
*   Fold just the selected substructure.

In [1]:
#@title **Set up our environment: Install dependencies and import packages (1 min)**
#@markdown **Press the (|>) button to the left!**

import subprocess
import sys
from pathlib import Path

import torch

if not torch.cuda.is_available():
    print("⚠️ Warning: GPU runtime not detected. Please go to Runtime > Change runtime type > select GPU.")
else:
    print("✅ GPU detected:", torch.cuda.get_device_name(0))

### Start Install Dependencies

# `runtime` is read again by the prediction cell to choose the accelerator flags.
runtime = "GPU(L4 or T4)"
# Precision string patched into Boltz's main.py below:
# '32-true' for the L4/T4 runtime, 'bf16-true' for anything else.
precision = '32-true' if runtime == "GPU(L4 or T4)" else 'bf16-true'

dependencies = [
    'torch', 'torchvision', 'torchaudio', 'numpy', 'hydra-core',
    'pytorch-lightning', 'rdkit', 'dm-tree', 'requests', 'pandas',
    'types-requests', 'einops', 'einx', 'fairscale', 'mashumaro',
    'modelcif', 'wandb', 'click', 'pyyaml', 'biopython',
    'scipy', 'numba', 'gemmi', 'scikit-learn', 'chembl_structure_pipeline', 'py3Dmol',
    'cuequivariance_ops_cu12', 'cuequivariance_ops_torch_cu12', 'cuequivariance_torch',
    'ipywidgets'
    ]

PIP_BATCH_SIZE = 20  # number of packages passed to a single `pip install` call
BOLTZ_REPO_URL = "https://github.com/jwohlwend/boltz.git"
# Resolve the checkout once so clone, patch and install all act on the same
# directory, whatever the current working directory is.
boltz_dir = Path("boltz").resolve()
install_failures = []


def _run_install_step(command, description, cwd=None):
    """Run one setup command; record a failure instead of hiding it.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell involved).
        description: Human-readable label used in the failure summary.
        cwd: Optional working directory for the command.

    Returns:
        True when the command exited with status 0, False otherwise.
    """
    try:
        returncode = subprocess.run(command, cwd=cwd).returncode
    except OSError as error:  # executable or working directory is missing
        install_failures.append(f"{description} ({error})")
        return False
    if returncode != 0:
        install_failures.append(f"{description} (exit code {returncode})")
    return returncode == 0


print('Installing dependencies... ', end='')

for i in range(0, len(dependencies), PIP_BATCH_SIZE):
    batch = dependencies[i:i + PIP_BATCH_SIZE]
    # `sys.executable -m pip` installs into the interpreter running this kernel.
    _run_install_step([sys.executable, '-m', 'pip', 'install', *batch],
                      f"pip install {' '.join(batch)}")

# Re-running the cell must not try to clone into an existing checkout.
if not boltz_dir.exists():
    _run_install_step(['git', 'clone', BOLTZ_REPO_URL, str(boltz_dir)], 'git clone boltz')

# Replace the precision string in Boltz's entry point (same substitution the
# former `sed -i 's/bf16-mixed/<precision>/g'` call performed).
boltz_main = boltz_dir / 'src' / 'boltz' / 'main.py'
if boltz_main.is_file():
    main_source = boltz_main.read_text(encoding='utf-8')
    patched_source = main_source.replace('bf16-mixed', precision)
    if patched_source != main_source:
        boltz_main.write_text(patched_source, encoding='utf-8')
else:
    install_failures.append(f"patch precision: {boltz_main} not found")

_run_install_step([sys.executable, '-m', 'pip', 'install', '--no-deps', '-e', '.'],
                  'pip install boltz (editable)', cwd=boltz_dir)

# Best effort: keep going, but tell the user which steps did not succeed.
if install_failures:
    print("\n⚠️ Warning: some setup steps failed:")
    for failure in install_failures:
        print("   -", failure)

### End Install Dependencies

import os
import sys
import requests
import yaml
import re
from pathlib import Path
from string import ascii_uppercase, ascii_lowercase

import ipywidgets as widgets
from IPython.display import display, HTML
from google.colab import drive
import py3Dmol

from rdkit import Chem, RDLogger
from rdkit.Chem import Draw, AllChem
from Bio.PDB import MMCIFParser
from Bio.PDB.Polypeptide import is_aa
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

print('done.')

✅ GPU detected: Tesla T4
Installing dependencies... done.


In [5]:
#@title **Set up our Paths and mount a folder in Google Drive**

#@markdown We will set a project ID so that we can keep separate executions separated, and a step ID so that we can keep the outputs of each step separate
PROJECT_ID = "MDM2" #@param {type:"string"}
STEP_ID = "1"

#@markdown We will use Google Drive mounts for persistence between multiple notebooks in this tutorial.

#@markdown Log in with your Google account and give permissions to access the drive.
WORKSHOP_DIRECTORY = Path('/content/drive/MyDrive/cecam_workshop_2025_generative')
# Two levels above the workshop folder is the Drive mount point (/content/drive).
drive.mount(str(WORKSHOP_DIRECTORY.parent.parent))
# Outputs of this notebook live under projects/<PROJECT_ID>/<STEP_ID>.
STEP_PATH = WORKSHOP_DIRECTORY / 'projects' / PROJECT_ID / STEP_ID
STEP_PATH.mkdir(exist_ok = True, parents = True)
DATA_PATH = WORKSHOP_DIRECTORY / 'data'
DATA_PATH.mkdir(exist_ok=True)

MSA_BASE_URL = 'https://github.com/timmonspatrick/CECAM-Binding-Workshop-2025/raw/refs/heads/main/data'
MSA_FILENAMES = ('uniref_MDM2.a3m', 'uniref_MDM2_full.a3m')


def download_workshop_msa(filename: str, timeout: float = 60.0) -> Path:
    """Download one pre-computed MSA file into the workshop data folder.

    Args:
        filename: Name of the file below ``MSA_BASE_URL``; also used as the
            local file name inside ``DATA_PATH``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were an alignment.
    """
    response = requests.get(f'{MSA_BASE_URL}/{filename}', timeout=timeout)
    response.raise_for_status()
    destination = DATA_PATH / filename
    with destination.open('w') as f:
        f.write(response.text)
    return destination


# Existing copies are overwritten on every run, as before.
for msa_filename in MSA_FILENAMES:
    download_workshop_msa(msa_filename)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
#@title **Retrieve Target Sequence**

def get_uniprot_sequence(uniprot_id: str, timeout: float = 30.0) -> str:
    """
    Retrieve the protein sequence for a given UniProt ID.

    Args:
        uniprot_id: UniProt accession inserted into the REST URL.
        timeout: Seconds to wait for the UniProt server; without a timeout
            a stalled connection would block the notebook indefinitely.

    Returns:
        The sequence as one string with the FASTA header and all line
        breaks removed.

    Raises:
        ValueError: If the server does not answer with HTTP 200, or if the
            response contains no sequence residues.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise ValueError(f"Failed to retrieve data for {uniprot_id}: {response.status_code}")

    # splitlines() copes with both LF and CRLF line endings; header lines
    # (starting with '>') are skipped, everything else is sequence.
    sequence = "".join(
        line.strip()
        for line in response.text.splitlines()
        if not line.startswith(">")
    )
    if not sequence:
        raise ValueError(f"No sequence returned for {uniprot_id}")
    return sequence

# Example usage
# UniProt accession of the target protein, editable through the Colab form.
uniprot_id = "Q00987"  #@param {type:"string"}
# Whitespace pasted into the form is stripped for this lookup only;
# `uniprot_id` itself keeps the raw form value.
sequence = get_uniprot_sequence(uniprot_id.strip())
print('Target sequence:', sequence)

Target sequence: MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKDTYTMKEVLFYLGQYIMTKRLYDEKQQHIVYCSNDLLGDLFGVPSFSVKEHRKIYTMIYRNLVVVNQQESSDSGTSVSENRCHLEGGSDQKDLVQELQEEKPSSSHLVSRPSTSSRRRAISETEENSDELSGERQRKRHKSDSISLSFDESLALCVIREICCERSSSSESTGTPSNPDLDAGVSEHSGDWLDQDSVSDQFSVEFEVESLDSEDYSLSEEGQELSDEDDEVYQVTVYQAGESDTDSFEEDPEISLADYWKCTSCNEMNPPLPSHCNRCWALRENWLPEDKGKDKGEISEKAKLENSTQAEEGFDVPDCKKTIVNDSRESCVEENDDKITQASQSQESEDYSQPSTSSSIIYSSQEDVKEFEREETQDKEESVESSLPLNAIEPCVICQGRPKNGCIVHGKTGHLMACFTCAKKLKKRNKPCPVCRQPIQMIVLTYFP


In [8]:
#@title **Retrieve UniProt Features**

from pprint import pprint

# Fetch the UniProt entry as JSON and keep only the 'Region' and 'Domain'
# annotations; the first three are printed as a preview.
uniprot_entry = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprot_id}').json()
region_and_domain_features = [
    feature
    for feature in uniprot_entry['features']
    if feature['type'] in ('Region', 'Domain')
]
pprint(region_and_domain_features[:3])

[{'description': 'SWIB/MDM2',
  'evidences': [{'evidenceCode': 'ECO:0000255',
                 'id': 'PRU01273',
                 'source': 'PROSITE-ProRule'}],
  'location': {'end': {'modifier': 'EXACT', 'value': 109},
               'start': {'modifier': 'EXACT', 'value': 26}},
  'type': 'Domain'},
 {'description': 'Necessary for interaction with USP2',
  'location': {'end': {'modifier': 'EXACT', 'value': 110},
               'start': {'modifier': 'EXACT', 'value': 1}},
  'type': 'Region'},
 {'description': 'Sufficient to promote the mitochondrial pathway of apoptosis',
  'evidences': [{'evidenceCode': 'ECO:0000269',
                 'id': '30879903',
                 'source': 'PubMed'}],
  'location': {'end': {'modifier': 'EXACT', 'value': 101},
               'start': {'modifier': 'EXACT', 'value': 1}},
  'type': 'Region'}]


In [11]:
#@title **Select Suitable Subsequence**

# Leave either form field empty to skip the selection.

subsequence_start = "26" #@param {type:"string"}
subsequence_end = "109" #@param {type:"string"}

if subsequence_start and subsequence_end:
  # The form delivers strings; both names are rebound to integers here.
  subsequence_start = int(subsequence_start)
  subsequence_end = int(subsequence_end)
  # NOTE(review): the slice stops *before* residue `subsequence_end`, so the
  # default 26-109 yields the 83 residues 26-108. That 83-residue result is
  # what the cached-MSA length check in the folding cell expects. UniProt
  # feature ends are presumably inclusive - confirm whether residue 109
  # should be part of the selection before changing this.
  selection = slice(subsequence_start - 1, subsequence_end - 1)
  sub_sequence = sequence[selection]
  print(sub_sequence)

TLVRPKPLLLKLLKSVGAQKDTYTMKEVLFYLGQYIMTKRLYDEKQQHIVYCSNDLLGDLFGVPSFSVKEHRKIYTMIYRNLV


In [12]:
#@title **Predict Structure of our Target Protein**
#@markdown ## We will predict the 3D structure of MDM2 using Boltz-2.

#@markdown First: Paste the sequence of the target protein

#@markdown Second: Paste the selected subsequence of the protein, much shorter!
# Sequence handed to Boltz-2. Further down in this cell, a downloaded MSA file
# is reused only when this sequence matches one of two known length/prefix/
# suffix combinations; any other sequence goes through the MSA server.
sequence_to_fold = "TLVRPKPLLLKLLKSVGAQKDTYTMKEVLFYLGQYIMTKRLYDEKQQHIVYCSNDLLGDLFGVPSFSVKEHRKIYTMIYRNLV" #@param {type:"string"}
# Stop early with a readable message if the form field was left empty.
assert sequence_to_fold != "", "You must provide a sequence to fold!"

def fix_msa(msa_path: Path):
  """Remove a trailing NUL-only line from an MSA file, rewriting it in place.

  The file is rewritten only when its final line consists solely of a NUL
  character (with or without a line break after it); otherwise it is left
  untouched. An empty file is left untouched as well instead of raising
  ``IndexError``.

  Args:
    msa_path: Path of the MSA file to inspect.
  """
  with msa_path.open('r') as f:
    msa = f.readlines()

  # An empty file has no last line to inspect; a last line that is anything
  # other than a lone NUL means there is nothing to repair.
  if not msa or msa[-1].rstrip('\r\n') != '\x00':
    return

  with msa_path.open('w') as f:
    f.writelines(msa[:-1])

def write_boltz_yaml(sequence: str, msa_path: str = None):
  """Write the Boltz input file ``target.yaml`` into ``STEP_PATH``.

  The file describes a single protein chain with id ``A1``. When
  ``msa_path`` is given, an ``msa:`` entry pointing at it is added to the
  chain; otherwise that line contains only indentation.

  Args:
    sequence: Amino-acid sequence of the chain.
    msa_path: Optional path of an MSA file for the chain.
  """
  msa_entry = "" if msa_path is None else f"msa: {msa_path}"
  # Leading and trailing empty strings reproduce the blank first line and
  # the final newline of the document.
  yaml_lines = [
      "",
      "sequences:",
      "  - protein:",
      "      id: [A1]",
      f"      sequence: {sequence}",
      f"      {msa_entry}",
      "",
  ]
  target_yaml = STEP_PATH / 'target.yaml'
  target_yaml.parent.mkdir(exist_ok=True, parents=True)
  with target_yaml.open('w') as handle:
    handle.write("\n".join(yaml_lines))

# Known inputs for which an MSA file was downloaded earlier:
# (sequence length, first three residues, last three residues, file name).
cached_msa_table = (
    (491, 'MCN', 'YFP', 'uniref_MDM2_full.a3m'),
    (83, 'TLV', 'NLV', 'uniref_MDM2.a3m'),
)
# First matching entry wins; None means no cached MSA applies.
msa_path = next(
    (
        str(WORKSHOP_DIRECTORY / 'data' / msa_filename)
        for expected_length, head, tail, msa_filename in cached_msa_table
        if len(sequence_to_fold) == expected_length
        and sequence_to_fold.startswith(head)
        and sequence_to_fold.endswith(tail)
    ),
    None,
)
# Without a cached MSA the prediction is asked to use the MSA server.
use_msa_server = not msa_path
if msa_path:
  fix_msa(Path(msa_path))

write_boltz_yaml(sequence_to_fold, msa_path = msa_path)

def fix_pdb(pdb_path: Path) -> Path:
  """Copy a PDB file, dropping the character at index 22 of every line.

  The chain is declared as ``A1`` in the Boltz input; removing that one
  character turns the two-character chain id back into ``A``. Lines
  shorter than 23 characters are copied unchanged.

  Args:
    pdb_path: PDB file to read.

  Returns:
    Path of the rewritten copy: ``pdb_path`` with its suffix replaced by
    ``.fixed.pdb``.
  """
  fixed_path = pdb_path.with_suffix(".fixed.pdb")
  with pdb_path.open('r') as source, fixed_path.open('w') as target:
    target.writelines(record[:22] + record[23:] for record in source)
  return fixed_path

#@title Run prediction using Boltz-2
output_format = 'pdb'
num_workers = 0
#@markdown Lower the step scale to increase the diversity of result. (default: 1.638)
step_scale = 1.638 #@param {type:"slider", min:1, max:2, step:0.001}
#@markdown Number of diffusion samples to be generated. (default: 1, AlphaFold3: 5)
diffusion_samples = 1 #@param {type:"slider", min:1, max:10, step:1}
#@markdown Number of recycling steps for the prediction. Higher makes a more refined structure (default: 3, AlphaFold3: 10)
recycling_steps = 3 #@param {type:"slider", min:1, max:25, step:1}
#@markdown Number of sampling steps for structure prediction. (default: 200)
sampling_steps = 50 #@param {type:"slider", min:50, max:400, step:50}
#@markdown Maximum number of MSA sequences to be used
max_msa_seqs = 8192 #@param [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
#@markdown Subsample MSA?
subsample_msa = False #@param {type:"boolean"}
#@markdown Number of subsampled MSA
num_subsampled_msa = 1024 #@param [4, 8, 16, 32, 64, 128, 256, 512, 1024]
#@markdown MSA pairing strategy
msa_pairing_strategy = 'greedy' #@param ['greedy', 'complete']

yaml_path = str(STEP_PATH / 'target.yaml')
commandline = f'{yaml_path} --num_workers {num_workers} --step_scale {step_scale} --recycling_steps {recycling_steps} --sampling_steps {sampling_steps}'
commandline += f' --diffusion_samples {diffusion_samples} --override --max_msa_seqs {max_msa_seqs} --msa_pairing_strategy {msa_pairing_strategy} --output_format {output_format}'
if subsample_msa:
    commandline += f' --num_subsampled_msa {num_subsampled_msa}'
if use_msa_server:
    commandline += ' --use_msa_server'
if runtime.startswith('GPU'):
    commandline += ' --no_kernels --accelerator gpu'
elif runtime == 'TPU':
    commandline += ' --accelerator tpu'
else:
    commandline += ' --accelerator cpu'

!rm -rf {str(STEP_PATH / 'boltz_results_target' / 'predictions')}
!rm -rf {str(STEP_PATH / 'boltz_results_target' / 'processed')}
!boltz predict {commandline} --out_dir {str(STEP_PATH)}

rec = SeqRecord(Seq(str(sequence_to_fold)), id=f'tgt_{PROJECT_ID}', description="")
(STEP_PATH.parent / 'sequences').mkdir(exist_ok=True, parents=True)
SeqIO.write([rec],
    str(STEP_PATH.parent / 'sequences' / 'target.fasta'),
    "fasta")


Checking input data.
Processing 1 inputs with 1 threads.
  0% 0/1 [00:00<?, ?it/s]100% 1/1 [00:00<00:00,  9.49it/s]100% 1/1 [00:00<00:00,  9.47it/s]
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running structure prediction for 1 input.
2025-09-02 09:54:37.214942: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756806877.550443    5527 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756806877.638589    5527 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempti

1

In [13]:
#@title 5. Visualize Folded MDM2 {run: "auto"}
#@markdown We can use py3Dmol to view the 3D structure directly in Colab.

# Hex colours assigned to chains, in order, when show_pdb() is called with
# color="chain" (presumably the PyMOL default palette, going by the name).
pymol_color_list = ["#33ff33","#00ffff","#ff33cc","#ffff00","#ff9999","#e5e5e5","#7f7fff","#ff7f00",
                    "#7fff7f","#199999","#ff007f","#ffdd5e","#8c3f99","#b2b2b2","#007fff","#c4b200",
                    "#8cb266","#00bfbf","#b27f7f","#fcd1a5","#ff7f7f","#ffbfdd","#7fffff","#ffff7f",
                    "#00ff7f","#337fcc","#d8337f","#bfff3f","#ff7fff","#d8d8ff","#3fffbf","#b78c4c",
                    "#339933","#66b2b2","#ba8c84","#84bf00","#b24c66","#7f7f7f","#3f3fa5","#a5512b"]
# Chain identifiers paired with the colours above: A-Z followed by a-z.
alphabet_list = list(ascii_uppercase+ascii_lowercase)

def show_pdb(pdb_str, show_sidechains=False, show_mainchains=False,
             color="pLDDT", chains=None, vmin=50, vmax=90,
             size=(800,480), hbondCutoff=4.0,
             Ls=None,
             animate=False):
  """Build a py3Dmol view of a PDB-formatted structure.

  Args:
    pdb_str: Structure as PDB text.
    show_sidechains: Add stick/sphere styles for side-chain atoms.
    show_mainchains: Add stick styles for the backbone atoms C, O, N, CA.
    color: "pLDDT" colours the cartoon by the 'b' atom property on a
      gradient from ``vmin`` to ``vmax`` (presumably the per-residue
      confidence written by the predictor - verify); "rainbow" uses the
      spectrum colouring; "chain" gives each chain its own colour. Any
      other value leaves the default style untouched.
    chains: Number of chains to colour in "chain" mode. Defaults to
      ``len(Ls)`` when ``Ls`` is given, else 1.
    vmin, vmax: Bounds of the colour gradient in "pLDDT" mode.
    size: ``(width, height)`` of the viewer in pixels.
    hbondCutoff: Passed to the model loader in animate mode only.
    Ls: Optional sequence used only to derive the default for ``chains``.
    animate: Load ``pdb_str`` as animation frames and start the animation.

  Returns:
    The configured py3Dmol view; call ``.show()`` on it to render.
  """
  model_format = 'pdb'

  def white_carbon(shape):
    # Fresh style dict per call: sticks or spheres in the WhiteCarbon scheme.
    return {shape: {'colorscheme': "WhiteCarbon", 'radius': 0.3}}

  if chains is None:
    chains = len(Ls) if Ls is not None else 1

  view = py3Dmol.view(js='https://3Dmol.org/build/3Dmol-min.js', width=size[0], height=size[1])
  if animate:
    view.addModelsAsFrames(pdb_str, model_format, {'hbondCutoff': hbondCutoff})
  else:
    view.addModel(pdb_str, model_format)

  # Cartoon colouring.
  if color == "chain":
    for _, chain_id, chain_color in zip(range(chains), alphabet_list, pymol_color_list):
      view.setStyle({'chain': chain_id}, {'cartoon': {'color': chain_color}})
  elif color == "rainbow":
    view.setStyle({'cartoon': {'color':'spectrum'}})
  elif color == "pLDDT":
    view.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient': 'roygb','min':vmin,'max':vmax}}})

  if show_sidechains:
    backbone_atoms = ['C','O','N']
    # Every residue except GLY/PRO: all atoms that are not C, O or N.
    view.addStyle({'and':[{'resn':["GLY","PRO"],'invert':True},{'atom':backbone_atoms,'invert':True}]},
                  white_carbon('stick'))
    # GLY: mark the CA atom with a sphere.
    view.addStyle({'and':[{'resn':"GLY"},{'atom':'CA'}]},
                  white_carbon('sphere'))
    # PRO: everything except C and O.
    view.addStyle({'and':[{'resn':"PRO"},{'atom':['C','O'],'invert':True}]},
                  white_carbon('stick'))

  if show_mainchains:
    view.addStyle({'atom': ['C','O','N','CA']}, white_carbon('stick'))
  view.zoomTo()

  # JavaScript callbacks: label the atom under the cursor with residue
  # name + number, and remove that label when the cursor leaves.
  on_hover = '''function(atom,viewer,event,container) {
        if(!atom.label) {
        atom.label = viewer.addLabel(atom.resn + atom.resi,{position: atom, backgroundColor: 'mintcream', fontColor:'black'});
        }}'''
  on_unhover = '''function(atom,viewer) {
        if(atom.label) {
        viewer.removeLabel(atom.label);
        delete atom.label;
        }
    }'''
  view.setHoverable({}, True, on_hover, on_unhover, viewer=(0, 1))

  if animate:
    view.animate()
  return view

model_number = "0" #@param {type:"string"}
color = "confidence" #@param ["confidence", "rainbow", "chain"]
# The form label "confidence" corresponds to show_pdb's "pLDDT" mode.
color = {"confidence": "pLDDT"}.get(color, color)
show_sidechains = False #@param {type:"boolean"}
show_mainchains = False #@param {type:"boolean"}

# Rewrite the chain id of the selected model, then load the rewritten copy
# (fix_pdb returns the path of the '.fixed.pdb' file it wrote).
prediction_dir = STEP_PATH / 'boltz_results_target' / 'predictions' / 'target'
fixed_pdb_path = fix_pdb(prediction_dir / f'target_model_{model_number}.pdb')
pdb_str = fixed_pdb_path.read_text()

structure_view = show_pdb(
    pdb_str,
    color=color,
    show_sidechains=show_sidechains,
    show_mainchains=show_mainchains,
)
structure_view.show()

### Notebook Summary
- We retrieved the MDM2 sequence and folded it using Boltz-2.
- The folded structure was visualized in 3D.
- The p53-binding region was identified from the UniProt domain annotations and selected for folding.

[➡️ Next: In Notebook 2, we will evaluate the binding feasibility and identify hotspot residues for peptide design.](https://colab.research.google.com/github/timmonspatrick/CECAM-Binding-Workshop-2025/blob/main/notebooks/2_Target_Feasibility.ipynb)