This is the first version of the notebook. Use this newer version instead: https://colab.research.google.com/drive/1CTtUGg05-2MtlWmfJhqzLTtkDDaxCDOQ

# DiffDock
Dock a PDB file and a SMILES string with [DiffDock](https://github.com/gcorso/DiffDock).

Select Runtime / Run all to run an example PDB file and SMILES.

May require a "premium GPU" (Colab Pro); even then, it may fail on large complexes.

In [None]:
#@title PDB + SMILES input

# Either an RCSB PDB id or a URL starting with "http" that points at a PDB file.
# Leaving this or the ligand field blank makes a later cell fall back to example data.
PDB_id = '' #@param {type:"string"}
# Either a SMILES string or a purely numeric PubChem compound id (resolved to SMILES later).
SMILES_or_pubchem_id = '' #@param {type:"string"}

#@markdown Download a tar file containing all results?
download_results = True #@param {type:"boolean"}

In [None]:
import os
import requests
import time
from random import random

def download_pdb_file(pdb_id: str) -> str:
    """Fetch a PDB file and return the local path it is cached at.

    Args:
        pdb_id: Either an RCSB PDB id (e.g. ``"6agt"``) or a full URL
            starting with ``http`` that points at a PDB file.

    Returns:
        Path of the file inside ``/tmp/pdb/``. If a file with the same name
        is already there it is reused and nothing is downloaded.

    Raises:
        ValueError: If a URL is given whose last path segment is empty, so
            no cache filename can be derived from it.
        requests.HTTPError: If the server answers with an error status.
    """
    PDB_DIR ="/tmp/pdb/"
    os.makedirs(PDB_DIR, exist_ok=True)

    # url or pdb_id
    if pdb_id.startswith('http'):
        url = pdb_id
        filename = url.split('/')[-1]
        if not filename:
            # An empty name would make cache_path equal PDB_DIR itself, which
            # always exists and would be returned as if it were a cached file.
            raise ValueError(f"Cannot derive a file name from URL: {url}")
    else:
        url = f"http://files.rcsb.org/view/{pdb_id}.pdb"
        filename = f'{pdb_id}.pdb'

    cache_path = os.path.join(PDB_DIR, filename)
    if os.path.exists(cache_path):
        return cache_path

    # Without a timeout a stalled connection would hang the notebook forever.
    pdb_req = requests.get(url, timeout=60)
    pdb_req.raise_for_status()
    # Context manager guarantees the handle is flushed and closed before returning.
    with open(cache_path, 'w') as pdb_out:
        pdb_out.write(pdb_req.text)
    return cache_path

def download_smiles_str(pubchem_id: str, retries:int = 2) -> "str | None":
    """Look up the SMILES string for a PubChem compound id.

    Args:
        pubchem_id: PubChem compound id (CID) to query.
        retries: How many extra attempts to make after the first request
            returns a non-200 status. Values below zero are treated as zero.

    Returns:
        The SMILES string taken from the second column of the first data row
        of PubChem's CSV answer, or ``None`` when every attempt failed or the
        answer did not contain such a field.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{pubchem_id}/property/CanonicalSMILES/CSV"

    # Clamp so a negative `retries` still makes one attempt and then stops,
    # instead of never reaching the stop condition.
    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        if attempt:
            # Randomised pause between attempts; no pause after the last one.
            time.sleep(1+random())
        req = requests.get(url, timeout=30)
        if req.status_code == 200:
            break
    else:
        return None

    # Expected layout: a header line, then `cid,"smiles"`.
    rows = req.text.splitlines()
    if len(rows) < 2:
        return None
    fields = rows[1].split(',')
    if len(fields) < 2:
        return None
    return fields[1].strip('"').strip("'")

In [None]:
# Fall back to the built-in example when either form field was left blank.
if not PDB_id or not SMILES_or_pubchem_id:
    PDB_id = "6agt"
    SMILES_or_pubchem_id = "COc(cc1)ccc1C#N"
    print(f"No input supplied. Using example data: {PDB_id} and {SMILES_or_pubchem_id}")

# to run many PDB+smiles at once, fill in a list of PDB_files and smiles here...
pdb_files = [download_pdb_file(PDB_id)]

# A purely numeric ligand is treated as a PubChem compound id and resolved to SMILES.
if str(SMILES_or_pubchem_id).isnumeric():
    ligand_smiles = download_smiles_str(SMILES_or_pubchem_id)
    if ligand_smiles is None:
        # Stop here: otherwise the literal text "None" would be written to the
        # CSV and docked as if it were a ligand.
        raise ValueError(f"Could not retrieve a SMILES string for PubChem id {SMILES_or_pubchem_id}")
else:
    ligand_smiles = SMILES_or_pubchem_id
smiless = [ligand_smiles]

# One CSV row per (protein file, ligand) pair; later cells read this file.
with open("/tmp/input_protein_ligand.csv", 'w') as out:
    out.write("protein_path,ligand\n")
    for pdb_file in pdb_files:
        for smiles in smiless:
            out.write(f"{pdb_file},{smiles}\n")

## Install prerequisites

In [None]:
# Install the package providing the `autotime` IPython extension, then load it.
# Presumably used to report how long each cell takes — confirm against the extension's docs.
!pip install ipython-autotime
%load_ext autotime

In [None]:
# Clone DiffDock only when /content/DiffDock is missing, so re-running this cell does not re-clone.
if not os.path.exists("/content/DiffDock"):
    %cd /content
    !git clone https://github.com/gcorso/DiffDock.git
    %cd /content/DiffDock
    # Pin the working tree to a fixed commit.
    !git checkout 0f9c419 # remove/update for more up to date code

In [None]:
# Version-pinned Python dependencies, installed one package per pip invocation.
# NOTE(review): confirm that the PyPI package named `pyg` is the one intended here;
# torch_geometric itself is installed separately in a later cell.
# NOTE(review): `torch==1.12.1+cu113` carries a local version tag; such builds are usually
# served from the PyTorch wheel index rather than PyPI, so this line may need an extra
# index URL to resolve — confirm.
!pip install pyg==0.7.1 --quiet
!pip install pyyaml==6.0 --quiet
!pip install scipy==1.7.3 --quiet
!pip install networkx==2.6.3 --quiet
!pip install biopython==1.79 --quiet
!pip install rdkit-pypi==2022.03.5 --quiet
!pip install e3nn==0.5.0 --quiet
!pip install spyrmsd==0.5.2 --quiet
!pip install pandas==1.3.5 --quiet
!pip install biopandas==0.4.1 --quiet
!pip install torch==1.12.1+cu113 --quiet

In [None]:
import torch

# Install the PyTorch Geometric stack only when torch_geometric cannot be imported,
# so a re-run with everything already present skips the installs.
try:
    import torch_geometric
except ModuleNotFoundError:
    # Remove any existing copies first, then install the companion packages from the
    # wheel index whose URL is built from the torch version that is actually imported.
    !pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
    !pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    # torch_geometric itself comes from the repository's default branch (unpinned).
    !pip install git+https://github.com/pyg-team/pytorch_geometric.git  --quiet # no version for some reason??

### Download 2GB PDBBind dataset
Unnecessary for inference, so the download command in the next cell is left commented out.

In [None]:
#!test -d /content/DiffDock/data/PDBBind_processed || (wget https://zenodo.org/record/6034088/files/PDBBind.zip && unzip -q PDBBind.zip && mv PDBBind_processed /content/DiffDock/data/)

## Install ESM and prepare PDB file for ESM

In [None]:
# Clone ESM inside the DiffDock checkout and install it in editable mode.
# Skipped entirely when /content/DiffDock/esm already exists.
if not os.path.exists("/content/DiffDock/esm"):
    %cd /content/DiffDock
    !git clone https://github.com/facebookresearch/esm
    %cd /content/DiffDock/esm
    # Pin the working tree to a fixed commit.
    !git checkout f07aed6 # remove/update for more up to date code
    !sudo pip install -e .
    # Return to the DiffDock root.
    %cd /content/DiffDock

In [None]:
# Run DiffDock's ESM preparation script on the protein/ligand CSV written earlier;
# its output FASTA (data/prepared_for_esm.fasta) is the input of the next cell.
%cd /content/DiffDock
!python datasets/esm_embedding_preparation.py --protein_ligand_csv /tmp/input_protein_ligand.csv --out_file data/prepared_for_esm.fasta 

In [None]:
%cd /content/DiffDock
# Redirect HOME to a directory inside the checkout — presumably so the downloaded
# ESM model weights are cached there; confirm.
%env HOME=esm/model_weights
# Make the cloned ESM sources importable by the script below.
%env PYTHONPATH=$PYTHONPATH:/content/DiffDock/esm
# Run ESM's extract script with model esm2_t33_650M_UR50D on the prepared FASTA,
# requesting layer 33 with the `per_tok` output and writing to data/esm2_output.
!python /content/DiffDock/esm/scripts/extract.py esm2_t33_650M_UR50D data/prepared_for_esm.fasta data/esm2_output --repr_layers 33 --include per_tok

## Run DiffDock

In [None]:
# Run DiffDock inference on the protein/ligand CSV: 20 inference steps, 40 samples per
# complex, batch size 10. Results go to results/user_predictions_small, which the
# download cell reads from.
%cd /content/DiffDock
!python -m inference --protein_ligand_csv /tmp/input_protein_ligand.csv --out_dir results/user_predictions_small --inference_steps 20 --samples_per_complex 40 --batch_size 10

## Download results

In [None]:
if download_results:
    from google.colab import files
    from glob import glob
    from shlex import quote

    %cd /content/DiffDock/results/user_predictions_small
    out_fs = []
    for pdb_file in pdb_files:
        !cp {pdb_file} .
        pdb_file_root = pdb_file.split("/")[-1]
        out_fs.append(f"{pdb_file_root}")
        for smiles in smiless:
            sglob = ''.join([c if c in "CONH" else "?" for c in smiles])
            out_fs += glob(f"*{pdb_file_root}*{sglob}*/rank*_*.sdf")

    out_fs_bash = ' '.join([quote(f) for f in out_fs])
    !tar cvf diffdock_results.tar {out_fs_bash}
    files.download("diffdock_results.tar")