### Process ligands

For each of the DUD-E targets, process its associated ligands (stored in the `.mol2` format).
Save each processed ligand in the PyTorch `Data` format.

In [2]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
from progressbar import progressbar
from torch_geometric.data import Data
from biopandas.mol2 import split_multimol2

<IPython.core.display.Javascript object>

In [3]:
# We will write each of the ligand Data files to the "raw" directory.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs("../data/raw", exist_ok=True)

<IPython.core.display.Javascript object>

In [4]:
# Read the DUD-E target table and keep the lower-cased target names.
all_targets = [
    name.lower()
    for name in pd.read_csv("../data/dud-e_targets.csv").target_name.tolist()
]

<IPython.core.display.Javascript object>

In [5]:
def filter_bonds(bonds, non_h_indices):
    """Return the bonds between non-hydrogen atoms, re-indexed.

    Parameters
    ----------
    bonds : sequence of pairs
        Each entry holds the two atom indices joined by a bond, expressed in
        the original (hydrogen-inclusive) numbering.
    non_h_indices : sequence of int
        Original indices of the atoms that are kept.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (num_kept_bonds, 2). Each row is a bond whose
        two endpoints are both kept, with the endpoints renumbered to their
        position within `non_h_indices`. The shape is (0, 2) when no bond
        survives the filter.
    """
    # Original atom index -> position among the kept atoms. A dict gives
    # constant-time membership tests, instead of scanning the index array
    # once per bond endpoint.
    index_mapping = {
        atom_index: position for position, atom_index in enumerate(non_h_indices)
    }
    kept_bonds = [
        [index_mapping[entry[0]], index_mapping[entry[1]]]
        for entry in bonds
        if entry[0] in index_mapping and entry[1] in index_mapping
    ]
    # reshape keeps the result two-dimensional even when nothing is kept.
    return np.array(kept_bonds, dtype=int).reshape(-1, 2)

<IPython.core.display.Javascript object>

In [6]:
def get_distance(coords, atom_a, atom_b):
    """Return the Euclidean (L2) distance between two rows of `coords`.

    `atom_a` and `atom_b` are row indices into the coordinate array.
    """
    displacement = coords[atom_a, :] - coords[atom_b, :]
    return np.linalg.norm(displacement)


def get_edge_props(bonds, coords):
    """Build the bidirectional edge list and per-edge distance attributes.

    Every bond (a, b) contributes two directed edges, a -> b and b -> a.
    The edge list is returned as two parallel lists (sources, destinations).
    The attribute list holds one single-element `[distance]` entry per
    directed edge, in the same order.
    """
    sources, destinations, edge_attribs = [], [], []
    for bond in bonds:
        atom_a, atom_b = bond[0], bond[1]
        # Record the bond once in each direction.
        sources.extend((atom_a, atom_b))
        destinations.extend((atom_b, atom_a))
        # Both directions share the same bond length.
        bond_length = [get_distance(coords, atom_a, atom_b)]
        edge_attribs.extend((bond_length, bond_length))
    edge_list = np.array([sources, destinations]).astype(int).tolist()
    return edge_list, edge_attribs

<IPython.core.display.Javascript object>

In [7]:
# Integer code for each atom element name, used when encoding atoms
# into the Data format. Codes start at 1 and follow the order below.
atom_mapping = {
    element: code
    for code, element in enumerate(
        ("Br", "C", "Cl", "F", "I", "N", "O", "P", "S", "Si", "H"), start=1
    )
}

<IPython.core.display.Javascript object>

In [8]:
def _parse_mol2_record(lines):
    """Extract atom element names, coordinates and bonds from one mol2 record.

    `lines` is the list of text lines of a single molecule. The layout is
    taken to be: the atom count as the first field of line 2, atom rows
    starting at line 7, then a one-line section header followed by bond rows.

    Returns `(atoms, coords, bonds)`:
    - atoms: element name per atom (the atom-type field up to the first ".").
    - coords: float array with one row of three coordinates per atom.
    - bonds: list of `[atom_a, atom_b]` pairs, shifted to zero-based indices.
    """
    num_atoms = int(lines[2].split()[0])
    # str.split() already discards the trailing newline, so the lines need
    # no manual trimming.
    rows = [line.split() for line in lines[7:]]
    atom_rows = rows[:num_atoms]
    atoms = [row[5].split(".")[0] for row in atom_rows]
    coords = np.array([row[2:5] for row in atom_rows]).astype(float)

    bonds = []
    # The "+ 1" skips the section header that sits between atoms and bonds.
    for row in rows[(num_atoms + 1) :]:
        if not row:
            # Blank lines (e.g. at the end of a record) carry no bond.
            continue
        if row[0].startswith("@"):
            # Another section begins, so the bond table is over.
            break
        # Fields 1 and 2 are the bonded atom ids; subtract 1 to get
        # zero-based indices.
        bonds.append([int(row[1]) - 1, int(row[2]) - 1])
    return atoms, coords, bonds


def process_ligands(target):
    """Save the ligands associated with the target to disk.

    Reads `actives_final.mol2` and `decoys_final.mol2` from the target's
    directory under `../data/unproc`, converts every ligand to a hydrogen-free
    `Data` graph, and pickles the resulting `{ligand_code: Data}` dictionary
    to `../data/raw/{target}_ligand_dict.pkl`.

    Returns a dictionary mapping each ligand code to 1 if it came from the
    actives file and 0 if it came from the decoys file. If a ligand code
    occurs more than once, the last occurrence wins.
    """
    # Maps each associated ligand's code to whether it interacts
    # with the target (1) or not (0).
    responses = {}
    # The dictionary to be written to disk; maps each associated
    # ligand's code to its Data object.
    ligand_dict = {}
    for fname in ["actives_final.mol2", "decoys_final.mol2"]:
        # Only the actives file name starts with "a".
        y = int(fname.startswith("a"))
        # split_multimol2 yields one (ligand code, text lines) entry
        # per ligand in the multi-molecule file.
        for entry in split_multimol2(f"../data/unproc/{target}/{fname}"):
            ligand_code, lines = entry[0], entry[1]
            atoms, coords, bonds = _parse_mol2_record(lines)
            # Filter out Hydrogen atoms from all lists.
            non_h_indices = np.where(np.array(atoms) != "H")[0]
            heavy_atoms = (np.array(atoms)[non_h_indices]).tolist()
            # An element missing from atom_mapping raises KeyError here.
            node_features = [[atom_mapping[atom]] for atom in heavy_atoms]
            coords = coords[non_h_indices, :]
            bonds = filter_bonds(bonds, non_h_indices)
            # Convert the bond list to an edge list, and get the edge attributes.
            edge_list, edge_attribs = get_edge_props(bonds, coords)
            # Create a Data object out of the ligand information, and add it to
            # the ligand dictionary for this target.
            ligand_dict[ligand_code] = Data(
                x=torch.tensor(node_features),
                edge_index=torch.LongTensor(edge_list),
                edge_attr=torch.tensor(edge_attribs),
                y=torch.tensor([[y]]),
                ligand_code=ligand_code,
                target=target,
            )
            responses[ligand_code] = y

    # Save the dictionary of ligand Data objects to disk.
    with open(f"../data/raw/{target}_ligand_dict.pkl", "wb") as f:
        pickle.dump(ligand_dict, f)
    # Return the dictionary of responses, to be integrated into
    # a larger dictionary.
    return responses

<IPython.core.display.Javascript object>

In [10]:
def process_all_targets(all_targets):
    """Process every target's ligands and save the combined responses.

    Calls `process_ligands` for each target (with a progress bar) and pickles
    the resulting `{target: {ligand_code: response}}` dictionary to
    `../data/raw/ligand_responses.pkl`.
    """
    # Per-target response dictionaries, keyed by target name.
    ligand_responses = {
        target: process_ligands(target) for target in progressbar(all_targets)
    }

    with open("../data/raw/ligand_responses.pkl", "wb") as out_file:
        pickle.dump(ligand_responses, out_file)

<IPython.core.display.Javascript object>

In [11]:
# Process every DUD-E target. The recorded run below took roughly
# 40 minutes for 102 targets.
process_all_targets(all_targets)

100% (102 of 102) |######################| Elapsed Time: 0:40:36 Time:  0:40:36


<IPython.core.display.Javascript object>