In [49]:
from glob import glob

from tqdm import tqdm
import numpy as np

import g2s

# Data Preparation and Featurization
To train Graph-To-Structure, you need a bond order and distances matrix and the corresponding nuclear charges of the system.

You can get a bond order matrix either through SMILES/SMARTS/SELFIES or you can use some of the graph extraction tools explained
in the graph extraction tutorial.

In [50]:
test_mols = np.load('../tests/test_files/test_molecules.npz', allow_pickle=True)
bond_order_matrix, distances, nuclear_charges = test_mols['adj'], test_mols['distances'], test_mols['nuclear_charges']

The next step requires the featurization of our molecules. 

Since the example molecules are of different sizes, we have to apply zero padding
to the representation as well as the distance matrix! 

For that we need to know the largest number of heavy atoms in the dataset:

In [51]:
max_natoms = max([len(np.where(np.array(z) != 1)[0]) for z in nuclear_charges])
print(f'Largest molecule has size: {max_natoms}')

Largest molecule has size: 11


An important step in G2S is the separation of the heavy-atom scaffold from all the hydrogens. 

Heavy atom distances are predicted separately from hydrogens, for which reason these two are separate machine learning problems in G2S.

Disclaimer: In case a molecule has less than 4 heavy atoms, atoms are not separated. (Check the first molecule in the set!)

You can decide for yourself how to add hydrogens later on. Either through G2S or by using tools like RDkit or Open Babel.

We featurize the heavy atom scaffold using the bond length representation, which is the sum of the bond lengths on the shortes path between two atoms.

In [52]:
# Initialize the Graph compound class
gc = g2s.GraphCompound(bond_order_matrix[3], nuclear_charges[3], distances[3])

# Filter heavy atoms
gc.filter_atoms('heavy')

# Compute Bond Length matrix for the heavy atom scaffold
# To ensure permutational invariance, the matrix is sorted by its row-norm
gc.generate_bond_length(size=max_natoms, sorting='row-norm')
print('Zero-padded, flattened bond length representation:')
print(gc.representation)

# For the prediction of hydrogens, compute a local hydrogen matrix
gc.generate_local_hydrogen_matrix()

print('Local hydrogen environment representation:')
print(gc.hydrogen_representations)
print('Closest 4 heavy atom to hydrogen distances:')
print('Last value is the H-H distance in case multiple hydrogens are attached.')
print(gc.hydrogen_heavy_distances)
print('Central heavy atom, hydrogen and neighbour indices')
print(gc.heavy_hydrogen_mapping)

Zero-padded, flattened bond length representation:
[1.5  1.47 1.47 2.64 2.82 2.89 3.   0.   0.   0.   2.97 2.97 4.14 4.32
 1.39 1.5  0.   0.   0.   2.94 1.17 1.35 4.36 4.47 0.   0.   0.   4.11
 4.29 4.36 4.47 0.   0.   0.   2.52 5.53 5.64 0.   0.   0.   5.71 5.82
 0.   0.   0.   2.89 0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
Local hydrogen environment representation:
[[1.47 1.07 1.47 1.5  2.64 2.54 2.94 2.97 1.17 2.54 2.57 3.71 2.97 4.11
  4.14]
 [1.5  1.07 1.39 1.5  2.97 2.57 2.89 3.   1.47 2.46 2.57 4.04 2.89 4.36
  4.47]
 [1.47 1.47 1.5  2.51 2.64 2.94 2.97 3.98 1.17 2.97 1.04 4.11 4.01 4.14
  5.15]
 [1.5  1.5  1.39 2.57 2.97 3.   2.89 4.07 1.47 2.89 1.07 4.47 3.96 4.36
  5.54]]
Closest 4 heavy atom to hydrogen distances:
Last value is the H-H distance in case multiple hydrogens are attached.
[[1.10122396 2.06804614 2.15768785 2.17006917 0.        ]
 [1.0938294  2.01014136 2.14604467 2.17602421 0.        ]
 [1.00978321 2.00464599 3.27224224 2.53945783 1.656134  ]
 [1.09228214 2.

  return np.array(local_h_repr), np.array(heavy_hydrogen_mapping), np.array(hydrogen_heavy_distances)


### Now we repeat the procedure for all molecules in our dataset

In [53]:
# Heavy Atom Stuff
representations = []
padded_nuclear_charges = []
padded_distances = []
adj_mat = []
nucl_ch = []

# Hydrogen Stuff
local_hydrogen = []
heavy_hydrogen_mapping = []
hydrogen_heavy_dist = []
for bo, dist, nc, in zip(bond_order_matrix, distances, nuclear_charges):
    
    # Initialize the Graph compound class
    gc = g2s.GraphCompound(bo, nc, dist)
    
    # Filter heavy atoms
    gc.filter_atoms('heavy')
    
    # Compute Bond Length matrix for the heavy atom scaffold
    # To ensure permutational invariance, the matrix is sorted by its row-norm
    gc.generate_bond_length(size=max_natoms, sorting='row-norm')
    
    # For the prediction of hydrogens, compute a local hydrogen matrix
    gc.generate_local_hydrogen_matrix()
    
    representations.append(gc.representation)
    padded_nuclear_charges.append(gc.nuclear_charges)
    padded_distances.append(gc.distances)
    adj_mat.append(gc.full_adjacency_matrix)
    nucl_ch.append(gc.full_nuclear_charges)

    local_hydrogen.append(gc.hydrogen_representations)
    heavy_hydrogen_mapping.append(gc.heavy_hydrogen_mapping)
    hydrogen_heavy_dist.append(gc.hydrogen_heavy_distances)

representations = np.array(representations)
padded_nuclear_charges = np.array(padded_nuclear_charges)
padded_distances = np.array(padded_distances)
local_hydrogen = np.array(local_hydrogen)
heavy_hydrogen_mapping = np.array(heavy_hydrogen_mapping)
hydrogen_heavy_dist = np.array(hydrogen_heavy_dist)

  padded_nuclear_charges = np.array(padded_nuclear_charges)
  local_hydrogen = np.array(local_hydrogen)
  heavy_hydrogen_mapping = np.array(heavy_hydrogen_mapping)
  hydrogen_heavy_dist = np.array(hydrogen_heavy_dist)


## Machine Learning

In this example, we use Kernel Ridge Regression (KRR) for the learning of distance matrices.

Specifically, we use a single kernel machine per distance matrix element.

Since the representation and therefore the kernel matrix is independent of the distance matrix elements, only a singel kernel inversion has to be performed, which leaves 
n dot products to perform (for n distance matrix elements).

In [54]:
kernel_sigma = 32
kernel_lambda = 1e-5

train_kernel = g2s.krr.laplacian_kernel(representations, representations, 32)
train_kernel[np.diag_indices_from(train_kernel)] += 1e-5
alphas = g2s.krr.train_multikernel(train_kernel, padded_distances)

pred_distances = g2s.krr.predict_distances(train_kernel, alphas)

100%|██████████| 55/55 [00:00<00:00, 184254.57it/s]


For more numerical stability, instead of the multikernel approach a standard cholesky decomposition can also be performed. 

In [55]:
cho_alphas = g2s.krr.train_cholesky(train_kernel, padded_distances)
cho_pred_distances = g2s.krr.predict_distances(train_kernel, cho_alphas)

100%|██████████| 55/55 [00:00<00:00, 13613.05it/s]


Similar logic applies to the learning of local heavy atom to hydrogen distances.
Don't forget to stack your local environments before training!

In [56]:
print(local_hydrogen.shape)
hydrogen_representation = np.vstack(local_hydrogen)
hydrogen_distances = np.vstack(hydrogen_heavy_dist)
print(hydrogen_representation.shape)

(5,)
(13, 15)


In [57]:
train_kernel = g2s.krr.laplacian_kernel(hydrogen_representation, hydrogen_representation, 32)
train_kernel[np.diag_indices_from(train_kernel)] += 1e-7
hydro_alphas = g2s.krr.train_multikernel(train_kernel, hydrogen_distances)

pred_hydrogen_distances = g2s.krr.predict_distances(train_kernel, hydro_alphas)

100%|██████████| 5/5 [00:00<00:00, 57456.22it/s]


## Geometry Reconstruction, Solving the Distance Geometry Problem

To convert the predicted distance matrices, we have to solve the Distance Geometry Problem!

This problem deals with embedding points in a space, given a specified distance boundary (our predicted distances matrix).

In G2S, we use a software called DGSOL, which solves the DGP even for noisy and/or sparse distance matrices (such as obtained from macromolecular NMR experiments).

Make sure you have added the dgsol binary to your $PATH

In [58]:
# Make sure to filter non-zero distances from the zero-padding we had to apply for training!
nz_pred_distances = g2s.utils.filter_nonzero_distances(pred_distances, padded_nuclear_charges)
dgsol = g2s.dgeom.DGSOL(nz_pred_distances, padded_nuclear_charges, vectorized_input=False)
dgsol.solve_distance_geometry('./test_molecules')

100%|██████████| 5/5 [00:00<00:00, 49.38it/s]


Now you can access the coordinates using `dgsol.coords`

If you want to write xyz's files to disc, you can use some helper functions in G2S such as:

In [59]:
from g2s.utils import write_xyz
write_xyz('./test0.xyz', dgsol.coords[0], dgsol.nuclear_charges[0])

However, the hydrogens are still missing! Let's add them shall we?

In [60]:
from g2s.dgeom import hydrogen_lebedev_reconstruction
from g2s.utils import combine_heavy_hydrogen_coords

The function hydrogen_lebedev_reconstruction constructs a lebedev sphere around a central heavy atom and 
places hydrogens on top of this sphere given the distance constraints we have predicted with our machine.

In [61]:
for i, e in enumerate(sorted(glob('./test_molecules/*'))):
    if heavy_hydrogen_mapping[i].size > 0.:
        h_coords = hydrogen_lebedev_reconstruction(dgsol.coords[i], hydrogen_heavy_dist[i], heavy_hydrogen_mapping[i])
        heavy_c, all_nc = combine_heavy_hydrogen_coords(dgsol.coords[i], h_coords, padded_nuclear_charges[i])
        write_xyz(f'{e}/dgsol_h.xyz', heavy_c, all_nc)
    else:
        write_xyz(f'{e}/dgsol_h.xyz', dgsol.coords[i], dgsol.nuclear_charges[i])

Check out your molecules at `./test_molecules/`!

If you `pip install py3Dmol` you can visualize molecules in juypter notebooks:

In [62]:
# import py3Dmol
# with open('./test_molecules/0001/dgsol_h.xyz', 'r') as infile:
#     xyz = infile.readlines()
# xyz = ''.join(xyz)
# xyzview = py3Dmol.view(width=400,height=400)
# xyzview.addModel(xyz,'xyz')
# xyzview.setStyle({'stick':{'colorscheme':'cyanCarbon'}})
# xyzview.zoomTo()
# xyzview.show()

The G2S package also provides an interface to saturate a molecule using rdkit

In [63]:
from g2s.dgeom.rdkit import embed_hydrogens

In [64]:
for i, e in enumerate(sorted(glob('./test_molecules/*'))):
    if heavy_hydrogen_mapping[i].size > 0.:
        try:
            embedded_coords, embedded_nuclear_charges, new_adj_mat = embed_hydrogens(adj_mat[i], nucl_ch[i], dgsol.coords[i])
        except Exception as err:
            print(f'Structure number {i} failed in RDkit for the following reason:')
            print(err)
            print('------------------')
            continue
        write_xyz(f'{e}/rdkit.xyz', embedded_coords, embedded_nuclear_charges)
    else:
        write_xyz(f'{e}/rdkit.xyz', dgsol.coords[i], dgsol.nuclear_charges[i])

Structure number 1 failed in RDkit for the following reason:
Explicit valence for atom # 1 N, 4, is greater than permitted
------------------
Structure number 2 failed in RDkit for the following reason:
Explicit valence for atom # 2 N, 4, is greater than permitted
------------------
Structure number 3 failed in RDkit for the following reason:
Explicit valence for atom # 2 N, 4, is greater than permitted
------------------


RDKit ERROR: [13:08:20] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [13:08:20] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [13:08:20] Explicit valence for atom # 2 N, 4, is greater than permitted


RDkit in general often struggles with Nitrogen's (as you see in the example above). 

Be careful when you use this functionality! RDkit might crash!

(Which is another the reason to use G2S instead)