In [1]:
%load_ext autoreload
%autoreload 1
%aimport kbody_transform

import tensorflow as tf
import numpy as np
import re
from sklearn.metrics import pairwise_distances

One of the test examples, $\textrm{C}_9\textrm{H}_7\textrm{N}$:

**Note: the unit of the energies should be eV.**

In [2]:
# A single structure in XYZ-like text form: the first line is the atom count,
# the second line is the total energy, and every following line is
# "<element> <x> <y> <z>" for one atom.
example = """17
-550.820680481
N     13.64290653      14.13624462       5.00000000  
C     14.15947101      10.84962323       5.00000000  
C     14.76810787       9.57195729       5.00000000  
C     15.48133668       8.52875020       5.00000000  
C     12.43268116      13.68893092       5.00000000  
C     12.80400662      11.04896203       5.00000000  
C     10.69871088      11.93562620       5.00000000  
C     10.49418002      10.46513468       5.00000000  
C     11.80136861       9.98973463       5.00000000  
C     12.02985377      12.30192209       5.00000000  
H     13.81935751      15.13241536       5.00000000  
H     14.45192832      13.52872145       5.00000000  
H     14.86239638      11.67891117       5.00000000  
H     11.63193585      14.42890524       5.00000000  
H      9.87504964      12.64680696       5.00000000  
H     12.06052266       8.93300535       5.00000000  
H     14.30308267       8.39805400       5.00000000  
"""

The helper function for parsing the example text into species, energies, and atomic coordinates, which are later converted to input features.

In [3]:
# Parsing configuration shared by `extract_xyz`: the regex for the energy line,
# the regex for a "<element> <x> <y> <z>" atom line, and the fixed sizes of the
# arrays that are filled in.
energy_patt = re.compile(r"([\w.-]+)")
string_patt = re.compile(r"([A-Za-z]+)\s+([\w.-]+)\s+([\w.-]+)\s+([\w.-]+)")
num_atoms = 17
num_structures = 1


def extract_xyz(string):
  """
  Parse XYZ-like text into species, energies and coordinates.

  Each structure is read as: an all-digit atom-count line, an energy line and
  then `num_atoms` atom lines. Blank lines and lines that do not match the
  expected pattern for the current state are skipped.

  Args:
    string: a `str` holding one or more structures.

  Returns:
    species: a `List[str]` with the element symbols of the first `num_atoms`
      atom lines.
    energies: a float64 array of shape `(num_structures, )`.
    coords: a float32 array of shape `(num_structures, num_atoms, 3)`.

  Raises:
    ValueError: if an atom-count line differs from `num_atoms`.

  """
  # Parser states: which kind of line is expected next.
  want_count, want_energy, want_atom = 0, 1, 2

  energies = np.zeros((num_structures, ), dtype=np.float64)
  coords = np.zeros((num_structures, num_atoms, 3), dtype=np.float32)
  species = []
  state = want_count
  atoms_read = 0

  for raw_line in string.split("\n"):
    # Structure index and atom slot both follow from the running atom count.
    struct_idx, atom_idx = divmod(atoms_read, num_atoms)
    if struct_idx == num_structures:
      break

    text = raw_line.strip()
    if not text:
      continue

    if state == want_count:
      if not text.isdigit():
        continue
      size = int(text)
      if size != num_atoms:
        raise ValueError("The parsed size %d != NUM_SITES" % size)
      state = want_energy

    elif state == want_energy:
      found = energy_patt.search(text)
      if found:
        energies[struct_idx] = float(found.group(1))
        state = want_atom

    elif state == want_atom:
      found = string_patt.search(text)
      if not found:
        continue
      symbol, x, y, z = found.groups()
      # Species are only collected once, from the first `num_atoms` atom lines.
      if len(species) < num_atoms:
        species.append(symbol)
      coords[struct_idx, atom_idx, :] = float(x), float(y), float(z)
      atoms_read += 1
      if atoms_read % num_atoms == 0:
        # Structure complete: go back to expecting an atom-count line.
        state = want_count

  return species, energies, coords

Initialize a `Transformer` and we can easily obtain input features and training targets.

In [4]:
# Parse the example text into species, energies and coordinates.
species, energies, coords = extract_xyz(example)
# Build the k-body transformer for these species with a 3-body expansion.
clf = kbody_transform.Transformer(species, many_body_k=3)
# `transform` yields the input features and training targets for the parsed
# structure (array shapes are not visible from this notebook -- TODO confirm).
features, target = clf.transform(coords, energies)

In [5]:
# Clear the default graph first so re-running this cell does not import the
# checkpoint graph on top of nodes left over from a previous run.
tf.reset_default_graph()
sess = tf.InteractiveSession()  
# Rebuild the saved graph from the .meta file, then load the variable values
# of the same checkpoint into the session.
saver = tf.train.import_meta_graph("./events/model.ckpt-500.meta")
saver.restore(sess, "./events/model.ckpt-500")  
# Keep handles to the graph and its GraphDef for the lookups in later cells.
graph = tf.get_default_graph()
graph_def = graph.as_graph_def()

In [23]:
# Names of all nodes in the imported GraphDef; useful for finding the names
# passed to `get_tensor_by_name` (a tensor name is the node name plus ":<index>").
tensor_names = [node.name for node in graph_def.node]

In [18]:
# Fetch the tensors needed for prediction from the restored graph by name;
# ":0" selects the first output of each named op.
extra_inputs = graph.get_tensor_by_name("placeholders/extra_inputs:0")
use_extra = graph.get_tensor_by_name("placeholders/use_extra_inputs:0")
is_predicting = graph.get_tensor_by_name("placeholders/is_predicting:0")
shuffle_batch = graph.get_tensor_by_name("input/shuffle_batch:0")
split_dims = graph.get_tensor_by_name("split_dims:0")
y_pred = graph.get_tensor_by_name("Outputs/squeeze:0")

In [12]:
# Static shape of the `shuffle_batch` tensor, used below to build a dummy
# zero-filled feed of the matching shape.
fixed_shape = shuffle_batch.get_shape().as_list()

In [14]:
# Reshape the single-structure features to (1, 1, n, 3), letting NumPy infer
# `n` instead of hard-coding it, then stack an all-zero entry of the same shape
# so the test batch holds two rows.
x_single = features.reshape((1, 1, -1, 3))
X_test = np.concatenate((x_single, np.zeros(x_single.shape, dtype=np.float32)))

In [15]:
# Sanity check: shape of the stacked test batch.
X_test.shape

(2, 1, 680, 3)

In [19]:
# Route the test batch through the `extra_inputs` placeholder. `shuffle_batch`
# is fed with zeros of its static shape -- presumably so the training input
# pipeline does not have to be evaluated; TODO confirm.
# NOTE(review): `_kbody_sizes` is a private attribute of the transformer;
# `CNNPredictor` below feeds `transformer.split_dims` for the same tensor --
# confirm the two are equivalent.
feed_dict = {
  use_extra: True, 
  is_predicting: True, 
  extra_inputs: X_test, 
  shuffle_batch: np.zeros(fixed_shape, np.float32),
  split_dims: clf._kbody_sizes
}

In [20]:
# Evaluate the model output for every row of the test batch.
sess.run(y_pred, feed_dict=feed_dict)

array([ 550.06317139,  548.69140625], dtype=float32)

In [21]:
# Training target produced by `clf.transform`, shown for comparison with the
# prediction of the first test row.
target

array([ 550.82068048])

In [35]:
def get_formula(species):
  """
  Build a formula string by concatenating the atomic symbols in their given
  order, with no separator.

  Args:
    species: a `List[str]` as the ordered atomic species.

  Returns:
    formula: a `str` as the concatenated symbols.

  """
  symbols = list(species)
  formula = "".join(symbols)
  return formula


class CNNPredictor(object):
  """
  An energy predictor based on the deep neural network of 'sum-kbody-cnn'.

  Every predictor owns a private TensorFlow graph and session, so importing a
  model neither depends on nor collides with whatever the notebook has already
  loaded into the default graph.
  """

  def __init__(self, species, many_body_k=3):
    """
    Initialization method.

    Args:
      species: a `List[str]` as the species for the trained model.
      many_body_k: a `int` as the many-body expansion.

    """
    self.transformer = kbody_transform.Transformer(species, many_body_k=many_body_k)
    # Dedicated graph + session: the meta graph is imported here, not into the
    # process-wide default graph.
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)
    self._default_num_atoms = len(species)
    self._formula = get_formula(species)
    # Flatten the per-term atom selections into one list that is indexed by the
    # position of a k-body term in the model output.
    # NOTE(review): relies on the iteration order of `transformer._selections`
    # matching the order of the terms in the features -- confirm.
    self._selections = []
    for cnk_selection in self.transformer._selections.values():
      self._selections.extend(cnk_selection)

  @property
  def many_body_k(self):
    """
    Return the `many_body_k` of the underlying transformer.
    """
    return self.transformer.many_body_k

  @property
  def ck2(self):
    """
    Return the `ck2` of the underlying transformer.
    """
    return self.transformer.ck2

  @property
  def cnk(self):
    """
    Return the `cnk` of the underlying transformer.
    """
    return self.transformer.cnk

  def import_model(self, model, **kwargs):
    """
    Import and restore the meta-model.

    Args:
      model: a string representing the model path. This file should correspond to the
        ckpt file with steps, eg `model.ckpt-8000`.
      kwargs: additional key-value arguments forwarded to `saver.restore`.

    """
    # Import into this predictor's own graph rather than the default graph.
    with self.graph.as_default():
      self.saver = tf.train.import_meta_graph("{}.meta".format(model))
    self.saver.restore(self.sess, model, **kwargs)

    # All tensors are looked up on the graph that was just populated, never on
    # a notebook-level global.
    graph = self.graph
    self.y_total_op = graph.get_tensor_by_name("Outputs/squeeze:0")
    self.y_kbody_op = graph.get_tensor_by_name("Contribs:0")
    self._extra_inputs = graph.get_tensor_by_name("placeholders/extra_inputs:0")
    self._use_extra = graph.get_tensor_by_name("placeholders/use_extra_inputs:0")
    self._is_predicting = graph.get_tensor_by_name("placeholders/is_predicting:0")
    self._split_dims = graph.get_tensor_by_name("split_dims:0")
    self._shuffle_batch = graph.get_tensor_by_name("input/shuffle_batch:0")
    # Zero-filled stand-in that is fed for the `shuffle_batch` tensor at
    # prediction time (attribute name kept as originally spelled).
    self._defaut_batch = np.zeros(self._shuffle_batch.get_shape().as_list(),
                                  dtype=np.float32)

  def _predict_same(self, coords):
    """
    Make the prediction(s) for the molecule(s) with the exactly same species
    with the trained model.

    Args:
      coords: a 2D `[num_atoms, 3]` or 3D `[num_mols, num_atoms, 3]` array as
        the atomic coordinates.

    Returns:
      y_total: a 1D array as the predicted total energies.
      y_atomic: a 2D array as the predicted atomic energies.

    Raises:
      ValueError: if `coords` is not 2D/3D or does not have the number of atoms
        this predictor was built for.

    """
    coords = np.asarray(coords)
    if coords.ndim == 2:
      # A single molecule: add the leading batch axis.
      coords = coords.reshape((1, ) + coords.shape)
    if (coords.ndim != 3 or coords.shape[1] != self._default_num_atoms
        or coords.shape[2] != 3):
      raise ValueError(
        "coords must have shape [{0}, 3] or [N, {0}, 3], got {1}".format(
          self._default_num_atoms, coords.shape))
    num_mols, num_atoms = coords.shape[0:2]

    # The transformer requires energies; zeros are passed as placeholders and
    # the returned targets are discarded.
    energies = np.zeros((num_mols, ), dtype=np.float64)
    features, _ = self.transformer.transform(coords, energies)
    features = features.reshape((num_mols, 1, -1, self.ck2))

    # Run on this predictor's own session (not a notebook-level `sess`).
    y_total, y_kbody = self.sess.run([self.y_total_op, self.y_kbody_op], feed_dict={
      self._extra_inputs: features,
      self._use_extra: True,
      self._is_predicting: True,
      self._shuffle_batch: self._defaut_batch,
      self._split_dims: self.transformer.split_dims,
    })

    # Charge the (negated) contribution of every k-body term to each atom that
    # takes part in it, for all molecules at once.
    contribs = y_kbody[:, 0, :, 0]
    atomic_energies = np.zeros((num_mols, num_atoms))
    for i in range(self.cnk):
      for j in self._selections[i]:
        atomic_energies[:, j] -= contribs[:, i]
    # NOTE(review): the per-atom share is obtained by dividing by `ck2`; for
    # many_body_k=3 this equals the number of atoms per term -- confirm that it
    # is the intended divisor for other values of k.
    return -y_total, atomic_energies / float(self.ck2)

  def _predict(self, species, coords):
    """
    Make the prediction for a molecule whose species differ from the trained
    model. Not implemented yet.

    Raises:
      NotImplementedError: always.

    """
    raise NotImplementedError(
      "Prediction is only supported for molecules whose ordered species match "
      "the trained model.")

  def predict(self, species, coords):
    """
    Make the prediction for the given molecule.

    Args:
      species: a `List[str]` as the ordered atomic species of a molecule.
      coords: a 2D array as the atomic coordinates of a molecule.

    Returns:
      total_energy: a 1D array as the predicted total energy.
      atomic_energies: a 2D array as the predicted energy for each atom.

    Raises:
      NotImplementedError: if the ordered species differ from those of the
        trained model.

    """
    if get_formula(species) == self._formula:
      return self._predict_same(coords)
    else:
      return self._predict(species, coords)

In [36]:
# Build the predictor for the example species and restore the trained weights.
# `CNNPredictor` exposes `import_model` (there is no `load` method).
calculator = CNNPredictor(species, many_body_k=3)
calculator.import_model("./events/model.ckpt-500")

In [40]:
%timeit total_energy, atomic_energies = calculator.predict(species, coords)

100 loops, best of 3: 3.77 ms per loop


In [38]:
# Predicted total energy, i.e. the first value returned by `calculator.predict`.
total_energy

-550.06317

In [39]:
# Predicted per-atom energies, i.e. the second value returned by
# `calculator.predict`.
atomic_energies

array([[  0.        , -20.32864237, -20.33601205, -20.34325353,
        -20.35279616, -20.36083968, -20.37289699, -20.38856101,
        -20.41240295, -20.45897953, -52.36914881, -52.37460534,
        -52.38697068, -52.39125331, -52.39379803, -52.39618492,
        -52.39680743]])