In [1]:
from glob import glob

import numpy as np
from tqdm import tqdm

import g2s
from g2s.graph_extractor import xyz2mol_graph
from g2s.representations.locality import get_local_environment, construct_sparse_dm, get_local_knn_environment




In [81]:
# Sorted list of all xyz files found in the H10C7O2 data directory.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
ci_path = sorted(glob('/Users/c0uch1/work/g2g_data/qm9/H10C7O2/*.xyz'))

In [96]:
def get_local_knn_environment(idx, adjacency_matrix, representation, nuclear_charges, distances=None, depth=2, sorting='norm_row'):
    """
    Collect the graph neighbourhood of one atom up to ``depth`` bonds away.

    Parameters
    ----------
    idx: int
        Index of the query atom.
    adjacency_matrix: np.array, shape (n, n)
        Adjacency matrix used to build the graph.
    representation: np.array
        Indexed as ``representation[idx][neighbours]``.
    nuclear_charges: np.array
        Indexed with the neighbour indices and with ``idx``.
    distances: np.array, optional
        Indexed as ``distances[idx][neighbours]`` when given.
    depth: int (default=2)
        Number of neighbour shells to include.
    sorting: str (default='norm_row')
        With 'norm_row' the neighbours are ordered by ascending
        representation value; any other value keeps index order.

    Returns
    -------
    atomic_representation: np.array
        Representation entries between the query atom and its neighbours.
    local_nuclear_charges: np.array
        Charge of the query atom followed by the neighbour charges.
    local_idxs: np.array
        ``idx`` followed by the neighbour indices.
    local_distances: np.array
        Only returned when ``distances`` is given.
    """
    # Imported locally so the function does not rely on a hidden kernel import.
    import igraph

    graph = igraph.Graph()
    graph = graph.Adjacency(list(adjacency_matrix.astype(float)))

    # First shell: atoms directly connected to the query atom.
    frontier = list(graph.neighbors(idx, mode='OUT'))
    collected = list(frontier)
    shell = 1
    while shell < depth:
        # Everything connected to the previous shell, except the query atom itself.
        reached = [m for n in frontier for m in graph.neighbors(n, mode='OUT') if m != idx]
        frontier = np.unique(reached)
        collected.extend(frontier)
        shell += 1

    # np.unique([]) yields float64; cast so an atom without neighbours
    # produces empty results instead of an IndexError.
    neighbor_idxs = np.unique(collected).astype(int)

    atomic_representation = representation[idx][neighbor_idxs]
    local_nuclear_charges = nuclear_charges[neighbor_idxs]
    local_distances = distances[idx][neighbor_idxs] if distances is not None else None

    if sorting == 'norm_row':
        order = np.argsort(atomic_representation)
        atomic_representation = atomic_representation[order]
        local_nuclear_charges = local_nuclear_charges[order]
        neighbor_idxs = neighbor_idxs[order]
        if local_distances is not None:
            local_distances = local_distances[order]

    # The query atom always comes first in the charge and index arrays.
    local_nuclear_charges = np.array([nuclear_charges[idx], *local_nuclear_charges])
    local_idxs = np.array([idx, *neighbor_idxs])

    if distances is not None:
        return atomic_representation, local_nuclear_charges, local_idxs, local_distances
    return atomic_representation, local_nuclear_charges, local_idxs


In [83]:
def prep_data(filepath, size=19, depth=2, n_hydrogen_neighs=4):
    """
    Build the local atomic environments of one molecule read from an xyz file.

    Parameters
    ----------
    filepath: str
        Path of the xyz file passed to ``xyz2mol_graph``.
    size: int (default=19)
        Forwarded to ``GraphCompound.generate_bond_length``.
    depth: int (default=2)
        Neighbour-shell depth used for the heavy-atom environments.
    n_hydrogen_neighs: int (default=4)
        ``n_neighs`` passed to ``get_local_environment`` for hydrogens.

    Returns
    -------
    mol_data: dict
        Per-atom lists for heavy atoms ('ha_lr', 'ha_lnc', 'ha_ldist',
        'ha_lidxs') and hydrogens ('hy_lr', 'hy_lnc', 'hy_ldist', 'hy_lidxs'),
        plus 'nuclear_charges', 'heavy_atom_idxs', 'hydrogen_idxs' and the
        square distance matrix 'full_dm'.
    """
    bond_order_matrix, nuclear_charges, coords = xyz2mol_graph(filepath)

    distances = g2s.utils.calculate_distances(coords)
    gc = g2s.GraphCompound(bond_order_matrix, nuclear_charges, distances)
    gc.generate_bond_length(size=size, sorting='norm_row')

    adjacency = gc.adjacency_matrix
    nuclear_charges = gc.nuclear_charges
    # Expand the compound's representation and distances to square matrices.
    representation = g2s.utils.vector_to_square([gc.representation])[0]
    distances = g2s.utils.vector_to_square([gc.distances])[0]

    hydrogen_idxs = np.where(nuclear_charges == 1)[0]
    heavy_atom_idxs = np.where(nuclear_charges != 1)[0]

    # Heavy-atom sub-blocks; indices into them are positions within heavy_atom_idxs.
    ha_representation = representation[heavy_atom_idxs][:, heavy_atom_idxs]
    ha_dm = distances[heavy_atom_idxs][:, heavy_atom_idxs]
    ha_adj = adjacency[heavy_atom_idxs][:, heavy_atom_idxs]
    ha_nc = nuclear_charges[heavy_atom_idxs]

    ha_local_repr, ha_local_nc, ha_local_dist, ha_local_idxs = [], [], [], []
    for i in range(len(heavy_atom_idxs)):
        local_repr, local_nc, local_idxs, local_dist = get_local_knn_environment(
            i, ha_adj, ha_representation, ha_nc, distances=ha_dm, depth=depth)
        ha_local_repr.append(local_repr)
        ha_local_dist.append(local_dist)
        ha_local_nc.append(local_nc)
        ha_local_idxs.append(local_idxs)

    # Hydrogen environments are taken from the full molecule, not the heavy-atom block.
    hy_local_repr, hy_local_nc, hy_local_dist, hy_local_idxs = [], [], [], []
    for i in hydrogen_idxs:
        local_repr, local_nc, local_idxs, local_dist = get_local_environment(
            i, representation, nuclear_charges, distances, n_neighs=n_hydrogen_neighs)
        hy_local_repr.append(local_repr)
        hy_local_nc.append(local_nc)
        hy_local_dist.append(local_dist)
        hy_local_idxs.append(local_idxs)

    mol_data = {'ha_lr': ha_local_repr,
                'ha_lnc': ha_local_nc,
                'ha_ldist': ha_local_dist,
                'ha_lidxs': ha_local_idxs,
                'hy_lr': hy_local_repr,
                'hy_lnc': hy_local_nc,
                'hy_ldist': hy_local_dist,
                'hy_lidxs': hy_local_idxs,
                'nuclear_charges': nuclear_charges,
                'heavy_atom_idxs': heavy_atom_idxs,
                'hydrogen_idxs': hydrogen_idxs,
                'full_dm': distances}

    return mol_data

def get_unique_environments(local_environments, local_idxs, local_distances=None):
    """
    Group local environments that have an identical representation.

    Parameters
    ----------
    local_environments: sequence of array-like
        One representation per atom; ``tuple(representation)`` is the group key.
    local_idxs: sequence
        Entry ``i`` is stored under the key of environment ``i``.
    local_distances: sequence, optional
        Entry ``i`` is stored under the key of environment ``i``; when given,
        each group also receives the element-wise mean as 'avg_dist'.

    Returns
    -------
    dict mapping each key to {'local_idxs': [...], 'local_dist': [...]}
    (plus 'avg_dist' when distances were supplied).
    """
    has_distances = local_distances is not None
    grouped = {}
    for position, environment in enumerate(local_environments):
        bucket = grouped.setdefault(tuple(environment), {'local_idxs': [], 'local_dist': []})
        bucket['local_idxs'].append(local_idxs[position])
        if has_distances:
            bucket['local_dist'].append(local_distances[position])

    if has_distances:
        for bucket in grouped.values():
            bucket['avg_dist'] = np.array(bucket['local_dist']).mean(axis=0)
    return grouped
    

In [97]:
# Parse every xyz file into its dictionary of local environments.
mol_envs = [prep_data(xyz_file) for xyz_file in tqdm(ci_path)]

100%|██████████| 6095/6095 [00:43<00:00, 139.02it/s]


In [98]:
# Per-molecule lists of local representations, local indices and local distances,
# split into heavy-atom (ha_*) and hydrogen (hy_*) environments.
ha_lr = [entry['ha_lr'] for entry in mol_envs]
hy_lr = [entry['hy_lr'] for entry in mol_envs]
ha_lidxs = [entry['ha_lidxs'] for entry in mol_envs]
hy_lidxs = [entry['hy_lidxs'] for entry in mol_envs]
ha_ldist = [entry['ha_ldist'] for entry in mol_envs]
hy_ldist = [entry['hy_ldist'] for entry in mol_envs]

In [101]:
# Heavy-atom local distance arrays of the first molecule (one array per heavy atom).
ha_ldist[0]

[array([1.427608  , 1.50810294, 1.53775736, 1.53445536, 2.62658106,
        2.4587894 ]),
 array([1.36075965, 1.36483552, 1.50810294, 2.19578119, 2.23951927,
        2.38146468, 2.50751839, 2.5014679 ]),
 array([1.36075965, 1.43508055, 2.23100009, 2.62658106, 2.23114235]),
 array([1.36483552, 1.36277714, 2.23100009, 2.23252109, 2.4587894 ]),
 array([1.53775736, 2.43775567, 2.50751839, 2.52940696]),
 array([1.53445536, 2.35871884, 2.5014679 , 2.52940696]),
 array([1.427608  , 2.38146468, 2.43775567, 2.35871884]),
 array([1.35629504, 1.36277714, 2.19578119, 2.23114235]),
 array([1.35629504, 1.43508055, 2.23252109, 2.23951927])]

In [102]:
def get_unique_environments(local_environments, local_idxs, local_distances=None):
    """
    Group identical local environments across a collection of molecules.

    Parameters
    ----------
    local_environments: sequence of sequences
        ``local_environments[j][i]`` is the representation of atom ``i`` in
        molecule ``j``; ``tuple(representation)`` is the group key.
    local_idxs: sequence of sequences
        Same nesting; entry ``[j][i]`` is stored under the matching key.
    local_distances: sequence of sequences, optional
        Same nesting; when given, each group also receives the element-wise
        mean of its collected distances as 'avg_dist'.

    Returns
    -------
    dict mapping each key to {'local_idxs': [...], 'local_dist': [...]}
    (plus 'avg_dist' when distances were supplied).
    """
    has_distances = local_distances is not None
    grouped = {}
    for mol_no, environments in enumerate(local_environments):
        for atom_no, environment in enumerate(environments):
            bucket = grouped.setdefault(tuple(environment), {'local_idxs': [], 'local_dist': []})
            bucket['local_idxs'].append(local_idxs[mol_no][atom_no])
            if has_distances:
                bucket['local_dist'].append(local_distances[mol_no][atom_no])

    if has_distances:
        for bucket in grouped.values():
            bucket['avg_dist'] = np.array(bucket['local_dist']).mean(axis=0)
    return grouped


In [103]:
# Group identical local representations over all molecules and average their distances,
# separately for heavy-atom and hydrogen environments.
uq_ha_env = get_unique_environments(ha_lr, ha_lidxs, local_distances=ha_ldist)
uq_hy_env = get_unique_environments(hy_lr, hy_lidxs, local_distances=hy_ldist)

In [48]:
# Number of molecules times 9.  # presumably 9 heavy atoms per molecule — confirm
len(ha_lr)*9

54855

In [104]:
# Number of distinct heavy-atom environments.
len(uq_ha_env)

471

In [74]:
# Each unique environment key is a training input; its averaged distances are the label.
training_ha_envs = np.array(list(uq_ha_env))
training_ha_labels = np.array([entry['avg_dist'] for entry in uq_ha_env.values()])

training_hy_envs = np.array(list(uq_hy_env))
training_hy_labels = np.array([entry['avg_dist'] for entry in uq_hy_env.values()])

In [71]:
# Kernel width and diagonal regulariser.
kernel_sigma = 32
kernel_lambda = 1e-5

# NOTE(review): `training_envs` and `training_labels` are not assigned in any visible
# cell; only the training_ha_* / training_hy_* arrays are. Confirm which set is meant.
train_kernel = g2s.krr.laplacian_kernel(training_envs, training_envs, kernel_sigma)
# Regularise by adding kernel_lambda to the kernel diagonal.
train_kernel[np.diag_indices_from(train_kernel)] += kernel_lambda
alphas = g2s.krr.train_multikernel(train_kernel, training_labels)

pred_distances = g2s.krr.predict_distances(train_kernel, alphas)

100%|██████████| 6/6 [00:00<00:00, 8968.58it/s]


In [49]:
# Element-wise standard deviation of the distances collected for each unique
# heavy-atom environment.
stdev = [np.array(entry['local_dist']).std(axis=0) for entry in uq_ha_env.values()]

In [65]:
# Largest standard deviation found in any environment (floor of 1e-10).
n_max = 1e-10
for env_std in stdev:
    n_max = max(n_max, max(env_std))

In [69]:
# Flatten the per-environment standard deviations into a single list.
std = [item for sublist in stdev for item in sublist]

In [53]:
import matplotlib.pyplot as plt
%matplotlib inline


Bad key "text.kerning_factor" on line 4 in
/Users/c0uch1/.miniconda3/envs/wk/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [75]:
# Mean of all flattened standard deviations.
np.mean(std)

0.012252212176154806

In [105]:
def construct_sparse_dm(local_distances, local_idxs, n_atoms, n_neighs=4):
    """
    Assemble a symmetric distance matrix from overlapping local environments.

    For every environment the atom pairs are enumerated in upper-triangle
    order of its index list, and the k-th distance is assigned to the k-th
    pair. Pairs seen in several environments are averaged; pairs never seen
    stay 0.

    Parameters
    ----------
    local_distances: sequence of array-like
        One distance vector per environment.
    local_idxs: sequence of array-like
        Atom indices of each environment.
    n_atoms: int
        Size of the resulting square matrix.
    n_neighs: int (default=4)
        Not used by this function.

    Returns
    -------
    sparse_dm: np.array, shape (n_atoms, n_atoms)
    """
    upper_rows, upper_cols = np.triu_indices(n_atoms, k=1)

    sparse_dm = np.zeros((n_atoms, n_atoms))
    # One bucket of collected distances per unordered atom pair.
    samples = {(r, c): [] for r, c in zip(upper_rows, upper_cols)}
    for env_dists, env_atoms in zip(local_distances, local_idxs):
        first, second = np.triu_indices(len(env_atoms), k=1)
        for k, (a, b) in enumerate(zip(first, second)):
            atom_a = env_atoms[a]
            atom_b = env_atoms[b]
            value = env_dists[k]
            # Buckets are keyed with the smaller index first.
            key = (atom_a, atom_b) if atom_a <= atom_b else (atom_b, atom_a)
            samples[key].append(value)

    for (atom_a, atom_b), values in samples.items():
        if values:
            average = np.mean(values)
            sparse_dm[atom_a, atom_b] = average
            sparse_dm[atom_b, atom_a] = average

    return sparse_dm

def local_env_lookup(uq_ha_env, uq_hy_env, mol):
    """
    Build a heavy-atom distance matrix for one molecule from averaged
    environment distances.

    Parameters
    ----------
    uq_ha_env: dict
        Unique heavy-atom environments; each value holds 'avg_dist'.
    uq_hy_env: dict
        Unique hydrogen environments; each value holds 'avg_dist'.
    mol: dict
        Molecule entry with the keys 'ha_lr', 'hy_lr', 'ha_lidxs',
        'hy_lidxs', 'nuclear_charges' and 'heavy_atom_idxs'.

    Returns
    -------
    sparse_dm: np.array, shape (n_heavy_atoms, n_heavy_atoms)
    heavy_nuclear_charges: np.array
        ``mol['nuclear_charges']`` restricted to the heavy atoms.
    """
    _ha_lr, _hy_lr, heavy_atom_idxs = mol['ha_lr'], mol['hy_lr'], mol['heavy_atom_idxs']
    n_total_atoms, n_heavy_atoms = len(mol['nuclear_charges']), len(heavy_atom_idxs)
    _ha_lidxs, _hy_lidxs = mol['ha_lidxs'], mol['hy_lidxs']

    # Iterate over this molecule's own environments, not over notebook globals.
    ha_dists = [uq_ha_env[tuple(env)]['avg_dist'] for env in _ha_lr]
    hy_dists = [uq_hy_env[tuple(env)]['avg_dist'] for env in _hy_lr]

    sparse_dm = construct_sparse_dm(ha_dists, _ha_lidxs, n_atoms=n_heavy_atoms, n_neighs=4)
    # NOTE(review): the hydrogen matrix is built but not returned or merged yet.
    hy_sparse_dm = construct_sparse_dm(np.array(hy_dists), _hy_lidxs, n_atoms=n_total_atoms, n_neighs=5)
    return sparse_dm, mol['nuclear_charges'][heavy_atom_idxs]

In [106]:
# Scratch copy of the heavy-atom half of local_env_lookup for a single molecule.
# NOTE(review): `j` is not assigned in this cell; it comes from earlier kernel state.
mol = mol_envs[j]
# NOTE(review): rebinds the notebook-level `ha_lr` / `hy_lr` to this molecule's entries.
ha_lr, hy_lr, heavy_atom_idxs = mol['ha_lr'],  mol['hy_lr'], mol['heavy_atom_idxs']
n_total_atoms, n_heavy_atoms = len(mol['nuclear_charges']), len(heavy_atom_idxs)
_ha_lidxs, _hy_lidxs =  mol['ha_lidxs'],  mol['hy_lidxs']
ha_dists = []
hy_dists = []
# Look up the averaged distances of every heavy-atom environment of this molecule.
for i in range(len(ha_lr)):
    ha_dists.append(uq_ha_env[tuple(ha_lr[i])]['avg_dist'])

In [111]:
# NOTE(review): `local_idxs` is only assigned in a cell that appears later in this file.
local_idxs[0].shape

(7,)

In [113]:
# Scratch variant of construct_sparse_dm that only pairs the first atom of each
# environment with each of the remaining atoms in that environment.
local_distances = ha_dists
local_idxs = _ha_lidxs
n_atoms = 9
_r, _c = np.triu_indices(n_atoms, k=1)
sparse_dm = np.zeros((n_atoms, n_atoms))
# One bucket of collected distances per upper-triangle atom pair.
d = {(r, c): [] for r, c in zip(_r, _c)}
for ld, li in zip(local_distances, local_idxs):
#     row, col = np.triu_indices(len(li), k=1)
    for i,e in enumerate(li[1:]):
        atom_i = li[0]#li[row[i]]
        atom_j = e#li[col[i]]
        dist = ld[i]
        # Buckets are keyed with the smaller index first.
        pair = (atom_i, atom_j) if atom_i <= atom_j else (atom_j, atom_i)
        d[pair].append(dist)
for p in d.keys():
    if d[p]:
        atom_i, atom_j = p
        # Spread of the values collected for this pair.
        print(np.std(d[p]))
        sparse_dm[atom_i, atom_j] = np.mean(d[p])
        sparse_dm[atom_j, atom_i] = np.mean(d[p])

0.007731503043661547
0.01563252176834995
0.014277323987509138
0.0011549280327013411
0.0043724526611014225
0.013771536236221849
0.0004843871246740239
0.0007383016391177
0.07058588243990482
0.073611130094968
0.06609291786502847
0.0017459911328283617
0.0007533705363418353
0.001951029757740974
0.07422030732371776
0.00015132876313728172
0.0007636439915066351
0.0001594784745262423
0.0
0.03205071898239642
0.020225144038240606
4.110669901036257e-05


In [125]:
# For every pair whose upper-triangle entry is 0, write 20. into the upper triangle
# and 1. into the lower triangle.
# NOTE(review): presumably upper/lower distance bounds for triangle smoothing — confirm.
row, col = np.triu_indices(9, k=1)
for r,c in zip(row, col):
    if sparse_dm[r,c] == 0.0:
        sparse_dm[r,c] = 20.
        sparse_dm[c,r] = 1.

In [54]:
# Lower-triangle indices (diagonal included) of a 9x9 matrix.
row, col = np.tril_indices(9, k=0)


In [114]:
# Display the current sparse distance matrix.
sparse_dm

array([[0.        , 1.50037144, 2.48781483, 2.44912334, 1.50934705,
        1.50612952, 1.41130392, 0.        , 0.        ],
       [1.50037144, 0.        , 1.36027527, 1.36557383, 2.57810427,
        2.57507903, 2.4475576 , 2.1940352 , 2.2387659 ],
       [2.48781483, 1.36027527, 0.        , 2.23104404, 0.        ,
        0.        , 0.        , 2.30922595, 1.43453469],
       [2.44912334, 1.36557383, 2.23104404, 0.        , 0.        ,
        0.        , 0.        , 1.36047843, 2.23276149],
       [1.50934705, 2.57810427, 0.        , 0.        , 0.        ,
        2.63257257, 2.4980085 , 0.        , 0.        ],
       [1.50612952, 2.57507903, 0.        , 0.        , 2.63257257,
        0.        , 2.48618292, 0.        , 0.        ],
       [1.41130392, 2.4475576 , 0.        , 0.        , 2.4980085 ,
        2.48618292, 0.        , 0.        , 0.        ],
       [0.        , 2.1940352 , 2.30922595, 1.36047843, 0.        ,
        0.        , 0.        , 0.        , 1.3564089 ],


In [115]:
# NOTE(review): `test_dm` is not assigned in any visible cell.
test_dm

array([[0.        , 1.52689169, 2.5336469 , 2.44934196, 1.53965439,
        1.53969027, 1.42797468, 3.71988032, 3.7660976 ],
       [1.52689169, 0.        , 1.3601202 , 1.36379872, 2.39198217,
        2.39198217, 2.41954244, 2.19298863, 2.23920591],
       [2.5336469 , 1.3601202 , 0.        , 2.24483405, 3.75210238,
        3.75210238, 3.77966264, 2.23920591, 1.43695095],
       [2.44934196, 1.36379872, 2.24483405, 0.        , 3.7557809 ,
        3.7557809 , 3.78334117, 1.36303982, 2.23438014],
       [1.53965439, 2.39198217, 1.03186197, 1.02818345, 0.        ,
        3.07934466, 2.42162928, 4.5849708 , 4.63118808],
       [1.53969027, 2.39198217, 1.03186197, 1.02818345, 1.        ,
        0.        , 2.41328194, 4.5849708 , 4.63118808],
       [1.42797468, 2.41954244, 1.10567223, 1.05574372, 2.42162928,
        2.41328194, 0.        , 4.61253107, 4.65874835],
       [1.08630214, 2.19298863, 2.23920591, 1.36303982, 1.        ,
        1.        , 1.        , 0.        , 1.35912263],


In [16]:
# Build the heavy-atom distance matrix and heavy-atom charges for every molecule.
sparse_distances = []
mol_nuclear_charges = []
for j in tqdm(range(len(mol_envs))):
    sp, mnc = local_env_lookup(uq_ha_env, uq_hy_env, mol_envs[j])
    sparse_distances.append(sp)
    mol_nuclear_charges.append(mnc)

  0%|          | 0/6095 [00:00<?, ?it/s]


RuntimeError: No active exception to reraise

In [17]:
%debug

> [0;32m<ipython-input-15-591d654d234e>[0m(32)[0;36mlocal_env_lookup[0;34m()[0m
[0;32m     30 [0;31m    [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mhy_lr[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m        [0mhy_dists[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0muq_hy_env[0m[0;34m[[0m[0mtuple[0m[0;34m([0m[0m_hy_lr[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m)[0m[0;34m][0m[0;34m[[0m[0;34m'avg_dist'[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 32 [0;31m    [0;32mraise[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     33 [0;31m[0;34m[0m[0m
[0m[0;32m     34 [0;31m    [0msparse_dm[0m [0;34m=[0m [0mconstruct_sparse_dm[0m[0;34m([0m[0mha_dists[0m[0;34m,[0m [0m_ha_lidxs[0m[0;34m,[0m [0mn_atoms[0m[0;34m=[0m[0;36m9[0m[0;34m,[0m [0mn_neighs[0m[0;34m=[0m[0;36m4[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  ha_dists


[array([1.50810294, 1.427608  , 1.53775736, 1.53445536, 2.62658106,
       2.4587894 , 2.38146468, 2.50751839, 2.5014679 , 1.36075965,
       1.36483552, 2.43775567, 2.35871884, 2.90802274, 3.63794665,
       2.52940696, 3.72368396, 2.9030692 , 3.54678494, 3.13493141,
       2.23100009]), array([1.50810294, 1.36075965, 1.36483552, 2.38146468, 2.50751839,
       2.5014679 , 2.19578119, 2.23951927, 2.62658106, 2.4587894 ,
       1.427608  , 1.53775736, 1.53445536, 3.61208007, 3.73503437,
       2.23100009, 2.90802274, 3.72368396, 3.54678494, 2.23114235,
       1.43508055, 3.63794665, 2.9030692 , 3.13493141, 1.36277714,
       2.23252109, 2.43775567, 2.35871884, 4.56011517, 4.2840969 ,
       2.52940696, 4.17038568, 4.5965294 , 4.29224973, 4.53259135,
       1.35629504]), array([1.36673634, 1.36149884, 2.20057792, 2.2447165 , 1.48937914,
       2.22973354, 1.36325034, 2.23395614, 2.43349695, 2.23249647,
       1.43682558, 2.62045787, 1.35855263, 3.59203511, 3.72392626]), array([1.36673634

ipdb>  ha_dists[0]


array([1.50810294, 1.427608  , 1.53775736, 1.53445536, 2.62658106,
       2.4587894 , 2.38146468, 2.50751839, 2.5014679 , 1.36075965,
       1.36483552, 2.43775567, 2.35871884, 2.90802274, 3.63794665,
       2.52940696, 3.72368396, 2.9030692 , 3.54678494, 3.13493141,
       2.23100009])


ipdb>  len(ha_dists[0])


21


ipdb>  ha_dists[1]


array([1.50810294, 1.36075965, 1.36483552, 2.38146468, 2.50751839,
       2.5014679 , 2.19578119, 2.23951927, 2.62658106, 2.4587894 ,
       1.427608  , 1.53775736, 1.53445536, 3.61208007, 3.73503437,
       2.23100009, 2.90802274, 3.72368396, 3.54678494, 2.23114235,
       1.43508055, 3.63794665, 2.9030692 , 3.13493141, 1.36277714,
       2.23252109, 2.43775567, 2.35871884, 4.56011517, 4.2840969 ,
       2.52940696, 4.17038568, 4.5965294 , 4.29224973, 4.53259135,
       1.35629504])


ipdb>  len(ha_dists[1])


36


ipdb>  exit


In [118]:
import os
import subprocess

import numpy as np
from tqdm import tqdm

from g2s.utils import vector_to_square


class DGSOL:
    """
    Wrapper class for the Distance Geometry Solver (DGSOL)

    Embeds points in cartesian space given a distance boundary.

    To read more about DGSOL visit: https://www.mcs.anl.gov/~more/dgsol/
    """
    def __init__(self, distances, nuclear_charges, vectorized_input=True):
        """

        Parameters
        ----------
        distances: np.array
            Batch of distance matrices, one per molecule, either as symmetric
            (n, n) matrices or in vectorized form.
        nuclear_charges: np.array
            Nuclear charges of each molecule in the batch.
        vectorized_input: bool (default=True)
            Whether the distance matrices are in vectorized form.
            If True, they are converted to their symmetric form.
        """
        self.nuclear_charges = nuclear_charges
        self.distances = vector_to_square(distances) if vectorized_input else distances
        # Both are filled by solve_distance_geometry.
        self.coords = None
        self.c_errors = None

    def gen_cerror_overview(self):
        """
        Prints overview of DGSOL reconstruction errors
        (min, mean and max over all molecules for each error column).
        """
        print('Error Type, Min, Mean, Max')
        print(f'minError: {np.min(self.c_errors[:, 1])}, {np.mean(self.c_errors[:, 1])}, {np.max(self.c_errors[:, 1])}')
        print(f'avgError: {np.min(self.c_errors[:, 2])}, {np.mean(self.c_errors[:, 2])}, {np.max(self.c_errors[:, 2])}')
        print(f'maxError: {np.min(self.c_errors[:, 3])}, {np.mean(self.c_errors[:, 3])}, {np.max(self.c_errors[:, 3])}')

    def to_scientific_notation(self, number):
        """
        Converts numbers to DGSOL notation.

        Parameters
        ----------
        number: float
            Value to format; only its magnitude is written.

        Returns
        -------
        Number in DGSOL notation with 12 significant digits and no leading
        zero, e.g. ``.139169904722E+01``.
        """
        # 'E' formatting rounds to 12 significant digits and carries into the
        # exponent itself, so 9.9999999999999 becomes 1.00000000000E+01.
        mantissa, exponent = '{:.11E}'.format(abs(number)).split('E')
        digits = mantissa.replace('.', '')
        # Moving the decimal point in front of the first digit raises the exponent by one.
        return '.{}E{:+03d}'.format(digits, int(exponent) + 1)

    def write_dgsol_input(self, distances, outpath, missing_lower=1., missing_upper=20.):
        """
        Input file writer for DGSOL.
        Basically writes 4 columns such as
        Atom_i   Atom_j  lower_bound       upper_bound
        1         2   .139169904722E+01   .139169904722E+01
        1         3   .237179033727E+01   .237179033727E+01
        1         4   .331764447534E+01   .331764447534E+01
        1         5   .200997900174E+01   .200997900174E+01

        Parameters
        ----------
        distances: np.array, shape (n, n)
            Square distance matrix; only the upper triangle is read.
            Entries equal to 0.0 are treated as unknown.
        outpath: str
            Directory to save input file
        missing_lower: float (default=1.)
            Lower bound written for unknown distances.
        missing_upper: float (default=20.)
            Upper bound written for unknown distances.

        """
        rows, cols = np.triu_indices(distances.shape[1], k=1)
        with open(f'{outpath}/dgsol.input', 'w') as outfile:
            for i, j in zip(rows, cols):
                value = distances[i, j]
                if value == 0.0:
                    lower, upper = missing_lower, missing_upper
                else:
                    # Known distances are pinned: lower and upper bound coincide.
                    lower = upper = value
                outfile.write(
                    f'{i + 1:9.0f}{j + 1:10.0f}   {self.to_scientific_notation(lower)}   '
                    f'{self.to_scientific_notation(upper)}\n')

    def parse_dgsol_coords(self, path, n_solutions, n_atoms):
        """
        Parser for DGSOL output file.
        Reads all found solutions and filters coordinates.

        Parameters
        ----------
        path: str
            Directory containing dgsol.output.
        n_solutions: int
            Number of dgsol solutions.
        n_atoms: int
            Number of atoms.

        Returns
        -------
        coords: np.array, shape (n_solutions, n_atoms, 3)
            Coordinates of every solution.

        """
        with open(f'{path}/dgsol.output') as outfile:
            lines = outfile.readlines()

        # Lines longer than 30 characters are taken to be coordinate lines.
        coords = [[float(n) for n in line.split()]
                  for line in lines
                  if not line.startswith('\n') and len(line) > 30]
        return np.array(coords).reshape((n_solutions, n_atoms, 3))

    def solve_distance_geometry(self, outpath, n_solutions=10):
        """
        Interface to solve distance geometry problem.
        Writes input for DGSOL, runs DGSOL and parses coordinates.

        Results are stored in ``self.coords`` (solutions of each molecule,
        ordered by ascending error column 2) and ``self.c_errors`` (error row
        of the best solution per molecule).

        Parameters
        ----------
        outpath: str
            Output directory to write input files and run DGSOL.
        n_solutions: int (default=10)
            Number of solutions to compute with DGSOL.

        """
        construction_errors = []
        mol_coordinates = []
        mol_ids = np.arange(self.distances.shape[0])
        for i, ids in tqdm(enumerate(mol_ids), total=len(mol_ids)):
            # One zero-padded sub-directory per molecule.
            out = f'{outpath}/{ids:04}'
            os.makedirs(out, exist_ok=True)
            self.write_dgsol_input(distances=self.distances[i], outpath=out)
            self.run_dgsol(out, n_solutions=n_solutions)
            errors = self.parse_dgsol_errors(out)
            lowest_errors_idx = np.argsort(errors[:, 2])
            construction_errors.append(errors[lowest_errors_idx[0]])
            coords = self.parse_dgsol_coords(out, n_solutions, n_atoms=len(self.nuclear_charges[i]))
            mol_coordinates.append(coords[lowest_errors_idx])
        self.coords = np.array(mol_coordinates, dtype=object)
        self.c_errors = np.array(construction_errors)

    def run_dgsol(self, outpath, n_solutions=10):
        """
        Interface to submit DGSOL as a subprocess.

        Parameters
        ----------
        outpath: str
            Output directory to write input files and run DGSOL.
        n_solutions: int (default=10)
            Number of solutions to compute with DGSOL.

        Raises
        ------
        UserWarning
            If dgsol exits with a non-zero return code.

        """
        # Pass the arguments as a list so paths containing spaces stay intact.
        cmd = ['dgsol', f'-s{n_solutions}', f'{outpath}/dgsol.input',
               f'{outpath}/dgsol.output', f'{outpath}/dgsol.summary']
        process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if process.returncode != 0:
            error = process.stderr.decode(errors='replace').strip()
            raise UserWarning(f'{outpath} produced the following error: {error}')

    def parse_dgsol_errors(self, outpath):
        """
        Parses DGSOL Errors.

        There are 4 types of errors in the dgsol output:

        f_err         The value of the merit function
        derr_min      The smallest error in the distances
        derr_avg      The average error in the distances
        derr_max      The largest error in the distances

        Parameters
        ----------
        outpath: str
            Output directory that contains dgsol.summary

        Returns
        -------
        dgsol_errors: np.array
            One row per line of the summary after the header, holding the
            4 error values.

        """
        with open(f'{outpath}/dgsol.summary', 'r') as summary:
            lines = summary.readlines()

        # Skip the 5 header lines; the first two entries of each row are
        # n_atoms and n_distances.
        errors = [line.split()[2:] for line in lines[5:]]
        return np.array(errors).astype('float32')


In [126]:
# NOTE(review): np.array(...) copies by default, so if DoTriangleSmoothing works in
# place the result is applied to a temporary and `sparse_dm` stays unchanged — confirm.
# NOTE(review): `DistanceGeometry` is only imported in a cell that appears later in this file.
DistanceGeometry.DoTriangleSmoothing(np.array(sparse_dm))

True

In [127]:
# Display the sparse distance matrix after the smoothing call.
sparse_dm

array([[ 0.        ,  1.50037144,  2.48781483,  2.44912334,  1.50934705,
         1.50612952,  1.41130392, 20.        , 20.        ],
       [ 1.50037144,  0.        ,  1.36027527,  1.36557383,  2.57810427,
         2.57507903,  2.4475576 ,  2.1940352 ,  2.2387659 ],
       [ 2.48781483,  1.36027527,  0.        ,  2.23104404, 20.        ,
        20.        , 20.        ,  2.30922595,  1.43453469],
       [ 2.44912334,  1.36557383,  2.23104404,  0.        , 20.        ,
        20.        , 20.        ,  1.36047843,  2.23276149],
       [ 1.50934705,  2.57810427,  1.        ,  1.        ,  0.        ,
         2.63257257,  2.4980085 , 20.        , 20.        ],
       [ 1.50612952,  2.57507903,  1.        ,  1.        ,  2.63257257,
         0.        ,  2.48618292, 20.        , 20.        ],
       [ 1.41130392,  2.4475576 ,  1.        ,  1.        ,  2.4980085 ,
         2.48618292,  0.        , 20.        , 20.        ],
       [ 1.        ,  2.1940352 ,  2.30922595,  1.36047843,  1

In [128]:
# Wrap `sparse_dm` and the heavy-atom charges of molecule 0 as a batch of one
# and run DGSOL with 20 solutions.
dgsol = DGSOL(np.array([sparse_dm]),   np.array([mol_envs[0]['nuclear_charges'][mol_envs[0]['heavy_atom_idxs']]]), vectorized_input=False)
dgsol.solve_distance_geometry('./test_const_iso_heavy', n_solutions=20)



  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [00:00<00:00,  8.24it/s][A[A


In [150]:
# Indices of molecules whose first error column (f_err) exceeds 1.0.
np.where(dgsol.c_errors[:, 0] > 1.0)[0]

array([ 308,  312,  315,  317,  332,  407,  408,  410,  411,  412,  417,
        421,  529,  530,  531,  637,  638,  639,  640, 2471, 3173, 3445,
       3505, 3506, 3890, 4208, 4323, 4325, 4337, 4519])

In [24]:
# All per-molecule DGSOL output directories, sorted by name.
outdir = sorted(glob('./test_const_iso_heavy/*'))


In [129]:
# Only the first output directory is processed here.
outdir = sorted(glob('./test_const_iso_heavy/*'))[:1]
n_confs = 10
# Write the first n_confs solutions of each directory as separate xyz files.
for i, e in enumerate(outdir):
    for j in range(n_confs):
        g2s.utils.write_xyz(f'{e}/dgsol_conf_{j}.xyz', dgsol.coords[i][j], dgsol.nuclear_charges[i])

In [26]:
import py3Dmol
# NOTE(review): reads `dgsol.xyz`, while the writer cell in this file produces
# `dgsol_conf_{j}.xyz` — confirm the file name.
with open(f'{outdir[0]}/dgsol.xyz', 'r') as infile:
    xyz_pred = infile.readlines()
xyz_pred = ''.join(xyz_pred)
# Reference structure of the first molecule (loaded, but its model is not added below).
with open(f'{ci_path[0]}', 'r') as infile:
    xyz_ref = infile.readlines()
    xyz_ref = ''.join(xyz_ref)
# Render the predicted structure as sticks.
xyzview = py3Dmol.view(width=400,height=400)
xyzview.addModel(xyz_pred,'xyz')
#xyzview.addModel(xyz_ref,'xyz')
xyzview.setStyle({'stick':{'colorscheme':'cyanCarbon'}})
xyzview.zoomTo()
xyzview.show()

In [145]:
# Path of the xyz file at position 6085.
ci_path[6085]

'/Users/c0uch1/work/g2g_data/qm9/H10C7O2/dsgdb9nsd_6085.xyz'

In [65]:
from rdkit import DistanceGeometry
# Triangle smoothing applied directly to `test_dm`.
# NOTE(review): `test_dm` is not assigned in any visible cell.
DistanceGeometry.DoTriangleSmoothing(test_dm)

True

In [68]:
# Element-wise difference between `test_dm` and `sparse_dm`.
test_dm-sparse_dm

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        , -16.28011968,
        -16.2339024 ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
        -16.24789762, -16.24789762, -16.22033736,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
        -16.2442191 , -16.2442191 , -16.21665883,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.03186197,   0.02818345,
          0.        , -16.92065534,   0.        , -15.4150292 ,
        -15.36881192],
       [  0.        ,   0.        ,   0.03186197,   0.02818345,
          0.        ,   0.        ,   0.        , -15.4150292 ,
        -15.36881192],
       [  0.        ,   0.        ,   0.10567223,   0.05574372,
          0.        ,   0.    

In [184]:
# Copy the matrix of molecule 612, overwrite the (0, 1) pair with 2 and run
# triangle smoothing on the copy.
test = np.array(sparse_distances)[612].copy()
test[0][1] = 2
test[1][0] = 2

DistanceGeometry.DoTriangleSmoothing(test)

True

In [185]:
# Difference between the original matrix of molecule 612 and the modified copy.
np.array(sparse_distances)[612]-test

array([[0.        , 0.73482977, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.73482977, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],


In [175]:
# NOTE(review): subtracts an entry from itself, so the result is always 0.0.
test[0][1] - test[0][1]

0.0

In [70]:
import os
import subprocess

import numpy as np
from tqdm import tqdm

from g2s.utils import vector_to_square


class DGSOL:
    """
    Wrapper class for the Distance Geometry Solver (DGSOL)

    Embeds points in cartesian space given a distance boundary.

    This variant only writes a comma-separated bounds file; it defines no
    methods for running DGSOL or parsing its output.

    To read more about DGSOL visit: https://www.mcs.anl.gov/~more/dgsol/
    """
    def __init__(self, distances, nuclear_charges, vectorized_input=True):
        """

        Parameters
        ----------
        distances: np.array
            Either a symmetric (n, n) distance matrix or its vectorized form.
        nuclear_charges: np.array, shape n
            Nuclear charges of the system
        vectorized_input: bool (default=True)
            Whether the distance matrix is in its vectorized form or not.
            If True, converts distance matrix to its symmetric form.
        """
        self.nuclear_charges = nuclear_charges
        self.distances = vector_to_square(distances) if vectorized_input else distances
        # Nothing in this class assigns these after construction.
        self.coords = None
        self.c_errors = None

    def gen_cerror_overview(self):
        """
        Prints overview of DGSOL reconstruction errors
        (min, mean and max for each error column of ``self.c_errors``).
        """
        print('Error Type, Min, Mean, Max')
        print(f'minError: {np.min(self.c_errors[:, 1])}, {np.mean(self.c_errors[:, 1])}, {np.max(self.c_errors[:, 1])}')
        print(f'avgError: {np.min(self.c_errors[:, 2])}, {np.mean(self.c_errors[:, 2])}, {np.max(self.c_errors[:, 2])}')
        print(f'maxError: {np.min(self.c_errors[:, 3])}, {np.mean(self.c_errors[:, 3])}, {np.max(self.c_errors[:, 3])}')

    def to_scientific_notation(self, number):
        """
        Converts numbers to DGSOL notation.

        Parameters
        ----------
        number: float
            Value to format; only its magnitude is written.

        Returns
        -------
        Number in DGSOL notation with 12 significant digits and no leading
        zero, e.g. ``.139169904722E+01``.
        """
        # 'E' formatting rounds to 12 significant digits and carries into the
        # exponent itself, so 9.9999999999999 becomes 1.00000000000E+01.
        mantissa, exponent = '{:.11E}'.format(abs(number)).split('E')
        digits = mantissa.replace('.', '')
        # Moving the decimal point in front of the first digit raises the exponent by one.
        return '.{}E{:+03d}'.format(digits, int(exponent) + 1)

    def write_dgsol_input(self, distances, outpath, missing_lower=1., missing_upper=20.):
        """
        Writes the distance bounds as comma-separated lines
        ``atom_i,atom_j,lower_bound,upper_bound`` (1-based atom numbers)
        to ``<outpath>/dgsol.input``.

        Parameters
        ----------
        distances: np.array, shape (n, n)
            Square distance matrix; only the upper triangle is read.
            Entries equal to 0.0 are treated as unknown.
        outpath: str
            Directory to save input file
        missing_lower: float (default=1.)
            Lower bound written for unknown distances.
        missing_upper: float (default=20.)
            Upper bound written for unknown distances.

        """
        rows, cols = np.triu_indices(distances.shape[1], k=1)
        with open(f'{outpath}/dgsol.input', 'w') as outfile:
            for i, j in zip(rows, cols):
                value = distances[i, j]
                if value == 0.0:
                    lower, upper = missing_lower, missing_upper
                else:
                    # Known distances are pinned: lower and upper bound coincide.
                    lower = upper = value
                outfile.write(
                    f'{i + 1},{j + 1},{lower},'
                    f'{upper}\n')

In [71]:
# Wrap all heavy-atom distance matrices and charges; the matrices are already square.
dgsol = DGSOL(np.array(sparse_distances),   np.array(mol_nuclear_charges), vectorized_input=False)

In [80]:
# Write the bounds file for `sparse_dm` into the directory of molecule 0000.
dgsol.write_dgsol_input(np.array(sparse_dm), 'test_const_iso_heavy/0000')

In [74]:
# Row 8 of `test_dm`.  NOTE(review): `test_dm` is not assigned in any visible cell.
test_dm[8]

array([1.09669596, 2.23920591, 1.43695095, 2.23438014, 1.        ,
       1.        , 1.        , 1.35912263, 0.        ])

In [193]:
# Heavy-atom distance matrix of the first molecule.
np.array(sparse_distances)[0]

array([[0.        , 1.51948622, 2.62351947, 2.44614317, 1.54084022,
        1.53696883, 1.42755951, 3.59871676, 3.72762897],
       [1.51948622, 0.        , 1.36055531, 1.36488488, 2.46402064,
        2.4745706 , 2.41313606, 2.19658883, 2.24011376],
       [2.62351947, 1.36055531, 0.        , 2.23161247, 3.72368396,
        3.54678494, 2.90802274, 2.23355036, 1.43647815],
       [2.44614317, 1.36488488, 2.23161247, 0.        , 2.9030692 ,
        3.13493141, 3.63794665, 1.36308868, 2.23372819],
       [1.54084022, 2.46402064, 3.72368396, 2.9030692 , 0.        ,
        2.49871639, 2.43841315, 4.17038568, 4.5965294 ],
       [1.53696883, 2.4745706 , 3.54678494, 3.13493141, 2.49871639,
        0.        , 2.40955912, 4.29224973, 4.53259135],
       [1.42755951, 2.41313606, 2.90802274, 3.63794665, 2.43841315,
        2.40955912, 0.        , 4.56011517, 4.2840969 ],
       [3.59871676, 2.19658883, 2.23355036, 1.36308868, 4.17038568,
        4.29224973, 4.56011517, 0.        , 1.35844303],


In [79]:
# Display the current sparse distance matrix.
sparse_dm

array([[0.        , 1.52689169, 2.5336469 , 2.44934196, 1.53965439,
        1.53969027, 1.42797468, 0.        , 0.        ],
       [1.52689169, 0.        , 1.3601202 , 1.36379872, 2.39198217,
        2.39198217, 2.41954244, 2.19298863, 2.23920591],
       [2.5336469 , 1.3601202 , 0.        , 2.24483405, 0.        ,
        0.        , 0.        , 2.23920591, 1.43695095],
       [2.44934196, 1.36379872, 2.24483405, 0.        , 0.        ,
        0.        , 0.        , 1.36303982, 2.23438014],
       [1.53965439, 2.39198217, 0.        , 0.        , 0.        ,
        0.        , 2.42162928, 0.        , 0.        ],
       [1.53969027, 2.39198217, 0.        , 0.        , 0.        ,
        0.        , 2.41328194, 0.        , 0.        ],
       [1.42797468, 2.41954244, 0.        , 0.        , 2.42162928,
        2.41328194, 0.        , 0.        , 0.        ],
       [0.        , 2.19298863, 2.23920591, 1.36303982, 0.        ,
        0.        , 0.        , 0.        , 1.35912263],
