# Perform ISDF with Centroids

* Add support for more molecules
* Leverage the helper module so that the notebook is more compact
* Use SKLearn
* Discretise output of SKLearn
* Check for duplicate points, which can occur if too many centroids are requested
* Convert composite grid index into `(ix, iy, iz)` for use in Fortran
* Quantify errors in a numerically sensible way

In [45]:
%load_ext autoreload
%autoreload 2
""" Execution times with minimal def2-SVP basis:

benzene: 6.96 s
ferrocene: 56.6 s
dimethyl_sulfoxide: 5.78 s
1H-Pyrazole: 4.92 s

"""
import os
from pathlib import Path
import numpy as np

import py3Dmol
from pyscf import dft

from isdf_prototypes.helpers import (smiles_to_rdkitmol, pyscf_molecule, pyscf_density, pyscf_grid, 
                                     add_grid_to_view, discretise_continuous_values, pyscf_molecular_orbitals,
                                     duplicate_indices, compute_approximate_product_matrix, mean_squared_error_regular_grid)
from isdf_prototypes.visualise import visualise_mo
from isdf_prototypes.gridding import expand_index_to_three_indices, indices_to_composite


# Define project root: two directory levels above the current working directory
project_root = Path(os.getcwd()).parent.parent

# SMILES strings keyed by a short molecule name.
# Fixes relative to the earlier table:
#   * 'dmso' was 'CC(=O)SC', which is S-methyl thioacetate; DMSO has both
#     methyl groups bonded to a sulfinyl sulfur.
#   * 'ferrocene' was an iron atom plus two benzene rings; ferrocene is Fe(II)
#     with two cyclopentadienide rings.
smiles_strs = {'benzene': 'c1ccccc1',
               'dmso':    'CS(=O)C',
               'pyrazole': 'c1cc[nH]n1',
               'ferrocene': '[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]'
               }


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
""" Restricted DFT with minimal basis
"""
# Build the benzene molecule from its XYZ file in the def2-SVP basis, then run a
# restricted Kohn-Sham SCF. No functional is set here, so the dft.RKS default applies.
basis_benzene = 'def2-SVP'
xyz_benzene = project_root / 'inputs/benzene.xyz'
mol_benzene = pyscf_molecule(xyz_benzene, basis_benzene)
mf_benzene = dft.RKS(mol_benzene)
# Kept as the final expression so the notebook echoes the value returned by kernel()
mf_benzene.kernel()

converged SCF energy = -229.930646528208


-229.930646528208

In [38]:
""" Define grid, compute density and compute wave functions
"""
from string import Template

# Number of grid points along each axis, and the total number of grid points
points = [10, 10, 10]
n_total = np.prod(points)

# Benzene density (also written to a cube file) and the matching real-space grid
cube_file = '../../outputs/benzene_density.cube'
rdmol_benzene = smiles_to_rdkitmol(smiles_strs['benzene'])
rho_benzene = pyscf_density(mf_benzene, points, cube_file)
grid_benzene = pyscf_grid(mol_benzene, points)

# Render the density cube together with the molecule, then overlay the grid points
cube_data = Path(cube_file).read_text()
grid_style = {'radius': 0.1, 'color': 'blue', 'alpha': 0.6}
view_benzene = py3Dmol.view(width=400, height=400)
view_benzene = visualise_mo(cube_data, view_benzene, isoval=0.0001, mol=rdmol_benzene)
view_benzene = add_grid_to_view(view_benzene, grid_benzene, **grid_style)
view_benzene.show()

# Molecular orbitals evaluated on the grid; '$i' in the template is the
# placeholder for the per-orbital cube file name
cube_template = Template('../../outputs/benzene_$i.cube')
wfs = pyscf_molecular_orbitals(mf_benzene, points, cube_template)

# Hard-coded for benzene (presumably the number of occupied orbitals — TODO confirm).
# The assert checks wfs has one row per grid point and one column per orbital.
n_occ = 21
expected_wfs_shape = (n_total, n_occ)
assert wfs.shape == expected_wfs_shape



In [39]:
""" L.H.S. Full product basis matrix
"""
from isdf_prototypes.math_ops import face_splitting_product

# Exact pair-product matrix built from the wave functions.
# Expected layout: one row per grid point, one column per orbital pair.
z_exact = face_splitting_product(wfs)
expected_z_shape = (n_total, n_occ * n_occ)
assert z_exact.shape == expected_z_shape


In [49]:
from isdf_prototypes.helpers import find_interpolation_points_via_kmeans

# Options forwarded as keyword arguments to find_interpolation_points_via_kmeans
# (the names look like scikit-learn KMeans settings — confirm against the helper)
kmeans_opts = {'init': 'k-means++',
                'max_iter': 300,
                'tol': 0.0001,
                'verbose': 0,
                'copy_x': True,
                'random_state': None,
                'algorithm': 'lloyd'}

n_total = np.prod(points)
# Candidate centroid counts; only nc = 30 is exercised by the loop below
n_clusters = [30, 50, 70, 90]
n_product = n_occ * n_occ

# Plots of centroid choices
views = []

# Per-product-state errors. rel_product_err is allocated but not filled in this cell
rel_product_err = np.empty(shape=n_product)
mse_product_err = np.empty(shape=n_product)

# Compute difference for a range of cluster sizes
for nc in [30]:
    continuous_clusters = find_interpolation_points_via_kmeans(nc,
                                                               grid_benzene,
                                                               rho_benzene,
                                                               **kmeans_opts)

    # Snap the continuous centroids onto grid points; also returns their grid indices
    discrete_clusters, interpolation_indices = discretise_continuous_values(continuous_clusters, grid_benzene)

    # Check that no centroid values have been discretised to the same point
    dup_indices = duplicate_indices(discrete_clusters)
    if dup_indices:
        # If this occurs, the easiest thing is to discard it
        print("Removing duplicate centroids ", dup_indices)
        # NOTE: discrete_clusters is deliberately left untouched, so the
        # visualisation below still shows every discretised centroid.
        interpolation_indices = np.delete(interpolation_indices, dup_indices)

    # Size the output by the number of interpolation points that remain.
    # Allocating nc rows with np.empty would leave uninitialised rows in the
    # file whenever duplicates were removed above.
    n_interp = len(interpolation_indices)

    # Given a centroid grid index of ir, output ix, iy, iz for use elsewhere
    unrolled_indices = np.empty(shape=(n_interp, 3), dtype=np.int32)
    for i, index in enumerate(interpolation_indices):
        iz, iy, ix = expand_index_to_three_indices(index, points)
        # Can use this to verify icmp returned == index
        # icmp = indices_to_composite([iz, iy, ix], points)
        unrolled_indices[i, :] = [ix, iy, iz]
    # The header reports the number of rows actually written
    np.savetxt(project_root / 'array.txt', unrolled_indices, fmt='%d', header=f'n_centroids: {n_interp}')

    # Visualisation of centroids: discretised in red, continuous in yellow
    view_benzene = add_grid_to_view(view_benzene, discrete_clusters,
                                    **{'radius': 0.2, 'color': 'red', 'alpha': 1.0})
    view_benzene = add_grid_to_view(view_benzene, continuous_clusters,
                                    **{'radius': 0.1, 'color': 'yellow', 'alpha': 1.0})
    views.append(view_benzene)
    # view_benzene.show()

    # Compute approximate ISDF vectors
    z_isdf = compute_approximate_product_matrix(wfs, interpolation_indices)

    # Quantify MSE over the product states
    for ij in range(0, z_isdf.shape[1]):
        mse_product_err[ij] = mean_squared_error_regular_grid(z_exact[:, ij], z_isdf[:, ij])
    # NOTE(review): the label says "relative error" but the values printed are the
    # mean squared errors computed above; the label text is kept unchanged here.
    print("Min, Mean, Max relative error:", np.amin(mse_product_err), np.mean(mse_product_err), np.amax(mse_product_err))

    # Relative errors are ignored as these blow up for some reference values



Min, Mean, Max relative error: 3.274507530002088e-09 4.449008526349109e-07 2.95389289251211e-06


In [44]:
"""Plot pair product states using PYSCF's Cube functionality
"""
from pyscf.tools import cubegen

# Cube grid with the same number of points per axis as the grid used above
nx, ny, nz = points
cube_grid = cubegen.Cube(mol_benzene, nx, ny, nz)

output_root = project_root / 'outputs/isdf_product_expansion/benzene'
# Nothing else in this cell creates the output directory, so make sure it exists
# before writing (no-op when it is already there)
output_root.mkdir(parents=True, exist_ok=True)

# z_isdf is whatever the most recent centroid run left behind
product_matrices = (('z_exact', z_exact),   # Exact product states
                    ('z_isdf', z_isdf))     # Approximate product states

for ij in range(0, n_product):
    for label, z_matrix in product_matrices:
        fname = output_root / f'{label}_{ij}.cube'
        # Each column is reshaped to the (nx, ny, nz) grid before being written
        cube_grid.write(field=z_matrix[:, ij].reshape(nx, ny, nz), fname=fname.as_posix())

In [None]:
""" Other systems to test later
"""
# SMILES strings keyed by a short molecule name.
# Fixes relative to the earlier table:
#   * 'dmso' was 'CC(=O)SC', which is S-methyl thioacetate; DMSO has both
#     methyl groups bonded to a sulfinyl sulfur.
#   * 'ferrocene' was an iron atom plus two benzene rings; ferrocene is Fe(II)
#     with two cyclopentadienide rings.
smiles_strs = {'benzene': 'c1ccccc1',
               'dmso':    'CS(=O)C',
               'pyrazole': 'c1cc[nH]n1',
               'ferrocene': '[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]'
               }

# Restricted Kohn-Sham SCF for dimethyl sulfoxide in the def2-SVP basis
mol_dmso = pyscf_molecule(project_root / 'inputs/dimethyl_sulfoxide.xyz', 'def2-SVP')
mf_dmso = dft.RKS(mol_dmso)
mf_dmso.kernel()

# Restricted Kohn-Sham SCF for 1H-pyrazole in the def2-SVP basis
mol_pyrazole = pyscf_molecule(project_root / 'inputs/1H-Pyrazole.xyz', 'def2-SVP')
mf_pyrazole = dft.RKS(mol_pyrazole)
mf_pyrazole.kernel()