# Perform ISDF with Centroids

* Add support for more molecules
* Leverage the helper module so that the notebook is more compact
* Use SKLearn
* Discretise output of SKLearn
* Check for duplicate points. Could happen if one asks for too many centroids
* Convert composite grid index into `(ix, iy, iz)` for use in Fortran
* Quantify errors in a numerically sensible way

In [4]:
%load_ext autoreload
%autoreload 2
""" Execution times with minimal def2-SVP basis:

benzene: 6.96 s
ferrocene: 56.6 s
dimethyl_sulfoxide: 5.78 s
1H-Pyrazole: 4.92 s

"""
import os
from pathlib import Path
import numpy as np

import py3Dmol
from pyscf import dft

from isdf_prototypes.helpers import (smiles_to_rdkitmol, pyscf_molecule, pyscf_density, pyscf_grid, 
                                     add_grid_to_view, discretise_continuous_values, pyscf_molecular_orbitals,
                                     duplicate_indices, compute_approximate_product_matrix, mean_squared_error_regular_grid)
from isdf_prototypes.visualise import visualise_mo
from isdf_prototypes.gridding import expand_index_to_three_indices, indices_to_composite


# Define project root: two directory levels above the current working directory
project_root = Path.cwd().parent.parent

# SMILES strings, keyed by molecule name (passed to smiles_to_rdkitmol elsewhere
# in this notebook to build the molecule shown in the 3D view).
smiles_strs = {
    'benzene': 'c1ccccc1',
    # Dimethyl sulfoxide: sulfur bonded to two methyl groups and one oxygen.
    # ('CC(=O)SC' would be S-methyl thioacetate, a different molecule.)
    'dmso': 'CS(=O)C',
    'pyrazole': 'c1cc[nH]n1',
    # Ferrocene: Fe(II) with two five-membered cyclopentadienyl anions,
    # not two six-membered benzene rings.
    'ferrocene': '[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]',
}


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
""" Restricted DFT with minimal basis
"""
# Solve SCF for restricted KS-LDA
# Build the molecule from the benzene geometry file under project_root, with the def2-SVP basis
mol_benzene = pyscf_molecule(project_root / 'inputs/benzene.xyz', 'def2-SVP')
# Restricted Kohn-Sham mean-field object; mol_benzene and mf_benzene are reused by later cells
mf_benzene = dft.RKS(mol_benzene)
# Run the SCF. As the last expression of the cell its return value is displayed
# (presumably the converged SCF energy, matching the recorded cell output)
mf_benzene.kernel()    


# from pyscf.tools import cubegen
# cube_grid = cubegen.Cube(mol_benzene, 10, 10, 10)
# print(cube_grid.box)


converged SCF energy = -229.930646528208


-229.930646528208

In [6]:
""" Define grid, compute density and compute wave functions
"""
from string import Template

# Regular grid: number of points per axis and the flattened total
points = [10] * 3
n_total = np.prod(points)

# Number of states expected as columns of the wave-function array (checked at the end)
n_occ = 21

# Density and grid points. The density cube file is read back for visualisation
# below (presumably written by pyscf_density - TODO confirm).
cube_file = '../../outputs/benzene_density.cube'
rdmol_benzene = smiles_to_rdkitmol(smiles_strs['benzene'])
rho_benzene = pyscf_density(mf_benzene, points, cube_file)
grid_benzene = pyscf_grid(mol_benzene, points)

# One view holding the density isosurface, the molecule and the grid points
with open(cube_file) as f:
    cube_data = f.read()
view_benzene = visualise_mo(cube_data,
                            py3Dmol.view(width=400, height=400),
                            isoval=0.0001,
                            mol=rdmol_benzene)
view_benzene = add_grid_to_view(view_benzene, grid_benzene,
                                radius=0.1, color='blue', alpha=0.6)
view_benzene.show()

# Wave functions on the grid, built from the MOs; $i in the template is the
# placeholder for the per-orbital cube file name
cube_template = Template('../../outputs/benzene_$i.cube')
wfs = pyscf_molecular_orbitals(mf_benzene, points, cube_template)

# One row per grid point, one column per state
assert wfs.shape == (n_total, n_occ)



In [7]:
""" L.H.S. Full product basis matrix
"""
from isdf_prototypes.math_ops import face_splitting_product

# Exact product states
# Built from the wave-function array wfs of shape (n_total, n_occ)
z_exact = face_splitting_product(wfs)
# One row per grid point and n_occ**2 columns (presumably one per ordered orbital pair - TODO confirm)
assert z_exact.shape == (n_total, n_occ**2)


In [9]:
from isdf_prototypes.helpers import find_interpolation_points_via_kmeans

# Options forwarded verbatim to find_interpolation_points_via_kmeans
# (presumably scikit-learn KMeans keyword arguments - TODO confirm).
# NOTE(review): random_state is None, so repeated runs may give different centroids.
kmeans_opts = {'init': 'k-means++',
                'max_iter': 300,
                'tol': 0.0001,
                'verbose': 0,
                'copy_x': True,
                'random_state': None,
                'algorithm': 'lloyd'}

n_total = np.prod(points)
n_clusters = [30, 50, 70, 90]
n_product = n_occ * n_occ

# Plots of centroid choices
views = []

# Errors, one entry per product state
rel_product_err = np.empty(shape=n_product)
mse_product_err = np.empty(shape=n_product)

# Compute difference for a range of cluster sizes
for nc in [30]:  #n_clusters:
    continuous_clusters = find_interpolation_points_via_kmeans(nc, 
                                                               grid_benzene, 
                                                               rho_benzene, 
                                                               **kmeans_opts)

    discrete_clusters, interpolation_indices = discretise_continuous_values(continuous_clusters, grid_benzene)
    
    # Check that no centroid values have been discretised to the same point
    dup_indices = duplicate_indices(discrete_clusters)
    if dup_indices:
        # If this occurs, the easiest thing is to discard it
        print("Removing duplicate centroids ", dup_indices)
        # Drop the duplicates from BOTH arrays so that the points written to file
        # stay in one-to-one correspondence with the interpolation indices.
        # axis=0 removes whole rows; without it np.delete flattens the array.
        # NOTE(review): assumes discrete_clusters has one point per row - confirm.
        discrete_clusters = np.delete(discrete_clusters, dup_indices, axis=0)
        interpolation_indices = np.delete(interpolation_indices, dup_indices) 

    # Given a centroid grid index of ir, output ix, iy, iz for use elsewhere
        
    # unrolled_indices = np.empty(shape=(nc, 3), dtype=np.int32)
    # for i, index in enumerate(interpolation_indices):
    #     iz, iy, ix = expand_index_to_three_indices(index, points)
    #     # Can use this to verify icmp returned == index
    #     # icmp = indices_to_composite([iz, iy, ix], points)
    #     unrolled_indices[i, :] = [ix, iy, iz]
    # np.savetxt(project_root / f'centroid_indices_{nc}.txt', unrolled_indices, fmt='%d', header=f'n_centroids: {nc}')
        
    # Rather than the indices, write the points. These can unambiguously be used to identify relevant indices once read in.
    # np.savetxt requires a string header; it holds the number of points actually
    # written, which is smaller than nc when duplicates were removed.
    np.savetxt(project_root / f'centroid_points_{nc}.txt', discrete_clusters,
               comments='', header=str(len(discrete_clusters)))
        
    # Visualisation of centroids: discretised points in red, continuous k-means centroids in yellow
    view_benzene = add_grid_to_view(view_benzene, discrete_clusters, 
                                    **{'radius': 0.2, 'color': 'red', 'alpha': 1.0})
    view_benzene = add_grid_to_view(view_benzene, continuous_clusters, 
                                    **{'radius': 0.1, 'color': 'yellow', 'alpha': 1.0})
    views.append(view_benzene)
    view_benzene.show()
    
    # Compute approximate ISDF vectors
    z_isdf = compute_approximate_product_matrix(wfs, interpolation_indices)
    
    # Quantify MSE over the product states (one column of z per product state)
    for ij in range(z_isdf.shape[1]):
        mse_product_err[ij] = mean_squared_error_regular_grid(z_exact[:, ij], z_isdf[:, ij])
    # The reported values are mean squared errors, not relative errors
    print("Min, Mean, Max MSE:", np.amin(mse_product_err), np.mean(mse_product_err), np.amax(mse_product_err))

    # Relative errors are ignored as these blow up for some reference values



Min, Mean, Max relative error: 4.472228706771518e-09 4.125432004774827e-07 2.8134464605811815e-06


In [7]:
"""Plot pair product states using PYSCF's Cube functionality
"""
from pyscf.tools import cubegen

# Cube grid with the same number of points per axis as `points`
nx, ny, nz = points
cube_grid = cubegen.Cube(mol_benzene, nx, ny, nz)

output_root = project_root / 'outputs/isdf_product_expansion/benzene'
# Make sure the target directory exists before any cube file is written
output_root.mkdir(parents=True, exist_ok=True)

# One pair of cube files per product state: column ij of each matrix is
# reshaped from the flat grid index to (nx, ny, nz)
for ij in range(n_product):
    # Exact product state
    fname = output_root / f'z_exact_{ij}.cube'
    cube_grid.write(field=z_exact[:, ij].reshape(nx, ny, nz), fname=fname.as_posix())
    # Approximate product state
    fname = output_root / f'z_isdf_{ij}.cube'
    cube_grid.write(field=z_isdf[:, ij].reshape(nx, ny, nz), fname=fname.as_posix())

In [8]:
"""
Cube file writes out in a consistent order, so:

* Parse exact product functions from cube files, generated with fortran
* Parse exact product functions from cube files, generated with Python
* Compare numerically, and confirm that all functions agree. If so, my fortran face-splitting product is ok, and the bug is elsewhere

Nabbed the functions from: https://gist.github.com/aditya95sriram/8d1fccbb91dae93c4edf31cd6a22510f
"""

def _getline(cube):
    """
    Read a line from cube file where first field is an int 
    and the remaining fields are floats.

    params:
        cube: file object of the cube file

    returns: (int, list<float>)
    """
    l = cube.readline().strip().split()
    return int(l[0]), map(float, l[1:])


def read_cube(fname):
    """ 
    Read cube file into numpy array

    params:
        fname: filename of cube file

    returns: (data: np.array, metadata: dict)
        data is a flat array of length nx*ny*nz, in the order the values appear
        in the file. metadata has keys 'org', 'xvec', 'yvec', 'zvec' (lists of
        floats taken from the four header lines after the comments) and 'atoms'
        (a list of (int, list<float>) tuples, one per atom line).

    raises:
        IndexError: the file holds more values than nx*ny*nz
    """
    meta = {}
    with open(fname, 'r') as cube:
        # The first two lines are comments and are ignored
        cube.readline()
        cube.readline()

        # Header: leading int of each line is a count, the rest are floats.
        # Values are materialised as lists so the metadata can be read more
        # than once, whatever kind of iterable _getline hands back.
        natm, origin = _getline(cube)
        meta['org'] = list(origin)
        nx, xvec = _getline(cube)
        meta['xvec'] = list(xvec)
        ny, yvec = _getline(cube)
        meta['yvec'] = list(yvec)
        nz, zvec = _getline(cube)
        meta['zvec'] = list(zvec)

        atoms = []
        for _ in range(natm):
            number, values = _getline(cube)
            atoms.append((number, list(values)))
        meta['atoms'] = atoms

        # Volumetric data: whitespace-separated values over the remaining lines.
        # Entries not present in the file stay zero.
        data = np.zeros((nx*ny*nz))
        idx = 0
        for line in cube:
            for val in line.split():
                data[idx] = float(val)
                idx += 1
    # I have no need to reshape
    # data = np.reshape(data, (nx, ny, nz))
    return data, meta




In [9]:
"""KS state comparison
"""

# Directory holding the reference cube files (absolute, machine-specific path;
# presumably written by the Fortran implementation - see the cell docstrings)
fortran_root = "/Users/alexanderbuccheri/Codes/isdf_fortran/ks_states/"

for i in range(0, n_occ):
    # Reference files are numbered from 1, hence i + 1
    wfs_fortran, _ = read_cube(fortran_root + f"benzene_wf_{i + 1}.cube")
    # read_cube returns a flat array; it must hold one value per row (grid point) of wfs
    assert wfs_fortran.shape[0] == wfs.shape[0]
    # Point-wise absolute difference against column i of the Python wave functions
    diff = np.abs(wfs_fortran - wfs[:, i])
    # Report the mean absolute difference per state
    print(i, np.mean(diff))  #, np.amin(diff), np.amax(diff))


0 2.7871022495986537e-08
1 1.4611023065075657e-08
2 1.1355451213837888e-08
3 1.588174847291656e-08
4 1.4435367052299203e-08
5 1.8494407518565304e-08
6 7.709859376677836e-08
7 9.329353786668679e-08
8 1.0586814611400175e-07
9 1.0069237654944394e-07
10 1.1356407967690141e-07
11 9.501069349347241e-08
12 9.379491828020937e-08
13 9.731280347238638e-08
14 9.499039390015851e-08
15 8.98436236466868e-08
16 9.567886349496566e-08
17 1.1058333562847117e-07
18 1.133280895441246e-07
19 8.780837863227887e-08
20 9.225577303382062e-08


In [11]:
""" Product function comparison
"""

# Python cube files
# (absolute, machine-specific path; same relative location as the z_exact_*.cube
# files written under outputs/isdf_product_expansion/benzene earlier in this notebook)
python_root = "/Users/alexanderbuccheri/Codes/isdf_prototypes/outputs/isdf_product_expansion/benzene/"
# Earlier comparison against a different set of reference files, kept for reference:
# fortran_root = "/Users/alexanderbuccheri/Codes/isdf_fortran/product_cubes/"
# 
# n_product = 441
# for i in range(0, n_product):
#     z_exact_python, _ = read_cube(python_root + f"z_exact_{i}.cube")
#     z_exact_fortran, _ = read_cube(fortran_root + f"product_exact{i + 1}.cube")
#     diff = np.abs(z_exact_python - z_exact_fortran)
#     print(i, np.mean(diff))  #, np.amin(diff), np.amax(diff))
#     
# Use different output
fortran_root = "/Users/alexanderbuccheri/Codes/isdf_fortran/face_output/"

# Hard-coded number of product states; equals n_occ * n_occ for n_occ = 21
n_product = 441
for i in range(0, n_product):
    # Python files are numbered from 0, the reference files from 1
    z_exact_python, _ = read_cube(python_root + f"z_exact_{i}.cube")
    z_exact_fortran, _ = read_cube(fortran_root + f"product_exact_alt{i + 1}.cube")
    # Mean absolute point-wise difference for product state i
    diff = np.abs(z_exact_python - z_exact_fortran)
    print(i, np.mean(diff))

0 2.4458681125838943e-09
1 1.6943651268835144e-09
2 1.542112041188276e-09
3 1.4716515234962887e-09
4 9.633715721392635e-10
5 2.361607969914938e-09
6 7.173818259507367e-10
7 5.321679051978716e-10
8 2.394964615607732e-09
9 2.0879276693637587e-09
10 6.788874709002315e-10
11 2.8124494000051744e-10
12 1.8824338608663177e-09
13 4.3516125329973535e-10
14 3.155482396994537e-10
15 2.0057542014033435e-09
16 2.276730825380083e-09
17 1.661881978859784e-09
18 2.1944551520009052e-09
19 1.3254956661500729e-09
20 9.405755707395776e-10
21 1.6943651268835144e-09
22 1.2490209272177061e-09
23 1.1659293798634658e-09
24 1.1046353200298514e-09
25 5.086563609195935e-10
26 1.1901875746669718e-09
27 1.3212026020697816e-09
28 6.909942081867799e-10
29 1.0506822274578684e-09
30 1.4203820167086753e-09
31 1.2286353331287185e-09
32 3.1444211670976916e-10
33 1.69677260761271e-09
34 2.826575848001761e-10
35 2.7242424281041126e-10
36 1.523256695995158e-09
37 1.6488462348475492e-09
38 1.4566563013087638e-09
39 2.68454612

In [23]:
"""Check CC^T and inv(CC^T)
"""

# Parse from fortran
fortran_file = '/Users/alexanderbuccheri/Codes/isdf_fortran/compare_quantities/cct.dat'
with open(fortran_file, 'r') as fid:
    cct_stream = fid.readlines()

# The last field of the first line is the number of interpolation points
n_interp = int(cct_stream[0].split()[-1])
# NaN-initialised (rather than uninitialised) so that any (i, j) entry missing
# from the file is visible instead of holding arbitrary memory contents
cct_fortran = np.full((n_interp, n_interp), np.nan, dtype=float)

# Remaining lines: "i j cct_value inv_cct_value" with 1-based indices
for line in cct_stream[1:]:
    if not line.strip():
        # Tolerate blank lines, e.g. at the end of the file
        continue
    i, j, cct_value, inv_cct_value = line.split()
    i_c, j_c = int(i) - 1, int(j) - 1
    
    cct_fortran[i_c, j_c] = float(cct_value)

# Compare (CC^T) from fortran to matrix computed in python
# NOTE(review): this re-runs k-means, and kmeans_opts sets random_state to None,
# so these interpolation points need not match the ones found in the earlier
# cell (or the ones the reference file was computed with) - confirm before
# interpreting any disagreement between cct and cct_fortran.
continuous_clusters = find_interpolation_points_via_kmeans(n_interp, grid_benzene, rho_benzene, **kmeans_opts)
discrete_clusters, interpolation_indices = discretise_continuous_values(continuous_clusters, grid_benzene)
    
assert wfs.shape == (n_total, n_occ)
# Wave functions restricted to the interpolation points
wfs_interp = wfs[interpolation_indices, :]
# Python CC^T is the element-wise square of the Gram matrix of those rows;
# the matrix product is formed once and reused
gram = wfs_interp @ wfs_interp.T
cct = gram * gram
assert cct.shape == (n_interp, n_interp), "Shape of python CC^T disagrees with fortran reference shape"

print('cct and cct_fortran look different')
# for i in range(n_interp):
#     for j in range(n_interp):
#         print(i, j, cct_fortran[i, j], cct[i, j])
        
# Compute inv(CC^T) using fortran input, and compare to fortran references 





0 0 3.805140921394125e-05 0.003493489767151743
0 1 3.37623242871452e-05 2.972147699842845e-06
0 2 1.556236618133054e-05 4.661344663053919e-06
0 3 2.8377752539387166e-05 3.0352184562488085e-05
0 4 1.4698485376499276e-05 0.00016251575346527117
0 5 0.00011588531557837098 7.336451385286399e-08
0 6 2.2278269744979472e-07 8.023771826927454e-05
0 7 2.0786048482230938e-06 1.2462801756368627e-05
0 8 2.2121732926305115e-07 2.9345903170737145e-08
0 9 2.7709262292281866e-05 0.0007218372814494365
0 10 0.00011618884111618692 2.13310391719186e-05
0 11 2.2166207373516704e-07 1.5216985439327047e-05
0 12 6.299678249240686e-06 1.125314275479404e-05
0 13 1.0614570683688242e-06 1.993967473098133e-05
0 14 4.314205398940744e-07 5.747472382043849e-06
0 15 2.2083919219240347e-07 6.16187570853708e-08
0 16 1.96965366206762e-06 1.645338598828269e-06
0 17 8.499131971372293e-06 3.0210538558998393e-05
0 18 5.207784073131856e-06 6.1750906620858e-05
0 19 6.397348336433494e-07 3.117494281259344e-05
0 20 5.0746850435446

In [None]:
"""Check ZC^T
"""

# Parse from fortran

# Generate from python code

# Compare


In [12]:
""" Other systems to test later
"""
# SMILES strings, keyed by molecule name
smiles_strs = {
    'benzene': 'c1ccccc1',
    # Dimethyl sulfoxide: sulfur bonded to two methyl groups and one oxygen.
    # ('CC(=O)SC' would be S-methyl thioacetate, a different molecule.)
    'dmso': 'CS(=O)C',
    'pyrazole': 'c1cc[nH]n1',
    # Ferrocene: Fe(II) with two five-membered cyclopentadienyl anions,
    # not two six-membered benzene rings.
    'ferrocene': '[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]',
}

# mol_dmso = pyscf_molecule(project_root / 'inputs/dimethyl_sulfoxide.xyz', 'def2-SVP')
# mf_dmso = dft.RKS(mol_dmso)
# mf_dmso.kernel()
# 
# mol_pyrazole = pyscf_molecule(project_root / 'inputs/1H-Pyrazole.xyz', 'def2-SVP')
# mf_pyrazole = dft.RKS(mol_pyrazole)
# mf_pyrazole.kernel()

SyntaxError: invalid syntax (2800844049.py, line 1)