# K-Means Grid Determination on Molecular Orbitals


In [1]:
"""Build the benzene molecule with a GTO basis (def2-SVP), using PySCF
"""
%load_ext autoreload
%autoreload 2

from pyscf import gto, dft
from rdkit import Chem


# Cartesian geometry of benzene: one "element x y z" record per atom
# (6 carbons followed by 6 hydrogens).
benzene_coordinates = """
 C         -0.65914       -1.21034        3.98683
 C          0.73798       -1.21034        4.02059
 C         -1.35771       -0.00006        3.96990
 C          1.43653       -0.00004        4.03741
 C         -0.65915        1.21024        3.98685
 C          0.73797        1.21024        4.02061
 H         -1.20447       -2.15520        3.97369
 H          1.28332       -2.15517        4.03382
 H         -2.44839       -0.00006        3.94342
 H          2.52722       -0.00004        4.06369
 H         -1.20448        2.15509        3.97373
 H          1.28330        2.15508        4.03386
"""

# RDKit molecule built from the aromatic SMILES of benzene, with explicit
# hydrogens added; it supplies the ball-and-stick data for the plots.
rdkit_mol = Chem.AddHs(Chem.MolFromSmiles("c1ccccc1"))

# PySCF molecule: the geometry above in the def2-SVP basis
mol = gto.Mole()
mol.build(atom=benzene_coordinates, basis='def2-SVP')

# Restricted Kohn-Sham SCF (no functional is set, so PySCF's default is used;
# the original note describes this as KS-LDA).
# kernel() is left as the final bare expression so the notebook echoes the energy.
mf = dft.RKS(mol)
mf.kernel()

converged SCF energy = -229.930646528208


-229.930646528208

In [37]:
"""Build the electron density
"""
import numpy as np
from pathlib import Path

import py3Dmol
from pyscf.tools import cubegen

from isdf_prototypes.visualise import visualise_mo


# One-electron reduced density matrix of the SCF solution
dm = mf.make_rdm1()

# Cube-grid settings: number of points along each axis, and in total
nx, ny, nz = 10, 10, 10
n_total = np.prod([nx, ny, nz])

# Notes on the PySCF cube grid
# * Passing `resolution` overrides (nx, ny, nz), so supplying it gives a
#   completely different grid.
# * The grid is assumed to be in Bohr rather than Angstrom, so positions are
#   converted before being drawn/written as xyz (otherwise the scale is badly off).
# * The cube file is assumed to be recentred as well; if so, the same shift has to
#   be applied to anything written to xyz.

# Export to cube format. cubegen.density is not modular: the one call computes rho,
# writes the cube file and returns rho.
cube_file = '../outputs/benzene_density.cube'
rho = cubegen.density(mol, cube_file, dm, nx=nx, ny=ny, nz=nz)
assert rho.shape == (nx, ny, nz)

# Read the cube file back in as text and draw the density isosurface
cube_file = Path(cube_file)
with open(cube_file) as f:
    cube_data = f.read()

view = visualise_mo(cube_data, py3Dmol.view(width=400, height=400), isoval=0.0001, mol=rdkit_mol)

# Rebuild the real-space grid that rho was evaluated on
cube_grid = cubegen.Cube(mol, nx, ny, nz)
grid_points = cube_grid.get_coords()
assert grid_points.shape == (n_total, 3)

# Overlay every grid point on the isosurface as a small translucent blue sphere.
# The points are scaled from Bohr to Angstrom in one go before plotting.
bohr_to_ang =  0.529177249
for x, y, z in grid_points * bohr_to_ang:
    view.addSphere({'center': {'x': x, 'y': y, 'z': z}, 'radius': 0.1, 'color': 'blue', 'alpha': 0.5})

view.setStyle({'sphere': {}})
view.show()

In [38]:
""" Find optimal centroid points using SKLearn

See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

Options:
 * init='k-means++' starts with the more effective algorithm, but note that I do not have this implemented
 * For initial testing, fix the random state seed
 * algorithm='lloyd' is fixed, this is what I implement
 
n_init:
Number of times the k-means algorithm is run with different centroid seeds. The final result is the best output of the n_init consecutive runs in terms of inertia. Several runs are recommended for sparse high-dimensional problems.

With n_init='auto' (as used below), the number of runs depends on init:
if I start with k-means++ to optimise the initial seeding, n_init is set to 1;
if I use random seeding, then 10 runs are performed and the best is taken.
"""
import copy

from sklearn.cluster import KMeans

from isdf_prototypes.visualise import write_xyz

bohr_to_ang = 0.529177249

# Number of centroids to find.
# NOTE: the original note called this "~ sqrt(n_total)", but for the 10x10x10 grid
# (n_total = 1000) it is n_total / 10.
n_clusters = 100
# Using an int will produce the same results across different calls.
# Popular integer random seeds are 0 and 42
random_state = 10

k_means = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init='auto',
    max_iter=300,
    tol=0.0001,
    verbose=0,
    random_state=random_state,
    copy_x=True,
    algorithm='lloyd',
)

# Weight each grid point by the density at that point.
# Assigning weight function: https://stackoverflow.com/questions/50789660/weighted-k-means-in-python
rho = rho.reshape(-1)
assert rho.shape == (n_total,)
k_means.fit(grid_points, sample_weight=rho)

clusters = k_means.cluster_centers_
assert clusters.shape == (n_clusters, 3)

# Output full grid and clusters to .xyz with dummy species, positions in Angstrom.
# Each species list is sized from the positions it labels; the cluster file previously
# received n_total labels for only n_clusters positions.
grid_xyz = write_xyz(['Sr'] * len(grid_points), grid_points * bohr_to_ang)
clusters_xyz = write_xyz(['U'] * len(clusters), clusters * bohr_to_ang)

with open('../outputs/grid.xyz', "w") as fid:
    fid.write(grid_xyz)

with open('../outputs/clusters.xyz', "w") as fid:
    fid.write(clusters_xyz)

# Add optimal cluster points to the py3Dmol plot.
# view_sk1 is an alias of the existing view (not a fresh one), so the red centroids are
# drawn on top of whatever that view already contains.
view_sk1 = view

for x, y, z in clusters * bohr_to_ang:
    view_sk1.addSphere({'center': {'x': x, 'y': y, 'z': z}, 'radius': 0.2, 'color': 'red'})

view_sk1.setStyle({'sphere': {}})
view_sk1.show()


In [39]:
""" Find optimal centroid points using SKLearn

Use random sampling of centroids, instead of k-means++
Note, the random seed is hard-coded so the result is deterministic. With n_init=10, the algorithm is still run 10 times from different random initialisations and the run with the lowest inertia is kept.
"""
# A fixed integer seed makes repeated calls return the same result
# (0 and 42 are the popular choices).
random_state = 10

# Weighted k-means with randomly drawn initial centroids; the grid points are
# weighted by rho via sample_weight.
k_means = KMeans(
    n_clusters=n_clusters,
    init='random',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=random_state,
    algorithm='lloyd',
).fit(grid_points, sample_weight=rho)
clusters = k_means.cluster_centers_

# Draw each centroid on the existing py3Dmol view as a green sphere,
# converting the positions from Bohr to Angstrom first.
for x, y, z in clusters * bohr_to_ang:
    view.addSphere({'center': {'x': x, 'y': y, 'z': z}, 'radius': 0.2, 'color': 'green'})

view.setStyle({'sphere': {}})
view.show()

In [40]:
""" Find optimal centroid points using My Implementation

I'm using k-means++, but not the greedy implementation (which is better)
I should supply a seed so I can fix the random_choice function
"""
from isdf_prototypes.interpolation_points import weighted_kmeans, kmeans_seeding


# Seed the centroids, then refine them with the project's weighted k-means,
# using rho as the per-point weight.
initial_centroids = kmeans_seeding(grid_points, n_clusters)
# The second return value was previously bound to `iter`, which shadowed the builtin
# for the rest of the notebook. It is presumably the iteration count - TODO confirm.
my_clusters, n_iterations = weighted_kmeans(grid_points, rho, initial_centroids, n_iter=300,
                                            centroid_tol=1.0e-7, safe_mode=True, verbose=False)

# Create a new view instance with one SKLearn set of points, my algorithm's points and the isosurface
view = py3Dmol.view(width=400, height=400)

# benzene isosurface
view = visualise_mo(cube_data, view, isoval=0.0001, mol=rdkit_mol)

# `clusters` holds whichever scikit-learn fit ran last; those centroids are drawn in red
# and the centroids from my implementation in green. Rows are scaled one at a time from
# Bohr to Angstrom, as in the original loop.
for sk_point, my_point in zip(clusters, my_clusters):
    x, y, z = sk_point * bohr_to_ang
    x2, y2, z2 = my_point * bohr_to_ang
    view.addSphere({'center': {'x': x, 'y': y, 'z': z}, 'radius': 0.2, 'color': 'red'})
    view.addSphere({'center': {'x': x2, 'y': y2, 'z': z2}, 'radius': 0.2, 'color': 'green'})

view.show()
print("My results just are not as good - probabaly an issue in my implementation")

My results just are not as good - probabaly an issue in my implementation
