# Random Selection

When selecting indices associated with an array of grid points, one wants to use a weight function to inform this choice.
If one is sampling uniformly, it's possible that a centroid will be chosen in a region of space where the sum of the weights
associated with the cluster's grid points is zero. For example, in cells with a lot of vacuum.

This can be achieved straightforwardly with numpy:

```python
random.choice(a, size=None, replace=True, p=None)
```

where one passes a weight function or distribution as the argument to `p`. However, when using Fortran, this functionality
is not available. This notebook prototypes a basic implementation that samples random numbers according to a user-supplied
function, using binary search.



In [3]:
"""pyscf setup to generate the grid and density
"""
%load_ext autoreload
%autoreload 2

import numpy as np

from pathlib import Path
from pyscf.tools import cubegen
from pyscf import gto, dft
from rdkit import Chem

# Conversion factor from Bohr to Angstrom
bohr_to_ang =  0.529177249
# Directory the cube file below is written into.
# NOTE(review): nothing in this cell creates it - assumed to exist already.
output_root = Path('../outputs/isdf_product_expansion')

# Benzene geometry: one "symbol x y z" line per atom (6 C + 6 H).
# NOTE(review): no unit is set on the molecule below, so these are presumably
# Angstrom (the PySCF default) - confirm.
benzene_coordinates = """
 C         -0.65914       -1.21034        3.98683
 C          0.73798       -1.21034        4.02059
 C         -1.35771       -0.00006        3.96990
 C          1.43653       -0.00004        4.03741
 C         -0.65915        1.21024        3.98685
 C          0.73797        1.21024        4.02061
 H         -1.20447       -2.15520        3.97369
 H          1.28332       -2.15517        4.03382
 H         -2.44839       -0.00006        3.94342
 H          2.52722       -0.00004        4.06369
 H         -1.20448        2.15509        3.97373
 H          1.28330        2.15508        4.03386
"""

# PYSCF Molecule
mol = gto.Mole()
mol.atom = benzene_coordinates
mol.basis = 'def2-SVP'
mol.build()

# RDKIT molecule from SMILES data (aromatic ring, then explicit hydrogens added).
# Not referenced again in the cells visible here.
rdkit_mol = Chem.MolFromSmiles("c1ccccc1")
rdkit_mol = Chem.AddHs(rdkit_mol)
# mol.atom_coords(unit='angstom'), symbols=[mol.atom_symbol(i) for i in range(12)]

# Solve SCF for restricted KS-LDA
mf = dft.RKS(mol)
mf.kernel()
# Occupied states for benzene, with this basis is 22
# NOTE(review): benzene has 42 electrons, i.e. 21 doubly-occupied orbitals in a
# restricted calculation - confirm that 22 is intended.
n_occ = 22

# Grid/cube settings: number of grid points along each axis, and in total
nx, ny, nz = 10, 10, 10
n_total = np.prod([nx, ny, nz])

# Generate the real-space grid: one (x, y, z) row per grid point
cube_grid = cubegen.Cube(mol, nx, ny, nz)
grid_points = cube_grid.get_coords()
assert grid_points.shape == (n_total, 3)

# Density: evaluated from the converged density matrix on the same
# nx * ny * nz cube and written to `cube_file`
dm = mf.make_rdm1()
# NOTE(review): the f-string prefix is redundant here (no placeholders)
cube_file = output_root / f'density.cube'
rho = cubegen.density(mol, cube_file.as_posix(), dm, nx=nx, ny=ny, nz=nz)
# Flatten to 1D; presumably this lines rho up with the rows of
# `grid_points` - confirm the ordering matches
rho = rho.reshape(-1)

converged SCF energy = -229.930646528208


In [27]:
""" Random number generation, sampling from a user-supplied distribution
"""
import numpy as np



# NumPy's built-in weighted sampler, for reference: np.random.Generator.choice
def construct_cumulative_weights(weights) -> np.ndarray:
    """Construct the cumulative weight function (running sum) of ``weights``.

    Entry ``i`` of the result is ``weights[0] + ... + weights[i]``.

    Implemented with an explicit loop for easy transcribing to Fortran,
    else one could use existing np/scipy functions.

    Args:
        weights: Sequence (list or array) of weights.

    Returns:
        Array with the same shape and dtype as ``np.asarray(weights)``
        holding the running sums. Empty input gives an empty array.
    """
    weights = np.asarray(weights)
    cumulative_weights = np.empty_like(weights)

    # Nothing to accumulate; also avoids indexing element 0 of an empty array
    if weights.size == 0:
        return cumulative_weights

    cumulative_weights[0] = weights[0]
    for i, weight in enumerate(weights[1:], start=1):
        # Each entry is the previous running sum plus the current weight
        cumulative_weights[i] = cumulative_weights[i - 1] + weight
    return cumulative_weights
    

def weighted_random_choice(n, weights):
    """Draw ``n`` random indices with probabilities proportional to ``weights``.

    A uniform random number is scaled by the total weight and located in the
    cumulative weight function with a binary search; the index of the first
    cumulative value that exceeds it is the index that gets returned.

    Note that this implementation does not guarantee that every index returned
    is unique. One may get the same index (or indices) appearing more than once.
    One would need to rewrite the implementation to add this feature, and the
    implementation in Fortran would look quite different - so not worth doing.

    Args:
        n: Number of indices to draw.
        weights: Sequence of weights, one per candidate index. Assumed to be
            non-negative; the binary search relies on the cumulative weights
            being non-decreasing.

    Returns:
        ``np.ndarray`` of shape ``(n,)`` and dtype ``int32`` containing
        0-based indices into ``weights``.

    Raises:
        ValueError: If ``n`` exceeds ``len(weights)``, or if samples are
            requested but the weights do not sum to a positive value.
    """
    # Local import: this function calls random.random(), and no module-level
    # `import random` is visible in this file.
    import random

    n_weights = len(weights)
    if n > n_weights:
        raise ValueError("More samples requested than there are weights")

    indices = np.empty(shape=n, dtype=np.int32)
    if n == 0:
        return indices

    cumulative_weights = construct_cumulative_weights(weights)
    # Take the total from the cumulative array itself, so the random value is
    # scaled by exactly the number the search compares against (a separately
    # computed np.sum may round differently for floating-point weights).
    total_weight = cumulative_weights[-1]
    if not total_weight > 0:
        # Without this, an all-zero weight function silently yields the
        # last index for every draw.
        raise ValueError("Weights must sum to a positive value")

    for i in range(n):
        random_value = random.random() * total_weight

        # Binary search for the first index whose cumulative weight exceeds
        # random_value. Zero-weight entries repeat the previous cumulative
        # value, so they are skipped in favour of an earlier index.
        low, high = 0, n_weights - 1
        while low < high:
            mid = (low + high) // 2
            if random_value < cumulative_weights[mid]:
                high = mid
            else:
                low = mid + 1

        indices[i] = low

    return indices



In [31]:
""" 
Visually check that the initial guess centroids are already located in regions of high density
"""
# Example usage: integers 1 to 100 with custom weights
elements = list(range(1, 101))   # Create a list of integers from 1 to 100
weights = [i**2 for i in elements]  # Example weights (squares of the integers)

# Draw 20 indices (with replacement, so repeats are possible).
# weighted_random_choice returns 0-based indices into `weights`, so map them
# back onto `elements` to report the numbers that were actually chosen.
chosen_indices = weighted_random_choice(20, weights)
chosen_numbers = np.asarray(elements)[chosen_indices]
print(f"The chosen numbers are: {chosen_numbers}")

# NOTE. I didn't bother testing this because 
# a) the fortran code will look quite different
# b) Can already be achieved in python with numpy

# Output full grid to .xyz with dummy species
# grid_xyz = write_xyz(['Sr']*n_total, grid_points * bohr_to_ang)
# with open(output_root / 'grid.xyz', "w") as fid:
#     fid.write(grid_xyz)

# Print density to cube

# Output INITIAL centroids to .xyz with dummy species
# grid_xyz = write_xyz(['Sr']*n_total, grid_points * bohr_to_ang)
# with open(output_root / 'grid.xyz', "w") as fid:
#     fid.write(grid_xyz)



The chosen numbers are: [61 87 83 81 97 96 83 56 86 67 80 43 64 85 88 65 60 72 92 84]
