# Water Descriptor Selection using Dadapy

## Imports

In [None]:
import pathlib
import re

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from ase.io import read as ase_read

from dadapy.feature_weighting import FeatureWeighting

## Import Data

In [None]:
# Absolute path of the data directory, one level above the working directory.
data_dir = (pathlib.Path("..") / "data").resolve()

In [None]:
# Read every frame of the dataset file as an ase.Atoms object; these frames
# are treated as the liquid configurations in the rest of the notebook.
liquid_frames = ase_read(data_dir / "ice_in_water_data/dataset_1000_eVAng.xyz", index=':')

# Per-frame atom counts and their total over all frames.
frame_sizes = np.asarray([len(frame) for frame in liquid_frames], dtype=np.int16)
n_atoms = np.sum(frame_sizes)

# Flat array of atomic numbers, filled frame by frame in file order.
atom_types = np.zeros((n_atoms), dtype=np.int8)
counter = 0
for frame in liquid_frames:
    stop = counter + len(frame)
    atom_types[counter:stop] = frame.get_atomic_numbers()
    counter = stop

# Atomic number 8 marks oxygen; every other atom is counted as hydrogen.
is_h = atom_types != 8
is_o = np.logical_not(is_h)

print(f"Found {np.count_nonzero(is_o)} Oxygen atoms and {np.count_nonzero(is_h)} Hydrogen atoms.")

In [None]:
# Load the precomputed SOAP descriptors from file.
# (Recomputing from the ase.Atoms objects is feasible for SOAP; ACSF takes noticeably longer.)
average_soap = np.load(data_dir / "ice_in_water_data/average_soap_rcut6_nmax6_lmax6_sigma03.npy")
atomic_soap = np.load(data_dir / "ice_in_water_data/singleatom_soap_rcut6_nmax6_lmax6_sigma03.npy")
for message, loaded in (
    ("Fetched computed atomic SOAP descriptors for %u configurations and with %u features each.", atomic_soap),
    ("Fetched computed global SOAP descriptors for %u configurations and with %u features each.", average_soap),
):
    # Each message has two %u slots, so the arrays are expected to be 2D.
    print(message % loaded.shape)

# The input file the descriptors were calculated from holds 54 solid configurations
# followed by 1000 liquid ones, so the liquid rows are the last n_atoms rows
# of the descriptor matrix.
liquid_atomic_soap = atomic_soap[-n_atoms:].copy()

In [None]:
# Load the precomputed ACSF descriptors as float32 arrays.
average_acsf = np.asarray(np.load(data_dir.joinpath("ice_in_water_data/average_acsf_rcut6_gridsearch_bohr_lambda.npy")), dtype=np.float32)
atomic_acsf = np.asarray(np.load(data_dir.joinpath("ice_in_water_data/singleatom_acsf_rcut6_gridsearch_bohr_lambda.npy")), dtype=np.float32)
# Keep an independent copy of the last n_atoms rows (the liquid atoms).
liquid_atomic_acsf = atomic_acsf[-n_atoms:, :].copy()
# These messages previously said "SOAP" although they report the ACSF arrays.
print("Fetched computed atomic ACSF descriptors for %u configurations and with %u features each."%atomic_acsf.shape)
print("Fetched computed liquid atomic ACSF descriptors for %u configurations and with %u features each."%liquid_atomic_acsf.shape)
print("Fetched computed global ACSF descriptors for %u configurations and with %u features each."%average_acsf.shape)

## Post-Processing

In [None]:
# Scale every descriptor row to unit Euclidean norm, in place.
descriptors = [average_soap, atomic_soap, liquid_atomic_soap, average_acsf, atomic_acsf, liquid_atomic_acsf]
for desc in descriptors:
    row_norms = np.linalg.norm(desc, axis=-1, keepdims=True)
    desc /= row_norms
average_soap, atomic_soap, liquid_atomic_soap, average_acsf, atomic_acsf, liquid_atomic_acsf = descriptors

# Some atomic ACSF rows contain NaN after normalisation; zero out every such row.
# NOTE(review): the messages say "Removing", but the rows are kept and set to 0.
nan_frames = np.nonzero(np.isnan(atomic_acsf))[0]
print("Removing %u nan frames in atomic acsf"%(np.unique(nan_frames).size))
atomic_acsf[nan_frames] = 0.

# Same treatment for the liquid-only copy.
nan_frames = np.nonzero(np.isnan(liquid_atomic_acsf))[0]
print("Removing %u nan frames in liquid atomic acsf"%(np.unique(nan_frames).size))
liquid_atomic_acsf[nan_frames] = 0.

### Define Target Space

In [None]:
rng = np.random.default_rng()
# Alternative to the strided subsample below: draw 300 random rows with
# rng.choice(liquid_atomic_soap.shape[0], 300)

# Take every 500th liquid atom: SOAP is the target space, ACSF the space to optimise.
subsample = slice(None, None, 500)
target_data = liquid_atomic_soap[subsample]
input_space = liquid_atomic_acsf[subsample]

# Per-feature standard deviation; constant features get a divisor of 1
# so that the division below does not divide by zero.
stds = np.std(input_space, axis=0)
constant_features = stds == 0.
stds[constant_features] = 1.
standardised_input = input_space / stds

print(f"Working on ground truth shaped {target_data.shape} and optimising space of shape {input_space.shape}")

## Fitting

### Regular ACSF to SOAP

In [None]:
n_epochs = 100  # number of training epochs
# 30 L1 penalties, log-spaced between 0.05 and 0.2
l1s = np.logspace(np.log10(0.05), np.log10(0.2), 30)

# Both spaces hold the same number of points, so they share one maxk.
max_neighbours = input_space.shape[0] - 1
f = FeatureWeighting(coordinates=input_space, verbose=True, maxk=max_neighbours)
f_target = FeatureWeighting(coordinates=target_data, maxk=max_neighbours)

lasso_search = f.return_lasso_optimization_dii_search(
    target_data=f_target,
    initial_weights=np.ones(input_space.shape[-1]),  # explicit: every feature starts at weight 1
    n_epochs=n_epochs,
    l1_penalties=l1s,
    # NOTE(review): True is passed here, not None or a float; the earlier comment
    # claimed "(default) set automatically" -- confirm how dadapy interprets True.
    learning_rate=True,
    refine=False,
    plotlasso=True,  # automatically show DII vs number of non-zero features
)
(
    num_nonzero_features,
    l1_penalties_opt_per_nfeatures,
    dii_opt_per_nfeatures,
    weights_opt_per_nfeatures,
) = lasso_search

In [None]:
# Inspect what the optimisation recorded.
history = f.history
weights_history = history["weights_per_l1_per_epoch"]
print(history.keys())
print(weights_history.shape)
# Non-zero weight count per entry along the first axis, taken at the last index
# of the second axis (presumably L1 penalty and epoch, going by the key name).
print(np.count_nonzero(weights_history[:, -1, :], axis=-1))
print(history["l1_penalties"])

# Summary arrays returned by the lasso search.
for summary in (num_nonzero_features, l1_penalties_opt_per_nfeatures, dii_opt_per_nfeatures):
    print(summary)

### Remove features with zero standard deviation

In [None]:
# Drop the input features whose standard deviation is not strictly positive.
stds_input = np.std(input_space, axis=0)
has_variance = stds_input > 0.
print("Removing dimensions with zero variance: ")
print(np.argwhere(stds_input == 0.))
shortened_input = input_space[:, has_variance].copy()
# Shapes before and after the column removal.
for array in (input_space, shortened_input):
    print(array.shape)
shortened_f = FeatureWeighting(
    coordinates=shortened_input,
    verbose=True,
    maxk=shortened_input.shape[0] - 1,
)

In [None]:
# Repeat the lasso DII search on the reduced input space, with uniform starting weights.
shortened_search = shortened_f.return_lasso_optimization_dii_search(
    target_data=f_target,
    initial_weights=np.ones(shortened_input.shape[-1]),  # explicit: every feature starts at weight 1
    n_epochs=n_epochs,
    l1_penalties=l1s,
    # NOTE(review): True is passed here, not None or a float -- confirm how dadapy interprets it.
    learning_rate=True,
    refine=False,
    plotlasso=True,  # automatically show DII vs number of non-zero features
)
(
    shortened_num_nonzero_features,
    shortened_l1_penalties_opt_per_nfeatures,
    shortened_dii_opt_per_nfeatures,
    shortened_weights_opt_per_nfeatures,
) = shortened_search

### Initialise with 1/std

In [None]:
# 10 L1 penalties, log-spaced from 1e-3 to 1e3
stdnormed_l1s = np.logspace(-3, 3, 10)

# Same search on the reduced space, but with library-chosen initial weights
# (initial_weights=None) and refine enabled.
stdnormed_search = shortened_f.return_lasso_optimization_dii_search(
    target_data=f_target,
    initial_weights=None,  # (default) set automatically
    n_epochs=n_epochs,
    l1_penalties=stdnormed_l1s,
    # NOTE(review): True is passed here, not None or a float -- confirm how dadapy interprets it.
    learning_rate=True,
    refine=True,
    plotlasso=True,  # automatically show DII vs number of non-zero features
)
(
    stdnormed_num_nonzero_features,
    stdnormed_l1_penalties_opt_per_nfeatures,
    stdnormed_dii_opt_per_nfeatures,
    stdnormed_weights_opt_per_nfeatures,
) = stdnormed_search

In [None]:
# Load stored reference results for comparison.
kernel_imbs = np.load(data_dir / 'water_phase_store/kernel_imbs_hartbohr_lambda.npy')
lasso_gammas = np.load(data_dir / 'water_phase_store/lasso_gammas_hartbohr_lambda.npy')

# Column indices with a non-zero entry in row n_true of the stored gammas
# (presumably the solution with n_true selected features -- verify against the stored file).
n_true = 5
reference_row_nonzero = lasso_gammas[n_true, :] != 0.
true_indices = np.argwhere(reference_row_nonzero)
print(f"Indices with {n_true} true values:")
print(true_indices)

In [None]:
# A bare expression in the middle of a cell is never displayed, so print the shape explicitly.
print(stdnormed_weights_opt_per_nfeatures.shape)

# For every weight vector, collect the indices of features that are non-zero and not NaN.
feature_inds = [
    np.argwhere((weights != 0) & ~np.isnan(weights))
    for weights in stdnormed_weights_opt_per_nfeatures
]
n_solutions = len(feature_inds)
for ii, feature_ind in enumerate(feature_inds):
    # Columns: countdown label, overlap with the reference indices, number of selected features.
    print(n_solutions - ii, np.intersect1d(feature_ind, true_indices), len(feature_ind))