In [2]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
from ase.io import read as ase_read
import umap

from dadapy.feature_weighting import FeatureWeighting

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

# Load

In [4]:
# Absolute path of the data directory that sits next to the notebook's folder.
data_dir = pathlib.Path("..", "data").resolve()

In [None]:
# Get ase.Atoms objects for each liquid configuration
liquid_frames = ase_read(data_dir.joinpath("ice_in_water_data/dataset_1000_eVAng.xyz"), index=':')
if not liquid_frames:
    # With zero frames the `[-0:]` slices below would silently select *all* rows.
    raise ValueError("No liquid configurations were read from dataset_1000_eVAng.xyz")

# Collect some metadata: atoms per configuration, atoms in total and which atoms are oxygen.
# int64 counts: a narrow dtype such as int16 would wrap for frames with more than 32767 atoms.
atoms_per_frame = np.asarray([len(frame) for frame in liquid_frames], dtype=np.int64)
n_atoms = int(atoms_per_frame.sum())

# Atomic numbers of every atom, concatenated in frame order.
atom_types = np.concatenate(
    [frame.get_atomic_numbers() for frame in liquid_frames]
).astype(np.int8)
is_o = atom_types == 8
# Every atom that is not oxygen is treated as hydrogen.
is_h = np.logical_not(is_o)

# Per-configuration values from all_volume_energies.txt (going by the file name,
# volumes and energies - TODO confirm the column order). The liquid configurations
# are taken to be the last len(liquid_frames) rows of that file.
liquid_energies_volumes = np.loadtxt(data_dir.joinpath("ice_in_water_data/all_volume_energies.txt"))[-len(liquid_frames):]
if len(liquid_energies_volumes) != len(liquid_frames):
    raise ValueError(
        f"Expected {len(liquid_frames)} rows in all_volume_energies.txt, "
        f"got {len(liquid_energies_volumes)}"
    )

# Broadcast the per-configuration rows to per-atom rows: the row of frame i is
# repeated len(frame_i) times, in frame order, so it lines up with atom_types.
system_liquid_energies_volumes = np.repeat(liquid_energies_volumes, atoms_per_frame, axis=0)

In [None]:
# Sanity check: the oxygen mask has one entry per atom, i.e. shape (n_atoms,).
is_o.shape

In [None]:
soap_path = data_dir.joinpath("ice_in_water_data/singleatom_soap_rcut6_nmax6_lmax6_sigma03.npy")
atomic_soap = np.load(soap_path)

# The input file the descriptors were calculated from holds 54 solid and 1000 liquid
# configurations, so the liquid atoms are the last n_atoms rows of the descriptor matrix.
# Boolean indexing with is_o keeps the oxygen rows only and returns a new array,
# so liquid_atomic_soap does not share memory with atomic_soap.
liquid_atomic_soap = atomic_soap[-n_atoms:, :][is_o]

In [None]:
acsf_path = data_dir.joinpath("ice_in_water_data/singleatom_acsf_rcut6_gridsearch_bohr_lambda.npy")
# Load the ACSF descriptors as float32.
atomic_acsf = np.asarray(np.load(acsf_path), dtype=np.float32)

# Last n_atoms rows = liquid atoms; the boolean mask then keeps the oxygen rows
# and yields an array that is independent of atomic_acsf.
liquid_atomic_acsf = atomic_acsf[-n_atoms:, :][is_o]

In [None]:
# Shape of the oxygen-only liquid ACSF matrix (rows selected with is_o).
liquid_atomic_acsf.shape

In [None]:
# Scale every descriptor row to unit Euclidean norm, in place.
descriptors = [atomic_soap, liquid_atomic_soap, atomic_acsf, liquid_atomic_acsf]
for desc in descriptors:
    norms = np.linalg.norm(desc, axis=-1, keepdims=True)
    # Divide only rows with a non-zero norm. All-zero rows are left untouched
    # (they stay zero) instead of turning into NaN through 0/0.
    np.divide(desc, norms, out=desc, where=norms > 0)
atomic_soap, liquid_atomic_soap, atomic_acsf, liquid_atomic_acsf = descriptors

# Diagnostics: which ACSF rows are identically zero (these could not be normalised)?
zero_norm_rows = np.flatnonzero(np.all(atomic_acsf == 0., axis=-1))
print(zero_norm_rows.size)
print(zero_norm_rows)
if zero_norm_rows.size:
    # Show one example instead of relying on a hard-coded row index.
    print(atomic_acsf[zero_norm_rows[0]])

# Columns that are zero for every liquid oxygen atom.
print(np.argwhere(np.all(liquid_atomic_acsf == 0., axis=0)))

# Safety net for NaNs that were already present in the loaded ACSF file.
atomic_acsf[np.isnan(atomic_acsf)] = 0.
liquid_atomic_acsf[np.isnan(liquid_atomic_acsf)] = 0.

In [None]:
# Subsample: keep every 167th oxygen environment from both descriptor sets,
# using one shared slice so the two subsets stay row-aligned.
subsample = slice(None, None, 167)
soap_cut = liquid_atomic_soap[subsample]
acsf_cut = liquid_atomic_acsf[subsample]
acsf_cut.shape

In [None]:
# Visual check of the subsampled SOAP descriptors.
soap_cut

In [None]:
# True if any entry of the subsampled SOAP matrix is NaN; soap_cut is the input
# of the UMAP fit further down, so this is checked beforehand.
np.isnan(soap_cut).any()

# UMAP

In [None]:
# UMAP is stochastic: fix the seed so the 2D embedding (and everything derived
# from it further down) is identical after Restart & Run All.
UMAP_SEED = 42
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    metric='euclidean',
    random_state=UMAP_SEED,
)
# 2D embedding of the subsampled oxygen SOAP descriptors, one row per sample.
embedding = reducer.fit_transform(soap_cut)

In [None]:
umap=embedding

In [None]:
print(system_liquid_energies_volumes.shape)

In [None]:
plt.scatter(umap[:,0], umap[:,1], c=system_liquid_energies_volumes[is_o][::167, 1], cmap='viridis')

### There are some 0 columns in the ACSFs. drop them

In [None]:
zero_columns = np.where(np.all(acsf_cut == 0, axis=0))
print(np.where(np.all(liquid_atomic_acsf == 0., axis=0)))
acsf_cut = np.delete(acsf_cut, zero_columns, axis=1)

# DII

In [None]:
n_epochs = 70  # number of training epochs
l1s=[0 ,0.1, 10, 1000, 100000]

f = FeatureWeighting(coordinates=acsf_cut, verbose=True)
f_target = FeatureWeighting(coordinates=umap)

(
    num_nonzero_features,
    l1_penalties_opt_per_nfeatures,
    dii_opt_per_nfeatures,
    weights_opt_per_nfeatures,
) = f.return_lasso_optimization_dii_search(
    target_data=f_target,
    # initial_weights=None,  # (default) set automatically
    n_epochs=n_epochs,
    l1_penalties=l1s,
    learning_rate=None,  # (default) set automatically
    refine=False,  # only 10 values of the L1 strength are tested
    plotlasso=True,  # automatically show DII vs number of non-zero features
)

In [None]:
n_epochs = 70  # number of training epochs
l1s=[10000000, 1000000000, 100000000000]

f = FeatureWeighting(coordinates=acsf_cut, verbose=True)
f_target = FeatureWeighting(coordinates=umap)

(
    num_nonzero_features,
    l1_penalties_opt_per_nfeatures,
    dii_opt_per_nfeatures,
    weights_opt_per_nfeatures,
) = f.return_lasso_optimization_dii_search(
    target_data=f_target,
    initial_weights=None,  # (default) set automatically
    n_epochs=n_epochs,
    l1_penalties=l1s,
    learning_rate=None,  # (default) set automatically
    refine=False,  # only 10 values of the L1 strength are tested
    plotlasso=True,  # automatically show DII vs number of non-zero features
)

In [None]:
n_epochs = 70  # number of training epochs

# Backward greedy DII feature elimination on `f` with `f_target` as target space.
greedy_kwargs = dict(
    target_data=f_target,
    initial_weights=None,  # set automatically (default)
    n_epochs=n_epochs,
    learning_rate=None,  # set automatically (default)
)
final_imbs, final_weights = f.return_backward_greedy_dii_elimination(**greedy_kwargs)

In [None]:
# Shape of the DII values returned by the backward greedy elimination.
final_imbs.shape

In [None]:
# Persist the greedy-elimination results. np.savetxt separates columns with a
# space by default; pass a comma explicitly so the content matches the .csv
# extension (read back with np.loadtxt(..., delimiter=",")).
np.savetxt("final_imbs_nohydrogen.csv", final_imbs, delimiter=",")
np.savetxt("final_weights_nohydrogen.csv", final_weights, delimiter=",")

In [None]:
fig, ax = plt.subplots()
# x counts down from len(final_imbs) to 1, so final_imbs[0] is drawn at the
# largest x value - presumably the number of features still retained at each
# elimination step (TODO confirm against the dadapy documentation).
ax.plot(np.arange(len(final_imbs), 0, -1), final_imbs, ".-")
ax.set(xlabel="Number of retained features", ylabel="DII");