# UMAP of Atomic Descriptors

## Imports

In [None]:
from pathlib import Path
import re
from textwrap import wrap

from ase.io import read as ase_read
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import umap

from dadapy import FeatureWeighting

### Individual Frames

In [None]:
# Absolute path of the data directory, located one level above the current working directory.
data_dir = Path('../data').resolve()

In [None]:
# Read every configuration of the liquid data set as an ase.Atoms object.
liquid_frames = ase_read(data_dir.joinpath("ice_in_water_data/dataset_1000_eVAng.xyz"), index=':')
framewise_natoms = np.asarray([len(frame) for frame in liquid_frames], dtype=np.int32)
max_atoms = np.max(framewise_natoms)
n_atoms = np.sum(framewise_natoms, dtype=np.int32)
atom_types = np.zeros((n_atoms), dtype=np.int8)

# Collect per-atom metadata: the atomic number of every atom and, for every atom,
# its distances to all atoms of the same frame (one row per atom).
#
# The rows are padded with +inf (not 0): after the ascending sort below, padding
# of frames with fewer than `max_atoms` atoms ends up at the end of the row, so
# column 0 is always the self-distance and columns 1.. are the nearest neighbours.
# With zero padding the padding would be sorted in front of the real distances.
pairwise_distances = np.full((n_atoms, max_atoms), np.inf, dtype=np.float32)
counter = 0
for frame in liquid_frames:
    frame_rows = slice(counter, counter + len(frame))
    atom_types[frame_rows] = frame.get_atomic_numbers()
    # NOTE(review): get_all_distances() is called with its default arguments,
    # i.e. without the minimum image convention - confirm this is intended
    # if the frames are periodic.
    pairwise_distances[frame_rows, :len(frame)] = frame.get_all_distances()
    counter += len(frame)

pairwise_distances = np.sort(pairwise_distances, axis=-1)

# Everything that is not oxygen (Z=8) is counted as hydrogen.
is_o = atom_types == 8
is_h = np.logical_not(is_o)

print(f"Found {np.count_nonzero(is_o)} Oxygen atoms and {np.count_nonzero(is_h)} Hydrogen atoms.")

### Descriptors

In [None]:
# Load the precomputed SOAP and ACSF descriptors from file.
#
# The descriptor files were computed from an input file containing 54 solid
# configurations followed by 1000 liquid configurations. The descriptors of the
# liquid atoms are therefore the last `n_atoms` rows of the atomic descriptor
# matrices (`n_atoms` = total number of atoms in the liquid configurations).
average_soap = np.load(data_dir.joinpath("ice_in_water_data/average_soap_rcut6_nmax6_lmax6_sigma03.npy"))
atomic_soap = np.load(data_dir.joinpath("ice_in_water_data/singleatom_soap_rcut6_nmax6_lmax6_sigma03.npy"))
print("Fetched computed atomic SOAP descriptors for %u configurations and with %u features each."%atomic_soap.shape)
print("Fetched computed global SOAP descriptors for %u configurations and with %u features each."%average_soap.shape)
liquid_atomic_soap = atomic_soap[-n_atoms:, :].copy()

average_acsf = np.asarray(np.load(data_dir.joinpath("ice_in_water_data/average_acsf_rcut6_gridsearch_bohr_lambda.npy")), dtype=np.float32)
atomic_acsf = np.asarray(np.load(data_dir.joinpath("ice_in_water_data/singleatom_acsf_rcut6_gridsearch_bohr_lambda.npy")), dtype=np.float32)
liquid_atomic_acsf = atomic_acsf[-n_atoms:, :].copy()

# These three messages describe the ACSF arrays (they previously said "SOAP").
print("Fetched computed atomic ACSF descriptors for %u configurations and with %u features each."%atomic_acsf.shape)
print("Fetched computed liquid atomic ACSF descriptors for %u configurations and with %u features each."%liquid_atomic_acsf.shape)
print("Fetched computed global ACSF descriptors for %u configurations and with %u features each."%average_acsf.shape)

# Scale every descriptor vector (row) to unit Euclidean norm. The division is
# done in place, so the named arrays themselves are modified; the liquid arrays
# are independent copies and are normalised separately.
descriptors = [average_soap, atomic_soap, liquid_atomic_soap, average_acsf, atomic_acsf, liquid_atomic_acsf]
for desc in descriptors:
    desc /= np.linalg.norm(desc, axis=-1)[:, np.newaxis]

# Atomic ACSF rows sometimes contain NaN (e.g. a row of zeros divided by its
# zero norm); every row containing a NaN is set to 0 as a whole.
nan_frames = np.argwhere(np.isnan(atomic_acsf))[:, 0]
print("Removing %u nan frames in atomic acsf"%(len(np.unique(nan_frames))))
atomic_acsf[nan_frames, :] = 0.

nan_frames = np.argwhere(np.isnan(liquid_atomic_acsf))[:, 0]
print("Removing %u nan frames in liquid atomic acsf"%(len(np.unique(nan_frames))))
liquid_atomic_acsf[nan_frames, :] = 0.

In [None]:
# Build one human-readable label per ACSF feature. The labels are generated in
# this order: for each species a G1 label followed by all G2 labels, then the
# G4 labels for every species pair with species2 >= species1.
#
# ndmin=2 keeps the parameter tables two-dimensional even when a file holds a
# single parameter row; otherwise np.loadtxt returns a 1-D array and the row
# iteration below would yield scalars.
acsf_g2 = np.loadtxt(data_dir.joinpath("ice_in_water_data/g2_params_gridsearch_bohr_lambda.txt"), ndmin=2)
acsf_g4 = np.loadtxt(data_dir.joinpath("ice_in_water_data/g4_params_gridsearch_bohr_lambda.txt"), ndmin=2)
acsf_species = [1, 8]
acsf_symbols = ['H', 'O']
acsf_labels = []

# Factor applied to every eta before printing.
# NOTE(review): 1.8897259886 is the number of Bohr radii per Angstrom, so this
# looks like a unit conversion of eta - confirm the intended direction.
eta_scale = (1./1.8897259886)**2.

for symbol in acsf_symbols:
    acsf_labels.append(f"G1 to {symbol}")
    for g2_param in acsf_g2:
        acsf_labels.append(f"G2 to {symbol}, R0={g2_param[1]:.3E}, eta={g2_param[0]*eta_scale:.3E}")
for ii_spec1, species1 in enumerate(acsf_species):
    for ii_spec2, species2 in enumerate(acsf_species):
        # Only unordered pairs: skip (species1, species2) if already covered as (species2, species1).
        if species2 < species1:
            continue
        for g4_param in acsf_g4:
            acsf_labels.append(f"G4 with species {acsf_symbols[ii_spec1]} and {acsf_symbols[ii_spec2]}, eta={g4_param[0]*eta_scale:.3E}, zeta={g4_param[1]:.3E}, lambda={g4_param[2]:.3E}")

# Total number of generated labels (kept under the name used so far).
counter = len(acsf_labels)

### Forces and Energies

In [None]:
# Read the raw text of the liquid data set; the `with` block closes the file.
with open(data_dir.joinpath("ice_in_water_data/dataset_1000_eVAng.xyz"), 'r') as f:
    file_content = f.read()

# An atom line is matched as: one ASCII letter, then six decimal numbers
# separated by optional whitespace. Columns 3-5 of each match are kept as the
# forces (the first three numbers are presumably the positions - confirm
# against the file header).
# Raw strings avoid invalid escape sequences, and [A-Za-z] replaces [A-z],
# which additionally matched the characters [ \ ] ^ _ `.
# NOTE(review): numbers in exponent notation (e.g. 1.0e-3) are not matched.
float_group = r"(-?[0-9]*\.[0-9]*)"
atom_line_pattern = re.compile(r"[A-Za-z]" + 6 * (r"[\s]*" + float_group))
liquid_forces = np.asarray(
    atom_line_pattern.findall(file_content),
    dtype=np.float32
).reshape(-1, 6)[:, 3:]

# Extract the (negative) total energies from the metadata of each frame.
energy_pattern = re.compile(r"(TotEnergy=)(\-[0-9]*\.[0-9]*)")
liquid_energies = np.asarray([float(energy) for buff, energy in energy_pattern.findall(file_content)], dtype=np.float32)

# Alternatively this file also has the volumes; keep the rows of the last
# len(liquid_frames) entries.
liquid_energies_volumes = np.loadtxt(data_dir.joinpath("ice_in_water_data/all_volume_energies.txt"))[-len(liquid_frames):]
# Replicate the per-frame row once for every atom of that frame, so that the
# result is aligned with the per-atom arrays.
system_liquid_energies_volumes = np.repeat(
    liquid_energies_volumes,
    [len(frame) for frame in liquid_frames],
    axis=0
)

### ACSFs

In [None]:
# Load the stored arrays of the water phase analysis. The names on the left
# are bound, in order, to the files listed on the right.
(
    kernel_imbs,
    lasso_gammas,
    classic_greedy_imbalances,
    classic_delete_indices,
    greedy_gammas,
    greedy_imbs,
) = (
    np.load(data_dir.joinpath(stored_file))
    for stored_file in (
        "water_phase_store/kernel_imbs_hartbohr_lambda.npy",
        "water_phase_store/lasso_gammas_hartbohr_lambda.npy",
        "water_phase_store/classic_greedy_imbalances_hartbohr_lambda.npy",
        "water_phase_store/classic_delete_indices_hartbohr_lambda.npy",
        "water_phase_store/greedy_gammas_hartbohr_lambda.npy",
        "water_phase_store/greedy_imbs_hartbohr_lambda.npy",
    )
)

In [None]:
# Plot the stored kernel imbalances, skipping entries that are exactly 0.
where_gammas_found = kernel_imbs != 0.

fig, ax = plt.subplots()
ax.plot(np.arange(len(kernel_imbs))[where_gammas_found], kernel_imbs[where_gammas_found], 'o', label='Kernel')
ax.set_xlim((0, 30))
ax.set_xlabel(r"$n_{ACSF}$")
ax.set_ylabel("DII")
ax.legend()

# Pick row `n_acsfs` of the stored weights and keep its non-zero entries.
n_acsfs = 3
gammas_row = lasso_gammas[n_acsfs, :]
which_gammas = gammas_row != 0.
# Indices of the selected weights, sorted by magnitude of gamma, descending.
# np.flatnonzero always returns a 1-D index array; argwhere(...).squeeze()
# collapses to a 0-d array when exactly one weight is selected.
selected_indices = np.flatnonzero(which_gammas)
gamma_indices = selected_indices[np.argsort(np.abs(gammas_row[selected_indices]))[::-1]]
# Selected ACSF columns of all liquid atoms, scaled by their weights.
plot_acsfs = liquid_atomic_acsf[:, gamma_indices]*lasso_gammas[n_acsfs, gamma_indices]
for gind in gamma_indices:
    print(acsf_labels[gind])

## Make UMAP

In [None]:
# 2-D embedding of the atomic SOAP vectors of every 100th liquid atom.
# A fixed random_state makes the stochastic embedding reproducible across
# kernel restarts.
reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
selector = np.s_[::100]
embedding = reducer.fit_transform(liquid_atomic_soap[selector])

# Optional: replace the UMAP embedding by a t-SNE embedding of the same data.
make_tsne = False
if make_tsne:
    # Imported here so that scikit-learn is only required when t-SNE is requested.
    from sklearn.manifold import TSNE

    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
    # confirm against the installed version.
    tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
    embedding = tsne.fit_transform(liquid_atomic_soap[selector])

In [None]:
# 2x3 figure of the SOAP embedding. Top row: coloured by force magnitude,
# column 0 and column 1 of the per-atom energy/volume table. Bottom row:
# coloured by the three weighted ACSFs with the largest weights.
fig, ax = plt.subplots(2, 3, figsize=(11, 6))

# Name the method that actually produced `embedding` (the title used to say
# "tSNE" even when the UMAP embedding is shown).
fig.suptitle("%s of Atomic SOAP"%("tSNE" if make_tsne else "UMAP"))

cmap = plt.get_cmap("viridis")
top_row_panels = (
    (np.linalg.norm(liquid_forces, axis=-1), r"$|F_{at}|$ [eV/A]"),
    (system_liquid_energies_volumes[:, 0], r"$E_{sys}$ [eV/molecule]"),
    (system_liquid_energies_volumes[:, 1], r"$V_{sys}$ [A$^3$/atom]"),
)
for panel, (values, colorbar_label) in zip(ax[0], top_row_panels):
    # The colour scale spans all atoms, while only the selected atoms are drawn.
    norm = plt.Normalize(np.min(values), np.max(values))
    panel.scatter(embedding[:, 0], embedding[:, 1], c=values[selector], cmap=cmap, norm=norm, alpha=0.5)
    fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap), ax=panel, label=colorbar_label)

for rank, panel in enumerate(ax[1]):
    panel.set_title("ACSF: %s"%'\n'.join(wrap(acsf_labels[gamma_indices[rank]], 20)))
    sc = panel.scatter(embedding[:, 0], embedding[:, 1], c=plot_acsfs[selector, rank], cmap='viridis', alpha=0.5)
    fig.colorbar(sc, ax=panel, label=r"acsf_${%u}$"%(rank + 1))

plt.tight_layout()
fig.savefig("embedding_atomic_soap_acsf.png", dpi=300, format='png', bbox_inches='tight')

plt.show()

In [None]:
# 2-D embedding of the sorted pairwise distances of every 100th liquid atom.
# Column 0 of the sorted distances is skipped; columns 1..n_neighs are used.
# A fixed random_state makes the stochastic embedding reproducible.
reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
selector = np.s_[::100]
n_neighs = 6
embedding = reducer.fit_transform(pairwise_distances[selector, 1:n_neighs+1])

In [None]:
# 2x3 figure of the pairwise-distance embedding. Top row: coloured by force
# magnitude, column 0 and column 1 of the per-atom energy/volume table.
# Bottom row: coloured by the first three weighted ACSFs.
fig, ax = plt.subplots(2, 3, figsize=(11, 6))

fig.suptitle("UMAP of Pairwise Distances (n_neighs=%u)"%n_neighs)

top_row_panels = (
    (np.linalg.norm(liquid_forces, axis=-1), r"$|F_{at}|$ [eV/A]"),
    (system_liquid_energies_volumes[:, 0], r"$E_{sys}$ [eV/molecule]"),
    (system_liquid_energies_volumes[:, 1], r"$V_{sys}$ [A$^3$/atom]"),
)
for panel, (values, colorbar_label) in zip(ax[0], top_row_panels):
    cmap = plt.get_cmap("viridis")
    # Colour limits are taken over all atoms; only the selected atoms are drawn.
    norm = plt.Normalize(np.min(values), np.max(values))
    panel.scatter(embedding[:, 0], embedding[:, 1], c=values[selector], cmap=cmap, norm=norm, alpha=0.5)
    fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap), ax=panel, label=colorbar_label)

for rank, panel in enumerate(ax[1]):
    wrapped_label = '\n'.join(wrap(acsf_labels[gamma_indices[rank]], 20))
    panel.set_title("ACSF: %s"%wrapped_label)
    sc = panel.scatter(embedding[:, 0], embedding[:, 1], c=plot_acsfs[selector, rank], cmap='viridis', alpha=0.5)
    fig.colorbar(sc, ax=panel, label=r"acsf_${%u}$"%(rank + 1))

plt.tight_layout()

## Fit ACSF to UMAP

In [None]:
# 2-D embedding of the atomic SOAP vectors of every 100th oxygen atom; this is
# the target space for the feature selection below.
# A fixed random_state makes the stochastic embedding reproducible.
reducer = umap.UMAP(n_components=2, n_neighbors=7, min_dist=0.1, metric='euclidean', random_state=42)
selector = np.s_[::100]
target_embedding = reducer.fit_transform(liquid_atomic_soap[is_o][selector])
print(len(target_embedding))

fig, ax = plt.subplots()
ax.scatter(target_embedding[:, 0], target_embedding[:, 1], alpha=0.2, c='k', label='Oxygen')
ax.set_xlabel(r'umap$_1$')
ax.set_ylabel(r'umap$_2$')
# Show the 'Oxygen' label, which was set but never displayed.
ax.legend()

In [None]:
# 'save_new': run the optimisation below and store its results (next cell).
# 'load_old': skip the optimisation and load previously stored results.
mode='load_old'
# ACSF vectors of the same oxygen atoms that were embedded into `target_embedding`.
input_space = liquid_atomic_acsf[is_o][selector]

if mode=='save_new':

    # Optional standardisation of the input space (disabled):
    # cur_std = np.std(liquid_atomic_acsf[is_o], axis=0)
    # cur_std[cur_std<1e-6] = 1.
    # input_space = (input_space - np.mean(liquid_atomic_acsf[is_o], axis=0))/cur_std

    # Seeded generator so that the random training subset is reproducible.
    rng = np.random.default_rng(42)
    # Draw at most 200 points without replacement; the size is capped because
    # sampling more points than available without replacement raises.
    t_size = min(200, len(target_embedding))
    t_indices = rng.choice(np.arange(len(target_embedding)), size=t_size, replace=False)

    initial_gammas = np.ones((input_space.shape[-1], ), dtype=np.double)
    # NOTE(review): lr_list is not passed to the call below (learning_rate=None).
    lr_list = np.logspace(-2, 2, 10)

    num_nonzero_features_umap, l1_penalties_opt_per_nfeatures_umap, kernel_imbs_umap, lasso_gammas_umap = FeatureWeighting(
        coordinates=input_space[t_indices, :], maxk=t_size-1, verbose=True
    ).return_lasso_optimization_dii_search(
        target_data=FeatureWeighting(coordinates=target_embedding[t_indices, :]),
        initial_weights=initial_gammas, learning_rate=None, n_epochs=100, 
        constrain=True, decaying_lr=True, refine=False
    )

    # Reverse the order of all returned arrays along their first axis.
    num_nonzero_features_umap = num_nonzero_features_umap[::-1]
    l1_penalties_opt_per_nfeatures_umap = l1_penalties_opt_per_nfeatures_umap[::-1]
    kernel_imbs_umap = kernel_imbs_umap[::-1]
    lasso_gammas_umap = lasso_gammas_umap[::-1]

In [None]:
# Either store the freshly computed results or load previously stored ones.
if mode=='save_new':
    np.save(data_dir.joinpath('water_phase_store/kernel_imbs_umap.npy'), kernel_imbs_umap)
    np.save(data_dir.joinpath('water_phase_store/lasso_gammas_umap.npy'), lasso_gammas_umap)
elif mode=='load_old':
    kernel_imbs_umap = np.load(data_dir.joinpath('water_phase_store/kernel_imbs_umap.npy'))
    lasso_gammas_umap = np.load(data_dir.joinpath('water_phase_store/lasso_gammas_umap.npy'))
else:
    # Fail with a clear message instead of a NameError on the line below.
    raise ValueError("mode must be 'save_new' or 'load_old', got %r"%(mode,))
# Entries of the imbalance array that are not NaN.
where_gammas_umap = np.logical_not(np.isnan(kernel_imbs_umap))

In [None]:
# Row `n_acsfs_umap` of the stored weights; all non-NaN entries are kept.
n_acsfs_umap = 2
gammas_row_umap = lasso_gammas_umap[n_acsfs_umap, :]
which_gammas_umap = np.logical_not(np.isnan(gammas_row_umap))
# Indices of the kept weights, sorted by magnitude of gamma, descending.
# np.flatnonzero always yields a 1-D index array, also for a single entry.
selected_indices_umap = np.flatnonzero(which_gammas_umap)
gamma_indices_umap = selected_indices_umap[np.argsort(np.abs(gammas_row_umap[selected_indices_umap]))[::-1]]

# Boolean masking copies the arrays, so the oxygen subsets are built only once.
oxygen_acsf = liquid_atomic_acsf[is_o][selector]
oxygen_volumes = system_liquid_energies_volumes[is_o][selector][:, 1]

fig = plt.figure(figsize=(11, 6))

# Top panel (spanning both columns): imbalance values over their index.
dii_ax = fig.add_subplot(2, 2, (1, 2))
dii_ax.plot(np.arange(len(kernel_imbs_umap))[where_gammas_umap], kernel_imbs_umap[where_gammas_umap], 'o', label='UMAP -> ACSF')
dii_ax.set_xlim((0, 50))
dii_ax.legend()
dii_ax.set_ylabel("DII")
dii_ax.set_xlabel(r"$n_{ACSF}$")

# Bottom left: the target embedding, coloured by column 1 of the energy/volume table.
umap_ax = fig.add_subplot(2, 2, 3)
sc = umap_ax.scatter(
    target_embedding[:, 0], 
    target_embedding[:, 1],
    c=oxygen_volumes,
    alpha=1
)
fig.colorbar(sc, ax=umap_ax, label=r"$V_{sys}$ [A$^3$/atom]")
umap_ax.set_xlabel(r'umap$_1$')
umap_ax.set_ylabel(r'umap$_2$')

# Bottom right: the two weighted ACSFs with the largest weights, same colouring.
acsf_ax = fig.add_subplot(2, 2, 4)
acsf_ax.scatter(
    oxygen_acsf[:, gamma_indices_umap[0]]*lasso_gammas_umap[n_acsfs_umap, gamma_indices_umap[0]],
    oxygen_acsf[:, gamma_indices_umap[1]]*lasso_gammas_umap[n_acsfs_umap, gamma_indices_umap[1]],
    c=oxygen_volumes,
    alpha=1
)
acsf_ax.set_xlabel(r'acsf$_1$')
acsf_ax.set_ylabel(r'acsf$_2$')

plt.tight_layout()

fig.savefig("dii_umap_soap.pdf", format='pdf', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# Row `n_acsfs_umap` of the stored weights; entries different from 0 are kept.
n_acsfs_umap = 6
gammas_row_umap = lasso_gammas_umap[n_acsfs_umap, :]
which_gammas_umap = gammas_row_umap!=0.
# Indices of the kept weights, sorted by magnitude of gamma, descending.
# np.flatnonzero always yields a 1-D index array, also for a single entry.
selected_indices_umap = np.flatnonzero(which_gammas_umap)
gamma_indices_umap = selected_indices_umap[np.argsort(np.abs(gammas_row_umap[selected_indices_umap]))[::-1]]
# NOTE: this rebinds `plot_acsfs` to the selected oxygen atoms only. The
# earlier embedding figures index `plot_acsfs` with `selector` over all liquid
# atoms, so the ACSFs cell has to be re-run before re-running those figures.
plot_acsfs = liquid_atomic_acsf[is_o][selector, gamma_indices_umap]*lasso_gammas_umap[n_acsfs_umap, gamma_indices_umap]

fig, ax = plt.subplots(2, 3, figsize=(10, 6))

# One panel per weighted ACSF (row-major order), each colouring the target
# embedding by that ACSF. The figure has six panels, so six features are needed.
for rank, panel in enumerate(ax.flat):
    panel.set_title("ACSF: %s"%'\n'.join(wrap(acsf_labels[gamma_indices_umap[rank]], 20)))
    sc = panel.scatter(target_embedding[:, 0], target_embedding[:, 1], c=plot_acsfs[:, rank], cmap='viridis', alpha=0.5)
    fig.colorbar(sc, ax=panel, label=r"acsf_${%u}$"%(rank + 1))

plt.tight_layout()

fig.savefig("umap_by_acsf_n%u.pdf"%n_acsfs_umap, format='pdf', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
%matplotlib inline

n_acsfs_umap = 3
which_gammas_umap = lasso_gammas_umap[n_acsfs_umap, :]!=0.
# get indices and sort by magnitude of gamma, descending
gamma_indices_umap = np.argwhere(which_gammas_umap).squeeze()[np.argsort(np.abs(lasso_gammas_umap[n_acsfs_umap, which_gammas_umap]))[::-1]]
plot_acsfs = liquid_atomic_acsf[is_o][selector, gamma_indices_umap]*lasso_gammas_umap[n_acsfs_umap, gamma_indices_umap]

fig = plt.figure(figsize=(11, 6))
umap_one_ax = fig.add_subplot(121, projection='3d')

sc = umap_one_ax.scatter(
    plot_acsfs[:, 0], plot_acsfs[:, 1], plot_acsfs[:, 2],
    c=target_embedding[:, 0], cmap='viridis'
)
fig.colorbar(sc, ax=umap_one_ax, label=r'umap$_1$', shrink=0.5)

umap_two_ax = fig.add_subplot(122, projection='3d')
sc = umap_two_ax.scatter(
    plot_acsfs[:, 0], plot_acsfs[:, 1], plot_acsfs[:, 2],
    c=target_embedding[:, 1], cmap='viridis'
)
fig.colorbar(sc, ax=umap_two_ax, label=r'umap$_2$', shrink=0.5)

plt.tight_layout()