# Classifying Defect Sites on TiO Surfaces

## Imports

Get packages needed for computation and load trajectory from `.lammpstrj` file.

Some of these packages might not already be part of your Python installation.
This notebook requires the [ASE](https://wiki.fysik.dtu.dk/ase/), [dscribe](https://singroup.github.io/dscribe/latest/) and [sklearn](https://scikit-learn.org/stable/index.html) Python libraries for neighbor lists, trajectory handling, and the building and analysis of atomic descriptors; make sure they are installed.

In the future, or alternatively, [ASAP](https://github.com/BingqingCheng/ASAP) could be used to handle the building and analysis part.

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

from ase.neighborlist import NeighborList, natural_cutoffs
from ase.io import read as ase_read, write as ase_write
from dscribe.descriptors import SOAP
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE

In [None]:
# Trajectory dump analysed throughout this notebook.
trajectory_path = "defect_test_out.lammpstrj"

# The ':100' slice string restricts the read to at most the first 100 frames.
trajectory = ase_read(filename=trajectory_path, index=':100')

Get indices of all individual elements for later use.

In [None]:
# Collect the atom indices of every element in the first snapshot.
ti_indices, o_indices, h_indices = [], [], []
indices_by_symbol = {'Ti': ti_indices, 'H': h_indices, 'O': o_indices}

for ii_symbol, symbol in enumerate(trajectory[0].get_chemical_symbols()):
    if symbol not in indices_by_symbol:
        # f-string, so the offending element and its index are actually
        # interpolated into the message instead of printed as literal braces.
        raise ValueError(f"Undefined element {symbol} found in initial snapshot at index {ii_symbol}.")
    indices_by_symbol[symbol].append(ii_symbol)


## Get Atomic Configurations via Neighbor Lists

Use the `ase.neighborlist` module to gain a rough idea of the atomic configuration of each oxygen atom.

In [None]:
# Neighbour list for the first snapshot, built from ASE's natural cutoffs
# (unscaled, mult=1) and listing every pair in both directions.
bond_cutoffs = natural_cutoffs(trajectory[0], mult=1.)
neighbor_list = NeighborList(cutoffs=bond_cutoffs, self_interaction=False, bothways=True)
neighbor_list.update(trajectory[0])

# For every oxygen: its own index followed by the indices of its neighbours.
o_neighbors = [
    np.append([o_index], neighbor_list.get_neighbors(o_index)[0])
    for o_index in o_indices
]

In [None]:
# Label each oxygen with the Hill-notation formula of itself plus its neighbours.
o_symbols = [
    trajectory[0][o_neighbor].get_chemical_formula(mode='hill')
    for o_neighbor in o_neighbors
]

# Tally how often each local configuration occurs and show the result.
configs = pd.Series(o_symbols).value_counts()
print(configs)

## Get Atomic Configurations via Neighbor Lists

Use the `ase.neighborlist` module to gain a rough idea of the atomic configuration of each oxygen atom.

In [None]:
# Neighbour list for the first snapshot, built from ASE's natural cutoffs
# (unscaled, mult=1) and listing every pair in both directions.
bond_cutoffs = natural_cutoffs(trajectory[0], mult=1.)
neighbor_list = NeighborList(cutoffs=bond_cutoffs, self_interaction=False, bothways=True)
neighbor_list.update(trajectory[0])

# For every oxygen: its own index followed by the indices of its neighbours.
o_neighbors = [
    np.append([o_index], neighbor_list.get_neighbors(o_index)[0])
    for o_index in o_indices
]

In [None]:
# Label each oxygen with the Hill-notation formula of itself plus its neighbours.
o_symbols = [
    trajectory[0][o_neighbor].get_chemical_formula(mode='hill')
    for o_neighbor in o_neighbors
]

# Tally how often each local configuration occurs and show the result.
configs = pd.Series(o_symbols).value_counts()
print(configs)

## Building Atomic Descriptors

Currently `r_cut` is set to an arbitrary value; in the future it would make sense to make an educated guess.
Perhaps some multiple of the mean nearest-neighbour distance?
The number of basis functions, `n_max` and `l_max`, is also chosen arbitrarily for now.

In [None]:
# An oxygen counts as "bulk" here when its local configuration contains Ti.
is_bulk = np.array(['Ti' in o_symbol for o_symbol in o_symbols], dtype=bool)
bulk_symbols = [o_symbol for o_symbol, bulk in zip(o_symbols, is_bulk) if bulk]

# Periodic to true if SOAP should respect the periodicity in ASE.Atoms
soap = SOAP(
    r_cut=5.,
    n_max=12,
    l_max=12,
    sigma=0.4,
    species=['Ti', 'H', 'O'],
    periodic=True,
)

# One descriptor row per oxygen atom of the first snapshot.
soaps = soap.create(trajectory[0], positions=o_indices)
print(soaps.shape)

## Analyse Atomic Descriptors

Use kernel PCA or PCA to build a low-dimensional representation of the SOAP descriptors.
Ideally, the defect site should show up separately in this representation.

In [None]:
# All oxygen: fit three 2D embeddings on every oxygen descriptor.
pca = PCA(n_components=2)
reduction = pca.fit_transform(soaps)
kernel_pca = KernelPCA(n_components=2, kernel='rbf')
kernel_reduction = kernel_pca.fit_transform(soaps)
tsne = TSNE(n_components=2, perplexity=30)
tsne_reduction = tsne.fit_transform(soaps)

# Only Ti bonded: the same three embeddings, fitted on the bulk subset alone.
bulk_pca = PCA(n_components=2)
bulk_reduction = bulk_pca.fit_transform(soaps[is_bulk])
# Use the same RBF kernel as for all oxygens; KernelPCA's default kernel is
# linear, which would make this embedding equivalent to the plain PCA above.
bulk_kernel_pca = KernelPCA(n_components=2, kernel='rbf')
bulk_kernel_reduction = bulk_kernel_pca.fit_transform(soaps[is_bulk])
bulk_tsne = TSNE(n_components=2, perplexity=30)
bulk_tsne_reduction = bulk_tsne.fit_transform(soaps[is_bulk])

In [None]:
# Indices of the oxygen atoms around the defect site
defect_inds = np.array([69, 68, 128, 63], dtype=np.int16)
defect_soaps = soap.create(trajectory[0], positions=defect_inds)

# Project the defect descriptors with the already fitted models.
# PCA: model fitted on all oxygen, then the one fitted on Ti-bonded oxygen only.
defect_reduction = pca.transform(defect_soaps)
bulk_defect_reduction = bulk_pca.transform(defect_soaps)

# Kernel PCA: same order (all oxygen, then Ti-bonded only).
defect_kernel_reduction = kernel_pca.transform(defect_soaps)
bulk_defect_kernel_reduction = bulk_kernel_pca.transform(defect_soaps)

In [None]:
def draw_2d_scatter(fig, ax, reduction, color_dict, o_symbols, special_inds=None):
    """Scatter a 2D embedding on ``ax``, coloured by configuration label.

    Parameters
    ----------
    fig, ax
        Figure and axes to draw on; both are returned unchanged.
    reduction
        Array indexed as ``reduction[:, 0]`` / ``reduction[:, 1]`` giving
        the two plotted coordinates of every point.
    color_dict
        Mapping from configuration label to the colour used for it.
    o_symbols
        Configuration label of every point, in the same order as the rows
        of ``reduction``.
    special_inds
        Currently unused.

    Returns
    -------
    tuple
        ``(fig, ax)``.
    """
    point_colors = list(map(color_dict.__getitem__, o_symbols))
    first_component = reduction[:, 0]
    second_component = reduction[:, 1]
    ax.scatter(first_component, second_component, c=point_colors)
    return fig, ax

# 2x3 overview: top row uses all oxygen, bottom row only Ti-bonded oxygen;
# columns are PCA, kernel PCA and tSNE.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(8, 8))

# One discrete colour per oxygen configuration. `plt.get_cmap` is used because
# `plt.cm.get_cmap` was deprecated in matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('tab20', len(configs))
# Bin edges at half-integers so that integer i maps onto the i-th colour.
norm = plt.Normalize(vmin=-0.5, vmax=len(configs)-0.5)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

color_dict = {config: sm.to_rgba(ii_config) for ii_config, config in enumerate(configs.keys())}
# Per-atom colours for all oxygen (draw_2d_scatter derives the same list itself).
colors = [color_dict[config] for config in o_symbols]

draw_2d_scatter(fig, axes[0, 0], reduction, color_dict, o_symbols)
draw_2d_scatter(fig, axes[0, 1], kernel_reduction, color_dict, o_symbols)
draw_2d_scatter(fig, axes[0, 2], tsne_reduction, color_dict, o_symbols)
draw_2d_scatter(fig, axes[1, 0], bulk_reduction, color_dict, bulk_symbols)
draw_2d_scatter(fig, axes[1, 1], bulk_kernel_reduction, color_dict, bulk_symbols)
draw_2d_scatter(fig, axes[1, 2], bulk_tsne_reduction, color_dict, bulk_symbols)

# Overlay the defect-site oxygens in red; only the PCA and kernel PCA panels
# get an overlay, using the projections computed for the defect descriptors.
axes[0, 0].scatter(defect_reduction[:, 0], defect_reduction[:, 1], color='r')
axes[0, 1].scatter(defect_kernel_reduction[:, 0], defect_kernel_reduction[:, 1], color='r')
axes[1, 0].scatter(bulk_defect_reduction[:, 0], bulk_defect_reduction[:, 1], color='r')
axes[1, 1].scatter(bulk_defect_kernel_reduction[:, 0], bulk_defect_kernel_reduction[:, 1], color='r')

axes[0, 0].set_title("PCA all O SOAPs")
axes[0, 1].set_title("Kernel PCA all O SOAPs")
axes[0, 2].set_title("tSNE all O SOAPs")
axes[1, 0].set_title("PCA bulk O SOAPs")
axes[1, 1].set_title("Kernel PCA bulk O SOAPs")
axes[1, 2].set_title("tSNE bulk O SOAPs")

# Shared colourbar next to the right-hand column, one tick per configuration.
cb = fig.colorbar(mappable=sm, ax=axes[:, 2])
cb.set_ticks(range(len(configs)), labels=configs.keys())

fig.suptitle("Oxygen around defect site marked in red")

## Building SOAPs and Projections along the whole TiO Surface

The previous dimensionality reductions show that bulk oxygen separates from surface oxygen.
Maybe by creating a grid over the surface of the TiO and calculating the SOAPs at regularly spaced intervals, a definition of the defect site can be found.
Maybe by optimising on this plane (e.g. for the largest difference to the previous SOAPs) the site can be found.
Additionally, the projection of the SOAPs around the approximate defect site is calculated, to gain some understanding of how it is represented.

However, non-surface molecules (such as water) could have an impact on this process, so maybe the molecules should first be filtered for being part of the "bulk", especially when building SOAPs.

In [None]:
# Number of points along each axis of mesh
n_points = 20

# Define a 2D surface mesh spanning the first two cell lengths at z = height
sim_cell = trajectory[0].cell.lengths()
height = 0
x = np.linspace(0, sim_cell[0], n_points)
y = np.linspace(0, sim_cell[1], n_points)
# Make mesh and build a 20x20x3 set of coordinates on plane; the singleton
# height axis produced by meshgrid is dropped after stacking.
xplane, yplane, zplane = np.meshgrid(x, y, height)
plane = np.stack((xplane, yplane, zplane), axis=-1)[:, :, 0, :]

# Define 3D cube mesh
z = np.linspace(0, 5, n_points) # Only upper half of Ti block
# Make mesh and build a 20x20x20x3 set of coordinates of cube
xcube, ycube, zcube = np.meshgrid(x, y, z)
cube = np.stack((xcube, ycube, zcube), axis=-1)

In [None]:
# Flatten the cube mesh into one row of three coordinates per grid point,
# compute a SOAP at each point and project it with the bulk-fitted PCA.
cube_points = cube.reshape(-1, 3)
cube_soaps = soap.create(trajectory[0], positions=cube_points)
cube_bulk_projection = bulk_pca.transform(cube_soaps)

In [None]:
# Approximate defect site, given as a single three-component position.
defect_pos = np.array([3.01, 5.6, 0.8])

# SOAP at that single position (passed as a 1x3 array), projected with the
# PCA fitted on the Ti-bonded oxygen.
defect_site_soap = soap.create(trajectory[0], positions=defect_pos.reshape(1, 3))
defect_proj = bulk_pca.transform(defect_site_soap)

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 3))

# Every panel shows the same bulk-PCA projection of the cube grid, coloured by
# the x, y or z coordinate of the grid point respectively.
for ax, coords, axis_label in zip(axes, (xcube, ycube, zcube), ('x', 'y', 'z')):
    cb = ax.scatter(cube_bulk_projection[:, 0], cube_bulk_projection[:, 1], c=coords.flatten(), alpha=0.5)
    fig.colorbar(cb, ax=ax, label=axis_label)

for ax in axes:
    # Bulk oxygen first, then the defect marker, so that the marker is drawn
    # on top instead of being covered by the bulk points.
    draw_2d_scatter(fig, ax, bulk_reduction, color_dict, bulk_symbols)
    ax.scatter(defect_proj[:, 0], defect_proj[:, 1], c='r', s=20, label='Approximate defect site')
    # Without a legend the label given above is never displayed.
    ax.legend()

fig.suptitle("PCA of 3D cube in TiO")

plt.show()