In [None]:
import matplotlib.pyplot as plp
import scipy.cluster.hierarchy as sch
from  h5py import File
import pandas as pd
from pandas import DataFrame
import h5py
from HDF5er import saveXYZfromTrajGroup,MDA2HDF5,saveXYZfromTrajGroup
import numpy
from MDAnalysis import Universe as mdaUniverse
from SOAPify import (saponifyGroup,
                    createReferencesFromTrajectory,
                    mergeReferences,
                    SOAPdistanceNormalized,
                    saveReferences,
                    getReferencesFromDataset,
                    classify
                    )

# Toggle: when True, each "<surf>_T_700.lammpsdump" trajectory is converted into "<surf>.hdf5".
loadReferences=True
# Toggle: when True, the box of each stored trajectory is patched and its SOAP fingerprints are computed.
soapReferences=True



def patchBoxFromTopology(hdf5TrajFile:str,topologyFile:str):
    """Overwrite the stored simulation box of every trajectory with the topology's box.

    The topology is loaded with MDAnalysis (``atom_style="id type x y z"``) and
    its ``dimensions`` are written into every row of the ``Box`` dataset of each
    group found under ``Trajectories`` in the HDF5 file.

    Args:
        hdf5TrajFile: path of the HDF5 file to modify; it is opened in append mode.
        topologyFile: path of the topology file the box dimensions are read from.
    """
    universe = mdaUniverse(topologyFile, atom_style="id type x y z")
    with h5py.File(hdf5TrajFile, "a") as workFile:
        for key in workFile['Trajectories']:
            boxes = workFile[f'Trajectories/{key}']['Box']
            # one copy of the topology box per stored row
            nRows = boxes.shape[0]
            boxes[:] = [universe.dimensions] * nRows

Here we convert each trajectory into an `.hdf5` file and then perform the SOAP analysis on it, storing the results in the same file.

In [None]:
# Stage 1: convert each LAMMPS dump into its own HDF5 trajectory file.
if loadReferences:
    for surf in [110,211,210]:
        fname = f"{surf}_T_700.lammpsdump"
        u = mdaUniverse(fname)  # , atom_style="id type x y z")
        # every atom of the universe is given the type "Cu"
        u.atoms.types = ["Cu"] * len(u.atoms)
        print(u.coord[0])
        # the trajectory is stored under the dump file name without its extension
        trajName = fname.split('.')[0]
        MDA2HDF5(u, f"{surf}.hdf5", trajName, trajChunkSize=1000)
        print(surf)

# Stage 2: patch the box from the matching ".data" topology, then compute SOAP.
if soapReferences:
    soapSettings = dict(
        SOAPOutputChunkDim=1000,
        SOAPnJobs=32,
        SOAPrcut=6,
        SOAPnmax=8,
        SOAPlmax=8,
    )
    for surf in [110,211,210]:
        hdf5Name = f"{surf}.hdf5"
        patchBoxFromTopology(hdf5Name, f"{surf}.data")
        with File(hdf5Name, "a") as workFile:
            saponifyGroup(
                trajContainers=workFile["Trajectories"],
                SOAPoutContainers=workFile.require_group("SOAP"),
                **soapSettings,
            )

In [None]:
! h5ls -r 211.hdf5

With the following cell we obtain the distance matrix for the whole dictionary; the color scale indicates the distance in the high-dimensional SOAP feature space ($d_{SOAP}$) between all SOAP environments in the Cu surfaces.

In [None]:
from scipy.spatial.distance import squareform
from matplotlib.pyplot import viridis, get_cmap
import numpy
import seaborn as sns
from seaborn import clustermap
from pandas import DataFrame
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# Read every reference set stored under "testReferences", keyed by dataset name.
with File("../create_reference/references.hdf5", "r") as refFile:
    references = {
        refName: getReferencesFromDataset(refDataset)
        for refName, refDataset in refFile["testReferences"].items()
    }

# Merge the per-surface dictionaries into a single one, in a fixed order.
wholeData = mergeReferences(
    references["111"],
    references["110"],
    references["211"],
    references["210"],
)
ndataset = len(wholeData)

# Condensed (upper-triangle, row-major) vector of pairwise SOAP distances.
nPairs = ndataset * (ndataset - 1) // 2
wholeDistances = numpy.zeros(nPairs)
pairIndex = 0
for first in range(ndataset):
    firstSpectrum = wholeData.spectra[first]
    for second in range(first + 1, ndataset):
        wholeDistances[pairIndex] = SOAPdistanceNormalized(
            firstSpectrum, wholeData.spectra[second]
        )
        pairIndex += 1


# Tick labels for the distance matrix, grouped per surface.
# NOTE(review): assumes one label per merged reference, in the same order as
# wholeData (111, 110, 211, 210) - confirm against the reference names.
a2 = [
    # (111)
    's$_{(111)}$',
    'ss$_{(111)}$',
    'b$_{(111)}$',
    # (110)
    's$^{l}$$_{(110)}$',
    's$^{h}$$_{(110)}$',
    'ss$^{l}$$_{(110)}$',
    'ss$^{h}$$_{(110)}$',
    'b$_{(110)}$',
    # (211)
    's$^{l}$$_{(211)}$',
    's$^{m}$$_{(211)}$',
    's$^{h}$$_{(211)}$',
    'ss$^{l}$$_{(211)}$',
    'ss$^{m}$$_{(211)}$',
    'ss$^{h}$$_{(211)}$',
    'b$_{(211)}$',
    # (210)
    's$^{l}$$_{(210)}$',
    's$^{m}$$_{(210)}$',
    's$^{h}$$_{(210)}$',
    'ss$^{l}$$_{(210)}$',
    'ss$^{m}$$_{(210)}$',
    'ss$^{h}$$_{(210)}$',
    'b$_{(210)}$',
]

# Expand the condensed distance vector into a square symmetric matrix.
squareDistances = squareform(wholeDistances)
wfDist = DataFrame(squareDistances)  # index=a2, columns=a2

# Colormaps available for the optional row/column annotations.
cmaps = {
    mapName: get_cmap(mapName)
    for mapName in ("Oranges", "Reds", "Blues", "Greens", "GnBu", "Greys", "Purples")
}

colors = {}  # one flat colour per reference, chosen per reference set
colorbytype = {}  # colour chosen from the reference name
for k, surfaceMap in zip(references.keys(), ["Greys", "Reds", "GnBu", "Purples"]):
    nRefs = len(references[k])
    colors[k] = [cmaps[surfaceMap](0.6)] * nRefs

    typeColors = []
    for idx in range(nRefs):
        refName = references[k].names[idx]
        # "ss" has to be tested before "s", which it contains
        if "ss" in refName:
            typeMap = "Greens"
        else:
            typeMap = "Blues" if "s" in refName else "Oranges"
        # shade: 0.4 for names containing "lc", 0.8 for "hc", 0.6 otherwise
        shade = 0.4 if "lc" in refName else (0.8 if "hc" in refName else 0.6)
        typeColors.append(cmaps[typeMap](shade))
    colorbytype[k] = typeColors

# Flatten in the same order used when merging the references.
surfaceOrder = ("111", "110", "211", "210")
cr = [color for key in surfaceOrder for color in colors[key]]
cc = [color for key in surfaceOrder for color in colorbytype[key]]

# Complete-linkage hierarchical clustering computed on the condensed distances;
# the same linkage is reused for both rows and columns of the heatmap.
links = linkage(wholeDistances, method="complete")

# The tick labels are given as plain lists: if their number differs from the
# matrix size seaborn does not complain and the axes end up mislabelled.
if len(a2) != wfDist.shape[0]:
    raise ValueError(
        f"expected {wfDist.shape[0]} tick labels (one per reference), got {len(a2)}"
    )

# No explicit plt.figure() call here: clustermap creates its own figure (sized
# by `figsize`), so a separately created figure would only be an empty extra
# one. Choose the resolution when saving, e.g. cmap.savefig(..., dpi=300).
plt.rcParams.update({'font.size': 32})
cmap = clustermap(
    wfDist,
    method="complete",  # not used for clustering here: explicit linkages are passed below
    cmap="bone",
    tree_kws=dict(linewidths=2.5),
    #row_colors=cr,
    #col_colors=cc,
    row_linkage=links,
    col_linkage=links,
    dendrogram_ratio=(0.2,0.2),
    linewidths=.75,
    figsize=(25,25),
    xticklabels=a2, yticklabels=a2,
    cbar=True,
)

# Only the row dendrogram is shown; the column one is removed from the figure.
cmap.ax_col_dendrogram.remove()
#cmap.savefig("groupingDistances.png",dpi=300)

Here we classify each trajectory using the SOAP environments defined in the complete dictionary and save the result as an `.xyz` file.

In [None]:
# Classify every stored SOAP dataset against the merged dictionary and export
# the trajectory together with the assigned references and their distances.
for surf in [110,211,210]:
    with File(f"{surf}.hdf5", "r") as workFile:
        for key, soapDataset in workFile["SOAP"].items():
            t = classify(soapDataset, wholeData, SOAPdistanceNormalized, True)
            cls = {"whole": t.references, "whole_d": t.distances}
            # NOTE(review): the output name does not depend on `key`; with more
            # than one dataset per file the same path is reused - confirm intended.
            saveXYZfromTrajGroup(
                f"whole_{surf}_T_700.xyz",
                workFile[f"Trajectories/{key}"],
                **cls,
            )