## Imports

In [None]:
import sys
sys.path.append("../py_src")

from glob import glob
import os

import numpy as np
import matplotlib.pyplot as plt

import sort_neigh

from ase.io import read as ase_read
from ase.neighborlist import natural_cutoffs, NeighborList
from dscribe.descriptors import LMBTR, SOAP

## Define some Values

In [None]:
# Locations of the input data: the trajectory root directory and the
# single-particle reference file stored beneath it.
target_dir = "../test_data/230104_finalres_justlocalstruct/rh"
only_cu_dir = "/".join((target_dir, "cunanoparticle"))
only_cu_path = "/".join((only_cu_dir, "cusingle.lammpstrj"))

# Selects the "_soap.npy"/"_lmbtr.npy" file naming when the stored
# descriptors are loaded further down.
old_format = False
n_particles, n_rhod = 1577, 15

# Descriptor hyper-parameter presets, in the order
# (r_cut, n_max, l_max, sigma, gamma_kernel).
_PARAMETER_PRESETS = {
    "finalres": (4.2, 4, 3, 0.6, 1.),
    "finalres_justlocalstruct": (5.2, 4, 3, 1., 1.),
}
r_cut, n_max, l_max, sigma, gamma_kernel = _PARAMETER_PRESETS["finalres_justlocalstruct"]


## ML Classifier

In [None]:
# Unsupervised classifier object; it is trained in a later cell.
ml_classifier = sort_neigh.USMLClassifier()

# use_soap picks the descriptor type; load_dim_red picks which stored
# descriptor files are read later on.
use_soap = True
load_dim_red = False

if not use_soap:
    # Angle-based (k3) LMBTR descriptor sampled on n_spec grid points.
    n_spec = 180
    descr = LMBTR(
        species=["Rh", "Cu"],
        # Distance term (k2), currently disabled:
        # k2={
        #     "geometry": {"function": "distance"},
        #     "grid": {"min": 0, "max": 5, "n": 100, "sigma": 0.1},
        #     "weighting": {"function": "exp", "scale": 0.5, "threshold": 1e-3},
        # },
        k3={
            "geometry": {"function": "angle"},
            "grid": {"min": 0, "max": 180, "n": n_spec, "sigma": 2.8},
            "weighting": {"function": "unity"},
        },
        periodic=False,
        sparse=False,
        normalization="none",
        flatten=True,
    )
else:
    # SOAP descriptor built from the hyper-parameters defined above.
    descr = SOAP(
        species=["Rh", "Cu"],
        r_cut=r_cut,
        n_max=n_max,
        l_max=l_max,
        sigma=sigma,
        periodic=False,
    )

# Reference classifier backed by the stored local structures; its
# identifiers are loaded using the descriptor chosen above.
local_structures_dir = os.path.abspath("../src/localstructures_newopt_rh")
standard_classifier = sort_neigh.NeighbourClassifier(
    local_structures_path=local_structures_dir,
    non_class_max=14,
)
standard_classifier.load_identifiers(descr_func=descr)

## Import Particle

In [None]:
# Read the reference particle; the disabled branch is an alternative
# source (frame 3000 of an MC trajectory).
full_particle = ase_read(only_cu_path)
if False:
    full_particle = ase_read('../test_data/221229_saafinal/rh/mc/600.lammpstrj', index=3000)
at_pos = full_particle.get_positions()

# Neighbour list using the natural cutoffs scaled by 0.98.
cut_off = natural_cutoffs(full_particle, mult=0.98)
neighbour_list = NeighborList(cut_off, bothways=True, self_interaction=False)
neighbour_list.update(full_particle)

n_atoms = len(full_particle)
ind_soaps = np.zeros((n_atoms, descr.get_number_of_features()))
surf_soaps = []
for index in range(n_atoms):
    # Cut out the atom together with its neighbours; the atom itself is
    # placed first so that it can serve as the descriptor centre.
    neighbour_indices, _ = neighbour_list.get_neighbors(index)
    neighbour_indices = np.concatenate((np.array([index]), neighbour_indices), axis=0)
    neighbour_particle = full_particle[neighbour_indices]

    # Make center atom Rh
    symbs = neighbour_particle.get_chemical_symbols()
    symbs[0] = "Rh"
    neighbour_particle.set_chemical_symbols(symbs)

    ind_soaps[index] = descr.create(neighbour_particle, centers=[0])

    # Environments with fewer than 10 atoms (centre included) are
    # additionally collected in surf_soaps.
    if len(neighbour_particle) < 10:
        surf_soaps.append(ind_soaps[index])

surf_soaps = np.asarray(surf_soaps)

## Load Soaps from localstructures

In [None]:
# Collect the descriptors and labels of every local structure known to
# the reference classifier. Entries that are None are skipped.
soaps_from_classifier = []
labels = []

for entry in standard_classifier.identification_dict.values():
    if entry is None:
        continue
    # Keep index 0 along the second axis of the stored descriptor array.
    soaps_from_classifier.append(entry["soap_descr"][:, 0, :])
    # entry["id"] is iterated element-wise, which flattens the labels.
    labels.extend(entry["id"])

if not soaps_from_classifier:
    raise ValueError("No local structures were loaded from the classifier.")

# Stack once instead of growing the array with repeated np.append calls,
# each of which copies everything accumulated so far.
soaps_from_classifier = np.concatenate(soaps_from_classifier, axis=0)

print("Loaded localstructures: ")
print(labels)
print(soaps_from_classifier.shape)

## Training

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans, DBSCAN, Birch
from dscribe.descriptors import SOAP

# Number of components kept by the PCA dimensionality reduction.
n_components = 2

# Choose the training data: the local-structure descriptors (default)
# or the particle itself.
train_on_particle = False
if not train_on_particle:
    n_clust = ml_classifier._train_on_data(
        soaps_from_classifier,  # alternative input: ind_soaps
        dim_red=PCA(n_components=n_components),
        clusterer=KMeans(n_clusters=10),
    )
    # The descriptor is attached by hand on this path.
    ml_classifier.descr = descr
else:
    n_clust = ml_classifier.train_on_particle(
        full_particle,
        soap_species=["Cu"],
        dim_red=PCA(n_components=n_components),
        clusterer=Birch(n_clusters=10),
        r_cut=r_cut, n_max=n_max, l_max=l_max, sigma=sigma,
    )

# Project the three descriptor sets with the fitted reduction.
reduced_particle, reduced_surf, soap_prediction = (
    ml_classifier.dim_red.transform(descriptor_set)
    for descriptor_set in (ind_soaps, surf_soaps, soaps_from_classifier)
)

## Load Existing Atomic Descriptors

In [None]:
# Folders whose *.lammpstrj files have stored descriptor files next to them.
target_folders = [
    target_dir+"/mc",
    target_dir+"/mcmd"
]

# Build the suffix of the stored descriptor files from the descriptor
# type, the file format, and whether reduced descriptors are requested.
descr_tag = "soap" if use_soap else "lmbtr"
if old_format:
    load_name = "_%s.npy" % descr_tag
else:
    n_timesteps = 10000
    if load_dim_red:
        descr_tag += "pca"
    load_name = "_%s_%ux%u.txt" % (descr_tag, n_timesteps, n_rhod)

# Maps "<folder name>_<file stem>" to the loaded and reduced descriptors.
results_dict = {}

for target_folder in target_folders:
    for target_file in glob(target_folder+"/*.lammpstrj"):
        target_file = os.path.abspath(target_file)
        file_dir = os.path.dirname(target_file)
        only_file = os.path.basename(target_file).split(".")[0]
        save_txt_path = os.path.join(file_dir, only_file+load_name)

        # The context manager closes the file; no explicit close is needed.
        with open(save_txt_path, 'rb') as f:
            if old_format:
                load_descriptors = np.load(f)
            else:
                load_descriptors = np.loadtxt(f, dtype=np.float32)
                load_descriptors = load_descriptors.reshape(
                    (n_timesteps, n_rhod, load_descriptors.shape[-1])
                )

        # Name of the containing folder, independent of the path separator.
        dir_name = os.path.basename(file_dir)
        cur_key = '_'.join([dir_name, only_file])
        results_dict[cur_key] = {}
        if not load_dim_red:
            results_dict[cur_key]["descriptors"] = load_descriptors.copy()
            import_shape = load_descriptors.shape
            # Flatten the two leading axes for the transform, then
            # restore them on the reduced result.
            load_descriptors = load_descriptors.reshape(
                (import_shape[0]*import_shape[1], import_shape[2])
            )
            reduced = ml_classifier.dim_red.transform(load_descriptors)
            results_dict[cur_key]["dim_red"] = reduced.reshape(
                (import_shape[0], import_shape[1], reduced.shape[-1])
            )
        else:
            # The stored file already holds reduced descriptors.
            results_dict[cur_key]["dim_red"] = load_descriptors[..., :n_components]


## Plot Reduction Maps

In [None]:
%matplotlib auto
# Scatter plot of the first two reduced components: the particle, the
# labelled local structures, and the two 400 trajectories.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.set_title("PCA Map of Unsupervised Regression")

particle_x, particle_y = reduced_particle[:, 0], reduced_particle[:, 1]
sc = ax.scatter(particle_x, particle_y, label='particle')

ax.scatter(soap_prediction[:, 0], soap_prediction[:, 1], c="k", label="localstructures")
for ii_label, label in enumerate(labels):
    # Write each label next to its own point.
    ax.annotate(label, soap_prediction[ii_label, 0:2])

# The draw order (mcmd first, then mc) sets the default colours and the
# legend order.
for traj_key, traj_label in (("mcmd_400", 'trajectory mcmd'), ("mc_400", 'trajectory mc')):
    load_descriptors = results_dict[traj_key]["dim_red"][..., :2]
    n_frames, n_atoms_per_frame, n_dims = load_descriptors.shape
    load_descriptors = load_descriptors.reshape((n_frames*n_atoms_per_frame, n_dims))
    ax.scatter(load_descriptors[:, 0], load_descriptors[:, 1], label=traj_label)

ax.legend()
plt.show()

## Build Clustering Algorithm

In [None]:
# Replace the clusterer with a KMeans that has one cluster per local
# structure, initialised on (and fitted to) the reduced local structures.
ml_classifier.clusterer = KMeans(init=soap_prediction, n_clusters=soap_prediction.shape[0])
ml_classifier.clusterer.fit(soap_prediction)

# Bounding box of the reduced local structures, widened by 10 % of its
# extent on every side.
clust_min = np.min(soap_prediction, axis=0)
clust_max = np.max(soap_prediction, axis=0)
clust_diff = clust_max - clust_min
clust_margin = clust_diff*0.1
clust_min = clust_min - clust_margin
clust_max = clust_max + clust_margin

# Everything too far outside 9 neighbour is considered bulk
bulk_border = clust_max[0] if soap_prediction[-1, 0] > 0 else clust_min[0]

# Build cluster number for each file in results_dict
for key, value in results_dict.items():
    load_reduction = value["dim_red"]
    import_shape = load_reduction.shape
    flat_reduction = load_reduction.reshape((import_shape[0]*import_shape[1], import_shape[2]))
    clusters = ml_classifier.clusterer.predict(flat_reduction)
    value["clusters"] = clusters.reshape((import_shape[0], import_shape[1]))

In [None]:
from matplotlib import cm
from matplotlib.colors import ListedColormap

def cmap_from_categories(colors, or_map_name="tab20", n_orcolors=20):
    """Map integer categories onto a discrete colormap.

    Each category in the closed range ``[min(colors), max(colors)]`` gets
    an equal-width bin in ``[0, 1]``; the returned values are the bin
    centres.

    Parameters
    ----------
    colors : array_like
        Category numbers (presumably integers, e.g. cluster indices).
    or_map_name : str
        Name of the Matplotlib colormap to sample from.
    n_orcolors : int
        Number of distinct colours in ``or_map_name``.

    Returns
    -------
    normed_colors : numpy.ndarray
        ``(colors - min + 0.5) / c_range``, for use with ``vmin=0, vmax=1``.
    cmap : matplotlib.colors.ListedColormap
        Colormap sampled from the first ``c_range / n_orcolors`` fraction
        of the original map.
    c_ticks : numpy.ndarray
        Bin-centre positions in ``[0, 1]``, one per category.
    """
    colors = np.asarray(colors)
    c_min = np.min(colors)
    c_range = np.max(colors) - c_min + 1

    # Algebraically the same as the previous three-step normalisation, but
    # without dividing by (c_range - 1), which gave NaN when only one
    # category was present.
    normed_colors = (colors - c_min + 0.5) / c_range

    c_ticks = np.linspace(0, 1, c_range*2+1, endpoint=True)[1::2]

    # pyplot.get_cmap stands in for cm.get_cmap, which recent Matplotlib
    # releases no longer provide.
    base_map = plt.get_cmap(or_map_name, 256)
    color_range = np.linspace(0, c_range/float(n_orcolors), 500, endpoint=False)
    cmap = ListedColormap(base_map(color_range))

    return normed_colors, cmap, c_ticks

def norm_to_previous_cats(new_cats, prev_cats):
    """Normalise categories onto the colour bins defined by ``prev_cats``.

    The range ``[min(prev_cats), max(prev_cats)]`` is split into
    ``c_range`` equal bins on ``[0, 1]``, and every entry of ``new_cats``
    is mapped to the centre of its bin.

    Parameters
    ----------
    new_cats : array_like
        Category numbers to normalise.
    prev_cats : array_like
        Category numbers that define the reference range.

    Returns
    -------
    numpy.ndarray
        ``(new_cats - min(prev_cats) + 0.5) / c_range``, with the same
        shape as ``new_cats``.
    """
    new_cats = np.asarray(new_cats)
    prev_min = np.min(prev_cats)
    c_range = np.max(prev_cats) - prev_min + 1

    # Algebraically the same as the previous three-step normalisation, but
    # without dividing by (c_range - 1), which gave NaN when prev_cats
    # held a single category.
    return (new_cats - prev_min + 0.5) / c_range

## Show Particle Classification

In [None]:
# Cluster centres and the cluster assigned to each local structure; the
# latter defines the colour bins shared by all following plots.
centers = ml_classifier.clusterer.cluster_centers_
target_prediction = ml_classifier.clusterer.predict(soap_prediction)
normed_colors, cmap, c_ticks = cmap_from_categories(target_prediction)

# Colour each atom of the particle on that same scale.
particle_prediction = ml_classifier.clusterer.predict(reduced_particle)
particle_colors = norm_to_previous_cats(particle_prediction, target_prediction)
at_pos = full_particle.get_positions()

In [None]:
# Selection of atoms to draw; np.s_[:] selects all of them.
cond = np.s_[:]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')

# Apply the same selection to the colours as to the positions, so that a
# narrower `cond` cannot produce a length mismatch between them.
sc = ax.scatter(
    at_pos[cond, 0], at_pos[cond, 1], at_pos[cond, 2],
    c=particle_colors[cond], cmap=cmap, alpha=1,
    s=800, edgecolors="k", vmin=0, vmax=1
)

# One colourbar tick per category, labelled with the local-structure names.
cbar = fig.colorbar(sc)
cbar.set_ticks(c_ticks)
cbar.set_ticklabels(labels)

plt.show()

In [None]:
from matplotlib.patches import Rectangle

%matplotlib auto
# Reduced-descriptor map coloured by cluster: particle atoms, the cluster
# centres and the local-structure labels.
# NOTE(review): the bare `raise ValueError` further down aborts the cell,
# so the trajectory overlay, border patch, legend and plt.show() after it
# never run.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.set_title("PCA Map of Unsupervised Regression")

# Particle atoms, coloured on the fixed [0, 1] category scale.
sc = ax.scatter(reduced_particle[:, 0], reduced_particle[:, 1], c=particle_colors, label='particle', cmap=cmap, vmin=0, vmax=1)

# ax.scatter(soap_prediction[:, 0], soap_prediction[:, 1], c="k", label="localstructures")
# Cluster centres as black crosses; the labels are placed at the reduced
# local-structure positions, not at the centres.
ax.scatter(centers[:, 0], centers[:, 1], label='centers', c='k', marker='x')
for ii_label, label in enumerate(labels):
    ax.annotate(label, soap_prediction[ii_label, 0:2])

# Make colorbar nice
if True:
    cbar = fig.colorbar(sc)
    cbar.set_ticks(c_ticks)
    cbar.set_ticklabels(labels)

# Plot clustering borders
# (disabled: predicts the cluster on a regular grid and draws the result
# as a background image)
if False:
    x_bords = ax.get_xlim()
    x_range = np.linspace(x_bords[0], x_bords[1], 201)
    y_bords = ax.get_ylim()
    y_range = np.linspace(-50, y_bords[1], 200)

    xx_mesh, yy_mesh = np.meshgrid(x_range, y_range)
    Z = ml_classifier.clusterer.predict(np.c_[xx_mesh.ravel(), yy_mesh.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx_mesh.shape)
    Z = norm_to_previous_cats(Z, target_prediction)
    ax.imshow(
        Z,
        interpolation="nearest",
        extent=(xx_mesh.min(), xx_mesh.max(), yy_mesh.min(), yy_mesh.max()),
        cmap=cmap,
        aspect="auto",
        origin="lower",
    )
# NOTE(review): unconditional abort; everything below is unreachable.
# Presumably a deliberate stop for "run all" — confirm before removing.
raise ValueError
# MC and MCMD plot
# Overlay the trajectories with key suffix t, coloured by stored cluster.
t = 600
load_descriptors = results_dict["mc_%u"%t]["dim_red"][..., :2]
load_descriptors = load_descriptors.reshape((load_descriptors.shape[0]*load_descriptors.shape[1], load_descriptors.shape[2]))
load_colors = results_dict["mc_%u"%t]["clusters"]
load_colors = load_colors.reshape((load_colors.shape[0]*load_colors.shape[1]))
load_colors = norm_to_previous_cats(load_colors, target_prediction)
ax.scatter(load_descriptors[:, 0], load_descriptors[:, 1], c=load_colors, label='trajectory mc', cmap=cmap, vmin=0, vmax=1, marker='s')

load_descriptors = results_dict["mcmd_%u"%t]["dim_red"][..., :2]
load_descriptors = load_descriptors.reshape((load_descriptors.shape[0]*load_descriptors.shape[1], load_descriptors.shape[2]))
load_colors = results_dict["mcmd_%u"%t]["clusters"]
load_colors = load_colors.reshape((load_colors.shape[0]*load_colors.shape[1]))
load_colors = norm_to_previous_cats(load_colors, target_prediction)
ax.scatter(load_descriptors[:, 0], load_descriptors[:, 1], label='trajectory mcmd', c=load_colors, cmap=cmap, vmin=0, vmax=1, marker='>')

# Border filter patch
# Rectangle spanning clust_min..clust_max, plus a vertical line at
# bulk_border.
border_rect = Rectangle(
    xy=clust_min, width=(clust_max-clust_min)[0], height=(clust_max-clust_min)[1], 
    fill=False, edgecolor='k', linewidth=3,
    label='Classification Range'
)
ax.add_patch(border_rect)
ax.vlines(bulk_border, ax.get_ylim()[0], ax.get_ylim()[1], color='r', linewidth=3, label='Bulk Cutoff')

ax.legend()
plt.show()

## Build Bar Sort

In [None]:
label_list = ['400', '500', '600']
index_dict = {'400':0, '500':1, '600':2} # To sort temperature into right spot in bincount array

n_clust = ml_classifier.clusterer.n_clusters
particle_cats = ml_classifier.clusterer.predict(reduced_particle.astype(np.double))

particle_bins = np.bincount(particle_cats, minlength=n_clust)

# Axes: (mc/mcmd, temperature, cluster); the last two slots hold the
# "bulk" and "non-classifiable" counts. 64-bit counters are used because a
# single file contributes n_timesteps * n_rhod samples, which does not fit
# into int16.
measurement_count = np.zeros((2, 3, n_clust+2), np.int64)
n_ts = np.zeros((2, 3), np.int64)

# bulk_border was chosen as the upper bound when the last local structure
# lies at positive x and as the lower bound otherwise; the bulk test has
# to look outward on that same side.
bulk_on_upper_side = soap_prediction[-1, 0] > 0

for key, value in results_dict.items():
    temperature = key.split('_')[-1]
    t_ind = index_dict[temperature]
    type_ind = int('mcmd' in key) # 0 for mc, 1 for mcmd

    load_reduction = value["dim_red"]
    load_reduction = load_reduction.reshape(
        (load_reduction.shape[0]*load_reduction.shape[1], load_reduction.shape[2])
    )
    if bulk_on_upper_side:
        is_bulk = load_reduction[:, 0] > bulk_border
    else:
        is_bulk = load_reduction[:, 0] < bulk_border
    n_bulk = np.count_nonzero(is_bulk)

    # A sample is in range when every component lies strictly inside the
    # widened bounding box of the local structures.
    in_range = np.all((load_reduction > clust_min) & (load_reduction < clust_max), axis=-1)
    n_outrange = np.count_nonzero(~in_range)

    load_cats = value["clusters"]
    n_ts[type_ind, t_ind] = load_cats.shape[0]
    load_cats = load_cats.reshape((load_cats.shape[0]*load_cats.shape[1]))
    measurement_count[type_ind, t_ind, :-2] = np.bincount(load_cats[in_range], minlength=n_clust)
    measurement_count[type_ind, t_ind, -2] = n_bulk
    # Bulk samples are out of range too, so they are subtracted here.
    measurement_count[type_ind, t_ind, -1] = n_outrange - n_bulk

# A category counts as used when any temperature has a non-zero count.
used_cats = 0 != np.sum(measurement_count, axis=1)

In [None]:
%matplotlib inline

# Bar chart of the site-type statistics for the "mc" runs (draw_ind = 0),
# one bar group per used category and one bar per temperature.
draw_ind = 0
offsets = np.array([-0.3, 0, 0.3])
alphas = [0.2, 0.5, 1]
width = 0.3
fontsize = 17
# False plots raw counts (what this cell effectively drew before); True
# plots percentages of all samples. The axis label follows this choice.
show_percent = False

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
ax.set_title('mc')
print(labels)
cur_labels = []
for ii_label, label in enumerate(labels+['bulk', 'non-classifiable']):
    print(label, used_cats[draw_ind, ii_label])
    if used_cats[draw_ind, ii_label]:
        cur_labels.append(label)

used_mask = used_cats[draw_ind, :]
if show_percent:
    # Total samples per temperature = time steps * Rh atoms; computed in
    # float so the product cannot overflow a narrow integer dtype.
    n_samples = n_ts[draw_ind, :, np.newaxis].astype(np.float64) * n_rhod
    draw_percentages = 100 * measurement_count[draw_ind, :, :] / n_samples
    y_label = 'Rh atoms per surface type [%]'
else:
    # NOTE: the variable name is kept for compatibility; it holds counts here.
    draw_percentages = measurement_count[draw_ind, :, :]
    y_label = 'Rh atoms per surface type [counts]'
n_cats = np.count_nonzero(used_mask)
cat_locs = np.arange(n_cats)

for ii_bar in range(3):
    bar_heights = draw_percentages[ii_bar, used_mask]
    ax.bar(cat_locs + offsets[ii_bar], bar_heights, width=width, label=label_list[ii_bar])

    # Write the value slightly above each bar.
    for ll_ann, annotation in enumerate(bar_heights):
        ax.annotate(
            "%.4f"%annotation,
            [cat_locs[ll_ann]+offsets[ii_bar], annotation*1.1],
            ha='center', rotation=45, fontsize=fontsize-10
        )

ax.set_yscale('log')
ax.set_xticks(cat_locs)
ax.set_xticklabels(cur_labels)

ax.set_xlabel('site type', fontsize=fontsize)
ax.set_ylabel(y_label, fontsize=fontsize)

plt.show()

In [None]:
%matplotlib inline
# Bar chart of the site-type statistics for the "mcmd" runs (draw_ind = 1),
# one bar group per used category and one bar per temperature.
draw_ind = 1
offsets = np.array([-0.3, 0, 0.3])
alphas = [0.2, 0.5, 1]
width = 0.3
fontsize = 17
# False plots raw counts (what this cell effectively drew before); True
# plots percentages of all samples. The axis label follows this choice.
show_percent = False

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
ax.set_title('mcmd')

used_mask = used_cats[draw_ind, :]
cur_labels = [
    label
    for ii_label, label in enumerate(labels+['bulk', 'non-classifiable'])
    if used_mask[ii_label]
]

if show_percent:
    # Total samples per temperature = time steps * Rh atoms; computed in
    # float so the product cannot overflow a narrow integer dtype.
    n_samples = n_ts[draw_ind, :, np.newaxis].astype(np.float64) * n_rhod
    draw_percentages = 100 * measurement_count[draw_ind, :, :] / n_samples
    y_label = 'Rh atoms per surface type [%]'
else:
    # NOTE: the variable name is kept for compatibility; it holds counts here.
    draw_percentages = measurement_count[draw_ind, :, :]
    y_label = 'Rh atoms per surface type [counts]'
n_cats = np.count_nonzero(used_mask)
cat_locs = np.arange(n_cats)

for ii_bar in range(3):
    bar_heights = draw_percentages[ii_bar, used_mask]
    ax.bar(cat_locs + offsets[ii_bar], bar_heights, width=width, label=label_list[ii_bar])

    # Write the value slightly above each bar.
    for ll_ann, annotation in enumerate(bar_heights):
        ax.annotate(
            "%.4f"%annotation,
            [cat_locs[ll_ann]+offsets[ii_bar], annotation*1.1],
            ha='center', rotation=45, fontsize=fontsize-10
        )

ax.set_yscale('log')
ax.set_xticks(cat_locs)
ax.set_xticklabels(cur_labels)

ax.set_xlabel('site type', fontsize=fontsize)
ax.set_ylabel(y_label, fontsize=fontsize)

plt.show()