# Benchmarks and Tests of Maps for Single-Element Substrates

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Global figure settings: flip `publication` to True to get the larger font
# used for publication figures (later cells also branch on this flag).
publication = False
publication_fontsize_large = 20
default_fontsize = plt.rcParams['font.size']
if publication:
    plt.rcParams['font.size'] = publication_fontsize_large

## Pt(111) Surface

Generate the ASE.Atoms instance using the fcc111 build function. 

In [None]:
from ase.build import fcc111

# Pt(111) slab generated with ASE's fcc111 builder.
Pt111 = fcc111(
    "Pt",
    size=(4, 4, 3),
    a=3.94,
    vacuum=10,
    periodic=True,
    orthogonal=True,
)

From the `ASE.Cell` and the generated `ASE.Atoms`, create a `MapSy.Grid` and a `MapSy.System`

In [None]:
from mapsy.data import Grid
# MapSy grid built from the simulation cell of the Pt(111) slab.
grid: Grid = Grid(cell=Pt111.cell)

In [None]:
from mapsy.data import System
# Combine grid and atoms; dimension=2 / axis=2 mark the system as a 2D slab
# (the contact-space description below relies on these two keywords).
system: System = System(grid, Pt111, dimension=2, axis=2)

In this notebook we will consider a simplified contact space composed of points that all lie in the same planes above and below the material. This contact space is generated starting from a smoothly-varying boundary function that is centered on the system's center of mass and varies smoothly from 1 to 0 at the specified `distance` from the center. The smoothness of the transition is controlled by the `spread` parameter. The resolution of the points is controlled by the `cutoff` keyword, with larger values corresponding to finer grids. Given the 2D nature of the system (as specified in the `dimension` and `axis` keywords above), the generated points ideally correspond to two flat regions above and below the material and perpendicular to the z Cartesian direction. The `side` keyword allows choosing the top (+1) or bottom (-1) region. Setting the `threshold` to a negative value will select the points that have the largest modulus of the gradient of the boundary function; it thus allows us to focus only on the points at the center of the selected transition region. 

In [None]:
from mapsy.io.parser import ContactSpaceGenerator, ContactSpaceModel

# Contact-space settings; the meaning of each keyword is described in the
# text above (side=1 selects the top region, threshold<0 keeps the points
# with the largest gradient modulus).
contactspacesettings = ContactSpaceModel.parse_obj(
    {
        "mode": "system",
        "distance": 3.5,
        "spread": 1.0,
        "cutoff": 80,
        "threshold": -1,
        "side": 1,
    }
)
contactspace = ContactSpaceGenerator(contactspacesettings).generate(system)

Check the generated contact space: each point is associated with a `probability` column that corresponds to the scaled modulus of the gradient of the contact space interface. For each point we also compute the indexes of the neighboring points and the region (topologically disconnected groups of points) to which they belong. Given the slab nature of the substrate, we expect to generate two regions of points on the two opposite faces of the material.

In [None]:
# Display the table of generated contact-space points.
contactspace.data

In [None]:
from mapsy.symfunc.input import SymmetryFunctionsModel, SymFuncModel
from mapsy.symfunc.parser import SymmetryFunctionsParser

# Two "ac" symmetry-function sets sharing the same settings; the second one
# additionally sets radial=False.
_ac_settings = {
    "type": "ac",
    "radius": 4.5,
    "order": 10,
    "compositional": False,
    "structural": True,
}
symfuncsettings = SymmetryFunctionsModel.parse_obj(
    {
        "functions": [
            SymFuncModel.parse_obj(_ac_settings),
            SymFuncModel.parse_obj({**_ac_settings, "radial": False}),
        ]
    }
)
symmetryfunctions = SymmetryFunctionsParser(symfuncsettings).parse()

In [None]:
from mapsy.maps import Maps
# Tie together the system, the symmetry functions and the contact space.
maps = Maps(system,symmetryfunctions,contactspace)

In [None]:
# NOTE(review): presumably evaluates the symmetry functions at every
# contact-space point — confirm against the MapSy API.
data = maps.atcontactspace()

We can visualize features to check how they look using `Maps.plot(feature: str)` or `Maps.plot(index: int)`. 

In [None]:
# One map per feature, for the first 20 feature indexes; each figure is
# titled with the corresponding feature name.
for index in range(20):
    fig, axes = maps.plot(
        index=index,
        levels=20,
        cmap='Spectral',
        set_aspect='scaled',
    )
    axes.set_xlabel('x (Å)')
    axes.set_ylabel('y (Å)')
    axes.set_title(f'{maps.features[index]}')
    plt.show()

## Dimensionality Reduction (PCA)

For visualization and post-processing purposes, perform dimensionality reduction on the generated features. We can first determine how many components we need, by checking the explained variance. 

In [None]:
# Without a number of components, `reduce` hands back a figure and two axes
# that are used here to inspect the explained variance.
fig, ax1, ax2 = maps.reduce(scale=True)
if publication:
    ax1.set_title('PCA')
    fig.tight_layout()

For visualization purposes we don't want more than 3 components, and for this specific system 3 components are indeed enough to explain 99% of the variance, with 4 components able to fully explain the variance in the features. In what follows we keep 4 components. 

In [None]:
# Number of principal components to keep; `npca` is reused by the plotting
# loop in the next code cell.
npca = 4
maps.reduce(npca, scale=True)

We can visually inspect how the principal components correlate with the Cartesian coordinates of the points (e.g., PCA3 distinguishes between HCP and FCC hollow sites).

In [None]:
# Map every retained principal component onto the x/y plane.
for i in range(npca):
    fig, axes = maps.plot(
        feature=f'pca{i}',
        axes=['x', 'y'],
        levels=20,
        cmap='Spectral',
        set_aspect='scaled',
    )
    axes.set_xlabel('x (Å)')
    axes.set_ylabel('y (Å)')
    axes.set_title(f'PCA {i+1}')
    plt.show()

We can also verify how the contact space is transformed (folded) in the symmetry function space. 

In [None]:
# Scatter of the contact-space points in real space, coloured by feature 0.
fig, ax = maps.scatter(
    index=0,
    axes=['x', 'y'],
    s=10,
    alpha=1.,
    cmap='Spectral',
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
if publication:
    ax.set_title("")
plt.show()

In [None]:
# Same points in the plane of the first two principal components, coloured
# by the first feature and drawn semi-transparent.
fig, ax = plt.subplots(1, 1, figsize=(8, 4*1))
fig.subplots_adjust(hspace=0.3)

# Axis decoration (limits are fixed by hand for this system).
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title("")
ax.set_xlim(-8.5, 13)
ax.set_ylim(-1.5, 4.8)

# Coordinates and colour values as float64 arrays.
x1m = maps.data['pca0'].values.astype(np.float64)
x2m = maps.data['pca1'].values.astype(np.float64)
fm = maps.data[maps.features[0]].values.astype(np.float64)
fmin, fmax = np.min(fm), np.max(fm)

scatter = ax.scatter(
    x1m,
    x2m,
    c=fm,
    vmin=fmin,
    vmax=fmax,
    s=60,
    alpha=0.05,
    cmap='Spectral',
    edgecolors='black',
)
ax.axis('on')
plt.show()

More generally, we can plot any feature in all possible 2D spaces of principal components:

In [None]:
# Feature 0 in every pair of principal components.
fig, gs = maps.scatter_pca_grid(
    index=0,
    s=70,
    alpha=0.05,
    cmap='Spectral',
    set_aspect='equal',
)
fig.tight_layout()

## Perform Clustering on Generated Features

Use SpectralClustering to find N clusters in the featured data. First, we run the analysis to identify promising values of N in terms of the Silhouette scores and Davies-Bouldin indexes. Local maxima in the Silhouette score and local minima in the Davies-Bouldin index correspond to better clustered data. 

In [None]:
# Screen up to 15 clusters; publication figures use 100 tries per value
# instead of a single one.
ntries = 100 if publication else 1
fig, ax1, ax2 = maps.cluster(maxclusters=15, ntries=ntries)
if publication:
    for _axis in (ax1, ax2):
        _axis.set_title('')
plt.show()

The analysis above suggests that $N=5$ and $N=10$ may provide better clusters. We can perform the analysis with one of these values:

In [None]:
# Cluster with N = 10, one of the two candidates identified by the screening.
nclusters = 10
maps.cluster(nclusters)

Given the clusters, plot the connectivity matrix

In [None]:
# Per-cluster diagnostics. First line: index, size, 4*size, diagonal entry of
# the cluster graph, sum of the off-diagonal entries of the same row, and
# their ratio. Second line: 4*floor(sqrt(size)) and size - 4*floor(sqrt(size)).
for i in range(len(maps.cluster_centers)):
    n_points = maps.cluster_sizes[i]
    intra = maps.cluster_graph[i, i]
    inter = np.sum(maps.cluster_graph[i, :]) - intra
    edge = int(np.sqrt(n_points))
    print(i, n_points, n_points*4, intra, inter, intra/inter)
    print(4*edge, (n_points - edge*4))


In [None]:
# Display the cluster graph matrix whose rows were summarized in the previous cell.
maps.cluster_graph

In [None]:
# Show the `cluster_edges` matrix as an image.
plt.matshow(maps.cluster_edges)
plt.show()

Given the clusters and the connectivity, find the high-symmetry sites

In [None]:
# Identify the high-symmetry sites from the clusters and their connectivity.
maps.sites()

Visualize the results

In [None]:
# Clusters (and their centroids) in real space.
fig, ax = maps.scatter(
    feature='Cluster',
    categorical=True,
    centroids=True,
    s=20,
    alpha=0.8,
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
# Publication figures carry no title.
# NOTE: legend removal (ax.get_legend().remove()) is left disabled for this figure.
ax.set_title('' if publication else 'Clusters')
plt.show()

We can now visualize the clustering in PCA space, together with the connectivity matrix and centroids

In [None]:
# Clusters in the PC1/PC2 plane with the connectivity graph drawn on top:
# nodes sit on the cluster centroids, node size follows the cluster size and
# edge width follows the edge weight (scaled down by 200).
axes = ['pca0', 'pca1']
fig, ax = maps.scatter(
    feature='Cluster',
    categorical=True,
    axes=axes,
    s=70,
    alpha=0.05,
    edgecolors='black',
    set_aspect='on',
)
G = nx.from_numpy_array(
    maps.cluster_edges,
    create_using=nx.DiGraph,
    parallel_edges=False,
)
pos = maps.data.loc[maps.centroids, axes].values
weights = [attrs['weight']/200 for _, _, attrs in G.edges(data=True)]
nx.draw(G, pos, ax=ax, node_size=maps.cluster_sizes, width=weights, alpha=0.5)

# nx.draw hides the axes frame and ticks; bring them back.
limits = ax.axis('on')
ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
# NOTE: legend removal (ax.get_legend().remove()) is left disabled for this figure.
ax.set_title('' if publication else 'Clusters')
plt.show()

## Pt(100) Surface

In [None]:
from ase.build import fcc100

# Pt(100) slab, built with the same parameters as the Pt(111) one.
Pt100 = fcc100(
    "Pt",
    size=(4, 4, 3),
    a=3.94,
    vacuum=10,
    periodic=True,
    orthogonal=True,
)

In [None]:
from mapsy.data import Grid
# MapSy grid built from the simulation cell of the Pt(100) slab.
grid100: Grid = Grid(cell=Pt100.cell)

In [None]:
from mapsy.data import System
# Same 2D-slab setup (dimension=2, axis=2) as used for Pt(111).
system100: System = System(grid100, Pt100, dimension=2, axis=2)

In [None]:
from mapsy.io.parser import ContactSpaceGenerator, ContactSpaceModel

# Identical contact-space settings to the Pt(111) case, applied to Pt(100).
contactspacesettings = ContactSpaceModel.parse_obj(
    {
        "mode": "system",
        "distance": 3.5,
        "spread": 1.0,
        "cutoff": 80,
        "threshold": -1,
        "side": 1,
    }
)
contactspace100 = ContactSpaceGenerator(contactspacesettings).generate(system100)

In [None]:
# Display the table of contact-space points generated for Pt(100).
contactspace100.data

In [None]:
from mapsy.maps import Maps
# Reuses the `symmetryfunctions` object defined in the Pt(111) section.
maps100 = Maps(system100,symmetryfunctions,contactspace100)

In [None]:
# NOTE(review): presumably evaluates the symmetry functions at every
# Pt(100) contact-space point — confirm against the MapSy API.
data100 = maps100.atcontactspace()

In [None]:
# Map of one named feature; the same name is used as the figure title.
_feature = 'ACSF_RS_r4.5_003'
fig, ax = maps100.plot(
    feature=_feature,
    levels=24,
    cmap='Spectral',
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
ax.set_title(_feature)
plt.show()

In [None]:
# Explained-variance check for Pt(100); publication figures get a title and
# x ticks every 4 units from 0 to 20 on both axes.
fig, ax1, ax2 = maps100.reduce(scale=True)
if publication:
    ax1.set_title('PCA')
    for _axis in (ax1, ax2):
        _axis.set_xticks(np.arange(0, 21, 4))
    fig.tight_layout()

In [None]:
# Keep 4 principal components, as done for Pt(111).
maps100.reduce(npca=4, scale=True)

In [None]:
# Feature 0 in every pair of principal components (Pt(100)).
fig, gs = maps100.scatter_pca_grid(
    index=0,
    s=70,
    alpha=0.05,
    cmap='Spectral',
    set_aspect='equal',
)
fig.tight_layout()

In [None]:
# Pt(100) contact-space points in real space, coloured by feature 0.
fig, ax = maps100.scatter(
    index=0,
    axes=['x', 'y'],
    s=10,
    alpha=1.,
    cmap='Spectral',
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
if publication:
    ax.set_title("")
plt.show()

In [None]:
# Pt(100) points in the plane of the first two principal components,
# coloured by the first feature and drawn semi-transparent.
fig, ax = plt.subplots(1, 1, figsize=(8, 4*1))
fig.subplots_adjust(hspace=0.3)

# Axis decoration; limits and ticks are fixed by hand for this system.
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title("")
ax.set_xlim((-8.03123771959378, 10.144203352496739))
ax.set_ylim((-5.531119537918034, 6.614474301000187))
ax.set_xticks([-5., 0., 5., 10.])
ax.set_yticks([-5., -2.5, 0., 2.5, 5.])

# Coordinates and colour values as float64 arrays.
x1m = maps100.data['pca0'].values.astype(np.float64)
x2m = maps100.data['pca1'].values.astype(np.float64)
fm = maps100.data[maps100.features[0]].values.astype(np.float64)
fmin, fmax = np.min(fm), np.max(fm)

scatter = ax.scatter(
    x1m,
    x2m,
    c=fm,
    vmin=fmin,
    vmax=fmax,
    s=60,
    alpha=0.05,
    cmap='Spectral',
    edgecolors='black',
)
ax.axis('on')
plt.show()

In [None]:
# Screen up to 15 clusters for Pt(100); publication figures use 100 tries
# per value instead of a single one.
ntries = 100 if publication else 1
fig, ax1, ax2 = maps100.cluster(maxclusters=15, ntries=ntries)
if publication:
    for _axis in (ax1, ax2):
        _axis.set_title('')

In [None]:
import seaborn as sns

# For every number of clusters pick the best screening run: lowest
# Davies-Bouldin index and highest silhouette score.
best_db = maps100.cluster_screening.loc[
    maps100.cluster_screening.groupby('nclusters')['db_index'].idxmin()
]
best_sil = maps100.cluster_screening.loc[
    maps100.cluster_screening.groupby('nclusters')['silhouette_score'].idxmax()
]

# NOTE: sns.set changes the global plotting style, so figures created after
# this cell are affected as well.
sns.set(style="whitegrid", context="talk")

# Left y-axis: silhouette score distribution per number of clusters.
fig, ax1 = plt.subplots()
sns.boxplot(
    x='nclusters',
    y='silhouette_score',
    data=maps100.cluster_screening[['nclusters', 'silhouette_score']],
    color='b',
    capprops=dict(color='b'),
    flierprops=dict(markerfacecolor='b', markeredgecolor='b'),
    ax=ax1,
)
# `nclusters - 2` shifts the cluster counts onto the categorical box
# positions (assumes the screening starts at 2 clusters — TODO confirm).
ax1.plot(best_sil['nclusters']-2, best_sil['silhouette_score'], '-', color='b')
ax1.set_xlabel('Number of Clusters')
ax1.set_ylabel('Silhouette Score', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Right y-axis: Davies-Bouldin index distribution.
ax2 = ax1.twinx()
sns.boxplot(
    x='nclusters',
    y='db_index',
    data=maps100.cluster_screening[['nclusters', 'db_index']],
    color='r',
    capprops=dict(color='r'),
    flierprops=dict(markerfacecolor='r', markeredgecolor='r'),
    ax=ax2,
)
ax2.plot(best_db['nclusters']-2, best_db['db_index'], '-', color='r')
ax2.set_ylabel('Davies-Bouldin Index', color='r')
ax2.tick_params(axis='y', labelcolor='r')
ax2.grid(True, linestyle='--', alpha=0.7)


The optimal values of $N$ are then: $3$, $5$, possibly $12$

In [None]:
# NOTE(review): the text above lists N = 3, 5 and possibly 12 as optimal,
# but 10 clusters are requested here — confirm the intended value.
maps100.cluster(nclusters=10)

In [None]:
# Show the Pt(100) `cluster_edges` matrix as an image.
plt.matshow(maps100.cluster_edges)
plt.show()

In [None]:
# Identify the high-symmetry sites of Pt(100), as done for Pt(111).
maps100.sites()

In [None]:
# Pt(100) clusters (and their centroids) in real space.
axes = ['x', 'y']
fig, ax = maps100.scatter(
    feature='Cluster',
    axes=axes,
    categorical=True,
    centroids=True,
    alpha=0.80,
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
if publication:
    # Publication figures carry neither title nor legend.
    ax.set_title('')
    ax.get_legend().remove()
else:
    ax.set_title('Clusters')
plt.show()

In [None]:
# Pt(100) clusters in the PC1/PC2 plane with the connectivity graph on top:
# nodes sit on the cluster centroids, node size follows the cluster size and
# edge width follows the edge weight (scaled down by 200).
axes = ['pca0', 'pca1']
fig, ax = maps100.scatter(
    feature='Cluster',
    categorical=True,
    axes=axes,
    s=70,
    alpha=0.05,
    edgecolors='black',
    set_aspect='on',
)
G = nx.from_numpy_array(
    maps100.cluster_edges,
    create_using=nx.DiGraph,
    parallel_edges=False,
)
pos = maps100.data.loc[maps100.centroids, axes].values
weights = [attrs['weight']/200 for _, _, attrs in G.edges(data=True)]
nx.draw(G, pos, ax=ax, node_size=maps100.cluster_sizes, width=weights, alpha=0.4)

# nx.draw hides the axes frame and ticks; bring them back.
limits = ax.axis('on')
ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
if publication:
    # Publication figures carry neither title nor legend.
    ax.set_title('')
    ax.get_legend().remove()
else:
    ax.set_title('Clusters')
plt.show()

## Pt(110) Surface

In [None]:
from ase.build import fcc110

# Pt(110) slab, built with the same parameters as the other two surfaces.
Pt110 = fcc110(
    "Pt",
    size=(4, 4, 3),
    a=3.94,
    vacuum=10,
    periodic=True,
    orthogonal=True,
)

In [None]:
from mapsy.data import Grid
# MapSy grid built from the simulation cell of the Pt(110) slab.
grid110: Grid = Grid(cell=Pt110.cell)

In [None]:
from mapsy.data import System
# Same 2D-slab setup (dimension=2, axis=2) as used for Pt(111).
system110: System = System(grid110, Pt110, dimension=2, axis=2)

In [None]:
from mapsy.io.parser import ContactSpaceGenerator, ContactSpaceModel

# Identical contact-space settings to the Pt(111) case, applied to Pt(110).
contactspacesettings = ContactSpaceModel.parse_obj(
    {
        "mode": "system",
        "distance": 3.5,
        "spread": 1.0,
        "cutoff": 80,
        "threshold": -1,
        "side": 1,
    }
)
contactspace110 = ContactSpaceGenerator(contactspacesettings).generate(system110)

In [None]:
# Display the table of contact-space points generated for Pt(110).
contactspace110.data

In [None]:
from mapsy.maps import Maps
# Reuses the `symmetryfunctions` object defined in the Pt(111) section.
maps110 = Maps(system110,symmetryfunctions,contactspace110)

In [None]:
# NOTE(review): presumably evaluates the symmetry functions at every
# Pt(110) contact-space point — confirm against the MapSy API.
data110 = maps110.atcontactspace()

In [None]:
# Map of the feature with index 7, titled with that feature's name.
fig, ax = maps110.plot(
    index=7,
    levels=24,
    cmap='Spectral',
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
ax.set_title(maps110.features[7])
plt.show()

In [None]:
# Explained-variance check for Pt(110); publication figures get x ticks
# every 4 units from 0 to 20 on both axes and a title.
fig, ax1, ax2 = maps110.reduce(scale=True)
if publication:
    for _axis in (ax1, ax2):
        _axis.set_xticks(np.arange(0, 21, 4))
    ax1.set_title('PCA')
    fig.tight_layout()

In [None]:
# Keep 4 principal components, as done for the other two surfaces.
maps110.reduce(npca=4, scale=True)

In [None]:
# Feature 0 in every pair of principal components (Pt(110)).
fig, gs = maps110.scatter_pca_grid(
    index=0,
    s=50,
    alpha=0.05,
    cmap='Spectral',
    set_aspect='equal',
)
fig.tight_layout()

In [None]:
# Pt(110) contact-space points in real space, coloured by feature 0.
fig, ax = maps110.scatter(
    index=0,
    axes=['x', 'y'],
    s=10,
    alpha=1.,
    cmap='Spectral',
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
if publication:
    ax.set_title("")
plt.show()

In [None]:
# Pt(110) points in the plane of the first two principal components,
# coloured by the first feature and drawn semi-transparent.
fig, ax = plt.subplots(1, 1, figsize=(8, 4*1))
fig.subplots_adjust(hspace=0.3)

# Axis decoration; limits and ticks are fixed by hand for this system.
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title("")
ax.set_xlim((-5.149891252326163, 8.129674135920126))
ax.set_ylim((-5.116892034864052, 6.098692971496631))
ax.set_xticks([-6., -4., -2., 0., 2., 4., 6., 8.])
ax.set_yticks([-5., -2.5, 0., 2.5, 5.])

# Coordinates and colour values as float64 arrays.
x1m = maps110.data['pca0'].values.astype(np.float64)
x2m = maps110.data['pca1'].values.astype(np.float64)
fm = maps110.data[maps110.features[0]].values.astype(np.float64)
fmin, fmax = np.min(fm), np.max(fm)

scatter = ax.scatter(
    x1m,
    x2m,
    c=fm,
    vmin=fmin,
    vmax=fmax,
    s=60,
    alpha=0.05,
    cmap='Spectral',
    edgecolors='black',
)
ax.axis('on')
plt.show()

In [None]:
# Screen up to 15 clusters for Pt(110); publication figures use 100 tries
# per value instead of a single one.
ntries = 100 if publication else 1
fig, ax1, ax2 = maps110.cluster(maxclusters=15, ntries=ntries)
if publication:
    for _axis in (ax1, ax2):
        _axis.set_title('')

In [None]:
# Screening results table; the plotting cell below reads its `nclusters`,
# `db_index` and `silhouette_score` columns.
maps110.cluster_screening

In [None]:
# NOTE(review): when the notebook is run top to bottom, `best_db` still holds
# the selection computed from `maps100.cluster_screening` at this point; the
# Pt(110) version is only assigned in the next cell.
best_db

In [None]:
import seaborn as sns

# For every number of clusters pick the best screening run: lowest
# Davies-Bouldin index and highest silhouette score.
best_db = maps110.cluster_screening.loc[
    maps110.cluster_screening.groupby('nclusters')['db_index'].idxmin()
]
# Fixed: the best silhouette score is the maximum (idxmax, as in the Pt(100)
# cell), not the minimum that was selected here before.
best_sil = maps110.cluster_screening.loc[
    maps110.cluster_screening.groupby('nclusters')['silhouette_score'].idxmax()
]

# Left y-axis: silhouette scores of all runs per number of clusters.
fig, ax1 = plt.subplots()
sns.stripplot(
    x='nclusters',
    y='silhouette_score',
    data=maps110.cluster_screening[['nclusters', 'silhouette_score']],
    color='blue',
    ax=ax1,
)
# `nclusters - 2` shifts the cluster counts onto the categorical strip
# positions (assumes the screening starts at 2 clusters — TODO confirm).
# Solid line: runs with the best DB index; dotted line: runs with the best
# silhouette score.
ax1.plot(best_db['nclusters']-2, best_db['silhouette_score'], '-', color='b')
ax1.plot(best_sil['nclusters']-2, best_sil['silhouette_score'], ':', color='b')
ax1.set_xlabel('Number of Clusters')
ax1.set_ylabel('Silhouette Score', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Right y-axis: Davies-Bouldin index of the same runs.
ax2 = ax1.twinx()
sns.stripplot(
    x='nclusters',
    y='db_index',
    data=maps110.cluster_screening[['nclusters', 'db_index']],
    color='red',
    ax=ax2,
)
ax2.plot(best_db['nclusters']-2, best_db['db_index'], '-', color='r')
ax2.plot(best_sil['nclusters']-2, best_sil['db_index'], ':', color='r')
ax2.set_ylabel('Davies-Bouldin Index', color='r')
ax2.tick_params(axis='y', labelcolor='r')
ax1.grid(True)


Optimal values of $N$ are $4$, $9$, and possibly $12$

In [None]:
# Cluster with N = 4, the first of the candidate values listed above.
maps110.cluster(nclusters=4)

In [None]:
# Show the Pt(110) `cluster_edges` matrix as an image.
plt.matshow(maps110.cluster_edges)
plt.show()

In [None]:
# Identify the high-symmetry sites of Pt(110), as done for Pt(111).
maps110.sites()

In [None]:
# Pt(110) clusters (and their centroids) in real space.
# Alternative projection that can be used instead: ['pca0','pca2']
axes = ['x', 'y']
fig, ax = maps110.scatter(
    feature='Cluster',
    axes=axes,
    categorical=True,
    centroids=True,
    alpha=0.95,
    set_aspect='scaled',
)
ax.set_xlabel('x (Å)')
ax.set_ylabel('y (Å)')
if publication:
    # Publication figures carry neither title nor legend.
    ax.set_title('')
    ax.get_legend().remove()
else:
    ax.set_title('Clusters')
plt.show()

In [None]:
# Pt(110) clusters in the PC1/PC2 plane with the connectivity graph on top:
# nodes sit on the cluster centroids, node size follows the cluster size and
# edge width follows the edge weight (scaled down by 100).
axes = ['pca0', 'pca1']
fig, ax = maps110.scatter(
    feature='Cluster',
    categorical=True,
    axes=axes,
    s=70,
    alpha=0.04,
    edgecolors='black',
    set_aspect='on',
)
G = nx.from_numpy_array(
    maps110.cluster_edges,
    create_using=nx.DiGraph,
    parallel_edges=False,
)
pos = maps110.data.loc[maps110.centroids, axes].values
weights = [attrs['weight']/100 for _, _, attrs in G.edges(data=True)]
nx.draw(G, pos, ax=ax, node_size=maps110.cluster_sizes, width=weights, alpha=0.5)

# nx.draw hides the axes frame and ticks; bring them back.
limits = ax.axis('on')
ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
if publication:
    # Publication figures carry neither title nor legend.
    ax.set_title('')
    ax.get_legend().remove()
else:
    ax.set_title('Clusters')
plt.show()