# Benchmark: GAME-Net vs Andersen et al. dataset (fragments) 

Here, we test GAME-Net performance on the dataset related to the work of Andersen et al. (2019), which considers open-shell fragments adsorbed on metal and alloy surfaces.

Alloy surfaces have been excluded, as they are outside the scope of GAME-Net.

In [None]:
import sys, os
sys.path.insert(0, '../../../src')
import ase.io.trajectory

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.font_manager import FontProperties
legend_font = FontProperties(family='Arial', style='normal', size=9)
import seaborn as sns
from sklearn.metrics import r2_score

from gnn_eads.functions import structure_to_graph
from gnn_eads.nets import PreTrainedModel

def get_fragment_energy(structure: str):
    """Calculate fragment energy (in eV) with same reference closed-shell molecules
       used by Andersen et al."""
    e_H2O = -14.21877278
    e_H2 = -6.76639487
    e_CH4 = -24.05681734
    structure = structure.split("_")[0]
    # Count elements in the structure
    n_C = structure.count("C")
    n_O = structure.count("O")
    n_H = structure.count("H")
    # Calculate adsorbate energy
    e_fragment = n_C * e_CH4 + (n_O) * e_H2O + (0.5*n_H - 2*n_C - n_O) * e_H2
    return e_fragment

## 1) Load GAME-Net model

In [None]:
# Name of the model folder, looked up inside the repository "models" directory
MODEL_NAME = "GAME-Net"
MODEL_PATH = f"../../../models/{MODEL_NAME}"
# Build the pre-trained model from that folder and show its summary
model = PreTrainedModel(MODEL_PATH)
print(model)

## 2) Retrieve data, convert them to graph format and get predictions

To download the data, please refer to the paper associated with the dataset. To skip this step and work with previously saved predictions, go directly to section 4).

In [None]:
# Root folder of the dataset: change this to your path to the dataset
YOUR_PATH = "/home/smorandi/Desktop/Andersen_dataset"
# Folder with the structure files and text file with the adsorption energies
DATA_PATH = f"{YOUR_PATH}/structures/"
EADS_FILE_PATH = f"{YOUR_PATH}/data.txt"
print(f"Number of structures: {len(os.listdir(DATA_PATH))}")

In [None]:
# Collect every entry of the structures folder and report how many there are
structures = os.listdir(DATA_PATH)
print(f"Number of structures: {len(structures)}")
# Keep only the .traj files whose second "_"-separated field (the surface name)
# contains exactly one uppercase letter, i.e. a single element: alloys are dropped
structures = [
    name
    for name in structures
    if name.endswith(".traj")
    and sum(1 for char in name.split("_")[1] if char.isupper()) == 1
]
print(f"Number of structures after filtering out alloys: {len(structures)}")

In [None]:
# For each structure, get the geometry with ASE and convert it to a graph
graphs, Eads_DFT, fragment, E_fragment, E_GNN, Eads_GNN, metal = [], [], [], [], [], [], []
# The four-space separator is longer than one character, so pandas interprets it as a
# regular expression, which only the Python parsing engine supports. Requesting that
# engine explicitly avoids the ParserWarning emitted when pandas falls back to it.
data = pd.read_csv(EADS_FILE_PATH, sep="    ", engine="python")
try:
    for s in structures:
        # get the first image of the .traj file, write it as a CONTCAR file and convert to graph
        geometry = ase.io.trajectory.Trajectory(DATA_PATH + s)[0]
        geometry.write("CONTCAR")
        graph = structure_to_graph("./CONTCAR", model.g_tol, model.g_sf, model.g_metal_2nn)
        graphs.append(graph)
        # get the adsorption energy from the data file (sample name = file name without ".traj")
        Eads_DFT.append(data[data["Samples"] == s[:-5]]["E_ads_eV"].values[0])
        # get the fragment name and energy
        fragment.append(s.split("_")[0])
        E_fragment.append(get_fragment_energy(s))
        # get the metal name
        metal.append(s.split("_")[1])
        # get the adsorption energy from the GNN: model output minus the fragment reference energy
        E_GNN.append(model.evaluate(graph))
        Eads_GNN.append(E_GNN[-1] - E_fragment[-1])
        print("Converted structure {} to graph".format(s))
finally:
    # Always clean up the scratch file, even if a structure fails midway, and do not
    # raise when no structure was processed (the file was then never written).
    if os.path.exists("CONTCAR"):
        os.remove("CONTCAR")
# Create a dataframe with structure name, fragment name, DFT adsorption energy, graph, GNN energy and adsorption energy
df = pd.DataFrame({"Structure": structures, 
                   "Fragment": fragment,
                   "Metal": metal, 
                   "Eads_DFT": Eads_DFT, 
                   "Graph": graphs, 
                   "E_GNN": E_GNN, 
                   "Eads_GNN": Eads_GNN, 
                   "E_fragment": E_fragment})

# Signed error (DFT minus GNN) and its absolute value
df["error"] = df["Eads_DFT"] - df["Eads_GNN"]
df["error_abs"] = np.abs(df["error"])

# generate statistics
mae = df["error_abs"].mean()
rmse = np.sqrt((df["error"]**2).mean())
r2 = r2_score(df["Eads_DFT"], df["Eads_GNN"])
n = len(df)
mean = df["error"].mean()
std = df["error"].std()
median = df["error"].median()

print("Mean absolute error: {:.3f} eV".format(mae))
print("Root mean squared error: {:.3f} eV".format(rmse))
print("R2 score: {:.3f}".format(r2))
print("Number of structures: {}".format(n))

## 3) Save dataframe to .csv

In [None]:
# Write the predictions table to disk so that section 4) can reload it later.
# With the pandas defaults used here, the row index is saved as the first, unnamed column.
df.to_csv("Andersen_dataset_predictions.csv")

## 4) Load .csv to dataframe

In [None]:
# Reload the predictions saved in section 3). That file stores the row index as its
# first column, so read it back as the index rather than as a spurious "Unnamed: 0" column.
df = pd.read_csv("Andersen_dataset_predictions.csv", index_col=0)
df.head()

In [None]:
# Summary statistics of the stored errors ("error" is the signed DFT - GNN difference,
# "error_abs" its absolute value)
n = len(df)
mean, median, std = df["error"].mean(), df["error"].median(), df["error"].std()
mae = df["error_abs"].mean()
rmse = np.sqrt((df["error"]**2).mean())
# Coefficient of determination with the DFT values as ground truth
r2 = r2_score(df["Eads_DFT"], df["Eads_GNN"])

print(f"Mean absolute error: {mae:.3f} eV")
print(f"Root mean squared error: {rmse:.3f} eV")
print(f"R2 score: {r2:.3f}")
print(f"Number of structures: {n}")

# Plot

In [None]:
# Two side-by-side panels: parity plot (left, ax[0]) and error distribution (right, ax[1]).
# The figure size is given in cm and converted to inches (2.54 cm per inch).
fig, ax = plt.subplots(1, 2, figsize=(18/2.54,9/2.54), dpi=300)
sns.scatterplot(x="Eads_DFT", y="Eads_GNN", hue="Fragment", data=df, ax=ax[0], palette="viridis", ec="k", s=20)
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)
# Raw strings keep the LaTeX backslashes literal instead of relying on the
# invalid "\m" escape sequence, which Python reports as a syntax warning.
ax[0].set_ylabel(r'$\mathit{E}_{ads}^{GNN}$ / eV')
ax[0].set_xlabel(r'$\mathit{E}_{ads}^{DFT}$ / eV')
ax[0].set_title("Parity plot")
# Same symmetric range on both axes, with the y = x diagonal drawn behind the points
l = 4.5
ax[0].set_xlim(-l, l)
ax[0].set_ylim(-l, l)
ax[0].plot([-l, l], [-l, l], c="k", zorder=-1)
ax[0].xaxis.set_major_locator(MaxNLocator(5))
ax[0].yaxis.set_major_locator(MaxNLocator(5))
plt.tight_layout()
# "\n" must stay a real newline here, so only the LaTeX backslash is escaped
text = "MAE = {:.2f} eV\nRMSE = {:.2f} eV\n$\\mathit{{R}}^{{2}}$ = {:.2f}\nN = {}".format(mae, rmse, r2, n)
props = dict(boxstyle='round', facecolor='white', edgecolor='black')
ax[0].text(0.05, 0.95, text, transform=ax[0].transAxes, fontsize=9,
        verticalalignment='top', bbox=props)
ax[0].legend(title="", fontsize=9, loc='lower left', bbox_to_anchor=(0.02, 0.02), framealpha=1)

# Right panel: kernel density estimate of the signed error. The axes are addressed
# explicitly instead of through pyplot's implicit "current axes".
sns.kdeplot(df["error"], fill=True, ax=ax[1], alpha=0.5)
for side in ('right', 'top', 'left', 'bottom'):
    ax[1].spines[side].set_linewidth(1.0)
    ax[1].spines[side].set_color('black')
ax[1].tick_params("both")
ax[1].set_xlabel(r"$\mathit{E}_{ads}^{DFT} - \mathit{E}_{ads}^{GNN}$ / eV")
ax[1].set_ylabel("Density")
ax[1].set_title("Error distribution")
ax[1].set_xlim(-2.5, 6.5)
ylim = 0.5
ax[1].set_ylim(0, ylim)
# Dashed vertical markers for the mean and the median of the error
ax[1].vlines(np.mean(df["error"]), 0, ylim, colors='r', linestyles='dashed', label='mean')
ax[1].vlines(np.median(df["error"]), 0, ylim, colors='g', linestyles='dashed', label='median')
ax[1].legend(fontsize=9)
ax[1].text(0.035, 0.95, "mean = {:.2f}\nmedian = {:.2f}\nstd = {:.2f}".format(mean, median, std),
 transform=ax[1].transAxes, va='top', bbox=dict(boxstyle='round', facecolor='white', alpha=1.0, edgecolor='black'), fontsize=9)
plt.tight_layout()
