# GNN benchmark: PCA dataset

Here, we test GAME-Net on the dataset from the paper by García-Muelas and Núria López: "Statistical learning goes beyond the d-band model providing the thermochemistry of adsorbates on transition metals", 2019. This DFT dataset contains C1-C2 fragments adsorbed on 12 metal surfaces.

In [None]:
import sys, os
sys.path.insert(0, '../../../src')
from subprocess import PIPE, Popen

import pandas as pd
import numpy as np
import cycler
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.font_manager import FontProperties
legend_font = FontProperties(family='Arial', style='normal', size=9)
import seaborn as sns
from sklearn.metrics import r2_score

from gnn_eads.functions import structure_to_graph
from gnn_eads.nets import PreTrainedModel

def get_fragment_energy(structure: str):
    """Return the reference energy of a fragment built from closed-shell species.

    ``structure`` is a string of digits: position 0 holds the number of C
    atoms, position 1 the number of H atoms and position 2 the number of O
    atoms. The result combines the hard-coded CH4, H2O and H2 energies below
    and is expressed in the same units as those constants.
    """
    reference = {
        "CH4": -24.05681734,
        "H2O": -14.21877278,
        "H2": -6.76639487,
    }
    # Count the elements in the structure
    carbons = int(structure[0])
    oxygens = int(structure[2])
    hydrogens = int(structure[1])
    # H2 balances whatever hydrogen is not already accounted for by CH4 and H2O
    h2_coefficient = 0.5*hydrogens - 2*carbons - oxygens
    return carbons * reference["CH4"] + oxygens * reference["H2O"] + h2_coefficient * reference["H2"]

## 1) Load GAME-Net

In [None]:
# Name of the pre-trained model and the directory it is loaded from
MODEL_NAME = "GAME-Net"
MODEL_PATH = f"../../../models/{MODEL_NAME}"
# Instantiate the pre-trained model and print its string representation
model = PreTrainedModel(MODEL_PATH)
print(model)

## 2) Extract data

As we do not provide the data, please refer to the paper to obtain them. The data generated in this step are provided as a .csv file; if you do not have the original data, go directly to section 4).

In [None]:
# Root directory of the dataset: the extraction cell reads
# <DATA_PATH>/<folder>/OUTCAR and <DATA_PATH>/<folder>/CONTCAR for every
# folder listed in it.
DATA_PATH = "/home/smorandi/teklahome/benchmark_pca/sol"  # Change this to your path

In [None]:
def _read_last_energy(outcar_path):
    """Return the energy stored on the last ``energy  w`` line of an OUTCAR file.

    The file is scanned for lines containing the literal text ``energy  w``
    (two spaces); the last whitespace-separated token of the final matching
    line is converted to float. This is the pure-Python equivalent of
    ``grep "energy  w" OUTCAR | tail -1`` and avoids spawning (and leaking)
    two subprocesses per file.

    Raises:
        ValueError: if the file contains no matching line.
        OSError: if the file cannot be opened.
    """
    last_match = None
    with open(outcar_path, errors="replace") as outcar:
        for line in outcar:
            if "energy  w" in line:
                last_match = line
    if last_match is None:
        raise ValueError("No 'energy  w' line found in {}".format(outcar_path))
    return float(last_match.split()[-1])


# One list per DataFrame column (plus the graphs, which are kept in memory only)
system, metal, fragment = [], [], []
DFT_energy, DFT_reduced_energy = [], []
graph, GNN_reduced_energy, fragment_energy = [], [], []
DFT_eads, GNN_eads = [], []
error, abs_error = [], []

# Energy of the clean surface ("<metal>-0000") per metal, read only once
slab_energy = {}

for filename in os.listdir(DATA_PATH):
    # Keep only "<metal>-<fragment>" folders (7 characters); "0000" marks the clean surface
    if len(filename) != 7 or "0000" in filename:
        continue
    name_parts = filename.split("-")
    metal_label = name_parts[0]
    fragment_label = name_parts[1]

    # Get DFT energies
    total_energy = _read_last_energy("{}/{}/OUTCAR".format(DATA_PATH, filename))
    if metal_label not in slab_energy:
        slab_energy[metal_label] = _read_last_energy(
            "{}/{}/OUTCAR".format(DATA_PATH, metal_label + "-0000"))
    reduced_energy = total_energy - slab_energy[metal_label]

    # Get graph and related energy
    system_graph = structure_to_graph("{}/{}/CONTCAR".format(DATA_PATH, filename),
                                      model.g_tol,
                                      model.g_sf,
                                      model.g_metal_2nn)
    gnn_energy = model.evaluate(system_graph)
    reference_energy = get_fragment_energy(fragment_label)
    dft_adsorption = reduced_energy - reference_energy
    gnn_adsorption = gnn_energy - reference_energy
    deviation = dft_adsorption - gnn_adsorption

    # Append only once every value is available, so that all lists keep the
    # same length even if one system fails part-way through.
    system.append(filename)
    metal.append(metal_label)
    fragment.append(fragment_label)
    DFT_energy.append(total_energy)
    DFT_reduced_energy.append(reduced_energy)
    graph.append(system_graph)
    GNN_reduced_energy.append(gnn_energy)
    fragment_energy.append(reference_energy)
    DFT_eads.append(dft_adsorption)
    GNN_eads.append(gnn_adsorption)
    error.append(deviation)
    abs_error.append(abs(deviation))
    print(filename + " converted")

df = pd.DataFrame({"system": system,
                   "metal": metal,
                   "fragment": fragment,
                   "DFT_energy": DFT_energy,
                   "DFT_reduced_energy": DFT_reduced_energy,
                   "GNN_reduced_energy": GNN_reduced_energy,
                   "fragment_energy": fragment_energy,
                   "DFT_eads": DFT_eads,
                   "GNN_eads": GNN_eads,
                   "error": error,
                   "abs_error": abs_error})
        

## 3) Save dataframe to .csv

In [None]:
# Write the table without the positional index: otherwise reading the file
# back with pd.read_csv yields an extra, meaningless "Unnamed: 0" column.
df.to_csv("benchmark_pca.csv", index=False)

## 4) Load dataframe from .csv

In [None]:
# Read the benchmark table from disk
df = pd.read_csv("benchmark_pca.csv")
# Drop the rows whose metal is "co" (only one point)
df = df.loc[df["metal"].ne("co")]
df.head()

In [None]:
# Summary statistics of the "error" column and agreement metrics between
# the DFT and GNN adsorption energies
signed_error = df["error"]
n = len(signed_error)
mean = signed_error.mean()
median = signed_error.median()
std = signed_error.std()
mae = df["abs_error"].mean()
rmse = np.sqrt((signed_error**2).mean())
r2 = r2_score(df["DFT_eads"], df["GNN_eads"])

print(f"Mean: {mean:.2f} eV")
print(f"Median: {median:.2f} eV")
print(f"Std: {std:.2f} eV")
print(f"MAE: {mae:.2f} eV")
print(f"RMSE: {rmse:.2f} eV")
print(f"R2: {r2:.2f}")
print(f"N: {n}")

# Plot

In [None]:
# Two-panel figure: parity plot (left) and error distribution (right).
# Every call targets its axes explicitly instead of relying on pyplot's
# implicit "current axes".

# Render mathtext with the regular font
plt.rcParams.update({'mathtext.default': 'regular'})
fig, ax = plt.subplots(1, 2, figsize=(18/2.54, 9/2.54), dpi=300)  # 18 cm x 9 cm
parity_ax, error_ax = ax

# --- Left panel: GNN vs. DFT adsorption energy, coloured by metal ---
# sns.color_palette only *returns* a palette; it has to be handed to the
# plotting call through `palette=` to have any effect. One colour per metal.
metal_palette = sns.color_palette("hls", df["metal"].nunique())
sns.scatterplot(x="DFT_eads", y="GNN_eads", hue="metal", data=df, ax=parity_ax,
                ec="k", s=15, palette=metal_palette)
# Raw strings: "\m" is not a valid Python escape sequence
parity_ax.set_ylabel(r'$\mathit{E}_{ads}^{GNN}$ / eV')
parity_ax.set_xlabel(r'$\mathit{E}_{ads}^{DFT}$ / eV')
parity_ax.set_title("Parity plot")
lim = 7
parity_ax.set_xlim(-lim, lim)
parity_ax.set_ylim(-lim, lim)
# Diagonal y = x drawn behind the points
parity_ax.plot([-lim, lim], [-lim, lim], c="k", zorder=-1)
parity_ax.xaxis.set_major_locator(MaxNLocator(5))
parity_ax.yaxis.set_major_locator(MaxNLocator(5))
# Text box with the statistics computed in the statistics cell
stats_text = (
    "MAE = {:.2f} eV\n"
    "RMSE = {:.2f} eV\n"
    "$\\mathit{{R}}^{{2}}$ = {:.2f}\n"
    "N = {}"
).format(mae, rmse, r2, n)
box_style = dict(boxstyle='round', facecolor='white', edgecolor='black')
parity_ax.text(0.05, 0.95, stats_text, transform=parity_ax.transAxes, fontsize=9,
               verticalalignment='top', bbox=box_style)
# Legend in two tightly spaced columns, with capitalized metal labels
handles, labels = parity_ax.get_legend_handles_labels()
metal_legend = parity_ax.legend(handles, [label.capitalize() for label in labels],
                                loc="lower left", title="", fontsize=9, ncol=2,
                                columnspacing=0.4, handletextpad=0.2, borderpad=0.35,
                                framealpha=1)
# Thin black frame around the legend
metal_legend.get_frame().set_linewidth(0.5)
metal_legend.get_frame().set_edgecolor("black")

# --- Right panel: kernel density estimate of the "error" column ---
sns.kdeplot(df["error"], fill=True, ax=error_ax, alpha=0.5)
for side in ("right", "top", "left", "bottom"):
    error_ax.spines[side].set_linewidth(1.0)
    error_ax.spines[side].set_color('black')
error_ax.set_xlabel(r"$\mathit{E}_{ads}^{DFT} - \mathit{E}_{ads}^{GNN}$ / eV")
error_ax.set_ylabel("Density")
error_ax.set_title("Error distribution")
error_ax.set_xlim(-2.5, 4.5)
ylim = 0.75
error_ax.set_ylim(0, ylim)
# Mark mean and median with the same values shown in the text box below
error_ax.vlines(mean, 0, ylim, colors='r', linestyles='dashed', label='mean')
error_ax.vlines(median, 0, ylim, colors='g', linestyles='dashed', label='median')
error_ax.legend(fontsize=9)
error_ax.text(0.03, 0.95,
              "mean = {:.2f}\nmedian = {:.2f}\nstd = {:.2f}".format(mean, median, std),
              transform=error_ax.transAxes, va='top',
              bbox=dict(boxstyle='round', facecolor='white', alpha=1.0, edgecolor='black'),
              fontsize=9)
plt.tight_layout()