In [None]:
import sys, os
sys.path.insert(0, '../../../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.font_manager import FontProperties
legend_font = FontProperties(family='Arial', style='normal', size=9)
import seaborn as sns
from sklearn.metrics import r2_score

from gnn_eads.functions import contcar_to_graph
from gnn_eads.graph_filters import single_fragment_filter
from gnn_eads.nets import PreTrainedModel

def get_fragment_energy(structure: str) -> float:
    """Calculate the fragment formation energy from closed-shell reference molecules.

    The composition C_x H_y O_z is read from the structure name (ex. i123032):
    character 1 is the number of C atoms, character 2 the number of H atoms and
    character 3 the number of O atoms. Because a single character is used per
    element, H counts above 9 are encoded as letters: "a" = 10, "b" = 11, "c" = 12.

    The energy is the linear combination of the CO2, H2O and H2 reference
    energies that balances the composition:
        x CO2 + (z - 2x) H2O + (4x + y - 2z)/2 H2

    Args:
        structure: Structure name; only characters 1 to 3 are read.

    Returns:
        Fragment energy in the same unit as the hard-coded reference energies
        (presumably eV, as for the other energies printed in this notebook -
        TODO confirm).

    Raises:
        ValueError: If one of the count characters is neither a digit nor, for
            H, one of the supported letter codes.
        IndexError: If the name is shorter than four characters.
    """
    # Reference energies of the closed-shell molecules
    e_CO2 = -22.96215586
    e_H2O = -14.21877278
    e_H2 = -6.76639487
    # Letter codes used for two-digit hydrogen counts
    h_letter_codes = {"a": 10, "b": 11, "c": 12}
    # Count elements in the structure based on the structure name (ex. i123032)
    n_C = int(structure[1])
    h_code = structure[2]
    if h_code in h_letter_codes:
        n_H = h_letter_codes[h_code]
    else:
        try:
            n_H = int(h_code)
        except ValueError:
            # Previously this path left n_H unassigned and failed later with an
            # UnboundLocalError; fail here with an explicit message instead.
            raise ValueError(
                "Unsupported hydrogen count code {!r} in structure name {!r}".format(h_code, structure)
            ) from None
    n_O = int(structure[3])
    # Calculate fragment formation energy
    e_fragment = n_C * e_CO2 + (n_O - 2*n_C) * e_H2O + (4*n_C + n_H - 2*n_O) * e_H2 * 0.5
    return e_fragment

## 1) Load GAME-Net

In [None]:
# Name of the model folder and its location relative to this notebook
MODEL_NAME = "GAME-Net"
MODEL_PATH = "../../../models/" + MODEL_NAME
# Build the pre-trained model object from that folder and show its summary
model = PreTrainedModel(MODEL_PATH)
print(model)

## 2) Retrieve the data, convert them to graphs and get predictions

The dataset is not provided with this notebook; please refer to the related paper to obtain it. A .csv file containing the data generated in this step is already provided, so you can skip this section and go directly to section 4).

In [None]:
# Path to the dataset. The default is the original author's local directory;
# set the GAME_NET_BENCHMARK_DATA_PATH environment variable to point the
# notebook at another copy of the dataset without editing this cell.
DATA_PATH = os.environ.get(
    "GAME_NET_BENCHMARK_DATA_PATH",
    "/home/smorandi/teklahome/benchmark_propylene_network",
)

In [None]:
# Read the data: space-separated file without header, one structure per row
df = pd.read_csv(os.path.join(DATA_PATH, "energies.dat"), sep=" ", header=None)
# Define headers
df.columns = ["structure", "energy_DFT"]
# Delete data containing more than 8 characters in the structure name (filter out failed, warnings, etc.)
# .copy() makes every filtered frame independent of its parent, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df["structure"].str.len() <= 8].copy()
# Subtract the energy of the reference entry i000000
reference_energy = df.loc[df["structure"] == "i000000", "energy_DFT"]
if reference_energy.empty:
    raise ValueError("Reference structure i000000 not found in energies.dat")
df["reduced_energy_DFT"] = df["energy_DFT"] - reference_energy.values[0]
# Delete i000000
df = df[df["structure"] != "i000000"].copy()
# Get graph from contcars together with the GNN prediction.
# Graph and prediction are computed first and appended together afterwards, so
# both lists always keep one entry per row even when only the prediction fails.
graphs, energies_GNN = [], []
for structure in df["structure"]:
    file_path = os.path.join(DATA_PATH, "contcars", "{}.contcar".format(structure))
    try:
        graph = contcar_to_graph(file_path,
                                 model.g_tol,
                                 model.g_sf,
                                 model.g_metal_2nn)
        energy = model.evaluate(graph)
    except Exception as exc:
        # Best effort: skip structures that cannot be converted or evaluated,
        # but report why (KeyboardInterrupt is no longer swallowed).
        graph, energy = None, None
        print("Error in {}: {!r}".format(structure, exc))
    else:
        print("Done with {}".format(structure))
    graphs.append(graph)
    energies_GNN.append(energy)
# Add graphs to dataframe
df["graph"] = graphs
df["reduced_energy_GNN"] = energies_GNN
# Delete rows with None graphs
df = df[df["graph"].notna()].copy()
# Remove graphs with multiple adsorbates using single_fragment_filter
df = df[df["graph"].apply(single_fragment_filter)].copy()
# Get adsorbate energy from structure name
df["adsorbate_energy"] = df["structure"].apply(get_fragment_energy)
# Get DFT and GNN adsorption energies
df["eads_DFT"] = df["reduced_energy_DFT"] - df["adsorbate_energy"]
df["eads_GNN"] = df["reduced_energy_GNN"] - df["adsorbate_energy"]
# Get error and absolute error
df["error"] = df["eads_DFT"] - df["eads_GNN"]
df["abs_error"] = np.abs(df["eads_DFT"] - df["eads_GNN"])
# Get number of C atoms from the structure name
df["C"] = df["structure"].str[1].astype(int)
# Get statistics
mae = df["abs_error"].mean()
rmse = np.sqrt((df["error"]**2).mean())
n = len(df)
r2 = r2_score(df["eads_DFT"], df["eads_GNN"])
mean = np.mean(df["error"])
median = np.median(df["error"])
std = np.std(df["error"])
print("MAE: {:.3f} eV".format(mae))
print("RMSE: {:.3f} eV".format(rmse))
print("R2 score: {:.3f}".format(r2))
print("Mean: {:.3f} eV".format(mean))
print("Median: {:.3f} eV".format(median))
print("Std: {:.3f} eV".format(std))

## 3) Save dataframe to .csv

In [None]:
# Save the dataframe to a .csv file in the current working directory so that
# section 4) can reload it.
# NOTE(review): the "graph" column holds Python objects, so the file presumably
# stores only their text representation - confirm it is not needed after reloading.
df.to_csv("benchmark_propylene_network.csv")

## 4) Load dataframe from .csv

In [None]:
# Load the dataframe from the .csv file in the current working directory,
# replacing any dataframe already bound to `df`.
df = pd.read_csv("benchmark_propylene_network.csv")

In [None]:
# Summary statistics of the prediction error stored in the dataframe
signed_error = df["error"]
n = len(df)
mae = df["abs_error"].mean()
rmse = np.sqrt((signed_error**2).mean())
r2 = r2_score(df["eads_DFT"], df["eads_GNN"])
mean, median, std = np.mean(signed_error), np.median(signed_error), np.std(signed_error)

# Report every metric with three decimals
for template, value in (
    ("MAE: {:.3f} eV", mae),
    ("RMSE: {:.3f} eV", rmse),
    ("R2 score: {:.3f}", r2),
    ("Mean: {:.3f} eV", mean),
    ("Median: {:.3f} eV", median),
    ("Std: {:.3f} eV", std),
):
    print(template.format(value))

## 5) Plot the parity plot and the error distribution

In [None]:
# Parity plot (left) and error distribution (right).
# Uses df and the statistics mae, rmse, r2, n, mean, median and std computed in
# the statistics cell.
# Math labels are raw strings (or use an escaped backslash) because "\m" is not
# a valid Python escape sequence and triggers a warning in recent Python versions.
params = {'mathtext.default': 'regular'}
plt.rcParams.update(params)

# Two panels side by side, 18 cm x 9 cm (matplotlib figsize is given in inches)
fig, ax = plt.subplots(1, 2, figsize=(18/2.54,9/2.54), dpi=300)

# --- Left panel: GNN vs. DFT adsorption energy, colored by number of C atoms ---
sns.scatterplot(x="eads_DFT", y="eads_GNN", hue="C", data=df, ax=ax[0], palette="viridis", ec="k", s=20)
ax[0].set_ylabel(r'$\mathit{E}_{ads}^{GNN}$ / eV')
ax[0].set_xlabel(r'$\mathit{E}_{ads}^{DFT}$ / eV')
ax[0].set_title("Parity plot")
l = 8  # lower axis limit is -l; also the half-length of the parity line
ax[0].set_xlim(-l, 2)
ax[0].set_ylim(-l, 2)
# y = x reference line, drawn behind the data points
ax[0].plot([-l, l], [-l, l], c="k", zorder=-1)
ax[0].xaxis.set_major_locator(MaxNLocator(5))
ax[0].yaxis.set_major_locator(MaxNLocator(5))
fig.tight_layout()
text = "MAE = {:.2f} eV\nRMSE = {:.2f} eV\n$\\mathit{{R}}^{{2}}$ = {:.2f}\nN = {}".format(mae, rmse, r2, n)
props = dict(boxstyle='round', facecolor='white', edgecolor='black')
ax[0].text(0.05, 0.95, text, transform=ax[0].transAxes, fontsize=9,
           verticalalignment='top', bbox=props)
# Move legend bottom right
handles, labels = ax[0].get_legend_handles_labels()
ax[0].legend(handles, labels, loc="lower right", title="C", fontsize=9)

# --- Right panel: kernel density estimate of the signed error ---
# Drawn explicitly on ax[1] instead of relying on pyplot's implicit current axes.
sns.kdeplot(df["error"], fill=True, ax=ax[1], alpha=0.5)
for side in ('right', 'top', 'left', 'bottom'):
    ax[1].spines[side].set_linewidth(0.5)
    ax[1].spines[side].set_color('black')
ax[1].tick_params(axis="both")
ax[1].set_xlabel(r"$\mathit{E}_{ads}^{DFT} - \mathit{E}_{ads}^{GNN}$ / eV")
ax[1].set_ylabel("Density")
ax[1].set_title("Error distribution")
ax[1].set_xlim(-2.5, 2.5)
ylim = 0.75
ax[1].set_ylim(0, ylim)
# Dashed markers for the mean and median error, spanning the full panel height
ax[1].vlines(np.mean(df["error"]), 0, ylim, colors='r', linestyles='dashed', label='mean')
ax[1].vlines(np.median(df["error"]), 0, ylim, colors='g', linestyles='dashed', label='median')
ax[1].legend(fontsize=9)
ax[1].text(0.05, 0.95, "mean = {:.2f}\nmedian = {:.2f}\nstd = {:.2f}".format(mean, median, std),
           transform=ax[1].transAxes, va='top',
           bbox=dict(boxstyle='round', facecolor='white', alpha=1.0, edgecolor='black'), fontsize=9)
fig.tight_layout()