## Imports 

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import chain
from ocpmodels.datasets import SinglePointLmdbDataset

## Variables

In [None]:
# --- Model selection -------------------------------------------------------
INITIAL_GEOMETRY = "contcar"  # Either use poscar or contcar predictions
GEOM_MODEL = "ensemble"       # Geometric model, either full or ensemble
GNN_MODEL = "painn"           # GNN model, either dpp, painn or gemnet

# --- Locations -------------------------------------------------------------
ROOT_DIR = Path("./predictions")               # Root of the predictions
TARBALL = ROOT_DIR / "ocp_predictions.tar.xz"  # Location of the dataset tarball
# TARBALL = None                               # Set to False or None to avoid extraction.

# Benchmark dataset: name, directory and per-GNN-model prediction directory
DS_NAME = f"lmdb_bm_{GEOM_MODEL}_{INITIAL_GEOMETRY}"
DS_DIR = ROOT_DIR / DS_NAME
PREDICT_DIR = DS_DIR / GNN_MODEL

# Surface dataset: name, directory and per-GNN-model prediction directory
SURF_DS_NAME = f"lmdb_bm_surf_{INITIAL_GEOMETRY}"
SURF_DIR = ROOT_DIR / SURF_DS_NAME
SURF_PREDICT_DIR = SURF_DIR / GNN_MODEL

## Extract Tarball

In [None]:
# Extract the tarball contents into ROOT_DIR (skipped when TARBALL is falsy)
if TARBALL:
    import tarfile
    # The context manager closes the archive even if extraction fails midway.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use tarballs from a trusted source, or pass an extraction filter on
    # Python versions that support one.
    with tarfile.open(TARBALL, mode="r:xz") as tar_ds:
        tar_ds.extractall(ROOT_DIR)

## Read Predictions

In [None]:
def arr_load_n_dict(f):
    """Load an ``.npz`` prediction archive and map it to per-sample records.

    Parameters
    ----------
    f : path-like or file-like
        Anything accepted by ``np.load``. The archive must contain the
        arrays ``"ids"`` and ``"energy"``.

    Returns
    -------
    map
        A lazy, single-use iterator of ``{"sid": <int>, "e_pred": <float>}``
        dictionaries, one per (id, energy) pair, in archive order.
    """
    # np.load returns an NpzFile that keeps the underlying file open until it
    # is closed. Read both arrays inside the context manager so the handle is
    # released immediately instead of leaking for the kernel's lifetime.
    with np.load(f) as arr:
        ids = np.asarray(arr["ids"], dtype=int)
        energies = np.asarray(arr["energy"], dtype=float)
    return map(
        lambda s: {"sid": s[0], "e_pred": s[1]}
        , zip(ids, energies))

# Build the (lazy) record iterator for the benchmark predictions of GNN_MODEL.
# NOTE(review): this name is reassigned by an identical call in the
# "Collect data in dataframe" cell before anything consumes it, so this load
# looks redundant — confirm before removing.
bm_test_preds = arr_load_n_dict(PREDICT_DIR/"predictions.npz")

## Read Surface Energy Predictions

In [None]:
if GEOM_MODEL == "full":
    # Surface energies are only loaded for the "full" geometric model.
    # Column names are supplied here rather than taken from the file.
    surf_df = pd.read_csv(
        SURF_DIR / "ds_data.csv",
        names=("sid", "name", "family", "e_true"),
    )
    surf_test_preds = arr_load_n_dict(SURF_PREDICT_DIR / "predictions.npz")

    # Join the predictions on the shared "sid" column, then discard the key.
    surf_df = surf_df.merge(pd.DataFrame(surf_test_preds)).drop(columns=["sid"])

    # Adjust some surfaces that are expanded
    def adjust_values(x):
        """Return the energy multiplier (4 or 1) for one surface row."""
        name, family = x["name"], x["family"]
        if family == "plastics" or "au" in name:
            return 4
        return 1

    mult_val = surf_df.apply(adjust_values, axis=1)
    surf_df["e_pred"] = surf_df["e_pred"] * mult_val
    surf_df["e_true"] = surf_df["e_true"] * mult_val
    print(surf_df)

## Collect data in dataframe

In [None]:
# Reference data; column names are supplied here rather than taken from the file.
bm_df = pd.read_csv(
    DS_DIR / "ds_data.csv",
    names=("sid", "name", "family", "e_true"),
)
bm_test_preds = arr_load_n_dict(PREDICT_DIR / "predictions.npz")

# Join the predictions on the shared "sid" column, then drop the key to
# prettify the output.
bm_df = bm_df.merge(pd.DataFrame(bm_test_preds)).drop(columns=["sid"])


def is_ads(x):
    """Return True when the name ``x`` contains a '-'."""
    return '-' in x


# Get only adsorbed molecules
bm_ads_df = bm_df.loc[bm_df["name"].map(is_ads)]

# Absolute error between reference and predicted energies
bm_df["error"] = (bm_df["e_true"] - bm_df["e_pred"]).abs()

## Compute adsorption energies

In [None]:
def get_mol_val(x):
    """Return the rows of ``bm_df`` whose ``name`` column equals ``x``."""
    name_matches = bm_df["name"] == x
    return bm_df[name_matches]

if GEOM_MODEL == "full":
    def get_surf_val(m, f):
        """Return the first ``surf_df`` row matching surface ``m`` and family ``f``.

        A row matches when the part of its ``name`` before the first '-'
        equals ``m`` and its ``family`` equals ``f``.
        """
        name_prefix = surf_df["name"].map(lambda label: label.split('-')[0])
        candidates = surf_df[(name_prefix == m) & (surf_df["family"] == f)]
        return candidates.iloc[0]
    
def get_ener(x, c):
    """Return the value in column ``c`` of row ``x`` relative to its references.

    ``x["name"]`` is split on '-' into a surface label and a molecule name.
    The molecule's value of ``c`` (first ``bm_df`` row with that name) is
    subtracted; when ``GEOM_MODEL`` is "full" the matching surface value
    from ``get_surf_val`` is subtracted as well.
    """
    surface_label, molecule = x["name"].split('-')
    molecule_rows = bm_df.loc[bm_df['name'] == molecule]
    reference = molecule_rows[c].array[0]
    if GEOM_MODEL != "full":
        return x[c] - reference
    return x[c] - reference - get_surf_val(surface_label, x["family"])[c]
                            
        
def get_ener_true(x):
    """Apply ``get_ener`` to row ``x`` using the "e_true" column."""
    return get_ener(x, "e_true")


def get_ener_pred(x):
    """Apply ``get_ener`` to row ``x`` using the "e_pred" column."""
    return get_ener(x, "e_pred")

In [None]:
# Energies are computed on the adsorbed rows only (bm_ads_df); on assignment
# pandas aligns on the index, so the remaining bm_df rows receive NaN.
bm_df["eads_true"] = bm_ads_df.apply(lambda row: get_ener(row, "e_true"), axis=1)
bm_df["eads_pred"] = bm_ads_df.apply(lambda row: get_ener(row, "e_pred"), axis=1)
# Signed error: true minus predicted
bm_df["eads_error"] = bm_df["eads_true"].sub(bm_df["eads_pred"])

## Delete gases

In [None]:
# Keep only the rows that received an "eads_true" value
bm_s_df = bm_df.dropna(subset=["eads_true"])
bm_s_df

## Compute metrics and plot results

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score

In [None]:
# Marker colour for each value of the "family" column
color_dict = {
    "biomass": "red"
    , "plastics": "green"
    , "polyurethanes": "yellow"
}
to_color = lambda x: color_dict[x]

# Summary metrics on the rows kept in bm_s_df
r2 = r2_score(bm_s_df["eads_true"], bm_s_df["eads_pred"])
mae = bm_s_df["eads_error"].abs().mean()
rmse = (bm_s_df["eads_error"] ** 2).mean() ** .5

# Parity plot: predicted vs. true values, coloured by family
fig, ax = plt.subplots(figsize=(5, 5), dpi=800)
ax.scatter(bm_s_df["eads_true"]
          , bm_s_df["eads_pred"]
          , c=bm_s_df["family"].apply(to_color))
# Raw strings: "\m" is not a valid Python escape sequence, so the non-raw
# form emits a SyntaxWarning on recent interpreters. The resulting text is
# unchanged.
ax.set_ylabel(r"$\mathit{E}_{\mathrm{ads}}^{\mathrm{pred}}$ / $\mathit{eV}$")
ax.set_xlabel(r"$\mathit{E}_{\mathrm{ads}}^{\mathrm{true}}$ / $\mathit{eV}$")
ax.annotate(r"$\mathrm{{MAE}} = {:.3f}~eV$".format(mae), (0.7, 0.20), xycoords="axes fraction")
ax.annotate(r"$\mathrm{{RMSE}} = {:.3f}~eV$".format(rmse), (0.7, 0.15), xycoords="axes fraction")
ax.annotate(r"$\mathrm{{R}}^{{2}} = {:.3f}$".format(r2), (0.7, 0.1), xycoords="axes fraction")