# FG_Dataset lmdb creation

## Introduction

This notebook packs our datasets into the Open Catalyst Project LMDB format. If the `SPLIT_CV` option is set, it creates multiple LMDB datasets following the nested cross-validation procedure described in our work.

The dataset should have the following hierarchy:

## File Hierarchy

```
├── Biomass
│   └── structures
│       ├── ni-mol5.contcar
│       ├── ni-mol5.poscar
│       ├── ru-mol1.poscar
│       ├── ru-mol2.contcar
│       ├── *.poscar
│       └── *.contcar
├── Plastics
│   └── structures
│       ├── *.poscar
│       └── *.contcar
├── Polyurethanes
│   └── structures
│       ├── *.poscar
│       └── *.contcar
├── energies.dat
├── energies_i.dat
└── groups.dat
```

#### Folders

Folders at root may represent the chemical group name of the geometries contained inside.

#### Geometry Names

Initial structures should be labelled with the `.poscar` extension while final structures are labelled with the `.contcar` extension. 
The name of the file will match the sample label and should adhere to the following format:
- `<metal>-<label>.(contcar|poscar)` for adsorbed structures, e.g. `ag-13X1-a.contcar`
- `<metal>-0000.(contcar|poscar)` for metallic surfaces, e.g. `ag-0000.contcar`
- `<label>.(contcar|poscar)` for gases, e.g. `49X6.contcar`


#### energies*.dat Files

The energy files (`energies.dat` and `energies_i.dat`) should contain two columns: the first with the sample names and the second with the DFT energies. It is good practice to include the surface energies, but they will be ignored during the lmdb packing. `energies.dat` contains the DFT energies obtained from the relaxed structures, while `energies_i.dat` contains the energies obtained from the first converged SCF cycle of the relaxation (single-point calculation).

Format example:
```
energies.dat

49X6 -85.18458992
49X7 -85.12457339
49X8 -84.60803269
ag-13X1-a -161.01991739
ag-25X1-a -177.38788953
ag-0000 -125.30436866
```

#### groups.dat File

The `groups.dat` file follows the same structure as the `energies.dat` file, but instead of storing the DFT energies in the second column it stores the group of each sample. If the names of the folders at the root match the group names, the following shell command can be used to generate `groups.dat`:
```bash
for fn in ./*/structures/*.contcar; do
    grp="${fn#./}"
    echo "$(basename "$fn" .contcar) ${grp%%/*}"
done > groups.dat
```

## Variables

In [None]:
from pathlib import Path

# Working directory.
ROOT_DIR = Path("./datasets/")
# Dataset identifier: either "FG" or "BM".
DS = "FG"

# Location of the dataset tarball.
# Set TARBALL to False or None to avoid extraction.
TARBALL = ROOT_DIR / f"{DS}_dataset_lite.tar.xz"
#TARBALL = None
# Dir of the initial dataset.
DS_DIR = ROOT_DIR / f"{DS}_dataset_lite"
# Dir of the output dataset.
DS_DIR_OUT = Path(f"./lmdb/lmdb_{DS}")
# Geometry files to look for: either "contcar" or "poscar".
INITIAL_GEOMETRY = "contcar"
# Splitting options; set SPLIT_CV to False or None to avoid splitting.
SPLIT_CV = {
    "seed": 42,      # Seed that will be used during the random splitting
    "n_splits": 5,   # Number of splits
    "val_size": 1,   # Number of splits in the validation set
    "test_size": 1,  # Number of splits in the test set
}
#SPLIT_CV = None

## Extract Tarball

In [None]:
# Extract tarball to DS_DIR location
if TARBALL:
    import tarfile
    # The context manager closes the archive even if the extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use it with tarballs coming from a trusted source.
    with tarfile.open(TARBALL, mode="r:xz") as tar_ds:
        tar_ds.extractall(DS_DIR)

## Auxiliary Functions

In [None]:
# Read a file with two columns and transform it to a dictionary
def read_two_columns(filename):
    """Read a whitespace-separated text file and return its rows.

    Every line is split on whitespace into a list of columns. Blank lines
    (including a trailing empty line) are skipped, because an empty row
    would make ``dict(read_two_columns(...))`` fail.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A single-use iterator of column lists, one per non-blank line. It
        can be passed to ``dict()`` when every row has exactly two columns.
    """
    with open(filename, "r", encoding="utf-8") as f:
        # The file is read completely here, so the returned iterator stays
        # valid after the file has been closed.
        rows = [columns for columns in map(str.split, f) if columns]
    return iter(rows)

## Read structures and Energies

Read the structures together with their initial and final energies.

In [None]:
from pathlib import Path
from ase.io.vasp import read_vasp, read_vasp_out
from ase.calculators.singlepoint import SinglePointCalculator

# Get energies in files (sample name -> energy, still stored as text)
iener_dict = dict(read_two_columns(DS_DIR/"energies_i.dat"))
fener_dict = dict(read_two_columns(DS_DIR/"energies.dat"))

def get_struct(fname):
    """Read a VASP geometry file and attach its final DFT energy.

    Args:
        fname: ``pathlib.Path`` of the geometry file; its stem is the key
            looked up in ``fener_dict``.

    Returns:
        The structure returned by ``read_vasp`` with a
        ``SinglePointCalculator`` holding the energy from ``energies.dat``.

    Raises:
        KeyError: If the file stem has no entry in ``fener_dict``.
    """
    final = read_vasp(fname)
    # Use the public ``calc`` attribute rather than the private ``_calc``.
    final.calc = SinglePointCalculator(final, energy=float(fener_dict[fname.stem]))
    return final

# Lazy, single-use stream of (sample name, structure) pairs. The paths are
# sorted so that the sample order does not depend on the order in which the
# filesystem lists the files.
strct_map = map(
    lambda d: (d.stem, get_struct(d))
    , sorted(DS_DIR.glob(f"./*/*/*.{INITIAL_GEOMETRY}")))

## Get Groups

In [None]:
from functools import reduce
from itertools import chain

def reduce_grp(d, i):
    match i:
        case (k, v) if k in d: d[k].append(v)
        case (k, v): d[k] = [v]
    return d

# Sample name -> group, exactly as listed in groups.dat.
groups_direct = dict(read_two_columns(DS_DIR/"groups.dat"))
# Lazy stream of (group, name) pairs obtained by swapping every entry.
groups_invert = map(lambda pair: (pair[1], pair[0]), groups_direct.items())

# Group -> list of the sample names that belong to it.
groups_dict = reduce(reduce_grp, groups_invert, {})

## Samples Dictionary

In [None]:
# Use only final energies if contcar is selected
ener_pvt_dict = fener_dict if INITIAL_GEOMETRY == "contcar" else iener_dict

# Apply a filter to avoid collecting the metallic surfaces
def filter_surf(x):
    """Return True when the sample name ``x[0]`` does not contain "0000"."""
    return "0000" not in x[0]

def _sample_entry(pair):
    """Build the ``(name, record)`` entry for one ``(name, structure)`` pair."""
    name, image = pair
    return (name, {
        "name": name,
        "fener": float(fener_dict[name]),
        "iener": float(ener_pvt_dict[name]),
        "image": image,
        "group": groups_direct[name],
    })

# Lazy stream of (name, record) entries for every non-surface structure.
ener_strct_map = map(_sample_entry, filter(filter_surf, strct_map))

## Extract Structures

In [None]:
from ocpmodels.preprocessing import AtomsToGraphs
import torch

# Converter used below to turn each structure into a graph data object.
# NOTE(review): the exact meaning of every keyword is defined by
# ocpmodels' AtomsToGraphs — confirm against its documentation. The code
# below relies on the converted objects exposing `y`, `pos` and
# `edge_index`.
a2g = AtomsToGraphs(
    max_neigh=50,
    radius=6,
    r_energy=True,
    r_forces=False,
    r_distances=False,
    r_fixed=True,
)

def read_entry_extract_features(a2g, strc):
    """Convert one structure with ``a2g`` and attach its atom tags.

    Args:
        a2g: Converter exposing ``convert_all``.
        strc: Structure exposing ``get_tags``.

    Returns:
        The list produced by ``a2g.convert_all([strc])``; its first element
        carries the structure tags as a ``torch.LongTensor`` in ``tags``.
    """
    atom_tags = strc.get_tags()
    converted = a2g.convert_all([strc], disable_tqdm=True)
    first = converted[0]
    first.tags = torch.LongTensor(atom_tags)
    return converted

def model_dict(xs):
    """Convert ``(key, sample)`` pairs into a dict of graph data objects.

    Every sample is a mapping with the keys ``image``, ``iener``, ``name``
    and ``group``. The structure in ``image`` is converted with the
    module-level ``a2g`` converter and annotated with its energies, a
    running ``sid``, its name and its group. Samples whose graph has no
    edges are reported and skipped; they do not consume a ``sid``.

    Args:
        xs: Iterable of ``(key, sample)`` pairs.

    Returns:
        Dictionary mapping every kept key to its annotated data object.
    """
    converted = {}
    next_sid = 0
    for key, sample in xs:
        graph = read_entry_extract_features(a2g, sample['image'])[0]

        # Move the energy read by the converter to `y_relaxed` and store
        # the initial energy next to it.
        graph.y_init = sample["iener"]
        graph.y_relaxed = graph.y
        del graph.y
        # As we are performing a IS2RE the final structure is not needed.
        graph.pos_relaxed = graph.pos

        graph.sid = next_sid
        # Saving name and group for later identification.
        graph.name = sample["name"]
        graph.group = sample["group"]

        if graph.edge_index.shape[1] == 0:
            print("no neighbors", next_sid)
            continue
        converted[key] = graph
        next_sid += 1
    return converted

In [None]:
# Build the graph objects. `ener_strct_map` is a lazy, single-use map, so it
# is exhausted by this call; re-running only this cell yields an empty dict.
ase_dict = model_dict(ener_strct_map)

In [None]:
# Report how many samples ended up in ase_dict.
print(f"Samples in the dataset: {len(ase_dict)}")

## Process Structures

## Split Sets

The algorithm below generates a nested cross validation set of sets equally splitting the dataset samples by chemical group.

The behavior of the algorithm can be simplified as:
1) Randomly shuffle the names in each group, using `seed` as the seed of the RNG.
2) Split the samples of each chemical group into `n_splits` slices:
    - `Dict[name, sample] -> Dict[group, [name]] -> n -> [[value]]`
3) From these splits, generate the `n` unique combinations available by taking `val_size` slices as the validation set, `test_size` slices as the test set and the remaining as the training set: 
    - `[[value]] -> n -> n -> [([value], [value], [value])]`


*Note: Step **1.** is impure and changes the order of the samples stored in `groups_dict`*

In [None]:
def split_ds(ase_dict, groups_dict, seed=42, n_splits=5, val_size=1, test_size=1):
    """Generate nested cross-validation splits balanced by chemical group.

    The names of every group are shuffled, the names that exist in
    ``ase_dict`` are divided into ``n_splits`` slices, and the i-th slices
    of all the groups are merged. Every way of picking ``val_size`` slices
    for validation and ``test_size`` disjoint slices for testing yields one
    split; the remaining slices form the training set.

    Slices are tracked by position instead of being stored in a ``set``, so
    the order of the returned splits is reproducible for a given ``seed``
    and empty slices are not merged together.

    Args:
        ase_dict: Mapping from sample name to sample.
        groups_dict: Mapping from group to a list of sample names. The
            lists are shuffled IN PLACE.
        seed: Seed given to ``random.seed`` (global RNG) before shuffling.
        n_splits: Number of slices the samples are divided into.
        val_size: Number of slices in the validation set.
        test_size: Number of slices in the test set.

    Returns:
        A lazy, single-use iterator of ``(train, val, test)`` tuples, each
        element being a tuple of samples taken from ``ase_dict``.
    """
    from itertools import chain, combinations, product
    import random
    from numpy import array_split

    random.seed(seed)

    # Randomly shuffle the values stored in groups dict
    for names in groups_dict.values():
        random.shuffle(names)

    # Split the groups entries equally into n_splits slices, merging the
    # i-th part of every group into the i-th slice.
    slices = [[] for _ in range(n_splits)]
    for names in groups_dict.values():
        # Filter structures that are in groups_dict but are not present
        # in ase_dict
        present = [name for name in names if name in ase_dict]
        for bucket, part in zip(slices, array_split(present, n_splits)):
            bucket.extend(ase_dict[name] for name in part)
    slices = tuple(map(tuple, slices))

    fold_ids = range(n_splits)

    def gather(ids):
        # Chain the selected slices into a single flat tuple of samples.
        return tuple(chain.from_iterable(slices[i] for i in ids))

    # Discard combinations that lead to intersections between validation
    # and test datasets.
    val_test_comb = (
        (val_ids, test_ids)
        for val_ids, test_ids in product(combinations(fold_ids, val_size),
                                         combinations(fold_ids, test_size))
        if set(val_ids).isdisjoint(test_ids)
    )

    # Chain the slices into training, validation and test
    return (
        (gather(i for i in fold_ids if i not in val_ids and i not in test_ids)
         , gather(val_ids)
         , gather(test_ids))
        for val_ids, test_ids in val_test_comb
    )

## Write data to LMDB

Write the three datasets into the lmdb format

In [None]:
import lmdb
import pickle
from pathlib import Path

def dump_db(xs, db_path):
    """Pickle every item of ``xs`` into an LMDB file at ``db_path``.

    Items are stored under their position in ``xs`` ("0", "1", ...) encoded
    as ASCII, pickled with the highest available protocol. All records are
    written in a single transaction, which is committed when the loop ends
    and aborted if an error is raised; the environment is closed in both
    cases.

    Args:
        xs: Iterable of picklable objects.
        db_path: Path of the LMDB file to write (a file, not a directory).
    """
    db = lmdb.open(
        str(db_path),
        map_size=1099511627776 * 2,
        subdir=False,
        meminit=False,
        map_async=True,
    )
    try:
        with db.begin(write=True) as txn:
            for idx, value in enumerate(xs):
                txn.put(f"{idx}".encode("ascii"), pickle.dumps(value, protocol=-1))
        # The environment is opened with map_async=True, so flush explicitly.
        db.sync()
    finally:
        db.close()

In [None]:
from pathlib import Path
from os import makedirs

def mkdir_p(p):
    """Create the directory ``p`` (a ``Path``) and its missing parents.

    Nothing happens when the directory already exists.
    """
    p.mkdir(parents=True, exist_ok=True)

# Write three different lmdb for each of the splittings.
if SPLIT_CV:
    splitted_sets = split_ds(ase_dict, groups_dict, **SPLIT_CV)
    for idx, n_set in enumerate(splitted_sets):
        # split_ds yields (train, val, test) in this order; unpacking it as
        # (train, test, val) would write each set under the other's name.
        train, val, test = n_set
        dpath = DS_DIR_OUT/str(idx)
        mkdir_p(dpath)
        dump_db(train, dpath/"train.lmdb")
        dump_db(test, dpath/"test.lmdb")
        dump_db(val, dpath/"val.lmdb")
else:
    # Without splitting, the whole dataset is written as a single lmdb.
    dpath = DS_DIR_OUT
    mkdir_p(dpath)
    dump_db(ase_dict.values(), dpath/"test.lmdb")

## Metrics

### Compute Metrics

In [None]:
import numpy as np
import pandas as pd
from ocpmodels.datasets import SinglePointLmdbDataset
from pathlib import Path

# With splitting, every lmdb lives in a numbered sub-directory of the
# output dir; otherwise the lmdb files sit directly inside it.
target_glob = "./*/*.lmdb" if SPLIT_CV else "./*.lmdb"

# Extract some useful metrics from the datasets.
# Reading the datasets back is not strictly needed, since the same numbers
# could be computed before writing them. However, errors are easily
# detected during this step, so it doubles as a sanity check.
def get_metrics(lmdb_ds_path):
    """Summarise the ``y_relaxed`` values stored in one lmdb dataset.

    Args:
        lmdb_ds_path: ``pathlib.Path`` of the lmdb file to read.

    Returns:
        Dictionary with the mean and standard deviation of the energies,
        the name of the parent directory (``idx``), the file stem
        (``split``), the path itself and the number of samples.
    """
    dataset = SinglePointLmdbDataset({"src": str(lmdb_ds_path)})
    energies = np.asarray([sample.y_relaxed for sample in dataset], dtype=float)
    return {
        "mean": np.mean(energies),
        "std": np.std(energies),
        "idx": lmdb_ds_path.parent.name,
        "split": lmdb_ds_path.stem,
        "path": lmdb_ds_path,
        "samples": energies.shape[0],
    }

# One row of metrics per lmdb file found in the output directory.
# `target_glob` already holds the pattern matching the SPLIT_CV setting;
# the paths are sorted so the row order is stable between runs.
metrics_df = pd.DataFrame(
    [get_metrics(path) for path in sorted(DS_DIR_OUT.glob(target_glob))])

In [None]:
from collections import deque

# Write one metrics.csv per `idx` group, next to the lmdb files it
# describes. A plain loop is used because only the side effect matters.
for _, fold_df in metrics_df.groupby("idx")[["split", "path", "mean", "std"]]:
    fold_df.to_csv(Path(fold_df["path"].iloc[0]).parent/"metrics.csv"
                   , header=False
                   , index=False)

print(f"Written metrics for the ds in {DS_DIR_OUT}")

### Show Metrics

In [None]:
# Display the metrics table (one row per lmdb file read by get_metrics).
metrics_df