### OCP Data Preprocessing Tutorial


This notebook provides an overview of converting ASE Atoms objects to PyTorch Geometric Data objects. To better understand the raw data contained within OC20, check out the following tutorial first: https://github.com/Open-Catalyst-Project/ocp/blob/master/docs/source/tutorials/data_playground.ipynb

In [None]:
from ocpmodels.preprocessing import AtomsToGraphs
import ase.io
from ase.build import bulk
from ase.build import fcc100, add_adsorbate, molecule
from ase.constraints import FixAtoms
from ase.calculators.emt import EMT
from ase.optimize import BFGS
import torch
from torch_geometric.data import Data

### Generate toy dataset: Relaxation of CO on Cu

In [None]:
adslab = fcc100("Cu", size=(2, 2, 3))
ads = molecule("CO")
add_adsorbate(adslab, ads, 3, offset=(1, 1))
cons = FixAtoms(indices=[atom.index for atom in adslab if (atom.tag == 3)])
adslab.set_constraint(cons)
adslab.center(vacuum=13.0, axis=2)
adslab.set_pbc(True)
adslab.set_calculator(EMT())
dyn = BFGS(adslab, trajectory="CuCO_adslab.traj", logfile=None)
dyn.run(fmax=0, steps=1000)

In [None]:
raw_data = ase.io.read("CuCO_adslab.traj", ":")
print(len(raw_data))

### Convert Atoms object to Data object

The AtomsToGraphs class takes in several arguments to control how Data objects created:

- max_neigh (int):   Maximum number of neighbors a given atom is allowed to have, discarding the furthest
- radius (float):      Cutoff radius to compute nearest neighbors around
- r_energy (bool):    Write energy to Data object
- r_forces (bool):    Write forces to Data object
- r_distances (bool): Write distances between neighbors to Data object
- r_edges (bool):     Write neigbhor edge indices to Data object
- r_fixed (bool):     Write indices of fixed atoms to Data object

In [None]:
a2g = AtomsToGraphs(
    max_neigh=50,
    radius=6,
    r_energy=True,
    r_forces=True,
    r_distances=False,
    r_edges=True,
    r_fixed=True,
)

In [None]:
data_objects = a2g.convert_all(raw_data, disable_tqdm=True)

In [None]:
data = data_objects[0]
data

In [492]:
data.atomic_numbers

tensor([29., 29., 29., 29., 29., 29., 29., 29., 29., 29., 29., 29.,  8.,  6.])

In [493]:
data.cell

tensor([[[ 5.1053,  0.0000,  0.0000],
         [ 0.0000,  5.1053,  0.0000],
         [ 0.0000,  0.0000, 32.6100]]])

In [494]:
data.edge_index #neighbor idx, source idx

tensor([[ 5,  7,  6,  ...,  4,  6,  7],
        [ 0,  0,  0,  ..., 13, 13, 13]])

In [495]:
data.fixed

tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [496]:
data.force

tensor([[ 2.2306e-15,  1.4246e-16, -1.6429e-02],
        [ 2.2372e-15,  1.8440e-15, -6.4668e-04],
        [ 2.5206e-15,  5.8791e-15, -6.4668e-04],
        [-1.8058e-15, -4.0592e-15,  1.7722e-02],
        [ 1.2381e-14,  2.5091e-15, -1.3239e-15],
        [-1.2242e-14, -1.0446e-14,  3.2344e-15],
        [-3.1921e-15,  4.7090e-15,  6.3805e-15],
        [ 5.3794e-15, -3.5640e-15,  5.0293e-15],
        [ 5.6747e-15, -2.7279e-16, -6.8647e-15],
        [-2.9879e-15, -1.2906e-15, -6.5919e-15],
        [ 3.8997e-15,  7.8866e-15,  1.3808e-15],
        [-6.4576e-15, -4.2618e-15,  1.1362e-14],
        [ 4.6960e-18, -1.0266e-17, -4.7148e-14],
        [-1.3401e-16,  2.1034e-16,  3.8341e-14]])

In [497]:
data.pos

tensor([[ 0.0000e+00,  0.0000e+00,  1.3000e+01],
        [ 2.5527e+00,  0.0000e+00,  1.3000e+01],
        [ 0.0000e+00,  2.5527e+00,  1.3000e+01],
        [ 2.5527e+00,  2.5527e+00,  1.3000e+01],
        [ 1.2741e+00,  1.2741e+00,  1.4778e+01],
        [ 3.8312e+00,  1.2741e+00,  1.4778e+01],
        [ 1.2741e+00,  3.8312e+00,  1.4778e+01],
        [ 3.8312e+00,  3.8312e+00,  1.4778e+01],
        [-7.6910e-16, -1.9714e-15,  1.6562e+01],
        [ 2.5527e+00, -1.3052e-15,  1.6567e+01],
        [ 1.2551e-15,  2.5527e+00,  1.6567e+01],
        [ 2.5527e+00,  2.5527e+00,  1.6539e+01],
        [ 2.5527e+00,  2.5527e+00,  1.9567e+01],
        [ 2.5527e+00,  2.5527e+00,  1.8443e+01]])

In [498]:
data.y

3.968355893395698

### Adding additional info to your Data objects

In addition to the above information, the OCP repo requires several other pieces of information for your data to work
with the provided trainers:

- sid (int): A unique identifier for a particular system. Does not affect your model performance, used for prediction saving 
- fid (int) (S2EF only): If training for the S2EF task, your data must also contain a unique frame identifier for atoms objects coming from the same system.
- tags (tensor): Tag information - 0 for adsorbate, 1 for surface, 2 for subsurface. Optional, can be used for training.


Other information may be added her as well if you choose to incorporate other information in your models/frameworks

In [499]:
data_objects = []
for idx, system in enumerate(raw_data):
    data = a2g.convert(system)
    data.fid = idx
    data.sid = 0 # All data points come from the same system, arbitrarly define this as 0
    data_objects.append(data)

In [500]:
data = data_objects[100]
data

Data(atomic_numbers=[14], cell=[1, 3, 3], cell_offsets=[635, 3], edge_index=[2, 635], fid=100, fixed=[14], force=[14, 3], natoms=14, pos=[14, 3], sid=0, y=3.968355893395698)

In [501]:
data.sid

0

In [502]:
data.fid

100

In [503]:
# Helper function to convert from PyTorch Geometric input to GROVER input:
def convert_input(data):
    """
        :param data: data as PyTorch geometric object
        :param f_atoms: the atom features, num_atoms * atom_dim
        :param f_bonds: the bond features, num_bonds * bond_dim
        :param a2b: mapping from atom index to incoming bond indices. 
        :param a2a: mapping from atom index to its neighbors. num_atoms * max_num_bonds
        :param b2a: mapping from bond index to the index of the atom the bond is coming from.
        :param b2revb: mapping from bond index to the index of the reverse bond.
        :return: batch = (f_atoms, f_bonds, a2b, a2a, b2a, b2revb)
    """
    # Per atom features: (atomic_number, pos_x, pos_y, pos_z)
    f_atoms = torch.stack((data.atomic_numbers.long(), data.pos[:,0], data.pos[:,1], data.pos[:,2]), 1)
    # Per edge features (calculated by atomic distances in model forward pass)
    f_bonds = data.edge_attr

    a2a = [[] for j in range(data.natoms)] # List of lists - Dynamically append neighbors for a given atom
    a2b = [[] for j in range(data.natoms)] # List of lists - Dynamically append edges for a given atom
    b2a = torch.zeros((data.edge_index.shape[1],))  # (num_edges, ) - One originating atom per edge
    b2revb = torch.zeros((data.edge_index.shape[1],))  # (num_edges, ) - One reverse bond per bond
    rev_edges = {} # Dict of lists for each (from_atom, to_atom) pair, saving edge numbers

    # Loop through every edge in the graph
    for i in range(data.edge_index.shape[1]):
        from_atom = int(data.edge_index[0][i])
        to_atom = int(data.edge_index[1][i])

        a2a[from_atom].append(to_atom)  # Mark b as neighbor of a
        a2b[to_atom].append(i)  # Mark bond i as incoming bond to atom a
        b2a[i] = from_atom  # Mark a as atom where bond i is originating
        key = frozenset({to_atom, from_atom})
        if (key not in rev_edges):  # If the edge from these two atoms has not been seen yet
            rev_edges[key] = []  # Declare it as a list (so we can keep track of the edge numbers)
        rev_edges[key].append(i)  # Append the edge number to the list

    # Iterate through and set b2revb
    for atoms, edges in rev_edges.items():
        b2revb[edges[0]] = edges[1]
        b2revb[edges[1]] = edges[0]


      # Convert list of lists for a2a and a2b into tensor: (num_nodes, max_edges)
    # Trim length to max number of edges seen in the data (should be capped by 50 but not always in practice)
    a2a_pad = len(max(a2a, key=len))
    a2b_pad = len(max(a2b, key=len))

    # -1 is not a valid atom or edge index so we pad with this
    a2a = torch.tensor([i + [0] * (a2a_pad - len(i)) for i in a2a])
    a2b = torch.tensor([i + [0] * (a2b_pad - len(i)) for i in a2b])

    batch = (f_atoms, f_bonds, a2b, a2a, b2a, b2revb)
    return batch

In [519]:
from torch_geometric.utils import degree
# print(degree(data.edge_index[0]))
# print(torch.max(degree(data.edge_index[0])))


batch = convert_input(data)
f_atoms, f_bonds, a2b, a2a, b2a, b2revb = batch

# Calculate atom to neighboring atom mappings (a2a)
print(data.edge_index)
trans = data.edge_index.T
print(trans)
sorted_index = trans[trans[:, 0].sort()[1]]  # Sort by column zero (from_node), index based off
print(sorted_index)
sorted_index = sorted_index.T
print(sorted_index)


out, counts = torch.unique(sorted_index[0], return_counts=True)  # counts for each atom
print("out: ", out)
out = torch.cat((out[:3], out[4:]))
out = torch.cat((out[:7], out[8:]))
print("out: ", out)
print("counts: ", counts)
counts = torch.cat((counts[:3], counts[4:]))
counts = torch.cat((counts[:7], counts[8:]))
print("counts: ", counts)

if len(out) - 1 != out[-1]: # If they are the correct length then we don't need to add 0's
    ascending = torch.arange(out[-1] + 1) # Ascending order list [0, 1, ... n] where n is last elem in out
    print("ascending: ", ascending)
    missing = [ elem not in out for elem in ascending ] # Missing elements in out
    print("missing: ", missing)
    print("np.nonzero: ", np.nonzero(missing)[0])
    for elem in np.nonzero(missing)[0]:
        i = int(elem)
        print("i: ", i)
        print("counts: ", counts)
        counts = torch.cat((counts[:i], torch.tensor((0,)), counts[i:])) # Add zero to count
        print("counts: ", counts)
        
a2a2 = sorted_index[1].split(counts.tolist())  # Index into to_bonds with these indices

a2a2 = torch.nn.utils.rnn.pad_sequence(list(a2a2), batch_first=True, padding_value=0)  

print("a2a2.shape", a2a2.shape)
print("a2a.shape", a2a.shape)
print("a2a[0]: ", a2a[0])
print("a2a2[0]: ", a2a2[0])
print("\na2a and a2a2 equal: ", torch.equal(a2a, a2a2))

tensor([[ 5,  7,  6,  ...,  4,  6,  7],
        [ 0,  0,  0,  ..., 13, 13, 13]])
tensor([[ 5,  0],
        [ 7,  0],
        [ 6,  0],
        ...,
        [ 4, 13],
        [ 6, 13],
        [ 7, 13]])
tensor([[ 0,  7],
        [ 0,  2],
        [ 0,  2],
        ...,
        [13,  3],
        [13, 11],
        [13,  6]])
tensor([[ 0,  0,  0,  ..., 13, 13, 13],
        [ 7,  2,  2,  ...,  3, 11,  6]])
out:  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])
out:  tensor([ 0,  1,  2,  4,  5,  6,  7,  9, 10, 11, 12, 13])
counts:  tensor([45, 45, 45, 46, 48, 49, 49, 50, 53, 46, 47, 55, 22, 35])
counts:  tensor([45, 45, 45, 48, 49, 49, 50, 46, 47, 55, 22, 35])
ascending:  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])
missing:  [False, False, False, True, False, False, False, False, True, False, False, False, False, False]
np.nonzero:  [3 8]
i:  3
counts:  tensor([45, 45, 45, 48, 49, 49, 50, 46, 47, 55, 22, 35])
counts:  tensor([45, 45, 45,  0, 48, 49, 49, 

RuntimeError: split_with_sizes expects split_sizes to sum exactly to 635 (input tensor's size at dimension 0), but got split_sizes=[45, 45, 45, 0, 48, 49, 49, 50, 0, 46, 47, 55, 22, 35]

In [None]:
# # Efficient b2a calculation
# b2a2 = data.edge_index[0].type(torch.FloatTensor)
# print("\nb2a and b2a2 equal: ", torch.equal(b2a2, b2a))

# # Efficient a2b calculation
# num_atoms_total = data.natoms
# count, idx = torch.unique(data.edge_index[1], return_counts=True)
# max_bonds = int(torch.max(degree(data.edge_index[1])))

# a2b1 = torch.zeros(num_atoms_total, max_bonds)
# for i in range(len(idx)):
#     if i == 0:
#         start_index = 0
#         indices = torch.arange(0, idx[i])
#         a2b1[i] = torch.cat((indices, torch.zeros(len(a2b1[0]) - len(indices))), 0)
#     else:
#         end_index = start_index + idx[i]
#         indices = torch.arange(start_index, end_index)
#         a2b1[i] = torch.cat((indices, torch.zeros(len(a2b1[0]) - len(indices))), 0)
#     start_index = start_index + idx[i]    
# a2b1 = a2b1.type(torch.LongTensor)

# print("\na2b and a2b1 equal? ", torch.equal(a2b, a2b1))

# TODO: efficient b2revb calculation



# print("\nShapes of parameters")
# print("f_atoms: ", f_atoms.shape)
# print("f_bonds: ", f_bonds.shape)
# print("a2b: ", a2b.shape)
# print("a2a: ", a2a.shape)
# print("b2a: ", b2a.shape)
# print("b2revb: ", b2revb.shape)

# print("\nExample data")
# print("Atom features index 0: ", f_atoms[0])
# print("Edge features index 0: ", f_bonds[0])
# print("a2b atom 0: ", a2b[0])
# print("a2a atom 0: ", a2a[0])
# print("b2a edge 0: ", b2a[0])
# print("b2revb edge 0: ", b2revb[0])

# for atom in a2b:
#     for bond in atom:
#         print(dtype(bond))
#         if(bond != torch.tensor(-1)):
#             print("bond: ", bond)

In [None]:
# Generate example data (3 nodes)
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

atomic_numbers = data.atomic_numbers[11:14]

edge_attr = data.edge_attr[:3]

pos = data.pos[:3]

data2 = Data(x=x, edge_index=edge_index, atomic_numbers=atomic_numbers, edge_attr=edge_attr, natoms=3, pos=pos)

In [None]:
data2

In [None]:
batch = convert_input(data2)
f_atoms, f_bonds, a2b, a2a, b2a, b2revb = batch
print("Shapes of parameters")
print("f_atoms: ", f_atoms.shape)
print("f_bonds: ", f_bonds.shape)
print("a2b: ", a2b.shape)
print("a2a: ", a2a.shape)
print("b2a: ", b2a.shape)
print("b2revb: ", b2revb.shape)

print("\nExample data")
print("Atom features: ", f_atoms)
print("Edge features: ", f_bonds)
print("a2b: ", a2b)
print("a2a: ", a2a)
print("b2a: ", b2a)
print("b2revb: ", b2revb)


Resources:

- https://github.com/Open-Catalyst-Project/ocp/blob/6604e7130ea41fabff93c229af2486433093e3b4/ocpmodels/preprocessing/atoms_to_graphs.py
- https://github.com/Open-Catalyst-Project/ocp/blob/master/scripts/preprocess_ef.py

In [None]:
print("test")