### OCP Data Preprocessing Tutorial


This notebook provides an overview of converting ASE Atoms objects to PyTorch Geometric Data objects. To better understand the raw data contained within OC20, check out the following tutorial first: https://github.com/Open-Catalyst-Project/ocp/blob/master/docs/source/tutorials/data_playground.ipynb

In [6]:
from ocpmodels.preprocessing import AtomsToGraphs
import ase.io
from ase.build import bulk
from ase.build import fcc100, add_adsorbate, molecule
from ase.constraints import FixAtoms
from ase.calculators.emt import EMT
from ase.optimize import BFGS
import torch
from torch_geometric.data import Data

### Generate toy dataset: Relaxation of CO on Cu

In [22]:
adslab = fcc100("Cu", size=(2, 2, 3))
ads = molecule("CO")
add_adsorbate(adslab, ads, 3, offset=(1, 1))
cons = FixAtoms(indices=[atom.index for atom in adslab if (atom.tag == 3)])
adslab.set_constraint(cons)
adslab.center(vacuum=13.0, axis=2)
adslab.set_pbc(True)
adslab.set_calculator(EMT())
dyn = BFGS(adslab, trajectory="CuCO_adslab.traj", logfile=None)
dyn.run(fmax=0, steps=1000)

False

In [27]:
raw_data = ase.io.read("CuCO_adslab.traj", ":")
print(len(raw_data))

1001


### Convert Atoms object to Data object

The AtomsToGraphs class takes in several arguments to control how Data objects created:

- max_neigh (int):   Maximum number of neighbors a given atom is allowed to have, discarding the furthest
- radius (float):      Cutoff radius to compute nearest neighbors around
- r_energy (bool):    Write energy to Data object
- r_forces (bool):    Write forces to Data object
- r_distances (bool): Write distances between neighbors to Data object
- r_edges (bool):     Write neigbhor edge indices to Data object
- r_fixed (bool):     Write indices of fixed atoms to Data object

In [28]:
a2g = AtomsToGraphs(
    max_neigh=50,
    radius=6,
    r_energy=True,
    r_forces=True,
    r_distances=False,
    r_edges=True,
    r_fixed=True,
)

In [29]:
data_objects = a2g.convert_all(raw_data, disable_tqdm=True)

In [30]:
data = data_objects[0]
data

Data(atomic_numbers=[14], cell=[1, 3, 3], cell_offsets=[636, 3], edge_index=[2, 636], fixed=[14], force=[14, 3], natoms=14, pos=[14, 3], y=3.9893144106684715)

In [31]:
data.atomic_numbers

tensor([29., 29., 29., 29., 29., 29., 29., 29., 29., 29., 29., 29.,  8.,  6.])

In [32]:
data.cell

tensor([[[ 5.1053,  0.0000,  0.0000],
         [ 0.0000,  5.1053,  0.0000],
         [ 0.0000,  0.0000, 32.6100]]])

In [33]:
data.edge_index #neighbor idx, source idx

tensor([[ 1,  2,  2,  ...,  4,  6,  3],
        [ 0,  0,  0,  ..., 13, 13, 13]])

In [34]:
from torch_geometric.utils import degree
# Degree corresponds to the number of neighbors a given node has. Note there is no more than max_neigh neighbors for
# any given node.

degree(data.edge_index[1]) 

tensor([45., 45., 45., 46., 49., 49., 49., 49., 50., 49., 49., 50., 26., 35.])

In [35]:
data.fixed

tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [36]:
data.force

tensor([[ 2.3250e-15,  1.8807e-15,  1.1354e-01],
        [ 9.0249e-16,  1.1050e-15,  1.1344e-01],
        [ 5.2822e-15,  2.9421e-15,  1.1344e-01],
        [-3.4399e-17,  6.2746e-17,  1.1294e-01],
        [-8.5221e-03, -8.5221e-03, -1.1496e-02],
        [ 8.5221e-03, -8.5221e-03, -1.1496e-02],
        [-8.5221e-03,  8.5221e-03, -1.1496e-02],
        [ 8.5221e-03,  8.5221e-03, -1.1496e-02],
        [ 8.5001e-16, -8.4308e-16, -1.0431e-01],
        [-2.0583e-15, -4.5797e-16, -6.6610e-02],
        [-5.5511e-17, -5.8287e-16, -6.6610e-02],
        [-1.7780e-15, -2.5274e-15, -3.3250e-01],
        [-4.2690e-19, -8.6059e-19, -3.4247e-01],
        [-4.3368e-17, -2.4286e-17,  5.0512e-01]])

In [37]:
data.pos

tensor([[ 0.0000,  0.0000, 13.0000],
        [ 2.5527,  0.0000, 13.0000],
        [ 0.0000,  2.5527, 13.0000],
        [ 2.5527,  2.5527, 13.0000],
        [ 1.2763,  1.2763, 14.8050],
        [ 3.8290,  1.2763, 14.8050],
        [ 1.2763,  3.8290, 14.8050],
        [ 3.8290,  3.8290, 14.8050],
        [ 0.0000,  0.0000, 16.6100],
        [ 2.5527,  0.0000, 16.6100],
        [ 0.0000,  2.5527, 16.6100],
        [ 2.5527,  2.5527, 16.6100],
        [ 2.5527,  2.5527, 19.6100],
        [ 2.5527,  2.5527, 18.4597]])

In [38]:
data.y

3.9893144106684715

### Adding additional info to your Data objects

In addition to the above information, the OCP repo requires several other pieces of information for your data to work
with the provided trainers:

- sid (int): A unique identifier for a particular system. Does not affect your model performance, used for prediction saving 
- fid (int) (S2EF only): If training for the S2EF task, your data must also contain a unique frame identifier for atoms objects coming from the same system.
- tags (tensor): Tag information - 0 for adsorbate, 1 for surface, 2 for subsurface. Optional, can be used for training.


Other information may be added her as well if you choose to incorporate other information in your models/frameworks

In [39]:
data_objects = []
for idx, system in enumerate(raw_data):
    data = a2g.convert(system)
    data.fid = idx
    data.sid = 0 # All data points come from the same system, arbitrarly define this as 0
    data_objects.append(data)

In [40]:
data = data_objects[100]
data

Data(atomic_numbers=[14], cell=[1, 3, 3], cell_offsets=[635, 3], edge_index=[2, 635], fid=100, fixed=[14], force=[14, 3], natoms=14, pos=[14, 3], sid=0, y=3.968355893395698)

In [41]:
data.sid

0

In [42]:
data.fid

100

In [64]:
# Helper function to convert from PyTorch Geometric input to GROVER input:
def convert_input(data):
    """
        :param data: data as PyTorch geometric object
        :param f_atoms: the atom features, num_atoms * atom_dim
        :param f_bonds: the bond features, num_bonds * bond_dim
        :param a2b: mapping from atom index to incoming bond indices. 
        :param a2a: mapping from atom index to its neighbors. num_atoms * max_num_bonds
        :param b2a: mapping from bond index to the index of the atom the bond is coming from.
        :param b2revb: mapping from bond index to the index of the reverse bond.
        :return: batch = (f_atoms, f_bonds, a2b, a2a, b2a, b2revb)
    """
    # Per atom features: (atomic_number, pos_x, pos_y, pos_z)
    f_atoms = torch.stack((data.atomic_numbers.long(), data.pos[:,0], data.pos[:,1], data.pos[:,2]), 1)
    # Per edge features (calculated by atomic distances in model forward pass)
    f_bonds = data.edge_attr

    a2a = [[] for j in range(data.natoms)] # List of lists - Dynamically append neighbors for a given atom
    a2b = [[] for j in range(data.natoms)] # List of lists - Dynamically append edges for a given atom
    b2a = torch.zeros((data.edge_index.shape[1],))  # (num_edges, ) - One originating atom per edge
    b2revb = torch.zeros((data.edge_index.shape[1],))  # (num_edges, ) - One reverse bond per bond
    rev_edges = {} # Dict of lists for each (from_atom, to_atom) pair, saving edge numbers

    # # we want to convert this edge_index to a list of lists/tensors. [[1, 2], [0], [0]].
    # edge_index = torch.tensor([[0, 0, 1, 2], [1, 2, 0, 0]])
    # natoms = 3 # no. of atoms in the system
    _, idx = torch.unique(data.edge_index[0,:], return_counts=True) # idx is tensor([2, 1, 1])
    from_to_atoms = data.edge_index[1].split(idx.tolist()) # from_to_atoms is (tensor([1, 2]), tensor([0]), tensor([0]))
    _, idx = torch.unique(data.edge_index[1], return_counts=True)
    print("idx: ", idx)
    
    
    for i in range(data.edge_index.shape[1]):
        from_atom = int(data.edge_index[0][i])
        to_atom = int(data.edge_index[1][i])
        if to_atom not in a2a[from_atom]:
            a2a[from_atom].append(to_atom) # Mark b as neighbor of a
        if i not in a2b[from_atom]:
            a2b[from_atom].append(i) # Mark bond i as outgoing bond from atom a
        b2a[i] = from_atom # Mark a as atom where bond i is originating
        key = frozenset({to_atom, from_atom})
        if(key not in rev_edges): # If the edge from these two atoms has not been seen yet
            rev_edges[key] = [] # Declare it as a list (so we can keep track of the edge numbers)
        rev_edges[key].append(i) # Append the edge number to the list
    
    print("a2a: ", a2a[0])
    print("from to atoms: ", from_to_atoms[0])
    print("a2a: ", a2a[1])
    print("from to atoms: ", from_to_atoms[1])
    # Iterate through and set b2revb
    for atoms, edges in rev_edges.items():
        b2revb[edges[0]] = edges[1]
        b2revb[edges[1]] = edges[0]


    # Convert list of lists for a2a and a2b into tensor: (num_nodes, max_edges)
    # Option 1: Trims length to max number of edges seen in the data (<= 50)
    a2a_pad = len(max(a2a, key=len))
    a2b_pad = len(max(a2b, key=len))

    # Option 2: Sets length to max number of possible edges (should be 50 but in test ipynb it's 55)
    # a2a_pad = 50
    # a2b_pad = 50

    # -1 is not a valid atom or edge index so we pad with this
    a2a = torch.tensor([i + [-1] * (a2a_pad - len(i)) for i in a2a])
    a2b = torch.tensor([i + [-1] * (a2b_pad - len(i)) for i in a2b])

    batch = (f_atoms, f_bonds, a2b, a2a, b2a, b2revb)
    return batch

In [90]:
print("number of atoms: ", data.natoms)
print("number of edges: ", len(data.edge_index[0]))
print("edge index: ", data.edge_index)

batch = convert_input(data)
f_atoms, f_bonds, a2b, a2a, b2a, b2revb = batch

_, idx = torch.unique(data.edge_index[1], return_counts=True)

print("idx: ", idx)
print(data.edge_index[1, 0])
print(data.edge_index[1, 1])
print(data.edge_index[1, idx[0]-1])
b2a = torch.zeros((data.edge_index.shape[1],))  # (num_edges, ) - One originating atom per edge
for i in range(len(idx)):
    print(i)
    if i == 0:
        start_index = 0
        b2a[:idx[i]] = i
    end_index = start_index + idx[i] - 1
    b2a[start_index:end_index] = i
    start_index = start_index + idx[i]
b2a



# print("\nShapes of parameters")
# print("f_atoms: ", f_atoms.shape)
# print("f_bonds: ", f_bonds.shape)
# print("a2b: ", a2b.shape)
# print("a2a: ", a2a.shape)
# print("b2a: ", b2a.shape)
# print("b2revb: ", b2revb.shape)

# print("\nExample data")
# print("Atom features index 0: ", f_atoms[0])
# print("Edge features index 0: ", f_bonds[0])
# print("a2b atom 0: ", a2b[0])
# print("a2a atom 0: ", a2a[0])
# print("b2a edge 0: ", b2a[0])
# print("b2revb edge 0: ", b2revb[0])

# for atom in a2b:
#     for bond in atom:
#         print(dtype(bond))
#         if(bond != torch.tensor(-1)):
#             print("bond: ", bond)


number of atoms:  14
number of edges:  635
edge index:  tensor([[ 5,  7,  6,  ...,  4,  6,  7],
        [ 0,  0,  0,  ..., 13, 13, 13]])
idx:  tensor([45, 45, 45, 46, 49, 49, 49, 49, 49, 49, 49, 50, 26, 35])
a2a:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
from to atoms:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
a2a:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
from to atoms:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
idx:  tensor([45, 45, 45, 46, 49, 49, 49, 49, 49, 49, 49, 50, 26, 35])
tensor(0)
tensor(0)
tensor(0)
0
1
2
3
4
5
6
7
8
9
10
11
12
13


tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 1

In [72]:
# Generate example data (3 nodes)
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

atomic_numbers = data.atomic_numbers[11:14]

edge_attr = data.edge_attr[:3]

pos = data.pos[:3]

data2 = Data(x=x, edge_index=edge_index, atomic_numbers=atomic_numbers, edge_attr=edge_attr, natoms=3, pos=pos)

In [25]:
data2

NameError: name 'data2' is not defined

In [26]:
batch = convert_input(data2)
f_atoms, f_bonds, a2b, a2a, b2a, b2revb = batch
print("Shapes of parameters")
print("f_atoms: ", f_atoms.shape)
print("f_bonds: ", f_bonds.shape)
print("a2b: ", a2b.shape)
print("a2a: ", a2a.shape)
print("b2a: ", b2a.shape)
print("b2revb: ", b2revb.shape)

print("\nExample data")
print("Atom features: ", f_atoms)
print("Edge features: ", f_bonds)
print("a2b: ", a2b)
print("a2a: ", a2a)
print("b2a: ", b2a)
print("b2revb: ", b2revb)


NameError: name 'data2' is not defined

Resources:

- https://github.com/Open-Catalyst-Project/ocp/blob/6604e7130ea41fabff93c229af2486433093e3b4/ocpmodels/preprocessing/atoms_to_graphs.py
- https://github.com/Open-Catalyst-Project/ocp/blob/master/scripts/preprocess_ef.py

In [None]:
print("test")