In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import ase
from ase import Atoms, Atom

import tensorflow as tf
from tensorflow.keras import layers

from prepare_data import (
    create_grid, 
    deserialize_df, 
    filter_out_single_atom_molecules,
    create_distance_matrices, 
    create_expanded_distance_matrix,
    get_expanded_distance_matrix
)



In [2]:
df0 = pd.read_pickle("./../Datasets/Cu_df2_1k.pkl.gzip", compression="gzip")
atoms = df0["ase_atoms"]
a = Atoms(atoms[800])
print(a.positions)
df0

[[ 0.        0.        0.      ]
 [ 1.613117  1.613117  1.613117]
 [ 0.        0.        3.226233]
 [ 1.613117  1.613117  4.83935 ]
 [ 0.        0.        6.452467]
 [ 1.613117  1.613117  8.065583]
 [ 0.        0.        9.6787  ]
 [ 1.613117  1.613117 11.291817]]


Unnamed: 0,name,ase_atoms,energy_corrected,forces,energy_corrected_per_atom
0,A1:FHI-aims/PBE/tight:elastic:s_e_0,"(Atom('Cu', [0.0, 0.0, 0.0], index=0))",-3.699843,"[[0.0, 0.0, 0.0]]",-3.699843
1,A1:FHI-aims/PBE/tight:murnaghan:strain_1_0,"(Atom('Cu', [0.0, 0.0, 0.0], index=0))",-3.699841,"[[0.0, 0.0, 0.0]]",-3.699841
2,A1:FHI-aims/PBE/tight:phonon:supercell_phonon_0,"(Atom('Cu', [4.3368086899420173e-19, 0.0070709...",-236.789603,"[[-1.13852957740976e-06, -0.0464638907314277, ...",-3.699838
3,A1:FHI-aims/PBE/tight:murnaghan:strain_1_02,"(Atom('Cu', [0.0, 0.0, 0.0], index=0))",-3.697932,"[[0.0, 0.0, 0.0]]",-3.697932
4,A1:FHI-aims/PBE/tight:qha:phonopy_strain_1_02:...,"(Atom('Cu', [0.0, 0.007070999999999999, 0.0070...",-236.667372,"[[-1.7950713845026e-06, -0.0426786586259085, -...",-3.697928
...,...,...,...,...,...
995,Zr4Al3,"(Atom('Cu', [1.397, 0.806558, 0.0], index=0), ...",-18.922813,"[[-6.18422096975777e-08, 3.90650715661433e-08,...",-2.703259
996,fcc_100surf_X110_Y-110_Z001_4at_d0.95,"(Atom('Cu', [0.0, 0.0, 0.0], index=0), Atom('C...",-10.811770,"[[-4.30878346990838e-07, -7.31495829958533e-08...",-2.702942
997,ran2.7197,"(Atom('Cu', [0.0, 0.0, 0.0], index=0), Atom('C...",-5.405739,"[[-0.233777024216308, -0.488775610406501, 0.45...",-2.702869
998,A15.shakesmallsuper2.14,"(Atom('Cu', [19.532944999999994, -0.449866, 19...",-172.952480,"[[0.872967501214253, 0.798687481685555, 0.8365...",-2.702382


In [3]:
df0 = deserialize_df(df0)
grid = create_grid(df0["ase_atoms"][800], 1)
grid

[GridPoint(index: (0, 0, 0), atoms: Atom('Cu', [0.0, 0.0, 0.0], index=0)),
 GridPoint(index: (1, 1, 1), atoms: Atom('Cu', [1.613117, 1.613117, 1.613117], index=1)),
 GridPoint(index: (0, 0, 3), atoms: Atom('Cu', [0.0, 0.0, 3.2262330000000006], index=2)),
 GridPoint(index: (1, 1, 4), atoms: Atom('Cu', [1.613117, 1.613117, 4.8393500000000005], index=3)),
 GridPoint(index: (0, 0, 6), atoms: Atom('Cu', [0.0, 0.0, 6.452467], index=4)),
 GridPoint(index: (1, 1, 8), atoms: Atom('Cu', [1.613117, 1.613117, 8.065583], index=5)),
 GridPoint(index: (0, 0, 9), atoms: Atom('Cu', [0.0, 0.0, 9.678700000000001], index=6)),
 GridPoint(index: (1, 1, 11), atoms: Atom('Cu', [1.613117, 1.613117, 11.291817], index=7))]

In [4]:
distance_matrix, atom_numbers = create_distance_matrices(grid)
expanded_distance_matrix = create_expanded_distance_matrix(distance_matrix)
print(len(atom_numbers))
print(expanded_distance_matrix.shape)

8
(8, 7, 4)


In [22]:
def create_tf_dataset(path, grid_size=0.1, batch_size=32, shuffle_buffer_size=1000):
    ENERGY_KEY = "energy_corrected_per_atom"
    FORCE_KEY = "forces"
    
    df = pd.read_pickle(path, compression="gzip")
    df = deserialize_df(df)
    df = filter_out_single_atom_molecules(df)
    
    # create expanded distance matrix for each df entry
    df[["expanded_distance_matrix", "atom_numbers"]] = df["ase_atoms"].apply(
        lambda x: pd.Series(get_expanded_distance_matrix(x, grid_size), index=["expanded_distance_matrix", "atom_numbers"])
    )
    print("finished creating distance matrices")
    
    max_atoms = df["expanded_distance_matrix"].apply(lambda x: x.shape[1]).max()
    df["expanded_distance_matrix"] = df["expanded_distance_matrix"].apply(
        lambda x: np.pad(x, ((0, 0), (0, max_atoms - x.shape[1]), (0, 0)), mode="constant")
    )
    df["atom_numbers"] = df["atom_numbers"].apply(
        lambda x: np.pad(x, ((0, 0), (0, max_atoms - x.shape[1])), mode="constant")
    )
    
    expanded_distance_matrices = np.stack(df["expanded_distance_matrix"])
    atom_numbers = np.stack(df["atom_numbers"])
    energies = np.stack(np.array(df[ENERGY_KEY]))
    forces = np.stack(np.array(df[FORCE_KEY]))
    
    print(expanded_distance_matrices.shape)
    print(atom_numbers.shape)
    print(energies.shape)
    print(forces.shape)
    
    # create tf dataset
    dataset = tf.data.Dataset.from_tensor_slices(
        (
            expanded_distance_matrices,  # shape: (n_samples, max_atoms, max_atoms, 4)
            atom_numbers,  # shape: (n_samples, max_atoms, max_atoms)
            energies,  # shape: (n_samples,)
            forces # shape: (n_samples, max_atoms, 3)
        )
    )
    
    # shuffle and batch
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    
    return dataset

create_tf_dataset("./../Datasets/Cu_df2_1k.pkl.gzip", grid_size=0.1, batch_size=32, shuffle_buffer_size=1000)

finished creating distance matrices


ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (2,2)  and requested shape (3,2)