### Dataloader

In [1]:
import os
import math
import numpy as np
import mindspore as ms
from mindspore import nn, ops, context
from mindspore.dataset import GeneratorDataset
from mindspore.train.callback import LossMonitor
from mindspore import Tensor
import mindspore.dataset as ds
from mindspore.common.initializer import HeUniform
from mindspore.train import Model
from mindspore import save_checkpoint
import argparse
from datetime import date
import random
import pickle

os.CUDA_VISIBLE_DEVICES = '6'



In [2]:
import math
import pickle
import numpy as np
import mindspore as ms
import mindspore.numpy as mnp
from mindspore import Tensor

def get_iso_permuted_dataset(picklefile, **atm_iso):
    dataset = []

    for key, value in atm_iso.items():
        if key == 'h_iso':
            h_data = Tensor(np.loadtxt(value, skiprows=2, usecols=1), dtype=ms.float32)
        elif key == 'c_iso':
            c_data = Tensor(np.loadtxt(value, skiprows=2, usecols=1), dtype=ms.float32)
        elif key == 'n_iso':
            n_data = Tensor(np.loadtxt(value, skiprows=2, usecols=1), dtype=ms.float32)
        elif key == 'o_iso':
            o_data = Tensor(np.loadtxt(value, skiprows=2, usecols=1), dtype=ms.float32)
        elif key == 'p_iso':
            p_data = Tensor(np.loadtxt(value, skiprows=2, usecols=1), dtype=ms.float32)
        else:
            raise ValueError("Isolated atom type not found. Use kwargs \"h_iso\", \"c_iso\", etc.")

    with open(picklefile, "rb") as f:
        molecules = pickle.load(f)

    cnt = 0
    for molecule in molecules:
        # Load data
        # pos = Tensor(molecule['pos'], dtype=ms.float32)
        # z = Tensor(molecule['type'].unsqueeze(1), dtype=ms.float32)
        # x = Tensor(molecule['onehot'], dtype=ms.float32)
        # c = Tensor(molecule['coefficients'], dtype=ms.float32)
        # n = Tensor(molecule['norms'], dtype=ms.float32)
        # exp = Tensor(molecule['exponents'], dtype=ms.float32)
        # full_c = copy.deepcopy(c)
        # iso_c = Tensor(np.zeros_like(c.asnumpy()), dtype=ms.float32)
        # Load from numpy arrays
        pos = Tensor(molecule['pos'], dtype=ms.float32)
        z = Tensor(np.expand_dims(molecule['type'], axis=1), dtype=ms.float32)
        x = Tensor(molecule['onehot'], dtype=ms.float32)
        c = Tensor(molecule['coefficients'], dtype=ms.float32)
        n = Tensor(molecule['norms'], dtype=ms.float32)
        exp = Tensor(molecule['exponents'], dtype=ms.float32)
        full_c = ops.deepcopy(c)
        iso_c = Tensor(np.zeros_like(c.asnumpy()), dtype=ms.float32)

        # Subtract the isolated atoms
        for atom, iso, typ in zip(c, iso_c, z):
            typ_value = typ.asnumpy().item()
            if typ_value == 1.0:
                atom[:h_data.shape[0]] -= h_data
                iso[:h_data.shape[0]] += h_data
            elif typ_value == 6.0:
                atom[:c_data.shape[0]] -= c_data
                iso[:c_data.shape[0]] += c_data
            elif typ_value == 7.0:
                atom[:n_data.shape[0]] -= n_data
                iso[:n_data.shape[0]] += n_data
            elif typ_value == 8.0:
                atom[:o_data.shape[0]] -= o_data
                iso[:o_data.shape[0]] += o_data
            elif typ_value == 15.0:
                atom[:p_data.shape[0]] -= p_data
                iso[:p_data.shape[0]] += p_data
            else:
                raise ValueError("Isolated atom type not supported!")

        pop = mnp.where(n != 0, c * 2 * math.sqrt(2) / n, n)

        # Permute positions, yzx -> xyz
        # p_pos = copy.deepcopy(pos)
        p_pos = ops.deepcopy(pos)
        p_pos[:, 0] = pos[:, 1]
        p_pos[:, 1] = pos[:, 2]
        p_pos[:, 2] = pos[:, 0]

        # Create dataset dictionary
        data_dict = {
            'pos': p_pos,
            'pos_orig': pos,
            'z': z,
            'x': x,
            'y': pop,
            'c': c,
            'full_c': full_c,
            'iso_c': iso_c,
            'exp': exp,
            'norm': n
        }

        dataset.append(data_dict)
        cnt += 1

    print(f"Loaded {cnt} molecules from {picklefile}")
    print(f"Loaded {len(dataset)} molecules from {picklefile}")

    return dataset

In [16]:
args = argparse.Namespace()
args.dataset = "./data/water_density_dataset_np_test.pkl" 
args.testset = "./data/water_density_testset_np_test.pkl"
args.split   = 20
args.epochs  = 500
args.qm      = "pbe0"
args.ldep    = False

if args.qm == 'ccsd':
    hhh = "./data/ccsd_h_s_only_def2-universal-jfit-decontract_density.out"
    ooo = "./data/ccsd_o_s_only_def2-universal-jfit-decontract_density.out"
else:
    hhh = "./data/h_s_only_def2-universal-jfit-decontract_density.out"
    ooo = "./data/o_s_only_def2-universal-jfit-decontract_density.out"

test_dataset = args.testset
num_epochs = args.epochs
ldep_bool = args.ldep

Rs = [(12, 0), (5, 1), (4, 2), (2, 3), (1, 4)]

test_dataset = get_iso_permuted_dataset(args.testset, o_iso=ooo, h_iso=hhh)

split = args.split
data_file = args.dataset
lr = 1e-2
density_spacing = 0.1
save_interval = 5
model_kwargs = {
    "irreps_in": 2,
    "irreps_hidden": [(mul, (l, p)) for l, mul in enumerate([125, 40, 25, 15]) for p in [-1, 1]],
    "irreps_out": 12,
    "irreps_node_attr": None,
    "irreps_edge_attr": 3,
    "layers": 3,
    "max_radius": 3.5,
    "number_of_basis": 10,
    "radial_layers": 1,
    "radial_neurons": 128,
    "num_neighbors": 12.2298,
    "num_nodes": 24,
    "reduce_output": False,
}

dataset = get_iso_permuted_dataset(data_file, o_iso=ooo, h_iso=hhh)
random.shuffle(dataset)
if split > len(dataset):
    raise ValueError('Split is too large for the dataset.')

b = 1

def data_generator(data):
    for d in data:
        yield d

train_loader = GeneratorDataset(lambda: data_generator(dataset[:split]), ["train"], shuffle=True)
test_loader = GeneratorDataset(lambda: data_generator(test_dataset), ["test"], shuffle=True)

train_loader = train_loader.batch(b)
test_loader = test_loader.batch(b)

Loaded 50 molecules from ./data/water_density_testset_np_test.pkl
Loaded 50 molecules from ./data/water_density_testset_np_test.pkl
Loaded 50 molecules from ./data/water_density_dataset_np_test.pkl
Loaded 50 molecules from ./data/water_density_dataset_np_test.pkl


In [17]:
dataset_size = train_loader.get_dataset_size()
print("Train dataset size: ", dataset_size)
print(train_loader)

Train dataset size:  20
<mindspore.dataset.engine.datasets.BatchDataset object at 0x7fd018ce2790>


In [18]:
len(dataset[:split])

20

In [19]:
for batch_idx, data in enumerate(train_loader.create_dict_iterator()):
    print("Train")
for data in train_loader:
    print("Train")

Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train
Train


### .pkl data

In [1]:
import pickle
import numpy as np

In [2]:
path = './data/water_density_testset.pkl'
dataset = []

with open(path, 'rb') as f:
    molecules = pickle.load(f)

cnt = 0
for data in molecules:
    cnt += 1
    if cnt > 50:
        break
    print(data.keys())
    # dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
    # transform data to numpy array and save in a new .pkl file
    data['type'] = np.array(data['type'])
    data['pos'] = np.array(data['pos'])
    data['onehot'] = np.array(data['onehot'])
    data['coefficients'] = np.array(data['coefficients'])
    data['exponents'] = np.array(data['exponents'])
    data['norms'] = np.array(data['norms'])
    data['rs_max'] = np.array(data['rs_max'])
    data['energy'] = np.array(data['energy'])
    data['forces'] = np.array(data['forces'])
    dataset.append(data)

with open('./data/water_density_testset_np_test.pkl', 'wb') as f:
    pickle.dump(dataset, f)

dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'ex

In [3]:
path = './data/water_density_dataset.pkl'
dataset = []

with open(path, 'rb') as f:
    molecules = pickle.load(f)

cnt = 0
for data in molecules:
    cnt += 1
    if cnt > 50:
        break
    print(data.keys())
    # dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
    # transform data to numpy array and save in a new .pkl file
    data['type'] = np.array(data['type'])
    data['pos'] = np.array(data['pos'])
    data['onehot'] = np.array(data['onehot'])
    data['coefficients'] = np.array(data['coefficients'])
    data['exponents'] = np.array(data['exponents'])
    data['norms'] = np.array(data['norms'])
    data['rs_max'] = np.array(data['rs_max'])
    data['energy'] = np.array(data['energy'])
    data['forces'] = np.array(data['forces'])
    dataset.append(data)

with open('./data/water_density_dataset_np_test.pkl', 'wb') as f:
    pickle.dump(dataset, f)

dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'exponents', 'norms', 'rs_max', 'energy', 'forces'])
dict_keys(['type', 'pos', 'onehot', 'coefficients', 'ex