In [1]:
import numpy as np
import torch
import json
import os

from libraries.dataset        import standardize_dataset, check_extend_POSCAR
from libraries.graph          import graph_POSCAR_encoding
from libraries.structure      import compute_diffraction_pattern
from torch_geometric.data     import Data
from pymatgen.core            import Structure

# Select the compute device: the GPU when CUDA is available, the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Folder with the raw structures; only read when the database is created from scratch
data_path = '../../../Desktop/MOSES_dataset'

# Define the target stored with every graph:
#   'EPA'              -> value read from the EPA file next to each POSCAR
#   'neutron' or 'xrd' -> diffraction pattern computed from the structure
#   anything else      -> placeholder target (note that 'None' is a string here)
target = 'None'

# Define folder in which all data will be stored
data_folder = 'data/GM_molecules'

# Define name for storing dataset basic description
dataset_parameters_name = f'{data_folder}/dataset_parameters.json'

encoding_type      = 'sphere-images'  # 'voronoi' or 'sphere-images'
distance_threshold = 6  # Used in general

# Minimum lattice-vector length handed to check_extend_POSCAR, expressed as a
# multiple of distance_threshold. The factor is currently 0, so the value is 0.
# NOTE(review): the previous comment read 'Allowing three convolutions', which
# presumably referred to a factor of 3 - confirm the intended factor.
minimum_lattice_vector = 0 * distance_threshold

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder,
    'target': target,
    'encoding_type': encoding_type,
    'distance_threshold': distance_threshold,
    'minimum_lattice_vector': minimum_lattice_vector
}

# Create the output folder together with any missing parent folder. Unlike
# shelling out to `mkdir`, this is portable, does not pass the path through a
# shell, and is a no-op when the folder already exists.
os.makedirs(data_folder, exist_ok=True)

# Store the dataset description (plain strings and numbers) as JSON
with open(dataset_parameters_name, 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of graph database for training

Build the dataset of graphs from the raw structure files; it is standardized and saved in the cells that follow.

In [9]:
# Generate the raw dataset of graphs from scratch (standardization is done in a later cell)

# Read all materials within the database
materials = os.listdir(data_path)

dataset = []
labels  = []
# Only the first 100 directory entries are processed.
# NOTE(review): os.listdir returns entries in arbitrary order, so this subset
# may differ between machines - sort the list if reproducibility matters.
for material in materials[:100]:
    try:
        # Try to read the polymorphs
        polymorphs = os.listdir(f'{data_path}/{material}')
    except OSError:
        # The entry cannot be listed as a folder (e.g. a stray file): skip it.
        # Only OSError is caught so that other errors are not silently hidden.
        continue

    print(material)
    for polymorf in polymorphs:
        # Path to folder containing the POSCAR
        path_to_POSCAR = f'{data_path}/{material}/{polymorf}'

        # Check that the folder is valid
        if not os.path.exists(path_to_POSCAR):
            continue

        print(f'\t{polymorf}')

        try:
            # Load pymatgen structure object
            structure = Structure.from_file(f'{path_to_POSCAR}/POSCAR')

            # Check that POSCAR is big enough, otherwise extend it where necessary
            structure = check_extend_POSCAR(structure, minimum_lattice_vector)

            nodes, edges, attributes = graph_POSCAR_encoding(structure,
                                                             encoding_type=encoding_type,
                                                             distance_threshold=distance_threshold)
        except Exception as error:
            # Catch Exception rather than using a bare except, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the loop.
            # The cause is reported so failures can be diagnosed.
            print(f'Error: {material} {polymorf} not loaded')
            print(f'\t\t{type(error).__name__}: {error}')
            continue

        if target == 'EPA':
            # Load ground state energy per atom
            extracted_target = [float(np.loadtxt(f'{path_to_POSCAR}/EPA'))]
        elif target in ('neutron', 'xrd'):
            # Compute diffraction pattern from given structure
            extracted_target = compute_diffraction_pattern(structure, diffraction=target)
        else:
            # Do not extract anything: store a placeholder target
            extracted_target = [0]

        # Build the graph object: edges are transposed and attributes flattened
        graph = Data(x=nodes,
                     edge_index=edges.t().contiguous(),
                     edge_attr=attributes.ravel(),
                     y=torch.tensor(extracted_target, dtype=torch.float)
                    )

        # Append to dataset and labels (kept aligned index by index)
        dataset.append(graph)
        labels.append(f'{material}-{polymorf}')

1086310
	mol
811091
	mol
856756
	mol
871948
	mol
Error: 871948 mol not loaded
15970
	mol
491766
	mol
150171
	mol
39386
	mol
911117
	mol
386730
	mol
647127
	mol
700766
	mol
727978
	mol
338506
	mol
673312
	mol
353837
	mol
Error: 353837 mol not loaded
374629
	mol
238480
	mol
773294
	mol
734553
	mol
233068
	mol
Error: 233068 mol not loaded
809852
	mol
Error: 809852 mol not loaded
164344
	mol
123483
	mol
468297
	mol
1038126
	mol
444861
	mol
862563
	mol
883851
	mol
1074209
	mol
568311
	mol
Error: 568311 mol not loaded
925322
	mol
244906
	mol
263718
	mol
Error: 263718 mol not loaded
764223
	mol
344880
	mol
324159
	mol
648953
	mol
Error: 648953 mol not loaded
368276
	mol
623462
	mol
932395
	mol
975452
	mol
Error: 975452 mol not loaded
994960
	mol
553950
	mol
Error: 553950 mol not loaded
533089
	mol
538461
	mol
1024579
	mol
90107
	mol
832213
	mol
1068656
	mol
134434
	mol
586657
	mol
941667
	mol
966879
	mol
100601
	mol
806026
	mol
1037952
	mol
750016
	mol
Error: 750016 mol not loaded
291601
	mol

In [10]:
# Standardize dataset
# NOTE(review): this rebinds `dataset_parameters`, which the configuration cell
# defined as the dataset-description dictionary. From here on the name refers
# to the second value returned by standardize_dataset (presumably the
# standardization parameters - confirm against libraries.dataset).
dataset_std, dataset_parameters = standardize_dataset(dataset)

# Save dataset

In [11]:
# Labels of the generated graphs
labels_name = f'{data_folder}/labels.pt'
torch.save(labels, labels_name)

# Raw (non-standardized) graphs
dataset_name = f'{data_folder}/dataset.pt'
torch.save(dataset, dataset_name)

# Standardized graphs
dataset_name_std = f'{data_folder}/standardized_dataset.pt'
torch.save(dataset_std, dataset_name_std)

# Parameters for rescaling the predictions
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'

# JSON cannot store tensors: move every value to the CPU and turn it into
# plain Python lists/numbers (assumes every value is a torch tensor - TODO confirm)
numpy_dict = {
    name: tensor.cpu().numpy().tolist()
    for name, tensor in dataset_parameters.items()
}

# Write the converted parameters as JSON
with open(dataset_parameters_name_std, 'w') as json_file:
    json_file.write(json.dumps(numpy_dict))