In [2]:
import numpy as np
import torch
import json
import os

from libraries.dataset        import standardize_dataset, check_extend_POSCAR
from libraries.graph          import graph_POSCAR_encoding
from libraries.structure      import compute_diffraction_pattern
from torch_geometric.data     import Data
from pymatgen.core            import Structure

# Checking if pytorch can run in GPU, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# In case database is created from scratch (otherwise, it is not being used)
data_path = '../MP/Loaded_PT'

# Define diffraction type ('neutron', 'xrd' or 'EPA')
target = 'EPA'

# Define folder in which all data will be stored
data_folder = f'data/Loaded_PT-sphere-images'

# Define name for storing dataset basic description
dataset_parameters_name = f'{data_folder}/dataset_parameters.json'

encoding_type      = 'sphere-images'  # 'voronoi' or 'sphere-images'
distance_threshold = 6  # Used in general

minimum_lattice_vector = 3 * distance_threshold  # Allowing three convolutions

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder,
    'target': target,
    'encoding_type': encoding_type,
    'distance_threshold': distance_threshold,
    'minimum_lattice_vector': minimum_lattice_vector
}

if not os.path.exists(data_folder):
    os.system(f'mkdir {data_folder}')

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name, 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [4]:
# Generate the raw dataset from scratch, and standardize it

# Read all materials within the database
materials = os.listdir(data_path)

dataset = []
labels  = []
for material in materials[:100]:
    try:
        # Try to read the polymorphs
        polymorphs = os.listdir(f'{data_path}/{material}')
    except:
        continue
    
    print(material)
    for polymorf in polymorphs:
        # Path to folder containing the POSCAR
        path_to_POSCAR = f'{data_path}/{material}/{polymorf}'
        
        # Check that the folder is valid
        if os.path.exists(path_to_POSCAR):
            print(f'\t{polymorf}')
            
            try:
                # Load pymatgen structure object
                structure = Structure.from_file(f'{path_to_POSCAR}/POSCAR')
                
                # Check that POSCAR is big enough, otherwise extend it where necessary
                structure = check_extend_POSCAR(structure, minimum_lattice_vector)
                
                nodes, edges, attributes = graph_POSCAR_encoding(structure,
                                                                 encoding_type=encoding_type,
                                                                 distance_threshold=distance_threshold)
            except:
                print(f'Error: {material} {polymorf} not loaded')
                continue
            
            if target == 'EPA':
                # Load ground state energy per atom
                extracted_target = [float(np.loadtxt(f'{path_to_POSCAR}/EPA'))]
            elif (target == 'neutron') or (target == 'xrd'):
                # Compute diffraction pattern from given structure
                extracted_target = compute_diffraction_pattern(structure, diffraction=target)
            else:
                # Do not extract anything
                extracted_target = [0]
            
            # Construct temporal graph structure
            graph = Data(x=nodes,
                         edge_index=edges.t().contiguous(),
                         edge_attr=attributes.ravel(),
                         y=torch.tensor(extracted_target, dtype=torch.float)
                        )

            # Append to dataset and labels
            dataset.append(graph)
            labels.append(f'{material}-{polymorf}')

Li2Ti2VO6
	Cmce
	P-1
Li4V(TeO4)3
	P1
	P2
ErTe
	Fm-3m
	P4-mmm
Dy2GaCu3
	Cm
Lu2(SO4)3
	R-3c
Ta5N10Cl17
	I4mm
Ta4WC5
	R-3m
Ba2CoAg2(SeO)2
	I4-mmm
V6Ga5
	P6_3-mmc
BaCdHg2
	Immm
Ba3La7Mn(Co3O10)3
	P2-m
CoAs2
	P2_1-c
	Pnnm
Ca7(H6Cl)2
	P-6
Na10Zr4In4Si7S5O48
	P1
TiMn9O20
	C2-m
	P-1
Fe2CoSi
	Fm-3m
	F-43m
Na5Y2In2P5SO24
	P1
VH10SNO9
	P2_1
Na3TbTi2Nb2O12
	Pmc2_1
	P1
Yb(NO3)3
	P2_1-c
Er2Cl2O
	P4-mmm
SrCaNdMnO6
	P-4n2
	F-43m
UN2O11
	P2_1-c
	P-1
Li4TiV5O12
	C2-m
K2Mo(CO4)2
	P2_1-c
GdClO2
	P2_1-m
ErMn2(Fe2Sn3)2
	Cmm2
Er2NiSn6
	Cmmm
Ti2MnCo
	Fm-3m
SrFe5P5O22
	P-1
TiCdCu2
	Immm
Mo(PO4)2
	P2_1-c
	P2_12_12
	P2_12_12_1
AlCu6H18SClO19
	P31c
Li2V5O10
	P1
	P-1
K2BiCl5O2
	Pnma
Cd2P2H10CCl2O11
	P-1
Al4Co(BO5)2
	P2_1-c
Sn6P3NO14
	Pbcn
Ta2Si(PO6)2
	Cc
Nd6CdFe13
	I4-mcm
SrLuCuS3
	Cmcm
K2NaAsF6
	Fm-3m
Ge(WO3)6
	Amm2
Ca8Mo7O20
	I4-m
CaMg14TiO16
	Pmmm
	P4-mmm
AlCu3(SbO3)4
	Im-3
Na3YSi3Ge3(SO8)3
	P1
MgCoN2
	Pna2_1
Li4AlCr3O8
	C2-m
	R-3m
	P2-m
	P-1
Ba2Tb2Ti2Cu2O11
	P4-mmm
SmCo2Ni3
	P6-mmm
H5SeNO4
	C2
	P1
ErCo3B2
	P6-

In [5]:
# Standardize dataset
dataset_std, dataset_parameters = standardize_dataset(dataset)

# Save dataset

In [6]:
labels_name                 = f'{data_folder}/labels.pt'
dataset_name                = f'{data_folder}/dataset.pt'
dataset_name_std            = f'{data_folder}/standardized_dataset.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

torch.save(labels,      labels_name)
torch.save(dataset,     dataset_name)
torch.save(dataset_std, dataset_name_std)

# Convert torch tensors to numpy arrays
numpy_dict = {key: value.cpu().numpy().tolist() for key, value in dataset_parameters.items()}

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)