In [1]:
import numpy as np
import torch
import json
import os

from libraries.dataset        import standardize_dataset, check_extend_POSCAR
from libraries.graph          import graph_POSCAR_encoding
from libraries.structure      import compute_diffraction_pattern
from torch_geometric.data     import Data
from pymatgen.core            import Structure

# Checking if pytorch can run in GPU, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# In case database is created from scratch (otherwise, it is not being used)
data_path = '../MP/Loaded_PT'

# Define diffraction type ('neutron', 'xrd' or 'EPA')
target = 'EPA'

# Define folder in which all data will be stored
data_folder = f'data/GM_PT_{target}'

# Define name for storing dataset basic description
dataset_parameters_name = f'{data_folder}/dataset_parameters.json'

encoding_type      = 'voronoi'  # 'voronoi' or 'sphere-images'
distance_threshold = 6  # Used in general

minimum_lattice_vector = 3 * distance_threshold  # Allowing three convolutions

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder,
    'target': target,
    'encoding_type': encoding_type,
    'distance_threshold': distance_threshold,
    'minimum_lattice_vector': minimum_lattice_vector
}

if not os.path.exists(data_folder):
    os.system(f'mkdir {data_folder}')

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name, 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [3]:
# Generate the raw dataset from scratch, and standardize it

# Read all materials within the database
materials = os.listdir(data_path)

dataset = []
labels  = []
for material in materials[:100]:
    try:
        # Try to read the polymorphs
        polymorphs = os.listdir(f'{data_path}/{material}')
    except:
        continue
    
    print(material)
    for polymorf in polymorphs:
        # Path to folder containing the POSCAR
        path_to_POSCAR = f'{data_path}/{material}/{polymorf}'
        
        # Check that the folder is valid
        if os.path.exists(path_to_POSCAR):
            print(f'\t{polymorf}')
            
            try:
                # Load pymatgen structure object
                structure = Structure.from_file(f'{path_to_POSCAR}/POSCAR')
                
                # Check that POSCAR is big enough, otherwise extend it where necessary
                structure = check_extend_POSCAR(structure, minimum_lattice_vector)
                
                nodes, edges, attributes = graph_POSCAR_encoding(structure,
                                                                 encoding_type=encoding_type,
                                                                 distance_threshold=distance_threshold)
            except:
                print(f'Error: {material} {polymorf} not loaded')
                continue
            
            if target == 'EPA':
                # Load ground state energy per atom
                extracted_target = [float(np.loadtxt(f'{path_to_POSCAR}/EPA'))]
            else:
                # Compute diffraction pattern from given structure
                extracted_target = compute_diffraction_pattern(structure, diffraction=target)
            
            # Construct temporal graph structure
            graph = Data(x=nodes,
                         edge_index=edges.t().contiguous(),
                         edge_attr=attributes.ravel(),
                         y=torch.tensor(extracted_target, dtype=torch.float)
                        )

            # Append to dataset and labels
            dataset.append(graph)
            labels.append(f'{material}-{polymorf}')

CeSnRh
	P-62m
LuSi
	Cmcm
	P-6m2
Gd2Ti2O7
	Fd-3m
	C2-m
LiAs3H2O9
	P2_1-c
VAg2HgO4
	I-42d
Y5Be6Fe3(SiO5)6
	P1
CuCN2
	Cmcm
BaSrEuCrO6
	F-43m
Ta2N3
	Pnma
Nd6Al43W4
	P6_3-mcm
CdSO4
	Cmcm
	Pmn2_1
	P3m1
	C2-m
Mg6ZnFe
	Amm2
BaCeEuSbO6
	P-1
	F-43m
	P222
Li2Mn2CoO6
	P-1
	Cmce
Na(CrS2)2
	P-1
	I4_1-amd
	Pmmn
	Fd-3m
	Pca2_1
	Pnma
	P2-m
	C2-m
KCu2BiS3
	P-1
SrMg14Sn
	P-6m2
	Amm2
LiZnInF6
	P321
KCe(PO3)4
	P2_1-c
	P2_1
K2NiAs2
	Cmcm
Nb2SiNi3
	P6_3-mmc
CaCoF4
	C2-c
Ta2Co3Si
	P6_3-mmc
LaEr4Se7
	Cm
NdTiFe11N
	Imm2
YTaP2SO12
	Cc
MgAl3SiBO9
	Pnma
CaZn2(H5O4)2
	P2_1-c
K3CeI6
	Fm-3m
Mn3Ni2P6WO24
	R3
Error: Mn3Ni2P6WO24 R3 not loaded
H6PbC2S2O7
	P-1
Error: H6PbC2S2O7 P-1 not loaded
BaFe4O7
	P6_3mc
Error: BaFe4O7 P6_3mc not loaded
	P6_3-m
Error: BaFe4O7 P6_3-m not loaded
Nb2CoS4
	P-3m1
Error: Nb2CoS4 P-3m1 not loaded
	Pnma
Error: Nb2CoS4 Pnma not loaded
W2Br5
	P2_1-c
Error: W2Br5 P2_1-c not loaded
ZrP
	P6_3-mmc
Error: ZrP P6_3-mmc not loaded
	Fm-3m
Error: ZrP Fm-3m not loaded
BaClF
	P4-nmm
Error: BaClF P4-nmm n

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: NaPbO3 Pm-3m not loaded
KAl4(SiO6)2
	C2-c


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: KAl4(SiO6)2 C2-c not loaded
Y4Cu7O16
	P-1
Error: Y4Cu7O16 P-1 not loaded
Li5Mn5Cr2O12
	C2
Error: Li5Mn5Cr2O12 C2 not loaded
U11Se20
	.DS_Store
Error: U11Se20 .DS_Store not loaded
	P4nc
Error: U11Se20 P4nc not loaded
	P4
Error: U11Se20 P4 not loaded
Yb2MgTi2O6
	R3
Error: Yb2MgTi2O6 R3 not loaded
	R-3
Error: Yb2MgTi2O6 R-3 not loaded
MgTiMn3O8
	Cm
Error: MgTiMn3O8 Cm not loaded
	P1
Error: MgTiMn3O8 P1 not loaded
	C2-m
Error: MgTiMn3O8 C2-m not loaded
Na6Y4Al4Si3(S3O16)3
	P1
Error: Na6Y4Al4Si3(S3O16)3 P1 not loaded
LiNi7O7F
	Pm
Error: LiNi7O7F Pm not loaded
Na4Ga4Si19
	P1
Error: Na4Ga4Si19 P1 not loaded
BaMgFeF7
	P2_1-c
Error: BaMgFeF7 P2_1-c not loaded
Sn4Te3Se
	R3m
Error: Sn4Te3Se R3m not loaded
Sr2Cd2Cu(SO)2
	I4-mmm


  if np.any(pair == str(idx_i)):  # Real for idx_i
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: Sr2Cd2Cu(SO)2 I4-mmm not loaded
Na14Ca4Hf4Si7S5O48
	P1
Error: Na14Ca4Hf4Si7S5O48 P1 not loaded
PrTaTiO6
	Pna2_1
Error: PrTaTiO6 Pna2_1 not loaded
Nd(NiO2)2
	Cmce
Error: Nd(NiO2)2 Cmce not loaded
Mg6ZnCd
	Amm2
Error: Mg6ZnCd Amm2 not loaded
CdHg4C6S6(Br2N3)2
	Fmm2
Error: CdHg4C6S6(Br2N3)2 Fmm2 not loaded
ZnFeCo2
	Immm
Li4Ni5SbO12
	C2-m
Error: Li4Ni5SbO12 C2-m not loaded
Hg(NCl)2
	Cmmm
Error: Hg(NCl)2 Cmmm not loaded
Mn7Al4V5
	R3m
Error: Mn7Al4V5 R3m not loaded
Li3Mn(PO4)2
	R3
Error: Li3Mn(PO4)2 R3 not loaded
	P-1
Error: Li3Mn(PO4)2 P-1 not loaded
	P2_1-m
Error: Li3Mn(PO4)2 P2_1-m not loaded
	P2_1-c
Error: Li3Mn(PO4)2 P2_1-c not loaded
	P2_1
Error: Li3Mn(PO4)2 P2_1 not loaded
	P1
Error: Li3Mn(PO4)2 P1 not loaded
	C2-c
Error: Li3Mn(PO4)2 C2-c not loaded
	C2-m
Error: Li3Mn(PO4)2 C2-m not loaded
ZrHgO3
	Pm-3m
Error: ZrHgO3 Pm-3m not loaded
Gd3As5O12
	I-43m
Error: Gd3As5O12 I-43m not loaded
MnCuO4
	Imma
Error: MnCuO4 Imma not loaded
Ce2Ni3Ge5
	Ibam
Error: Ce2Ni3Ge5 Ibam not loaded
MgC

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: ZnCrO4 C2-m not loaded
Ba3(AlAs2)2
	Pnma
Error: Ba3(AlAs2)2 Pnma not loaded
EuCuAs
	P6_3-mmc
Error: EuCuAs P6_3-mmc not loaded
Na2V2F7
	C2-c


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: Na2V2F7 C2-c not loaded
Li14Nd5Si11N19O7F2
	Pmn2_1
Error: Li14Nd5Si11N19O7F2 Pmn2_1 not loaded
Y3(AlFe2)2
	P-1
Error: Y3(AlFe2)2 P-1 not loaded
Li3MnV4O8
	C2-m
Error: Li3MnV4O8 C2-m not loaded
Na3Sr3GaAs4
	P6_3mc
Error: Na3Sr3GaAs4 P6_3mc not loaded
CoH15N6(ClO)2
	P-1
Error: CoH15N6(ClO)2 P-1 not loaded
	Cc
Error: CoH15N6(ClO)2 Cc not loaded
	Pna2_1
Error: CoH15N6(ClO)2 Pna2_1 not loaded
Mg14FeC
	P-6m2


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: Mg14FeC P-6m2 not loaded
	Amm2
Error: Mg14FeC Amm2 not loaded
Li4Co7O16
	P2_1
Error: Li4Co7O16 P2_1 not loaded
Sr2NiSe2(ClO3)2
	P2_1-c
Error: Sr2NiSe2(ClO3)2 P2_1-c not loaded
TbHfF7
	P2_1
Error: TbHfF7 P2_1 not loaded
Na7Mg2Ga2P5SO24
	P1
Error: Na7Mg2Ga2P5SO24 P1 not loaded
Na2LaInSi(SO6)2
	P1
Error: Na2LaInSi(SO6)2 P1 not loaded
La3Se4Cl
	Pna2_1
Error: La3Se4Cl Pna2_1 not loaded
Fe2CO5
	P2_1-c
Error: Fe2CO5 P2_1-c not loaded
Nd(MnFe5)2
	Immm
Error: Nd(MnFe5)2 Immm not loaded
KMgMn4O8
	P1
Error: KMgMn4O8 P1 not loaded
	C2-m
Error: KMgMn4O8 C2-m not loaded
K2Na2Mo(WO4)3
	Pmm2
Error: K2Na2Mo(WO4)3 Pmm2 not loaded
PrGdAl4
	F-43m
Error: PrGdAl4 F-43m not loaded
Ca(WO2)2
	I4_1-amd
Error: Ca(WO2)2 I4_1-amd not loaded
	Cmcm
Error: Ca(WO2)2 Cmcm not loaded
	Pmmn
Error: Ca(WO2)2 Pmmn not loaded
	Cm
Error: Ca(WO2)2 Cm not loaded
	Fd-3m
Error: Ca(WO2)2 Fd-3m not loaded
	R3m
Error: Ca(WO2)2 R3m not loaded
	Pnma
Error: Ca(WO2)2 Pnma not loaded
	Imma
Error: Ca(WO2)2 Imma not loaded
	P1
Error

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f8a761aaf20>>
Traceback (most recent call last):
  File "/home/claudio/cibran/Work/UPC/GenerativeModels/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Error: Ba4Ta2O9 P-3c1 not loaded
LiCo6(OF5)2
	P-1
Error: LiCo6(OF5)2 P-1 not loaded
	Cm
Error: LiCo6(OF5)2 Cm not loaded
	P1
Error: LiCo6(OF5)2 P1 not loaded
K2CoC4S4N4O3
	P2_12_12
Error: K2CoC4S4N4O3 P2_12_12 not loaded
Ba4W(N2Cl)2
	P2_1-m
Error: Ba4W(N2Cl)2 P2_1-m not loaded
Ba3Li(AsO4)2
	P-3m1
Error: Ba3Li(AsO4)2 P-3m1 not loaded
	C2
Error: Ba3Li(AsO4)2 C2 not loaded
	Cm
Error: Ba3Li(AsO4)2 Cm not loaded
	R3m
Error: Ba3Li(AsO4)2 R3m not loaded
	R-3m
Error: Ba3Li(AsO4)2 R-3m not loaded
Ba2BiSbO6
	P-1
Error: Ba2BiSbO6 P-1 not loaded
	Pn-3
Error: Ba2BiSbO6 Pn-3 not loaded
	R-3
Error: Ba2BiSbO6 R-3 not loaded
	Fm-3m
Error: Ba2BiSbO6 Fm-3m not loaded
	C2-m
Error: Ba2BiSbO6 C2-m not loaded
Cr3B4Mo3
	Amm2
Error: Cr3B4Mo3 Amm2 not loaded
Ba3Cr2MoO9
	P6_3-mmc
Error: Ba3Cr2MoO9 P6_3-mmc not loaded
Li4Ti2Fe3Ni3O16
	Cm
Error: Li4Ti2Fe3Ni3O16 Cm not loaded


In [5]:
# Standardize dataset
dataset_std, dataset_parameters = standardize_dataset(dataset)

# Save dataset

In [6]:
labels_name                 = f'{data_folder}/labels.pt'
dataset_name                = f'{data_folder}/dataset.pt'
dataset_name_std            = f'{data_folder}/standardized_dataset.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

torch.save(labels,      labels_name)
torch.save(dataset,     dataset_name)
torch.save(dataset_std, dataset_name_std)

# Convert torch tensors to numpy arrays
numpy_dict = {key: value.cpu().numpy().tolist() for key, value in dataset_parameters.items()}

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)