In [1]:
import numpy as np
import torch
import json
import os

from libraries.dataset        import standardize_dataset
from libraries.graph          import graph_POSCAR_encoding
from libraries.structure      import compute_diffraction_pattern
from torch_geometric.data     import Data
from pymatgen.core            import Structure, Molecule, Lattice

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Folder with the raw database; only read when the dataset is generated from scratch
# NOTE(review): absolute, machine-specific path -- adapt it before running elsewhere
data_path = '/Users/cibran/Desktop/QM9/general'

# Type of data in folder
data_type = 'xyz'  # 'POSCAR' or 'xyz'

# Graph-level targets extracted for every structure. The generation cell handles
# 'EPA', 'bandgap', 'neutron' and 'xrd' explicitly; any other name is looked up
# among the properties parsed from the second line of each xyz file
targets = ['H', 'Cv', 'gap']

# Define folder in which all data will be stored
data_folder = 'data/QM9-all-linked'

encoding_type      = 'all-linked'  # 'voronoi', 'all-linked' or 'sphere-images'
distance_threshold = None  # 6, used in general
periodicity        = None  # Better False for molecules, always True for crystals

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder,
    'target': targets,
    'encoding_type': encoding_type,
    'distance_threshold': distance_threshold
}

# Create the output folder together with any missing parent folder; this is a
# no-op when it already exists and does not depend on a shell being available
os.makedirs(data_folder, exist_ok=True)

# Dump the tracking parameters to a JSON file next to the dataset
with open(f'{data_folder}/dataset_parameters.json', 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [3]:
# Generate the raw dataset from scratch: one graph per readable structure.
# Reads `data_path`, `data_type`, `targets`, `encoding_type`, `distance_threshold`
# and `periodicity` from the configuration cell and fills the `dataset` list.

# Names given, in order, to the whitespace-separated fields of the second line
# of an xyz file (loop-invariant, so it is defined once)
properties_tags = ['tag', 'index', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

# Read all materials within the database
materials = os.listdir(data_path)

dataset = []
for material in materials:
    try:
        # Try to read the polymorphs
        polymorphs = os.listdir(f'{data_path}/{material}')
    except OSError:
        # Not a readable folder (e.g. a stray file in the database): skip it
        continue

    print(material)
    for polymorf in polymorphs:
        # Path to the folder containing the POSCAR (or to the xyz file itself)
        path_to_POSCAR = f'{data_path}/{material}/{polymorf}'

        # Check that the entry is valid
        if not os.path.exists(path_to_POSCAR):
            continue
        print(f'\t{polymorf}')

        # Reset for every entry so that properties never leak from the
        # previously processed file
        graph_level_data = {}

        try:
            if data_type == 'POSCAR':
                # Load pymatgen structure object
                structure = Structure.from_file(f'{path_to_POSCAR}/POSCAR')
            elif data_type == 'xyz':
                molecule = Molecule.from_file(path_to_POSCAR)

                # Define the molecule within a [100, 100, 100] POSCAR lattice
                structure = Structure(Lattice.cubic(100), molecule.species, molecule.cart_coords, coords_are_cartesian=True)

                # The second line of the file carries the graph-level properties
                with open(path_to_POSCAR, 'r') as file:
                    file.readline()
                    properties_values = file.readline().split()

                # Indexing (rather than zip) makes a truncated line fail here,
                # so that the entry is reported as not loaded
                graph_level_data = {tag: properties_values[i] for i, tag in enumerate(properties_tags)}
            else:
                # Without this, a structure from a previous iteration would be reused
                raise ValueError(f'Unknown data_type: {data_type}')

            nodes, edges, attributes = graph_POSCAR_encoding(structure,
                                                             encoding_type=encoding_type,
                                                             distance_threshold=distance_threshold,
                                                             periodicity=periodicity)
        except Exception as error:
            # Best-effort loading: report the entry together with the cause and go on.
            # KeyboardInterrupt is deliberately not caught, so the run can be stopped
            print(f'Error: {material} {polymorf} not loaded')
            print(f'\t\t{type(error).__name__}: {error}')
            continue

        extracted_target = []
        for target in targets:
            if target == 'EPA':
                # Load ground state energy per atom
                extracted_target.append(float(np.loadtxt(f'{path_to_POSCAR}/EPA')))
            elif target == 'bandgap':
                # Load band-gap
                extracted_target.append(float(np.loadtxt(f'{path_to_POSCAR}/bandgap')))
            elif (target == 'neutron') or (target == 'xrd'):
                # Compute diffraction pattern from given structure
                extracted_target.append(compute_diffraction_pattern(structure, diffraction=target))
            elif target is None:
                # Do not extract anything
                extracted_target.append(0)
            else:
                # Any other target must be one of the properties read from the file;
                # a missing one raises KeyError instead of reusing stale data
                extracted_target.append(float(graph_level_data[target]))

        # Construct the graph for this structure
        graph = Data(x=nodes,
                     edge_index=edges.t().contiguous(),
                     edge_attr=attributes.ravel(),
                     y=torch.tensor(extracted_target, dtype=torch.float),
                     label=f'{material} {polymorf}'
                    )

        # Append to dataset
        dataset.append(graph)

dsgdb9nsd.xyz
	dsgdb9nsd_035176.xyz
	dsgdb9nsd_131462.xyz
	dsgdb9nsd_073123.xyz
	dsgdb9nsd_096811.xyz
	dsgdb9nsd_100867.xyz
	dsgdb9nsd_063330.xyz
	dsgdb9nsd_089931.xyz
	dsgdb9nsd_121671.xyz
	dsgdb9nsd_025365.xyz
	dsgdb9nsd_065999.xyz
	dsgdb9nsd_032619.xyz
	dsgdb9nsd_044957.xyz
	dsgdb9nsd_055682.xyz
	dsgdb9nsd_002902.xyz
	dsgdb9nsd_033507.xyz
	dsgdb9nsd_075552.xyz
	dsgdb9nsd_045491.xyz
	dsgdb9nsd_034268.xyz
	dsgdb9nsd_128133.xyz
	dsgdb9nsd_065741.xyz
	dsgdb9nsd_074894.xyz
	dsgdb9nsd_127200.xyz
	dsgdb9nsd_023714.xyz
	dsgdb9nsd_124709.xyz
	dsgdb9nsd_038452.xyz
	dsgdb9nsd_066248.xyz
	dsgdb9nsd_009857.xyz
	dsgdb9nsd_018782.xyz
	dsgdb9nsd_037761.xyz
	dsgdb9nsd_046198.xyz
	dsgdb9nsd_133275.xyz
	dsgdb9nsd_071734.xyz
	dsgdb9nsd_028641.xyz
	dsgdb9nsd_039994.xyz
	dsgdb9nsd_016977.xyz
	dsgdb9nsd_050922.xyz
	dsgdb9nsd_061527.xyz
	dsgdb9nsd_123066.xyz
	dsgdb9nsd_079368.xyz
	dsgdb9nsd_008591.xyz
	dsgdb9nsd_093969.xyz
	dsgdb9nsd_027572.xyz
	dsgdb9nsd_031310.xyz
	dsgdb9nsd_077345.xyz
	dsgdb9nsd_057095.

  with zopen(filename) as file:


	dsgdb9nsd_077379.xyz
	dsgdb9nsd_006580.xyz
	dsgdb9nsd_029563.xyz
	dsgdb9nsd_070416.xyz
	dsgdb9nsd_132157.xyz
	dsgdb9nsd_036443.xyz
	dsgdb9nsd_082858.xyz
	dsgdb9nsd_068259.xyz
	dsgdb9nsd_016793.xyz
	dsgdb9nsd_039770.xyz
	dsgdb9nsd_048189.xyz
	dsgdb9nsd_007846.xyz
	dsgdb9nsd_041813.xyz
	dsgdb9nsd_112287.xyz
	dsgdb9nsd_022436.xyz
	dsgdb9nsd_126122.xyz
	dsgdb9nsd_064463.xyz
	dsgdb9nsd_129211.xyz
	dsgdb9nsd_090284.xyz
	dsgdb9nsd_055866.xyz
	dsgdb9nsd_013833.xyz
	dsgdb9nsd_005089.xyz
	dsgdb9nsd_074670.xyz
	dsgdb9nsd_032625.xyz
	dsgdb9nsd_130998.xyz
	dsgdb9nsd_025359.xyz
	dsgdb9nsd_080097.xyz
	dsgdb9nsd_100683.xyz
	dsgdb9nsd_111956.xyz
	dsgdb9nsd_087920.xyz
	dsgdb9nsd_004397.xyz
	dsgdb9nsd_081389.xyz
	dsgdb9nsd_024047.xyz
	dsgdb9nsd_120553.xyz
	dsgdb9nsd_062012.xyz
	dsgdb9nsd_014184.xyz
	dsgdb9nsd_023728.xyz
	dsgdb9nsd_110490.xyz
	dsgdb9nsd_098800.xyz
	dsgdb9nsd_072201.xyz
	dsgdb9nsd_121895.xyz
	dsgdb9nsd_130740.xyz
	dsgdb9nsd_034254.xyz
	dsgdb9nsd_005937.xyz
	dsgdb9nsd_043962.xyz
	dsgdb9nsd

In [4]:
# Standardize the dataset; the call returns the standardized graphs together
# with the parameters describing the transformation
dataset_std, dataset_parameters = standardize_dataset(
    dataset,
    transformation='inverse-quadratic',
)

# Save dataset

In [5]:
# Output files: raw graphs, standardized graphs and standardization parameters
dataset_name                = f'{data_folder}/dataset.pt'
dataset_name_std            = f'{data_folder}/standardized_dataset.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

torch.save(dataset,     dataset_name)
torch.save(dataset_std, dataset_name_std)

# Convert every parameter to a JSON-serializable value. Explicit type checks
# replace the former bare `except`, which silently kept unconvertible objects
# and only failed later inside json.dump
numpy_dict = {}
for key, value in dataset_parameters.items():
    if isinstance(value, torch.Tensor):
        # detach() allows tensors that require grad; cpu() handles GPU tensors
        numpy_dict[key] = value.detach().cpu().tolist()
    elif isinstance(value, (np.ndarray, np.generic)):
        # Numpy arrays and scalars are not JSON-serializable as they are
        numpy_dict[key] = value.tolist()
    else:
        numpy_dict[key] = value

# Dump the converted dictionary to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)