In [1]:
import json
import os
import sys

import numpy as np
import torch

from pymatgen.core.structure  import Structure
from pymatgen.io.vasp.outputs import Vasprun
from torch.utils.data         import random_split
from torch_geometric.data     import Data

from libraries.dataset        import standardize_dataset
from libraries.graph          import graph_POSCAR_encoding

# ML_library lives outside this project, so its folder must be on sys.path before importing it
sys.path.append('../../UCL/m3gnet')
import ML_library as MLL

# Select the compute device: the GPU when CUDA is available, the CPU otherwise
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device      = torch.device(device_name)

In [2]:
# --- Input / output locations ---

# Source database; only read when the dataset is generated from scratch
DB_path = '../MP/Loaded_PT'

# Folder in which all generated data is stored
data_folder = 'data/GM_BiSI'

# --- Train-test split options ---

# Fraction of the graphs assigned to the test set
test_ratio = 0.0

# Whether the train-test split attends to the labels or not
check_labels = False

# Generation of graph database for training

Load the datasets, already standardized if possible.

# Generate the raw dataset from scratch, and standardize it

# Build the graph dataset from the POSCAR database at DB_path.
# Expected layout (from the paths used below): DB_path/<material>/<polymorph>/{POSCAR, EPA}
# Each valid polymorph becomes one graph whose target `y` is the value stored in its EPA file.

# Read all materials within the database
materials = os.listdir(DB_path)

dataset = []
labels  = []
for material in materials:
    try:
        # Try to read the polymorphs
        polymorphs = os.listdir(f'{DB_path}/{material}')
    except OSError:
        # Not a readable folder (e.g., a stray file inside the database): skip it
        continue

    print(material)
    for polymorph in polymorphs:
        # Path to folder containing the POSCAR
        path_to_POSCAR = f'{DB_path}/{material}/{polymorph}'

        # Check that the folder is valid
        if not os.path.exists(f'{path_to_POSCAR}/POSCAR'):
            continue

        print(f'\t{polymorph}')

        # Load the structure and encode it as a graph.
        # The encoder is given a pymatgen structure, as in the vasprun-based loader of this notebook.
        try:
            structure = Structure.from_file(f'{path_to_POSCAR}/POSCAR')
            nodes, edges, attributes = graph_POSCAR_encoding(structure,
                                                             encoding_type='sphere-images')
        except Exception as error:
            # Best effort: report which entry failed (and why) and keep going
            print(f'Error: {material} {polymorph} not loaded ({error})')
            continue

        # Load ground state energy per atom
        gs_energy = float(np.loadtxt(f'{path_to_POSCAR}/EPA'))

        # Construct temporal graph structure
        graph = Data(x=nodes,
                     edge_index=edges.t().contiguous(),
                     edge_attr=attributes,
                     y=torch.tensor([gs_energy], dtype=torch.float)
                    )

        # Append to dataset and labels
        dataset.append(graph)
        labels.append(f'{material}-{polymorph}')

# Standardize dataset
dataset, dataset_parameters = standardize_dataset(dataset)

# Output files; the folder is created if it does not exist yet
os.makedirs(data_folder, exist_ok=True)
# NOTE(review): no definition of dataset_name_std is visible in this notebook;
# 'dataset.pt' is an assumed file name -- adjust it if a loader expects another one
dataset_name_std            = f'{data_folder}/dataset.pt'
labels_name                 = f'{data_folder}/labels.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'

# Save standardized dataset
torch.save(dataset, dataset_name_std)
torch.save(labels,  labels_name)

# Convert the standardization parameters to plain lists so they are JSON-serializable
# (assumes every value is a torch tensor -- TODO confirm against standardize_dataset)
numpy_dict = {key: value.cpu().numpy().tolist() for key, value in dataset_parameters.items()}

# Dump the dictionary to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)

In [None]:
# Build the graph dataset from VASP relaxations stored under data_path.
# Layout walked below: data_path/<material>/<relaxation_step>/<defect_state>[/<deformation>]/vasprun.xml
# Every ionic step of every readable vasprun.xml becomes one graph whose target `y`
# is the 'e_fr_energy' value of that ionic step.

dataset = []
labels  = []

### Loaded database ###

data_path = 'Loaded_BiSI/gamma'

# Minimum number of graphs to gather. Full passes over data_path are repeated
# (re-appending the same graphs) until this size is reached.
# NOTE(review): presumably deliberate oversampling of a small database -- confirm
min_graphs = 200

# Iterate over materials and relaxations in the dataset
while len(dataset) < min_graphs:
    # Size before this pass, used to detect a pass that adds nothing
    n_graphs_before_pass = len(dataset)

    for material in os.listdir(data_path):
        # Define path to material
        path_to_material = f'{data_path}/{material}'

        # Check if it is a folder
        if not os.path.isdir(path_to_material):
            continue

        print()
        print(material)

        # Get relaxations steps (rel1, rel2...)
        relaxation_steps = os.listdir(path_to_material)

        # Determine all defect states across every folder
        defect_states = []
        for relaxation_step in relaxation_steps:
            path_to_relaxation_step = f'{path_to_material}/{relaxation_step}'
            if os.path.isdir(path_to_relaxation_step):
                for defect_state in os.listdir(path_to_relaxation_step):
                    if os.path.isdir(f'{path_to_relaxation_step}/{defect_state}'):
                        defect_states.append(defect_state)

        # Determine unique defect states across every folder
        unique_defect_states = np.unique(defect_states)

        # Run over all defect states
        for defect_state in unique_defect_states:
            print(f'\t{defect_state}')

            # Run over all relaxation steps
            for relaxation_step in relaxation_steps:
                # Define path to relaxation loading every relaxation step
                # of a same defect state in the same data column
                path_to_deformation = f'{path_to_material}/{relaxation_step}/{defect_state}'

                # Avoiding non-directories (such as .DS_Store)
                if not os.path.isdir(path_to_deformation):
                    continue

                # Define name for the defect state folder
                temp_relaxation = f'{material}_{defect_state}'

                # Check if it is a valid relaxation (with a vasprun.xml file)
                # If not, it might be that there are different deformation folders of the defect state
                if MLL.is_relaxation_folder_valid(path_to_deformation):
                    path_to_relaxations = [path_to_deformation]
                else:
                    # Try to extract deformation folders
                    deformation_folders = os.listdir(path_to_deformation)

                    # Run over deformations
                    path_to_relaxations = []
                    for deformation_folder in deformation_folders:
                        path_to_relaxation = f'{path_to_deformation}/{deformation_folder}'
                        if MLL.is_relaxation_folder_valid(path_to_relaxation):
                            path_to_relaxations.append(path_to_relaxation)

                # Gather relaxations from different deformations as different ionic steps
                for path_to_relaxation in path_to_relaxations:
                    # Remove invalid characters from the vasprun.xml file
                    # (comment this call out if your files do not need cleaning)
                    MLL.clean_vasprun(path_to_relaxation)

                    path_to_vasprun = f'{path_to_relaxation}/vasprun.xml'
                    if not os.path.exists(path_to_vasprun):
                        # Nothing to parse here: report it and move on
                        print(f'Check {path_to_relaxation}')
                        continue

                    # Load data from relaxation
                    try:
                        # Try to load those unfinished relaxations as well
                        vasprun = Vasprun(path_to_vasprun, exception_on_bad_xml=False)
                    except Exception as error:
                        # Best effort: report which file failed (and why) and keep going
                        print(f'Error: vasprun not correctly loaded ({path_to_relaxation}): {error}')
                        continue

                    # Run over ionic steps
                    for ionic_step_idx, ionic_step in enumerate(vasprun.ionic_steps):
                        temp_ionic_step = f'{temp_relaxation}_{ionic_step_idx}'

                        # Extract data from each ionic step
                        temp_structure = ionic_step['structure']
                        temp_energy    = ionic_step['e_fr_energy']

                        nodes, edges, attributes = graph_POSCAR_encoding(temp_structure,
                                                                         encoding_type='sphere-images')

                        # Construct temporal graph structure
                        graph = Data(x=nodes,
                                     edge_index=edges.t().contiguous(),
                                     edge_attr=attributes,
                                     y=torch.tensor([temp_energy], dtype=torch.float)
                                    )

                        # Append to dataset and labels (label: material, defect state and ionic step)
                        dataset.append(graph)
                        labels.append(temp_ionic_step)

    # A pass that adds no graph can never reach min_graphs: stop instead of looping forever
    if len(dataset) == n_graphs_before_pass:
        print(f'Warning: no graphs could be loaded from {data_path}; stopping with {len(dataset)} graphs.')
        break


BiSBr

BiSeBr

BiSeI

BiSI
	as_1_Bi_on_S_-1
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
	as_1_Bi_on_S_-2
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
Error: vasprun not correctly loaded.
	as_1_Bi_on_S_0


In [None]:
# Standardize the graphs; the second returned item holds the parameters
# that are later written to disk for rescaling the predictions.
# NOTE(review): whether standardize_dataset modifies `dataset` in place is not visible here
standardization_output = standardize_dataset(dataset)
dataset_std, dataset_parameters = standardization_output

# Definition of train-test datasets

In [None]:
# Seed of the train-test split, so that the same split is obtained on every run
split_seed = 12345

# The test fraction must be a valid ratio, otherwise the sizes below are meaningless
if not 0.0 <= test_ratio <= 1.0:
    raise ValueError(f'test_ratio must be within [0, 1], got {test_ratio}')

# Define the sizes of the train and test sets
test_size  = int(test_ratio * len(dataset_std))
train_size = len(dataset_std) - test_size

# Use random_split() to generate train and test sets.
# A dedicated generator makes the split reproducible without touching the global torch RNG.
split_generator = torch.Generator().manual_seed(split_seed)
train_dataset, test_dataset = random_split(dataset_std, [train_size, test_size],
                                           generator=split_generator)

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of testing  graphs: {len(test_dataset)}')

# Save datasets

In [None]:
# Write the labels, the train/test datasets and the standardization parameters to data_folder

labels_name                 = f'{data_folder}/labels.pt'
train_dataset_name_std      = f'{data_folder}/train_dataset.pt'
test_dataset_name_std       = f'{data_folder}/test_dataset.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

# Make sure the output folder exists before writing into it
os.makedirs(data_folder, exist_ok=True)

# The labels file stores the labels themselves (they follow the order of `dataset`,
# i.e. the order before the random train-test split)
torch.save(labels,        labels_name)
torch.save(train_dataset, train_dataset_name_std)
torch.save(test_dataset,  test_dataset_name_std)

# Convert the standardization parameters to plain lists so they are JSON-serializable
# (assumes every value is a torch tensor -- TODO confirm against standardize_dataset)
numpy_dict = {key: value.cpu().numpy().tolist() for key, value in dataset_parameters.items()}

# Dump the dictionary to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)