In [4]:
import torch
import json
import os

from libraries.dataset    import standardize_dataset
from libraries.graph      import graph_POSCAR_encoding
from torch_geometric.data import Data

# Checking if pytorch can run in GPU, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# In case database is created from scratch (otherwise, it is not being used)
data_path = '../MP/MOSES_dataset.txt'

# Define folder in which all data will be stored
data_folder = f'data/MOSES_dataset'

# Define name for storing dataset basic description
dataset_parameters_name = f'{data_folder}/dataset_parameters.json'

encoding_type = 'molecule'  # 'voronoi' or 'sphere-images'

# Define basic dataset parameters for tracking data
dataset_parameters = {
    'input_folder': data_path,
    'output_folder': data_folder,
    'encoding_type': encoding_type
}

if not os.path.exists(data_folder):
    os.system(f'mkdir {data_folder}')

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name, 'w') as json_file:
    json.dump(dataset_parameters, json_file)

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [None]:
# Generate the raw dataset from scratch, and standardize it

# Read all molecules within the database
with open(data_path, 'r') as file:
    lines = file.readlines()

total_structures  = 0
loaded_structures = 0

dataset = []
labels  = []
for line in lines:
    total_structures += 1
    
    # Define molecule appart from train-test splitting
    molecule = line.split(',')[0]
    #print()
    #print(molecule)
    try:
        nodes, edges, attributes = graph_POSCAR_encoding(molecule,
                                                         encoding_type=encoding_type)
    except:
        print(f'Error: {molecule} not loaded')
        continue
    
    # Construct temporal graph structure
    graph = Data(x=nodes,
                 edge_index=edges.t().contiguous(),
                 edge_attr=attributes.ravel()
                )

    # Append to dataset and labels
    dataset.append(graph)
    labels.append(molecule)
    
    loaded_structures += 1
total_structures, loaded_structures

[17:32:36] SMILES Parse Error: syntax error while parsing: SMILES
[17:32:36] SMILES Parse Error: Failed parsing SMILES 'SMILES' for input: 'SMILES'
  edges      = torch.tensor(edges,      dtype=torch.long)


Error: SMILES not loaded


In [None]:
# Standardize dataset
dataset_std, dataset_parameters = standardize_dataset(dataset)

# Save dataset

In [None]:
labels_name                 = f'{data_folder}/labels.pt'
dataset_name_std            = f'{data_folder}/dataset.pt'
dataset_parameters_name_std = f'{data_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

torch.save(labels,  labels_name)
torch.save(dataset, dataset_name_std)

# Convert torch tensors to numpy arrays
numpy_dict = {key: value.cpu().numpy().tolist() for key, value in dataset_parameters.items()}

# Dump the dictionary with numpy arrays to a JSON file
with open(dataset_parameters_name_std, 'w') as json_file:
    json.dump(numpy_dict, json_file)