In [1]:
import numpy             as np
import torch.nn          as nn
import libraries.dataset as gld
import libraries.model   as glm
import libraries.graph   as glg
import torch
import json

from pymatgen.core        import Structure
from scipy.optimize       import minimize
from torch_geometric.data import Batch, Data

# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Starting from random noise, we generate completely new materials
# A target property can be sought with this approach

In [3]:
# Whether the generated graphs are treated as molecules (True) or as crystals (False)
is_molecule = True

# Folder holding the trained model and its companion files, and the path of the model weights
target_folder = 'models/QM9-all-linked/GM_v0'
model_name    = target_folder + '/model.pt'

# How many graphs are generated in a single run
N_predictions = 10

# Raw (not yet standardized) value of the property the generated graphs should target
target_tensor = torch.tensor(0, device=device, dtype=torch.int)

# Load model data

In [4]:
# Read the stored model parameters (JSON) into a dictionary
with open(f'{target_folder}/model_parameters.json', 'r') as json_file:
    numpy_dict = json.load(json_file)

# Convert every JSON value that torch can represent into a tensor on `device`;
# values torch cannot convert (e.g. strings) are kept as plain Python objects
model_parameters = {}
for key, value in numpy_dict.items():
    try:
        model_parameters[key] = torch.tensor(value, device=device)
    except (TypeError, ValueError, RuntimeError):
        # Only conversion failures are tolerated; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed
        model_parameters[key] = value

# Number of diffusing and denoising steps
n_t_steps = model_parameters['n_t_steps']

# NOTE: this overrides whatever 'alpha_decay' was stored in the JSON file with a fixed 0.4
model_parameters['alpha_decay'] = torch.tensor(0.4, device=device)

# Decay of parameter alpha
alpha_decay = model_parameters['alpha_decay']

# Dropouts for node and edge models (independent of each other)
pdropout_node = model_parameters['dropout_node']
pdropout_edge = model_parameters['dropout_edge']

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [5]:
# Paths of the dataset files stored next to the model
dataset_name                = f'{target_folder}/dataset.pt'
labels_name                 = f'{target_folder}/standardized_labels.pt'
dataset_name_std            = f'{target_folder}/standardized_dataset.pt'
dataset_parameters_name_std = f'{target_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

# Load the standardized dataset
# SECURITY: weights_only=False makes torch.load unpickle arbitrary Python objects;
# only load files from a trusted source
dataset = torch.load(dataset_name_std, weights_only=False)

# Read the standardization parameters (JSON) into a dictionary
with open(dataset_parameters_name_std, 'r') as json_file:
    numpy_dict = json.load(json_file)

# Convert every JSON value that torch can represent into a tensor on `device`;
# values torch cannot convert (e.g. strings) are kept as plain Python objects
dataset_parameters = {}
for key, value in numpy_dict.items():
    try:
        dataset_parameters[key] = torch.tensor(value, device=device)
    except (TypeError, ValueError, RuntimeError):
        # Only conversion failures are tolerated; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed
        dataset_parameters[key] = value

In [6]:
# Standardize the requested target with the dataset parameters:
# subtract the target mean, multiply by the scale and divide by the target std.
# NOTE: the result overwrites `target_tensor`, so re-running this cell standardizes it again
centred_target = target_tensor - dataset_parameters['target_mean']
scaled_target  = centred_target * dataset_parameters['scale']
target_tensor  = scaled_target / dataset_parameters['target_std']

In [7]:
# Number of nodes of every graph in the dataset
total_nodes = torch.tensor([graph.num_nodes for graph in dataset])

# Mean and standard deviation of those node counts, as plain Python floats
node_counts = total_nodes.float()
mean_nodes  = node_counts.mean().item()
std_nodes   = node_counts.std().item()

(mean_nodes, std_nodes)

(17.983739852905273, 2.9542582035064697)

# Loading the model

In [8]:
# Number of node-level features, read from the first graph of the dataset
n_node_features = dataset[0].num_node_features

# Number of graph-level features, read from the length of the first graph's `y`
n_graph_features = len(dataset[0].y)

# Build the network on the selected device
model = glm.GNN(n_node_features, n_graph_features, pdropout_node, pdropout_edge)
model = model.to(device)

# Restore the trained weights and switch to evaluation mode
trained_state = torch.load(model_name, map_location=torch.device(device), weights_only=False)
model.load_state_dict(trained_state)
model.eval()

# Wrap the model so it can be parallelized over several GPUs
model = nn.DataParallel(model)

print('\nGCNN:')
print(model)


GCNN:
DataParallel(
  (module): GNN(
    (node_conv1): GraphConv(8, 32)
    (node_conv2): GraphConv(32, 64)
    (node_conv3): GraphConv(64, 4)
    (edge_linear_f1): Linear(in_features=17, out_features=32, bias=True)
    (edge_linear_r1): Linear(in_features=32, out_features=1, bias=True)
    (edge_linear_f2): Linear(in_features=65, out_features=32, bias=True)
    (edge_linear_r2): Linear(in_features=32, out_features=1, bias=True)
    (edge_linear_f3): Linear(in_features=129, out_features=16, bias=True)
    (edge_linear_r3): Linear(in_features=16, out_features=1, bias=True)
    (node_norm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (edge_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


# Generating new crystals

In [29]:
# Features appended to every graph: the standardized target followed by one extra
# entry initialised to 0 (presumably the time-step slot -- TODO confirm against
# glm.add_features_to_graph / glm.denoise). Built once, shared by all graphs.
features_tensor = torch.cat([target_tensor, torch.tensor([0], device=device, dtype=target_tensor.dtype)])

# Predicting loop
diffused_dataset = []
with torch.no_grad():
    for _ in range(N_predictions):
        # Draw the graph size from a normal distribution with the dataset's node-count
        # statistics. abs() folds negative draws back to positive values, and max(1, ...)
        # guarantees at least one node when the draw truncates to 0
        n_nodes = max(1, int(np.abs(np.random.normal(mean_nodes, std_nodes))))

        # Get random graph, acting as diffused
        diffused_graph = glm.get_random_graph(n_nodes, n_node_features)

        # Make room for n_graph_features and t_steps in the dataset
        diffused_graph = glm.add_features_to_graph(diffused_graph, features_tensor)

        diffused_dataset.append(diffused_graph)

    # Generate batch object and move data to device
    diff_batch = Batch.from_data_list(diffused_dataset).to(device)
    print(diff_batch)

    # Denoise batch
    predicted_dataset = glm.denoise(diff_batch,
                                    n_t_steps, alpha_decay,
                                    model,
                                    n_features=n_node_features)

# From batch object to list
predicted_dataset = predicted_dataset.to_data_list()

# Remove graph features: keep only the first n_node_features columns of every node
for graph in predicted_dataset:
    graph.x = graph.x[:, :n_node_features]

# Revert standardization
denoised_graphs = gld.revert_standardize_dataset(predicted_dataset, dataset_parameters)
denoised_graphs

DataBatch(x=[174, 8], edge_index=[2, 1469], edge_attr=[1469], batch=[174], ptr=[11])


[Data(x=[18, 4], edge_index=[2, 153], edge_attr=[153]),
 Data(x=[13, 4], edge_index=[2, 78], edge_attr=[78]),
 Data(x=[18, 4], edge_index=[2, 153], edge_attr=[153]),
 Data(x=[17, 4], edge_index=[2, 136], edge_attr=[136]),
 Data(x=[14, 4], edge_index=[2, 91], edge_attr=[91]),
 Data(x=[21, 4], edge_index=[2, 210], edge_attr=[210]),
 Data(x=[17, 4], edge_index=[2, 136], edge_attr=[136]),
 Data(x=[14, 4], edge_index=[2, 91], edge_attr=[91]),
 Data(x=[20, 4], edge_index=[2, 190], edge_attr=[190]),
 Data(x=[22, 4], edge_index=[2, 231], edge_attr=[231])]

In [30]:
# Work on the first generated graph
temp = denoised_graphs[0]

# Node features are kept as a tensor; the edge list (one row per edge after the
# transpose) and the edge attributes are copied to NumPy arrays on the CPU
nodes   = temp.x
edges   = temp.edge_index.cpu().detach().numpy().T
weights = temp.edge_attr.cpu().detach().numpy()

## Molecules

In [32]:
if is_molecule:
    # Initial guess for the positions: uniform random coordinates in [0, 10) along
    # each axis, stored as a flat 1D array (x0, y0, z0, x1, y1, z1, ...)
    solution = np.random.rand(len(nodes) * 3)*10

    def _edge_errors(solution_attempt, edges, weights):
        """Squared mismatch (distance - weight)**2 of every edge.

        Args:
            solution_attempt: Flat array of 3D positions (length 3 * n_particles).
            edges: Integer array of shape (n_edges, 2) with the two particle indices of each edge.
            weights: Array of n_edges reference distances.

        Returns:
            1D array with one squared error per edge.
        """
        positions = np.asarray(solution_attempt).reshape(-1, 3)
        edges     = np.asarray(edges, dtype=int).reshape(-1, 2)
        weights   = np.asarray(weights, dtype=float).reshape(-1)

        # Distance between the two end points of every edge, computed in one shot
        distances = np.linalg.norm(positions[edges[:, 1]] - positions[edges[:, 0]], axis=1)
        return np.power(distances - weights, 2)

    def objective(solution_attempt, edges, weights):
        """Total squared difference between inter-particle distances and edge weights.

        This is the function minimized to embed the graph in 3D space.
        Arguments are the same as for `_edge_errors`; returns a scalar.
        """
        return np.sum(_edge_errors(solution_attempt, edges, weights))

    def worst_identification(edges, attributes, solution_attempt):
        """Find the particle whose connections are reproduced worst.

        For every particle that appears in `edges`, the squared errors of all its
        edges are averaged.

        Args:
            edges: Integer array of shape (n_edges, 2).
            attributes: Array of n_edges reference distances.
            solution_attempt: Flat array of 3D positions.

        Returns:
            Tuple (particle, error): the index of the particle with the largest mean
            error and that error. The particle index is the actual node id, which
            differs from its rank among the connected particles whenever some node
            has no edge.
        """
        edges       = np.asarray(edges, dtype=int).reshape(-1, 2)
        edge_errors = _edge_errors(solution_attempt, edges, attributes)

        particles       = np.unique(edges)
        particle_errors = []
        for particle in particles:
            # Edges in which the particle takes part (as either end point)
            connected = (edges[:, 0] == particle) | (edges[:, 1] == particle)

            # Average over the connections of the node
            particle_errors.append(edge_errors[connected].mean())

        worst = int(np.argmax(particle_errors))
        return particles[worst], particle_errors[worst]

## Crystals

In [33]:
if not is_molecule:
    # Initial guess for the lattice parameters
    lattice_vectors = np.array([[12.6111574162109861, 0.0000011161086378, 0.0000448983002823],
                                [0.0000017328561662, 17.1582865432904406, -0.0000025255958988],
                                [0.0000367952301136, -0.0000015683987433, 10.2820259071568429]])

    # Initial guess for the positions: random fractional coordinates mapped into the
    # initial cell (Cartesian), flattened to 1D. This replaces a reference to the
    # variable `coordinates`, which is not defined anywhere in this notebook
    initial_positions = (np.random.rand(len(nodes), 3) @ lattice_vectors).ravel()
    solution = np.concatenate([lattice_vectors.ravel(), initial_positions])

    # Integer coefficients (i, j, k) of the 27 neighbouring periodic images, i, j, k in {-1, 0, 1}
    _IMAGE_INDICES = np.array([[i, j, k]
                               for i in (-1, 0, 1)
                               for j in (-1, 0, 1)
                               for k in (-1, 0, 1)])

    def _edge_errors(solution_attempt, edges, weights):
        """Squared mismatch of every edge, taking the best of the 27 periodic images.

        Args:
            solution_attempt: Flat array; once reshaped to (-1, 3) the first three rows
                are the lattice vectors and the remaining rows the particle positions.
            edges: Integer array of shape (n_edges, 2) with the two particle indices of each edge.
            weights: Array of n_edges reference distances.

        Returns:
            1D array with, for each edge, the minimum over the periodic images of
            (distance - weight)**2.
        """
        cell_and_positions = np.asarray(solution_attempt).reshape(-1, 3)
        lattice_vectors    = cell_and_positions[:3]
        positions          = cell_and_positions[3:]

        edges   = np.asarray(edges, dtype=int).reshape(-1, 2)
        weights = np.asarray(weights, dtype=float).reshape(-1)

        # i*lattice_vectors[0] + j*lattice_vectors[1] + k*lattice_vectors[2] for all 27 images
        image_shifts = _IMAGE_INDICES @ lattice_vectors

        # Separation of every edge towards every image: shape (n_edges, 27, 3)
        separations = positions[edges[:, 1]] - positions[edges[:, 0]]
        distances   = np.linalg.norm(separations[:, None, :] + image_shifts[None, :, :], axis=2)

        # Keep, for each edge, the image that matches the reference distance best
        return np.power(distances - weights[:, None], 2).min(axis=1)

    def objective(solution_attempt, edges, weights):
        """Total squared difference between periodic distances and edge weights.

        This is the function minimized to obtain lattice vectors and positions.
        Arguments are the same as for `_edge_errors`; returns a scalar.
        """
        return np.sum(_edge_errors(solution_attempt, edges, weights))

    def worst_identification(edges, attributes, solution_attempt):
        """Find the particle whose connections are reproduced worst.

        For every particle that appears in `edges`, the (minimum-image) squared
        errors of all its edges are averaged.

        Args:
            edges: Integer array of shape (n_edges, 2).
            attributes: Array of n_edges reference distances.
            solution_attempt: Flat array with lattice vectors followed by positions.

        Returns:
            Tuple (particle, error): the index of the particle with the largest mean
            error and that error. The particle index is the actual node id, which
            differs from its rank among the connected particles whenever some node
            has no edge.
        """
        edges       = np.asarray(edges, dtype=int).reshape(-1, 2)
        edge_errors = _edge_errors(solution_attempt, edges, attributes)

        particles       = np.unique(edges)
        particle_errors = []
        for particle in particles:
            # Edges in which the particle takes part (as either end point)
            connected = (edges[:, 0] == particle) | (edges[:, 1] == particle)

            # Average over the connections of the node
            particle_errors.append(edge_errors[connected].mean())

        worst = int(np.argmax(particle_errors))
        return particles[worst], particle_errors[worst]

In [34]:
# Stop as soon as the total squared error falls below this value
error_threshold = 1e-5

# Maximum number of minimize / re-initialize rounds
max_attempts = 100

for attempt in range(max_attempts):
    print()
    print(f'Attempt {attempt}')

    # The optimizer output is kept in its own variable so that `solution` is always a
    # plain coordinate array, both inside the loop and after it (also on early exit)
    result = minimize(objective, np.ravel(solution),
                      args=(edges, weights),
                      method='Powell')

    is_success       = result.success
    solution_message = result.message
    solution         = result.x

    worst_particle, worst_error = worst_identification(edges, weights, solution)

    attempt_error = objective(solution, edges, weights)
    print(f'Total: {attempt_error} and local {worst_error} errors')

    # Stop when converged, or after the last attempt: in the latter case the optimized
    # geometry is kept as is instead of being perturbed without a following minimization
    if attempt_error < error_threshold or attempt == max_attempts - 1:
        break

    solution = solution.reshape(-1, 3)  # Reshape to 2D array

    # Re-initialize the position of the worst particle
    # (for crystals the first three rows are the lattice vectors, hence the offset)
    if is_molecule:
        solution[worst_particle] = np.random.rand(3)
    else:
        solution[worst_particle+3] = np.random.rand(3)

    solution = solution.flatten()

# Check convergence status (as reported by the optimizer for the last attempt)
if is_success:
    print('Converged to a solution.')
else:
    print(f'Failed to converge: {solution_message}')


Attempt 0
Total: 77.48523182259045 and local 0.6329999255632265 errors

Attempt 1
Total: 77.20066500671535 and local 0.6216631390758556 errors

Attempt 2
Total: 77.48800982346968 and local 0.623685722475042 errors

Attempt 3
Total: 77.654806713137 and local 0.6384355329665493 errors

Attempt 4
Total: 77.71685800819633 and local 0.6358518108203721 errors

Attempt 5
Total: 77.66796971437712 and local 0.6345372153555331 errors

Attempt 6
Total: 77.71496010016631 and local 0.6370539971236819 errors

Attempt 7
Total: 77.64962428337947 and local 0.6351606462728148 errors

Attempt 8
Total: 77.7106664910225 and local 0.634845927850386 errors

Attempt 9
Total: 77.64965815162219 and local 0.6351494340008969 errors

Attempt 10
Total: 77.70850764448971 and local 0.634938034672591 errors

Attempt 11
Total: 77.6499193477986 and local 0.6351016986188665 errors

Attempt 12
Total: 77.7068970956085 and local 0.6349929058173133 errors

Attempt 13
Total: 77.65581856607565 and local 0.6346937472312821 err

In [35]:
# Accept either a coordinate array or a scipy OptimizeResult (whose `.x` holds the
# array) and arrange the coordinates as one 3D row per entry
solution = np.asarray(getattr(solution, 'x', solution)).reshape(-1, 3)

## Molecules

In [36]:
if is_molecule:
    # A fixed cubic box with edges of length 10 is used as the lattice for molecules
    lattice_vectors = np.diag([10, 10, 10])

    # The optimized coordinates are used directly as Cartesian positions
    cartesian_positions = solution

## Crystals

In [37]:
if not is_molecule:
    # The first three rows of the solution are the lattice vectors,
    # all remaining rows are the Cartesian positions of the atoms
    lattice_vectors, cartesian_positions = solution[:3], solution[3:]

In [41]:
# Node features of the selected graph. `temp` is used because it is defined in an
# earlier cell, so this cell also runs when the notebook is executed top to bottom
temp.x

tensor([[ 7.1365,  3.8249,  2.4776, 12.7807],
        [ 7.0138,  3.7843,  2.4891, 12.8251],
        [ 6.9057,  3.7339,  2.4783, 12.8256],
        [ 7.0122,  3.7737,  2.4817, 12.7812],
        [ 6.8408,  3.7007,  2.4785, 12.8339],
        [ 6.8943,  3.7361,  2.4786, 12.8249],
        [ 6.8747,  3.7351,  2.4760, 12.8615],
        [ 6.9799,  3.7713,  2.4803, 12.8099],
        [ 7.0315,  3.7640,  2.4816, 12.7987],
        [ 7.0216,  3.7837,  2.4838, 12.8142],
        [ 6.8890,  3.7358,  2.4774, 12.8242],
        [ 6.9740,  3.7701,  2.4817, 12.8139],
        [ 6.8881,  3.7048,  2.4775, 12.8413],
        [ 6.9214,  3.7429,  2.4763, 12.8331],
        [ 6.9967,  3.7504,  2.4828, 12.8152],
        [ 6.9247,  3.7348,  2.4798, 12.8254],
        [ 7.0200,  3.7788,  2.4806, 12.8384],
        [ 6.9106,  3.7369,  2.4812, 12.8187]], device='cuda:0')

In [39]:
POSCAR_name = None

# Get name for the first line of the POSCAR (default used when no name is given)
POSCAR_name = POSCAR_name or 'POSCAR from GenerativeModels'

# Clone the input graph to preserve the original structure
new_graph = temp.clone()

# Load and detach embeddings for the graph nodes
data_embeddings = new_graph.x.detach().cpu().numpy()

# Loading dictionary of available embeddings for atoms.
# Each non-empty line holds five whitespace-separated fields:
# key, mass, charge, electronegativity, ionization energy
available_embeddings = {}
with open('../MP/input/atomic_masses.dat', 'r') as atomic_masses_file:
    for line in atomic_masses_file:
        # Blank lines (e.g. at the end of the file) carry no data and would break the unpacking below
        if not line.strip():
            continue

        key, mass, charge, electronegativity, ionization_energy = line.split()

        # Keep only entries for which all information is present
        if all(val != 'None' for val in (mass, charge, electronegativity, ionization_energy)):
            available_embeddings[key] = np.array([mass, charge, electronegativity, ionization_energy], dtype=float)

# Get most similar atoms for each graph node and create a list of keys
keys = [glg.find_closest_key(available_embeddings, emb) for emb in data_embeddings]

# Get elements' composition, concentration, and positions
POSCAR_composition, POSCAR_concentration, POSCAR_positions = glg.composition_concentration_from_keys(keys, cartesian_positions)

In [40]:
# Write the structure to a file named 'CONTCAR'
# (mode 'w' already empties an existing file, so no explicit truncate is needed)
with open('CONTCAR', 'w') as POSCAR_file:
    # Write POSCAR's name
    POSCAR_file.write(f'{POSCAR_name}\n')

    # Write scaling factor (fixed to 1.0)
    POSCAR_file.write('1.0\n')

    # Write the lattice vectors, one per row
    np.savetxt(POSCAR_file, lattice_vectors, delimiter=' ')

    # Write composition (each different species) on a single line
    np.savetxt(POSCAR_file, [POSCAR_composition], fmt='%s', delimiter=' ')

    # Write concentration (number of each of the previous elements) on a single line
    np.savetxt(POSCAR_file, [POSCAR_concentration], fmt='%d', delimiter=' ')

    # Write position in cartesian form
    POSCAR_file.write('Cartesian\n')
    np.savetxt(POSCAR_file, POSCAR_positions, delimiter=' ')