In [1]:
import numpy             as np
import torch.nn          as nn
import libraries.dataset as gld
import libraries.model   as glm
import libraries.graph   as glg
import torch
import json

from pymatgen.core        import Structure
from scipy.optimize       import minimize
from torch_geometric.data import Batch, Data

# Select the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Starting from random noise, we generate completely new materials
# A target property can be sought with this approach

In [3]:
# Selects the molecule branch (True) or the crystal branch (False) of the cells below
is_molecule = True

# Folder in which all data is stored, and the trained model inside it
target_folder = 'models/QM9-all-linked/GM_v0'
model_name    = f'{target_folder}/model.pt'

# Number of graphs to predict
N_predictions = 10

# Target value to be generated, wrapped as an integer tensor on the selected device
target_value  = 0
target_tensor = torch.tensor(target_value, dtype=torch.int, device=device)

# Load model data

In [4]:
# Read the model parameters, stored in JSON format, into a dictionary
with open(f'{target_folder}/model_parameters.json', 'r') as json_file:
    numpy_dict = json.load(json_file)

# Convert every JSON value that torch can represent into a tensor on `device`;
# values that cannot be converted (e.g. strings, None, ragged lists) are kept as loaded
model_parameters = {}
for key, value in numpy_dict.items():
    try:
        model_parameters[key] = torch.tensor(value, device=device)
    except (TypeError, ValueError, RuntimeError):
        # Only conversion failures are tolerated; anything else
        # (KeyboardInterrupt, programming errors, ...) must propagate
        model_parameters[key] = value

# Number of diffusing and denoising steps
n_t_steps = model_parameters['n_t_steps']

# NOTE(review): this overrides any 'alpha_decay' value loaded from the JSON file
# with a hard-coded 0.4 -- confirm that this is intended
model_parameters['alpha_decay'] = torch.tensor(0.4, device=device)

# Decay of parameter alpha
alpha_decay = model_parameters['alpha_decay']

# Dropouts for node and edge models (independent of each other)
pdropout_node = model_parameters['dropout_node']
pdropout_edge = model_parameters['dropout_edge']

# Generation of graph database for training

Load the datasets, already standardized if possible.

In [5]:
# Paths of the raw dataset, the standardized labels/dataset and the standardization parameters
dataset_name                = f'{target_folder}/dataset.pt'
labels_name                 = f'{target_folder}/standardized_labels.pt'
dataset_name_std            = f'{target_folder}/standardized_dataset.pt'
dataset_parameters_name_std = f'{target_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

# Load the standardized dataset
# NOTE: weights_only=False unpickles arbitrary Python objects; only load files from a trusted source
dataset = torch.load(dataset_name_std, weights_only=False)

# Read the standardization parameters, stored in JSON format, into a dictionary
with open(dataset_parameters_name_std, 'r') as json_file:
    numpy_dict = json.load(json_file)

# Convert every JSON value that torch can represent into a tensor on `device`;
# values that cannot be converted (e.g. strings, None, ragged lists) are kept as loaded
dataset_parameters = {}
for key, value in numpy_dict.items():
    try:
        dataset_parameters[key] = torch.tensor(value, device=device)
    except (TypeError, ValueError, RuntimeError):
        # Only conversion failures are tolerated; anything else
        # (KeyboardInterrupt, programming errors, ...) must propagate
        dataset_parameters[key] = value

In [6]:
# Express the requested target in the standardized units of the dataset:
# subtract the mean, then rescale by scale / standard deviation.
# Beware: re-running this cell standardizes the already standardized target again.
target_offset = target_tensor - dataset_parameters['target_mean']
target_tensor = target_offset * dataset_parameters['scale'] / dataset_parameters['target_std']

In [7]:
# Mean and standard deviation of the number of nodes per graph in the dataset
total_nodes = torch.tensor([graph.num_nodes for graph in dataset])
node_counts = total_nodes.float()
mean_nodes  = node_counts.mean().item()
std_nodes   = node_counts.std().item()

mean_nodes, std_nodes

(17.983739852905273, 2.9542582035064697)

# Loading the model

In [8]:
# Number of node-level features, read from the first graph of the dataset
n_node_features = dataset[0].num_node_features

# Number of graph-level features to be predicted, read from the first graph's labels
n_graph_features = len(dataset[0].y)

# Build the network and place it on the selected device
model = glm.GNN(n_node_features, n_graph_features, pdropout_node, pdropout_edge)
model = model.to(device)

# Restore the trained weights and switch to evaluation mode
state_dict = torch.load(model_name, map_location=device, weights_only=False)
model.load_state_dict(state_dict)
model.eval()

# Allow data parallelization among multi-GPU
model = nn.DataParallel(model)

print('\nGCNN:')
print(model)


GCNN:
DataParallel(
  (module): GNN(
    (node_conv1): GraphConv(8, 32)
    (node_conv2): GraphConv(32, 64)
    (node_conv3): GraphConv(64, 4)
    (edge_linear_f1): Linear(in_features=17, out_features=32, bias=True)
    (edge_linear_r1): Linear(in_features=32, out_features=1, bias=True)
    (edge_linear_f2): Linear(in_features=65, out_features=32, bias=True)
    (edge_linear_r2): Linear(in_features=32, out_features=1, bias=True)
    (edge_linear_f3): Linear(in_features=129, out_features=16, bias=True)
    (edge_linear_r3): Linear(in_features=16, out_features=1, bias=True)
    (node_norm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (edge_norm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


# Generating new crystals

In [28]:
# Development aid: reload libraries.model so that edits to the module are
# picked up without restarting the kernel
import importlib
importlib.reload(glm)

<module 'libraries.model' from '/home/claudio/cibran/Work/UPC/GenerativeModels/libraries/model.py'>

In [29]:
# Create constant graph-level feature tensor once: the target followed by a single 0
# NOTE(review): the trailing 0 is presumably the slot for the t_step -- confirm against glm.add_features_to_graph
features_tensor = torch.cat([target_tensor, torch.tensor([0], device=device, dtype=target_tensor.dtype)])

# Predicting loop
diffused_dataset = []
with torch.no_grad():
    for idx in range(N_predictions):
        # Draw the number of nodes from the dataset statistics. The absolute value removes
        # negative draws, and the lower bound of 1 avoids an empty graph when the draw
        # truncates to 0 (int() rounds towards zero)
        n_nodes = max(1, int(np.abs(np.random.normal(mean_nodes, std_nodes))))

        # Get random graph, acting as diffused
        diffused_graph = glm.get_random_graph(n_nodes, n_node_features)

        # Make room for n_graph_features and t_steps in the dataset
        diffused_graph = glm.add_features_to_graph(diffused_graph, features_tensor)

        diffused_dataset.append(diffused_graph)

    # Generate batch object and move data to device
    diff_batch = Batch.from_data_list(diffused_dataset).to(device)
    print(diff_batch)

    # Denoise batch
    predicted_dataset = glm.denoise(diff_batch,
                                    n_t_steps, alpha_decay,
                                    model,
                                    n_features=n_node_features)

# From batch object to list
predicted_dataset = predicted_dataset.to_data_list()

# Remove graph features: keep only the first n_node_features columns of every node
for graph in predicted_dataset:
    graph.x = graph.x[:, :n_node_features]

# Revert standardization
denoised_graphs = gld.revert_standardize_dataset(predicted_dataset, dataset_parameters)
denoised_graphs

DataBatch(x=[174, 8], edge_index=[2, 1469], edge_attr=[1469], batch=[174], ptr=[11])


[Data(x=[18, 4], edge_index=[2, 153], edge_attr=[153]),
 Data(x=[13, 4], edge_index=[2, 78], edge_attr=[78]),
 Data(x=[18, 4], edge_index=[2, 153], edge_attr=[153]),
 Data(x=[17, 4], edge_index=[2, 136], edge_attr=[136]),
 Data(x=[14, 4], edge_index=[2, 91], edge_attr=[91]),
 Data(x=[21, 4], edge_index=[2, 210], edge_attr=[210]),
 Data(x=[17, 4], edge_index=[2, 136], edge_attr=[136]),
 Data(x=[14, 4], edge_index=[2, 91], edge_attr=[91]),
 Data(x=[20, 4], edge_index=[2, 190], edge_attr=[190]),
 Data(x=[22, 4], edge_index=[2, 231], edge_attr=[231])]

In [31]:
# Inspect the edge attributes of the first generated graph
denoised_graphs[0].edge_attr

tensor([2.7519, 2.9047, 2.6898, 3.0402, 2.9155, 2.9910, 3.0098, 3.0166, 2.8677,
        2.6701, 3.1365, 2.7193, 3.0985, 2.8962, 2.8931, 2.9255, 3.0719, 2.6909,
        2.7070, 3.3156, 3.1131, 2.5674, 2.9764, 2.9588, 3.2445, 2.6130, 2.9292,
        2.7703, 3.1421, 3.2245, 2.7823, 2.9829, 2.7802, 2.5691, 3.2204, 2.9880,
        2.6501, 2.6365, 2.6385, 3.1619, 2.9183, 2.6999, 2.9126, 3.0100, 3.0192,
        3.1426, 2.8504, 2.9245, 2.9521, 2.9425, 2.6392, 2.7955, 2.8089, 2.9238,
        2.9586, 3.1135, 2.8069, 2.8015, 2.8246, 2.6696, 2.9643, 2.8734, 2.7053,
        3.0933, 2.9145, 3.0263, 2.6329, 2.6536, 2.9527, 2.9915, 2.7621, 2.8187,
        2.6876, 2.9272, 3.1350, 3.0114, 2.8606, 2.7261, 2.8191, 2.9666, 3.0958,
        3.1195, 3.1285, 3.0675, 2.8924, 2.7753, 3.1929, 2.6308, 3.3878, 3.1406,
        2.8832, 2.9401, 2.6327, 2.8823, 3.0740, 2.8956, 2.8834, 3.1169, 3.3332,
        2.9948, 2.9850, 2.7612, 2.8803, 2.8523, 2.8900, 2.9136, 2.9375, 3.2321,
        3.1448, 3.2407, 2.6316, 3.0615, 

In [30]:
# Work with the first generated graph from here on
temp = denoised_graphs[0]

# Node features (kept as a tensor), edge list with one (source, target) pair per row,
# and one weight per edge; edges and weights are numpy arrays on the CPU
nodes   = temp.x
edges   = temp.edge_index.detach().cpu().numpy().transpose()
weights = temp.edge_attr.detach().cpu().numpy()

## Molecules

In [15]:
if is_molecule:
    # Initial guess for the positions: random coordinates drawn uniformly from [0, 10)
    # for every node, stored as a flat 1D array (x0, y0, z0, x1, y1, z1, ...)
    solution = np.random.rand(len(nodes) * 3)*10
    #solution = coordinates.reshape(-1, 1).ravel()

    def objective(solution_attempt, edges, weights):
        """Sum of squared differences between node-to-node distances and edge weights.

        Args:
            solution_attempt: Flat array with the x, y, z coordinates of every node.
            edges:            Integer array with one (node, node) index pair per row.
            weights:          Target distance of every edge, in the same order as edges.

        Returns:
            The squared error accumulated over all edges.
        """
        positions = np.asarray(solution_attempt).reshape(-1, 3)  # One row per node

        # This function is called many times by the optimizer, so all edges are
        # evaluated at once instead of looping over them in Python
        differences = positions[edges[:, 1]] - positions[edges[:, 0]]
        distances   = np.linalg.norm(differences, axis=1)
        return np.sum(np.power(distances - np.asarray(weights), 2))

    def worst_identification(edges, attributes, solution_attempt):
        """Identify the node whose connections are, on average, worst reproduced.

        Args:
            edges:            Integer array with one (node, node) index pair per row.
            attributes:       Target distance of every edge, in the same order as edges.
            solution_attempt: Flat array with the x, y, z coordinates of every node.

        Returns:
            Tuple (index, error): the position, within np.unique(edges), of the node
            with the largest mean squared error over its edges, and that error.
        """
        positions = solution_attempt.reshape(-1, 3)  # One row per node

        # Squared error of every edge, computed once and shared by all nodes
        targets        = np.array([float(attribute) for attribute in attributes])
        differences    = positions[edges[:, 1]] - positions[edges[:, 0]]
        squared_errors = np.power(np.linalg.norm(differences, axis=1) - targets, 2)

        particle_errors = []
        for particle in np.unique(edges):
            # Edges in which the particle takes part, at either end
            is_connected = (edges[:, 0] == particle) | (edges[:, 1] == particle)

            # Average over the connections of the node
            particle_errors.append(np.mean(squared_errors[is_connected]))

        return np.argmax(particle_errors), np.max(particle_errors)

## Crystals

In [32]:
# Hard-coded flat array of 216 values (72 x, y, z triplets), used as the initial guess
# for the atomic positions in the crystal branch (`initial_positions = coordinates`).
# NOTE(review): every value lies in [0, 1), so these are presumably fractional coordinates,
# while the crystal objective adds lattice vectors to position differences directly -- confirm units.
coordinates = np.array([
     0.0833003865704711  , 0.1859270061465210  , 0.3777845439929735
  , 0.0833003981061324  , 0.6859269881513725  , 0.3777845451628608
  , 0.4167001356044722  , 0.1859269992816479  , 0.3777845191410876
  , 0.4167001621404367  , 0.6859270194700571  , 0.3777845271190117
  , 0.7500002705756827  , 0.1859316651280807  , 0.3777885037770403
  , 0.7500002296629660  , 0.6859316875596519  , 0.3777885214985517
  , 0.2499997225877237  , 0.3140683536350650  , 0.6222114970725059
  , 0.2499996959117041  , 0.8140683345865583  , 0.6222114853550877
  , 0.5832998677821450  , 0.3140729827261879  , 0.6222154671411744
  , 0.5832999024809595  , 0.8140729783177250  , 0.6222154056502092
  , 0.9166995955193187  , 0.3140729782225051  , 0.6222154505134370
  , 0.9166995850678958  , 0.8140729821050883  , 0.6222154662954935
  , 0.0833003557957213  , 0.4359388259980790  , 0.1222199397756398
  , 0.0833003703218509  , 0.9359387910009076  , 0.1222198643536814
  , 0.4166992957118154  , 0.4359388088130629  , 0.1222198955474596
  , 0.4166993028463040  , 0.9359387786070243  , 0.1222199189671187
  , 0.7499998539722768  , 0.4359438124948127  , 0.1222156160502195
  , 0.7499998313637235  , 0.9359437662274885  , 0.1222156247913588
  , 0.2500001334862176  , 0.0640561904985262  , 0.8777843960840457
  , 0.2500001501522888  , 0.5640561935034967  , 0.8777843427165877
  , 0.5833007338820693  , 0.0640612097785151  , 0.8777800558508204
  , 0.5833007418971903  , 0.5640612254590351  , 0.8777801148276509
  , 0.9166996202755655  , 0.0640611993023512  , 0.8777801089383459
  , 0.9166996118127884  , 0.5640612295363212  , 0.8777801377671963
  , 0.2499997762110198  , 0.1713601343107101  , 0.5481720876539455
  , 0.2499997840703116  , 0.6713601214163134  , 0.5481720780145309
  , 0.5833250162477910  , 0.1713572960325820  , 0.5481789775038095
  , 0.5833250495988693  , 0.6713573040678895  , 0.5481789530732897
  , 0.9166745746964295  , 0.1713572981373801  , 0.5481789842084837
  , 0.9166745530899618  , 0.6713573021865074  , 0.5481789776050263
  , 0.0833249621182475  , 0.0786333712486353  , 0.0481761717281017
  , 0.0833249283762569  , 0.5786334128382151  , 0.0481761718552747
  , 0.4166745279576674  , 0.0786333673791404  , 0.0481761917888477
  , 0.4166745408175885  , 0.5786334029086504  , 0.0481761988613769
  , 0.7499997542585817  , 0.0786306836572876  , 0.0481693636313025
  , 0.7499997696249991  , 0.5786307106868804  , 0.0481693779947747
  , 0.2500002428303674  , 0.4213693111674246  , 0.9518306289886667
  , 0.2500002524284000  , 0.9213693090296857  , 0.9518306479504446
  , 0.5833254785610436  , 0.4213666122472688  , 0.9518238281457627
  , 0.5833254664540277  , 0.9213666048503626  , 0.9518238233300949
  , 0.9166750443396126  , 0.4213666121736850  , 0.9518238196621454
  , 0.9166750512045638  , 0.9213665989570998  , 0.9518238133931618
  , 0.0833254093017004  , 0.3286426988348055  , 0.4518210070768447
  , 0.0833254152034897  , 0.8286426985367399  , 0.4518210434996561
  , 0.4166749883104828  , 0.3286426993701070  , 0.4518210326992929
  , 0.4166750112461699  , 0.8286427028116776  , 0.4518210241465397
  , 0.7500002189827981  , 0.3286398726945166  , 0.4518279165260282
  , 0.7500002050440315  , 0.8286398695303134  , 0.4518279056712515
  , 0.0833343904517392  , 0.2493895957389327  , 0.8276681784694375
  , 0.0833343689273747  , 0.7493896103808027  , 0.8276681775519847
  , 0.4166646008297334  , 0.2493895985448376  , 0.8276681888402280
  , 0.4166646015290567  , 0.7493896053118263  , 0.8276681788916704
  , 0.7499994955974358  , 0.2493892016752000  , 0.8276653349518099
  , 0.7499994966257333  , 0.7493892321762345  , 0.8276653329659069
  , 0.2499994993380952  , 0.0006115346223297  , 0.3276669512905670
  , 0.2499994545945086  , 0.5006115507435567  , 0.3276669855827379
  , 0.5833340316108817  , 0.0006112292100227  , 0.3276694445189889
  , 0.5833340450490496  , 0.5006112576846249  , 0.3276694188060532
  , 0.9166649171447929  , 0.0006112406556724  , 0.3276694414395962
  , 0.9166649422872197  , 0.5006112516606720  , 0.3276694166837473
  , 0.0833350774736772  , 0.4993887728596320  , 0.6723305520180602
  , 0.0833350565146773  , 0.9993887556194139  , 0.6723305767408831
  , 0.4166659399161503  , 0.4993887649443494  , 0.6723305628139755
  , 0.4166659743108312  , 0.9993887529203889  , 0.6723305536346871
  , 0.7500005282137465  , 0.4993884802271822  , 0.6723330568981183
  , 0.7500005211473351  , 0.9993884480369175  , 0.6723330647731487
  , 0.2500004948450609  , 0.2506107521768470  , 0.1723346898767488
  , 0.2500005000138685  , 0.7506107679165481  , 0.1723346662931746
  , 0.5833354355602083  , 0.2506103907686850  , 0.1723318159613498
  , 0.5833354392384109  , 0.7506103988549171  , 0.1723318206017836
  , 0.9166656093784553  , 0.2506103920247469  , 0.1723318129244831
  , 0.9166656048998192  , 0.7506104136216649  , 0.1723318040722503
])

In [16]:
if not is_molecule:
    # Initial guess for the lattice parameters
    lattice_vectors = np.array([[12.6111574162109861, 0.0000011161086378, 0.0000448983002823],
                                [0.0000017328561662, 17.1582865432904406, -0.0000025255958988],
                                [0.0000367952301136, -0.0000015683987433, 10.2820259071568429]])

    # Initial guess for the positions
    #initial_positions = np.random.rand(len(nodes) * 3)  # Random initial positions, 1D array
    # NOTE(review): `coordinates` is hard-coded; its number of positions has to match the
    # nodes referenced by `edges` -- confirm for the graph being reconstructed
    initial_positions = coordinates
    solution = np.concatenate([lattice_vectors.ravel(), initial_positions])

    # Integer combinations (i, j, k), each in {-1, 0, 1}, labelling the 27 images of
    # the cell considered for every edge (the cell itself and its nearest neighbours)
    _IMAGE_SHIFTS = np.array([[i, j, k]
                              for i in (-1, 0, 1)
                              for j in (-1, 0, 1)
                              for k in (-1, 0, 1)])

    def _min_image_squared_errors(solution_attempt, edges, weights):
        """Squared distance error of every edge, taking the best of its 27 images."""
        solution_attempt = np.asarray(solution_attempt).reshape(-1, 3)  # Reshape to 2D array

        lattice_vectors = solution_attempt[:3]
        positions       = solution_attempt[3:]

        # Each row is i*lattice_vectors[0] + j*lattice_vectors[1] + k*lattice_vectors[2];
        # it does not depend on the edge, so it is computed once per call
        translations = _IMAGE_SHIFTS @ lattice_vectors

        # Distance of every edge to each of its images: one row per edge, one column per image
        differences = positions[edges[:, 1]] - positions[edges[:, 0]]
        distances   = np.linalg.norm(differences[:, None, :] + translations[None, :, :], axis=-1)

        # Keep, for every edge, the image that best matches the target distance
        targets = np.array([float(weight) for weight in weights])
        return np.min(np.power(distances - targets[:, None], 2), axis=1)

    def objective(solution_attempt, edges, weights):
        """Sum over edges of the squared difference between distance and weight.

        Args:
            solution_attempt: Flat array holding the three lattice vectors followed
                              by the x, y, z coordinates of every node.
            edges:            Integer array with one (node, node) index pair per row.
            weights:          Target distance of every edge, in the same order as edges.

        Returns:
            The squared error accumulated over all edges, each edge contributing the
            error of its best-matching image.
        """
        return np.sum(_min_image_squared_errors(solution_attempt, edges, weights))

    def worst_identification(edges, attributes, solution_attempt):
        """Identify the node whose connections are, on average, worst reproduced.

        Args:
            edges:            Integer array with one (node, node) index pair per row.
            attributes:       Target distance of every edge, in the same order as edges.
            solution_attempt: Flat array holding the three lattice vectors followed
                              by the x, y, z coordinates of every node.

        Returns:
            Tuple (index, error): the position, within np.unique(edges), of the node
            with the largest mean squared error over its edges, and that error.
        """
        squared_errors = _min_image_squared_errors(solution_attempt, edges, attributes)

        particle_errors = []
        for particle in np.unique(edges):
            # Edges in which the particle takes part, at either end
            is_connected = (edges[:, 0] == particle) | (edges[:, 1] == particle)

            # Average over the connections of the node
            particle_errors.append(np.mean(squared_errors[is_connected]))

        return np.argmax(particle_errors), np.max(particle_errors)

In [17]:
error_threshold = 1e-5

# worst_identification reports a position within np.unique(edges);
# this array maps that position back to the actual node index
particle_ids = np.unique(edges)

# Starting point of the next minimization, always a flat array (whatever shape,
# or OptimizeResult wrapper, a previous run of the notebook left in `solution`)
initial_guess = np.asarray(getattr(solution, 'x', solution), dtype=float).ravel()

for attempt in range(100):
    print()
    print(f'Attempt {attempt}')
    result = minimize(objective, initial_guess,
                      args=(edges, weights),
                      method='Powell')

    is_success       = result.success
    solution_message = result.message

    # `solution` always holds the latest optimized coordinates as a flat array, so the
    # following cells can reshape it regardless of how the loop ends (break, exhaustion
    # of the attempts or manual interruption)
    solution = result.x
    worst_particle, worst_error = worst_identification(edges, weights, solution)

    attempt_error = objective(solution, edges, weights)
    print(f'Total: {attempt_error} and local {worst_error} errors')

    if attempt_error < error_threshold:
        break

    # Next attempt: restart from the optimized coordinates with the worst node
    # re-initialized at random. The perturbation goes into a copy, so the optimized
    # solution itself is never left with a random position in it
    initial_guess = solution.reshape(-1, 3).copy()  # Reshape to 2D array
    worst_node    = particle_ids[worst_particle]

    if is_molecule:
        initial_guess[worst_node] = np.random.rand(3)
    else:
        # The first three rows are the lattice vectors
        initial_guess[worst_node+3] = np.random.rand(3)

    initial_guess = initial_guess.flatten()

# Check convergence status
if is_success:
    print('Converged to a solution.')
else:
    print(f'Failed to converge: {solution_message}')


Attempt 0
Total: 0.5415438589272228 and local 0.00040429978425656995 errors

Attempt 1
Total: 0.5368245350803098 and local 0.00038523575297439415 errors

Attempt 2
Total: 0.5366366950743916 and local 0.00038031069335487305 errors

Attempt 3
Total: 0.5366253064002295 and local 0.00038189563477137326 errors

Attempt 4
Total: 0.5367007180030032 and local 0.00038107941270398474 errors

Attempt 5


KeyboardInterrupt: 

In [19]:
# `solution` is still the scipy OptimizeResult when the optimization loop exits through
# `break`; unwrap its `.x` array in that case before arranging the values in rows of three
solution = np.asarray(getattr(solution, 'x', solution)).reshape(-1, 3)

## Molecules

In [20]:
if is_molecule:
    # Use a cubic box of side 10 as lattice vectors; the rows of the
    # optimized solution are taken directly as the Cartesian positions
    lattice_vectors     = 10 * np.eye(3, dtype=int)
    cartesian_positions = solution

## Crystals

In [21]:
if not is_molecule:
    # The first three rows of the solution are the lattice vectors,
    # the remaining rows are the positions of the atoms
    lattice_vectors, cartesian_positions = solution[:3], solution[3:]

In [22]:
# Name for the first line of the POSCAR; a default is used when none is given
POSCAR_name = None
if not POSCAR_name:
    POSCAR_name = 'POSCAR from GenerativeModels'

# Work on a copy of the graph so that the original structure is preserved
new_graph = temp.clone()

# Node embeddings of the graph as a numpy array on the CPU
data_embeddings = new_graph.x.detach().cpu().numpy()

# Dictionary of available embeddings for atoms, read from a whitespace-separated file with
# five columns per line: key, mass, charge, electronegativity and ionization energy
available_embeddings = {}
with open('../MP/input/atomic_masses.dat', 'r') as atomic_masses_file:
    for line in atomic_masses_file:
        key, mass, charge, electronegativity, ionization_energy = line.split()
        properties = (mass, charge, electronegativity, ionization_energy)

        # Skip entries for which some information is missing
        if 'None' in properties:
            continue

        available_embeddings[key] = np.array(properties, dtype=float)

# Most similar atom for each graph node, as a list of keys
# NOTE(review): find_closest_key and composition_concentration_from_keys are neither defined
# nor imported in the visible cells -- presumably they live in one of the `libraries` modules; confirm
keys = []
for emb in data_embeddings:
    keys.append(find_closest_key(available_embeddings, emb))

# Elements' composition, concentration, and positions
POSCAR_composition, POSCAR_concentration, POSCAR_positions = composition_concentration_from_keys(keys, cartesian_positions)

In [23]:
# Write the structure to file; opening with mode 'w' already discards any previous content
with open('CONTCAR', 'w') as POSCAR_file:
    # Header: POSCAR's name, then the scaling factor (assumed to be 1.0)
    POSCAR_file.writelines([f'{POSCAR_name}\n', '1.0\n'])

    # Lattice vectors, one per row
    np.savetxt(POSCAR_file, lattice_vectors, delimiter=' ')

    # Composition (each different species) followed by
    # concentration (number of atoms of each of those species)
    np.savetxt(POSCAR_file, [POSCAR_composition], fmt='%s', delimiter=' ')
    np.savetxt(POSCAR_file, [POSCAR_concentration], fmt='%d', delimiter=' ')

    # Positions in cartesian form
    POSCAR_file.write('Cartesian\n')
    np.savetxt(POSCAR_file, POSCAR_positions, delimiter=' ')