In [1]:
def mount_drive(dir):
  import os
  colab = 1
  if colab == 1:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    current_folder = dir
    dest_folder = '/content/drive/My Drive/' + current_folder
    os.chdir(dest_folder)
    print('\n Current path: ' + os.getcwd())

mount_drive('')

Mounted at /content/drive

 Current path: /content/drive/My Drive


In [2]:
!pip install rdkit selfies

Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting selfies
  Downloading selfies-2.2.0-py3-none-any.whl.metadata (14 kB)
Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading selfies-2.2.0-py3-none-any.whl (36 kB)
Installing collected packages: selfies, rdkit
Successfully installed rdkit-2024.9.5 selfies-2.2.0


In [3]:
import numpy as np
import pandas as pd
from rdkit import Chem
import tensorflow as tf
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
import tensorflow.keras as keras
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
qm9 = pd.read_csv("qm9.csv")
qm9.head()

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom,Morgan_fingerprint,adj_matrix,node_features,node_count
0,gdb_1,C,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,...,-40.498597,6.469,-395.999595,-398.64329,-401.014647,-372.471772,[0 0 0 ... 0 0 0],[[0]],[[6]],1
1,gdb_2,N,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,...,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802,[0 0 0 ... 0 0 0],[[0]],[[7]],1
2,gdb_3,O,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,...,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171,[0 0 0 ... 0 0 0],[[0]],[[8]],1
3,gdb_4,C#C,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,...,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724,[0 0 0 ... 0 0 0],[[0 1]\n [1 0]],"[[6], [6]]",2
4,gdb_5,C#N,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028,[0 0 0 ... 0 0 0],[[0 1]\n [1 0]],"[[6], [7]]",2


In [5]:
class GraphGenerator_test(tf.keras.Model):
  def __init__(self, num_nodes, node_features, latent_dim):
     super().__init__()

     self.num_nodes = num_nodes
     self.node_features = node_features
     self.latent_dim = latent_dim

     self.mlp = models.Sequential([
         layers.Dense(128, input_shape = (latent_dim,), activation="leaky_relu"),
         layers.Dense(256, activation = "leaky_relu"),
         layers.Dense((num_nodes * num_nodes) + (num_nodes * node_features), activation = "softmax")
     ])

     self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

  def call(self, z):
    batch_size = tf.shape(z)[0]

    output = self.mlp(z)

    adj_flat = output[:, :self.num_nodes * self.num_nodes]
    adj_flat = tf.nn.sigmoid(adj_flat)
    node_feature_flat = output[:, self.num_nodes * self.num_nodes:]

    adj = tf.reshape(adj_flat, (batch_size, self.num_nodes, self.num_nodes))
    adj = (adj + tf.transpose(adj, perm=[0, 2, 1])) / 2
    adj = tf.nn.softmax(adj, axis=-1)
    adj = tf.cast(adj, tf.float32)

    valid_atom_types = tf.constant([1, 6, 7, 8, 9], dtype = tf.int64)

    node_features = tf.reshape(node_feature_flat, (batch_size, self.num_nodes, self.node_features))
    node_features = tf.nn.softmax(node_features, axis = -1)
    node_features = tf.clip_by_value(node_features, 0, 4)
    node_features = tf.cast(node_features, tf.int32)
    node_features = tf.gather(valid_atom_types, node_features)
    node_features = tf.cast(node_features, tf.float32)

    return adj, node_features

  def loss_function(self, real_output, fake_output):
    loss_func = tf.keras.losses.BinaryCrossentropy(from_logits = False)
    return loss_func(real_output, fake_output)

  def fit(self, data, discriminator, epochs = 10):
    for epoch in range(epochs):
      total_loss = 0
      num_batches = 0
      for z in data:
        with tf.GradientTape() as tape:
          gen_adj, gen_z = self(z)

          fake_output = discriminator(gen_adj, gen_z)
          loss = self.loss_function(tf.ones_like(fake_output), fake_output)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        total_loss += loss
        num_batches += 1
      average_loss = total_loss / num_batches
      print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss.numpy():.4f}")

In [6]:
class GraphDiscriminator_test(tf.keras.Model):
    def __init__(self, num_nodes, node_features):
        super().__init__()
        self.input_dim = num_nodes * num_nodes
        self.condition_dim = num_nodes * node_features

        self.mlp = models.Sequential([
            layers.Dense(256, input_shape = (self.input_dim + self.condition_dim,), activation='leaky_relu'),
            layers.Dense(128, activation='leaky_relu'),
            layers.BatchNormalization(),
            layers.Dense(1, activation='sigmoid')
        ])

    def call(self, adj, node_features):
        batch_size = tf.shape(adj)[0]

        adj_flat = tf.reshape(adj, (batch_size, self.input_dim))
        node_features_flat = tf.reshape(node_features, (batch_size, self.condition_dim))

        combined = tf.concat([adj_flat, node_features_flat], axis = 1)

        validity = self.mlp(combined)

        return validity

In [7]:
def adjacency_matrix_to_mol(inp_matrix):
  mol = Chem.RWMol()
  atom_types = np.unique(inp_matrix[1])
  matrix = inp_matrix[0]
  atom_map = {i: mol.AddAtom(Chem.Atom(int(atom_types[i]))) for i in range(len(atom_types))}

  if isinstance(matrix, tf.Tensor):
        matrix = matrix.numpy()
  if isinstance(atom_types, tf.Tensor):
      atom_types = atom_types.numpy()

  matrix = np.array(matrix)
  atom_types = np.array(atom_types)

  if len(matrix) != len(atom_types):
    raise ValueError("NUMBER OF ATOM TYPES DOES NOT MATCH MATRIX DIMENSIONS")

  # Step 1: Add Atoms
  for i, atom_num in enumerate(atom_types):
    atom = Chem.Atom(int(atom_num))
    mol_idx = mol.AddAtom(atom)
    atom_map[i] = mol_idx

  # Step 2: Add Bonds
  for i in range(len(matrix)):
    for j in range(i + 1, len(matrix)):
        if j not in atom_map:
            continue

        value = int(np.argmax(matrix[i, j]))
        value = min(max(value, 0), 4)

        if value != 0:
            bond_type = {
                1: Chem.BondType.SINGLE,
                2: Chem.BondType.DOUBLE,
                3: Chem.BondType.TRIPLE,
                4: Chem.BondType.AROMATIC
            }.get(value, None)

            if bond_type is None:
                raise ValueError(f"INVALID BOND TYPE DETECTED: {value}")

            print(f"Adding bond: {i}-{j} Type: {bond_type}")
            mol.AddBond(atom_map[i], atom_map[j], bond_type)

  return mol

In [None]:

condition_features = ['mu', 'homo', 'gap']
condition_data = qm9.loc[:4, condition_features].values.astype(np.float32)

scaler = StandardScaler()
condition_data = scaler.fit_transform(condition_data)

print("Condition Data Shape:", condition_data.shape)
print(condition_data)

Condition Data Shape: (5, 3)
[[-1.1326836  -1.446403    1.9337426 ]
 [ 0.3125086   1.2079761  -0.70988333]
 [ 0.5129828   0.48091564 -0.36359873]
 [-1.1326836   0.64948    -0.7868353 ]
 [ 1.439876   -0.8919687  -0.07342527]]


In [None]:
gen = GraphGenerator_test(10,5,32)
z = tf.random.normal((5, 32))
gen_out = gen(z)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Discrete adjacency matrix (bond types):
 [[[0.09998479 0.09998094 0.10005055 0.0999805  0.10000604 0.09997454
   0.09999108 0.09994368 0.10004399 0.1000095 ]
  [0.09998094 0.10001254 0.10001496 0.09999897 0.09994604 0.10005394
   0.10009794 0.09999936 0.09997056 0.09997086]
  [0.10005055 0.10001496 0.09997237 0.1000327  0.09993888 0.10002509
   0.10006432 0.09997334 0.09998472 0.09997551]
  [0.0999805  0.09999897 0.1000327  0.09998281 0.09999724 0.0999912
   0.10002699 0.09994563 0.09995595 0.10000767]
  [0.10000604 0.09994604 0.09993888 0.09999724 0.10007958 0.1000319
   0.10000467 0.09997105 0.10003257 0.09995325]
  [0.09997454 0.10005394 0.10002509 0.0999912  0.1000319  0.09994145
   0.09996527 0.10004282 0.10005735 0.10005151]
  [0.09999108 0.10009794 0.10006432 0.10002699 0.10000467 0.09996527
   0.10004044 0.100058   0.09990337 0.10002761]
  [0.09994368 0.09999936 0.09997334 0.09994563 0.09997105 0.10004282
   0.100058   0.09996028 0.09995854 0.09997675]
  [0.10004399 0.09997056 

In [None]:
batch_size = 32
latent_dim = 16
num_nodes = 10
node_features = 5

# Create synthetic dataset of random latent vectors (z)
dataset = tf.data.Dataset.from_tensor_slices(
    tf.random.normal((1000, latent_dim))
).batch(batch_size)

generator = GraphGenerator_test(num_nodes=10, node_features=5, latent_dim=16)
discriminator = GraphDiscriminator_test(num_nodes=10, node_features=5)
# Train the generator for 10 epochs
generator.fit(dataset, discriminator, epochs=10)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10, Loss: 0.6995
Epoch 2/10, Loss: 0.6991
Epoch 3/10, Loss: 0.6988
Epoch 4/10, Loss: 0.6988
Epoch 5/10, Loss: 0.6988
Epoch 6/10, Loss: 0.6988
Epoch 7/10, Loss: 0.6988
Epoch 8/10, Loss: 0.6988
