In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your cleaned SMILES data
df = pd.read_csv("cleaned_smiles_data.csv")

# Initialize the tokenizer
tokenizer = Tokenizer(char_level=True)  # Tokenize at the character level for SMILES
tokenizer.fit_on_texts(df['Canonical SMILES'])

# Convert SMILES strings to sequences
sequences = tokenizer.texts_to_sequences(df['Canonical SMILES'])

# Pad sequences to the same length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")

print("Tokenized and padded SMILES sequences:")
print(padded_sequences)

# Save tokenizer for later use
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


Tokenized and padded SMILES sequences:
[[1 1 3 2 4 5 4 1 6 2 1 1 2 1 1 2 1 6 1 3 2 4 5 4 0 0 0 0 0]
 [1 1 3 2 4 5 7 1 6 2 1 1 2 1 3 1 2 1 6 5 4 0 0 0 0 0 0 0 0]
 [1 1 3 1 5 1 1 6 2 1 1 2 1 3 1 2 1 6 5 1 3 1 5 1 3 2 4 5 4]]


In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Parameters
vocab_size = 10  # Example vocabulary size
embedding_dim = 8
max_length = 29  # Maximum sequence length
batch_size = 32
epochs = 10

# Generator
def build_generator(vocab_size, max_length):
    model = tf.keras.Sequential([
        layers.Input(shape=(max_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        layers.LSTM(256, return_sequences=True),
        layers.LSTM(256, return_sequences=True),
        layers.TimeDistributed(layers.Dense(embedding_dim)),  # Produces embedding vectors
    ])
    return model

# Discriminator
def build_discriminator(max_length, embedding_dim):
    model = tf.keras.Sequential([
        layers.Input(shape=(max_length, embedding_dim)),  # Accepts embedded sequences
        layers.LSTM(256, return_sequences=True),
        layers.LSTM(256),
        layers.Dense(1, activation="sigmoid"),  # Binary classification
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Build models
generator = build_generator(vocab_size, max_length)
discriminator = build_discriminator(max_length, embedding_dim)

# GAN
discriminator.trainable = False
gan_input = tf.keras.Input(shape=(max_length,))
generator_output = generator(gan_input)  # Generator produces embeddings
gan_output = discriminator(generator_output)  # Pass generated embeddings to discriminator
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(optimizer="adam", loss="binary_crossentropy")

# Separate embedding layer for real sequences
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)

# Sample Data
real_sequences = np.random.randint(1, vocab_size, size=(batch_size, max_length))  # Real tokenized sequences
fake_sequences = np.random.randint(1, vocab_size, size=(batch_size, max_length))  # Random noise

# Labels
real_labels = np.ones((batch_size, 1))  # Real data label
fake_labels = np.zeros((batch_size, 1))  # Fake data label

# Training Loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Recompile discriminator to reset internal states
    discriminator.trainable = True
    discriminator.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    
    # Embed real sequences using the standalone embedding layer
    real_data_embedded = embedding_layer(real_sequences)  # Embed real sequences
    
    # Train Discriminator
    fake_data = generator.predict(fake_sequences)  # Generate fake data embeddings
    d_loss_real = discriminator.train_on_batch(real_data_embedded, real_labels)
    d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    
    # Train Generator
    discriminator.trainable = False
    noise = np.random.randint(1, vocab_size, size=(batch_size, max_length))
    g_loss = gan.train_on_batch(noise, real_labels)  # Train generator to fool discriminator
    
    print(f"D Loss: {d_loss}, G Loss: {g_loss}")              

Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
D Loss: [0.6973576 0.609375 ], G Loss: 0.6849991083145142
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
D Loss: [0.6962564 0.75     ], G Loss: 0.672376811504364
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
D Loss: [0.7039097 0.75     ], G Loss: 0.6547790169715881
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
D Loss: [0.7332922 0.75     ], G Loss: 0.6290233135223389
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
D Loss: [0.8195772 0.75     ], G Loss: 0.5919936895370483
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
D Loss: [1.0046253 0.75     ], G Loss: 0.5493327975273132
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
D Loss: [1.0859011 0.75     ], G Loss: 0.5483332872390747
Epoch 8/10
[1m1/1[0m 