In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Subset, TensorDataset

# ToxGEN vol.1

In [None]:
# Read the LD50 train and test splits from CSV and print the first rows of each
# as a quick sanity check.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('LD50_train.csv', 'LD50_test.csv'))
print(train_df.head(), '\n\n', test_df.head())

   filename                     smiles  label
0  100-01-6    Nc1ccc([N+](=O)[O-])cc1  2.265
1  100-02-7    O=[N+]([O-])c1ccc(O)cc1  2.838
2  100-14-1  O=[N+]([O-])c1ccc(CCl)cc1  1.977
3  100-17-4   COc1ccc([N+](=O)[O-])cc1  1.823
4  100-20-9   O=C(Cl)c1ccc(C(=O)Cl)cc1  1.910 

    filename                    smiles  label
0  100-00-5  O=[N+]([O-])c1ccc(Cl)cc1  2.574
1  100-06-1        COc1ccc(C(C)=O)cc1  1.941
2  100-36-7                CCN(CC)CCN  1.613
3  100-45-8              N#CC1CC=CCC1  2.367
4  100-52-7               O=Cc1ccccc1  1.912


In [None]:
# Character-level tokenisation of the SMILES strings. The vocabulary is fitted on
# the train and test SMILES together so both splits share one character index.
all_smiles = pd.concat([train_df['smiles'], test_df['smiles']])
tokenizer = Tokenizer(char_level=True, filters='', lower=False)
tokenizer.fit_on_texts(all_smiles)

train_seq, test_seq = (
    tokenizer.texts_to_sequences(frame['smiles']) for frame in (train_df, test_df)
)

# Post-pad every sequence to the length of the longest one found in either split.
max_length = max(max(map(len, train_seq)), max(map(len, test_seq)))
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post') for seqs in (train_seq, test_seq)
)

# One extra slot on top of the fitted characters (presumably for the padding id 0
# — confirm against the Keras Tokenizer indexing convention).
vocab_size = len(tokenizer.word_index) + 1
print((train_padded.shape, test_padded.shape, vocab_size, max_length))

((5931, 284), (1482, 284), 37, 284)


In [None]:
# Wrap the padded token matrices (int64) and the LD50 labels (float32, reshaped to
# one column) as tensors.
X_train_tensor = torch.tensor(train_padded, dtype=torch.long)
y_train_tensor = torch.tensor(train_df['label'].values, dtype=torch.float).view(-1, 1)
X_test_tensor = torch.tensor(test_padded, dtype=torch.long)
y_test_tensor = torch.tensor(test_df['label'].values, dtype=torch.float).view(-1, 1)

full_train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Hold out 20% of the training rows for validation. The split is done on row
# indices (same test_size and seed as before, so the partition sizes are unchanged)
# and wrapped in Subset, instead of handing the Dataset itself to scikit-learn,
# which would materialise every sample into a Python list of tuples.
train_indices, val_indices = train_test_split(
    list(range(len(full_train_dataset))), test_size=0.2, random_state=42
)
train_dataset = Subset(full_train_dataset, train_indices)
val_dataset = Subset(full_train_dataset, val_indices)

batch_size = 64
# Only the training loader is shuffled; validation/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Number of batches per loader.
len(train_loader), len(val_loader), len(test_loader)

(75, 19, 24)

In [None]:
class VAE(nn.Module):
    """Sequence VAE over token ids: LSTM encoder -> Gaussian latent -> LSTM decoder.

    The encoder embeds the token ids, runs a stacked LSTM and maps the final hidden
    state of the top layer to the mean and log-variance of the latent distribution.
    The decoder feeds the same latent vector at every time step into a second
    stacked LSTM and projects each step to vocabulary logits (no teacher forcing).

    Args:
        vocab_size: Number of token ids (size of the embedding table and of the
            output logits).
        embedding_dim: Embedding size; also the hidden size of the decoder LSTM.
        hidden_dim: Hidden size of the encoder LSTM.
        latent_dim: Size of the latent vector z.
        n_layers: Number of stacked layers in both LSTMs.
        dropout: Inter-layer dropout passed to both LSTMs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim, n_layers, dropout):
        super().__init__()
        self.latent_dim = latent_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.hidden_to_mean = nn.Linear(hidden_dim, latent_dim)
        self.hidden_to_logvar = nn.Linear(hidden_dim, latent_dim)
        self.decoder_lstm = nn.LSTM(latent_dim, embedding_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.outputs_to_vocab = nn.Linear(embedding_dim, vocab_size)

    def encode(self, input):
        """Map a batch of token-id sequences to (mean, logvar) of the latent Gaussian."""
        embedded = self.embedding(input)
        _, (hidden, _) = self.encoder_lstm(embedded)
        # hidden[-1] is the final hidden state of the top LSTM layer.
        top_hidden = hidden[-1]
        mean = self.hidden_to_mean(top_hidden)
        logvar = self.hidden_to_logvar(top_hidden)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        """Draw z = mean + eps * std with eps ~ N(0, I) (reparameterisation trick)."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mean + eps*std

    def decode(self, z, seq_len=None):
        """Decode latent vectors into per-position vocabulary logits.

        Args:
            z: Latent batch whose last dimension is ``latent_dim``.
            seq_len: Number of positions to decode. When omitted, falls back to the
                notebook-level ``max_length`` global, which is what this method
                always used before the parameter existed.

        Returns:
            Logits with one vocabulary distribution per decoded position.
        """
        if seq_len is None:
            # Backward-compatible default: padded sequence length from preprocessing.
            seq_len = max_length
        # Feed the same latent vector as input at every decoding step.
        z = z.unsqueeze(1).repeat(1, seq_len, 1)
        lstm_out, _ = self.decoder_lstm(z)
        out = self.outputs_to_vocab(lstm_out)
        return out

    def forward(self, input):
        """Encode, sample z, and reconstruct logits with the same length as ``input``.

        Returns:
            Tuple ``(logits, mean, logvar)``.
        """
        mean, logvar = self.encode(input)
        z = self.reparameterize(mean, logvar)
        # Decode exactly as many positions as the input has, so the reconstruction
        # always lines up with the targets regardless of any global padding length.
        return self.decode(z, seq_len=input.size(1)), mean, logvar


In [None]:
def vae_loss(recon_x, x, mean, logvar, kl_weight=1.0):
    """Per-sample negative ELBO for the sequence VAE.

    Both terms are summed within a sample and averaged over the batch, so they live
    on the same scale and the result does not depend on the batch size. (Mixing a
    per-token *mean* reconstruction loss with a whole-batch *sum* KL term makes the
    KL dominate by orders of magnitude and pushes the model toward ignoring z.)

    Args:
        recon_x: Logits indexed as (batch, position, vocab).
        x: Target token ids indexed as (batch, position).
        mean: Latent means, (batch, latent_dim).
        logvar: Latent log-variances, (batch, latent_dim).
        kl_weight: Multiplier on the KL term (1.0 = plain ELBO; smaller values or
            an annealing schedule can be passed by the caller).

    Returns:
        Scalar tensor: reconstruction cross-entropy + kl_weight * KL divergence.
    """
    batch_size = x.size(0)
    # cross_entropy expects the class dimension second: (batch, vocab, position).
    recon_loss = F.cross_entropy(recon_x.transpose(1, 2), x, reduction='sum') / batch_size
    # Closed-form KL( N(mean, exp(logvar)) || N(0, I) ).
    kl_div = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp()) / batch_size
    return recon_loss + kl_weight * kl_div

def train_vae(model, train_loader, val_loader, optimizer, n_epochs=30):
    """Train the VAE and report mean train/validation loss after every epoch.

    Args:
        model: Module whose forward returns ``(logits, mean, logvar)``.
        train_loader: Iterable of ``(tokens, label)`` batches; labels are ignored.
        val_loader: Iterable of ``(tokens, label)`` batches evaluated after each
            epoch without gradient tracking. May be ``None`` or empty to skip
            validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is printed.
    """
    # Follow the model's own device rather than a notebook-level `device` global.
    device = next(model.parameters()).device

    for epoch in range(n_epochs):
        # Re-enter training mode every epoch because validation switches to eval().
        model.train()
        total_loss = 0
        for data, _ in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            recon_batch, mean, logvar = model(data)
            loss = vae_loss(recon_batch, data, mean, logvar)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch}, Loss: {total_loss / max(len(train_loader), 1)}')

        if val_loader is not None and len(val_loader) > 0:
            model.eval()
            val_total = 0
            with torch.no_grad():
                for data, _ in val_loader:
                    data = data.to(device)
                    recon_batch, mean, logvar = model(data)
                    val_total += vae_loss(recon_batch, data, mean, logvar).item()
            print(f'Epoch {epoch}, Val Loss: {val_total / len(val_loader)}')


In [None]:
# Model hyperparameters.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
hidden_dim = 512
latent_dim = 256
n_layers = 5
dropout = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to its device *before* constructing the optimizer, as the
# torch.optim documentation recommends, so the optimizer is built over the
# parameters that will actually be trained.
model = VAE(vocab_size, embedding_dim, hidden_dim, latent_dim, n_layers, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Show the module summary as the cell output.
model

VAE(
  (embedding): Embedding(37, 256)
  (encoder_lstm): LSTM(256, 512, num_layers=5, batch_first=True, dropout=0.5)
  (hidden_to_mean): Linear(in_features=512, out_features=256, bias=True)
  (hidden_to_logvar): Linear(in_features=512, out_features=256, bias=True)
  (decoder_lstm): LSTM(256, 256, num_layers=5, batch_first=True, dropout=0.5)
  (outputs_to_vocab): Linear(in_features=256, out_features=37, bias=True)
)

In [None]:
# Run 25 training epochs; the per-epoch loss is printed by train_vae.
train_vae(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    n_epochs=25,
)

Epoch 0, Loss: 1.5374555150667826
Epoch 1, Loss: 0.5360402091344197
Epoch 2, Loss: 0.3818338179588318
Epoch 3, Loss: 0.3580182190736135
Epoch 4, Loss: 0.35387581706047055
Epoch 5, Loss: 0.3470436382293701
Epoch 6, Loss: 0.3452537739276886
Epoch 7, Loss: 0.34154662807782493
Epoch 8, Loss: 0.33991122285525005
Epoch 9, Loss: 0.34123504161834717
Epoch 10, Loss: 0.3378802450497945
Epoch 11, Loss: 0.33528814097245535
Epoch 12, Loss: 0.3351496394475301
Epoch 13, Loss: 0.3339887628952662
Epoch 14, Loss: 0.332522040605545
Epoch 15, Loss: 0.33339529871940615
Epoch 16, Loss: 0.3329658810297648
Epoch 17, Loss: 0.3308826744556427
Epoch 18, Loss: 0.3302076349655787
Epoch 19, Loss: 0.33132585207621257
Epoch 20, Loss: 0.3317698959509532
Epoch 21, Loss: 0.3297600382566452
Epoch 22, Loss: 0.32985772252082823
Epoch 23, Loss: 0.331263796488444
Epoch 24, Loss: 0.33237616896629335


In [None]:
def sample_with_temperature(logits, temperature):
    """Sample one token id per position from temperature-scaled logits.

    Args:
        logits: 3-D tensor indexed as (batch, position, vocab). Does not need to be
            contiguous (e.g. a slice or transpose of another tensor is fine).
        temperature: Positive scaling factor; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        Long tensor of shape (batch, position) with the sampled token ids.

    Raises:
        ValueError: If ``temperature`` is not strictly positive (dividing by it
            would otherwise produce inf/NaN probabilities).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    batch_size, seq_length, vocab_size = logits.size()
    # reshape (not view) so non-contiguous inputs are accepted; multinomial needs a
    # 2-D (rows, categories) matrix, so batch and position are flattened together.
    scaled_logits = logits.reshape(batch_size * seq_length, vocab_size) / temperature
    probabilities = torch.nn.functional.softmax(scaled_logits, dim=-1)
    sampled_indices = torch.multinomial(probabilities, 1)
    return sampled_indices.view(batch_size, seq_length)

In [None]:
def validate_and_diversify_smiles(smiles_list):
    """Return the distinct, RDKit-parseable SMILES from ``smiles_list``.

    Args:
        smiles_list: Iterable of candidate SMILES strings.

    Returns:
        List of unique valid SMILES in first-seen order (deterministic, unlike a
        round-trip through ``set``).
    """
    unique_smiles = {}  # dict used as an insertion-ordered set
    for smiles in smiles_list:
        # Skip empty strings and anything already accepted (avoids re-parsing).
        if not smiles or smiles in unique_smiles:
            continue
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None on a parse failure; additionally require at
        # least one atom so that a degenerate empty molecule is never reported.
        if mol is not None and mol.GetNumAtoms() > 0:
            unique_smiles[smiles] = None
    return list(unique_smiles)

def generate_smiles(model, tokenizer, n_samples=3, max_length=30, temperature=0.8, max_attempts=None):
    """Sample latent vectors, decode them and keep the unique valid SMILES.

    Args:
        model: Trained VAE exposing ``latent_dim`` and ``decode(z)``.
        tokenizer: Fitted tokenizer whose ``index_word`` maps token ids to
            characters; ids missing from it are rendered as ``'?'``.
        n_samples: Number of distinct valid SMILES to return.
        max_length: Keep at most this many decoded positions per candidate
            (``None`` keeps everything the decoder emits).
        temperature: Sampling temperature. Exactly 1 selects the arg-max token at
            every position instead of sampling.
        max_attempts: Optional cap on the number of decode rounds. ``None`` (the
            default) keeps retrying until ``n_samples`` molecules are found, which
            may never terminate for a model that cannot produce valid SMILES.

    Returns:
        List of up to ``n_samples`` unique valid SMILES, in the order found.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    generated = {}  # dict used as an insertion-ordered set
    attempts = 0

    with torch.no_grad():
        while len(generated) < n_samples and (max_attempts is None or attempts < max_attempts):
            attempts += 1
            # Draw more candidates than requested because most decodes are invalid.
            z = torch.randn(2 * n_samples, model.latent_dim).to(device)
            logits = model.decode(z)
            if max_length is not None:
                # Honour the length limit; contiguous() keeps the slice usable by
                # code that flattens it with .view().
                logits = logits[:, :max_length].contiguous()

            if temperature != 1:
                sampled_indices = sample_with_temperature(logits, temperature)
            else:
                _, sampled_indices = torch.max(logits, dim=2)

            # One bulk transfer to Python ints instead of an .item() call per token.
            # Token id 0 is padding and is dropped.
            candidates = [
                "".join(tokenizer.index_word.get(token_id, '?') for token_id in sequence if token_id != 0)
                for sequence in sampled_indices.tolist()
            ]
            for smiles in validate_and_diversify_smiles(candidates):
                generated.setdefault(smiles, None)

    return list(generated)[:n_samples]

In [None]:
# Reuse the tokenizer fitted during preprocessing: generation must use exactly the
# character index the model was trained with, so building and re-fitting a second
# identical tokenizer here is redundant.
x = generate_smiles(model, tokenizer)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[23:51:05] SMILES Parse Error: extra close parentheses while parsing: CCC1F(c)CN=cCcC)CCcc1OOCC-1)
[23:51:05] SMILES Parse Error: Failed parsing SMILES 'CCC1F(c)CN=cCcC)CCcc1OOCC-1)' for input: 'CCC1F(c)CN=cCcC)CCcc1OOCC-1)'
[23:51:05] SMILES Parse Error: syntax error while parsing: CNC[C=cCN))cC))=)n1c(C=IcN
[23:51:05] SMILES Parse Error: Failed parsing SMILES 'CNC[C=cCN))cC))=)n1c(C=IcN' for input: 'CNC[C=cCN))cC))=)n1c(C=IcN'
[23:51:05] SMILES Parse Error: syntax error while parsing: CC=CncC+ccCc(ccCC2)c))
[23:51:05] SMILES Parse Error: Failed parsing SMILES 'CC=CncC+ccCc(ccCC2)c))' for input: 'CC=CncC+ccCc(ccCC2)c))'
[23:51:05] SMILES Parse Error: syntax error while parsing: CCOCCC(c=/cNCc)CC((Cc1CCCcC2
[23:51:05] SMILES Parse Error: Failed parsing SMILES 'CCOCCC(c=/cNCc)CC((Cc1CCCcC2' for input: 'CCOCCC(c=/cNCc)CC((Cc1CCCcC2'
[23:51:05] SMILES Parse Error: syntax error while parsing: NCCCNCcCcCcCC=)c

In [None]:
# Display the list of SMILES strings returned by generate_smiles.
x

['CCC(N)C1/CCCCc=CcONcCO1']