In [1]:
%cd ..

/home/hudongcheng/Desktop/bo_osda_generator


In [2]:
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.backends.cudnn as cudnn

# import custom modules
from datasets.data_loader import *
from models.clamer import *
from models.loss import InfoNCELoss
from models.trfm import *
from utils.utils import *
from utils.plot_figures import *
from utils.metrics import *
from utils.build_vocab import *

In [3]:
cudnn.benchmark = True
cudnn.enabled = True

train_loss_history = []
train_acc_history = []
test_loss_history = []
test_acc_history = []

now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

PAD = 0
UNK = 1
EOS = 2
SOS = 3
MASK = 4
MAX_LEN = 220

In [4]:
def sample_clamer(temp, model, sample_dataloader, device, vocab):
    '''ArithmeticError
    Generate SMILES strings using the trained GPT model with sampling.
    
    Args:
        temp (float): The temperature parameter for sampling.
        model (CLAMER): The pre-trained GPT model for token generation.
        sample_dataloader (DataLoader): The data loader for the SMILES dataset.
        device (torch.device): The device on which to run the generation.
        vocab: The vocabulary object for encoding and decoding SMILES strings.
    
    Returns:
        List[float]: A list of negative log-likelihoods for the generated SMILES strings.
        List[str]: A list of generated SMILES strings.
    '''
    sample_nll_total = []
    smiles_gen_total = []
    with torch.no_grad():
        for batch_idx, (zeo, syn, tgt) in enumerate(tqdm(sample_dataloader)):
            # Generate the target sequence for the model
            target = [SOS] + [PAD] * 218
            tgt_seq = torch.LongTensor(target).unsqueeze(0).expand(zeo.size(0), len(target)).to(device)
            batch_size = zeo.size(0)
            # Move input tensors to the device
            zeo, syn = zeo.to(device), syn.to(device)
            smiles_gen = [[''] * batch_size][0]
            sample_nll = [0] * batch_size
            finished = np.array([False] * batch_size, dtype=object)
            end_char = '<eos>'
            for i in range(218):
                net_out = model(zeo, syn, tgt_seq)[:, i + 2, :]
                o = F.softmax(net_out, dim=-1).cpu().detach().numpy()
                # sample temp
                if temp != 0:
                    temp = abs(temp)  # No negative values
                    next_char_probs = np.log(o) / temp
                    next_char_probs = np.exp(next_char_probs)
                    next_char_probs = next_char_probs.astype(float)
                    next_char_probs = (next_char_probs.T / (next_char_probs.sum(axis=1))).T
                    sampleidc = torch.tensor(
                        [np.random.multinomial(1, next_char_prob, 1).argmax() for next_char_prob in
                            next_char_probs])
                else:
                    sampleidc = torch.tensor(np.argmax(o, axis=1))

                samplechars = [vocab.itos[idx] for idx in sampleidc.numpy()]

                for idx, samplechar in enumerate(samplechars):
                    if not finished[idx]:
                        if samplechar != end_char:
                            # Append the SMILES with the next character
                            smiles_gen[idx] += samplechar
                            tgt_seq[:, i + 1] = sampleidc.to(device)
                            # Calculate negative log likelihood for the selected character
                            sample_nll[idx] -= np.log(o[idx][sampleidc[idx]])
                        else:
                            finished[idx] = True
                            # print("SMILES has finished at %i" %i)
                # If all SMILES are finished, i.e. the end_char "<eos>" has been generated, stop the generation
            if finished.sum() == len(finished):
                sample_nll_total += sample_nll
                smiles_gen_total += smiles_gen
                    
    return sample_nll_total, smiles_gen_total

In [5]:
# load the data
AFI_smiles = read_strings('./data_AFI/AFI_smiles.csv', idx=False)
AFI_zeo = read_vec('./data_AFI/AFI_zeo.csv', idx=False)
AFI_syn = read_vec('./data_AFI/AFI_syn.csv', idx=False)
CHA_smiles = read_strings('./data_CHA/CHA_smiles.csv', idx=False)
CHA_zeo = read_vec('./data_CHA/CHA_zeo.csv', idx=False)
CHA_syn = read_vec('./data_CHA/CHA_syn.csv', idx=False)
AEI_smiles = read_strings('./data_AEI/AEI_smiles.csv', idx=False)
AEI_zeo = read_vec('./data_AEI/AEI_zeo.csv', idx=False)
AEI_syn = read_vec('./data_AEI/AEI_syn.csv', idx=False)

vocab = WordVocab.load_vocab('./model_hub/vocab.pkl')
print('the vocab size is :', len(vocab))
charlen = len(vocab)
print('the total num of charset is :', charlen)

cudnn.benchmark = True
batch_size = 64

manual_seed = 42
random.seed(manual_seed)
torch.manual_seed(manual_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# create the dataset and dataloader
AFI_dataset = Seq2seqDataset(AFI_zeo, AFI_syn, AFI_smiles, vocab)
CHA_dataset = Seq2seqDataset(CHA_zeo, CHA_syn, CHA_smiles, vocab)
AEI_dataset = Seq2seqDataset(AEI_zeo, AEI_syn, AEI_smiles, vocab)
AFI_dataloader = DataLoader(AFI_dataset, batch_size=batch_size, shuffle=True)
CHA_dataloader = DataLoader(CHA_dataset, batch_size=batch_size, shuffle=False)
AEI_dataloader = DataLoader(AEI_dataset, batch_size=batch_size, shuffle=False)

the vocab size is : 45
the total num of charset is : 45


In [6]:
d_model = 128
head = 4
# load the contrastive learning model and original model from checkpoints
model_origin = GptCovd(d_model=d_model, charlen=charlen, device=device, head=head)
model_origin.load_state_dict(torch.load('./checkpoints/best_Clamer_model.pth'))
model_origin.eval()
model_origin.to(device)

model_cl = GptCovd(d_model=d_model, charlen=charlen, device=device, head=head)
model_cl.load_state_dict(torch.load('./checkpoints/clamer/clamer_contrastive_model_9.pth'))
model_cl.eval()
model_cl.to(device)

total = sum(p.numel() for p in model_origin.parameters())
print('total parameters: %0.2fM' % (total / 1e6))  # print the total parameters

total parameters: 1.34M


In [7]:
# generate the AFI smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_origin, sample_dataloader=AFI_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AFI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))
    
# calculate the metrics
AFI_validity_rate_origin = validity_rate(generated_smile_origin)
AFI_uniqueness_rate_origin = uniqueness_rate(generated_smile_origin)
AFI_novelty_rate_origin = novelty_rate(generated_smile_origin, target_smile_origin)
AFI_reconstructability_rate_origin = reconstructability_rate(generated_smile_origin, target_smile_origin)
AFI_IntDiv_origin = IntDiv(generated_smile_origin)
AFI_FCD_score_origin = FCD_score(target_smile_origin, generated_smile_origin)
# print the metrics
print('AFI_validity_rate_origin: ', AFI_validity_rate_origin)
print('AFI_uniqueness_rate_origin: ', AFI_uniqueness_rate_origin)
print('AFI_novelty_rate_origin: ', AFI_novelty_rate_origin)
print('AFI_reconstructability_rate_origin: ', AFI_reconstructability_rate_origin)
print('AFI_IntDiv_origin: ', AFI_IntDiv_origin)
print('AFI_FCD_score_origin: ', AFI_FCD_score_origin)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 33.18it/s]
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C12C3CCCCN3CC(C3[N+](C)(CCCCCCCCC[N+](C)(C)CCCCCCCCCCCCCC)C)C1'
[19:21:10] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 8
[19:21:10] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C1CC2C3CC(CN2CCCC[N+]2(C)C3)CN1C3'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(C[N+]2(C)CCC1)C1N(C2)CCCC1'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(CN2CCC1)C1[N+](C)(C)C3'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C12C3CCCCN3CC(C3[N+](C)(CC)CC1)C2'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C[N+]12C3CC4N(CCCC4)CC(C1)CC2'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C1CCC2CCCC2C[N+]2(C)CCCC1'
[19:21:10] SMILES Parse Error: unclosed ring for input: 'C12CC(CN3CCCCC3C3)C1CCCC[N+]1(C)C2'
[19:21:10] SMILES 

the total num of target smiles is : 1000


[19:21:11] SMILES Parse Error: unclosed ring for input: 'C12C3CCCCN3CC(C3[N+](C)(CCCCCCCCC[N+](C)(C)CCCCCCCCCCCCCC)C)C1'
[19:21:11] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 8
[19:21:11] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C1CC2C3CC(CN2CCCC[N+]2(C)C3)CN1C3'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(C[N+]2(C)CCC1)C1N(C2)CCCC1'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(CN2CCC1)C1[N+](C)(C)C3'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C12C3CCCCN3CC(C3[N+](C)(CC)CC1)C2'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C[N+]12C3CC4N(CCCC4)CC(C1)CC2'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C1CCC2CCCC2C[N+]2(C)CCCC1'
[19:21:11] SMILES Parse Error: unclosed ring for input: 'C12CC(CN3CCCCC3C3)C1CCCC[N+]1(C)C2'
[19:21:11] SMILES Parse Error: extra close parentheses while parsing: c1(C)c(C)n(C)c(C)[n+]1CCCC)cc1
[19:21:11] SM

AFI_validity_rate_origin:  0.945
AFI_uniqueness_rate_origin:  0.529
AFI_novelty_rate_origin:  0.8601134215500945
AFI_reconstructability_rate_origin:  0.13988657844990549
AFI_IntDiv_origin:  0.8284920013276773
AFI_FCD_score_origin:  5.683464487336224


In [8]:
# generate the AFI smiles with the contrastive model
sample_nll_total, generated_smile_cl = sample_clamer(temp=0.7, model=model_cl, sample_dataloader=AFI_dataloader, device=device, vocab=vocab)
target_smile_cl = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AFI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_cl.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_cl))

# calculate the metrics
AFI_validity_rate_cl = validity_rate(generated_smile_cl)
AFI_uniqueness_rate_cl = uniqueness_rate(generated_smile_cl)
AFI_novelty_rate_cl = novelty_rate(generated_smile_cl, target_smile_cl)
AFI_reconstructability_rate_cl = reconstructability_rate(generated_smile_cl, target_smile_cl)
AFI_IntDiv_cl = IntDiv(generated_smile_cl)
AFI_FCD_score_cl = FCD_score(target_smile_cl, generated_smile_cl)
# print the metrics
print('AFI_validity_rate_cl: ', AFI_validity_rate_cl)
print('AFI_uniqueness_rate_cl: ', AFI_uniqueness_rate_cl)
print('AFI_novelty_rate_cl: ', AFI_novelty_rate_cl)
print('AFI_reconstructability_rate_cl: ', AFI_reconstructability_rate_cl)
print('AFI_IntDiv_cl: ', AFI_IntDiv_cl)
print('AFI_FCD_score_cl: ', AFI_FCD_score_cl)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 33.04it/s]
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:21:39] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:21:39] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:21:39] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:21:39] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C

the total num of target smiles is : 1000


[19:21:39] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:21:39] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:21:39] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:21:39] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:21:39] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:21:39] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C5)C3)C2)cc1'
[19:21:39] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(C[N+]1(C)CC)C1CCC

AFI_validity_rate_cl:  0.99
AFI_uniqueness_rate_cl:  0.57
AFI_novelty_rate_cl:  0.8842105263157894
AFI_reconstructability_rate_cl:  0.11578947368421053
AFI_IntDiv_cl:  0.8328627097385997
AFI_FCD_score_cl:  4.723098020050678


In [9]:
# write the metrics to the folder data_AFI
with open('./data_AFI/AFI_generated_clamer_metrics.txt', 'w') as f:
    # write the mertics
    f.write(f'AFI_validity_rate_origin: {AFI_validity_rate_origin}, AFI_validity_rate_cl: {AFI_validity_rate_cl}\n')
    f.write(f'AFI_uniqueness_rate_origin: {AFI_uniqueness_rate_origin}, AFI_uniqueness_rate_cl: {AFI_uniqueness_rate_cl}\n')
    f.write(f'AFI_novelty_rate_origin: {AFI_novelty_rate_origin}, AFI_novelty_rate_cl: {AFI_novelty_rate_cl}\n')
    f.write(f'AFI_reconstructability_rate_origin: {AFI_reconstructability_rate_origin}, AFI_reconstructability_rate_cl: {AFI_reconstructability_rate_cl}\n')
    f.write(f'AFI_IntDiv_origin: {AFI_IntDiv_origin}, AFI_IntDiv_cl: {AFI_IntDiv_cl}\n')
    f.write(f'AFI_FCD_score_origin: {AFI_FCD_score_origin}, AFI_FCD_score_cl: {AFI_FCD_score_cl}\n')

# write the generated smiles (origin and cl) and target smiles to the folder data_AFI
with open('./data_AFI/AFI_generated_clamer_smiles_origin.txt', 'w') as f:
    for smiles in range(len(generated_smile_origin)):
        f.write(f'origin: {generated_smile_origin[smiles]}, cl: {generated_smile_cl[smiles]}, target: {target_smile_origin[smiles]}\n')

In [10]:
# generate the CHA smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_origin, sample_dataloader=CHA_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(CHA_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))

# calculate the metrics
CHA_validity_rate_origin = validity_rate(generated_smile_origin)
CHA_uniqueness_rate_origin = uniqueness_rate(generated_smile_origin)
CHA_novelty_rate_origin = novelty_rate(generated_smile_origin, target_smile_origin)
CHA_reconstructability_rate_origin = reconstructability_rate(generated_smile_origin, target_smile_origin)
CHA_IntDiv_origin = IntDiv(generated_smile_origin)
CHA_FCD_score_origin = FCD_score(target_smile_origin, generated_smile_origin)
# print the metrics
print('CHA_validity_rate_origin: ', CHA_validity_rate_origin)
print('CHA_uniqueness_rate_origin: ', CHA_uniqueness_rate_origin)
print('CHA_novelty_rate_origin: ', CHA_novelty_rate_origin)
print('CHA_reconstructability_rate_origin: ', CHA_reconstructability_rate_origin)
print('CHA_IntDiv_origin: ', CHA_IntDiv_origin)
print('CHA_FCD_score_origin: ', CHA_FCD_score_origin)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 36.86it/s]
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1([n+]1cn(C)cc1)C3'
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C1(C)(C)CC2(C)C[N+](C)(C)C(C3)C1C2'
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C1NC(C)(C)CC(C)(C)NC'
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC(n4cc[n+](C56CC7CC(CC(C6)C6)C6)c4)(CC1C3)C2'
[19:22:07] SMILES Parse Error: extra open parentheses for input: 'C1C2CC3CC(CC1([n+]1ccn(C45CC6CC(CC(C5)C6)C5)c1)(C3)C2'
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C12([n+]3cn(C45CC7CC(CC(C5)C6)cc4)cc3)CC3CC(CC(C1)C3)C2'
[19:22:07] SMILES Parse Error: unclosed ring for input: 'C(CCCCC[N+]1(C)C)CC[N+]1(C)CCCC1'
[19:22:07] SMILES Parse Error: syntax error while parsing: C((C)NC(O)C)O
[19:22:07] SMILES Parse Error: Failed parsing SMILES 'C((C)NC(O)C)O' for input: 'C((C)NC(O)C)O'
[19:22:07] SMILES Parse Error: unclosed 

the total num of target smiles is : 1000


[19:22:08] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1([n+]1cn(C)cc1)C3'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C1(C)(C)CC2(C)C[N+](C)(C)C(C3)C1C2'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C1NC(C)(C)CC(C)(C)NC'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC(n4cc[n+](C56CC7CC(CC(C6)C6)C6)c4)(CC1C3)C2'
[19:22:08] SMILES Parse Error: extra open parentheses for input: 'C1C2CC3CC(CC1([n+]1ccn(C45CC6CC(CC(C5)C6)C5)c1)(C3)C2'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C12([n+]3cn(C45CC7CC(CC(C5)C6)cc4)cc3)CC3CC(CC(C1)C3)C2'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C(CCCCC[N+]1(C)C)CC[N+]1(C)CCCC1'
[19:22:08] SMILES Parse Error: syntax error while parsing: C((C)NC(O)C)O
[19:22:08] SMILES Parse Error: Failed parsing SMILES 'C((C)NC(O)C)O' for input: 'C((C)NC(O)C)O'
[19:22:08] SMILES Parse Error: unclosed ring for input: 'C1[N+](C)(C)CC2CCC1C1CCC2'
[19:22:08] SMILES Parse Error: extra close parenthes

CHA_validity_rate_origin:  0.977
CHA_uniqueness_rate_origin:  0.597
CHA_novelty_rate_origin:  0.8458961474036851
CHA_reconstructability_rate_origin:  0.1541038525963149
CHA_IntDiv_origin:  0.8826527908061444
CHA_FCD_score_origin:  3.636653191136247


In [11]:
# generate the CHA smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_cl, sample_dataloader=CHA_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(CHA_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))

# calculate the metrics
CHA_validity_rate_cl = validity_rate(generated_smile_cl)
CHA_uniqueness_rate_cl = uniqueness_rate(generated_smile_cl)
CHA_novelty_rate_cl = novelty_rate(generated_smile_cl, target_smile_cl)
CHA_reconstructability_rate_cl = reconstructability_rate(generated_smile_cl, target_smile_cl)
CHA_IntDiv_cl = IntDiv(generated_smile_cl)
CHA_FCD_score_cl = FCD_score(target_smile_cl, generated_smile_cl)
# print the metrics
print('CHA_validity_rate_cl: ', CHA_validity_rate_cl)
print('CHA_uniqueness_rate_cl: ', CHA_uniqueness_rate_cl)
print('CHA_novelty_rate_cl: ', CHA_novelty_rate_cl)
print('CHA_reconstructability_rate_cl: ', CHA_reconstructability_rate_cl)
print('CHA_IntDiv_cl: ', CHA_IntDiv_cl)
print('CHA_FCD_score_cl: ', CHA_FCD_score_cl)

100%|██████████| 16/16 [00:25<00:00,  1.62s/it]
100%|██████████| 16/16 [00:00<00:00, 36.92it/s]
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:22:36] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:22:36] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:22:36] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:22:36] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C

the total num of target smiles is : 1000


[19:22:36] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:22:36] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:22:36] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:22:36] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:22:36] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:22:36] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C5)C3)C2)cc1'
[19:22:36] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(C[N+]1(C)CC)C1CCC

CHA_validity_rate_cl:  0.99
CHA_uniqueness_rate_cl:  0.57
CHA_novelty_rate_cl:  0.8842105263157894
CHA_reconstructability_rate_cl:  0.11578947368421053
CHA_IntDiv_cl:  0.8328627097385997
CHA_FCD_score_cl:  4.723098020050678


In [12]:
# write the metrics to the folder data_CHA
with open('./data_CHA/CHA_generated_clamer_metrics.txt', 'w') as f:
    # write the mertics
    f.write(f'CHA_validity_rate_origin: {CHA_validity_rate_origin}, CHA_validity_rate_cl: {CHA_validity_rate_cl}\n')
    f.write(f'CHA_uniqueness_rate_origin: {CHA_uniqueness_rate_origin}, CHA_uniqueness_rate_cl: {CHA_uniqueness_rate_cl}\n')
    f.write(f'CHA_novelty_rate_origin: {CHA_novelty_rate_origin}, CHA_novelty_rate_cl: {CHA_novelty_rate_cl}\n')
    f.write(f'CHA_reconstructability_rate_origin: {CHA_reconstructability_rate_origin}, CHA_reconstructability_rate_cl: {CHA_reconstructability_rate_cl}\n')
    f.write(f'CHA_IntDiv_origin: {CHA_IntDiv_origin}, CHA_IntDiv_cl: {CHA_IntDiv_cl}\n')
    f.write(f'CHA_FCD_score_origin: {CHA_FCD_score_origin}, CHA_FCD_score_cl: {CHA_FCD_score_cl}\n')
# write the generated smiles (origin and cl) and target smiles to the folder data_CHA
with open('./data_CHA/CHA_generated_clamer_smiles_origin.txt', 'w') as f:
    for smiles in range(len(generated_smile_origin)):
        f.write(f'origin: {generated_smile_origin[smiles]}, cl: {generated_smile_cl[smiles]}, target: {target_smile_origin[smiles]}\n')

In [13]:
# generate the AEI smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_origin, sample_dataloader=AEI_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AEI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))
    
# calculate the metrics
AEI_validity_rate_origin = validity_rate(generated_smile_origin)
AEI_uniqueness_rate_origin = uniqueness_rate(generated_smile_origin)
AEI_novelty_rate_origin = novelty_rate(generated_smile_origin, target_smile_origin)
AEI_reconstructability_rate_origin = reconstructability_rate(generated_smile_origin, target_smile_origin)
AEI_IntDiv_origin = IntDiv(generated_smile_origin)
AEI_FCD_score_origin = FCD_score(target_smile_origin, generated_smile_origin)
# print the metrics
print('AEI_validity_rate_origin: ', AEI_validity_rate_origin)
print('AEI_uniqueness_rate_origin: ', AEI_uniqueness_rate_origin)
print('AEI_novelty_rate_origin: ', AEI_novelty_rate_origin)
print('AEI_reconstructability_rate_origin: ', AEI_reconstructability_rate_origin)
print('AEI_IntDiv_origin: ', AEI_IntDiv_origin)
print('AEI_FCD_score_origin: ', AEI_FCD_score_origin)

100%|██████████| 16/16 [00:25<00:00,  1.62s/it]
100%|██████████| 16/16 [00:00<00:00, 30.65it/s]
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1CCC2CCC([N+]3(C)CCCC2)C1'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C3C4C[N+](C)(C)CC5C(C=C1)C3'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2C([N+](C)(C)C)CC1C1C2(C)C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C(C3C4C[N+](CC)(CC)CC4C3C[N+](CC)(CC)CC13)C=C2'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2CCC(CC2)[N+]2(CC)CCC1'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C(C)(C)C(CCC[N+]3(CC)CC1C2)C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1[N+]2(C)CC(C)CC(C)C1C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1CC2C([N+](C)(C)C)CCCCCC2'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C3C4C(C(C=C1)C1C[N+](CC)(CC)C4)C[N+](CC)(CC)C3'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2CCC1CC[N+]1(C)CCCC1'
[19

the total num of target smiles is : 1000


[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1CCC2CCC([N+]3(C)CCCC2)C1'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C3C4C[N+](C)(C)CC5C(C=C1)C3'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2C([N+](C)(C)C)CC1C1C2(C)C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C(C3C4C[N+](CC)(CC)CC4C3C[N+](CC)(CC)CC13)C=C2'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2CCC(CC2)[N+]2(CC)CCC1'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C(C)(C)C(CCC[N+]3(CC)CC1C2)C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1[N+]2(C)CC(C)CC(C)C1C'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1CC2C([N+](C)(C)C)CCCCCC2'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C12C3C4C(C(C=C1)C1C[N+](CC)(CC)C4)C[N+](CC)(CC)C3'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1C2CCC1CC[N+]1(C)CCCC1'
[19:23:05] SMILES Parse Error: unclosed ring for input: 'C1(C)(C)CCC[N+]2(CC)CCC1'
[19:23:05] SMILE

AEI_validity_rate_origin:  0.973
AEI_uniqueness_rate_origin:  0.563
AEI_novelty_rate_origin:  0.9342806394316163
AEI_reconstructability_rate_origin:  0.06571936056838366
AEI_IntDiv_origin:  0.837822613947796
AEI_FCD_score_origin:  5.062680918198019


In [14]:
# generate the AEI smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_cl, sample_dataloader=AEI_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AEI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))

# calculate the metrics
AEI_validity_rate_cl = validity_rate(generated_smile_cl)
AEI_uniqueness_rate_cl = uniqueness_rate(generated_smile_cl)
AEI_novelty_rate_cl = novelty_rate(generated_smile_cl, target_smile_cl)
AEI_reconstructability_rate_cl = reconstructability_rate(generated_smile_cl, target_smile_cl)
AEI_IntDiv_cl = IntDiv(generated_smile_cl)
AEI_FCD_score_cl = FCD_score(target_smile_cl, generated_smile_cl)
# print the metrics
print('AEI_validity_rate_cl: ', AEI_validity_rate_cl)
print('AEI_uniqueness_rate_cl: ', AEI_uniqueness_rate_cl)
print('AEI_novelty_rate_cl: ', AEI_novelty_rate_cl)
print('AEI_reconstructability_rate_cl: ', AEI_reconstructability_rate_cl)
print('AEI_IntDiv_cl: ', AEI_IntDiv_cl)
print('AEI_FCD_score_cl: ', AEI_FCD_score_cl)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 30.50it/s]
[19:23:33] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:23:33] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:23:33] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:23:33] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:23:33] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:23:33] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:23:33] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:23:33] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:23:33] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:23:33] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C

the total num of target smiles is : 1000


[19:23:34] SMILES Parse Error: unclosed ring for input: 'C1CCCC2C3CC(C[N+]13C)C1N(C3)CCCC1'
[19:23:34] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:23:34] Can't kekulize mol.  Unkekulized atoms: 0 2 3 10 11 12 13
[19:23:34] SMILES Parse Error: unclosed ring for input: 'CCC(C)N(CCCCCN(CC(C)C)CC1)C'
[19:23:34] SMILES Parse Error: syntax error while parsing: [N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1
[19:23:34] SMILES Parse Error: Failed parsing SMILES '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1' for input: '[N+]1(C)(CC2(c3ccc(cc3)C<unk>)CCCC2)CCCCC1'
[19:23:34] Can't kekulize mol.  Unkekulized atoms: 0 7 8 10 11 12 13
[19:23:34] SMILES Parse Error: unclosed ring for input: 'C12CN3C(CCCC3)C(C[N+]3(C)CCCCC3)C2'
[19:23:34] Explicit valence for atom # 13 N, 4, is greater than permitted
[19:23:34] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C3)C2n1c[n+](C24CC5CC(CC(C5)C3)C2)cc1'
[19:23:34] SMILES Parse Error: unclosed ring for input: 'C1C2C3CC(C[N+]1(C)CC)C1CCC

AEI_validity_rate_cl:  0.99
AEI_uniqueness_rate_cl:  0.57
AEI_novelty_rate_cl:  0.8842105263157894
AEI_reconstructability_rate_cl:  0.11578947368421053
AEI_IntDiv_cl:  0.8328627097385997
AEI_FCD_score_cl:  4.723098020050678


In [15]:
# write the metrics to the folder data_AEI
with open('./data_AEI/AEI_generated_clamer_metrics.txt', 'w') as f:
    # write the mertics
    f.write(f'AEI_validity_rate_origin: {AEI_validity_rate_origin}, AEI_validity_rate_cl: {AEI_validity_rate_cl}\n')
    f.write(f'AEI_uniqueness_rate_origin: {AEI_uniqueness_rate_origin}, AEI_uniqueness_rate_cl: {AEI_uniqueness_rate_cl}\n')
    f.write(f'AEI_novelty_rate_origin: {AEI_novelty_rate_origin}, AEI_novelty_rate_cl: {AEI_novelty_rate_cl}\n')
    f.write(f'AEI_reconstructability_rate_origin: {AEI_reconstructability_rate_origin}, AEI_reconstructability_rate_cl: {AEI_reconstructability_rate_cl}\n')
    f.write(f'AEI_IntDiv_origin: {AEI_IntDiv_origin}, AEI_IntDiv_cl: {AEI_IntDiv_cl}\n')
    f.write(f'AEI_FCD_score_origin: {AEI_FCD_score_origin}, AEI_FCD_score_cl: {AEI_FCD_score_cl}\n')
# write the generated smiles (origin and cl) and target smiles to the folder data_AEI
with open('./data_AEI/AEI_generated_clamer_smiles_origin.txt', 'w') as f:
    for smiles in range(len(generated_smile_origin)):
        f.write(f'origin: {generated_smile_origin[smiles]}, cl: {generated_smile_cl[smiles]}, target: {target_smile_origin[smiles]}\n')