In [1]:
%cd ..

/home/hudongcheng/Desktop/bo_osda_generator


In [2]:
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.backends.cudnn as cudnn

# import custom modules
from datasets.data_loader import *
from models.clamer import *
from models.loss import InfoNCELoss
from models.trfm import *
from utils.utils import *
from utils.plot_figures import *
from utils.metrics import *
from utils.build_vocab import *

In [3]:
cudnn.benchmark = True
cudnn.enabled = True

train_loss_history = []
train_acc_history = []
test_loss_history = []
test_acc_history = []

now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

PAD = 0
UNK = 1
EOS = 2
SOS = 3
MASK = 4
MAX_LEN = 220

In [4]:
def sample_clamer(temp, model, sample_dataloader, device, vocab):
    '''ArithmeticError
    Generate SMILES strings using the trained GPT model with sampling.
    
    Args:
        temp (float): The temperature parameter for sampling.
        model (CLAMER): The pre-trained GPT model for token generation.
        sample_dataloader (DataLoader): The data loader for the SMILES dataset.
        device (torch.device): The device on which to run the generation.
        vocab: The vocabulary object for encoding and decoding SMILES strings.
    
    Returns:
        List[float]: A list of negative log-likelihoods for the generated SMILES strings.
        List[str]: A list of generated SMILES strings.
    '''
    sample_nll_total = []
    smiles_gen_total = []
    with torch.no_grad():
        for batch_idx, (zeo, syn, tgt) in enumerate(tqdm(sample_dataloader)):
            # Generate the target sequence for the model
            target = [SOS] + [PAD] * 218
            tgt_seq = torch.LongTensor(target).unsqueeze(0).expand(zeo.size(0), len(target)).to(device)
            batch_size = zeo.size(0)
            # Move input tensors to the device
            zeo, syn = zeo.to(device), syn.to(device)
            smiles_gen = [[''] * batch_size][0]
            sample_nll = [0] * batch_size
            finished = np.array([False] * batch_size, dtype=object)
            end_char = '<eos>'
            for i in range(218):
                net_out = model(zeo, syn, tgt_seq)[:, i + 2, :]
                o = F.softmax(net_out, dim=-1).cpu().detach().numpy()
                # sample temp
                if temp != 0:
                    temp = abs(temp)  # No negative values
                    next_char_probs = np.log(o) / temp
                    next_char_probs = np.exp(next_char_probs)
                    next_char_probs = next_char_probs.astype(float)
                    next_char_probs = (next_char_probs.T / (next_char_probs.sum(axis=1))).T
                    sampleidc = torch.tensor(
                        [np.random.multinomial(1, next_char_prob, 1).argmax() for next_char_prob in
                            next_char_probs])
                else:
                    sampleidc = torch.tensor(np.argmax(o, axis=1))

                samplechars = [vocab.itos[idx] for idx in sampleidc.numpy()]

                for idx, samplechar in enumerate(samplechars):
                    if not finished[idx]:
                        if samplechar != end_char:
                            # Append the SMILES with the next character
                            smiles_gen[idx] += samplechar
                            tgt_seq[:, i + 1] = sampleidc.to(device)
                            # Calculate negative log likelihood for the selected character
                            sample_nll[idx] -= np.log(o[idx][sampleidc[idx]])
                        else:
                            finished[idx] = True
                            # print("SMILES has finished at %i" %i)
                # If all SMILES are finished, i.e. the end_char "<eos>" has been generated, stop the generation
            if finished.sum() == len(finished):
                sample_nll_total += sample_nll
                smiles_gen_total += smiles_gen
                    
    return sample_nll_total, smiles_gen_total

In [5]:
# load the data
AFI_smiles = read_strings('./data_AFI/AFI_smiles.csv', idx=False)
AFI_zeo = read_vec('./data_AFI/AFI_zeo.csv', idx=False)
AFI_syn = read_vec('./data_AFI/AFI_syn.csv', idx=False)
CHA_smiles = read_strings('./data_CHA/CHA_smiles.csv', idx=False)
CHA_zeo = read_vec('./data_CHA/CHA_zeo.csv', idx=False)
CHA_syn = read_vec('./data_CHA/CHA_syn.csv', idx=False)

vocab = WordVocab.load_vocab('./model_hub/vocab.pkl')
print('the vocab size is :', len(vocab))
charlen = len(vocab)
print('the total num of charset is :', charlen)

cudnn.benchmark = True
batch_size = 64

manual_seed = 42
random.seed(manual_seed)
torch.manual_seed(manual_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# create the dataset and dataloader
AFI_dataset = Seq2seqDataset(AFI_zeo, AFI_syn, AFI_smiles, vocab)
CHA_dataset = Seq2seqDataset(CHA_zeo, CHA_syn, CHA_smiles, vocab)
AFI_dataloader = DataLoader(AFI_dataset, batch_size=batch_size, shuffle=True)
CHA_dataloader = DataLoader(CHA_dataset, batch_size=batch_size, shuffle=False)

the vocab size is : 45
the total num of charset is : 45


In [6]:
d_model = 128
head = 4
# load the contrastive learning model and original model from checkpoints
model_origin = GptCovd(d_model=d_model, charlen=charlen, device=device, head=head)
model_origin.load_state_dict(torch.load('./checkpoints/best_Clamer_model.pth'))
model_origin.eval()
model_origin.to(device)

model_cl = GptCovd(d_model=d_model, charlen=charlen, device=device, head=head)
model_cl.load_state_dict(torch.load('./checkpoints/clamer/clamer_contrastive_model_9.pth'))
model_cl.eval()
model_cl.to(device)

total = sum(p.numel() for p in model_origin.parameters())
print('total parameters: %0.2fM' % (total / 1e6))  # print the total parameters

total parameters: 1.34M


In [7]:
# generate the AFI smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_origin, sample_dataloader=AFI_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AFI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))
    
# calculate the metrics
AFI_validity_rate_origin = validity_rate(generated_smile_origin)
AFI_uniqueness_rate_origin = uniqueness_rate(generated_smile_origin)
AFI_novelty_rate_origin = novelty_rate(generated_smile_origin, target_smile_origin)
AFI_reconstructability_rate_origin = reconstructability_rate(generated_smile_origin, target_smile_origin)
AFI_IntDiv_origin = IntDiv(generated_smile_origin)
AFI_FCD_score_origin = FCD_score(target_smile_origin, generated_smile_origin)
# print the metrics
print('AFI_validity_rate_origin: ', AFI_validity_rate_origin)
print('AFI_uniqueness_rate_origin: ', AFI_uniqueness_rate_origin)
print('AFI_novelty_rate_origin: ', AFI_novelty_rate_origin)
print('AFI_reconstructability_rate_origin: ', AFI_reconstructability_rate_origin)
print('AFI_IntDiv_origin: ', AFI_IntDiv_origin)
print('AFI_FCD_score_origin: ', AFI_FCD_score_origin)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 33.56it/s]
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 2 4 5 7
[14:08:18] SMILES Parse Error: unclosed ring for input: 'CC1N(C)CCCN(C)CC(C)N(C)C'
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 6
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C1CCN1CCCN1Cc1ccccc1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C(CC[P+](C)(c1ccccc1)C)[N+](Cc1c(ccccc1)cc1)C'
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 1 15 16 17
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C1CC2C3CC(C[N+]2(C)CCC1)C1N(C2)CCCC1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'c1cc(C(C(C(c2ccccc2)O)O)cc2)ccc1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C12C3CCCC[N+]3(C)C(CCCC3)C1CCCC2'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C(C[N+](C)(C)Cc1ccccc1)c1cccc(F)c12'
[14:08:18] Can't kek

the total num of target smiles is : 1000


[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 6
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 2 4 5 7
[14:08:18] SMILES Parse Error: unclosed ring for input: 'CC1N(C)CCCN(C)CC(C)N(C)C'
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 6
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C1CCN1CCCN1Cc1ccccc1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C(CC[P+](C)(c1ccccc1)C)[N+](Cc1c(ccccc1)cc1)C'
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 1 15 16 17
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C1CC2C3CC(C[N+]2(C)CCC1)C1N(C2)CCCC1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'c1cc(C(C(C(c2ccccc2)O)O)cc2)ccc1'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C12C3CCCC[N+]3(C)C(CCCC3)C1CCCC2'
[14:08:18] SMILES Parse Error: unclosed ring for input: 'C(C[N+](C)(C)Cc1ccccc1)c1cccc(F)c12'
[14:08:18] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5
[14:08:18] SMILES Parse Error: unclosed ring for input:

AFI_validity_rate_origin:  0.96
AFI_uniqueness_rate_origin:  0.536
AFI_novelty_rate_origin:  0.8619402985074627
AFI_reconstructability_rate_origin:  0.13805970149253732
AFI_IntDiv_origin:  0.8355384311719931
AFI_FCD_score_origin:  5.452421771259896


In [8]:
# generate the AFI smiles with the contrastive model
sample_nll_total, generated_smile_cl = sample_clamer(temp=0.7, model=model_cl, sample_dataloader=AFI_dataloader, device=device, vocab=vocab)
target_smile_cl = []
for i, (zeo, syn, tgt) in enumerate(tqdm(AFI_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_cl.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_cl))

# calculate the metrics
AFI_validity_rate_cl = validity_rate(generated_smile_cl)
AFI_uniqueness_rate_cl = uniqueness_rate(generated_smile_cl)
AFI_novelty_rate_cl = novelty_rate(generated_smile_cl, target_smile_cl)
AFI_reconstructability_rate_cl = reconstructability_rate(generated_smile_cl, target_smile_cl)
AFI_IntDiv_cl = IntDiv(generated_smile_cl)
AFI_FCD_score_cl = FCD_score(target_smile_cl, generated_smile_cl)
# print the metrics
print('AFI_validity_rate_cl: ', AFI_validity_rate_cl)
print('AFI_uniqueness_rate_cl: ', AFI_uniqueness_rate_cl)
print('AFI_novelty_rate_cl: ', AFI_novelty_rate_cl)
print('AFI_reconstructability_rate_cl: ', AFI_reconstructability_rate_cl)
print('AFI_IntDiv_cl: ', AFI_IntDiv_cl)
print('AFI_FCD_score_cl: ', AFI_FCD_score_cl)

100%|██████████| 16/16 [00:25<00:00,  1.62s/it]
100%|██████████| 16/16 [00:00<00:00, 33.78it/s]
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:08:46] SMILES Parse Error: unclosed ring for input: 'C1C(CO)N(CCC2)Cc2ccccc21'
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 2 3 8 9
[14:08:46] SMILES Parse Error: unclosed ring for input: 'C1CCC2C3CC(C[N+]3(C)CC1)C2CCCC2'
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 3 16 17 19 20
[14:08:46] SMILES Parse Error: unclosed ring for input: 'C1C2[N+](CCCC2)(C)CC2C3N(CCCC3)C1'
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 2 14 15 16 17
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:08:46] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:08:46] SMILES Parse Er

the total num of target smiles is : 1000


[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:08:47] SMILES Parse Error: unclosed ring for input: 'C1C(CO)N(CCC2)Cc2ccccc21'
[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 2 3 8 9
[14:08:47] SMILES Parse Error: unclosed ring for input: 'C1CCC2C3CC(C[N+]3(C)CC1)C2CCCC2'
[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 1 3 16 17 19 20
[14:08:47] SMILES Parse Error: unclosed ring for input: 'C1C2[N+](CCCC2)(C)CC2C3N(CCCC3)C1'
[14:08:47] Can't kekulize mol.  Unkekulized atoms: 0 1 2 14 15 16 17


AFI_validity_rate_cl:  0.991
AFI_uniqueness_rate_cl:  0.556
AFI_novelty_rate_cl:  0.8776978417266187
AFI_reconstructability_rate_cl:  0.1223021582733813
AFI_IntDiv_cl:  0.8391299668011901
AFI_FCD_score_cl:  4.101164941491845


In [9]:
# write the metrics to the folder data_AFI
with open('./data_AFI/AFI_generated_clamer_metrics.txt', 'w') as f:
    # write the mertics
    f.write(f'AFI_validity_rate_origin: {AFI_validity_rate_origin}, AFI_validity_rate_cl: {AFI_validity_rate_cl}\n')
    f.write(f'AFI_uniqueness_rate_origin: {AFI_uniqueness_rate_origin}, AFI_uniqueness_rate_cl: {AFI_uniqueness_rate_cl}\n')
    f.write(f'AFI_novelty_rate_origin: {AFI_novelty_rate_origin}, AFI_novelty_rate_cl: {AFI_novelty_rate_cl}\n')
    f.write(f'AFI_reconstructability_rate_origin: {AFI_reconstructability_rate_origin}, AFI_reconstructability_rate_cl: {AFI_reconstructability_rate_cl}\n')
    f.write(f'AFI_IntDiv_origin: {AFI_IntDiv_origin}, AFI_IntDiv_cl: {AFI_IntDiv_cl}\n')
    f.write(f'AFI_FCD_score_origin: {AFI_FCD_score_origin}, AFI_FCD_score_cl: {AFI_FCD_score_cl}\n')

# write the generated smiles (origin and cl) and target smiles to the folder data_AFI
with open('./data_AFI/AFI_generated_clamer_smiles_origin.txt', 'w') as f:
    for smiles in range(len(generated_smile_origin)):
        f.write(f'origin: {generated_smile_origin[smiles]}, cl: {generated_smile_cl[smiles]}, target: {target_smile_origin[smiles]}\n')

In [10]:
# generate the CHA smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_origin, sample_dataloader=CHA_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(CHA_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))

# calculate the metrics
CHA_validity_rate_origin = validity_rate(generated_smile_origin)
CHA_uniqueness_rate_origin = uniqueness_rate(generated_smile_origin)
CHA_novelty_rate_origin = novelty_rate(generated_smile_origin, target_smile_origin)
CHA_reconstructability_rate_origin = reconstructability_rate(generated_smile_origin, target_smile_origin)
CHA_IntDiv_origin = IntDiv(generated_smile_origin)
CHA_FCD_score_origin = FCD_score(target_smile_origin, generated_smile_origin)
# print the metrics
print('CHA_validity_rate_origin: ', CHA_validity_rate_origin)
print('CHA_uniqueness_rate_origin: ', CHA_uniqueness_rate_origin)
print('CHA_novelty_rate_origin: ', CHA_novelty_rate_origin)
print('CHA_reconstructability_rate_origin: ', CHA_reconstructability_rate_origin)
print('CHA_IntDiv_origin: ', CHA_IntDiv_origin)
print('CHA_FCD_score_origin: ', CHA_FCD_score_origin)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 37.46it/s]
[14:09:15] SMILES Parse Error: unclosed ring for input: 'CC1CCCC(C)N'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C2([n+]3cn(C45CC6CC(CC(C5)C6)C5)cc3)CC3CC1C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C12([n+]3cn(C45CC6CC(CC(C6)C5)C5)cc3)CC3CC(CC(C3)C1)C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C[N+](C)(C)C2CC1C1CCC2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1C(CCCC[N+]3(C)C)(C2)C3'
[14:09:15] Can't kekulize mol.  Unkekulized atoms: 1 2 5
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1CCc2c3c(c4c2C[N+](C)(C)C4)C(C=C2)C[N+]1(C)C'
[14:09:15] SMILES Parse Error: ring closure 3 duplicates bond between atom 2 and atom 3 for input: 'C12CC3(n3cc[n+](C6C5CCC(CC(C6)C5)C5)c3)CC(CC(C3)C1)C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1(C)(C)NCC(C)(C)NC'
[14:09:15] SMILES Parse Error: unclosed ring for

the total num of target smiles is : 1000


[14:09:15] SMILES Parse Error: unclosed ring for input: 'CC1CCCC(C)N'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C2([n+]3cn(C45CC6CC(CC(C5)C6)C5)cc3)CC3CC1C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C12([n+]3cn(C45CC6CC(CC(C6)C5)C5)cc3)CC3CC(CC(C3)C1)C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C[N+](C)(C)C2CC1C1CCC2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1C(CCCC[N+]3(C)C)(C2)C3'
[14:09:15] Can't kekulize mol.  Unkekulized atoms: 1 2 5
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1CCc2c3c(c4c2C[N+](C)(C)C4)C(C=C2)C[N+]1(C)C'
[14:09:15] SMILES Parse Error: ring closure 3 duplicates bond between atom 2 and atom 3 for input: 'C12CC3(n3cc[n+](C6C5CCC(CC(C6)C5)C5)c3)CC(CC(C3)C1)C2'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1(C)(C)NCC(C)(C)NC'
[14:09:15] SMILES Parse Error: unclosed ring for input: 'C1C2CC3([n+]4cn(C56CC6CC(CC(C5)C7)cc4)cc4)CC1CC(C3)C2'
[14:09:15] SMILES Parse Error: u

CHA_validity_rate_origin:  0.982
CHA_uniqueness_rate_origin:  0.604
CHA_novelty_rate_origin:  0.8311258278145696
CHA_reconstructability_rate_origin:  0.16887417218543047
CHA_IntDiv_origin:  0.8836950262074046
CHA_FCD_score_origin:  3.720788849002961


In [11]:
# generate the CHA smiles with the original model
sample_nll_total, generated_smile_origin = sample_clamer(temp=0.7, model=model_cl, sample_dataloader=CHA_dataloader, device=device, vocab=vocab)
target_smile_origin = []
for i, (zeo, syn, tgt) in enumerate(tqdm(CHA_dataloader)):
    tgt = tgt.to(device)
    # convert the tgt to smiles
    tgt_smiles = []
    for seq in tgt:
        smiles = ''
        for idx in seq:
            if idx.item() == EOS:
                break
            elif idx.item() != PAD and idx.item() != SOS:
                smiles += vocab.itos[idx.item()]
        tgt_smiles.append(smiles)
    target_smile_origin.extend(tgt_smiles)
print('the total num of target smiles is :', len(target_smile_origin))

# calculate the metrics
CHA_validity_rate_cl = validity_rate(generated_smile_cl)
CHA_uniqueness_rate_cl = uniqueness_rate(generated_smile_cl)
CHA_novelty_rate_cl = novelty_rate(generated_smile_cl, target_smile_cl)
CHA_reconstructability_rate_cl = reconstructability_rate(generated_smile_cl, target_smile_cl)
CHA_IntDiv_cl = IntDiv(generated_smile_cl)
CHA_FCD_score_cl = FCD_score(target_smile_cl, generated_smile_cl)
# print the metrics
print('CHA_validity_rate_cl: ', CHA_validity_rate_cl)
print('CHA_uniqueness_rate_cl: ', CHA_uniqueness_rate_cl)
print('CHA_novelty_rate_cl: ', CHA_novelty_rate_cl)
print('CHA_reconstructability_rate_cl: ', CHA_reconstructability_rate_cl)
print('CHA_IntDiv_cl: ', CHA_IntDiv_cl)
print('CHA_FCD_score_cl: ', CHA_FCD_score_cl)

100%|██████████| 16/16 [00:25<00:00,  1.61s/it]
100%|██████████| 16/16 [00:00<00:00, 37.63it/s]
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:09:43] SMILES Parse Error: unclosed ring for input: 'C1C(CO)N(CCC2)Cc2ccccc21'
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 2 3 8 9
[14:09:43] SMILES Parse Error: unclosed ring for input: 'C1CCC2C3CC(C[N+]3(C)CC1)C2CCCC2'
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 3 16 17 19 20
[14:09:43] SMILES Parse Error: unclosed ring for input: 'C1C2[N+](CCCC2)(C)CC2C3N(CCCC3)C1'
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 14 15 16 17
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:09:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:09:43] SMILES Parse Er

the total num of target smiles is : 1000


[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 1 2 15 16 18 19
[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 1 14 15 16 17 18
[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 1 2 9 10 11 12
[14:09:44] SMILES Parse Error: unclosed ring for input: 'C1C(CO)N(CCC2)Cc2ccccc21'
[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 2 3 8 9
[14:09:44] SMILES Parse Error: unclosed ring for input: 'C1CCC2C3CC(C[N+]3(C)CC1)C2CCCC2'
[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 1 3 16 17 19 20
[14:09:44] SMILES Parse Error: unclosed ring for input: 'C1C2[N+](CCCC2)(C)CC2C3N(CCCC3)C1'
[14:09:44] Can't kekulize mol.  Unkekulized atoms: 0 1 2 14 15 16 17


CHA_validity_rate_cl:  0.991
CHA_uniqueness_rate_cl:  0.556
CHA_novelty_rate_cl:  0.8776978417266187
CHA_reconstructability_rate_cl:  0.1223021582733813
CHA_IntDiv_cl:  0.8391299668011901
CHA_FCD_score_cl:  4.101164941491845


In [12]:
# write the metrics to the folder data_CHA
with open('./data_CHA/CHA_generated_clamer_metrics.txt', 'w') as f:
    # write the mertics
    f.write(f'CHA_validity_rate_origin: {CHA_validity_rate_origin}, CHA_validity_rate_cl: {CHA_validity_rate_cl}\n')
    f.write(f'CHA_uniqueness_rate_origin: {CHA_uniqueness_rate_origin}, CHA_uniqueness_rate_cl: {CHA_uniqueness_rate_cl}\n')
    f.write(f'CHA_novelty_rate_origin: {CHA_novelty_rate_origin}, CHA_novelty_rate_cl: {CHA_novelty_rate_cl}\n')
    f.write(f'CHA_reconstructability_rate_origin: {CHA_reconstructability_rate_origin}, CHA_reconstructability_rate_cl: {CHA_reconstructability_rate_cl}\n')
    f.write(f'CHA_IntDiv_origin: {CHA_IntDiv_origin}, CHA_IntDiv_cl: {CHA_IntDiv_cl}\n')
    f.write(f'CHA_FCD_score_origin: {CHA_FCD_score_origin}, CHA_FCD_score_cl: {CHA_FCD_score_cl}\n')
# write the generated smiles (origin and cl) and target smiles to the folder data_CHA
with open('./data_CHA/CHA_generated_clamer_smiles_origin.txt', 'w') as f:
    for smiles in range(len(generated_smile_origin)):
        f.write(f'origin: {generated_smile_origin[smiles]}, cl: {generated_smile_cl[smiles]}, target: {target_smile_origin[smiles]}\n')