In [None]:
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import torch.backends.cudnn as cudnn

# import custom modules
from datasets.data_loader import *
from models.clamer import *
from models.loss import InfoNCELoss
from models.trfm import *
from utils.utils import *
from utils.plot_figures import *
from utils.metrics import *
from utils.build_vocab import *

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from matplotlib import pyplot as plt
from utils.build_vocab import WordVocab
from models.trfm import *

In [None]:
# Read the reference SMILES of each data set (AFI, CHA, AEI) from its CSV file
AFI_smiles, CHA_smiles, AEI_smiles = [
    read_strings(csv_path, idx=False)
    for csv_path in (
        './data_AFI/AFI_smiles.csv',
        './data_CHA/CHA_smiles.csv',
        './data_AEI/AEI_smiles.csv',
    )
]

In [None]:
# Read the generated SMILES tables, one CSV per data set (AFI, CHA, AEI)
AFI_generate_smiles, CHA_generate_smiles, AEI_generate_smiles = [
    pd.read_csv(csv_path)
    for csv_path in (
        './generation/AFI_generate_smiles.csv',
        './generation/CHA_generate_smiles.csv',
        './generation/AEI_generate_smiles.csv',
    )
]

def validity_smiles(smiles):
    """Return the entries of ``smiles`` that RDKit can parse into a molecule.

    ``Chem.MolFromSmiles`` reports an unparsable string by returning ``None``
    (it does not raise), so the parse result is checked explicitly instead of
    relying on an exception.

    Parameters
    ----------
    smiles : iterable
        Candidate SMILES. Entries that are not strings (for example the NaN
        used by pandas for missing CSV cells) are skipped.

    Returns
    -------
    list of str
        The parsable SMILES, in their original order.
    """
    valid = []
    for smile in smiles:
        # Missing values are not strings and can never be a valid SMILES.
        if not isinstance(smile, str):
            continue
        try:
            mol = Chem.MolFromSmiles(smile)
        except Exception:
            # Best effort: an entry the parser chokes on is simply not valid.
            continue
        if mol is not None:
            valid.append(smile)
    return valid

# Run every column of the generated tables through validity_smiles
AFI_generate_smiles, CHA_generate_smiles, AEI_generate_smiles = [
    generated.apply(validity_smiles)
    for generated in (AFI_generate_smiles, CHA_generate_smiles, AEI_generate_smiles)
]

def random_sample(df, n, n_epochs=6):
    """Draw ``n`` SMILES without replacement from each ``epoch_<i>`` column.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Container indexed by the labels ``epoch_0`` ... ``epoch_<n_epochs-1>``.
        Each lookup may yield a Series (DataFrame input) or a plain list
        (Series whose cells are lists); both are accepted.
    n : int
        Number of SMILES to draw per epoch.
    n_epochs : int, optional
        Number of ``epoch_<i>`` columns to sample; defaults to 6.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows and one column per epoch, in epoch order.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a column holds fewer than ``n`` items.
    """
    sampled = {}
    for epoch in range(n_epochs):
        column = 'epoch_{}'.format(epoch)
        # random.sample() only accepts true sequences and rejects a pandas
        # Series with TypeError, so materialise the column as a list first.
        population = list(df[column])
        sampled[column] = random.sample(population, n)
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(sampled)

# Keep 50 randomly chosen SMILES per epoch column for each data set
AFI_generate_smiles, CHA_generate_smiles, AEI_generate_smiles = [
    random_sample(generated, 50)
    for generated in (AFI_generate_smiles, CHA_generate_smiles, AEI_generate_smiles)
]

print(AFI_generate_smiles)

In [None]:
# Augment a list of SMILES with randomized (non-canonical) variants:
# every parsable input SMILES contributes `num` new strings.
def smiles_randomize(smiles, num=10):
    """Generate ``num`` randomized SMILES strings for each input SMILES.

    Parameters
    ----------
    smiles : iterable of str
        Input SMILES. Entries RDKit cannot parse (``MolFromSmiles`` returns
        ``None``) are skipped and contribute nothing to the output.
    num : int, optional
        Number of variants to generate per parsable input; defaults to 10.

    Returns
    -------
    list of str
        The generated variants, grouped by input in the original order.
        The input strings themselves are not included.
    """
    new_smiles = []
    for smile in smiles:
        # Parse once per molecule rather than once per generated variant.
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            continue
        for _ in range(num):
            # canonical=False alone is deterministic and would emit the same
            # string `num` times; doRandom=True randomizes the atom traversal
            # so each call can yield a different SMILES of the same molecule.
            new_smiles.append(Chem.MolToSmiles(mol, canonical=False, doRandom=True))
    return new_smiles

In [None]:
# Flatten AFI, AEI, CHA into plain lists such as ['CCO', 'CCN'] by taking
# the first field of every row
AFI_smiles = [row[0] for row in AFI_smiles.tolist()]
print(AFI_smiles[:5])
CHA_smiles = [row[0] for row in CHA_smiles.tolist()]
AEI_smiles = [row[0] for row in AEI_smiles.tolist()]

# Replace each list by its randomized variants, requesting 5 per SMILES
AFI_smiles, CHA_smiles, AEI_smiles = (
    smiles_randomize(AFI_smiles, 5),
    smiles_randomize(CHA_smiles, 5),
    smiles_randomize(AEI_smiles, 5),
)

In [None]:
# calculate the KL divergence of the AFI, AEI, CHA smiles for every epoch
def calculate_KL_divergence(real_smiles, generated_smiles):
    """
    Compute one KL divergence value per epoch.

    real_smiles: the reference SMILES, passed unchanged to KL_divergence for every epoch
    generated_smiles: table of generated SMILES with the columns
        'epoch_0', 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4', 'epoch_5'

    return a list with the KL_divergence result of each of the 6 epochs, in epoch order
    """
    return [
        # each epoch column is handed over as a plain Python list
        KL_divergence(real_smiles, generated_smiles['epoch_{}'.format(epoch)].tolist())
        for epoch in tqdm(range(6))
    ]

# Per-epoch KL divergence between the reference and the generated SMILES
# of each data set (AFI, CHA, AEI)
AFI_KL_divergence = calculate_KL_divergence(
    real_smiles=AFI_smiles, generated_smiles=AFI_generate_smiles
)
CHA_KL_divergence = calculate_KL_divergence(
    real_smiles=CHA_smiles, generated_smiles=CHA_generate_smiles
)
AEI_KL_divergence = calculate_KL_divergence(
    real_smiles=AEI_smiles, generated_smiles=AEI_generate_smiles
)
print(AFI_KL_divergence, CHA_KL_divergence, AEI_KL_divergence, sep='\n')

In [None]:
# draw the KL divergence of the AFI, AEI, CHA smiles for every epoch
import matplotlib.pyplot as plt
import seaborn as sns

def draw_KL_divergence(KL_divergence, title):
    """
    Plot the per-epoch KL divergence, save the figure and display it.

    KL_divergence: sequence with one KL divergence value per epoch
        (inside this function the name shadows the KL_divergence metric function)
    title: the title of the plot; also used in the output file name
        './KL_divergence_<title>.png'
    """
    n_epochs = len(KL_divergence)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(KL_divergence, marker='o')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('KL Divergence')
    # One tick per value actually plotted, labelled epoch_0, epoch_1, ...
    ax.set_xticks(np.arange(n_epochs))
    ax.set_xticklabels(['epoch_{}'.format(i) for i in range(n_epochs)])
    ax.grid()
    # Save BEFORE showing: plt.show() may release the figure (it does with the
    # inline notebook backend), so saving afterwards can write an empty image.
    fig.savefig('./KL_divergence_{}.png'.format(title))
    plt.show()
    plt.close(fig)
# One KL-divergence-per-epoch figure for each data set (AFI, CHA, AEI)
draw_KL_divergence(KL_divergence=AFI_KL_divergence, title='AFI')
draw_KL_divergence(KL_divergence=CHA_KL_divergence, title='CHA')
draw_KL_divergence(KL_divergence=AEI_KL_divergence, title='AEI')