In [1]:
import numpy as np
import pandas as pd
import os
import torch
import rdkit
from tqdm import tqdm
import random

from utils.utils import *
from utils.enumerator import SmilesEnumerator
from utils.build_vocab import WordVocab
from datasets.data_loader import Contrastive_Seq2seqDataset

In [2]:
# Training split: SMILES strings, zeolite vectors, synthesis vectors and codes.
train_smiles, train_zeo, train_syn, train_codes = (
    read_strings('./data/train_smiles.csv', idx=False),
    read_vec('./data/train_zeo.csv', idx=False),
    read_vec('./data/train_syn.csv', idx=False),
    read_strings('./data/train_codes.csv', idx=False),
)
# Test split: the same four files, read in the same order.
test_smiles, test_zeo, test_syn, test_codes = (
    read_strings('./data/test_smiles.csv', idx=False),
    read_vec('./data/test_zeo.csv', idx=False),
    read_vec('./data/test_syn.csv', idx=False),
    read_strings('./data/test_codes.csv', idx=False),
)

# Pre-built vocabulary; its length is reported below under both labels.
vocab = WordVocab.load_vocab('./model_hub/vocab.pkl')
charlen = len(vocab)
print('the vocab size is :', charlen)

print('the total num of charset is :', charlen)

the vocab size is : 45
the total num of charset is : 45


In [3]:
# Sanity check of the four training arrays, printed in three passes:
# container types, then shapes, then the first five rows of each.
print(*(type(arr) for arr in (train_smiles, train_zeo, train_syn, train_codes)), sep='\n')
print(*(arr.shape for arr in (train_smiles, train_zeo, train_syn, train_codes)), sep='\n')
print(*(arr[:5] for arr in (train_smiles, train_zeo, train_syn, train_codes)), sep='\n')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(144938, 1)
(144938, 7)
(144938, 17)
(144938, 1)
[['n1(C)c(C)[n+](cc1)Cc1ccccc1C[n+]1ccn(C)c1C']
 ['Cn1cc[n+](Cc2ccccc2C[n+]2ccn(C)c2C)c1C']
 ['Cc1n(C)cc[n+]1Cc1c(cccc1)C[n+]1ccn(c1C)C']
 ['c1ccc(c(c1)C[n+]1c(C)n(C)cc1)C[n+]1ccn(c1C)C']
 ['[n+]1(c(n(C)cc1)C)Cc1c(C[n+]2ccn(c2C)C)cccc1']]
[['0.5746268656716419' '0.25' '1.0' '0.06857614477673278'
  '0.45069185152646607' '0.27416454832909665' '0.19443238886477773']
 ['0.5746268656716419' '0.25' '1.0' '0.06857614477673278'
  '0.45069185152646607' '0.27416454832909665' '0.19443238886477773']
 ['0.5746268656716419' '0.25' '1.0' '0.06857614477673278'
  '0.45069185152646607' '0.27416454832909665' '0.19443238886477773']
 ['0.5746268656716419' '0.25' '1.0' '0.06857614477673278'
  '0.45069185152646607' '0.27416454832909665' '0.19443238886477773']
 ['0.5746268656716419' '0.25' '1.0' '0.06857614477673278'
  '0.45069185152646607' '0.27416454832909665' '0.1

In [4]:
# convert zeo and syn to list
# Each conversion is guarded (as in the SMILES / codes cells) so that running
# this cell a second time is a no-op instead of failing with
# "AttributeError: 'list' object has no attribute 'tolist'".
if isinstance(train_zeo, np.ndarray):
    train_zeo = train_zeo.tolist()
if isinstance(train_syn, np.ndarray):
    train_syn = train_syn.tolist()
if isinstance(test_zeo, np.ndarray):
    test_zeo = test_zeo.tolist()
if isinstance(test_syn, np.ndarray):
    test_syn = test_syn.tolist()

# check the zeo and syn
# NOTE(review): the values printed here are strings, not floats; confirm that
# downstream consumers convert them before doing arithmetic.
print(type(train_zeo))
print(type(train_syn))
print(train_zeo[:5])
print(train_syn[:5])

<class 'list'>
<class 'list'>
[['0.5746268656716419', '0.25', '1.0', '0.06857614477673278', '0.45069185152646607', '0.27416454832909665', '0.19443238886477773'], ['0.5746268656716419', '0.25', '1.0', '0.06857614477673278', '0.45069185152646607', '0.27416454832909665', '0.19443238886477773'], ['0.5746268656716419', '0.25', '1.0', '0.06857614477673278', '0.45069185152646607', '0.27416454832909665', '0.19443238886477773'], ['0.5746268656716419', '0.25', '1.0', '0.06857614477673278', '0.45069185152646607', '0.27416454832909665', '0.19443238886477773'], ['0.5746268656716419', '0.25', '1.0', '0.06857614477673278', '0.45069185152646607', '0.27416454832909665', '0.19443238886477773']]
[['1.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0'], ['1.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0'], ['1.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', 

In [5]:
# Turn the SMILES containers into flat Python lists of strings.
# The arrays hold one string per row, i.e. [[''], [''], ['']]; keeping only
# element 0 of every row yields ['', '', '']. The type guard makes the cell
# safe to run again after the conversion has already happened.
if type(train_smiles) is np.ndarray:
    train_smiles = [row[0] for row in train_smiles.tolist()]

if type(test_smiles) is np.ndarray:
    test_smiles = [row[0] for row in test_smiles.tolist()]

# Show the resulting container types, then the first five entries of each split.
print(type(train_smiles), type(test_smiles), sep='\n')
print(train_smiles[:5], test_smiles[:5], sep='\n')

<class 'list'>
<class 'list'>
['n1(C)c(C)[n+](cc1)Cc1ccccc1C[n+]1ccn(C)c1C', 'Cn1cc[n+](Cc2ccccc2C[n+]2ccn(C)c2C)c1C', 'Cc1n(C)cc[n+]1Cc1c(cccc1)C[n+]1ccn(c1C)C', 'c1ccc(c(c1)C[n+]1c(C)n(C)cc1)C[n+]1ccn(c1C)C', '[n+]1(c(n(C)cc1)C)Cc1c(C[n+]2ccn(c2C)C)cccc1']
['[n+]1(ccn(c1C)C)Cc1c(cccc1)C[n+]1c(n(cc1)C)C', 'c1(C[n+]2ccn(C)c2C)ccccc1C[n+]1c(n(C)cc1)C', 'Cc1[n+](Cc2ccccc2C[n+]2ccn(c2C)C)ccn1C', 'n1(c(C)[n+](Cc2c(cccc2)C[n+]2c(n(cc2)C)C)cc1)C', 'c1(C)[n+](Cc2c(C[n+]3ccn(C)c3C)cccc2)ccn1C']


In [6]:
# Turn the code containers into flat Python lists.
# Same layout as the SMILES arrays: one entry per row, so element 0 of each
# row is kept. The type guard makes the cell safe to run again.
if type(train_codes) is np.ndarray:
    train_codes = [row[0] for row in train_codes.tolist()]

if type(test_codes) is np.ndarray:
    test_codes = [row[0] for row in test_codes.tolist()]

# Show the resulting container types, then the first five codes of each split.
print(type(train_codes), type(test_codes), sep='\n')
print(train_codes[:5], test_codes[:5], sep='\n')

<class 'list'>
<class 'list'>
['*BEA', '*BEA', '*BEA', '*BEA', '*BEA']
['*BEA', '*BEA', '*BEA', '*BEA', '*BEA']


In [7]:
# # check if smiles are valid
# # if not, remove them from the dataset including the corresponding zeolite and synthesis vectors, codes
# invalid_smiles_index = []
# sme = SmilesEnumerator()
# for i in range(len(train_smiles)):
#     if sme.randomize_smiles(train_smiles[i]) is None:
#         invalid_smiles_index.append(i)

# # remove invalid smiles and corresponding zeolite and synthesis vectors, codes
# # train_smiles is a list, so we can use pop() to remove elements
# if invalid_smiles_index:
#     for i in sorted(invalid_smiles_index, reverse=True):  # pop from the end so earlier indices stay valid
#         train_smiles.pop(i)
#     train_zeo = np.delete(train_zeo, invalid_smiles_index, axis=0)
#     train_syn = np.delete(train_syn, invalid_smiles_index, axis=0)
#     train_codes = np.delete(train_codes, invalid_smiles_index)
#     print('Invalid train smiles:', invalid_smiles_index)
# else:
#     print('No invalid train smiles')

In [8]:
# # check if test smiles are valid
# # if not, remove them from the dataset including the corresponding zeolite and synthesis vectors, codes
# invalid_smiles_index = []
# sme = SmilesEnumerator()
# for i in range(len(test_smiles)):
#     if sme.randomize_smiles(test_smiles[i]) is None:
#         invalid_smiles_index.append(i)

# # remove invalid smiles and corresponding zeolite and synthesis vectors, codes
# if invalid_smiles_index:
#     for i in sorted(invalid_smiles_index, reverse=True):  # pop from the end so earlier indices stay valid
#         test_smiles.pop(i)
#     test_zeo = np.delete(test_zeo, invalid_smiles_index, axis=0)
#     test_syn = np.delete(test_syn, invalid_smiles_index, axis=0)
#     test_codes = np.delete(test_codes, invalid_smiles_index)
#     print('Invalid test smiles:', invalid_smiles_index)
# else:
#     print('No invalid test smiles')

In [9]:
# copy train and test smiles, merge them into one list
all_smiles = train_smiles.copy()
test_smiles_copy = test_smiles.copy()
all_smiles.extend(test_smiles_copy)
print(len(all_smiles))

# convert all smiles into canonical smiles
for i, raw_smiles in enumerate(all_smiles):
    mol = rdkit.Chem.MolFromSmiles(raw_smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending entry instead of an opaque error from MolToSmiles.
        raise ValueError(f'Invalid SMILES at position {i}: {raw_smiles!r}')
    all_smiles[i] = rdkit.Chem.MolToSmiles(mol, canonical=True)

# get unique smiles
# sorted() gives a deterministic order. Iterating a set of strings depends on
# per-process hash randomisation, so list(set(...)) would assign different row
# indices to the same molecule after every kernel restart.
all_smiles_unique = sorted(set(all_smiles))
print(len(all_smiles_unique))

179742
705


In [10]:
MAX_LEN = 220

def smiles_to_seq(smile, vocab, seq_len=MAX_LEN):
    """Encode one SMILES string as a fixed-length array of vocabulary indices.

    The string is tokenised with ``split`` (expected to return the tokens
    separated by whitespace). Every token is looked up in ``vocab.stoi``, with
    unknown tokens mapped to ``vocab.unk_index``. The indices are wrapped in
    ``vocab.sos_index`` / ``vocab.eos_index`` and right-padded with
    ``vocab.pad_index``.

    Args:
        smile: SMILES string to encode.
        vocab: object exposing ``stoi`` (token -> index mapping) and the
            ``unk_index``, ``sos_index``, ``eos_index`` and ``pad_index``
            attributes.
        seq_len: length of the returned sequence, start/end markers included.

    Returns:
        1-D ``np.ndarray`` with exactly ``seq_len`` entries.

    Raises:
        ValueError: if ``seq_len`` leaves no room for the start/end markers.
    """
    if seq_len < 2:
        raise ValueError('seq_len must be at least 2 to hold the start and end markers')

    # Look up tokens, not the raw characters of the string: iterating over
    # `smile` itself would break multi-character tokens apart.
    tokens = split(smile).split()

    # Keep room for the start/end markers; over-long inputs are truncated so
    # that every returned row has the same length and rows can be stacked.
    max_tokens = seq_len - 2
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]

    content = [vocab.stoi.get(token, vocab.unk_index) for token in tokens]
    X = [vocab.sos_index] + content + [vocab.eos_index]
    X.extend([vocab.pad_index] * (seq_len - len(X)))
    return np.array(X)

In [11]:
# Encode every unique canonical SMILES and collect the rows in one numpy array.
all_smiles_seq = np.array(
    [smiles_to_seq(unique_smile, vocab, seq_len=MAX_LEN) for unique_smile in all_smiles_unique]
)
print(all_smiles_seq.shape)
# Peek at the first five encoded rows before writing them out.
print(all_smiles_seq[:5])

# Persist the encoded sequences to disk ...
np.save('./data/unique_smiles_seq.npy', all_smiles_seq)

# ... and load them straight back to confirm the saved file has the same shape.
all_smiles_seq = np.load('./data/unique_smiles_seq.npy')
print(all_smiles_seq.shape)

(705, 220)
[[ 3  6 11 ...  0  0  0]
 [ 3  6  6 ...  0  0  0]
 [ 3  6 18 ...  0  0  0]
 [ 3  6  6 ...  0  0  0]
 [ 3  6  6 ...  0  0  0]]
(705, 220)


In [12]:
# Lookup table: unique canonical SMILES -> its position in all_smiles_unique.
smiles_dict = {
    unique_smile: position
    for position, unique_smile in enumerate(all_smiles_unique)
}

In [13]:
# # build the dataset
# # code, zeo, syn, smiles, positive_smiles
# # positive_smiles: the smiles which is randomized from the original smiles (10)
# # canonical_smile_index: the index of the original smiles in the all_smiles_seq
# train_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
# for i in tqdm(range(len(train_smiles))):
#     # get the index of the original smiles in the unique smiles
#     canonical_smile = rdkit.Chem.MolToSmiles(rdkit.Chem.MolFromSmiles(train_smiles[i]), canonical=True)
#     # get the positive smiles
#     positive_smiles = []
#     for j in range(10):
#         positive_smiles.append(sme.randomize_smiles(canonical_smile))
    
#     # convert the canonical smiles to sequence
#     canonical_smiles_seq = smiles_to_seq(canonical_smile, vocab)
    
#     # get the canonical smiles row index from the all_smiles_seq by numpy which is the same as the canonical smiles
#     canonical_smile_index = np.where(np.all(all_smiles_seq == canonical_smiles_seq, axis=1))[0][0]
    
#     train_dataset = train_dataset.append({'code': train_codes[i], 'zeo': train_zeo[i], 'syn': train_syn[i],
#                                           'smiles': train_smiles[i], 'positive_smiles': positive_smiles, 
#                                             'canonical_smile_index': canonical_smile_index}, ignore_index=True)
    
#     # every 10000 samples, save the dataset and clear the dataset
#     # avoid the memory error
#     if i % 10000 == 0 and i != 0:
#         # check if the file exists
#         if os.path.exists('./data/train_contrastive_dataset.csv'):
#             train_dataset.to_csv('./data/train_contrastive_dataset.csv', mode='a', header=False, index=False)
#             # clear the dataset
#             train_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
#             print('Save the dataset:', i)
#         else:
#             train_dataset.to_csv('./data/train_contrastive_dataset.csv', mode='w', header=True, index=False)
#             # clear the dataset
#             train_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
#             print('Save the dataset:', i)

# # save the last dataset
# train_dataset.to_csv('./data/train_contrastive_dataset.csv', mode='a', header=False, index=False)


# train_dataset.head(5)
# print(train_dataset.shape)

In [14]:
# # build the test dataset
# test_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
# for i in tqdm(range(len(test_smiles))):
#     # get the index of the original smiles in the unique smiles
#     canonical_smile = rdkit.Chem.MolToSmiles(rdkit.Chem.MolFromSmiles(test_smiles[i]), canonical=True)
#     # get the positive smiles
#     positive_smiles = []
#     for j in range(10):
#         positive_smiles.append(sme.randomize_smiles(canonical_smile))
    
#     # convert the canonical smiles to sequence
#     canonical_smiles_seq = smiles_to_seq(canonical_smile, vocab)
    
#     # get the canonical smiles row index from the all_smiles_seq by numpy which is the same as the canonical smiles
#     canonical_smile_index = np.where(np.all(all_smiles_seq == canonical_smiles_seq, axis=1))[0][0]
    
#     test_dataset = test_dataset.append({'code': test_codes[i], 'zeo': test_zeo[i], 'syn': test_syn[i],
#                                           'smiles': test_smiles[i], 'positive_smiles': positive_smiles, 
#                                             'canonical_smile_index': canonical_smile_index}, ignore_index=True)
    
#     # every 10000 samples, save the dataset and clear the dataset
#     # avoid the memory error
#     if i % 10000 == 0 and i != 0:
#         # check if the file exists
#         if os.path.exists('./data/test_contrastive_dataset.csv'):
#             test_dataset.to_csv('./data/test_contrastive_dataset.csv', mode='a', header=False, index=False)
#             test_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
#             print('Save the dataset:', i)
#         else:
#             test_dataset.to_csv('./data/test_contrastive_dataset.csv', mode='w', header=True, index=False)
#             test_dataset = pd.DataFrame(columns=['code', 'zeo', 'syn', 'smiles', 'positive_smiles', 'canonical_smile_index'])
#             print('Save the dataset:', i)

# # save the last dataset
# test_dataset.to_csv('./data/test_contrastive_dataset.csv', mode='a', header=False, index=False)

# test_dataset.head(5)
# print(test_dataset.shape)

In [15]:
# Load the pre-built contrastive CSVs and wrap each in the project dataset class.
train_dataset = Contrastive_Seq2seqDataset(
    pd.read_csv('./data/train_contrastive_dataset.csv'), vocab, MAX_LEN
)
test_dataset = Contrastive_Seq2seqDataset(
    pd.read_csv('./data/test_contrastive_dataset.csv'), vocab, MAX_LEN
)

# Small batches are enough here: only one batch per loader is inspected.
# The training loader shuffles; the test loader keeps file order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=2,
    shuffle=False,
)

# Pull the first batch from each loader (train first, then test) and print the
# shape of every field followed by the type of every field.
for loader in (train_loader, test_loader):
    for i, (zeo, syn, smiles, positive_smiles, negative_smiles) in enumerate(loader):
        batch = (zeo, syn, smiles, positive_smiles, negative_smiles)
        print(*(field.shape for field in batch), sep='\n')
        print(*(type(field) for field in batch), sep='\n')
        break

100%|██████████| 144938/144938 [00:00<00:00, 755781.09it/s]
100%|██████████| 144938/144938 [00:00<00:00, 565853.57it/s]
100%|██████████| 144938/144938 [00:02<00:00, 51455.05it/s]
100%|██████████| 14803/14803 [00:00<00:00, 1806519.89it/s]
100%|██████████| 14803/14803 [00:00<00:00, 1145688.23it/s]
100%|██████████| 14803/14803 [00:00<00:00, 52018.05it/s]


torch.Size([2, 7])
torch.Size([2, 17])
torch.Size([2, 220])
torch.Size([2, 10, 220])
torch.Size([2, 704, 220])
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
torch.Size([2, 7])
torch.Size([2, 17])
torch.Size([2, 220])
torch.Size([2, 10, 220])
torch.Size([2, 704, 220])
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
