import numpy as np
import ast
from SMILESX import utils, augm
from keras import backend as K
# import tensorflow as tf
# #from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True # dynamically grow the memory used on the GPU
# config.log_device_placement = True # to log device placement (on which device the operation ran)
# sess = tf.Session(config=config)
# K.set_session(sess) # set this TensorFlow session as the default session for Keras
## SMILES Tokenizer
# smiles: input SMILES string to tokenize
# returns:
# list of tokens + terminators in a SMILES
# dictionary of tokens from http://opensmiles.org/opensmiles.html (Formal Grammar)
aliphatic_organic = ['B','C','N','O','S','P','F','Cl','Br','I']
aromatic_organic = ['b','c','n','o','s','p']
bracket = ['[',']'] # includes isotope, symbol, chiral, hcount, charge, class
bond = ['-','=','#','$','/','\\','.']
lrb = ['%'] # long ring bonds '%TWODIGITS'
terminator = [' '] # SPACE - start/end of SMILES
wildcard = ['*']
oov = ['oov'] # out-of-vocabulary tokens
#
def smiles_tokenizer(smiles):
    smiles = smiles.replace('\n','') # remove '\n' if it exists in the SMILES
    # '[...]' as a single token
    smiles = smiles.replace(bracket[0],' '+bracket[0]).replace(bracket[1],bracket[1]+' ')
    # '%TWODIGITS' as a single token
    lrb_print = [smiles[ic:ic+3] for ic,ichar in enumerate(smiles) if ichar==lrb[0]]
    if len(lrb_print)!=0:
        for ichar in lrb_print:
            smiles = smiles.replace(ichar, ' '+ichar+' ')
    # split the SMILES for [...] recognition
    smiles = smiles.split(' ')
    # split fragments other than [...]
    splitted_smiles = list()
    for ifrag in smiles:
        ifrag_tag = False
        for inac in bracket+lrb:
            if inac in ifrag:
                ifrag_tag = True
                break
        if ifrag_tag == False:
            # check for Cl and Br in aliphatic branches so that two-letter symbols are not split apart (e.g. Cl -> 'C','l' is prohibited)
            for iaa in aliphatic_organic[7:9]:
                ifrag = ifrag.replace(iaa, ' '+iaa+' ')
            ifrag_tmp = ifrag.split(' ')
            for iifrag_tmp in ifrag_tmp:
                if iifrag_tmp!=aliphatic_organic[7] \
                and iifrag_tmp!=aliphatic_organic[8]: # not 'Cl' and not 'Br'
                    splitted_smiles.extend(iifrag_tmp) # automatic split char by char
                else:
                    splitted_smiles.extend([iifrag_tmp])
        else:
            splitted_smiles.extend([ifrag]) # keep the original token size
    return terminator+splitted_smiles+terminator # add start + ... + end of SMILES
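## Illustrative example (not part of the original module): brackets and the two-letter
## Cl/Br symbols are kept as single tokens, everything else is split character by character, e.g.
# smiles_tokenizer('C[NH3+]Cl')
# -> [' ', 'C', '[NH3+]', 'Cl', ' ']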
##
## Get tokens from list of tokens from SMILES
# smiles_array: array of SMILES to split as individual tokens
# split_l: number of tokens present in a split (default: 1),
# e.g. split_l = 1 -> np.array(['CC=O']) => [[' ', 'C', 'C', '=', 'O', ' ']],
# split_l = 2 -> np.array(['CC=O']) => [[' C', 'CC', 'C=', '=O', 'O ']],
# split_l = 3 -> np.array(['CC=O']) => [[' CC', 'CC=', 'C=O', '=O ']],
# etc.
# returns:
# list of tokenized SMILES (i.e. a list of lists of tokens)
def get_tokens(smiles_array, split_l = 1):
    tokenized_smiles_list = list()
    for ismiles in smiles_array.tolist():
        tokenized_smiles_tmp = smiles_tokenizer(ismiles)
        tokenized_smiles_list.append([''.join(tokenized_smiles_tmp[i:i+split_l])
                                      for i in range(0,len(tokenized_smiles_tmp)-split_l+1,1)
                                     ])
    return tokenized_smiles_list
##
## Vocabulary extraction
# lltokens: list of lists of tokens (list of tokenized SMILES)
# returns:
# set of individual tokens forming a vocabulary
def extract_vocab(lltokens):
    return set([itoken for ismiles in lltokens for itoken in ismiles])
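## Illustrative example (not part of the original module):
# extract_vocab([[' ', 'C', 'C', '=', 'O', ' '], [' ', 'C', 'Cl', ' ']])
# -> {' ', 'C', '=', 'O', 'Cl'}   (an unordered set of unique tokens)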
##
## Dictionary from tokens to integers, and the opposite
# tokens: list of tokens
# returns:
# dictionary from tokens to integers
def get_tokentoint(tokens):
    return dict((c, i) for i, c in enumerate(tokens))
# returns:
# dictionary from integers to tokens
def get_inttotoken(tokens):
    return dict((i, c) for i, c in enumerate(tokens))
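## Illustrative example (not part of the original module):
# get_tokentoint(['pad', 'unk', 'C']) -> {'pad': 0, 'unk': 1, 'C': 2}
# get_inttotoken(['pad', 'unk', 'C']) -> {0: 'pad', 1: 'unk', 2: 'C'}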
##
## Tokens finder
# data: provided data (numpy array of: (SMILES, property))
# data_name: dataset's name
# data_units: property's SI units
# k_fold_number: number of k-folds used for cross-validation
# k_fold_index: k-fold index to be used for visualization
# augmentation: SMILES's augmentation (Default: False)
# token_tofind: targeted token (elements, bonds, etc.) to find in the training set
# verbose: print SMILES containing the targeted token (0: not print or 1: print, default: 1)
# prints:
# how many training-set SMILES contain the targeted token, and which SMILES if verbose = 1
def TokensFinder(data,
                 data_name,
                 data_units = '',
                 k_fold_number = 8,
                 k_fold_index = 0,
                 augmentation = False,
                 token_tofind = '',
                 verbose = 1):

    print("***SMILES_X token's finder starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=seed_list[k_fold_index],
                       scaling = True)

    # data augmentation or not
    if augmentation == True:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been requested.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, x_valid_enum_card, y_valid_enum = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, x_test_enum_card, y_test_enum = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = get_tokens(x_train_enum)
    x_valid_enum_tokens = get_tokens(x_valid_enum)
    x_test_enum_tokens = get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from the training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens
    tokens = extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(extract_vocab(x_train_enum_tokens))

    # Token finder
    print("Searching the training set for the targeted token...")
    n_found = 0
    for ismiles in x_train_enum_tokens:
        if token_tofind in ismiles:
            n_found += 1
            if verbose == 1:
                print(''.join(ismiles))

    print("\n{} SMILES found with the '{}' token in the training set.".format(n_found, token_tofind))
##
## Save the vocabulary for further use of a model
# vocab: vocabulary (list of tokens to save)
# tftokens: text file name with directory to be saved (*.txt)
def save_vocab(vocab, tftokens):
    with open(tftokens,'w') as f_toks:
        f_toks.write(str(list(vocab)))
##
## Get the vocabulary previously saved
# tftokens: text file name with directory in which the vocabulary is saved (*.txt)
# returns:
# list of individual tokens forming the vocabulary, in the order it was saved
def get_vocab(tftokens):
    with open(tftokens,'r') as f_toks:
        tokens = ast.literal_eval(f_toks.read())
    return tokens
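## Round-trip example (illustrative; 'vocab.txt' is a hypothetical path):
# save_vocab({'C', 'O', '='}, 'vocab.txt')
# get_vocab('vocab.txt')   # -> a list of the saved tokens, e.g. ['C', 'O', '=']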
##
## Add tokens for unknown ('unk') and extra padding ('pad')
# tokens: list of tokens
# vocab_size: vocabulary size before the addition
# returns:
# extended vocabulary
# vocabulary size after extension
def add_extra_tokens(tokens, vocab_size):
    tokens.insert(0,'unk')
    tokens.insert(0,'pad')
    vocab_size = vocab_size+2
    return tokens, vocab_size
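## Illustrative example (not part of the original module):
# add_extra_tokens(['C', 'O'], 2) -> (['pad', 'unk', 'C', 'O'], 4)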
##
## Encode SMILES as a vector of integers
# tokenized_smiles_list: list of tokenized SMILES
# max_length: common length to which every encoded vector is padded (or truncated)
# vocab: vocabulary of tokens
# returns:
# array of integers of dimensions (number_of_SMILES, max_length)
def int_vec_encode(tokenized_smiles_list, max_length, vocab):
    token_to_int = get_tokentoint(vocab)
    int_smiles_array = np.zeros((len(tokenized_smiles_list),max_length), dtype=np.int32)
    for csmiles,ismiles in enumerate(tokenized_smiles_list):
        ismiles_tmp = list()
        if len(ismiles)<= max_length:
            ismiles_tmp = ['pad']*(max_length-len(ismiles))+ismiles # force output vectors to have the same length
        else:
            ismiles_tmp = ismiles[-max_length:] # longer vectors are truncated (to be changed...)
        integer_encoded = [token_to_int[itoken] if(itoken in vocab) \
                           else token_to_int['unk']\
                           for itoken in ismiles_tmp]
        int_smiles_array[csmiles] = integer_encoded
    return int_smiles_array
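## Minimal end-to-end sketch (added for illustration only, not part of the original module):
## tokenize a couple of toy SMILES, build the vocabulary, add the 'pad'/'unk' tokens,
## then integer-encode them. Runs only when this file is executed directly.
if __name__ == '__main__':
    demo_smiles = np.array(['CC=O', 'C[NH3+]Cl'])                    # toy inputs, for illustration
    demo_tokens = get_tokens(demo_smiles)                            # list of lists of tokens
    demo_vocab = list(extract_vocab(demo_tokens))                    # vocabulary as a list
    demo_vocab, demo_vocab_size = add_extra_tokens(demo_vocab, len(demo_vocab))
    demo_maxlen = max(len(ismiles) for ismiles in demo_tokens)       # pad every vector to the longest SMILES
    print("Vocabulary size (incl. 'pad'/'unk'): {}".format(demo_vocab_size))
    print(int_vec_encode(demo_tokens, demo_maxlen, demo_vocab))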