From 4ab86264e1850a4d680b727eb19479ae7eb75897 Mon Sep 17 00:00:00 2001 From: egracheva Date: Thu, 19 Sep 2019 13:25:00 +0900 Subject: [PATCH 1/3] Fixed the random state issue. --- SMILESX/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SMILESX/utils.py b/SMILESX/utils.py index be7a3c4..11b3dc1 100644 --- a/SMILESX/utils.py +++ b/SMILESX/utils.py @@ -17,7 +17,7 @@ # 3 arrays of properties for training, validation, test: y_train, y_valid, y_test, # the scaling function: scaler def random_split(smiles_input, prop_input, random_state, scaling = True): - + np.random.seed(seed=random_state) full_idx = np.array([x for x in range(smiles_input.shape[0])]) train_idx = np.random.choice(full_idx, size=math.ceil(0.8*smiles_input.shape[0]), From cfb0fd6e461657158ad2457087cfd7a3e264efdf Mon Sep 17 00:00:00 2001 From: egracheva Date: Thu, 19 Sep 2019 14:54:06 +0900 Subject: [PATCH 2/3] Changed the seed to fold, cleaned selection_seed and simply put seed_list[ifold] (with variations here depending on the file). --- SMILESX/embeddingvis.py | 13 ++-- SMILESX/inference.py | 7 +-- SMILESX/interpret.py | 19 +++--- SMILESX/main.py | 16 +++-- SMILESX/nsga2sort.py | 135 ++++++++++++++++++++++++++++++++++++++++ SMILESX/token.py | 11 ++-- 6 files changed, 164 insertions(+), 37 deletions(-) create mode 100644 SMILESX/nsga2sort.py diff --git a/SMILESX/embeddingvis.py b/SMILESX/embeddingvis.py index 1299088..349562f 100644 --- a/SMILESX/embeddingvis.py +++ b/SMILESX/embeddingvis.py @@ -46,18 +46,17 @@ def Embedding_Vis(data, print("***SMILES_X for embedding visualization starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. 
seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) # data augmentation or not @@ -102,7 +101,7 @@ def Embedding_Vis(data, train_unique_tokens.insert(0,'pad') # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -116,7 +115,7 @@ def Embedding_Vis(data, token_to_int = token.get_tokentoint(tokens) int_to_token = token.get_inttotoken(tokens) - model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', + model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) print("Chosen model summary:\n") @@ -183,6 +182,6 @@ def Embedding_Vis(data, plt.yticks([]) ax.axis('tight') - plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') plt.show() ## diff --git a/SMILESX/inference.py b/SMILESX/inference.py index 887346c..3de6ac4 100644 --- a/SMILESX/inference.py +++ b/SMILESX/inference.py @@ -44,9 +44,6 @@ def Inference(data_name, os.makedirs(save_dir, exist_ok=True) print("***SMILES_X for inference starts...***\n\n") - np.random.seed(seed=123) - seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - print("***Checking the SMILES list for inference***\n") smiles_checked = list() smiles_rejected = list() @@ -95,7 +92,7 @@ def Inference(data_name, for ifold in range(k_fold_number): # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(seed_list[ifold])+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Add 'pad', 'unk' tokens to the existing list vocab_size = len(tokens) tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -105,7 +102,7 @@ def Inference(data_name, int_to_token = token.get_inttotoken(tokens) # Best architecture to visualize from - model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(seed_list[ifold])+'.hdf5', + model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) if ifold == 0: diff --git a/SMILESX/interpret.py b/SMILESX/interpret.py index af1f828..8bbf05a 100644 --- a/SMILESX/interpret.py +++ b/SMILESX/interpret.py @@ -67,18 +67,17 @@ def Interpretation(data, print("***SMILES_X Interpreter starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. 
seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s') @@ -145,7 +144,7 @@ def Interpretation(data, train_unique_tokens.insert(0,'pad') # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -160,7 +159,7 @@ def Interpretation(data, int_to_token = token.get_inttotoken(tokens) # Best architecture to visualize from - model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', + model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) best_arch = [model_topredict.layers[2].output_shape[-1]/2, model_topredict.layers[3].output_shape[-1], @@ -179,7 +178,7 @@ def Interpretation(data, print("\n") print("***Interpretation from the best model.***\n") - model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') + model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5') model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens, @@ -210,7 +209,7 @@ def Interpretation(data, fontsize = font_size, rotation = font_rotation) plt.yticks([]) - plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #plt.show() smiles_tmp = smiles_toviz_x_enum[ienumcard] @@ -233,7 +232,7 @@ def Interpretation(data, colorMap='Reds', contourLines = 10, alpha = 0.25) - fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #fig.show() model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) @@ -276,7 +275,7 @@ def Interpretation(data, rotation = font_rotation) plt.yticks(fontsize = 20) plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15) - plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #plt.show() ## diff --git a/SMILESX/main.py b/SMILESX/main.py index 74973b1..c587fd1 100644 --- a/SMILESX/main.py +++ b/SMILESX/main.py @@ -152,11 +152,10 @@ def Main(data, print("******") print("***Sampling and splitting of the dataset.***\n") - selection_seed = 
seed_list[ifold] x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[ifold], scaling = True) # data augmentation or not @@ -217,9 +216,9 @@ def Main(data, print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size)) # Save the vocabulary for re-use - token.save_vocab(tokens, save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + token.save_vocab(tokens, save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Tokens as a list - tokens = token.get_vocab(save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -355,7 +354,7 @@ def create_mod(params): multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse]) # Checkpoint, Early stopping and callbacks definition - filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5' + filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5' checkpoint = ModelCheckpoint(filepath, monitor='val_loss', @@ -394,14 +393,13 @@ def create_mod(params): plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Validation'], loc='upper right') - plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight') plt.close() print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1)) print("***Predictions from the best model.***\n") - model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') -# model.save(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') + model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5') model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) # predict and compare for the training, validation and test sets @@ -520,5 +518,5 @@ def create_mod(params): plt.legend() # Added fold number - plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80) + plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80) plt.close() diff --git a/SMILESX/nsga2sort.py b/SMILESX/nsga2sort.py new file mode 100644 index 0000000..20e16e6 --- /dev/null +++ b/SMILESX/nsga2sort.py @@ -0,0 +1,135 @@ +import numpy as np +import warnings + +def nsga_sort(objVals, returnFronts=False): + # This function has been copied from here: + # https://github.com/google/brain-tokyo-workshop/blob/master/WANNRelease/WANN/wann_src/nsga_sort.py + + """Returns ranking of objective values based on non-dominated sorting. + Optionally returns fronts (useful for visualization). 
+ + NOTE: Assumes maximization of objective function + + Args: + objVals - (np_array) - Objective values of each individual + [nInds X nObjectives] + + Returns: + rank - (np_array) - Rank in population of each individual + int([nIndividuals X 1]) + front - (np_array) - Pareto front of each individual + int([nIndividuals X 1]) + + Todo: + * Extend to N objectives + """ + # Sort by dominance into fronts + fronts = getFronts(objVals) + + # Rank each front by crowding distance + for f in range(len(fronts)): + x1 = objVals[fronts[f],0] + x2 = objVals[fronts[f],1] + crowdDist = getCrowdingDist(x1) + getCrowdingDist(x2) + frontRank = np.argsort(-crowdDist) + fronts[f] = [fronts[f][i] for i in frontRank] + + # Convert to ranking + tmp = [ind for front in fronts for ind in front] + rank = np.empty_like(tmp) + rank[tmp] = np.arange(len(tmp)) + + if returnFronts is True: + return rank, fronts + else: + return rank + +def getFronts(objVals): + """Fast non-dominated sort. + + Args: + objVals - (np_array) - Objective values of each individual + [nInds X nObjectives] + + Returns: + front - [list of lists] - One list for each front: + list of indices of individuals in front + + Todo: + * Extend to N objectives + [adapted from: https://github.com/haris989/NSGA-II] + """ + + values1 = objVals[:,0] + values2 = objVals[:,1] + + S=[[] for i in range(0,len(values1))] + front = [[]] + n=[0 for i in range(0,len(values1))] + rank = [0 for i in range(0, len(values1))] + + # Get dominance relations + for p in range(0,len(values1)): + S[p]=[] + n[p]=0 + for q in range(0, len(values1)): + if (values1[p] > values1[q] and values2[p] > values2[q]) \ + or (values1[p] >= values1[q] and values2[p] > values2[q]) \ + or (values1[p] > values1[q] and values2[p] >= values2[q]): + if q not in S[p]: + S[p].append(q) + elif (values1[q] > values1[p] and values2[q] > values2[p]) \ + or ( values1[q] >= values1[p] and values2[q] > values2[p]) \ + or ( values1[q] > values1[p] and values2[q] >= values2[p]): + n[p] = n[p] + 1 + if n[p]==0: + rank[p] = 0 + if p not in front[0]: + front[0].append(p) + + # Assign fronts + i = 0 + while(front[i] != []): + Q=[] + for p in front[i]: + for q in S[p]: + n[q] =n[q] - 1 + if( n[q]==0): + rank[q]=i+1 + if q not in Q: + Q.append(q) + i = i+1 + front.append(Q) + del front[len(front)-1] + return front + +def getCrowdingDist(objVector): + """Returns crowding distance of a vector of values, used once on each front. + Note: Crowding distance of individuals at each end of front is infinite, as + they don't have a neighbor. 
+ Args: + objVector - (np_array) - Objective values of each individual + [nInds X nObjectives] + + Returns: + dist - (np_array) - Crowding distance of each individual + [nIndividuals X 1] + """ + # Order by objective value + key = np.argsort(objVector) + sortedObj = objVector[key] + + # Distance from values on either side + shiftVec = np.r_[np.inf,sortedObj,np.inf] # Edges have infinite distance + warnings.filterwarnings("ignore", category=RuntimeWarning) # inf on purpose + prevDist = np.abs(sortedObj-shiftVec[:-2]) + nextDist = np.abs(sortedObj-shiftVec[2:]) + crowd = prevDist+nextDist + if (sortedObj[-1]-sortedObj[0]) > 0: + crowd *= abs((1/sortedObj[-1]-sortedObj[0])) # Normalize by fitness range + + # Restore original order + dist = np.empty(len(key)) + dist[key] = crowd[:] + + return dist \ No newline at end of file diff --git a/SMILESX/token.py b/SMILESX/token.py index aa0f4f4..9f3b04a 100644 --- a/SMILESX/token.py +++ b/SMILESX/token.py @@ -117,7 +117,7 @@ def TokensFinder(data, data_name, data_units = '', k_fold_number = 8, - k_fold_index=0, + k_fold_index = 0, augmentation = False, token_tofind = '', verbose = 1): @@ -125,18 +125,17 @@ def TokensFinder(data, print("***SMILES_X token's finder starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] - + print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) # data augmentation or not From 3ed12c6096c6801ebd50848fcd38a2f84935d4b0 Mon Sep 17 00:00:00 2001 From: egracheva Date: Thu, 19 Sep 2019 14:55:01 +0900 Subject: [PATCH 3/3] Removed ngsa2 sort file, added before by mistake --- SMILESX/nsga2sort.py | 135 ------------------------------------------- 1 file changed, 135 deletions(-) delete mode 100644 SMILESX/nsga2sort.py diff --git a/SMILESX/nsga2sort.py b/SMILESX/nsga2sort.py deleted file mode 100644 index 20e16e6..0000000 --- a/SMILESX/nsga2sort.py +++ /dev/null @@ -1,135 +0,0 @@ -import numpy as np -import warnings - -def nsga_sort(objVals, returnFronts=False): - # This function has been copied from here: - # https://github.com/google/brain-tokyo-workshop/blob/master/WANNRelease/WANN/wann_src/nsga_sort.py - - """Returns ranking of objective values based on non-dominated sorting. - Optionally returns fronts (useful for visualization). 
- - NOTE: Assumes maximization of objective function - - Args: - objVals - (np_array) - Objective values of each individual - [nInds X nObjectives] - - Returns: - rank - (np_array) - Rank in population of each individual - int([nIndividuals X 1]) - front - (np_array) - Pareto front of each individual - int([nIndividuals X 1]) - - Todo: - * Extend to N objectives - """ - # Sort by dominance into fronts - fronts = getFronts(objVals) - - # Rank each front by crowding distance - for f in range(len(fronts)): - x1 = objVals[fronts[f],0] - x2 = objVals[fronts[f],1] - crowdDist = getCrowdingDist(x1) + getCrowdingDist(x2) - frontRank = np.argsort(-crowdDist) - fronts[f] = [fronts[f][i] for i in frontRank] - - # Convert to ranking - tmp = [ind for front in fronts for ind in front] - rank = np.empty_like(tmp) - rank[tmp] = np.arange(len(tmp)) - - if returnFronts is True: - return rank, fronts - else: - return rank - -def getFronts(objVals): - """Fast non-dominated sort. - - Args: - objVals - (np_array) - Objective values of each individual - [nInds X nObjectives] - - Returns: - front - [list of lists] - One list for each front: - list of indices of individuals in front - - Todo: - * Extend to N objectives - [adapted from: https://github.com/haris989/NSGA-II] - """ - - values1 = objVals[:,0] - values2 = objVals[:,1] - - S=[[] for i in range(0,len(values1))] - front = [[]] - n=[0 for i in range(0,len(values1))] - rank = [0 for i in range(0, len(values1))] - - # Get dominance relations - for p in range(0,len(values1)): - S[p]=[] - n[p]=0 - for q in range(0, len(values1)): - if (values1[p] > values1[q] and values2[p] > values2[q]) \ - or (values1[p] >= values1[q] and values2[p] > values2[q]) \ - or (values1[p] > values1[q] and values2[p] >= values2[q]): - if q not in S[p]: - S[p].append(q) - elif (values1[q] > values1[p] and values2[q] > values2[p]) \ - or ( values1[q] >= values1[p] and values2[q] > values2[p]) \ - or ( values1[q] > values1[p] and values2[q] >= values2[p]): - n[p] = n[p] + 1 - if n[p]==0: - rank[p] = 0 - if p not in front[0]: - front[0].append(p) - - # Assign fronts - i = 0 - while(front[i] != []): - Q=[] - for p in front[i]: - for q in S[p]: - n[q] =n[q] - 1 - if( n[q]==0): - rank[q]=i+1 - if q not in Q: - Q.append(q) - i = i+1 - front.append(Q) - del front[len(front)-1] - return front - -def getCrowdingDist(objVector): - """Returns crowding distance of a vector of values, used once on each front. - Note: Crowding distance of individuals at each end of front is infinite, as - they don't have a neighbor. - Args: - objVector - (np_array) - Objective values of each individual - [nInds X nObjectives] - - Returns: - dist - (np_array) - Crowding distance of each individual - [nIndividuals X 1] - """ - # Order by objective value - key = np.argsort(objVector) - sortedObj = objVector[key] - - # Distance from values on either side - shiftVec = np.r_[np.inf,sortedObj,np.inf] # Edges have infinite distance - warnings.filterwarnings("ignore", category=RuntimeWarning) # inf on purpose - prevDist = np.abs(sortedObj-shiftVec[:-2]) - nextDist = np.abs(sortedObj-shiftVec[2:]) - crowd = prevDist+nextDist - if (sortedObj[-1]-sortedObj[0]) > 0: - crowd *= abs((1/sortedObj[-1]-sortedObj[0])) # Normalize by fitness range - - # Restore original order - dist = np.empty(len(key)) - dist[key] = crowd[:] - - return dist \ No newline at end of file
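
Note on the per-fold seeds used throughout this series: every module regenerates the same seed list from the fixed master seed 123, so a fold index always maps to the same data-split seed. A minimal sketch of that mapping (an illustration only, assuming NumPy and the default k_fold_number = 8 seen in token.TokensFinder):

    import numpy as np

    def fold_seed(k_fold_index, k_fold_number=8):
        # Same derivation as in main.py, interpret.py, embeddingvis.py and
        # token.py: a fixed master seed regenerates an identical seed list in
        # every module, so a fold index always maps to the same split seed.
        np.random.seed(seed=123)
        seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()
        return seed_list[k_fold_index]

    # The mapping is deterministic, which is why artifacts can now be saved and
    # re-loaded by fold index ('..._fold_<k>') instead of by the raw seed value.
    assert fold_seed(3) == fold_seed(3)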
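For the split itself, PATCH 1/3 reseeds NumPy inside utils.random_split() before drawing the training indices, so the same random_state reproduces the same 80/10/10 split on every run. A hedged usage sketch, assuming a pandas DataFrame `data` with a 'smiles' column and the property in column 1 (as in main.Main()), plus the fold_seed() helper sketched above:

    import numpy as np
    from SMILESX import utils

    k_fold_index = 0  # any fold previously trained by main.Main()

    # Same per-fold call as in main.py; with the reseeding fix, repeating it
    # with an identical random_state returns an identical train/valid/test split.
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:, 1]),
                           random_state=fold_seed(k_fold_index),
                           scaling=True)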