diff --git a/SMILESX/embeddingvis.py b/SMILESX/embeddingvis.py index 1299088..349562f 100644 --- a/SMILESX/embeddingvis.py +++ b/SMILESX/embeddingvis.py @@ -46,18 +46,17 @@ def Embedding_Vis(data, print("***SMILES_X for embedding visualization starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) # data augmentation or not @@ -102,7 +101,7 @@ def Embedding_Vis(data, train_unique_tokens.insert(0,'pad') # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -116,7 +115,7 @@ def Embedding_Vis(data, token_to_int = token.get_tokentoint(tokens) int_to_token = token.get_inttotoken(tokens) - model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', + model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) print("Chosen model summary:\n") @@ -183,6 +182,6 @@ def Embedding_Vis(data, plt.yticks([]) ax.axis('tight') - plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') plt.show() ## diff --git a/SMILESX/inference.py b/SMILESX/inference.py index 887346c..3de6ac4 100644 --- a/SMILESX/inference.py +++ b/SMILESX/inference.py @@ -44,9 +44,6 @@ def Inference(data_name, os.makedirs(save_dir, exist_ok=True) print("***SMILES_X for inference starts...***\n\n") - np.random.seed(seed=123) - seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - print("***Checking the SMILES list for inference***\n") smiles_checked = list() smiles_rejected = list() @@ -95,7 +92,7 @@ def Inference(data_name, for ifold in range(k_fold_number): # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(seed_list[ifold])+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Add 'pad', 'unk' tokens to the existing list vocab_size = len(tokens) tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -105,7 +102,7 @@ def Inference(data_name, int_to_token = token.get_inttotoken(tokens) # Best architecture to visualize from - model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(seed_list[ifold])+'.hdf5', + model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) if ifold == 0: diff --git a/SMILESX/interpret.py b/SMILESX/interpret.py index af1f828..8bbf05a 100644 --- a/SMILESX/interpret.py +++ b/SMILESX/interpret.py @@ -67,18 +67,17 @@ def Interpretation(data, print("***SMILES_X Interpreter starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s') @@ -145,7 +144,7 @@ def Interpretation(data, train_unique_tokens.insert(0,'pad') # Tokens as a list - tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -160,7 +159,7 @@ def Interpretation(data, int_to_token = token.get_inttotoken(tokens) # Best architecture to visualize from - model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', + model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5', custom_objects={'AttentionM': model.AttentionM()}) best_arch = [model_topredict.layers[2].output_shape[-1]/2, model_topredict.layers[3].output_shape[-1], @@ -179,7 +178,7 @@ def Interpretation(data, print("\n") print("***Interpretation from the best model.***\n") - model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') + model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5') model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens, @@ -210,7 +209,7 @@ def Interpretation(data, fontsize = font_size, rotation = font_rotation) plt.yticks([]) - plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #plt.show() smiles_tmp = smiles_toviz_x_enum[ienumcard] @@ -233,7 +232,7 @@ def Interpretation(data, colorMap='Reds', contourLines = 10, alpha = 0.25) - fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #fig.show() model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) @@ -276,7 +275,7 @@ def Interpretation(data, rotation = font_rotation) plt.yticks(fontsize = 20) plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15) - plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight') #plt.show() ## diff --git a/SMILESX/main.py b/SMILESX/main.py index 74973b1..c587fd1 100644 --- a/SMILESX/main.py +++ b/SMILESX/main.py @@ -152,11 +152,10 @@ def Main(data, print("******") print("***Sampling and splitting of the dataset.***\n") - selection_seed = seed_list[ifold] x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[ifold], scaling = True) # data augmentation or not @@ -217,9 +216,9 @@ def Main(data, print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size)) # Save the vocabulary for re-use - token.save_vocab(tokens, save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + token.save_vocab(tokens, save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Tokens as a list - tokens = token.get_vocab(save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt') + tokens = token.get_vocab(save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt') # Add 'pad', 'unk' tokens to the existing list tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size) @@ -355,7 +354,7 @@ def create_mod(params): multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse]) # Checkpoint, Early stopping and callbacks definition - filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5' + filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5' checkpoint = ModelCheckpoint(filepath, monitor='val_loss', @@ -394,14 +393,13 @@ def create_mod(params): plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Validation'], loc='upper right') - plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'.png', bbox_inches='tight') + plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight') plt.close() print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1)) print("***Predictions from the best model.***\n") - model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') -# model.save(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5') + model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5') model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse]) # predict and compare for the training, validation and test sets @@ -520,5 +518,5 @@ def create_mod(params): plt.legend() # Added fold number - plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80) + plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80) plt.close() diff --git a/SMILESX/token.py b/SMILESX/token.py index aa0f4f4..9f3b04a 100644 --- a/SMILESX/token.py +++ b/SMILESX/token.py @@ -117,7 +117,7 @@ def TokensFinder(data, data_name, data_units = '', k_fold_number = 8, - k_fold_index=0, + k_fold_index = 0, augmentation = False, token_tofind = '', verbose = 1): @@ -125,18 +125,17 @@ def TokensFinder(data, print("***SMILES_X token's finder starts...***\n\n") np.random.seed(seed=123) seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist() - # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times - selection_seed = seed_list[k_fold_index] - + print("******") - print("***Fold #{} initiated...***".format(selection_seed)) + print("***Fold #{} initiated...***".format(k_fold_index)) print("******") print("***Sampling and splitting of the dataset.***\n") + # Reproducing the data split of the requested fold (k_fold_index) x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \ utils.random_split(smiles_input=data.smiles, prop_input=np.array(data.iloc[:,1]), - random_state=selection_seed, + random_state=seed_list[k_fold_index], scaling = True) # data augmentation or not diff --git a/SMILESX/utils.py b/SMILESX/utils.py index be7a3c4..11b3dc1 100644 --- a/SMILESX/utils.py +++ b/SMILESX/utils.py @@ -17,7 +17,7 @@ # 3 arrays of properties for training, validation, test: y_train, y_valid, y_test, # the scaling function: scaler def random_split(smiles_input, prop_input, random_state, scaling = True): - + np.random.seed(seed=random_state) full_idx = np.array([x for x in range(smiles_input.shape[0])]) train_idx = np.random.choice(full_idx, size=math.ceil(0.8*smiles_input.shape[0]),