Commit

Merge pull request #5 from egracheva/master
Random seed bug
GLambard committed Oct 8, 2019
2 parents 9f5b5f4 + 57a562c commit 290cf9f
Showing 6 changed files with 30 additions and 38 deletions.
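Before the per-file diffs, it helps to see the mechanism the fix relies on: every SMILES-X module regenerates the same per-fold seed list from a fixed NumPy seed, so a fold's data split can be reproduced anywhere without storing the seeds. A minimal sketch of that idea (with `k_fold_number = 8`, the default visible in `TokensFinder` below):

```python
import numpy as np

# Each module regenerates the identical per-fold seed list from a fixed
# global seed; fold k's split is then reproduced via seed_list[k].
k_fold_number = 8  # default from TokensFinder
np.random.seed(seed=123)
seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()
```

After this commit, the seed values are used only as the `random_state` of the split itself; saved artifacts (vocabularies, model weights, plots) are named by the fold index instead.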
13 changes: 6 additions & 7 deletions SMILESX/embeddingvis.py
@@ -46,18 +46,17 @@ def Embedding_Vis(data,
     print("***SMILES_X for embedding visualization starts...***\n\n")
     np.random.seed(seed=123)
     seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-    selection_seed = seed_list[k_fold_index]
 
     print("******")
-    print("***Fold #{} initiated...***".format(selection_seed))
+    print("***Fold #{} initiated...***".format(k_fold_index))
     print("******")
 
     print("***Sampling and splitting of the dataset.***\n")
+    # Reproducing the data split of the requested fold (k_fold_index)
     x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
         utils.random_split(smiles_input=data.smiles,
                            prop_input=np.array(data.iloc[:,1]),
-                           random_state=selection_seed,
+                           random_state=seed_list[k_fold_index],
                            scaling = True)
 
     # data augmentation or not
@@ -102,7 +101,7 @@ def Embedding_Vis(data,
     train_unique_tokens.insert(0,'pad')
 
     # Tokens as a list
-    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
     # Add 'pad', 'unk' tokens to the existing list
     tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
 
@@ -116,7 +115,7 @@ def Embedding_Vis(data,
     token_to_int = token.get_tokentoint(tokens)
     int_to_token = token.get_inttotoken(tokens)
 
-    model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
+    model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5',
                              custom_objects={'AttentionM': model.AttentionM()})
 
     print("Chosen model summary:\n")
@@ -183,6 +182,6 @@ def Embedding_Vis(data,
     plt.yticks([])
     ax.axis('tight')
 
-    plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     plt.show()
##
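The hunks in this file, and the matching ones in the files below, apply a single renaming pattern: per-fold artifacts were keyed by the drawn seed value before, and are keyed by the fold index after. An illustrative before/after, where `data_name` and the seed value are hypothetical:

```python
data_name = 'FreeSolv'   # hypothetical dataset name
k_fold_index = 0
selection_seed = 843828  # hypothetical draw from the seeded RNG

# Before: the artifact name depended on the RNG draw for the fold.
old_name = 'LSTMAtt_' + data_name + '_model.best_seed_' + str(selection_seed) + '.hdf5'
# After: the name depends only on the fold index, so it is stable and predictable.
new_name = 'LSTMAtt_' + data_name + '_model.best_fold_' + str(k_fold_index) + '.hdf5'
```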
7 changes: 2 additions & 5 deletions SMILESX/inference.py
@@ -44,9 +44,6 @@ def Inference(data_name,
     os.makedirs(save_dir, exist_ok=True)
 
     print("***SMILES_X for inference starts...***\n\n")
-    np.random.seed(seed=123)
-    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-
     print("***Checking the SMILES list for inference***\n")
     smiles_checked = list()
     smiles_rejected = list()
@@ -95,7 +92,7 @@ def Inference(data_name,
     for ifold in range(k_fold_number):
 
         # Tokens as a list
-        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(seed_list[ifold])+'.txt')
+        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
         # Add 'pad', 'unk' tokens to the existing list
         vocab_size = len(tokens)
         tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
@@ -105,7 +102,7 @@ def Inference(data_name,
         int_to_token = token.get_inttotoken(tokens)
 
         # Best architecture to visualize from
-        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(seed_list[ifold])+'.hdf5',
+        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5',
                                  custom_objects={'AttentionM': model.AttentionM()})
 
         if ifold == 0:
19 changes: 9 additions & 10 deletions SMILESX/interpret.py
@@ -67,18 +67,17 @@ def Interpretation(data,
     print("***SMILES_X Interpreter starts...***\n\n")
     np.random.seed(seed=123)
     seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-    selection_seed = seed_list[k_fold_index]
 
     print("******")
-    print("***Fold #{} initiated...***".format(selection_seed))
+    print("***Fold #{} initiated...***".format(k_fold_index))
     print("******")
 
     print("***Sampling and splitting of the dataset.***\n")
+    # Reproducing the data split of the requested fold (k_fold_index)
     x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
         utils.random_split(smiles_input=data.smiles,
                            prop_input=np.array(data.iloc[:,1]),
-                           random_state=selection_seed,
+                           random_state=seed_list[k_fold_index],
                            scaling = True)
 
     np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
@@ -145,7 +144,7 @@ def Interpretation(data,
     train_unique_tokens.insert(0,'pad')
 
     # Tokens as a list
-    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
     # Add 'pad', 'unk' tokens to the existing list
     tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
 
@@ -160,7 +159,7 @@ def Interpretation(data,
     int_to_token = token.get_inttotoken(tokens)
 
     # Best architecture to visualize from
-    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
+    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5',
                                  custom_objects={'AttentionM': model.AttentionM()})
     best_arch = [model_topredict.layers[2].output_shape[-1]/2,
                  model_topredict.layers[3].output_shape[-1],
@@ -179,7 +178,7 @@ def Interpretation(data,
     print("\n")
 
     print("***Interpretation from the best model.***\n")
-    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
+    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5')
     model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
 
     smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens,
@@ -210,7 +209,7 @@ def Interpretation(data,
                fontsize = font_size,
                rotation = font_rotation)
     plt.yticks([])
-    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #plt.show()
 
     smiles_tmp = smiles_toviz_x_enum[ienumcard]
@@ -233,7 +232,7 @@ def Interpretation(data,
                colorMap='Reds',
                contourLines = 10,
                alpha = 0.25)
-    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #fig.show()
 
     model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
@@ -276,7 +275,7 @@ def Interpretation(data,
                rotation = font_rotation)
     plt.yticks(fontsize = 20)
     plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
-    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_fold_'+str(k_fold_index)+'.png', bbox_inches='tight')
     #plt.show()
##
16 changes: 7 additions & 9 deletions SMILESX/main.py
@@ -152,11 +152,10 @@ def Main(data,
        print("******")
 
        print("***Sampling and splitting of the dataset.***\n")
-       selection_seed = seed_list[ifold]
        x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
            utils.random_split(smiles_input=data.smiles,
                               prop_input=np.array(data.iloc[:,1]),
-                              random_state=selection_seed,
+                              random_state=seed_list[ifold],
                               scaling = True)
 
        # data augmentation or not
@@ -217,9 +216,9 @@ def Main(data,
        print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
 
        # Save the vocabulary for re-use
-       token.save_vocab(tokens, save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+       token.save_vocab(tokens, save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
        # Tokens as a list
-       tokens = token.get_vocab(save_dir+data_name+'_tokens_set_seed'+str(selection_seed)+'.txt')
+       tokens = token.get_vocab(save_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
        # Add 'pad', 'unk' tokens to the existing list
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
 
@@ -355,7 +354,7 @@ def create_mod(params):
        multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])
 
        # Checkpoint, Early stopping and callbacks definition
-       filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5'
+       filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'
 
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
@@ -394,14 +393,13 @@ def create_mod(params):
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper right')
-       plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
+       plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
        plt.close()
 
        print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))
 
        print("***Predictions from the best model.***\n")
-       model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
-       # model.save(save_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
+       model_train.load_weights(save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5')
        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
 
        # predict and compare for the training, validation and test sets
@@ -520,5 +518,5 @@ def create_mod(params):
        plt.legend()
 
        # Added fold number
-       plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_seed_'+str(selection_seed)+'_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
+       plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
        plt.close()
11 changes: 5 additions & 6 deletions SMILESX/token.py
@@ -117,26 +117,25 @@ def TokensFinder(data,
                 data_name,
                 data_units = '',
                 k_fold_number = 8,
-                k_fold_index=0,
+                k_fold_index = 0,
                 augmentation = False,
                 token_tofind = '',
                 verbose = 1):
 
    print("***SMILES_X token's finder starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
-   # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
-   selection_seed = seed_list[k_fold_index]
-
 
    print("******")
-   print("***Fold #{} initiated...***".format(selection_seed))
+   print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")
 
    print("***Sampling and splitting of the dataset.***\n")
+   # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:,1]),
-                          random_state=selection_seed,
+                          random_state=seed_list[k_fold_index],
                           scaling = True)
 
    # data augmentation or not
2 changes: 1 addition & 1 deletion SMILESX/utils.py
@@ -17,7 +17,7 @@
 # 3 arrays of properties for training, validation, test: y_train, y_valid, y_test,
 # the scaling function: scaler
 def random_split(smiles_input, prop_input, random_state, scaling = True):
-
+    np.random.seed(seed=random_state)
     full_idx = np.array([x for x in range(smiles_input.shape[0])])
     train_idx = np.random.choice(full_idx,
                                  size=math.ceil(0.8*smiles_input.shape[0]),
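This one-line change is the heart of the fix: the old body of `random_split` drops straight into `np.random.choice` without seeding, so the draw depended on whatever global RNG state the caller left behind. A minimal sketch of the fixed behaviour, assuming the 80% training draw shown above (the `split_train_idx` helper name is illustrative):

```python
import math
import numpy as np

# Reduced version of the fixed random_split: seeding inside the function
# makes the training-index draw deterministic for a given random_state.
def split_train_idx(n_samples, random_state):
    np.random.seed(seed=random_state)
    full_idx = np.arange(n_samples)
    return np.random.choice(full_idx,
                            size=math.ceil(0.8 * n_samples),
                            replace=False)

a = split_train_idx(100, random_state=17)
np.random.rand(5)  # perturb the global RNG state in between
b = split_train_idx(100, random_state=17)
assert (a == b).all()  # same random_state -> same split
```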
