In [None]:
# Colab magic: select the TensorFlow 1.x runtime; run it before the cell that imports TensorFlow.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
from collections import Counter
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import tensorflow as tf
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import optimizers

from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

from IPython.display import HTML, display
import time

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import scipy.spatial.distance as ds
import json

def progress(value, max=100):
    """Build an HTML ``<progress>`` bar for display in the notebook.

    Args:
        value: Current progress value; also rendered as the element's
            fallback text.
        max: Value that corresponds to a full bar (default 100). The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping the rendered element.
    """
    # Attributes in an HTML start tag are separated by whitespace only, so
    # no comma follows the max attribute.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [None]:
# Mount Google Drive under /gdrive (force_remount=True requests a fresh mount).
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
# Project root on Drive; other cells build their data/ paths from it.
path = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/"

Mounted at /gdrive


In [None]:
# Hyperparameters
# n: length of each n-gram token sliced out of a protein sequence.
n = 3
# stride: step between the start positions of consecutive n-grams
# (stride == n gives non-overlapping tokens). Both values are also part of
# every vocabulary/training/checkpoint/embedding file name.
stride = 3

# Create vocabulary

Extract sequences

In [None]:
# Both FASTA files are read as alternating header / sequence lines, so the
# sequences are the odd-indexed lines (0-based) with their newline removed.
dataset_swiss = path + "data/swissProt.fasta"
dataset_deeploc = path + "data/deeploc_data.fasta"

with open(dataset_swiss, "r") as f:
  lines = f.readlines()
sequences_swiss = [line.replace("\n","") for line in lines[1::2]]

with open(dataset_deeploc, "r") as f:
  lines = f.readlines()
sequences_deeploc = [line.replace("\n","") for line in lines[1::2]]

# The vocabulary is built over both corpora.
sequences = sequences_swiss + sequences_deeploc

Extract tokens from sequences

In [None]:
# Count every complete n-gram over all sequences. The corpus is walked in
# slices only so that the progress bar can be refreshed between slices.
slice_size = 1000
progressbar = display(progress(0, len(sequences)), display_id=True) # progress bar
token_counter = Counter()

for batch in range(0, len(sequences), slice_size):
  for seq in sequences[batch:batch+slice_size]:
    # Stopping at len(seq)-n yields only full-length n-grams, so a truncated
    # tail is never counted. The counter is fed directly with Python
    # strings; no intermediate NumPy array is needed.
    token_counter.update(seq[i:i+n] for i in range(0, len(seq)-n+1, stride))
  # Report the number of sequences processed so far, so the bar is full
  # once the last slice is done.
  progressbar.update(progress(min(batch+slice_size, len(sequences)), len(sequences)))

Save vocabulary

In [None]:
# Write the vocabulary file: the three special tokens first, then every
# n-gram from most to least frequent, one token per line.
special_tokens = ['<S>', '</S>', '<UNK>']
# Plain lists keep the tokens as Python strings; this also works when the
# counter is empty (only the special tokens are written then).
tokens_name = special_tokens + [token for token, _count in token_counter.most_common()]
with open(path + "data/vocabulary_"+str(n)+"_"+str(stride)+".txt", "w") as f:
  for token in tokens_name:
    f.write(token+"\n")

# Create training dataset

In [None]:
# Only the SwissProt sequences are used for the training corpus;
# `sequences` is rebound here to that single file's content.
dataset = path + "data/swissProt.fasta"

with open(dataset, "r") as f:
  lines = f.readlines()
# Sequences sit on the odd-indexed lines (0-based); drop their newlines.
sequences = [line.replace("\n","") for line in lines[1::2]]

In [None]:
# Write the training corpus: one file per slice of sequences, one sequence
# per line, every token followed by a single space.
slice_size = 100
progressbar = display(progress(0, len(sequences)), display_id=True) # progress bar

training_dir = path + "data/training_"+str(n)+"_"+str(stride)+"/"
# Create the output folder once, before the loop.
os.makedirs(training_dir, exist_ok=True)

# Distinct names for the file index, the position inside the slice and the
# n-gram start offset, so none of them shadows another.
for file_index, batch in enumerate(range(0, len(sequences), slice_size)):
  with open(training_dir+str(file_index)+".txt", "w") as f:
    for position, seq in enumerate(sequences[batch:batch+slice_size]):
      # Only complete n-grams are written; a truncated tail is dropped.
      for start in range(0, len(seq)-n+1, stride):
        f.write(seq[start:start+n]+" ")
      # No newline after the last sequence of a full slice.
      if position != slice_size-1:
        f.write("\n")
  # Report the number of sequences written so far, so the bar fills up.
  progressbar.update(progress(min(batch+slice_size, len(sequences)), len(sequences)))

# Elmo



> ## Install



In [None]:
# Fetch the bilm-tf sources and rename the checkout to `bilmtf` (no hyphen),
# the package name used by the `from bilmtf.bilm ...` imports in later cells.
!git clone https://github.com/allenai/bilm-tf.git && mv bilm-tf/ bilmtf

Cloning into 'bilm-tf'...
remote: Enumerating objects: 292, done.[K
remote: Total 292 (delta 0), reused 0 (delta 0), pack-reused 292[K
Receiving objects: 100% (292/292), 588.40 KiB | 9.05 MiB/s, done.
Resolving deltas: 100% (137/137), done.


In [None]:
# Run the cloned package's setup.py install with the `python` found on PATH.
!cd bilmtf/ && python setup.py install



> ## Train




Get n_train_tokens:

In [None]:
# Count the tokens in the training corpus. Each token in the training files
# is followed by a single space, so the number of space-separated pieces of
# a line minus one is that line's token count.
n_train_tokens_ = 0
training_path = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/training_"+str(n)+"_"+str(stride)+"/"
training_files = [f for f in listdir(training_path) if isfile(join(training_path, f))]

for file_name in training_files:
  # Open each file from the very directory that was listed above.
  with open(join(training_path, file_name), "r") as f:
    # Stream line by line rather than loading the whole file.
    for line_i in f:
      n_train_tokens_ += len(line_i.split(" ")) - 1
n_train_tokens_

67650480

Get n_vocab_tokens (probably unnecessary):

In [None]:
# The vocabulary file holds one token per line, so its size is its line count.
vocab_path = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/vocabulary_"+str(n)+"_"+str(stride)+".txt"

with open(vocab_path, "r") as f:
  n_vocab_tokens = sum(1 for line in f)
n_vocab_tokens

9544

> ### train_elmo.py



Start training

In [None]:
import argparse

from bilmtf.bilm.training import train, load_options_latest_checkpoint, load_vocab
from bilmtf.bilm.data import BidirectionalLMDataset


def main(args):
    """Train the bilm-tf bidirectional language model on the n-gram corpus.

    Args:
        args: Sequence of three strings:
            args[0] -- directory passed to ``train`` as both the save and
                       the log directory;
            args[1] -- vocabulary file path handed to ``load_vocab``;
            args[2] -- file prefix/glob handed to ``BidirectionalLMDataset``.

    Also reads the notebook-level ``n_train_tokens_`` for the
    'n_train_tokens' option.
    """
    # load the vocab
    # NOTE(review): the second argument is presumably the maximum number of
    # characters per token (it matches 'max_characters_per_token' below) --
    # confirm against bilm-tf.
    vocab = load_vocab(args[1], 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in the training data, taken from the notebook-level
    # n_train_tokens_ variable
    n_train_tokens = n_train_tokens_

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      # 261 for training; the Prediction section of this notebook rewrites
      # the saved options.json to 262.
      'n_characters': 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': 1,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 1,
    }

    prefix = args[2]
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    # Checkpoints and logs share the same directory.
    tf_save_dir = args[0]
    tf_log_dir = args[0]
    # AUTO_REUSE on the current variable scope; presumably this lets the cell
    # be re-run in the same kernel without variable-name clashes -- confirm.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      train(options, data, n_gpus, tf_save_dir, tf_log_dir)


# Driver for main(): builds the three positional arguments it expects
# (checkpoint directory, vocabulary file, training-file glob) from the
# notebook-level n and stride.
if __name__ == '__main__':
    args = ["/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/checkpoint_"+str(n)+"_"+str(stride)+"/",
            "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/vocabulary_"+str(n)+"_"+str(stride)+".txt",
            "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/training_"+str(n)+"_"+str(stride)+"/*"]
    
    # create checkpoint folder if it doesn't exist
    # (built from the notebook-level `path`, which must point at the same
    # Drive folder as the absolute paths in `args`)
    if not os.path.exists(path + "data/checkpoint_"+str(n)+"_"+str(stride)+"/"):
      os.makedirs(path + "data/checkpoint_"+str(n)+"_"+str(stride)+"/")

    main(args)

Save weights

In [None]:
# Command for lab pc
# cd bilmtf/ && python bin/dump_weights.py --save_dir "../data/checkpoint_3_1/" --outfile "../data/weights/weights_3_1.hdf5"

In [None]:
# Run bilm-tf's dump_weights.py on the checkpoint directory, writing the
# weights_{n}_{stride}.hdf5 file that the Prediction section loads.
# {n} and {stride} are filled in by IPython from the notebook variables.
!cd bilmtf/ && python bin/dump_weights.py --save_dir "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/checkpoint_{n}_{stride}/" --outfile "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/weights/weights_{n}_{stride}.hdf5"

> ### Evaluate (not working in colab)

In [None]:
# Run bilm-tf's run_test.py on the saved checkpoint.
# NOTE(review): --test_prefix points at the training files, not at a held-out set.
!cd bilmtf/ && python bin/run_test.py \
    --test_prefix="/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/training_{n}_{stride}/*" \
    --vocab_file "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/vocabulary_{n}_{stride}.txt" \
    --save_dir "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/checkpoint_{n}_{stride}/"

In [None]:
# command for lab pc
# cd bilmtf/ && python bin/run_test.py --test_prefix="../data/heldout_3_1/*" --vocab_file "../data/vocabulary_3_1.txt" --save_dir "../data/checkpoint_3_1/"



> ## Prediction



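`n_characters` must be changed from 261 to 262 in `options.json` before prediction: bilm-tf trains with 261 character ids but uses one extra id for padding at inference time.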

In [None]:
# Rewrite the checkpoint's options.json in place with n_characters set to 262.
options_path = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/checkpoint_"+str(n)+"_"+str(stride)+"/options.json"

with open(options_path) as f:
  options = json.load(f)

options['char_cnn']['n_characters'] = 262

with open(options_path, 'w') as json_file:
  json.dump(options, json_file)

Prepare the model

In [None]:
from bilmtf.bilm import Batcher, BidirectionalLanguageModel, weight_layers


# Locations of the vocabulary, the options.json in the checkpoint folder and
# the weights .hdf5 file for the current n / stride.
vocab_file = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/vocabulary_"+str(n)+"_"+str(stride)+".txt"
options_file = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/checkpoint_"+str(n)+"_"+str(stride)+"/options.json"
weight_file = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/weights/weights_"+str(n)+"_"+str(stride)+".hdf5"
 
# Create a Batcher to map text to character ids.
# NOTE(review): 50 is presumably the maximum characters per token, matching
# 'max_characters_per_token' in the training options -- confirm.
batcher = Batcher(vocab_file, 50)
 
# Input placeholders to the biLM.
# NOTE(review): presumably (batch, tokens, characters per token) -- confirm.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
 
# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)
 

# AUTO_REUSE on the current variable scope; presumably so the cell can be
# re-run in the same kernel without variable-name clashes -- confirm.
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
  # Get ops to compute the LM embeddings.
  context_embeddings_op = bilm(context_character_ids)
 
  # Get an op to compute ELMo (weighted average of the internal biLM layers)
  elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)

Get proteins embedding from deeploc_data

In [None]:
deeploc_file = "/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/deeploc_data.fasta"

# Location name as it appears in a FASTA header -> integer class id.
labels_dic_location = {
    'Cell.membrane': 0,
    'Cytoplasm': 1,
    'Endoplasmic.reticulum': 2,
    'Golgi.apparatus': 3,
    'Lysosome/Vacuole': 4,
    'Mitochondrion': 5,
    'Nucleus': 6,
    'Peroxisome': 7,
    'Plastid': 8,
    'Extracellular': 9
}


# Now we can compute embeddings.
# The FASTA file is read as alternating header / sequence lines.
with open(deeploc_file, "r") as f:
  lines = f.readlines()
  sequences_deeploc = [seq.replace("\n","") for i,seq in enumerate(lines) if i%2!=0]
  headers_deeploc = [seq.replace("\n","") for i,seq in enumerate(lines) if i%2==0]

# NOTE(review): unlike the vocabulary and training-file cells, a trailing
# n-gram shorter than n is kept here -- confirm this is intended.
tokenized_context = []
for seq in sequences_deeploc:
  x = [seq[i:i+n] for i in range(0,len(seq), stride)]
  tokenized_context.append(x)
print(len(tokenized_context))

embeddings_path = "./data/embeddings_"+str(n)+"_"+str(stride)+"/"
# One sequence per step: the loop below reads exactly one header per slice.
slice_size = 1 # Don't change this value
file_suffix = "_"+str(n)+"_"+str(stride)
progress_file = embeddings_path+"sequence_completed"+file_suffix+".txt"

# create embeddings folder if it doesn't exist
if not os.path.exists(embeddings_path):
  os.makedirs(embeddings_path)

# restart from where you left
resuming = os.path.isfile(progress_file)
if resuming:
  with open(progress_file, "r") as f:
    sequence_completed = int(f.readline())
else:
  sequence_completed = 0


def _load_or_placeholder(name, placeholder):
  """Return the saved array `name` when resuming and it exists, else `placeholder`.

  An array file is only written once a sequence of its split has been
  embedded, so an interrupted run may have a progress file but lack some
  of the arrays; those restart from the placeholder row.
  """
  array_file = embeddings_path+name+file_suffix+".npy"
  if resuming and os.path.isfile(array_file):
    return np.load(array_file)
  return placeholder


# Each array starts with one placeholder row (zeros for X, 99 for labels);
# the cell that loads the .npz archives for classification drops it with [1:].
X_train = _load_or_placeholder("X_train", np.zeros((1,1024)))
y_train_subcellular = _load_or_placeholder("y_train_subcellular", np.ones((1))*99)
y_train_membrane = _load_or_placeholder("y_train_membrane", np.ones((1))*99)
X_test = _load_or_placeholder("X_test", np.zeros((1,1024)))
y_test_subcellular = _load_or_placeholder("y_test_subcellular", np.ones((1))*99)
y_test_membrane = _load_or_placeholder("y_test_membrane", np.ones((1))*99)

#start extracting
with tf.compat.v1.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
  # It is necessary to initialize variables once before running inference.
  sess.run(tf.global_variables_initializer())
  for batch in range(sequence_completed, len(tokenized_context), slice_size):
    tokens = tokenized_context[batch:batch+slice_size]
    header_tokens = headers_deeploc[batch:batch+slice_size][0]
    print(header_tokens)
    # Class ids of every location name that occurs in the header.
    headers_class = [val for key,val in labels_dic_location.items() if key in header_tokens]
    # Skip headers containing "-U" and headers that do not match exactly one
    # location name.
    if "-U" not in header_tokens and len(headers_class) == 1:
      # Create batches of data.
      context_ids = batcher.batch_sentences(tokens)

      # Compute ELMo representations (here for the input only, for simplicity).
      elmo_context_input_ = sess.run(
          elmo_context_input['weighted_op'],
          feed_dict={context_character_ids: context_ids}
      )
      # Average over axis 1 with NumPy. Building a tf.reduce_mean op here
      # would add new nodes to the graph on every iteration.
      elmo_context_input_ = np.mean(elmo_context_input_, axis=1)
      print("Shape of generated embeddings = ",elmo_context_input_.shape)

      # Membrane label: 0 when the header contains "-M", 1 otherwise.
      membrane_label = [0] if "-M" in header_tokens else [1]

      # Save (the whole arrays are rewritten after every embedded sequence
      # so that an interrupted run can resume)
      if "test" not in header_tokens:
        X_train = np.concatenate((X_train, elmo_context_input_), axis=0)
        y_train_subcellular = np.concatenate((y_train_subcellular, headers_class), axis=0)
        y_train_membrane = np.concatenate((y_train_membrane, membrane_label), axis=0)
        np.save(embeddings_path+"X_train"+file_suffix+".npy", X_train)
        np.save(embeddings_path+"y_train_subcellular"+file_suffix+".npy", y_train_subcellular)
        np.save(embeddings_path+"y_train_membrane"+file_suffix+".npy", y_train_membrane)
      else:
        X_test = np.concatenate((X_test, elmo_context_input_), axis=0)
        y_test_subcellular = np.concatenate((y_test_subcellular, headers_class), axis=0)
        y_test_membrane = np.concatenate((y_test_membrane, membrane_label), axis=0)
        np.save(embeddings_path+"X_test"+file_suffix+".npy", X_test)
        np.save(embeddings_path+"y_test_subcellular"+file_suffix+".npy", y_test_subcellular)
        np.save(embeddings_path+"y_test_membrane"+file_suffix+".npy", y_test_membrane)

    # Record progress after every sequence, including skipped ones.
    sequence_completed += slice_size
    with open(progress_file, "w+") as f:
      f.write(str(sequence_completed))

    print(f"Sequence completed {sequence_completed}/{len(tokenized_context)}")

In [None]:
# Bundle the per-split .npy arrays into two compressed .npz archives.
embeddings_path = "./data/embeddings_"+str(n)+"_"+str(stride)+"/"

X_train = np.load(embeddings_path+"X_train_"+str(n)+"_"+str(stride)+".npy")
y_train_subcellular = np.load(embeddings_path+"y_train_subcellular_"+str(n)+"_"+str(stride)+".npy")
y_train_membrane = np.load(embeddings_path+"y_train_membrane_"+str(n)+"_"+str(stride)+".npy")
X_test = np.load(embeddings_path+"X_test_"+str(n)+"_"+str(stride)+".npy")
y_test_subcellular = np.load(embeddings_path+"y_test_subcellular_"+str(n)+"_"+str(stride)+".npy")
y_test_membrane = np.load(embeddings_path+"y_test_membrane_"+str(n)+"_"+str(stride)+".npy")

# Archives are named train_<n>_<stride>.npz / test_<n>_<stride>.npz (no
# leading dot), the file names the classification section loads.
# NOTE(review): that section reads them from a Drive folder, so the archives
# still have to be copied there.
np.savez_compressed(embeddings_path+"train_"+str(n)+"_"+str(stride), X_train=X_train, y_train_location=y_train_subcellular, y_train_membrane=y_train_membrane)
np.savez_compressed(embeddings_path+"test_"+str(n)+"_"+str(stride), X_test=X_test, y_test_location=y_test_subcellular, y_test_membrane=y_test_membrane)



# Subcellular localization classification



In [None]:
class Attention(tf.keras.layers.Layer):
  """Additive (Bahdanau-style) attention over a sequence of states.

  ``call(inputs, hidden)`` scores each step of ``inputs`` against ``hidden``
  and returns the score-weighted sum of ``inputs`` over axis 1 together with
  the attention weights.
  """

  def __init__(self, units):
      super().__init__()
      # Projection applied to the sequence passed as `inputs`.
      self.W1 = tf.keras.layers.Dense(units)
      # Projection applied to the single `hidden` state.
      self.W2 = tf.keras.layers.Dense(units)
      # Reduces each projected step to one unnormalised score.
      self.V = tf.keras.layers.Dense(1)

  def call(self, inputs, hidden):
      # Insert an axis at position 1 so `hidden` broadcasts against `inputs`
      # in the sum below.
      query = tf.expand_dims(hidden, 1)

      # Additive score: V(tanh(W1(inputs) + W2(hidden)))
      energies = self.V(tf.nn.tanh(self.W1(inputs) + self.W2(query)))

      # Normalise the scores over axis 1, then take the weighted sum of
      # `inputs` over that same axis.
      attention_weights = tf.nn.softmax(energies, axis=1)
      context_vector = tf.reduce_sum(attention_weights * inputs, axis=1)

      return context_vector, attention_weights

In [None]:
import math

def create_CNN_LSTM_Attention_complete():
  """Build and compile the two-headed CNN + BiLSTM + attention classifier.

  The network takes inputs of shape (1, 1024), applies six parallel Conv1D
  branches with different kernel sizes, a further Conv1D, a bidirectional
  LSTM and the Attention layer, and ends in two softmax heads:
  "subcellular" (n_class units) and "membrane" (2 units).

  Reads the notebook-level ``n_class``, which must be defined before this
  function is called.

  Returns:
      The compiled ``keras.Model`` with outputs [subcellular, membrane].
  """
  n_hid=20          # LSTM units per direction
  lr=0.0005         # Adam learning rate
  drop_prob=0.6     # dropout on the input
  drop_hid=0.1      # dropout inside the LSTM and around the dense layer
  n_filt=40         # filters per parallel convolution branch
  n_feat=1024       # size of the last input axis
  n_membrane_class=2


  inputs = keras.Input(shape=(1, n_feat))

  l_drop1 = layers.Dropout(drop_prob)(inputs)

  # Size of convolutional layers
  f_size_a = 1
  f_size_b = 3
  f_size_c = 5
  f_size_d = 9
  f_size_e = 15
  f_size_f = 21

  # initialization with random orthogonal weights using sqrt(2) for rectified linear units as scaling factor
  initializer = tf.keras.initializers.Orthogonal(gain=math.sqrt(2))

  # NOTE(review): with data_format='channels_first' the input's axis of
  # size 1 is presumably the channel axis and the 1024 features the
  # convolved axis -- confirm this is intended.
  l_conv_a = layers.Conv1D(n_filt, f_size_a, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)
  l_conv_b = layers.Conv1D(n_filt, f_size_b, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)
  l_conv_c = layers.Conv1D(n_filt, f_size_c, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)
  l_conv_d = layers.Conv1D(n_filt, f_size_d, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)
  l_conv_e = layers.Conv1D(n_filt, f_size_e, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)
  l_conv_f = layers.Conv1D(n_filt, f_size_f, strides=1, padding="same", kernel_initializer=initializer,
                            activation="relu", data_format='channels_first')(l_drop1)

  # concatenate all convolutional layers
  l_conc = tf.keras.layers.Concatenate(axis=1)([l_conv_a, l_conv_b, l_conv_c, l_conv_d, l_conv_e, l_conv_f])

  l_conv_final = layers.Conv1D(
      filters=128, kernel_size=f_size_b, strides=1, padding="same", activation="relu",
      data_format='channels_first')(l_conc)

  # encoders LSTM
  l_lstm, forward_h, forward_c, backward_h, backward_c = layers.Bidirectional \
      (layers.LSTM(n_hid, dropout=drop_hid, return_sequences=True, return_state=True,
                    activation="tanh")) \
      (l_conv_final)
  state_h = layers.Concatenate()([forward_h, backward_h])
  # state_c is built but not used by anything below.
  state_c = layers.Concatenate()([forward_c, backward_c])

  # Set up the attention layer
  context_vector, attention_weights = Attention(n_hid * 2)(l_lstm, state_h)

  l_drop2 = layers.Dropout(drop_hid)(context_vector)

  l_dense = layers.Dense(n_hid * 2, activation="relu", kernel_initializer=initializer)(l_drop2)

  l_drop3 = layers.Dropout(drop_hid)(l_dense)

  # n_class is read from the notebook namespace at call time.
  l_out_subcellular = layers.Dense(n_class, activation="softmax", name="subcellular")(l_drop3)
  l_out_membrane = layers.Dense(n_membrane_class, activation="softmax", name="membrane")(l_drop3)
  model = keras.Model(inputs, [l_out_subcellular, l_out_membrane])

  # gradient clipping clips parameters' gradients during backprop by a maximum value of 2
  # with clipnorm the gradients will be clipped when their L2 norm exceeds this value.
  model.compile(loss=['categorical_crossentropy', 'categorical_crossentropy'],
                      optimizer=optimizers.Adam(learning_rate=lr, clipvalue=2, clipnorm=3),
                      metrics=['accuracy'])

  # setting initial state tensors to be passed to the first call of the cell (cell init and hid init in
  # bidirectional LSTM)
  # NOTE(review): this only assigns an `initial_states` attribute on whatever
  # layer sits at index 12; it is unclear whether index 12 is the
  # bidirectional LSTM or whether Keras ever reads this attribute -- confirm.
  model.layers[12].initial_states = [tf.keras.initializers.Orthogonal(), tf.keras.initializers.Orthogonal()]

  return model

In [None]:
# Load the embedding archives. Row 0 of every array is the placeholder row
# written by the extraction cell, so it is dropped with [1:].
# NOTE(review): `train` rebinds the name imported from bilmtf.bilm.training,
# so the ELMo training cell cannot be re-run after this cell without
# re-running its import.
train = np.load("/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/noSecVeq/embeddings_"+str(n)+"_"+str(stride)+"/train_"+str(n)+"_"+str(stride)+".npz")
X_train = train['X_train'][1:]
y_train_subcellular = train['y_train_location'][1:]
y_train_membrane = train['y_train_membrane'][1:]

validation = np.load("/gdrive/My Drive/ProteinsML/Protein-subcellular-localization/Elmo/data/noSecVeq/embeddings_"+str(n)+"_"+str(stride)+"/test_"+str(n)+"_"+str(stride)+".npz")
X_val = validation['X_test'][1:]
y_val_subcellular = validation['y_test_location'][1:]
y_val_membrane = validation['y_test_membrane'][1:]


# One-hot encoding
n_class = 10
y_train_subcellular = to_categorical(y_train_subcellular, n_class)
y_train_membrane = to_categorical(y_train_membrane, 2)
y_val_subcellular = to_categorical(y_val_subcellular, n_class)
y_val_membrane = to_categorical(y_val_membrane, 2)

# Add the length-1 middle axis the model input expects. -1 lets NumPy infer
# the number of samples, so the cell works for any dataset size.
X_train = np.reshape(X_train, (-1, 1, 1024))
X_val = np.reshape(X_val, (-1, 1, 1024))

In [None]:
# Build the network, then fit both output heads on the training split while
# evaluating on the validation split after each epoch.
model = create_CNN_LSTM_Attention_complete()

n_epochs = 120

history = model.fit(
    X_train,
    [y_train_subcellular, y_train_membrane],
    validation_data=(X_val, [y_val_subcellular, y_val_membrane]),
    epochs=n_epochs,
    batch_size=128,
)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

In [None]:
# Predict once. The model outputs are ordered [subcellular, membrane], the
# order in which the two heads are passed to keras.Model.
Y_pred = model.predict(X_val)
y_pred_subcellular = np.argmax(Y_pred[0], axis=1)
y_pred_membrane = np.argmax(Y_pred[1], axis=1)

# Each score compares a head's predictions with that head's own labels:
# MCC for the membrane head, the multi-class (Gorodkin) coefficient for the
# subcellular head. [1:] drops the placeholder first row of the raw labels.
MCC = matthews_corrcoef(validation['y_test_membrane'][1:], y_pred_membrane)
gorodkin = matthews_corrcoef(validation['y_test_location'][1:], y_pred_subcellular)

print("Minimum subcellular validation loss: {:.6f}".format(min(history.history['val_subcellular_loss'])))
# Epoch with the lowest subcellular validation loss; the three figures
# below are all reported at that epoch.
acc_index = np.argmin(history.history['val_subcellular_loss'])
print("With subcellular accuracy: {:.6f}".format(history.history['val_subcellular_accuracy'][acc_index]))
print("Membrane validation loss at that epoch: {:.6f}".format(history.history['val_membrane_loss'][acc_index]))
print("With membrane accuracy: {:.6f}".format(history.history['val_membrane_accuracy'][acc_index]))
print("MCC: " + str(MCC))
print("Gorodkin: " + str(gorodkin))

Minimum subcellular validation loss: 1.195808
With subcellular accuracy: 0.607776
Minimum membrane validation loss: 0.367878
With membrane accuracy: 0.844483
MCC: 0.6602917169936836
Gorodkin: 0.2675304361520159
