## Import Library

In [1]:
import gzip
import numpy as np
import random as rd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.layers import LSTM, GRU, Dense, RepeatVector, TimeDistributed, Input, \
    multiply, concatenate, Flatten, Activation, dot, Bidirectional, Embedding, Dropout, Conv1D, \
    MultiHeadAttention, Add, LayerNormalization, BatchNormalization

from keras.models import Sequential, Model, load_model

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model, model_to_dot

from tensorflow.keras.regularizers import L1, L2



In [2]:
# Exploratory load of the CullPDB profile and one-hot sequence arrays.
# The same files are loaded again inside prepareData(); this cell only checks
# that they can be read and reports the (samples, positions) shape.
# Each archive is opened in a `with` block so the handle is closed even when
# np.load raises.
with gzip.open('../Data/Preprocessed/CullPDB_6133_filtered_profile_MASK.npy.gz', "rb") as f:
    CullPDB_profile = np.load(f)

with gzip.open('../Data/Preprocessed/CullPDB_6133_filtered_sequence_MASK.npy.gz', "rb") as f:
    CullPDB_sequence = np.load(f)

## CullPDB_sequence_integer : buffer meant to hold one integer per residue for the Embedding input.
## It is only allocated (all zeros) in this cell; prepareData() is where it gets filled.
CullPDB_sequence_integer = np.zeros((CullPDB_sequence.shape[0], CullPDB_sequence.shape[1]))
print(CullPDB_sequence_integer.shape)

(5534, 700)


## Load Data

In [2]:

def prepareData (data_dir='../Data/Preprocessed', hmm_path='Data/hmm_train.npy'):
    """Load the CullPDB training arrays and bundle them into one dictionary.

    Parameters
    ----------
    data_dir : str
        Directory holding the three gzip-compressed ``.npy`` files
        (``..._profile_MASK``, ``..._sequence_MASK``, ``..._traininglabel_MASK``).
    hmm_path : str
        Path of the uncompressed HMM profile ``.npy`` file. The default is
        relative to the current working directory, not to ``data_dir``.

    Returns
    -------
    dict
        ``"sequence"``    : float64 array (samples, positions) with the index of
                            the largest entry of every one-hot row,
        ``"profile"``     : array loaded from the profile file,
        ``"label"``       : array loaded from the training-label file,
        ``"hmm_profile"`` : read-only memory-mapped array from ``hmm_path``.

    Raises
    ------
    FileNotFoundError
        If any of the four files does not exist.
    """

    def _load_gz (filename):
        # The context manager closes the archive even if np.load raises.
        with gzip.open(data_dir + '/' + filename, "rb") as handle:
            return np.load(handle)

    CullPDB_profile = _load_gz('CullPDB_6133_filtered_profile_MASK.npy.gz')
    CullPDB_sequence = _load_gz('CullPDB_6133_filtered_sequence_MASK.npy.gz')

    ## CullPDB_sequence_integer : transform one hot encoding into a single integer for Embedding input
    # One vectorised argmax over the last axis replaces a per-residue Python
    # loop. Assumes the sequence array is (samples, positions, classes).
    # The float64 cast keeps the dtype the np.zeros-based buffer used to have.
    CullPDB_sequence_integer = np.argmax(CullPDB_sequence, axis=-1).astype(np.float64)
    print(CullPDB_sequence_integer.shape)
    print(CullPDB_sequence_integer)

    CullPDB_traininglabel = _load_gz('CullPDB_6133_filtered_traininglabel_MASK.npy.gz')

    # The second positional argument of np.load is mmap_mode, so the previous
    # np.load(path, "r") call was already memory-mapping; spell it out.
    hmm_profile = np.load(hmm_path, mmap_mode="r")

    CullPDB_data = {"sequence": CullPDB_sequence_integer, "profile": CullPDB_profile, "label":CullPDB_traininglabel, "hmm_profile":hmm_profile}

    return CullPDB_data

## Build Model

In [3]:
 def buildModel (data, gru_hidden, do, epoch, bs, model_name='model') :
    """Build, train and save the self-attention + three-branch BiGRU tagger.

    The network embeds the integer sequence, concatenates it with the PSSM and
    HMM profiles, applies single-head self-attention with a residual
    connection, runs three parallel Dense -> BiGRU -> BiGRU branches
    (widths 128 / 256 / 512), concatenates them and classifies every position
    with a softmax over 9 classes.

    Parameters
    ----------
    data : dict
        Must provide ``"sequence"``, ``"profile"``, ``"hmm_profile"`` (model
        inputs, in that order) and ``"label"`` (targets). The input layers are
        declared as (700,), (700, 21) and (700, 20); the output is (700, 9).
    gru_hidden : int
        Accepted for backward compatibility but currently unused: the branch
        widths are fixed at 128 / 256 / 512.
    do : float
        Dropout rate applied to the concatenated branch outputs.
    epoch : int
        Maximum number of training epochs (early stopping may end sooner).
    bs : int
        Batch size passed to ``fit``.
    model_name : str
        Stem of the saved file; the model is written to ``model_name + '.h5'``.

    Returns
    -------
    tuple
        ``(model, history)``: the trained Keras model and the ``History``
        object returned by ``fit``.
    """

    sequence_length = 700
    output_dim = 9
    pssm_input_dim = 21
    hmm_input_dim = 20
    embedding_input_dim = 22
    embedding_output_dim = 22

    # (GRU/Dense width, GRU input-dropout rate) of each parallel branch.
    branch_configs = [(128, 0.3), (256, 0.5), (512, 0.6)]

    def _recurrent_branch (features, width, rate):
        """Dense(63) -> Dense(width) -> two stacked BiGRU(width) -> BatchNorm."""
        hidden = Dense(63, activation='relu')(features)
        hidden = Dense(width, activation='relu')(hidden)
        # Only the per-position sequence feeds the next stage, so the final
        # GRU states are not requested.
        hidden = Bidirectional(GRU(width, return_sequences=True, dropout=rate))(hidden)
        hidden = Bidirectional(GRU(width, return_sequences=True, dropout=rate))(hidden)
        return BatchNormalization(synchronized=True)(hidden)

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():

        ## Layers
        # input: pssm profile, integer-encoded sequence and hmm profile
        pssm_input = Input(shape=(sequence_length, pssm_input_dim))
        embedding_input = Input(shape=(sequence_length, ))
        hmm_input = Input(shape=(sequence_length, hmm_input_dim))

        # embedding layer for the integer-encoded sequence
        embedding_out = Embedding(input_dim=embedding_input_dim, output_dim=embedding_output_dim, input_length=sequence_length, mask_zero=True)(embedding_input)

        # NOTE(review): tf.concat is a raw TF op rather than a Keras layer;
        # whether the Embedding mask survives it should be confirmed before
        # swapping it for a Keras Concatenate layer.
        model_input = tf.concat([embedding_out, pssm_input, hmm_input], axis=2)

        ## Self attention here
        ## No Mask is used here
        att_output = MultiHeadAttention(num_heads=1, key_dim=model_input.shape[-1], dropout=0.5)(query=model_input, value=model_input)
        model_input_att = Add()([model_input, att_output])
        model_input_att = LayerNormalization()(model_input_att)
        model_input_att = BatchNormalization(synchronized=True)(model_input_att)

        ## Blocks 1-3: same structure, increasing width, all fed by the attention output
        branch_outputs = [_recurrent_branch(model_input_att, width, rate)
                          for width, rate in branch_configs]

        ## Combine the blocks
        whole_sequence_output = tf.concat(branch_outputs, axis=-1)
        whole_sequence_output = Dropout(do)(whole_sequence_output)

        # Fully Connected Layers
        dense0 = Dense(512, activation='relu')(whole_sequence_output)
        dense1 = Dense(256, activation='relu')(dense0)
        dense2 = Dense(128, activation='relu')(dense1)

        out = TimeDistributed(Dense(output_dim, activation='softmax'))(dense2)

        ## MODEL TRAINING
        # Model, optimizer and compile all live inside the strategy scope.
        model = Model(inputs=[embedding_input, pssm_input, hmm_input], outputs=out)
        opt = Adam(learning_rate=0.0001)
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    # The summary lists every layer's output shape.
    model.summary()

    plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

    # restore_best_weights makes the saved model the one with the lowest
    # val_loss instead of the weights from `patience` epochs later.
    es = EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True)
    history = model.fit([data["sequence"], data["profile"], data["hmm_profile"]], data["label"],
                    epochs=epoch, verbose=1, callbacks=[es], validation_split=0.1,
                    batch_size=bs)

    # Plot the training and validation loss on a fresh figure so repeated
    # calls do not draw on top of each other.
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    # The second curve comes from validation_split, not from a test set.
    ax.legend(['train', 'validation'], loc='upper left')

    # Save the model
    model.save(model_name+'.h5')

    return model, history



In [4]:
# Gather every training array into a single dictionary.
train_data = prepareData()

# Launch training; hyper-parameters are passed by name so each value is
# self-describing. The trailing semicolon stops the cell from echoing
# whatever the call returns.
buildModel(train_data, gru_hidden=256, do=0.5, epoch=100, bs=64);


(5534, 700)
[[ 6.  4. 20. ...  0.  0.  0.]
 [16. 14.  3. ...  0.  0.  0.]
 [10.  9.  7. ...  0.  0.  0.]
 ...
 [15.  7. 12. ...  0.  0.  0.]
 [ 1.  6.  1. ...  0.  0.  0.]
 [10.  9. 16. ...  0.  0.  0.]]


FileNotFoundError: [Errno 2] No such file or directory: 'Data/hhm_train.npy'