In [90]:
import os, shutil, pickle, shelve
#from Bio import SeqIO
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import load_model, Model
from keras.utils import plot_model
from keras.models import Model, Sequential
from keras.models import Sequential
from keras.layers import Dense, Dropout, MaxPooling1D, Flatten, Conv1D, LSTM, CuDNNLSTM, Bidirectional
from keras.optimizers import RMSprop, Adam
from keras.initializers import glorot_normal
import utilities

In [4]:
def get_hybrid(opt):
    """  Return a hybrid network given a optimizer
    """
    model = Sequential()
    model.add(Conv1D(filters = 30, 
                     kernel_size = 15, 
                     padding = 'valid',
                     data_format = 'channels_last',
                     activation = 'relu'))
    model.add(MaxPooling1D(pool_size = 5, strides = 5))
    model.add(Dropout(0.6))
    model.add(Bidirectional(LSTM(10)))
    model.add(Dropout(0.7))
    model.add(Dense(20, activation = 'relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])
    return model

def train(model, train_x, train_y, val_data, config = {'epochs': 35, 'batch_size': 256}):
    """  Train model for a given config, training data, and validation data
    """
    epochs, batch_size = config['epochs'], config['batch_size']
    return model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, validation_data=val_data)

In [184]:
def pad_for_detector(input_x, kernel_size):
    """ Pad the input matrix such that the (i, k) entry of the output 
        matrix is the score of motif detector k aligned to position i.
    input_x has shape = (N, n, 4)
    kernel_size has shape m
    output has shape = (N, n + 2m - 2, 4)
    """
    N, n, C = input_x.shape
    pad_value, num_pad = 0.25, kernel_size - 1
    pad_matrix = np.full((N, num_pad, C), pad_value)
    return np.concatenate((pad_matrix, input_x, pad_matrix), axis=1)

def get_activated_subseq(activations, test_seq, m):
    #################################################################
    # Extract those in the TEST_SEQUENCE that has least one position
    #     having positive activation.
    #################################################################
    N, L, C = activations.shape
    activated_subseq = {}
    for i in range(C): # for each filter
        activated_subseq[i] = []
        activation = activations[:, :, i]
        
        # candidate and potential_start has a shape (N, )
        candidate, potential_start = np.max(activation, axis=1), np.argmax(activation, axis=1)
        
        # activated_indices should have a shape (K, ), where K = # of positive activations
        activated_indices = [i for i in range(N) if candidate[i] > 0]
        K = len(activated_indices)
        
        # activated_seq should have a shape(K, n + 2m - 2, 4)
        activated_seq, ends = test_seq[activated_indices, :, :], potential_start[activated_indices]
        starts = ends - m + 1
        for k in range(K):
            start, end = starts[k], ends[k]
            activated_subseq[i].append(activated_seq[k, start:(end+1), :])
    return activated_subseq

In [50]:
model = load_model('../../models/hybrid_net.h5')

filters = model.layers[0].get_weights()[0]
bias = model.layers[0].get_weights()[1]

In [180]:
N, n, m = 100, 1000, 15
test_seq = np.random.randint(2, size = N*n*4).reshape((N, n, 4))
padded_test_seq = pad_for_detector(test_seq, m)
padded_test_seq.shape

(100, 1028, 4)

In [181]:
layer_name = model.layers[0].get_config()['name']
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)
intermediate_output = intermediate_layer_model.predict(test_seq)

In [182]:
intermediate_output.shape

(100, 986, 30)

In [185]:
activated_subseq = get_activated_subseq(intermediate_output, test_seq, m)

In [186]:
activated_subseq

{0: [array([[1, 0, 1, 1],
         [1, 0, 0, 0],
         [1, 0, 1, 1],
         [1, 0, 1, 1],
         [1, 0, 0, 1],
         [0, 1, 1, 0],
         [0, 1, 1, 1],
         [1, 1, 0, 0],
         [1, 1, 0, 1],
         [1, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 0],
         [1, 1, 1, 1],
         [1, 0, 0, 1],
         [1, 0, 1, 0]]), array([[0, 1, 0, 0],
         [0, 0, 1, 1],
         [1, 1, 0, 1],
         [1, 1, 0, 0],
         [1, 0, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0],
         [0, 1, 1, 0],
         [1, 1, 1, 1],
         [1, 1, 0, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 0, 1],
         [0, 0, 0, 0]]), array([[1, 0, 1, 0],
         [0, 0, 1, 1],
         [1, 0, 1, 1],
         [0, 1, 1, 0],
         [0, 0, 1, 0],
         [1, 0, 0, 1],
         [1, 1, 1, 0],
         [1, 0, 0, 1],
         [0, 0, 1, 1],
         [0, 1, 0, 0],
         [0, 1, 1, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 1],
        

In [None]:
code_to_char = {[1, 0, 0, 0]: 'A',
                [0, 1, 0, 0]: 'C',
                [0, 0, 1, 0]: 'G',
                [0, 0, 0, 1]: 'T'}
for i in range(m):
    subseqs = activated_subseq[i]
    for j in range(len(subseqs)):