In [1]:
%load_ext autoreload
%autoreload 2

In [59]:
import os, shutil, pickle, shelve
from Bio import SeqIO
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import load_model, Model
from keras.utils import plot_model
from keras.models import Model, Sequential
from keras.models import Sequential
from keras.layers import Dense, Dropout, MaxPooling1D, Flatten, Conv1D, LSTM, CuDNNLSTM, Bidirectional, TimeDistributed
from keras.optimizers import RMSprop, Adam
from keras.initializers import glorot_normal
from utilities import sampling, one_hot_encoding, curtail, get_training_data, load_data, data_split, dianostic_plots, pad_for_detector
from utilities import get_char_list, get_activated_subseq, get_freqs, get_candidates, get_motif
import keras
import utilities

In [4]:
# Subset sampling: save to me_samples; "me" for "mutually exclusive"
output_folder_path = "../../../../temp/buffers/me_samples"

data_dir = "/home/ubuntu/group_volume/team_neural_network/data/input/3.24_species_only"

len(os.listdir(data_dir)) # total number of regions

3543

### Start creating 5 folds of training data

In [18]:
L = len(os.listdir(data_dir))
k = 5
r = 1/(k + 1)
partition = [int(n*r*L) for n in range(0, k)] + [L]

In [19]:
partition

[0, 590, 1181, 1771, 2362, 3543]

In [22]:
folds = [np.arange(partition[i], partition[i+1]) for i in range(k)]

In [29]:
all_data_lst = np.array(os.listdir(data_dir))
fold_root_dir = '/home/ubuntu/data/temp/folds'
for i in range(k):
    fold_files = all_data_lst[folds[i]]
    fold_dir = os.path.join(fold_root_dir, 'fold'+str(i+1))
    for file in fold_files:
        shutil.copy(os.path.join(data_dir, file),
                              fold_dir)
    print('copied training samples to {}'.format(fold_dir))

copied training samples to /home/ubuntu/data/temp/folds/fold1
copied training samples to /home/ubuntu/data/temp/folds/fold2
copied training samples to /home/ubuntu/data/temp/folds/fold3
copied training samples to /home/ubuntu/data/temp/folds/fold4
copied training samples to /home/ubuntu/data/temp/folds/fold5


In [43]:
k_fold_data = []
i = 0
fold_output_path = '/home/ubuntu/data/temp/buffers/folds/'
for i in range(k):
    one_fold_output_path = os.path.join(fold_output_path, 'fold'+str(i+1))
    all_regions = one_hot_encoding(os.path.join(fold_root_dir, 'fold'+str(i+1)+'/'),
                                   os.path.join(one_fold_output_path, 'fold'+str(i+1)+'.data'))
    temp_x, temp_y = get_training_data(all_regions, one_fold_output_path,
                                       1000, 'fold'+str(i+1)+'_x.data', 'fold'+str(i+1)+'_y.data')
    k_fold_data.append((temp_x, temp_y))

save to /home/ubuntu/data/temp/buffers/folds/fold1/fold1.data
(14160, 1000, 4) (14160,)
save to /home/ubuntu/data/temp/buffers/folds/fold1/fold1_x.data
save to /home/ubuntu/data/temp/buffers/folds/fold1/fold1_y.data
save to /home/ubuntu/data/temp/buffers/folds/fold2/fold2.data
(14184, 1000, 4) (14184,)
save to /home/ubuntu/data/temp/buffers/folds/fold2/fold2_x.data
save to /home/ubuntu/data/temp/buffers/folds/fold2/fold2_y.data
save to /home/ubuntu/data/temp/buffers/folds/fold3/fold3.data
(14160, 1000, 4) (14160,)
save to /home/ubuntu/data/temp/buffers/folds/fold3/fold3_x.data
save to /home/ubuntu/data/temp/buffers/folds/fold3/fold3_y.data
save to /home/ubuntu/data/temp/buffers/folds/fold4/fold4.data
(14184, 1000, 4) (14184,)
save to /home/ubuntu/data/temp/buffers/folds/fold4/fold4_x.data
save to /home/ubuntu/data/temp/buffers/folds/fold4/fold4_y.data
save to /home/ubuntu/data/temp/buffers/folds/fold5/fold5.data
(28344, 1000, 4) (28344,)
save to /home/ubuntu/data/temp/buffers/folds/fol

### Finish creating 5 folds of data

## Start training model and extracting motifs on all 5 folds

In [44]:
data_x = pickle.load(open('/home/ubuntu/data/temp/buffers/folds/fold5/fold5_x.data', 'rb'))
data_y = pickle.load(open('/home/ubuntu/data/temp/buffers/folds/fold5/fold5_y.data', 'rb'))
train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)
train_x, val_x = pad_for_detector(train_x, 10), pad_for_detector(val_x, 10)

28344 (22676, 1000, 4) (22676,) (5668, 1000, 4) (5668,)


In [62]:
n = 15 # number of filters
m = 10 # filter size
def get_hybrid(opt, num_filters, kernel_size):
    """  Return a hybrid network given a optimizer
    """
    model = Sequential()
    model.add(Conv1D(filters = num_filters, 
                     kernel_size = kernel_size, 
                     padding = 'valid',
                     activation = 'relu'))
    model.add(MaxPooling1D(pool_size = 5, strides = 5))
    model.add(Dropout(0.1))
    #model.add(Bidirectional(LSTM(20)))
    #model.add(Bidirectional(CuDNNLSTM(15, return_sequences=True)))
    model.add(Bidirectional(CuDNNLSTM(20)))
    model.add(Dropout(0.2))
    model.add(Dense(50))
    model.add(keras.layers.LeakyReLU(alpha=0.3))
    model.add(Dense(20))
    model.add(keras.layers.LeakyReLU(alpha=0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])
    return model


def train(model, train_x, train_y, val_data, config = {'epochs': 35, 'batch_size': 256}):
    """  Train model for a given config, training data, and validation data
    """
    epochs, batch_size = config['epochs'], config['batch_size']
    return model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, validation_data=val_data)

In [85]:
# Start training
# Set up some configurations
optimizers = {'adam': Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-2, amsgrad=False),
              'rmsprop': RMSprop(lr=1e-2, rho=0.9, epsilon=None, decay=1e-2)}
config = {'epochs': 100, 'batch_size': 562}
opt = optimizers['rmsprop']

In [None]:
save_model_path = '/home/ubuntu/data/team_neural_network/code/models'
k_fold_history = []
for i in range(1, k+1):
    print("************************************")
    print("Training on fold {}".format(i))
    
    data_x_path = '/home/ubuntu/data/temp/buffers/folds/fold{}/fold{}_x.data'.format(i, i)
    data_y_path = '/home/ubuntu/data/temp/buffers/folds/fold{}/fold{}_y.data'.format(i, i)
    # Data preparation
    data_x = pickle.load(open(data_x_path, 'rb'))
    data_y = pickle.load(open(data_y_path, 'rb'))
    train_x, train_y, val_x, val_y = data_split(data_x, data_y, seed = 157)
    train_x, val_x = pad_for_detector(train_x, m), pad_for_detector(val_x, 10)
    
    # Training
    model = get_hybrid(opt, num_filters = n, kernel_size = m)
    history = train(model, train_x, train_y, (val_x, val_y), config)
    
    # Saving models
    k_fold_history.append(history)
    model_name = 'hybrid_net_fold' + str(i) + '.h5'
    model.save(os.path.join(save_model_path, model_name))
    print("************************************")

************************************
Training on fold 1
14160 (11328, 1000, 4) (11328,) (2832, 1000, 4) (2832,)
Train on 11328 samples, validate on 2832 samples
Epoch 1/100
Epoch 2/100

In [None]:
motifs_from_k_folds = {i:[] for i in range(1, k+1)}

for i in range(1, k+1):
    model_name = 'hybrid_net_fold' + str(i) + '.h5'
    model = load_model(os.path.join('../../models/', model_name))

    filters = model.layers[0].get_weights()[0]
    bias = model.layers[0].get_weights()[1]

    layer_name = model.layers[0].get_config()['name']
    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=model.get_layer(layer_name).output)
    intermediate_output = intermediate_layer_model.predict(val_x)

    activated_subseq = get_activated_subseq(intermediate_output, val_x, m)
    for k in list(activated_subseq):
        char_list = get_char_list(activated_subseq[k])
        uniques, freqs = get_freqs(char_list)
        candidates = get_candidates(uniques, freqs, 0.45)
        motifs_from_k_folds[i].append(get_motif(candidates))

In [None]:
k = 5
for i in range(1, k+1):
    print("**************************")
    print("motifs extracted from fold {}\n".format(i))
    for seq in motifs_from_k_folds[i]:
        print(seq)
    print("**************************")