**Date**: 2018-08-23

**Authors**: Zhanyuan Zhang

**Purpose**: Apply a convolutional neural network (CNN) to train the binary classification model.
- Use operations like 1D convolution, maxpooling, and dropout to improve accuracy.
- Add bias in layers.
- Use ReLU activations in the convolutional and hidden dense layers, and a sigmoid output for the final binary classification.

**Background**: The current model, trained with a recurrent neural network (RNN), gives low accuracy. The reason we switched from CNN to RNN was that this somehow improved the accuracy by 10%. However, given the math behind these two neural networks, a CNN should be better at handling spatial data, which is our case, since the order in a nucleotide sequence does matter.

**Experiment**:

In [2]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from utility import flatten
from utility import curtail
from utility import prepare_input
from utility import to_np_array
from utility import unpickle

  (fname, cnt))
  (fname, cnt))


In [5]:
# --- Experiment configuration -------------------------------------------
# Numeric settings forwarded to `prepare_input` further down.
motif_num = 3
curtail_len = 3000

# Serialized sequence records used for this run.
real_buffer_path = "/home/ubuntu/data2/10_percent/random_0.1_instance_7.txt"
# Alternative input kept for reference (not used in this run):
# random_buffer_path = "/home/ubuntu/formatted/random_sequences/random_sequence_buffer.txt"

In [6]:
# Load the sequence records stored at `real_buffer_path`.
# NOTE(review): `unpickle` presumably wraps `pickle.load`; only point it at
# trusted files, since unpickling can execute arbitrary code -- confirm.
seq_record_list = unpickle(real_buffer_path)
# Display how many records were loaded.
len(seq_record_list)

8088

In [7]:
import random
from random import shuffle

# Records are consumed in consecutive groups; every group holds the
# sequences that come from the same DNA section.
GROUP_SIZE = 24
# Inclusive bounds on how many sequences of each group go to the
# training/validation set. `random.randint` includes both endpoints, so a
# group may contribute all of its sequences to training and none to testing.
MIN_TRAIN_PER_GROUP = 18
MAX_TRAIN_PER_GROUP = 24

first_list = []  # to add to training set
second_list = []  # to add to test set
current = []  # contains all sequences from the same DNA section

# Drain the record list from the end, one record at a time, until it is empty.
while seq_record_list:
    current.append(seq_record_list.pop())
    if len(current) == GROUP_SIZE:
        shuffle(current)  # Shuffle the sequences from the same DNA section
        # Number of sequences from this group allocated to the training set.
        random_select = random.randint(MIN_TRAIN_PER_GROUP, MAX_TRAIN_PER_GROUP)
        first_list.extend(current[:random_select])
        second_list.extend(current[random_select:])
        current = []

# A trailing partial group cannot be split the same way; it is left out of
# both sets, but say so instead of dropping it silently.
if current:
    print("Warning: " + str(len(current)) + " trailing sequences do not form a full group of "
          + str(GROUP_SIZE) + " and were left out of both sets")

shuffle(first_list)  # Shuffle again to eliminate dependencies
shuffle(second_list)  # Shuffle again to eliminate dependencies

# Training/validation records first, test records last.
seq_record_list = first_list + second_list

print("Number of sequences in training/validation set are: " + str(len(first_list)))
print("Number of sequences in testing set are: " + str(len(second_list)))

Number of sequences in training/validation set are: 7114
Number of sequences in testing set are: 974


In [8]:
# Sizes of the two splits, taken from the lists built during the split step.
train_val_num, test_num = len(first_list), len(second_list)

In [9]:
# Build the train/test features and labels from the ordered record list,
# then convert all four pieces to NumPy arrays.
raw_splits = prepare_input(train_val_num, test_num, curtail_len, seq_record_list, motif_num)
X_train, y_train, X_test, y_test = to_np_array(*raw_splits)

# Check the shape of training and testing data
[arr.shape for arr in (X_train, y_train, X_test, y_test)]

[(7114, 21000), (7114, 1), (974, 21000), (974, 1)]

In [10]:
from keras.models import Model, Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Dropout, Flatten
from keras.activations import relu
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Conv1D is declared with input_shape=(21000, 1), so append a trailing
# axis of length 1 to the 2-D training matrix.
X_train_cnn = X_train[:, :, np.newaxis]
X_train_cnn.shape

(7114, 21000, 1)

In [12]:
# Learning rate handed to the optimizer when the model is compiled.
LR = 5e-2

# (filters, kernel_size) for each convolution stage, in order. The filter
# count doubles while the kernel width halves from one stage to the next.
conv_stages = [(1, 168), (2, 84), (4, 42), (8, 21)]

model = Sequential()
for stage_idx, (n_filters, kernel_width) in enumerate(conv_stages):
    # Only the first layer of the stack is given the input shape.
    shape_kwargs = {"input_shape": (21000, 1)} if stage_idx == 0 else {}
    model.add(Conv1D(filters=n_filters,
                     kernel_size=kernel_width,
                     activation="relu",
                     use_bias=True,
                     **shape_kwargs))
    # Every convolution is followed by max pooling with default settings.
    model.add(MaxPooling1D())

# Classifier head: flatten, one wide ReLU layer with dropout, then a single
# sigmoid unit for the binary decision.
model.add(Flatten())
model.add(Dense(1000, activation="relu"))
model.add(Dropout(0.7))
model.add(Dense(1, activation="sigmoid"))

In [13]:
# Configure training: Adam at the chosen learning rate, binary
# cross-entropy loss, accuracy as the reported metric.
adam_optimizer = Adam(lr=LR)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['acc'])

# Train for 30 epochs in mini-batches of 128, holding out 10% of the
# training data for validation; keep the returned history object.
history = model.fit(
    X_train_cnn,
    y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.1,
)

Train on 6402 samples, validate on 712 samples
Epoch 1/30


ResourceExhaustedError: OOM when allocating tensor with shape[1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense_2/bias/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_2/bias, conv1d_1/Const)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'dense_2/bias/Assign', defined at:
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/asyncio/base_events.py", line 1431, in _run_once
    handle._run()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tornado/ioloop.py", line 759, in _run_callback
    ret = callback()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2909, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-52836be6c34b>", line 14, in <module>
    model.add(Dense(1, activation="sigmoid"))
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/models.py", line 522, in add
    output_tensor = layer(self.outputs[0])
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/topology.py", line 592, in __call__
    self.build(input_shapes[0])
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/layers/core.py", line 870, in build
    constraint=self.bias_constraint)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/topology.py", line 416, in add_weight
    constraint=constraint)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 396, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 387, in _init_from_args
    validate_shape=validate_shape).op
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 283, in assign
    validate_shape=validate_shape)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
    use_locking=use_locking, name=name)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: dense_2/bias/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_2/bias, conv1d_1/Const)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

