Date: 2018-9-28

Author: Adam Stafford

Purpose: Testing a multiclass classification scheme, where gene expression is separated by developmental stage.

Background:

* This experiment is a modification of the previous experiments in which transcription factor binding site data and DNA sequences were used to predict gene expression (regardless of developmental stage), to test whether separating developmental stages would improve accuracy.
* GPU is necessary to run this notebook.

Experiment:

In [2]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

ModuleNotFoundError: No module named 'numpy'

Path variables. `classes_path` is new in this experiment and refers to the path of the file storing the developmental stages of the expression data.

In [None]:
# Pickled buffer of sequence records; it is read with pickle.load when the data is loaded.
real_buffer_path = "/home/ubuntu/newOutput/10_percent/random_0.1_instance_7.txt"
# Buffer of random sequences. NOTE(review): not referenced by any cell shown in this notebook export.
random_buffer_path = "/home/ubuntu/newOutput/random_sequences/random_sequence_buffer.txt"
# CSV table with the developmental stages; fields 1 (VTID) and 3 (stage) of each line are used.
classes_path = "/home/ubuntu/data/team_neural_network/data/input/classification_table3_21Aug2018.csv"
# Every sequence is cut or zero-padded to this many positions before being fed to the model.
curtail_len = 3000
# Each position of a sequence carries motif_num + 4 values
# (presumably motif_num motif scores plus 4 base channels -- TODO confirm).
motif_num = 3

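This script augments the existing data (whose expression column is restricted to 0 for no expression, 1 for expression) with a string containing the developmental stage data. The stage string is appended as a new last column by merging on the VTID, and it is this column that is used as the label later on.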

In [None]:
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# point real_buffer_path at a buffer produced by a trusted pipeline.
with open(real_buffer_path, "rb") as buff:
    seq_record_list = pickle.load(buff)

# Collect [vtid, stage] pairs from the classification table.
# Lines are split naively on ','; this assumes no field contains an
# embedded comma (TODO confirm against the CSV).
expression = []
with open(classes_path, encoding='utf-8') as csv_file:
    for a_line in csv_file:
        fields = a_line.split(',')
        stage, vtid = fields[3], fields[1]
        if vtid == '"VTID"':
            # Header row: carries the column name instead of an id.
            continue
        expression.append([vtid, stage])

# Normalise the ids on both sides so that they can be joined:
# - sequence records keep the text before the first '|' minus its first
#   two characters;
# - CSV ids lose their first three characters and their last character
#   (presumably the surrounding quotes and a two-letter prefix).
for record in seq_record_list:
    record[0] = record[0].split('|')[0][2:]
for pair in expression:
    pair[0] = pair[0][3:-1]

seq_record_list = pd.DataFrame(seq_record_list)
seq_record_list.columns = ['vtid', 1, 2, 3]
expression = pd.DataFrame(expression)
expression.columns = ['vtid', 1]

# Join on the id (default inner join) and go back to a plain list of rows;
# the stage string ends up as the last element of every row.
seq_record_list = seq_record_list.merge(expression, on='vtid').values.tolist()

The following cell transforms the data into a format that is recognizable by the neural network model.

In [None]:
import random
from random import shuffle

first_list = []   # rows destined for the training/validation set
second_list = []  # rows destined for the test set
current = []      # the group of 24 sequences from the same DNA section being filled

# Drain seq_record_list from the end, handling one group of 24 at a time.
# If the total is not a multiple of 24, the trailing partial group stays in
# `current` and ends up in neither set.
while seq_record_list:
    current.append(seq_record_list.pop())
    if len(current) < 24:
        continue
    shuffle(current)  # randomise the order inside the group
    # randint is inclusive on both ends: 18 to 24 rows go to training,
    # so the remaining 0 to 6 rows of the group go to testing.
    random_select = random.randint(18, 24)
    first_list += current[:random_select]
    second_list += current[random_select:]
    current = []

# Shuffle each set once more so rows of the same group are no longer adjacent.
shuffle(first_list)
shuffle(second_list)

# Training rows first, test rows after: later cells slice by these sizes.
seq_record_list = first_list + second_list

print(f"Number of sequences in training/validation set are: {len(first_list)}")
print(f"Number of sequences in testing set are: {len(second_list)}")

In [None]:
# Sizes of the two partitions, in the order they are laid out in seq_record_list.
train_val_num, test_num = len(first_list), len(second_list)

In [None]:
def flatten(lst):
    """Concatenate the items of every sub-sequence of ``lst`` into one flat list.

    Only one level of nesting is removed.

    Example:
        [[1, 2], [2, 3], [3, 4, 5]] -> [1, 2, 2, 3, 3, 4, 5]
    """
    return [item for sub_lst in lst for item in sub_lst]

def curtail(lst, read_len, motif_number):
    """Return a copy of ``lst`` whose length is exactly ``read_len``.

    * If ``lst`` is longer than ``read_len``, the tail is cut off.
    * If ``lst`` is shorter, it is extended at the end with rows of zeros
      (NA), each ``motif_number + 4`` values wide.

    The input list is never modified: padding used to be appended to the
    caller's list in place, which silently changed the stored records.

    Args:
        lst: list of per-position rows.
        read_len: target number of rows.
        motif_number: number of motif values per row; a padding row has
            ``motif_number + 4`` zeros.

    Returns:
        A new list with ``read_len`` rows.
    """
    if len(lst) >= read_len:
        return lst[:read_len]
    row_width = motif_number + 4
    # Build a fresh list for every padding row so rows never share state.
    padding = [[0] * row_width for _ in range(read_len - len(lst))]
    return list(lst) + padding

def prepare_input(training_size, test_size, length_read, original_list, motif_number):
    """Produce the train-test split from an already ordered list of records.

    The first ``training_size`` records of ``original_list`` become the
    training set and the following ``test_size`` records the test set.
    For every record, element ``[3]`` is passed through ``curtail`` and
    ``flatten`` to obtain the feature vector, and element ``[4]`` is used
    as the label.

    Args:
        training_size: number of leading records used for training.
        test_size: number of records after those used for testing.
        length_read: the length that all DNA sequences are made to conform to.
        original_list: list of records, training records first.
        motif_number: forwarded to ``curtail`` to size the padding rows.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as plain Python lists.
    """
    def encode(position):
        # Feature vector and label of the record at ``position``.
        record = original_list[position]
        features = flatten(curtail(record[3], length_read, motif_number))
        return features, record[4]

    X_train, y_train, X_test, y_test = [], [], [], []

    # The test range starts where the training range stopped (never below 0).
    train_stop = max(training_size, 0)

    for position in range(train_stop):
        features, label = encode(position)
        X_train.append(features)
        y_train.append(label)

    for position in range(train_stop, training_size + test_size):
        features, label = encode(position)
        X_test.append(features)
        y_test.append(label)

    return X_train, y_train, X_test, y_test

def _as_label_array(labels):
    """Convert ``labels`` to an array, turning a 1-D sequence into a column."""
    labels = np.array(labels)
    if len(labels.shape) == 1:
        # (n,) -> (n, 1), one label per row.
        labels = labels.reshape(-1, 1)
    return labels


def to_np_array(X_train, y_train, X_test, y_test):
    """Turn the split lists into numpy arrays that can be fed to a model.

    Feature lists are converted with ``np.array``. Label lists are
    converted the same way for both sets: a 1-D label list becomes a
    column of shape ``(n, 1)``, and labels that are already 2-D keep their
    shape. The test labels used to go through an extra transpose, which
    gave 2-D test labels a different layout from the training labels.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as numpy arrays.
    """
    X_train = np.array(X_train)
    y_train = _as_label_array(y_train)
    X_test = np.array(X_test)
    y_test = _as_label_array(y_test)
    return X_train, y_train, X_test, y_test

In [None]:
# Split the records, then convert the four lists to numpy arrays in one go.
X_train, y_train, X_test, y_test = to_np_array(
    *prepare_input(train_val_num, test_num, curtail_len, seq_record_list, motif_num)
)
# Display the resulting shapes as the cell output.
[arr.shape for arr in (X_train, y_train, X_test, y_test)]

In [None]:
import re

The following code transforms the development stage strings into one-hot vectors of length 64: each of the six stages contributes one bit to the class index.

In [None]:
# Each developmental-stage token and the bit it contributes to the class index.
# Six stages give 2**6 = 64 possible combinations.
STAGE_BITS = (
    ('4_6', 1),
    ('7_8', 2),
    ('9_10', 4),
    ('11_12', 8),
    ('13_14', 16),
    ('15_16', 32),
)


def encode_stage_labels(labels):
    """One-hot encode developmental-stage strings.

    For every row, ``row[0]`` is searched for each stage token; the bits of
    the tokens that are found are summed into a class index in ``0..63``,
    and the row is encoded as a length-64 vector with a single 1 at that
    index.

    Args:
        labels: iterable of rows whose first element is the stage string.

    Returns:
        numpy array with one length-64 one-hot row per input row.
    """
    encoded = []
    for row in labels:
        class_index = sum(bit for token, bit in STAGE_BITS if re.search(token, row[0]))
        one_hot = [0] * 64
        one_hot[class_index] = 1
        encoded.append(one_hot)
    return np.asarray(encoded)


y_train_new = encode_stage_labels(y_train)
# Encode the test labels from y_test; this previously looped over y_train
# again, so y_test_new was just a copy of the training labels.
y_test_new = encode_stage_labels(y_test)

The following cells build an RNN with two LSTM layers of 128 units each, followed by a dense output layer with 64 units:

In [None]:
from keras.models import Model, Sequential
from keras.layers import Dense, CuDNNLSTM, CuDNNGRU

In [None]:
# Restore the (samples, positions, values-per-position) layout that was
# flattened when the features were prepared.
rnn_input_shape = (train_val_num, curtail_len, motif_num + 4)
X_train_rnn = X_train.reshape(rnn_input_shape)

In [None]:
# Two stacked CuDNN LSTM layers followed by a 64-way classification layer.
model = Sequential()
# return_sequences=True so the second LSTM receives the output of every
# position rather than only the last one.
model.add(CuDNNLSTM(128, input_shape=(curtail_len, motif_num + 4), return_sequences=True))
model.add(CuDNNLSTM(128))
# The targets are one-hot vectors over 64 mutually exclusive classes and the
# loss is categorical_crossentropy, so the output must be a probability
# distribution: use softmax (sigmoid scores each unit independently).
model.add(Dense(64, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# validation_split=0.1 holds out 10% of the training data for validation.
history = model.fit(X_train_rnn, y_train_new, epochs=5, batch_size=128, validation_split=0.1)

Plot the data:

In [None]:
# Per-epoch metrics recorded during training.
metrics = history.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)

# One figure per metric: training values as dots, validation values as a line.
curves = (
    (acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'Training and Validation Accuracy'),
    (loss, val_loss, 'Training Loss', 'Validation Loss', 'Training and Validation Loss'),
)
for position, (train_curve, val_curve, train_label, val_label, title) in enumerate(curves):
    if position > 0:
        # The first plot uses the implicit figure; later ones get a new figure.
        plt.figure()
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(title)
    plt.xlabel('epoches')
    plt.legend()

plt.show()