In [1]:
import os
import glob
import ast
import pandas as pd
import numpy as np
from Bio import SeqIO

The following cell imports all the TFBS scores and transforms them into a dictionary called `all_scores`.

`all_scores` has the data structure: `{species: {motif: {raw_position: score}}}`.

In [2]:
# A helper function to extract the motif name from the csv name.
def get_motif(name):
    if 'cad_FlyReg.fm' in name:
        return 'cad_FlyReg.fm'
    if 'hb_nar2008.fm' in name:
        return 'hb_nar2008.fm'
    if 'bcd_FlyReg.fm' in name:
        return 'bcd_FlyReg.fm'

path = '../data/input/5_TFBS_score_subset_30May2018'
all_csvs = glob.glob(path + '/*.csv')
all_scores = {}
for csv_ in all_csvs:
    with open(csv_, encoding='utf-8') as csv_file:
        motif = get_motif(csv_file.name)
        for a_line in csv_file:
            curr_line = a_line.split('\t')
            strand = curr_line[6]
            if strand == 'positive\n':
                score = float(curr_line[2])
                species = curr_line[4]
                raw_position = int(curr_line[5])
                if species not in all_scores:
                    all_scores[species] = {}
                if motif not in all_scores[species]:
                    all_scores[species][motif] = {}
                all_scores[species][motif][raw_position] = score

The aim of the following cell is to produce a one-hot encoding scheme with TFBS scores embedded for each DNA sequence segment.

It consists of four parts:

1. Read in all DNA sequence segments.
2. Transform each position of the DNA sequence into a 4-letter one-hot encoding based on the `base_pairs` dictionary.
3. For each position, attach the TFBS scores to the end of the one-hot encoding.
4. Output the final encoding into `txt` files for bookkeeping.

In [3]:
# Use the following dictionary to perform the transformation
base_pairs = {'A': [1, 0, 0, 0], 
              'C': [0, 1, 0, 0],
              'G': [0, 0, 1, 0],
              'T': [0, 0, 0, 1],
              'a': [1, 0, 0, 0],
              'c': [0, 1, 0, 0],
              'g': [0, 0, 1, 0],
              't': [0, 0, 0, 1],
              'n': [0, 0, 0, 0],
              'N': [0, 0, 0, 0]}

file_num_limit = 110    # The maximum number of files to be decoded
file_count = 0

# Iterate through every file
for file in os.listdir("../data/input/3.24_species_only"):
    one_hot = []
    to_write = False
    # When the number of file decoded has reached the limit, stop
    if file_count < file_num_limit:
        data = list(SeqIO.parse("../data/input/3.24_species_only/" + file,"fasta"))
        for n in range(0, len(data)):
            # Extract the header information
            header = data[n].description.split('|')
            descr = data[n].description
            regionID = header[0]
            expressed = header[1]
            speciesID = header[2]
            strand = header[3]
            # Complement all sequences in the negative DNA strand
            if strand == '-':
                # Using the syntax [e for e in base_pairs[n]] to create a new pointer for each position
                one_hot.append([descr, expressed, speciesID, [[e for e in base_pairs[n]] for n in data[n].seq.complement()]])
            else:
                one_hot.append([descr, expressed, speciesID, [[e for e in base_pairs[n]] for n in data[n].seq]])
        # Attach the TFBS scores to the end of each position
        for item in one_hot:
            # Only outputs sequences that currently have TFBS scores
            # Ignore all sequences that do not have TFBS scores yet
            if descr in all_scores:
                to_write = True
                i = 0
                for encoding in item[3]:
                    # Take care of positions that do not have TFBS scores, attaching 0 as placeholder (i.e. NA)
                    if i not in all_scores[descr]['cad_FlyReg.fm']:
                        encoding.extend([0, 0, 0])
                    else:
                        encoding.append(all_scores[descr]['cad_FlyReg.fm'][i])
                        encoding.append(all_scores[descr]['hb_nar2008.fm'][i])
                        encoding.append(all_scores[descr]['bcd_FlyReg.fm'][i])
                    i += 1
                # Write the final encoding into txt files
        if to_write:
            with open("../data/output/" + regionID + ".txt", mode="w", encoding='utf-8') as output:
                output.write(str(one_hot))
            file_count += 1

The rest of the notebook uses the one-hot encoding files produced above to build a neural network prototype to make sure everything works as intended.

The following cell reads in one-hot encoding files as a list `seq_record_list`.

In [4]:
path = '../data/output'
all_txts = glob.glob(path + '/*.txt')
seq_record_list = []
# Iterate through all one-hot encoding files
for txt_ in all_txts:
    with open(txt_, encoding='utf-8') as f:
        # attach the one-hot encoding information of this file to the end of seq_record_list
        seq_record_list += ast.literal_eval(f.read())
len(seq_record_list)

2640

The following cell transforms the data into a format that is recognizable by the neural network model.

In [5]:
# A helper function to flatten a 2d list to 1d.
# Input: [[1, 2], [2, 3], [3, 4, 5]]
# Output: [1, 2, 2, 3, 3, 4, 5]
def flatten(lst):
    new_lst = []
    for sub_lst in lst:
        for item in sub_lst:
            new_lst.append(item)
    return new_lst

# A helper function to transform a lst so that its length becomes read_len by:
# 1. If len(lst) > read_len, curtail the end of the lst.
# 2. If len(lst) < read_len, keep extending the end of the lst with 0 (NA).
def curtail(lst, read_len):
    if len(lst) > read_len:
        lst = lst[:read_len]
    else:
        for i in range(read_len - len(lst)):
            lst.append([0, 0, 0, 0, 0, 0, 0])
    return lst

# Produce the train-test split
# length_read: the length that you want all DNA sequences to conform to
def prepare_input(training_size, test_size, length_read):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    seq_count = 0
    while seq_count < training_size:
        X_train.append(flatten(curtail(seq_record_list[seq_count][3], length_read)))
        y_train.append(int(seq_record_list[seq_count][1]))
        seq_count += 1
    while seq_count < (training_size + test_size):
        X_test.append(flatten(curtail(seq_record_list[seq_count][3], length_read)))
        y_test.append(int(seq_record_list[seq_count][1]))
        seq_count += 1
    return X_train, y_train, X_test, y_test

# Turn list into numpy tensors that can directly feed into a neural network model
def to_np_array(X_train, y_train, X_test, y_test):
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    if len(y_train.shape) == 1:
        y_train = np.transpose(np.array([y_train]))
    X_test = np.array(X_test)
    y_test = np.transpose(np.array(y_test))
    if len(y_test.shape) == 1:
        y_test = np.transpose(np.array([y_test]))
    return X_train, y_train, X_test, y_test

In [6]:
X_train, y_train, X_test, y_test = prepare_input(2112, 528, 2000)
X_train, y_train, X_test, y_test = to_np_array(X_train, y_train, X_test, y_test)
[X_train.shape, y_train.shape, X_test.shape, y_test.shape]

[(2112, 14000), (2112, 1), (528, 14000), (528, 1)]

In [7]:
from sklearn import linear_model as lm
from keras.layers import Input, Dense
from keras.models import Model, Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


The following is a logistic regression model, used just to check that everything works as intended.

In [8]:
model = lm.LogisticRegression()
model.fit(X_train, y_train.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
y_predicted = np.array(model.predict(X_test))
round(sum(y_test.ravel() == y_predicted)/y_test.shape[0], 3)

0.642

The rest of the cells build a basic neural network, just for the sake of testing whether the data is usable.

Using this neural network prototype, we can achieve a test accuracy of 0.652, indicating substantial room for improvement.

In [10]:
def train_nn(X_train, y_train, pr):
    model = Sequential()
    model.add(Dense(units=1000, activation='relu', input_dim=14000))
    model.add(Dense(units=400, activation='relu'))
    model.add(Dense(units=40, activation='relu'))
    model.add(Dense(units=10, activation='elu'))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer = 'SGD',
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])
    model.fit(X_train, y_train, batch_size=100, epochs=5, verbose = pr)
    return model

In [11]:
def test_accuracy(model, X_test, y_test):
    result = model.predict(X_test)
    correct = list(np.apply_along_axis(lambda x: 0 if x<0.5 else 1, 1, result))==y_test.ravel()
    return round(sum(correct)/y_test.shape[0], 3)

In [12]:
test_accuracy(train_nn(X_train, y_train, 1), X_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.652