In [2]:
#!pip install tensorflow

In [3]:
import Bio.SeqIO as SeqIO
import numpy as np
import tensorflow
from tensorflow import keras
import matplotlib.pyplot as plt

In [19]:
def load_data(seqs_path, labels_path):
    """
    Given the path to the fasta files of the sequence and labels
    returns a list of a tuple pair of the accession of sequence/label
    and a string of sequence/string.

    :param seqs_path: file path to fasta file with amino acid sequence data
    :param labels_path: file path to fasta file with binary label data
    :return: [(accession_seq, 'amino_acid_seq')], [(accession_label, 'label')]
            ex: [('QP106', 'AASSSDD'), ...], [('QP106', '00001111000'), ...]
    """

    # Load files
    seqs = []
    for record in SeqIO.parse(seqs_path, 'fasta'):
        accession = record.description.split('|')[0]
        seq = str(record.seq)
        seqs.append((accession, seq))

    labels = []
    for record in SeqIO.parse(labels_path, 'fasta'):
        accession = record.description.split('|')[0]
        label = [int(sym) for sym in record.seq]
        labels.append((accession, label))

    return seqs, labels

In [20]:
# Parameters
sym_codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [21]:
def convert_ohc(seqs):
    """
    Converts the given tuple of the accession and amino acid sequence string into
    a int list using sym_codes and then one hot encoded the int list using keras.util.to_categorical

    :param seqs: list of tuple pair of the accession of the amino acid sequence
    and the string of the amino acid sequence ex: [('QP106', 'AASSSDD'), ...]
    :return: 2D list of each one hot encoded amino acid sequence ex: [[0,0,0,0,0,0,1], ...]
    """
    x = []
    # convert string amino acid string characters into int list
    for _, seq in seqs:
        seq_idx = [sym_codes.index(sym) for sym in seq]
        x.append(seq_idx)

    # convert list into numpy array and one hot encode int array
    x = np.array(x)
    x = keras.utils.to_categorical(x, num_classes=len(sym_codes))
    return x

In [22]:
# Load data
train_seqs, train_labels = load_data('../../inpainting_mobidb/out/train_seq.fasta', '../../inpainting_mobidb/out/train_label.fasta')

valid_seqs, valid_labels = load_data('../../inpainting_mobidb/out/validation_seq.fasta','../../inpainting_mobidb/out/validation_label.fasta')

test_seqs, test_labels = load_data('../../inpainting_mobidb/out/test_seq.fasta','../../inpainting_mobidb/out/test_label.fasta')

In [24]:
# convert character array to numpy array and one hot encode the sequences
x_train = convert_ohc(train_seqs)
x_test = convert_ohc(test_seqs)
x_valid = convert_ohc(valid_seqs)

y_train = np.array([label for _, label in train_labels])
y_test = np.array([label for _, label in test_labels])
y_valid = np.array([label for _, label in valid_labels])

In [34]:
#mask the sequences
x_train_mask = np.expand_dims(y_train, axis=2)*x_train
x_test_mask = np.expand_dims(y_test, axis=2)*x_test
x_valid = np.expand_dims(y_valid, axis=2)*x_valid

In [9]:
# Size and shape of "batch"
batch_len = len(x_train)
batch_shape = x_train.shape
print("Length of training set is " + str(batch_len) + " and the shape is " + str(batch_shape))

Length of training set is 1495 and the shape is (1495, 180, 20)


In [37]:
# make generative model
def make_generative_model():
    # convolution 
    model = tensorflow.keras.Sequential()
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())

    model.add(keras.layers.Conv2D(8, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2D(16, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2D(32, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())

    model.add(keras.layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    # ***HOW TO USE LEARNED FILTERS????*** 
    
    # deconvolution 
    model.add(keras.layers.Conv2DTranspose(128, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2DTranspose(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2DTranspose(32, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2DTranspose(16, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv2DTranspose(8, (3, 3), strides=(1, 1), padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    # ***WHAT IS THE LAST LAYER????*** 

    return model

In [39]:
generator = make_generative_model()

masked = x_train_mask
generated_image = generator(masked, training=False)

InvalidArgumentError: input must be 4-dimensional[1495,180,12544] [Op:FusedBatchNormV3]