In [2]:
#!pip install tensorflow

In [1]:
import Bio.SeqIO as SeqIO
import numpy as np
import tensorflow
from tensorflow import keras
import matplotlib.pyplot as plt

In [2]:
def load_data(seqs_path, labels_path):
    """
    Given the path to the fasta files of the sequence and labels
    returns a list of a tuple pair of the accession of sequence/label
    and a string of sequence/string.

    :param seqs_path: file path to fasta file with amino acid sequence data
    :param labels_path: file path to fasta file with binary label data
    :return: [(accession_seq, 'amino_acid_seq')], [(accession_label, 'label')]
            ex: [('QP106', 'AASSSDD'), ...], [('QP106', '00001111000'), ...]
    """

    # Load files
    seqs = []
    for record in SeqIO.parse(seqs_path, 'fasta'):
        accession = record.description.split('|')[0]
        seq = str(record.seq)
        seqs.append((accession, seq))

    labels = []
    for record in SeqIO.parse(labels_path, 'fasta'):
        accession = record.description.split('|')[0]
        label = [int(sym) for sym in record.seq]
        labels.append((accession, label))

    return seqs, labels

In [3]:
# Parameters
sym_codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [4]:
def convert_ohc(seqs):
    """
    Converts the given tuple of the accession and amino acid sequence string into
    a int list using sym_codes and then one hot encoded the int list using keras.util.to_categorical

    :param seqs: list of tuple pair of the accession of the amino acid sequence
    and the string of the amino acid sequence ex: [('QP106', 'AASSSDD'), ...]
    :return: 2D list of each one hot encoded amino acid sequence ex: [[0,0,0,0,0,0,1], ...]
    """
    x = []
    # convert string amino acid string characters into int list
    for _, seq in seqs:
        seq_idx = [sym_codes.index(sym) for sym in seq]
        x.append(seq_idx)

    # convert list into numpy array and one hot encode int array
    x = np.array(x)
    x = keras.utils.to_categorical(x, num_classes=len(sym_codes))
    return x

In [5]:
# Load data
train_seqs, train_labels = load_data('../../inpainting_mobidb/out/train_seq.fasta', '../../inpainting_mobidb/out/train_label.fasta')

valid_seqs, valid_labels = load_data('../../inpainting_mobidb/out/validation_seq.fasta','../../inpainting_mobidb/out/validation_label.fasta')

test_seqs, test_labels = load_data('../../inpainting_mobidb/out/test_seq.fasta','../../inpainting_mobidb/out/test_label.fasta')

In [6]:
# convert character array to numpy array and one hot encode the sequences
x_train = convert_ohc(train_seqs)
x_test = convert_ohc(test_seqs)
x_valid = convert_ohc(valid_seqs)

y_train = np.array([label for _, label in train_labels])
y_test = np.array([label for _, label in test_labels])
y_valid = np.array([label for _, label in valid_labels])

In [7]:
#mask the sequences
x_train_mask = np.expand_dims(y_train, axis=2)*x_train
x_test_mask = np.expand_dims(y_test, axis=2)*x_test
x_valid = np.expand_dims(y_valid, axis=2)*x_valid

In [8]:
# Size and shape of "batch"
batch_len = len(x_train)
batch_shape = x_train.shape
print("Length of training set is " + str(batch_len) + " and the shape is " + str(batch_shape))

Length of training set is 1495 and the shape is (1495, 180, 20)


# Generator Model

In [30]:
# make generative model
def make_generative_model():
    # convolution 
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))

    model.add(keras.layers.Conv1D(8, 3, strides = 1, padding='same', name='first'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())

    model.add(keras.layers.Conv1D(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(256, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    # deconvolution 
    model.add(keras.layers.Conv1DTranspose(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(8, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    #PLAY AROUND WITH THE RATIO OF FILTER 
    
    model.add(keras.layers.Conv1DTranspose(20, 3, strides = 1, padding='same', activation = 'softmax'))

    return model

In [31]:
x_train_mask[0].shape

(180, 20)

In [32]:
generator = make_generative_model()
generator.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (Conv1D)               (None, 180, 8)            488       
_________________________________________________________________
batch_normalization_81 (Batc (None, 180, 8)            32        
_________________________________________________________________
re_lu_81 (ReLU)              (None, 180, 8)            0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 180, 16)           400       
_________________________________________________________________
batch_normalization_82 (Batc (None, 180, 16)           64        
_________________________________________________________________
re_lu_82 (ReLU)              (None, 180, 16)           0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 180, 32)         

In [35]:
masked = x_train_mask[0:5]
masked.shape

(5, 180, 20)

In [36]:
generator = make_generative_model()

#masked = x_train_mask[0]
generated_image = generator.predict(masked)
generated_image

array([[[0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],


# Discriminator Model

In [44]:
# make discrimator model
def make_discriminator_model():
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))
    
    model.add(keras.layers.Conv1D(25, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(13, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(7, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(4, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    #model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(1))
    
    return model

In [45]:
discriminator = make_discriminator_model()
discriminator.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_47 (Conv1D)           (None, 90, 25)            2025      
_________________________________________________________________
batch_normalization_141 (Bat (None, 90, 25)            100       
_________________________________________________________________
re_lu_141 (ReLU)             (None, 90, 25)            0         
_________________________________________________________________
conv1d_48 (Conv1D)           (None, 45, 13)            1313      
_________________________________________________________________
batch_normalization_142 (Bat (None, 45, 13)            52        
_________________________________________________________________
re_lu_142 (ReLU)             (None, 45, 13)            0         
_________________________________________________________________
conv1d_49 (Conv1D)           (None, 23, 7)           

# Loss Function