In [2]:
#!pip install tensorflow

In [118]:
import Bio.SeqIO as SeqIO
import numpy as np
import tensorflow
from tensorflow import keras
import matplotlib.pyplot as plt

In [119]:
# Parameters
sym_codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [120]:
def convert_ohc(seq):
    """
    One Hot Encodes the categorical data of the string of amino acids passed into 
    a 2D list of numerical data.

    :param seq: amino acid sequence string
    
    :return: one hot encoded sequence 
    """
    
    seq_idx = [sym_codes.index(sym) for sym in seq]
    x = np.array(seq_idx)
    x = keras.utils.to_categorical(x, num_classes=len(sym_codes), dtype='int32')
    
    return x

In [123]:
def load_data(seqs_path, label_path):
    """
    Given the path to the fasta files of the sequence and labels
    returns a list with tuples for each record of the weight, target and context

    :param seqs_path: file path to fasta file with amino acid sequence data
    :param labels_path: file path to fasta file with binary label data
    
    :return: a list of tuples with the weight, target, and context for each record
            ex: [(weight, target, context), ]
    """

    
    data = []
    
    for record_seq, record_label in zip(SeqIO.parse(seqs_path, 'fasta'), SeqIO.parse(label_path, 'fasta')):
        
        # one hot encode each record_seq 
        seq = str(record_seq.seq)
        seq_ohc = convert_ohc(seq)
        
        # expand the dimension of record_label for broadcasting
        label = [int(sym) for sym in record_label]
        weight = np.expand_dims(label, axis = 1)
        
        # get the target from the record 
        target = weight*seq_ohc
        
        # get the context from the record (inverted the weight)
        context = (np.invert(weight) + 2)*seq_ohc 
        
        data.append((label, target, context))
        
    return data 

In [124]:
# Load data
train_data = load_data('../../inpainting_mobidb/out/train_seq.fasta', '../../inpainting_mobidb/out/train_label.fasta')

test_data = load_data('../../inpainting_mobidb/out/test_seq.fasta','../../inpainting_mobidb/out/test_label.fasta')

valid_data = load_data('../../inpainting_mobidb/out/validation_seq.fasta','../../inpainting_mobidb/out/validation_label.fasta')

In [125]:
# Size and shape of training, testing, and validation sets
train_len = len(train_data)
test_len = len(test_data)
valid_len = len(valid_data)
print("Length of training set: " + str(train_len))
print("Length of testing set: " + str(test_len))
print("Length of validation set: " + str(valid_len))

Length of training set: 1495
Length of testing set: 187
Length of validation set: 186


# Generator Model

In [151]:
# make generative model
def make_generative_model():
    # convolution 
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))
    
    # MAYBE 40?????
    model.add(keras.layers.Conv1D(8, 3, strides = 1, padding='same', name='first'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())

    model.add(keras.layers.Conv1D(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(256, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    # deconvolution 
    model.add(keras.layers.Conv1DTranspose(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(8, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    #PLAY AROUND WITH THE RATIO OF FILTER 
    
    model.add(keras.layers.Conv1DTranspose(20, 3, strides = 1, padding='same', activation = 'softmax'))

    return model

In [152]:
generator = make_generative_model()
generator.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (Conv1D)               (None, 180, 8)            488       
_________________________________________________________________
batch_normalization_147 (Bat (None, 180, 8)            32        
_________________________________________________________________
re_lu_147 (ReLU)             (None, 180, 8)            0         
_________________________________________________________________
conv1d_69 (Conv1D)           (None, 180, 16)           400       
_________________________________________________________________
batch_normalization_148 (Bat (None, 180, 16)           64        
_________________________________________________________________
re_lu_148 (ReLU)             (None, 180, 16)           0         
_________________________________________________________________
conv1d_70 (Conv1D)           (None, 180, 32)         

In [148]:
train[0][1].shape

(180, 20)

In [149]:
train[0][1]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [150]:
generator = make_generative_model()

#masked = x_train_mask[0]

train_img = train[0][1]

generated_image = generator.predict(train_img)

ValueError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1462 predict_function  *
        return step_function(self, iterator)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1452 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1445 run_step  **
        outputs = model.predict_step(data)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1418 predict_step
        return self(x, training=False)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:196 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer sequential_13 is incompatible with the layer: : expected min_ndim=3, found ndim=2. Full shape received: [None, 20]


In [14]:
generated_image.shape

(5, 180, 20)

# Discriminator Model

In [15]:
# make discrimator model
def make_discriminator_model():
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))
    
    model.add(keras.layers.Conv1D(25, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(13, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(7, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(4, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    #model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(1, activation = 'softmax'))
    
    return model

In [16]:
discriminator = make_discriminator_model()
discriminator.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_10 (Conv1D)           (None, 90, 25)            2025      
_________________________________________________________________
batch_normalization_22 (Batc (None, 90, 25)            100       
_________________________________________________________________
re_lu_22 (ReLU)              (None, 90, 25)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 45, 13)            1313      
_________________________________________________________________
batch_normalization_23 (Batc (None, 45, 13)            52        
_________________________________________________________________
re_lu_23 (ReLU)              (None, 45, 13)            0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 23, 7)            

# Loss Function

## Generator Loss

In [154]:
cross_entropy = tensorflow.keras.losses.CategoricalCrossentropy()

In [155]:
def generator_loss(fake_output, sample_weight):
    # label: data_step[0], target: data_step[1], context: data_step[2]
    cross_entropy(tensorflow.ones_like(fake_output), fake_output, sample_weight)

## Discriminator Loss

In [19]:
def discriminator_loss(real_output, fake_output, data_step):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output, sample_weight)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output, sample_weight)
    total_loss = real_loss + fake_loss
    return total_loss

# Optimizer

In [20]:
generator_optimizer = tensorflow.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tensorflow.keras.optimizers.Adam(1e-4)

# Training Loop

In [22]:
EPOCHS = 10

In [157]:
def train_step(data_step):
    # label: data_step[0], target: data_step[1], context: data_step[2]
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_target = generator(data_step[2], training=True)
        
        real_output = discriminator(data_step[1], training=True)
        fake_output = discriminator(generated_target, training=True)
        
        
        # need to incorporate weights, labels 
        # need to have target in the generator loss
        gen_loss = generator_loss(fake_output, data_step[0])
        disc_loss = discriminator_loss(real_output, fake_output, data_step[0])
    
    #backpropogration 

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

In [25]:
def train(data, epochs):
    for epoch in range(epochs):
        
        for data_step in data:
            train_step(data_step)

In [None]:
train(train_data)

In [None]:
# How do I run validation batches

## Input Training Loop

def load_data(seqs_path, labels_path):
    """
    Given the path to the fasta files of the sequence and labels
    returns a list of a tuple pair of the accession of sequence/label
    and a string of sequence/string.

    :param seqs_path: file path to fasta file with amino acid sequence data
    :param labels_path: file path to fasta file with binary label data
    
    :return: [(accession_seq, 'amino_acid_seq')], [(accession_label, 'label')]
            ex: [('QP106', 'AASSSDD'), ...], [('QP106', '00001111000'), ...]
    """

    # Load files
    seqs = []
    for record in SeqIO.parse(seqs_path, 'fasta'):
        #accession = record.description.split('|')[0]
        seq = str(record.seq)
        seqs.append((accession, seq))

    labels = []
    for record in SeqIO.parse(labels_path, 'fasta'):
        #accession = record.description.split('|')[0]
        label = [int(sym) for sym in record.seq]
        labels.append((accession, label))

    return seqs, labels

def convert_ohc(seqs):
    """
    Converts the given tuple of the accession and amino acid sequence string into
    a int list using sym_codes and then one hot encoded the int list using keras.util.to_categorical

    :param seqs: list of tuple pair of the accession of the amino acid sequence
    and the string of the amino acid sequence ex: [('QP106', 'AASSSDD'), ...]
    :return: 2D list of each one hot encoded amino acid sequence ex: [[0,0,0,0,0,0,1], ...]
    """
    x = []
    # convert string amino acid string characters into int list
    for _, seq in seqs:
        seq_idx = [sym_codes.index(sym) for sym in seq]
        x.append(seq_idx)

    # convert list into numpy array and one hot encode int array
    x = np.array(x)
    x = keras.utils.to_categorical(x, num_classes=len(sym_codes))
    return x

# convert character array to numpy array and one hot encode the sequences
x_train = convert_ohc(train_seqs)
x_test = convert_ohc(test_seqs)
x_valid = convert_ohc(valid_seqs)

y_train = np.array([label for _, label in train_labels])
y_test = np.array([label for _, label in test_labels])
y_valid = np.array([label for _, label in valid_labels])

# mask the sequences
x_train_mask = np.expand_dims(y_train, axis=2)*x_train
x_test_mask = np.expand_dims(y_test, axis=2)*x_test
x_valid_mask = np.expand_dims(y_valid, axis=2)*x_valid

# target for sequences 
x_train_target = np.expand(np.invert(y_train[0]) + 2)*x_train 
x_test_target = np.expand(np.invert(y_test[0]) + 2)*x_train 
x_valid_mask = np.expand(np.invert(y_valid[0]) + 2)*x_train 

masked = x_train_mask[0:5]
masked.shape

masked

def generator_loss(fake_output, sample_weights):
    return cross_entropy(tensorflow.ones_like(fake_output), fake_output) + cross_entropy(t)

def train_step(masked_seq, target, sample_weights):
    #noise = tf.random.normal([BATCH_SIZE, noise_dim])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_target = generator(masked_seq, training=True)
        
        real_output = discriminator(target, training=True)
        fake_output = discriminator(generated_target, training=True)
        
        
        # need to incorporate weights, labels 
        # need to have target in the generator loss
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
    
    #backpropogration 

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))