In [1]:
#!pip install tensorflow

### Glossary
* sequences: string representing amino acid sequence
* label: binary string representing disordered and ordered segments with 1 as target(disordered region) and 0 as context
* target: disordered region as one hot encoded vector from the sequences that we want the neural network to reproduce from the given context 
* weight: labels 
* context: one hot encoded vector that represents the sequences around the target with the target masked (zero-'d out)

In [1]:
import Bio.SeqIO as SeqIO
import numpy as np
import tensorflow
from tensorflow import keras
import matplotlib.pyplot as plt

In [2]:
# Parameters
sym_codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
BATCH_SIZE = 10 

In [4]:
# Helper function for func load_data 
def convert_ohc(seq):
    """
    One hot encodes given amino acid sequence string.
    
    :param seq: string of amino acid sequence 
    :return: 2D array of one hot encoded string 
    
    """
    seq_idx = [sym_codes.index(sym) for sym in seq]
    x = np.array(seq_idx)
    x = keras.utils.to_categorical(x, num_classes=len(sym_codes), dtype='int32')
    
    return x

In [6]:
def load_data(seqs_path, label_path):
    """
    Loads sequences and lables from fasta files. 
    
    :param seq_path: path for fasta file of amino acid sequences 
    :param label_path: path fasta file of labels of amino acid sequences where disordered residues are labeled are labeled as 1 and ordered residues are labeled as 0
    :return: array all one hot encoded sequences and array of all labels from 
    """
    seq_ohc_lst = []
    label_lst = []
    
    for record_seq, record_label in zip(SeqIO.parse(seqs_path, 'fasta'), SeqIO.parse(label_path, 'fasta')):
        
        # one hot encode each record_seq 
        seq = str(record_seq.seq)
        seq_ohc = convert_ohc(seq)
        seq_ohc_lst.append(seq_ohc)
        
        # expand the dimension of record_label for broadcasting
        label = [int(sym) for sym in record_label]
        label_lst.append(label)
        
    return np.array(seq_ohc_lst), np.array(label_lst)

In [7]:
train_seq, train_label = load_data('../../inpainting_mobidb/out/train_seq.fasta', '../../inpainting_mobidb/out/train_label.fasta')

In [8]:
def get_weight_target_context(seq_ohc, label):
    """
    Gets the target, context, and weight from one hot encoded sequences and labels. 
    
    :param seq_ohc: one hot ended 2D arrays of sequences 
    :param label: array of labels corresponding to seq_ohc 
    :return: target, context and weight according to seq_ohc and label 
    
    """
    weight = np.expand_dims(label, axis = 2)

    # get the target from the record 
    target = weight*seq_ohc
        
    # get the context from the record (inverted the weight)
    context = (np.invert(weight) + 2)*seq_ohc
    
    return weight, target, context

# Generator Model

In [10]:
# make generative model
def make_generative_model():
    """
    Makes generative generative model for DCGAN based off of architecture from "Protein Loop Modeling Using 
    Deep Generative Adversarial Network" paper. 
    
    :return: model instance of generative model 
    
    """
    
    # convolution 
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))
    
    model.add(keras.layers.Conv1D(8, 3, strides = 1, padding='same', name='first'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())

    model.add(keras.layers.Conv1D(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(256, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    # deconvolution 
    model.add(keras.layers.Conv1DTranspose(128, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(64, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(32, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(16, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1DTranspose(8, 3, strides = 1, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    #FIXEME: PLAY AROUND WITH THE RATIO OF FILTER 
    
    # added last layer to transform filters to probability classes 
    model.add(keras.layers.Conv1DTranspose(20, 3, strides = 1, padding='same', activation = 'softmax'))

    return model

In [11]:
generator = make_generative_model()
generator.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first (Conv1D)               (None, 180, 8)            488       
_________________________________________________________________
batch_normalization (BatchNo (None, 180, 8)            32        
_________________________________________________________________
re_lu (ReLU)                 (None, 180, 8)            0         
_________________________________________________________________
conv1d (Conv1D)              (None, 180, 16)           400       
_________________________________________________________________
batch_normalization_1 (Batch (None, 180, 16)           64        
_________________________________________________________________
re_lu_1 (ReLU)               (None, 180, 16)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 180, 32)           1

# Discriminator Model

In [12]:
# make discrimator model
def make_discriminator_model():
    """
    Makes adverserial/discriminative model for DCGAN based off of architecture from "Protein Loop Modeling Using 
    Deep Generative Adversarial Network" paper. 
    
    :return: model instance of discriminative model 
    
    """
    model = tensorflow.keras.Sequential()
    model.add(keras.Input(shape=((180, 20))))
    
    model.add(keras.layers.Conv1D(25, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(13, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(7, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Conv1D(4, 4, strides = 2, padding='same'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.ReLU())
    
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(1, activation = 'softmax'))
    
    return model

In [13]:
discriminator = make_discriminator_model()
discriminator.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 90, 25)            2025      
_________________________________________________________________
batch_normalization_11 (Batc (None, 90, 25)            100       
_________________________________________________________________
re_lu_11 (ReLU)              (None, 90, 25)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 45, 13)            1313      
_________________________________________________________________
batch_normalization_12 (Batc (None, 45, 13)            52        
_________________________________________________________________
re_lu_12 (ReLU)              (None, 45, 13)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 23, 7)            

# Loss Function

## Generator Loss

In [14]:
cross_entropy = tensorflow.keras.losses.CategoricalCrossentropy()

In [15]:
def generator_loss(fake_output, generated_target, target):
    generated_target = tensorflow.cast(generated_target, tensorflow.int64)
    ones_like_fake_output = tensorflow.cast(tensorflow.ones_like(fake_output), tensorflow.int64)
    a = cross_entropy(ones_like_fake_output, fake_output)
    b = cross_entropy(generated_target, target)
    return  a + b

## Discriminator Loss

In [16]:
def discriminator_loss(real_output, fake_output, weight):
    
    real_loss = cross_entropy(tensorflow.cast(tensorflow.ones_like(real_output), tensorflow.int64), real_output, weight)
    fake_loss = cross_entropy(tensorflow.cast(tensorflow.zeros_like(fake_output), tensorflow.int64), fake_output, weight)
    total_loss = real_loss + fake_loss
    return total_loss

# Optimizer

In [17]:
generator_optimizer = tensorflow.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tensorflow.keras.optimizers.Adam(1e-4)

# Training Loop

In [18]:
def train_step(context, target, weight):

    with tensorflow.GradientTape() as gen_tape, tensorflow.GradientTape() as disc_tape:
        generated_target = generator(context, training=True)*weight
        generated_target = tensorflow.cast(generated_target, tensorflow.int64)
        
        real_output = discriminator(target, training=True)
        fake_output = discriminator(generated_target, training=True)
        
        
        target = tensorflow.cast(tensorflow.constant(target), dtype = tensorflow.float32)
        
        gen_loss = generator_loss(fake_output, generated_target, target)
        disc_loss = discriminator_loss(real_output, fake_output, weight)
    
        # backpropogration
        gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

        generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
        discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

In [19]:
def train(context, target, weight, epochs):
    
    # batch data 
    context_batch = np.array_split(context, BATCH_SIZE)
    target_batch = np.array_split(target, BATCH_SIZE)
    weight_batch = np.array_split(weight, BATCH_SIZE)
    
    for epoch in range(epochs):

        for context, target, weight in zip(context_batch, target_batch, weight_batch):
            
            train_step(context, target, weight)

In [20]:
# Load data
train_seq, train_label = load_data('../../inpainting_mobidb/out/train_seq.fasta', '../../inpainting_mobidb/out/train_label.fasta')
train_weight, train_target, train_context = get_weight_target_context(train_seq, train_label)


#test_data = load_data('../../inpainting_mobidb/out/test_seq.fasta','../../inpainting_mobidb/out/test_label.fasta')

#valid_data = load_data('../../inpainting_mobidb/out/validation_seq.fasta','../../inpainting_mobidb/out/validation_label.fasta')

In [28]:
train(train_context, train_target, train_weight, 1)

(150, 1)
(150, 180, 20)
(150, 180, 20)
<dtype: 'int64'>
<dtype: 'float32'>
<dtype: 'int64'>
<dtype: 'float32'>
a
tf.Tensor(1.1920929e-07, shape=(), dtype=float32)
b
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor([150 180  20], shape=(3,), dtype=int32)
tf.Tensor([150 180  20], shape=(3,), dtype=int32)
tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(20,), dtype=float32)
tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(20,), dtype=int64)
tf.Tensor(nan, shape=(), dtype=float32)
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[<tf.Variable 'first/kernel:0' shape=(3, 20, 8) dtype=float32, numpy=
array([[[-0.16022187,  0.17067975, -0.16519868, -0.02088495,
         -0.03232104,  0.20495439,  0.21268955, -0.10696983],
     

ValueError: No gradients provided for any variable: ['first/kernel:0', 'first/bias:0', 'batch_normalization/gamma:0', 'batch_normalization/beta:0', 'conv1d/kernel:0', 'conv1d/bias:0', 'batch_normalization_1/gamma:0', 'batch_normalization_1/beta:0', 'conv1d_1/kernel:0', 'conv1d_1/bias:0', 'batch_normalization_2/gamma:0', 'batch_normalization_2/beta:0', 'conv1d_2/kernel:0', 'conv1d_2/bias:0', 'batch_normalization_3/gamma:0', 'batch_normalization_3/beta:0', 'conv1d_3/kernel:0', 'conv1d_3/bias:0', 'batch_normalization_4/gamma:0', 'batch_normalization_4/beta:0', 'conv1d_4/kernel:0', 'conv1d_4/bias:0', 'batch_normalization_5/gamma:0', 'batch_normalization_5/beta:0', 'conv1d_transpose/kernel:0', 'conv1d_transpose/bias:0', 'batch_normalization_6/gamma:0', 'batch_normalization_6/beta:0', 'conv1d_transpose_1/kernel:0', 'conv1d_transpose_1/bias:0', 'batch_normalization_7/gamma:0', 'batch_normalization_7/beta:0', 'conv1d_transpose_2/kernel:0', 'conv1d_transpose_2/bias:0', 'batch_normalization_8/gamma:0', 'batch_normalization_8/beta:0', 'conv1d_transpose_3/kernel:0', 'conv1d_transpose_3/bias:0', 'batch_normalization_9/gamma:0', 'batch_normalization_9/beta:0', 'conv1d_transpose_4/kernel:0', 'conv1d_transpose_4/bias:0', 'batch_normalization_10/gamma:0', 'batch_normalization_10/beta:0', 'conv1d_transpose_5/kernel:0', 'conv1d_transpose_5/bias:0'].