In [139]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
#import tensorflow_probability as tfp
import pandas as pd
import time
import csv
from sklearn.preprocessing import OneHotEncoder
from IPython.display import clear_output

'''
Sourced from Tensorflow's tutorial on Variational Autoencoders.
'''

class CVAE(tf.keras.Model):
  """Convolutional variational autoencoder."""

  def __init__(self, latent_dim, seq_len, num_aa, dropout_rate): #CHANGED
    super(CVAE, self).__init__()
    self.latent_dim = latent_dim
    self.encoder = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(1, seq_len, num_aa)), #CHANGED
            tf.keras.layers.Conv2D(
                filters=32, kernel_size=(1, 10), strides= 2, activation='relu'), #CHANGED
            tf.keras.layers.Dropout(rate = dropout_rate),
            tf.keras.layers.Conv2D(
                filters=32, kernel_size=(1, 10), strides= 2, activation='relu'), #CHANGED
            tf.keras.layers.Dropout(rate = dropout_rate),
            tf.keras.layers.Conv2D(
                filters=32, kernel_size=(1, 10), strides= 2, activation='relu'), #CHANGED
            tf.keras.layers.Dropout(rate = dropout_rate),
            tf.keras.layers.Conv2D(
                filters=32, kernel_size=(1, 10), strides= 2, activation='relu'), #CHANGED
            tf.keras.layers.Dropout(rate = dropout_rate),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128),
            tf.keras.layers.Dense(latent_dim + latent_dim),
        ]
    )

    self.decoder = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
            tf.keras.layers.Dense(units=1*seq_len//16*32, activation=tf.nn.relu), #CHANGED
            tf.keras.layers.Reshape(target_shape=(1, seq_len//16, 32)),
            tf.keras.layers.Conv2DTranspose(
                filters=32, kernel_size=(1, 10), strides= (1, 2), padding='same', #CHANGED
                activation='relu'),
            tf.keras.layers.Conv2DTranspose(
                filters=64, kernel_size=(1, 10), strides= (1, 2), padding='same', #CHANGED
                activation='relu'),
            tf.keras.layers.Conv2DTranspose(
                filters=64, kernel_size=(1, 10), strides= (1, 2), padding='same', #CHANGED
                activation='relu'),
            tf.keras.layers.Conv2DTranspose(
                filters=64, kernel_size=(1, 10), strides= (1, 2), padding='same', #CHANGED
                activation='relu'),
            # No activation
            tf.keras.layers.Conv2DTranspose(
                filters=21, kernel_size=3, strides=1, padding='same'),
        ]
    )

  @tf.function
  def sample(self, eps=None):
    if eps is None:
      eps = tf.random.normal(shape=(100, self.latent_dim))
    return self.decode(eps, apply_sigmoid=True)

  def encode(self, x):
    mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
    return mean, logvar

  def reparameterize(self, mean, logvar):
    eps = tf.random.normal(shape=mean.shape)
    return eps * tf.exp(logvar * .5) + mean

  def decode(self, z, apply_sigmoid=False):
    logits = self.decoder(z)
    if apply_sigmoid:
      probs = tf.sigmoid(logits)
      return probs
    return logits


In [144]:
##
debug = True
Train = True
prot_length_perc_cutoff = 70
##

## from TF_VAE import CVAE
'''
Sourced from Tensorflow's tutorial on VAEs.
'''
optimizer = tf.keras.optimizers.Adam(1e-4)
def log_normal_pdf(sample, mean, logvar, raxis=1):
  log2pi = tf.math.log(2. * np.pi)
  return tf.reduce_sum(
      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
      axis=raxis)


def compute_loss(model, x):
  mean, logvar = model.encode(x)
  z = model.reparameterize(mean, logvar)
  x_logit = model.decode(z)
  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
  logpx_z = -tf.reduce_mean(cross_ent, axis=[1, 2, 3])
  
  logpz = log_normal_pdf(z, 0., 0.)
  logqz_x = log_normal_pdf(z, mean, logvar)
  # -tf.reduce_mean(logpx_z + logpz - logqz_x)
  return -tf.reduce_mean(logpx_z + logpz - logqz_x)

def train_step(model, x, optimizer):
  """Executes one training step and returns the loss.
  This function computes the loss and gradients, and uses the latter to
  update the model's parameters.
  """
  with tf.GradientTape() as tape:
    loss = compute_loss(model, x)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

def train(epochs = 100, latent_dim=50, training_rate = 1e-4, dropout_rate = 0.5):
    losses = []
    optimizer = tf.keras.optimizers.Adam(training_rate)
    model = CVAE(latent_dim, seq_len = seq_len, num_aa = 21, dropout_rate = dropout_rate) ##Num aa includes "_"
    _train_step = tf.function(train_step) ## CHANGED
    for epoch in range(1, epochs + 1):
      start_time = time.time()
      for train_x in train_dataset:
        _train_step(model, train_x, optimizer)
      end_time = time.time()
      loss = tf.keras.metrics.Mean()
      for test_x in test_dataset:
        loss(compute_loss(model, test_x))
      elbo = -loss.result()
      losses.append(elbo)
      # display.clear_output(wait=False)
      print('Epoch: {}, Test set ELBO: {}, time elapse for current epoch: {}'
            .format(epoch, elbo, end_time - start_time))
    return model, losses


In [145]:
import random
location = '/Users/AndrewHennes/Desktop/Code/Python/Class_Specific_Code/6.802/Project/Ensembl Datasets/New Datasets/5SpeciesProteins.csv'
data = np.array([row for row in csv.reader(open(location, 'r'))])[1:, 1:] ## Remove the x and y axis labels.
seq_len = min(len(data[0][1]), 300)//16*16 ## CHANGE BACK!!
data = np.array([[aa for aa in row[1][:seq_len]] for row in data])
data = data[:len(data)//10]
data = data[:len(data)//50*50]

num_aa = 21
enc = OneHotEncoder(categories = [np.array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '_'], dtype='<U1') for i in range(seq_len)],
                    sparse = False, 
                    handle_unknown='ignore')


one_hot = enc.fit_transform(data).reshape(-1, 50, 1, seq_len, num_aa).astype('float32')
train_dataset = one_hot[:len(one_hot)*4//5] # 80 Percent Training Dataset
test_dataset = one_hot[len(one_hot)*4//5:] # 20 Percent Testing Dataset
print("Training dataset shape is", train_dataset.shape)
print("Test dataset shape is", test_dataset.shape)


Training dataset shape is (150, 50, 1, 288, 21)
Test dataset shape is (38, 50, 1, 288, 21)


In [141]:
aas = ['*', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
sample = model.sample().numpy()
sample = np.argmax(sample, axis = 3)
print(sample[0])
print(one_hot[0][0].reshape(1, 1, 288, 21).shape)
for i in range(10):
    a = one_hot[i][0].reshape(1, 1, 288, 21)
    mean, logvar = model.encode(a)
    sample = model.decode(mean)
    sample = np.argmax(sample, axis = 3)
    print(sample)


[[10  0  0  0  5  0  9  9  9  9  9  9  9  9  9  9  9  9  0  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  0  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
   9  9  9  9  9  9  9  9  9  9  9  9 

InvalidArgumentError: Matrix size-incompatible: In[0]: [1,320], In[1]: [736,128] [Op:MatMul]

In [119]:
a, b = tf.constant([1, 2, 3, 4])
print(a, b)

ValueError: too many values to unpack (expected 2)

In [None]:
results = {}
for ind1, training_rate in enumerate([1e-3, 1e-4]):
  for ind2, dropout_rate in enumerate([0.0, 0.5, 0.9]):
    for ind3, latent_dim in enumerate([40, 80]):
      print("For a training_rate of %s, a dropout rate of %s, and a latent dim of %s, the loss over 100 epochs is:" % (training_rate, dropout_rate, latent_dim))
      model, losses = train(latent_dim = latent_dim, training_rate = training_rate, dropout_rate = dropout_rate)
      results[(training_rate, dropout_rate, latent_dim)] = losses


In [None]:
results = {}
for ind1, training_rate in enumerate([1e-4]):
  for ind2, dropout_rate in enumerate([0.0, 0.5, 0.9]):
    for ind3, latent_dim in enumerate([40, 100]):
      print("For a training_rate of %s, a dropout rate of %s, and a latent dim of %s, the loss over 100 epochs is:" % (training_rate, dropout_rate, latent_dim))
      model, losses = train(latent_dim = latent_dim, training_rate = training_rate, dropout_rate = dropout_rate)
      results[(training_rate, dropout_rate, latent_dim)] = losses

In [10]:
sample()

NameError: name 'sample' is not defined

In [18]:
new_results = {}
for res in results:
    new_results[res] = [el.numpy() for el in results[res]]
print(new_results)

{(0.001, 0.0, 40): [-1790.5707, -1789.6794, -1786.7626, -1787.9448, -1785.2084, -1784.0507, -1783.5522, -1783.8016, -1783.0607, -1783.7692, -1783.4398, -1782.6387, -1783.0658, -1783.4341, -1782.3354, -1782.3458, -1783.2943, -1782.4447, -1782.0837, -1781.8798, -1782.9958, -1782.2377, -1781.7018, -1783.9497, -1782.0614, -1782.0466, -1781.4056, -1781.8307, -1781.6782, -1781.5577, -1781.8972, -1782.5903, -1782.1842, -1783.1013, -1781.4486, -1781.8625, -1782.1094, -1781.476, -1781.697, -1782.9884, -1786.0206, -1781.8312, -1782.914, -1784.4685, -1781.8132, -1783.3301, -1782.321, -1783.8785, -1782.1566, -1784.0618, -1782.1826, -1782.0687, -1782.2238, -1783.1028, -1784.215, -1783.7091, -1784.1252, -1782.667, -1783.2996, -1782.2134, -1782.6747, -1782.8258, -1783.8694, -1785.8668, -1782.1346, -1781.4915, -1782.648, -1782.3585, -1785.0575, -1782.8269, -1784.8578, -1784.2999, -1787.8481, -1783.1556, -1783.746, -1783.4237, -1782.6327, -1781.7089, -1781.3628, -1781.4161, -1783.1666, -1781.644, -1781