In [1]:
%whos

Interactive namespace is empty.


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, CSVLogger
import os
from pathlib import Path
import json
from tensorflow.keras.models import Model

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        
        self.encoder: Model = encoder
        self.decoder: Model = decoder
        
        self.total_loss_tracker = keras.metrics.Mean(name = "total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name = "reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name = "kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss_fn = keras.losses.MeanSquaredError()
            reconstruction_loss = reconstruction_loss_fn(data, reconstruction)
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        return self.decoder(z)


class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

latent_dim = 10
base_path = 'rvrs_out'

X, y = make_classification(n_samples=1000,
                           n_features=100,
                           n_informative=10,
                           n_redundant=90,
                           random_state=1,
                           shift = 30)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(X)
X = scaler.transform(X)
data = X

data = np.trunc(1000 * data) / 1000

X_train, X_val = train_test_split(data, test_size=0.2)

train_data = X_train
val_data = X_val

input_dimensions = train_data.shape[1]

encoder_inputs = keras.Input(shape=(input_dimensions,))
x = layers.Dense(units=input_dimensions / 2, activation="relu")(encoder_inputs)
x = layers.Dense(units=input_dimensions / 3, activation="relu")(x)
x = layers.Dense(units=input_dimensions / 4, activation="relu")(x)

z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])

encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(units=input_dimensions / 4, activation="relu")(latent_inputs)
x = layers.Dense(units=input_dimensions / 3, activation="relu")(x)
x = layers.Dense(units=input_dimensions / 2, activation="relu")(x)

decoder_outputs = layers.Dense(units=input_dimensions, activation="relu")(x)

decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

vae: VAE = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())

callbacks = []
early_stop = EarlyStopping(monitor="reconstruction_loss",
                           mode="min", patience=5,
                           restore_best_weights=True)

callbacks.append(early_stop)
csv_logger = CSVLogger(os.path.join(base_path, 'training.log'), # Is this writing a file?
                       separator='\t')
callbacks.append(csv_logger)
history = vae.fit(train_data, # Fit the VAE, work backward to carve out minimum viable code path to run this function
                  callbacks=callbacks,
                  validation_data=(val_data, val_data),
z_mean, z_var, embedding = vae.encoder.predict(test_data)
embedding = pd.DataFrame(embedding)


SyntaxError: positional argument follows keyword argument (2392556223.py, line 131)

In [10]:
history

<keras.callbacks.History at 0x7f7dd8dc7880>

In [3]:
embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.021363,-1.199799,0.137270,1.297587,2.042372,0.183850,-0.775907,0.255613,0.981052,0.354788
1,1.041011,-1.260385,0.132285,-0.479902,-0.364104,-0.319177,-0.984617,-0.083420,-0.033995,-0.696346
2,0.444304,-0.689801,-0.093025,-1.255087,1.027433,0.396463,-0.138884,0.043872,-0.855598,-0.737397
3,0.968805,-0.195319,-0.755937,0.917800,-0.407270,-0.048072,-0.746339,0.903619,0.502316,-0.904222
4,0.050442,0.034416,1.446062,-0.247074,-0.749640,-0.040352,0.963695,1.084258,0.246785,0.879446
...,...,...,...,...,...,...,...,...,...,...
795,-0.857996,1.304992,-1.216675,-0.690762,0.625938,0.556321,-0.468636,0.207434,-0.181016,-0.913800
796,0.144425,-0.251678,-0.103680,-0.865499,-0.878160,0.570694,-1.595752,0.051418,-0.625263,-0.956271
797,-0.589522,-0.880361,0.773587,-0.296867,-0.271078,-0.199294,-1.703998,1.282618,1.047622,2.104883
798,-0.136441,0.646560,-0.476017,-0.062668,-0.901895,-1.633245,0.811345,0.688448,0.797758,0.536947


In [None]:
# Is the order of the samples the same to map back on the
# Use TCGA labels as a proxy?

In [4]:
X

array([[0.74678965, 0.91327356, 0.29944317, ..., 0.73152156, 0.73856296,
        0.64872896],
       [0.42702927, 0.33032489, 0.49930979, ..., 0.59468616, 0.25594968,
        0.49775591],
       [0.37808867, 0.52139799, 0.51435171, ..., 0.45008391, 0.60017762,
        0.35836613],
       ...,
       [0.41375141, 0.52002896, 0.39327986, ..., 0.62038482, 0.65482385,
        0.45506501],
       [0.62531317, 0.87386112, 0.34198686, ..., 0.69795355, 0.73965887,
        0.77668578],
       [0.59298449, 0.65442758, 0.67669512, ..., 0.653978  , 0.6337722 ,
        0.53203201]])

In [5]:
data

array([[0.746, 0.913, 0.299, ..., 0.731, 0.738, 0.648],
       [0.427, 0.33 , 0.499, ..., 0.594, 0.255, 0.497],
       [0.378, 0.521, 0.514, ..., 0.45 , 0.6  , 0.358],
       ...,
       [0.413, 0.52 , 0.393, ..., 0.62 , 0.654, 0.455],
       [0.625, 0.873, 0.341, ..., 0.697, 0.739, 0.776],
       [0.592, 0.654, 0.676, ..., 0.653, 0.633, 0.532]])

In [6]:
X, y = make_classification(n_samples=1000,
                           n_features=100,
                           n_informative=10,
                           n_redundant=90,
                           random_state=1,
                           shift = 30)

In [7]:
 X # does each row correspond to a one or zero labe?

array([[35.83689773, 39.48106626, 24.07085253, ..., 35.37136211,
        35.7226078 , 30.51555388],
       [27.28218758, 24.31671149, 29.16569292, ..., 31.25532969,
        23.32282631, 27.44610943],
       [25.97285531, 29.28713243, 29.54912945, ..., 26.90566869,
        32.16707373, 24.61216514],
       ...,
       [26.92695835, 29.25151945, 26.46286269, ..., 32.02834985,
        33.57109914, 26.57815753],
       [32.58697636, 38.45582302, 25.15534229, ..., 34.36163077,
        35.75076495, 33.11705391],
       [31.72207094, 32.74765656, 33.68745793, ..., 33.03883854,
        33.03021905, 28.14297938]])

In [8]:
X.shape

(1000, 100)

In [9]:
y

array([1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,