# *Modern Deep Learning for Tabular Data*, Chapter 9

**Data Generation**

This notebook contains the complementary code discussed in Chapter 9 of *Modern Deep Learning for Tabular Data*.

External Kaggle links to datasets used in this notebook:
- [Mouse Protein Expression Dataset](https://www.kaggle.com/datasets/washingtongold/mpempe)
- [Higgs Boson Dataset](https://www.kaggle.com/datasets/mragpavank/higs-bonsons-and-background-process)

You can download these datasets from Kaggle, or import these notebooks into Kaggle and connect them internally.

---

## Imports

In [None]:
# environment
# TF_CPP_MIN_LOG_LEVEL has to be in place *before* TensorFlow is imported,
# otherwise the native runtime has already started logging by the time it is set
import os                            # for file manipulation
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# data management
import numpy as np                   # for linear algebra
import pandas as pd                  # for tabular data manipulation and processing

# machine learning
import sklearn                       # for data prep and classical ML
import tensorflow as tf              # for deep learning
from tensorflow.keras.datasets import mnist    # for example dataset
from tensorflow import keras         # for deep learning
import keras.layers as L             # for easy NN layer access

# data visualization and graphics
import matplotlib.pyplot as plt      # for visualization fundamentals
import seaborn as sns                # for pretty visualizations
import cv2                           # for image manipulation

# misc
from tqdm.notebook import tqdm       # for progress bars
import math                          # for calculation
import sys                           # for system manipulation

# disable logging
tf.get_logger().setLevel('WARNING')  # Python-side TF logger: warnings and above only
tf.autograph.set_verbosity(0)        # 0 = no AutoGraph logging (larger values are MORE verbose)

---

## Variational Autoencoder

In [None]:
# Load MNIST through the `mnist` module imported at the top of the notebook.
# (The bare name `tensorflow` is never bound -- only `tf` is -- so the previous
# `tensorflow.keras...` spelling raised NameError.)
(x_train, y_train), (x_valid, y_valid) = mnist.load_data()

# flatten each 28x28 image into a 784-vector and scale pixel values to [0, 1]
x_train = x_train.reshape(len(x_train), 784) / 255
x_valid = x_valid.reshape(len(x_valid), 784) / 255

Attempting to interpolate with a standard autoencoder.

In [None]:
# Load MNIST, flatten every image to a 784-vector and scale to [0, 1] as float32.
# (np, pd and L are already imported in the notebook's import cell, so the
# redundant mid-cell re-imports have been dropped.)
(X_train, y_train), (X_valid, y_valid) = keras.datasets.mnist.load_data()
X_train = X_train.reshape((len(X_train), 784)).astype(np.float32) / 255
X_valid = X_valid.reshape((len(X_valid), 784)).astype(np.float32) / 255

# encoder: 784 -> 128 -> 64 -> 32 -> 16 (latent code)
inp = L.Input((784,))
d1 = L.Dense(128, activation='relu')(inp)
d2 = L.Dense(64, activation='relu')(d1)
d3 = L.Dense(32, activation='relu')(d2)
d4 = L.Dense(16, activation='relu')(d3)
encoder = keras.models.Model(inputs=inp, outputs=d4)

# decoder: mirror image of the encoder, sigmoid output to match the [0, 1] pixels
inp = L.Input((16,))
d1 = L.Dense(32, activation='relu')(inp)
d2 = L.Dense(64, activation='relu')(d1)
d3 = L.Dense(128, activation='relu')(d2)
d4 = L.Dense(784, activation='sigmoid')(d3)
decoder = keras.models.Model(inputs=inp, outputs=d4)

# full autoencoder: encoder followed by decoder
inp = L.Input((784,))
encoded = encoder(inp)
decoded = decoder(encoded)
model = keras.models.Model(inputs=inp, outputs=decoded)

# train to reconstruct the input (the target is the input itself)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train, X_train, epochs=30,
          validation_data=(X_valid, X_valid))

In [None]:
# latent code of the first training image
encoded = encoder(X_train[0:1])

# 5x5 grid: panel k (1..25) decodes the latent code shifted by 0.5 * k in every dimension
plt.figure(figsize=(10, 10), dpi=400)
for panel in range(1, 5 * 5 + 1):
    plt.subplot(5, 5, panel)
    modified_encoded = encoded + 0.5 * panel
    decoded = decoder(modified_encoded).numpy()
    plt.imshow(decoded.reshape((28, 28)))
    plt.axis('off')
plt.show()

In [None]:
for i in range(10):

    # latent codes of two consecutive training images
    encoded1 = encoder(X_train[i:i+1])
    encoded2 = encoder(X_train[i+1:i+2])

    # decode the midpoint between the two codes
    modified_encoded = (encoded1 + encoded2) / 2
    decoded = decoder(modified_encoded)

    # left: first image, middle: decoded midpoint, right: second image
    panels = [X_train[i:i+1], decoded.numpy(), X_train[i+1:i+2]]

    plt.figure(figsize=(10, 3), dpi=400)
    for position, panel in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.imshow(panel.reshape((28,28)))
        plt.axis('off')
    plt.show()

Building a Variational Autoencoder.

In [None]:
# encoder: 784 -> 256 -> 128 -> (means, log-stds), each of width 32
enc_inputs = L.Input((784,), name='input')
enc_dense1 = L.Dense(256, activation='relu',
                     name='dense1')(enc_inputs)
enc_dense2 = L.Dense(128, activation='relu',
                     name='dense2')(enc_dense1)
means = L.Dense(32, name='means')(enc_dense2)
log_stds = L.Dense(32, name='log-stds')(enc_dense2)

def sampling(args):
    """Draw a latent sample from the encoder's distribution parameters.

    Args:
        args: pair ``(means, log_stds)`` of tensors with identical shape.

    Returns:
        ``means + exp(log_stds) * eps`` where ``eps`` is Gaussian noise
        (mean 0, stddev 0.15) with the same shape as ``means``.
    """
    means, log_stds = args
    # noise shape follows `means`, so the function works for any latent width
    eps = tf.random.normal(shape=tf.shape(means),
                           mean=0, stddev=0.15)
    return means + tf.exp(log_stds) * eps

x = L.Lambda(sampling, name='sampling')([means, log_stds])

# the encoder exposes the distribution parameters and the sampled code
encoder = keras.Model(inputs=enc_inputs,
                      outputs=[means, log_stds, x],
                      name='encoder')

from keras.losses import binary_crossentropy

# decoder: 32 -> 128 -> 256 -> 784, sigmoid output
dec_inputs = L.Input((32,), name='input')
dec_dense1 = L.Dense(128, activation='relu', name='dense1')(dec_inputs)
dec_dense2 = L.Dense(256, activation='relu', name='dense2')(dec_dense1)
output = L.Dense(784, activation='sigmoid', name='output')(dec_dense2)
decoder = keras.Model(inputs=dec_inputs, outputs=output, name='decoder')

# construct vae: the decoder consumes the third encoder output
vae_inputs = enc_inputs
encoded = encoder(vae_inputs)
decoded = decoder(encoded[2])
vae = keras.Model(inputs=vae_inputs, outputs=decoded, name='vae')

# build loss function: reconstruction term plus a regulariser on (means, log_stds)
# NOTE(review): the usual VAE KL term is -0.5 * sum(1 + log_var - mean^2 - exp(log_var));
# here the summed expression is squared instead -- confirm this is intended.
reconst_loss = binary_crossentropy(vae_inputs, decoded)
kl_loss = tf.square(
    tf.reduce_sum(1 + log_stds - tf.square(means) - tf.exp(log_stds), axis=-1)
)
vae_loss = tf.reduce_mean(reconst_loss + kl_loss)

# compile model: the loss is attached with add_loss, so compile() gets no loss argument
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# fit
vae.fit(x_train, x_train, epochs=20)
# For each of the first 10 training digits, decode a 10x10 grid of latent codes:
# the row index shifts latent dims 0, 2, 4, 6 and the column index shifts dims 1, 3, 5, 7,
# each by 0.25 * (index - 5).
for i in range(10):

    # third encoder output = sampled latent code, shape (1, latent_dim)
    base = encoder.predict(x_train[i:i+1])[2]

    # build all 100 offsets at once so the decoder is called once per figure
    # instead of once per panel
    rows, cols = np.divmod(np.arange(10 * 10), 10)
    add = np.zeros((10 * 10, base.shape[1]))
    add[:, [0, 2, 4, 6]] = (0.25 * (rows - 5))[:, None]
    add[:, [1, 3, 5, 7]] = (0.25 * (cols - 5))[:, None]
    decoded = decoder.predict(base + add)

    plt.figure(figsize=(10, 10), dpi=400)
    for panel, image in enumerate(decoded):
        plt.subplot(10, 10, panel + 1)
        plt.imshow(image.reshape((28, 28)))
        plt.axis('off')
    plt.show()

Fitting a VAE on the Higgs Boson dataset.

In [None]:
from sklearn.model_selection import train_test_split as tts

# read the Higgs Boson table; 'class' is the target and 'id' is dropped from the features
data = pd.read_csv('../input/higs-bonsons-and-background-process/train.csv')
y = data['class']
X = data.drop(columns=['class', 'id'])

# 80/20 train/validation split with a fixed seed
X_train, X_valid, y_train, y_valid = tts(
    X, y,
    train_size=0.8,
    random_state=42,
)

# encoder: 28 -> 16 -> 16 -> (means, log-stds), each of width 8
enc_inputs = L.Input((28,), name='input')
enc_dense1 = L.Dense(16, activation='relu',
                     name='dense1')(enc_inputs)
enc_dense2 = L.Dense(16, activation='relu',
                     name='dense2')(enc_dense1)
means = L.Dense(8, name='means')(enc_dense2)
log_stds = L.Dense(8, name='log-stds')(enc_dense2)

def sampling(args):
    """Draw a latent sample from the encoder's distribution parameters.

    Args:
        args: pair ``(means, log_stds)`` of tensors with identical shape.

    Returns:
        ``means + exp(log_stds) * eps`` where ``eps`` is Gaussian noise
        (mean 0, stddev 0.15) with the same shape as ``means``.
    """
    means, log_stds = args
    # noise shape follows `means`, so the function works for any latent width
    eps = tf.random.normal(shape=tf.shape(means),
                           mean=0, stddev=0.15)
    return means + tf.exp(log_stds) * eps

x = L.Lambda(sampling, name='sampling')([means, log_stds])

# the encoder exposes the distribution parameters and the sampled code
encoder = keras.Model(inputs=enc_inputs,
                      outputs=[means, log_stds, x],
                      name='encoder')

from keras.losses import mean_squared_error

# decoder: 8 -> 16 -> 16 -> 28, linear output
dec_inputs = L.Input((8,), name='input')
dec_dense1 = L.Dense(16, activation='relu', name='dense1')(dec_inputs)
dec_dense2 = L.Dense(16, activation='relu', name='dense2')(dec_dense1)
output = L.Dense(28, activation='linear', name='output')(dec_dense2)
decoder = keras.Model(inputs=dec_inputs, outputs=output, name='decoder')

# construct vae: the decoder consumes the third encoder output
vae_inputs = enc_inputs
encoded = encoder(vae_inputs)
decoded = decoder(encoded[2])
vae = keras.Model(inputs=vae_inputs, outputs=decoded, name='vae')

# build loss function: reconstruction term plus a regulariser on (means, log_stds)
# NOTE(review): the usual VAE KL term is -0.5 * sum(1 + log_var - mean^2 - exp(log_var));
# here the summed expression is squared instead -- confirm this is intended.
reconst_loss = mean_squared_error(vae_inputs, decoded)
kl_loss = tf.square(
    tf.reduce_sum(1 + log_stds - tf.square(means) - tf.exp(log_stds), axis=-1)
)
vae_loss = tf.reduce_mean(reconst_loss + kl_loss)

# compile model: the loss is attached with add_loss, so compile() gets no loss argument
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# fit
vae.fit(X_train, X_train, epochs=20)

In [None]:
NUM_BASES = 40        # number of training rows used as starting points
NUM_PER_SAMPLE = 20   # synthetic rows generated around each starting point
samples = []

for base_idx in tqdm(range(NUM_BASES)):
    # third encoder output = sampled latent code, shape (1, latent_dim)
    base = encoder.predict(X_train[base_idx:base_idx+1])[2]
    # perturb the code with unit Gaussian noise; all NUM_PER_SAMPLE perturbations
    # are decoded in one call instead of one predict() per row
    add = np.random.normal(0, 1, size=(NUM_PER_SAMPLE, base.shape[1]))
    generated = decoder.predict(base + add)
    samples.append(generated)

# stack to a single (NUM_BASES * NUM_PER_SAMPLE, n_features) array
samples = np.concatenate(samples)

In [None]:
generated = pd.DataFrame(samples, columns=X.columns)

# 800 real rows drawn without replacement, for a side-by-side comparison
real_subset = X.iloc[np.random.choice(len(X), size=800, replace=False)]

# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (a preceding plt.figure(figsize=(50, 50), dpi=400) only produced a blank, very large canvas).
# First plot: generated data; second plot: real data.
for frame in (generated, real_subset):
    sns.pairplot(frame,
                 x_vars=X.columns[:5],
                 y_vars=X.columns[5:10],
                 kind='kde')
    plt.show()

Fitting on the Mouse Protein Expression dataset.

In [None]:
from sklearn.model_selection import train_test_split as tts

# read the mouse protein expression table and discard the 'Unnamed: 0' column
df = pd.read_csv('../input/mpempe/mouse-protein-expression.csv')
df = df.drop(columns='Unnamed: 0')

# 'class' is the target; every other column is a feature
mpe_y = df['class']
mpe_x = df.drop(columns='class')

# 80/20 train/validation split with a fixed seed
X_train, X_valid, y_train, y_valid = tts(
    mpe_x, mpe_y,
    train_size=0.8,
    random_state=42,
)


# encoder: 80 -> 64 -> 32 -> 16 -> (means, log-stds), each of width 8
enc_inputs = L.Input((80,), name='input')
enc_dense1 = L.Dense(64, activation='relu',
                     name='dense1')(enc_inputs)
enc_dense2 = L.Dense(32, activation='relu',
                     name='dense2')(enc_dense1)
enc_dense3 = L.Dense(16, activation='relu',
                     name='dense3')(enc_dense2)
means = L.Dense(8, name='means')(enc_dense3)
log_stds = L.Dense(8, name='log-stds')(enc_dense3)

def sampling(args):
    """Draw a latent sample from the encoder's distribution parameters.

    Args:
        args: pair ``(means, log_stds)`` of tensors with identical shape.

    Returns:
        ``means + exp(log_stds) * eps`` where ``eps`` is Gaussian noise
        (mean 0, stddev 0.15) with the same shape as ``means``.
    """
    means, log_stds = args
    # noise shape follows `means`, so the function works for any latent width
    eps = tf.random.normal(shape=tf.shape(means),
                           mean=0, stddev=0.15)
    return means + tf.exp(log_stds) * eps

x = L.Lambda(sampling, name='sampling')([means, log_stds])

# the encoder exposes the distribution parameters and the sampled code
encoder = keras.Model(inputs=enc_inputs,
                      outputs=[means, log_stds, x],
                      name='encoder')

from keras.losses import mean_squared_error

# decoder: 8 -> 16 -> 32 -> 64 -> 80, linear output
dec_inputs = L.Input((8,), name='input')
dec_dense1 = L.Dense(16, activation='relu', name='dense1')(dec_inputs)
dec_dense2 = L.Dense(32, activation='relu', name='dense2')(dec_dense1)
dec_dense3 = L.Dense(64, activation='relu', name='dense3')(dec_dense2)
output = L.Dense(80, activation='linear', name='output')(dec_dense3)
decoder = keras.Model(inputs=dec_inputs, outputs=output, name='decoder')

# construct vae: the decoder consumes the third encoder output
vae_inputs = enc_inputs
encoded = encoder(vae_inputs)
decoded = decoder(encoded[2])
vae = keras.Model(inputs=vae_inputs, outputs=decoded, name='vae')

# build loss function: reconstruction term plus a regulariser on (means, log_stds)
# NOTE(review): the usual VAE KL term is -0.5 * sum(1 + log_var - mean^2 - exp(log_var));
# here the summed expression is squared instead -- confirm this is intended.
reconst_loss = mean_squared_error(vae_inputs, decoded)
kl_loss = tf.square(
    tf.reduce_sum(1 + log_stds - tf.square(means) - tf.exp(log_stds), axis=-1)
)
vae_loss = tf.reduce_mean(reconst_loss + kl_loss)

# compile model: the loss is attached with add_loss, so compile() gets no loss argument
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# fit
vae.fit(X_train, X_train, epochs=100)

In [None]:
NUM_BASES = 40        # number of training rows used as starting points
NUM_PER_SAMPLE = 20   # synthetic rows generated around each starting point
samples = []

for base_idx in tqdm(range(NUM_BASES)):
    # third encoder output = sampled latent code, shape (1, latent_dim)
    base = encoder.predict(X_train[base_idx:base_idx+1])[2]
    # perturb the code with unit Gaussian noise; all NUM_PER_SAMPLE perturbations
    # are decoded in one call instead of one predict() per row
    add = np.random.normal(0, 1, size=(NUM_PER_SAMPLE, base.shape[1]))
    generated = decoder.predict(base + add)
    samples.append(generated)

# stack to a single (NUM_BASES * NUM_PER_SAMPLE, n_features) array
samples = np.concatenate(samples)

In [None]:
generated = pd.DataFrame(samples, columns=X_train.columns)

# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (a preceding plt.figure(figsize=(50, 50), dpi=400) only produced a blank, very large canvas).
# First plot: generated data; second plot: the full real training set (not subsampled).
for frame in (generated, X_train):
    sns.pairplot(frame,
                 x_vars=X_train.columns[:5],
                 y_vars=X_train.columns[5:10],
                 kind='kde')
    plt.show()

---

## Traditional GANs

We will go through the implementation of a basic GAN for the MNIST dataset.

In [None]:
# specific imports for different components of Keras
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.callbacks as C

In [None]:
# load MNIST as (train images, train labels), (test images, test labels);
# this rebinds X_train / y_train used by earlier sections of the notebook
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [None]:
# Keep every digit except the 1s, append a channel axis -> (n, 28, 28, 1),
# convert to float32 for later operations and scale pixel values to [0, 1].
keep_mask = y_train != 1
X_train = X_train[keep_mask][..., np.newaxis].astype("float32") / 255.0

# labels and the test split are not used by the GAN
del y_train, X_test, y_test

In [None]:
# discriminator
# simple fully-connected NN, can be modified to CNN to improve performance
inp = L.Input(shape=(28, 28, 1))
# flatten 2D images (the input shape is already fixed by `inp`)
x = L.Flatten()(inp)
# four LeakyReLU dense blocks, each followed by dropout
for units in (512, 1024, 256, 512):
    x = L.Dense(units, activation=L.LeakyReLU(alpha=0.25))(x)
    x = L.Dropout(0.3)(x)
x = L.Dense(64, activation="swish")(x)
# single sigmoid unit, trained below against labels 1 = real and 0 = generated
out = L.Dense(1, activation="sigmoid")(x)

# beta_1 is set to 0.5 in the adam optimizer for more stable training
# (`learning_rate` replaces the deprecated `lr` argument)
discriminator = M.Model(inputs=inp, outputs=out)
discriminator.compile(loss="binary_crossentropy",
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
                      metrics=["acc"])

# generator
# 128 as latent dimension (shape must be a tuple, hence the trailing comma)
inp_gen = L.Input(shape=(128,))
# Chain of Dense + LeakyReLU blocks. Every block feeds on the previous one;
# previously the 256-unit layer was applied to `inp_gen` again, which left the
# 224-unit layer disconnected from the model.
y = inp_gen
for units in (224, 256, 512, 664, 1024):
    y = L.Dense(units)(y)
    y = L.LeakyReLU(alpha=0.2)(y)
# shape of mnist image
y = L.Dense(784, activation="sigmoid")(y)
# reshape to dimensions of an image
out_gen = L.Reshape([28, 28, 1])(y)

# do not compile since the generator will never be trained alone
generator = M.Model(inputs=inp_gen, outputs=out_gen)

In [None]:
# combine model and make discriminator untrainable:
# stack generator -> discriminator, and freeze the discriminator before compiling
# so that training the stacked model is meant to update the generator only
gan_model = M.Sequential([generator, discriminator])
discriminator.trainable = False
# `learning_rate` replaces the deprecated `lr` argument
gan_model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
                  metrics=["acc"])

In [None]:
def build_dataset(data, batch_size=32):
    """Wrap `data` in a shuffled, batched and prefetched tf.data pipeline.

    Args:
        data: tensor-like object sliced along its first axis.
        batch_size: number of elements per batch; an incomplete final batch is dropped.

    Returns:
        A tf.data.Dataset yielding batches of exactly `batch_size` elements.
    """
    pipeline = tf.data.Dataset.from_tensor_slices(data)
    pipeline = pipeline.shuffle(1024)                           # shuffle buffer of 1024 elements
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# number of real images per training step (the training loop reuses this value)
batch_size = 256
# shuffled, fixed-size batches of the preprocessed real images
real_img_dataset = build_dataset(data=X_train, batch_size=batch_size)

In [None]:
# retrieve each individual model
generator, discriminator = gan_model.layers

# recommended to be trained on GPU
epochs = 100
latent_dim = 128   # must match the generator's input size

# Every batch has exactly `batch_size` elements (the dataset drops the remainder),
# so the label tensors never change and are built once instead of on every step.
real_label = tf.ones([batch_size, 1])
fake_label = tf.zeros([batch_size, 1])
# the y of discriminator, 1s (real) followed by 0s (fake)
discriminator_y = tf.concat([real_label, fake_label], axis=0)
# y of generator, set to "real"
gan_y = real_label

for epo in range(epochs):

    print(f"TRAINING EPOCH {epo+1}")

    for idx, cur_batch in enumerate(real_img_dataset):
        # random noise for generating fake img
        noise = tf.random.normal(shape=[batch_size, latent_dim])
        fake_img = generator(noise)
        # one batch of real img
        real_img = tf.dtypes.cast(cur_batch, dtype=tf.float32)

        # the X of discriminator, consists of half real img, half fake img
        discriminator_X = tf.concat([real_img, fake_img], axis=0)
        # set to trainable
        discriminator.trainable = True
        # train discriminator as standalone classification model
        d_loss = discriminator.train_on_batch(discriminator_X, discriminator_y)

        # X of generator, noise
        gan_x = tf.random.normal(shape=[batch_size, latent_dim])
        # set discriminator to untrainable
        gan_model.layers[1].trainable = False
        gan_loss = gan_model.train_on_batch(gan_x, gan_y)

        # avoid OOM
        del fake_img, real_img, discriminator_X

        if (idx+1) % 100 == 0:
            print(f"\t On batch {idx+1}/{len(real_img_dataset)}   Discriminator Acc: {d_loss[1]}  GAN Acc {gan_loss[1]}")

    if (epo+1)%10==0:
        # plot results every 10 epochs; the label is 1-based like the training banner
        print(f"RESULTS FOR EPOCH {epo+1}")
        gen_img = generator(tf.random.normal(shape=[5, latent_dim]))
        columns = 5
        rows = 1

        fig = plt.figure(figsize=(12, 2))
        for i in range(rows*columns):
            fig.add_subplot(rows, columns, i+1)
            plt.imshow(gen_img[i], interpolation='nearest', cmap='gray_r')
        plt.tight_layout()
        plt.show()

We will visualize the results produced by the GAN.

In [None]:
# generate 20 images from fresh noise and show them on a 4x5 grid without axis ticks
rows, columns = 4, 5
gen_img = generator(tf.random.normal(shape=[20, 128]))

fig = plt.figure(figsize=(12, 8), dpi=120)
for position in range(1, rows * columns + 1):
    ax = fig.add_subplot(rows, columns, position)
    # hide tick marks on both axes
    ax.xaxis.set_major_locator(plt.NullLocator())
    ax.yaxis.set_major_locator(plt.NullLocator())
    ax.imshow(gen_img[position - 1], interpolation='nearest', cmap='gray_r')
plt.show()

---

## CTGAN

In [None]:
# install CTGAN from PyPi
!pip install sdv

In [None]:
from sdv.tabular import CTGAN

# we will use the Higgs Boson Dataset for generation
higgs_csv = '../input/higs-bonsons-and-background-process/train.csv'
data = pd.read_csv(higgs_csv)

# training process may take up to hours if on CPU
ctgan_model = CTGAN(verbose=True)   # verbose=True: report progress while fitting
ctgan_model.fit(data)

In [None]:
# draw 800 synthetic rows from the CTGAN model
new_data = ctgan_model.sample(num_rows=800)
# bare expression: the notebook renders the sampled table as the cell output
new_data

In [None]:
# 800 real rows drawn without replacement, for a side-by-side comparison
real_subset = data.iloc[np.random.choice(len(data), size=800, replace=False)]

# sns.pairplot creates its own figure, so no plt.figure() call precedes it
# (a preceding plt.figure(figsize=(50, 50), dpi=400) only produced a blank, very large canvas).
# The first column is skipped: columns 1-5 go on the x axes, columns 6-10 on the y axes.
# First plot: CTGAN samples; second plot: real data.
for frame in (new_data, real_subset):
    sns.pairplot(frame,
                 x_vars=frame.columns[1:6],
                 y_vars=frame.columns[6:11],
                 kind='kde')
    plt.show()


In [None]:
# an example out of the dataset's context
# the generated column of 'DER_mass_MMC' will be treated as names and anonymized accordingly
# the full list of categories can be found at
# https://sdv.dev/SDV/user_guides/single_table/ctgan.html#anonymizing-personally-identifiable-information-pii
pii_fields = {'DER_mass_MMC': 'name'}

# NOTE(review): this rebinds `ctgan_model` to a new instance and no fit() call is made in this cell
ctgan_model = CTGAN(primary_key='EventId', anonymize_fields=pii_fields)

In [None]:
from sdv.sampling import Condition

# NOTE(review): `ctgan_model` has to be fitted on data containing these columns
# before either sampling call below can succeed -- confirm against the cell that builds it.

# request rows whose 'DER_deltar_tau_lep' equals 2.0
condition = Condition({
    'DER_deltar_tau_lep': 2.0,
    # categorical features' values can be passed in as a string
})
# sample_conditions takes a list of Condition objects
constrained_sample = ctgan_model.sample_conditions(conditions=[condition])

# fix one column to known values and let the model fill in the remaining columns
given_columns = pd.DataFrame({
    # arbitrary values
    "DER_mass_MMC": [120.2, 117.3, -988, 189.9]
})
# note: this overwrites the result of the sample_conditions call above
constrained_sample = ctgan_model.sample_remaining_columns(given_columns)