In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
import argparse
import matplotlib
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms

from tqdm import tqdm
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.utils import save_image
matplotlib.style.use('ggplot')

In [13]:
import pandas as pd
df = pd.read_csv("absorbance.csv")
# df = pd.read_csv("wba_data_CLEAN.csv")

In [14]:
wba_df = df.iloc[:,39:]
# wba_df = df.copy(deep=True)
batch_size = 32
train_data = torch.tensor(wba_df.values, dtype=torch.float32)
train_data

tensor([[0.0000e+00, 8.0000e+00, 0.0000e+00,  ..., 2.5300e-01, 2.2820e-01,
         2.0030e-01],
        [1.0000e+00, 2.2000e+01, 0.0000e+00,  ..., 2.0480e-01, 1.9170e-01,
         1.6590e-01],
        [2.0000e+00, 2.4000e+01, 1.0000e+00,  ..., 3.2370e-01, 3.0560e-01,
         2.8810e-01],
        ...,
        [2.3600e+02, 4.9400e+02, 1.0000e+00,  ..., 3.1360e-01, 2.8710e-01,
         2.6790e-01],
        [2.3700e+02, 4.9500e+02, 1.0000e+00,  ..., 6.1200e-02, 4.2700e-02,
         3.1000e-03],
        [2.3800e+02, 4.9600e+02, 1.0000e+00,  ..., 4.0810e-01, 3.8740e-01,
         3.8520e-01]])

In [15]:
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, shuffle=True
)

In [21]:
# define a simple linear VAE
class LinearVAE(nn.Module):
    def __init__(self):
        super(LinearVAE, self).__init__()
        self.wba_input_size = 107
        self.latent_dim = 10
 
        # encoder
        self.enc1 = nn.Linear(in_features=self.wba_input_size, out_features=self.wba_input_size//2)
        self.enc2 = nn.Linear(in_features=self.wba_input_size//2, out_features=self.wba_input_size//3)
        self.enc3 = nn.Linear(in_features=self.wba_input_size//3, out_features=self.latent_dim*2)
 
        # decoder 
        self.dec1 = nn.Linear(in_features=self.latent_dim, out_features=self.wba_input_size//2)
        self.dec2 = nn.Linear(in_features=self.wba_input_size//2, out_features=self.wba_input_size//3)
        self.dec3 = nn.Linear(in_features=self.wba_input_size//3, out_features=self.wba_input_size)

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5*log_var) # standard deviation
        eps = torch.randn_like(std) # `randn_like` as we need the same size
        sample = mu + (eps * std) # sampling as if coming from the input space
        return sample
 
    def encode(self, x):
        # encoding
        x = F.leaky_relu(self.enc1(x))
        x = F.leaky_relu(self.enc2(x))
        x = self.enc3(x).view(-1, 2, self.latent_dim)
        # get `mu` and `log_var`
        mu = x[:, 0, :] # the first feature values as mean
        log_var = x[:, 1, :] # the other feature values as variance
        return mu, log_var

    def decode(self, z):
        x = F.leaky_relu(self.dec1(z))
        x = F.leaky_relu(self.dec2(x))
        reconstruction = torch.sigmoid(self.dec3(x))
        return reconstruction

    def forward(self, x):
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        reconstruction = self.decode(z)
        return reconstruction, mu, log_var
    
    def sample(self, num_samples, current_device):
        z = torch.randn(num_samples, self.latent_dim)
        z = z.to(current_device)
        samples = self.decode(z)
        return samples

    def generate(self, x):
        return self.forward(x)[0]

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LinearVAE().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
criterion = nn.BCELoss(reduction='sum')

In [23]:
def final_loss(bce_loss, mu, logvar):
    """
    This function will add the reconstruction loss (BCELoss) and the 
    KL-Divergence.
    KL-Divergence = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    :param bce_loss: recontruction loss 
    :param mu: the mean from the latent vector
    :param logvar: log variance from the latent vector
    """
    BCE = bce_loss 
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

In [24]:
def fit(model, dataloader):
    model.train()
    running_loss = 0.0
    for i, data in tqdm(enumerate(dataloader), total=int(len(train_data)/dataloader.batch_size)):
        data = data.to(device)
        data = data.view(data.size(0), -1)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(data)
        bce_loss = criterion(reconstruction, data)
        loss = final_loss(bce_loss, mu, logvar)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    train_loss = running_loss/len(dataloader.dataset)
    return train_loss

In [25]:
epochs = 1000
for epoch in range(epochs):
    # print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss = fit(model, train_loader)
    # print(f"Train Loss: {train_epoch_loss:.4f}")

8it [00:00, 449.36it/s]              
8it [00:00, 523.28it/s]              
8it [00:00, 574.59it/s]              
8it [00:00, 605.64it/s]              
8it [00:00, 539.69it/s]              
8it [00:00, 563.49it/s]              
8it [00:00, 567.02it/s]              
8it [00:00, 595.26it/s]              
8it [00:00, 567.86it/s]              
8it [00:00, 559.32it/s]              
8it [00:00, 558.50it/s]              
8it [00:00, 634.86it/s]              
8it [00:00, 553.36it/s]              
8it [00:00, 524.42it/s]              
8it [00:00, 600.37it/s]              
8it [00:00, 561.56it/s]              
8it [00:00, 575.12it/s]              
8it [00:00, 575.86it/s]              
8it [00:00, 579.12it/s]              
8it [00:00, 631.66it/s]              
8it [00:00, 565.81it/s]              
8it [00:00, 617.14it/s]              
8it [00:00, 580.89it/s]              
8it [00:00, 603.69it/s]              
8it [00:00, 567.54it/s]              
8it [00:00, 546.56it/s]              
8it [00:00, 

In [32]:
# sns.set(rc={'figure.figsize':(15,10)})
# generated_samples = model.sample(100, device)
# generated_samples = generated_samples.cpu().detach()
# generated_samples
# generated_df = pd.DataFrame(generated_samples)
# generated_df.columns = wba_df.columns
# generated_df

Unnamed: 0.1,Unnamed: 0,Subject,Gender,AgeY,PTA0.5,PTA1,PTA2,PTA4,PTAAv4FA,ECV,...,f(6168.8433),f(6349.6042),f(6535.6618),f(6727.1713),f(6924.2925),f(7127.1897),f(7336.0323),f(7550.9945),f(7772.2555),f(8000.0000)
0,0.959687,0.961017,0.467895,0.957767,0.953373,0.962800,0.944985,0.964236,0.951006,0.693967,...,0.518945,0.438503,0.426514,0.481926,0.517834,0.433519,0.468229,0.499365,0.499873,0.466425
1,0.485764,0.500111,0.452105,0.532784,0.511923,0.573419,0.512627,0.502371,0.516934,0.522819,...,0.535578,0.426508,0.497013,0.513535,0.547084,0.444050,0.519769,0.471671,0.446383,0.499444
2,0.851826,0.827207,0.471316,0.850688,0.823919,0.849717,0.827352,0.851043,0.838742,0.624497,...,0.494884,0.432775,0.491639,0.501517,0.505031,0.469961,0.498299,0.495046,0.509975,0.520110
3,1.000000,1.000000,0.483982,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.980823,...,0.290042,0.287749,0.317447,0.307681,0.275490,0.261834,0.267609,0.265041,0.288455,0.274905
4,0.999586,0.999609,0.448347,0.999582,0.999508,0.999622,0.999461,0.999634,0.999541,0.862597,...,0.445088,0.386998,0.392659,0.404784,0.416524,0.356089,0.402388,0.389682,0.401734,0.389366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.491258,0.506731,0.443629,0.527053,0.504374,0.563482,0.499832,0.531441,0.535657,0.525146,...,0.509938,0.447808,0.481957,0.509945,0.556147,0.457521,0.523786,0.492850,0.461244,0.489946
96,0.996476,0.996303,0.439868,0.996393,0.996643,0.996998,0.995032,0.997021,0.996349,0.782171,...,0.460565,0.415796,0.413850,0.455945,0.506500,0.349831,0.400028,0.386837,0.394321,0.400008
97,0.999760,0.999790,0.454804,0.999780,0.999748,0.999811,0.999716,0.999811,0.999763,0.886557,...,0.446104,0.371032,0.379704,0.397180,0.443576,0.340140,0.373994,0.381866,0.362976,0.362606
98,0.999967,0.999972,0.418640,0.999969,0.999970,0.999972,0.999967,0.999976,0.999968,0.911627,...,0.420329,0.369690,0.314352,0.366368,0.363487,0.348369,0.344445,0.358213,0.362475,0.345703


In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(rc={'figure.figsize':(15,10)})
generated_samples = model.sample(100, device)
generated_samples = generated_samples.cpu().detach()
generated_df = pd.DataFrame(generated_samples, columns=df.iloc[:,39:].columns)
wba_df_scatter = generated_df.melt()
sns.boxplot(x="variable", y="value", data=wba_df_scatter)

ValueError: Shape of passed values is (100, 128), indices imply (100, 89)