In [11]:
# Derived from the DCGAN paper's Parzen Estimation LL calculations:
#
# Fundamentally, the method is unchanged; the updates are: Python 2 -> Python 3.7+ syntax,
# verbose comments on the LL methods, removal of the CLI wrapper, and handling for passing
# torchvision.datasets data to Theano
#
# See: https://github.com/goodfeli/adversarial/blob/master/parzen_ll.py

# NLL Functions from DCGAN Paper: Credit: Yann N. Dauphin

# NOTE: On Sagemaker, use `conda_amazonei_pytorch_latest_p37` (OR `conda_pytorch_latest_p36`)

import theano
import theano.tensor as T
import numpy as np
import gc
import datetime

# Torch Deps
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms

# DCGAN
import gaudi_dcgan as dcgan

In [None]:
class LimitDataset(torch.utils.data.Dataset):
    """
    Simple wrapper around a map-style dataset that exposes at most `n` of its
    leading data-points, so a DataLoader built on it never fetches more than that.
    -------
    Args:
        dataset - any object supporting `len()` and integer indexing
        n - int - maximum # of data-points to expose
    """

    def __init__(self, dataset, n):
        self.dataset = dataset
        self.n = n

    def __len__(self):
        """
        Clobber the old length, but never report more items than the wrapped
        dataset holds; otherwise samplers would hand out indices that do not exist.
        """
        return max(0, min(self.n, len(self.dataset)))

    def __getitem__(self, i):
        """
        Return item `i` of the limited window (integer index; negative values
        count back from the end of the window).
        -------
        Raises:
            IndexError - if `i` falls outside the limited window
        """
        limit = len(self)
        if i < 0:
            i += limit
        # Enforce the limit here as well, so direct indexing cannot reach past it.
        if not 0 <= i < limit:
            raise IndexError(f"index out of range for LimitDataset of length {limit}")
        return self.dataset[i]

In [10]:
def get_nll(x, parzen, batch_size=10):
    """
    Evaluate `parzen` over every row of `x` in batches and collect the
    per-sample values it returns.

    NOTE: The name is kept so existing callers keep working, but nothing is negated
    here. With a function built by `theano_parzen` the values are log-likelihoods.
    -------
    Args:
        x - array of samples along axis 0 (needs `.shape` and strided slicing)
        parzen - callable returning one value per row of a batch, see `theano_parzen`
        batch_size - int - target # of samples per call to `parzen`
    Returns:
        np.ndarray of per-sample values in batch order (batch `i` holds rows
        i, i + n_batches, ...), which is NOT the original row order
    Raises:
        ValueError - if batch_size < 1
    """

    # Zero would divide by zero below; a negative value would silently yield no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n_samples = x.shape[0]
    n_batches = int(np.ceil(n_samples / batch_size))
    nlls = []
    for i in range(n_batches):
        # Strided slice: batch i takes every n_batches-th row starting at row i.
        nlls.extend(parzen(x[i::n_batches]))
        if i % 10 == 0:
            print(
                f"[{datetime.datetime.utcnow().__str__()}]\t[{i}/{n_batches}]\tMean NLL: {np.mean(nlls)}"
            )

    return np.array(nlls)


def log_mean_exp(a):
    """
    Symbolic log(mean(exp(a))) along axis 1 of a Theano matrix expression.

    The per-row maximum is subtracted before exponentiating and added back
    afterwards, so the largest exponent evaluated in each row is exp(0).
    """
    row_max = a.max(1)
    shifted = a - row_max.dimshuffle(0, "x")
    return row_max + T.log(T.exp(shifted).mean(1))


def theano_parzen(mu, sigma):
    """
    Build a compiled Theano function that scores rows of an input matrix under
    a Parzen window estimate whose kernels are centred on the rows of `mu`
    (i.e. samples from G).
    -------
    Args:
        - mu - np.Array - Samples from G cast to NDArray and reshaped to 2-D
        - sigma - float32 - proposed sigma value for Parzen Kernel
    Returns:
        theano.function taking one matrix and returning one value per row
    """

    samples = T.matrix()
    centres = theano.shared(mu)

    # Pairwise differences: broadcast the inputs against the kernel centres, scaled by sigma.
    scaled_diff = (
        samples.dimshuffle(0, "x", 1) - centres.dimshuffle("x", 0, 1)
    ) / sigma

    # Log of the mean (over centres) of exp(-0.5 * squared distance).
    log_kernel_mean = log_mean_exp(-0.5 * (scaled_diff ** 2).sum(2))

    # Normalising term: one log(sigma * sqrt(2 * pi)) per input dimension.
    log_norm = centres.shape[1] * T.log(sigma * np.sqrt(np.pi * 2))

    return theano.function([samples], log_kernel_mean - log_norm)


def cross_validate_sigma(g_samples, data, sigmas, batch_size):
    """
    Select optimal kernel size for Parzen: build one estimator per candidate
    sigma and keep the candidate with the highest mean score over `data`.
    -------
    Args:
        g_samples - numpy.ndarray - Sample images from G
        data - numpy.ndarray - Sample images from MSLS
        sigmas - numpy.ndarray - array of sigmas to test
        batch_size - int - batch size forwarded to `get_nll`
    Returns:
        the element of `sigmas` with the highest mean score
    Raises:
        ValueError - if `sigmas` is empty
    """

    if len(sigmas) == 0:
        raise ValueError("sigmas must contain at least one candidate value")

    lls = []
    for sigma in sigmas:
        print(f"[{datetime.datetime.utcnow().__str__()}]\t[σ = {sigma}]")

        parzen = theano_parzen(g_samples, sigma)
        scores = get_nll(data, parzen, batch_size=batch_size)
        lls.append(np.asarray(scores).mean())

        # Drop the compiled function before building the next one to free memory.
        del parzen
        gc.collect()

    best_sigma = sigmas[np.argmax(lls)]
    # Report the selected sigma, not the loop variable (the last candidate tried).
    print(f"[{datetime.datetime.utcnow().__str__()}]\t[Using: σ = {best_sigma}]")
    return best_sigma

In [None]:
# Get Dataset...

# Inputs
DATAROOT = "/efs/samples"
IMG_SIZE = 64
BATCH_SIZE = 128
DATASET_SIGMA = None  # set to a known sigma to skip the cross-validation step
MAX_REAL_SAMPLES = 2000  # upper bound on # of real images pulled into memory

# See Section `Data and Translations` for discussion on what this dataloader
# sequence does

dataset = dset.ImageFolder(
    root=DATAROOT,
    transform=transforms.Compose(
        [
            transforms.RandomAffine(degrees=0, translate=(0.2, 0.0)),
            transforms.CenterCrop(IMG_SIZE * 4),
            transforms.Resize(IMG_SIZE),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    ),
)

# Use LimitDataset wrapper to ensure this doesn't blow up memory...
# The limit is capped at the folder's real size: asking for more items than exist
# would make the loader sample indices the dataset cannot serve.
limited_msls_data = LimitDataset(dataset, min(MAX_REAL_SAMPLES, len(dataset)))

# WARNING: Create the dataloader - We're just going to have it load in a single iteration
# I *KNOW* we're working on a small dataset (<2,000 imgs), but this is generally a bad idea!!
msls_real_loader = torch.utils.data.DataLoader(
    limited_msls_data,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    batch_size=len(limited_msls_data),  # one batch == the whole (limited) dataset
)

# Take the images from the first (only) batch; element [0] is the image tensor.
msls_real_data = next(iter(msls_real_loader))[0].numpy()
print(f"Shape after Fetch From Loader: {msls_real_data.shape}")

# Flatten every image to a single row so Theano receives a 2-D matrix.
msls_real_data = msls_real_data.reshape(msls_real_data.shape[0], -1)

print(f"Shape after Reshape: {msls_real_data.shape}")

In [None]:
# Draw samples from G(Z)
model_cfg = dcgan.ModelCheckpointConfig()
train_cfg = dcgan.TrainingConfig()

# Per the original note the generator output is expected to be
# torch.Size([16, 3, 64, 64]); it is converted to numpy for Theano.
fake_batch = dcgan.generate_fake_samples(
    n_samples=16, train_cfg=train_cfg, model_cfg=model_cfg, as_of_epoch=4
)
generated_data = fake_batch.numpy()

print(f"Shape after Generation: {generated_data.shape}")

# Flatten each sample to one row so Theano receives a 2-D matrix.
n_generated, *per_sample_dims = generated_data.shape
generated_data = generated_data.reshape((n_generated, np.prod(per_sample_dims)))

print(f"Shape after Reshape: {generated_data.shape}")

In [None]:
# When a sigma for the Gaussian is already known (DATASET_SIGMA is set to a truthy
# value), reuse it; otherwise select one by cross-validation

sigma = (
    DATASET_SIGMA
    if DATASET_SIGMA
    else cross_validate_sigma(
        generated_data,
        msls_real_data,
        np.logspace(-1.0, 0, num=10),  # Default Sigma Space...
        BATCH_SIZE,  # Default Batch Size
    )
)

gc.collect()

# Fit on the generated samples, then evaluate on the real images
parzen = theano_parzen(generated_data, sigma)
ll = get_nll(msls_real_data, parzen, batch_size=BATCH_SIZE)

# Standard error of the mean over the real images
n_real = msls_real_data.shape[0]
se = ll.std() / np.sqrt(n_real)

print(f"Log-Likelihood of Test Set = {ll.mean()}, se: {se}")