# Baseline

## Load datasets

In [None]:
from google.colab import drive 
drive.mount('/content/gdrive/') 
# %%capture
# !tar -xzvf ../content/gdrive/MyDrive/IDL\ 11785/project/knnw-720p.tar.gz -C ../content/gdrive/MyDrive/IDL\ 11785/project/

In [None]:
!/opt/bin/nvidia-smi

In [None]:
!kill -9 -1

In [6]:
!pip install torch===1.7.1 torchvision===0.8.2 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch===1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/90/5d/095ddddc91c8a769a68c791c019c5793f9c4456a688ddd235d6670924ecb/torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8MB)
[K     |████████████████████████████████| 776.8MB 23kB/s 
[?25hCollecting torchvision===0.8.2
[?25l  Downloading https://files.pythonhosted.org/packages/94/df/969e69a94cff1c8911acb0688117f95e1915becc1e01c73e7960a2c76ec8/torchvision-0.8.2-cp37-cp37m-manylinux1_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 204kB/s 
[?25hCollecting torchaudio===0.7.2
[?25l  Downloading https://files.pythonhosted.org/packages/37/16/ecdb9eb09ec6b8133d6c9536ea9e49cd13c9b5873c8488b8b765a39028da/torchaudio-0.7.2-cp37-cp37m-manylinux1_x86_64.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 54.7MB/s 
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.7.1 which is incompa

In [42]:
import torch
from torch import nn
from torch.utils import data
import torchvision as vis
import sys

# torch.manual_seed(117850791)
is_windows = sys.platform == "win32"
has_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if has_cuda else "cpu")
device

device(type='cuda', index=0)

In [32]:
from PIL import Image
import os

class FlatImageData(vis.datasets.VisionDataset):
  def __init__(self, root, transform, validation_reserved_images=25947, win_len=3):
    self.root = root
    self.images = os.listdir(root)
    self.images.sort(key=lambda x: int(x[6:-5]))# sort by frame no.
    self.transform = transform
    self.training_mode = True
    self.reserved_images = validation_reserved_images
    self.win_len = 3
        
  def __len__(self):
    if self.training_mode:
      return len(self.images) - self.reserved_images - self.win_len + 1
    else:
      return self.reserved_images - self.win_len + 1
    
  def pil_loader(self, path: str) -> Image.Image:
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')

  def __getitem__(self, index):
    if self.training_mode:
      index += self.reserved_images
    
    image_name = [self.images[index+i] for i in range(self.win_len)]
    image_path = [f"{self.root}/{i}" for i in image_name]
    image = [self.pil_loader(img) for img in image_path]
    if self.transform is not None:
         image = [self.transform(img) for img in image]
    return image

  def collate_fn(batch):
      return torch.as_tensor(batch)

In [34]:
PATH = "/content/gdrive/MyDrive/IDL 11785/project/knnw-720p"
dataset = FlatImageData(root=PATH, #"/home/ubuntu/data/knnw-720p",
                             transform=vis.transforms.Compose([
                               vis.transforms.RandomHorizontalFlip(),
                               vis.transforms.RandomApply(nn.ModuleList([
                                 vis.transforms.RandomAffine(degrees=15),
                                 vis.transforms.CenterCrop((1024, 576))
                               ]), p=0.5),
                               vis.transforms.ToTensor(),
                               nn.AdaptiveAvgPool2d((128, 128))
                             ])
                            )
dataset

Dataset FlatImageData
    Number of datapoints: 12408
    Root location: /content/gdrive/MyDrive/IDL 11785/project/knnw-720p

## Train Model

In [8]:
import os
model_store = "model_checkpoints"

class StoredModel:
  def __init__(self, model, optimizer, scheduler, criterion):
    self.model = model
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.criterion = criterion

In [48]:
from torch.nn import functional as F
from typing import List, Callable, Union, Any, TypeVar, Tuple
Tensor = TypeVar('torch.tensor')

class BetaVAE(nn.Module):

    num_iter = 0 # Global static variable to keep track of iterations

    def __init__(self,
                 in_channels: int,
                 latent_dim: int,
                 hidden_dims: List = None,
                 beta: int = 4,
                 gamma:float = 1000.,
                 max_capacity: int = 25,
                 Capacity_max_iter: int = 1e5,
                 loss_type:str = 'B',
                 **kwargs) -> None:
        super(BetaVAE, self).__init__()

        self.latent_dim = latent_dim
        self.beta = beta
        self.gamma = gamma
        self.loss_type = loss_type
        self.C_max = torch.Tensor([max_capacity])
        self.C_stop_iter = Capacity_max_iter

        modules = []
        if hidden_dims is None:
            hidden_dims = [32, 64, 128, 256, 512]

        # Build Encoder
        for h_dim in hidden_dims:
            modules.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, out_channels=h_dim,
                              kernel_size= 3, stride= 2, padding  = 1),
                    nn.BatchNorm2d(h_dim),
                    nn.LeakyReLU())
            )
            in_channels = h_dim
            
        modules.append(nn.Flatten())

        self.encoder = nn.Sequential(*modules)
        
        self.fc_mu = nn.Linear(hidden_dims[-1]*16*3, latent_dim)
        self.fc_var = nn.Linear(hidden_dims[-1]*16*3, latent_dim)


        # Build Decoder
        modules = []

        self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1] * 16)

        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.ConvTranspose2d(hidden_dims[i],
                                       hidden_dims[i + 1],
                                       kernel_size=3,
                                       stride = 2,
                                       padding=1,
                                       output_padding=1),
                    nn.BatchNorm2d(hidden_dims[i + 1]),
                    nn.LeakyReLU())
            )



        self.decoder = nn.Sequential(*modules)

        self.final_layer = nn.Sequential(
                            nn.ConvTranspose2d(hidden_dims[-1],
                                               hidden_dims[-1],
                                               kernel_size=3,
                                               stride=2,
                                               padding=1,
                                               output_padding=1),
                            nn.BatchNorm2d(hidden_dims[-1]),
                            nn.LeakyReLU(),
                            nn.Conv2d(hidden_dims[-1], out_channels= 3,
                                      kernel_size= 3, padding= 1),
                            nn.Tanh())

    def encode(self, inputs: Tensor) -> List[Tensor]:
        """
        Encodes the input by passing through the encoder network
        and returns the latent codes.
        :param input: (Tensor) Input tensor to encoder [N x C x H x W]
        :return: (Tensor) List of latent codes
        """
        result = torch.cat([self.encoder(input) for input in inputs], dim=1)

        # Split the result into mu and var components
        # of the latent Gaussian distribution
        mu = self.fc_mu(result)
        log_var = self.fc_var(result)

        return (inputs, mu, log_var)

    def decode(self, z: Tensor) -> Tensor:
        result = self.decoder_input(z)
        result = result.view(-1, 512, 4, 4)
        result = self.decoder(result)
        result = self.final_layer(result)
        return result

    def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor:
        """
        Will a single z be enough ti compute the expectation
        for the loss??
        :param mu: (Tensor) Mean of the latent Gaussian
        :param logvar: (Tensor) Standard deviation of the latent Gaussian
        :return:
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def forward(self, inputs: Tensor, **kwargs) -> Tensor:
        pooled_inputs, mu, log_var = self.encode(inputs)
        z = self.reparameterize(mu, log_var)
        
        self.current_inputs = pooled_inputs
        self.current_mu = mu
        self.current_log_var = log_var
        self.current_recon = self.decode(z)
        
        return self.current_recon

    def loss(self, *args, **kwargs) -> dict:
        self.num_iter += 1
        recons = self.current_recon
        input = self.current_inputs
        mu = self.current_mu
        log_var = self.current_log_var
        kld_weight = kwargs['kld_weight']  # Account for the minibatch samples from the dataset
        
        # since the image value is normalized between 0~1, BCE loss is better
        # recons_loss =F.binary_cross_entropy(recons, input)
        recons_loss = sum([F.mse_loss(recons, img) * (255 ** 2) for img in input])
  
        kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0)

        if self.loss_type == 'H': # https://openreview.net/forum?id=Sy2fzU9gl
            loss = recons_loss + self.beta * kld_weight * kld_loss
        elif self.loss_type == 'B': # https://arxiv.org/pdf/1804.03599.pdf
            self.C_max = self.C_max.to(device) #input.device
            C = torch.clamp(self.C_max/self.C_stop_iter * self.num_iter, 0, self.C_max.data[0])
            loss = recons_loss + self.gamma * kld_weight * (kld_loss - C).abs()
        else:
            raise ValueError('Undefined loss type.')

        return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':kld_loss}

    def sample(self,
               num_samples:int,
               current_device: int, **kwargs) -> Tensor:
        """
        Samples from the latent space and return the corresponding
        image space map.
        :param num_samples: (Int) Number of samples
        :param current_device: (Int) Device to run the model
        :return: (Tensor)
        """
        z = torch.randn(num_samples,
                        self.latent_dim)

        z = z.to(current_device)

        samples = self.decode(z)
        return samples

    def generate(self, x: Tensor, **kwargs) -> Tensor:
        """
        Given an input image x, returns the reconstructed image
        :param x: (Tensor) [B x C x H x W]
        :return: (Tensor) [B x C x H x W]
        """

        return self.forward(x)[0]

### Resume from checkpoint or a new model?

#### train a new model

In [49]:
import torchsummary
model_id = "model_03"

model = BetaVAE(3, 32)

epoch_start = 0
model.to(device)
print(model)

# model_spec = torchsummary.summary_string(model, (3, 128, 128))[0]
# print(model_spec)

BetaVAE(
  (encoder): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (1): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (2): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (3): Sequential(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (4): Seque

In [18]:
PATH = "/content/gdrive/MyDrive/IDL 11785/project"

os.mkdir(f"{PATH}/{model_store}/{model_id}")
# save model summary to a txt file
with open(f"{PATH}/{model_store}/{model_id}/model_spec.txt", "w") as file:
  file.write(str(model) + "\n")
  # file.write(model_spec)

#### load a trained model from checkpoint 

In [22]:
def load_model(model_id, specific_epoch = None):
  global optimizer, scheduler
  epoch_start = -1
  for checkpoint in os.listdir(f"{PATH}/{model_store}/{model_id}"):
    if not checkpoint.startswith("epoch"):
      continue
    epoch = int(checkpoint.split("_")[1])
    if specific_epoch is None:
      # find the latest
      if epoch > epoch_start:
        epoch_start = epoch
        last_checkpoint = checkpoint
    else:
      if epoch == specific_epoch:
        epoch_start = epoch
        last_checkpoint = checkpoint
        break

  if epoch_start == -1:
    print(f"No checkpoints available for {model_id}!")
    return -1, None
  else:
    epoch_start += 1
    print(f"resuming from last checkpoint {last_checkpoint}")
    data = torch.load(f"{model_store}/{model_id}/{last_checkpoint}")
    
    model = data.model
    optimizer = data.optimizer
    scheduler = data.scheduler
    criterion = data.criterion
    
    model.to(device)
    return epoch_start, model, criterion

In [None]:
model_id = "model_03"
epoch_start, model, criterion = load_model(model_id)
print(model)

### Start training

In [24]:
# clear GPU cache
if has_cuda:
  torch.cuda.empty_cache()

In [43]:
train_dataloader_args = dict(batch_size=128,
                             num_workers=0 if is_windows else 4) if has_cuda else dict(batch_size=64)
train_dataloader_args["shuffle"] = True

train_dataloader = data.DataLoader(dataset, **train_dataloader_args)

  data = _utils.pin_memory.pin_memory(data)


In [45]:
from torch import optim
from itertools import chain

num_epochs = 100

if epoch_start == 0:
  # define only at the start of the training
  
  regularization = 2e-5
#   learning_rate = 1e-1
#   optimizer = optim.SGD(chain(model.parameters(), criterion.parameters()),
#                          lr = learning_rate, momentum=0.9, weight_decay=regularization, nesterov=True)
  learning_rate = 1e-3
  optimizer = optim.Adam(model.parameters(),
                         lr = learning_rate, weight_decay=regularization)
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.5)

# scaler = torch.cuda.amp.GradScaler() # mix-precision training

with open(f"/content/gdrive/MyDrive/IDL 11785/project/{model_store}/{model_id}/training_params.txt", "w") as file:
  file.write(f"num_epochs = {num_epochs}\n")
  file.write(f"optimizer = {optimizer}\n")
  file.write(f"scheduler = {type(scheduler).__name__}({scheduler.state_dict()})\n")

In [52]:
from tqdm import tqdm
import sys
import json

print(f"Model: {model_id}. Training for {num_epochs} epochs", file=sys.stderr)

for epoch in tqdm(range(epoch_start, num_epochs), desc="Train"):
  print(f"Epoch {epoch}", file=sys.stderr)
  
  # set model in training mode
  model.train()
  training_loss = 0.0
  reconstruction_loss = 0.0
  kld_loss = 0.0

  for x in train_dataloader:
    optimizer.zero_grad() # clear calculated gradients

    x = [i.to(device) for i in x]
    
    with torch.cuda.amp.autocast():
      output = model(x)
      all_loss = model.loss(kld_weight=1.0)
      loss = all_loss["loss"]
    
    # backpropo loss and accumuate loss stat
    # scaler.scale(loss).backward()    
    
    training_loss += loss.detach().item() # otherwise this would be a tensor
    reconstruction_loss += all_loss['Reconstruction_Loss'].detach().item()
    kld_loss += all_loss['KLD'].detach().item()

    # scaler.step(optimizer)
    # scaler.update()
    loss.backward()
    optimizer.step()
    
  # let scheduler know it's the next epoch
  scheduler.step()
  
  training_loss /= len(train_dataloader)
  reconstruction_loss /= len(train_dataloader)
  kld_loss /= len(train_dataloader)
  
  log_str = json.dumps({
    "Epoch": epoch,
    "training loss": round(training_loss, 6),
    "reconstruction loss": round(reconstruction_loss, 6),
    "KLD loss": round(kld_loss, 6),
    "Learning rate": scheduler._last_lr
  })

#   log_str = f"Epoch {epoch}: training loss {training_loss:.6f}, " +\
#             f"reconstruction loss {reconstruction_loss:.6f}, kld_loss {kld_loss:.6f}"+\
#             f" Learning Rate: {scheduler._last_lr}"
 
  with open(f"{model_store}/{model_id}/training_logs.txt", "a") as log_file:
    log_file.write(log_str + "\n")
  print(log_str, file=sys.stderr)
  
  torch.save(StoredModel(model, optimizer, scheduler, None),
             f"{model_store}/{model_id}/epoch_{epoch:02d}" +\
             f"_tr-loss_{training_loss:.6f}")

Model: model_03. Training for 100 epochs





Train:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A[AEpoch 0
  data = _utils.pin_memory.pin_memory(data)


FileNotFoundError: ignored

In [None]:
from tqdm import tqdm

validataion_dataloader_args = dict(batch_size=128,
                             num_workers=0 if is_windows else 2) if has_cuda else dict(batch_size=64)
validataion_dataloader_args["shuffle"] = False

validataion_dataloader = data.DataLoader(dataset, **validataion_dataloader_args)

# set model in training mode
model.eval()

latent_mu = list()
latent_log_var = list()

for i, x in enumerate(tqdm(validataion_dataloader, desc="Validate")):
  x = x.to(device)

  _, mus, log_vars = model.encode(x)
  latent_mu.append(mus.detach().cpu())
  latent_log_var.append(log_vars.detach().cpu())

Validate: 100%|██████████| 1500/1500 [41:09<00:00,  1.65s/it]


In [None]:
torch.save((torch.vstack(latent_mu), torch.vstack(latent_log_var)), f"latent_vectors/{model_id}")

In [None]:
L2_divergence_raw = list()

image_1 = dataset[0].to(device)

for i in tqdm(range(len(dataset) - 1), desc="L2"):
  image_2 = dataset[i + 1].to(device)
  
  diff = (image_1 - image_2).flatten()
  
  L2_divergence_raw.append(torch.linalg.norm(diff, 2).cpu().item())
  
  image_1 = image_2

L2: 100%|██████████| 191880/191880 [48:40<00:00, 65.69it/s]


In [None]:
torch.save(torch.tensor(L2_divergence_raw), f"temp_store/{model_id}/l2_divergence_raw")

In [None]:
normalize = lambda X, mn, mx: [(x - mn)/(mx - mn) for x in X]

63.88896179199219