In [1]:
# Put the parent directory on the import path; presumably this is where the
# project modules imported in the next cell live (TODO confirm).
import sys; sys.path.append("../")

In [2]:
from dataset_wrappper import NewsData
from utils_train import transfer_batch_to_device
import os
from run_validation import load_model_for_eval
from pathlib import Path
import matplotlib.pyplot as plt
from transformers import RobertaTokenizerFast
import torch
import torch.nn.functional as F
from utils_evaluation import tokenizer_batch_decode
import numpy as np
import pickle
import tqdm

# Reload matplotlib's style library before selecting a style by name.
plt.style.reload_library()
# Apply the style called 'thesis_style' (presumably a user-defined style sheet — TODO confirm).
plt.style.use('thesis_style')

# Render figures inline in the notebook output.
%matplotlib inline

In [3]:
# Notebook-wide configuration.
BATCH_SIZE = 32  # passed to NewsData and to val_dataloader below
DEVICE = "cuda:0"  # target device string
CHECKPOINT_TYPE = "best" # selects the file "checkpoint-<CHECKPOINT_TYPE>.pth"; NOTE(review): the old note said 'else "best"', so the alternative value is unknown — confirm
RESULT_DIR = Path("result-files")  # relative directory; presumably where result files are stored (TODO confirm)

In [4]:
# Build the PTB data wrapper (RoBERTa tokenisation, sequences capped at 64 tokens)
# and take its validation loader.
data = NewsData(
    batch_size=BATCH_SIZE,
    tokenizer_name="roberta",
    dataset_name="ptb_text_only",
    max_seq_len=64,
)
validation_loader = data.val_dataloader(batch_size=BATCH_SIZE)

# Stand-alone tokenizer instance, loaded from the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Report how many validation batches there are.
print(f"{len(validation_loader)} batches with batch size {BATCH_SIZE}")

Is file!
train 42068
validation 3370
test 3761
106 batches with batch size 32


In [5]:
def get_clean_name(run_name):
    """Turn a raw run directory name into a short, readable label.

    The name is split on "-" and read positionally. For example, in
    "2021-02-03-PTB-latent32-FB-0.5-run-09:31:02" field 4 ("latent32")
    carries the latent size and field 6 ("0.5") the free-bits value.
    Names containing "autoencoder" have no free-bits field.

    Args:
        run_name (str): run directory name following the layout above.

    Returns:
        str: "NZ-<latent size> | FB-<free bits>" or
        "NZ-<latent size> | autoencoder".

    Raises:
        IndexError: if the name has too few "-"-separated fields.
    """
    fields = run_name.split("-")

    # Use every trailing digit of the latent field so that sizes that are not
    # exactly two digits wide ("latent128", "latent8") are reported correctly.
    # Without trailing digits, fall back to the last two characters.
    latent_field = fields[4]
    n_digits = len(latent_field) - len(latent_field.rstrip("0123456789"))
    latent_size = latent_field[-n_digits:] if n_digits else latent_field[-2:]

    if "autoencoder" in run_name:
        FB = "autoencoder"
    else:
        FB = fields[6]
        # Pad three-character values such as "0.5" to two decimals ("0.50").
        if len(FB) == 3:
            FB += "0"
        FB = "FB-" + FB

    return f"NZ-{latent_size} | {FB}"

In [6]:
# Map every PTB run directory name to the checkpoint file selected by CHECKPOINT_TYPE.
runs_root = Path("/home/cbarkhof/code-thesis/NewsVAE/Runs")
checkpoint_file = f"checkpoint-{CHECKPOINT_TYPE}.pth"

PTB_run_name_paths = {}
for r in os.listdir(runs_root):
    # Skip runs that are not PTB experiments
    if "PTB" not in r:
        continue
    path = runs_root / r / checkpoint_file
    PTB_run_name_paths[r] = path

# Show the readable label of each run that was found
for r in PTB_run_name_paths:
    print(get_clean_name(r))

NZ-32 | FB-0.50
NZ-32 | FB-1.00
NZ-32 | FB-1.50
NZ-64 | FB-1.00
NZ-32 | FB-0.00
NZ-32 | autoencoder
NZ-64 | FB-0.50
NZ-64 | FB-1.50
NZ-64 | FB-0.00
NZ-64 | autoencoder
NZ-32 | FB-0.75
NZ-32 | FB-0.25
NZ-64 | FB-0.75
NZ-64 | FB-0.25


In [10]:
# Print every discovered checkpoint path, each followed by an empty line
for ckpt_path in PTB_run_name_paths.values():
    print(ckpt_path, end="\n\n")

# Hand-picked checkpoint (latent size 32, FB 0.5) to load for inspection
p = "/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-0.5-run-09:31:02/checkpoint-best.pth"
latent_size = 32

vae_model = load_model_for_eval(
    p,
    device_name="cuda:0",
    latent_size=latent_size,
    add_latent_via_memory=True,
    add_latent_via_embeddings=False,
    do_tie_weights=True,
    do_tie_embedding_spaces=True,
    add_decoder_output_embedding_bias=False,
)

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-0.5-run-09:31:02/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-1.0-run-11:43:17/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-1.50-run-12:13:36/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent64-FB-1.0-run-13:06:00/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-0.00-run-14:32:09/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-autoencoder-run-17:30:41/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent64-FB-0.50-run-12:29:58/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent64-FB-1.50-run-13:22:14/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent64-FB-0.00-run-17:14:10/checkpoint-best.pth

/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-

Some weights of the model checkpoint at roberta-base were not used when initializing VAE_Encoder_RobertaModel: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing VAE_Encoder_RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VAE_Encoder_RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VAE_Encoder_RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta_new.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done model...
Loading VAE_model, optimizer and scheduler from /home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-FB-0.5-run-09:31:02/checkpoint-best.pth
Removing module string from state dict from checkpoint
Checkpoint global_step: best, epoch: 37, best_valid_loss: 84.10610651086878


In [30]:
import copy

# --- Importance-sampling settings ---------------------------------------
n_samples = 1000           # latent samples drawn per data point
device_name = DEVICE       # single source of truth: the config cell
chunk_size = 100           # latent samples decoded per decoder call
loader = validation_loader

N = len(loader)            # total number of batches (used in progress output)
max_batches = 2            # the evaluation loop stops after this many batches

# --- Per-batch accumulators filled by the evaluation loop ----------------
ppls = []                  # importance-weighted perplexity
ds = []                    # distortion (summed masked cross entropy)
log_p_xs = []              # batch mean of the estimated log p(x)
log_p_x_p_ws = []          # negative log p(x) per word
ce_p_ws = []               # masked cross entropy per word
autoregressive = False     # False: teacher-forced decoding, True: auto-regressive decoding
log_p_x_zs, log_q_z_xs, log_p_zs = [], [], []

# A single run is evaluated for now. To evaluate every run, loop over
# PTB_run_name_paths.items() instead of fixing the checkpoint path here.
p = "/home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-autoencoder-run-17:30:41/checkpoint-best.pth"

# Derive the latent size from the checkpoint path that is actually loaded.
# This used to read the loop variable `r` left behind by an earlier cell,
# which may name a different run and so select the wrong architecture.
latent_size = 32 if "latent32" in p else 64

vae_model = load_model_for_eval(p, device_name=device_name,
                                latent_size=latent_size,
                                add_latent_via_memory=True,
                                add_latent_via_embeddings=False,
                                do_tie_weights=True,
                                do_tie_embedding_spaces=True,
                                add_decoder_output_embedding_bias=False)

# Importance-weighted estimate of log p(x) and perplexity on the validation set.
# For every x, n_samples latents are drawn from the encoder and decoded in
# chunks of chunk_size; the per-sample log-likelihoods are then combined with a
# log-sum-exp over the samples.
with torch.no_grad():
    # For all batches in the validation set (stops early after max_batches)
    for batch_i, batch in enumerate(loader):
        print(f"{batch_i:3d}/{N}")

        # Send to right device
        batch = transfer_batch_to_device(batch, device_name)
        batch_size = batch["input_ids"].shape[0]

        # Targets are the inputs shifted by one position. Token id 1 is the
        # padding id, so those positions get mask 0.
        labels = batch["input_ids"][:, 1:].contiguous()
        label_mask = (labels != 1).float()
        seq_lens = label_mask.sum(dim=-1)  # non-padding target tokens per sequence
        avg_len = seq_lens.mean()          # averaged over the batch

        print("Average length:", avg_len)

        # Encode these input ids and sample <n_samples> latents for each x
        enc_out = vae_model.encoder.encode(batch["input_ids"], batch["attention_mask"],
                                           n_samples=n_samples,  # <-- number of importance samples
                                           hinge_kl_loss_lambda=0.5,
                                           return_log_q_z_x=True,
                                           return_log_p_z=True,
                                           return_embeddings=False)

        # Unpack the tensors we need; latent_z prints as (batch, n_samples, latent_size)
        latent_z, log_p_z, log_q_z_x = enc_out["latent_z"], enc_out["log_p_z"], enc_out["log_q_z_x"]

        print("latent_z.shape", latent_z.shape)

        # Rough sanity print only: exp of a mean log-density under-/overflows easily.
        print(torch.exp(enc_out["log_p_z"].mean()), torch.exp(enc_out["log_q_z_x"].mean()))

        # One entry per decoded chunk, each holding log p(x|z) for the rows of that chunk
        log_p_x_z = []

        # For all samples x in batch
        for sample_i in range(batch_size):
            print(f"sample i: {sample_i:3d}")

            # Gather all n_samples z belonging to that x_i
            latent_z_sample_i = latent_z[sample_i, :, :]

            # Repeat x_i, its attention mask and its label mask so that each
            # latent in a chunk is paired with a copy of the same sequence.
            input_ids = batch['input_ids'][sample_i, :].repeat(chunk_size, 1)
            attention_mask = batch['attention_mask'][sample_i, :].repeat(chunk_size, 1)
            label_mask_i = label_mask[sample_i, :].repeat(chunk_size, 1)

            # torch.split yields chunks of chunk_size rows plus a shorter final
            # chunk when n_samples is not a multiple of chunk_size (this also
            # covers n_samples < chunk_size). The repeated inputs are trimmed
            # to the chunk's row count so shapes always agree.
            for z_b in torch.split(latent_z_sample_i, chunk_size, dim=0):
                n_rows = z_b.shape[0]
                input_ids_b = input_ids[:n_rows]
                attention_mask_b = attention_mask[:n_rows]
                label_mask_b = label_mask_i[:n_rows]

                # Teacher forced decoding
                if not autoregressive:
                    dec_out = vae_model.decoder.forward(z_b, input_ids_b, attention_mask_b,
                                                        labels=input_ids_b.clone(),
                                                        return_cross_entropy=True,
                                                        reduce_seq_dim_ce="none",
                                                        reduce_batch_dim_ce="none")

                # Auto-regressive decoding
                else:
                    dec_out = vae_model.decoder.autoregressive_decode(z_b,
                                                                      max_seq_len=input_ids_b.shape[1],
                                                                      device_name=device_name,
                                                                      labels=input_ids_b.clone(),
                                                                      return_cross_entropy=True,
                                                                      reduce_seq_dim_ce="none",
                                                                      reduce_batch_dim_ce="none")

                # Masked cross entropy summed over the sequence: CE = - log p(x|z),
                # one value per latent in the chunk (computed once, reused below).
                nll = (dec_out["cross_entropy"] * label_mask_b).sum(dim=-1)
                log_p_x_z.append(-nll)

            # End of chunk of latents from one data sample

        # End of all samples for all data points in batch

        log_p_x_z = torch.cat(log_p_x_z, dim=0).reshape(-1, n_samples)  # make shape batch x n_samples

        # Mean distortion and mean CE per word over all (x, z) pairs. Computing
        # them from the full matrix weights every pair equally, even when the
        # last chunk is shorter than the others.
        distortion = (-log_p_x_z).mean().item()
        ce_per_word = (-log_p_x_z / seq_lens.unsqueeze(-1)).mean().item()

        # Calculate importance weighted perplexity
        # log p(x) = log 1/N sum_i^N ( p(x|z_i) * p(z_i) ) / q(z_i|x)
        # log p(x) = log sum_i^N exp( log( p(x|z_i) * p(z_i) / q(z_i|x) ) ) + log 1/N
        # log p(x) = log sum_i^N exp( log p(x|z_i) + log p(z_i) - log q(z_i|x) ) + log 1/N
        log_frac = log_p_x_z + log_p_z - log_q_z_x
        log_p_x = torch.logsumexp(log_frac, dim=-1) + np.log(1 / n_samples)  # importance weighted mean (over samples)

        # Importance weighted negative log likelihood per word
        log_p_x_p_w = - log_p_x.mean() / avg_len  # average over the batch and then words
        ppl = torch.exp(log_p_x_p_w)

        ppls.append(ppl.item())
        ds.append(distortion)
        log_p_xs.append(log_p_x.mean().item())
        log_p_x_p_ws.append(log_p_x_p_w.item())
        ce_p_ws.append(ce_per_word)

        log_p_x_zs.append(log_p_x_z.cpu().numpy())
        log_q_z_xs.append(log_q_z_x.cpu().numpy())
        log_p_zs.append(log_p_z.cpu().numpy())

        # print("log_p_x_z.shape", log_p_x_z.shape)
        # print("log_q_z_x.shape", log_q_z_x.shape)
        # print("log_p_z.shape", log_p_z.shape)

        print(f"ce per word: {ce_per_word:.2f} | D: {distortion:.2f}")
        print(f"log p x p w: {log_p_x_p_w:.2f} | ppl: {ppl:6f}")

        if batch_i == max_batches - 1:
            break

    # TODO: result aggregation is currently disabled; re-enable to collect everything in one dict.
    # log_p_x_zs = np.concatenate(log_p_x_zs, axis=0)
    # log_q_z_xs = np.concatenate(log_q_z_xs, axis=0)
    # log_p_zs = np.concatenate(log_p_zs, axis=0)

    # results = dict(PPL=ppls, distortion=ds, log_p_x=log_p_xs, log_p_x_p_w=log_p_x_p_ws, ce_p_w=ce_p_ws,
    #                log_p_x_zs=log_p_x_zs, log_q_z_xs=log_q_z_xs, log_p_zs=log_p_zs)

Loading model...
Replacing linear output layer with one without bias!


Some weights of the model checkpoint at roberta-base were not used when initializing VAE_Encoder_RobertaModel: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing VAE_Encoder_RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VAE_Encoder_RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VAE_Encoder_RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta_new.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done model...
Loading VAE_model, optimizer and scheduler from /home/cbarkhof/code-thesis/NewsVAE/Runs/2021-02-03-PTB-latent32-autoencoder-run-17:30:41/checkpoint-best.pth
Removing module string from state dict from checkpoint
Checkpoint global_step: best, epoch: 56, best_valid_loss: 62.82812358714916


  return torch.tensor(x, **format_kwargs)
  return torch.tensor(x, **format_kwargs)
  return torch.tensor(x, **format_kwargs)
  return torch.tensor(x, **format_kwargs)


  0/106
Average length: tensor(25.9375, device='cuda:0')
latent_z.shape torch.Size([32, 1000, 32])
tensor(7.4469e-40, device='cuda:0') tensor(inf, device='cuda:0')
sample i:   0
sample i:   1
sample i:   2
sample i:   3
sample i:   4
sample i:   5
sample i:   6
sample i:   7
sample i:   8
sample i:   9
sample i:  10
sample i:  11
sample i:  12
sample i:  13
sample i:  14
sample i:  15
sample i:  16
sample i:  17
sample i:  18
sample i:  19
sample i:  20
sample i:  21
sample i:  22
sample i:  23
sample i:  24
sample i:  25
sample i:  26
sample i:  27
sample i:  28
sample i:  29
sample i:  30
sample i:  31
ce per word: 2.89 | D: 78.99
log p x p w: 10.04 | ppl: 22993.521484
  1/106
Average length: tensor(26.8438, device='cuda:0')
latent_z.shape torch.Size([32, 1000, 32])
tensor(1.4122e-40, device='cuda:0') tensor(inf, device='cuda:0')
sample i:   0
sample i:   1
sample i:   2
sample i:   3
sample i:   4
sample i:   5
sample i:   6
sample i:   7
sample i:   8
sample i:   9
sample i:  10
sa

KeyboardInterrupt: 

In [27]:
# Scratch check: reshaping a flat tensor of 12 values to 3 x 4 fills row by row.
x = torch.randn(12)
for shown in (x, x.reshape(3, 4)):
    print(shown)

tensor([-2.2040, -2.2625, -0.3351, -0.2449,  0.0886, -0.7174,  0.5582, -2.0932,
         0.6206,  0.5095, -0.8901,  1.4365])
tensor([[-2.2040, -2.2625, -0.3351, -0.2449],
        [ 0.0886, -0.7174,  0.5582, -2.0932],
        [ 0.6206,  0.5095, -0.8901,  1.4365]])


In [32]:
# Decoding mode switch: the flag is True only for the "autoregressive" mode.
mode = "autoregressive"
autoregressive = (mode == "autoregressive")
autoregressive

True