In [1]:
# Utils analysis
from utils_analysis import *

# Standard
import torch
import numpy as np
import pandas as pd
import os
import sys; sys.path.append("../")

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
%config InlineBackend.figure_format='retina'

Not deleting all pareto related files, if you want to recompute, run: update(recompute=True)
Making run overview, based on dir: /home/cbarkhof/code-thesis/NewsVAE/Runs
Making run overview of /home/cbarkhof/code-thesis/NewsVAE/Runs, in /home/cbarkhof/code-thesis/NewsVAE/final-analysis/Runs_run_overview.csv
pareto epoch not in full_par_dict for run: 2021-06-02-YELP | DECODER-ONLY-run-13:01:04
pareto epoch not in full_par_dict for run: 2021-06-02-PTB | DECODER-ONLY-run-13:01:37
Reading last checkpoint and extracting pareto dict and saving it to a pickle.
Reading all pareto dicts and calculating best checkpoint, saving it to a csv
error in calc_weighted_pareto_best_checkpoint list index out of range
error in calc_weighted_pareto_best_checkpoint list index out of range
error in calc_weighted_pareto_best_checkpoint list index out of range
--------------------------------------------------
Making run overview, based on dir: /home/cbarkhof/code-thesis/NewsVAE/Runs-ablation
Making run overview 

In [2]:
def JSD(p, q, log_p, log_q):
    """
    Jensen-Shannon Divergence JSD(P || Q), computed over the last dimension.

    JSD(P || Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M),  with M = 0.5 * (P + Q)

    Args:
        p, q: tensors of probabilities (the last dimension sums to 1).
        log_p, log_q: the matching log-probabilities, same shape as p and q.

        The callers in this notebook pass batch x seq_len x vocab_size tensors,
        but any shape works as long as the distribution is on the last dimension.

    Returns:
        Tensor with the last dimension reduced away (e.g. batch x seq_len),
        holding the JSD in nats; values lie in [0, ln 2] up to rounding.
    """

    # log of the mixture M = 0.5 * (P + Q), built from the log-probabilities so that
    # entries whose probability underflowed to 0 still get a finite log-mixture value:
    # log m = log(exp(log_p) + exp(log_q)) - log 2
    log_m = torch.logaddexp(log_p, log_q) + torch.log(torch.tensor(0.5)).to(log_p)

    # F.kl_div(input, target) computes target * (log(target) - input) element-wise, where
    # `input` holds LOG-probabilities and `target` holds probabilities. So KL(P || M) is
    # F.kl_div(log_m, p) -- the mixture goes in as `input`, the distribution as `target`.
    # (Passing (log_p, m) instead would give KL(M || P), which is not the JSD.)
    kl_p_m = F.kl_div(log_m, p, reduction="none").sum(dim=-1)
    kl_q_m = F.kl_div(log_m, q, reduction="none").sum(dim=-1)

    jsd = 0.5 * kl_p_m + 0.5 * kl_q_m

    return jsd

In [3]:
from utils_train import load_from_checkpoint, transfer_batch_to_device
from dataset_wrappper import NewsData
import torch.nn.functional as F


def _probs_and_log_probs(logit_chunks):
    """Concatenate a list of logit tensors along dim 0 and return (softmax, log_softmax) over the last dim."""
    stacked = torch.cat(logit_chunks, dim=0)
    return F.softmax(stacked, dim=-1), F.log_softmax(stacked, dim=-1)


def _pad_and_concat(chunks, target_len):
    """Right-pad every [rows x seq_len] tensor with zeros to target_len columns and concatenate along dim 0."""
    return torch.cat([F.pad(x, (0, target_len - x.shape[1])) for x in chunks], dim=0)


def calc_JSD_over_seq(run_name, exp_name, max_batches, batch_size, dataset, n_exp, device="cuda:0"):
    """
    Collect per-position JSD between prior-decoded and posterior-decoded predictions.

    For up to `max_batches` validation batches, the model is run `n_exp` times in
    posterior mode and `n_exp` times with `decode_sample_from_prior=True`. Per batch
    the following are computed with `JSD`:
      * prior vs. posterior predictions,
      * posterior vs. posterior predictions of a different repetition (cyclic shift by
        one repetition), as a baseline for the variability between posterior samples,
      * their difference (prior/posterior minus posterior/posterior).

    Args:
        run_name, exp_name: identify the run; passed to `get_best_checkpoint`.
        max_batches: stop after this many validation batches.
        batch_size: batch size for the validation loader.
        dataset: dataset description string; "optimus" / "yelp" substrings select the
            matching dataset, anything else falls back to "ptb_text_only".
        n_exp: number of repeated forward passes per mode and batch.
        device: device name handed to the data wrapper and the model.

    Returns:
        None if no checkpoint path is found or no batch was processed; otherwise a dict
        with keys "jsd_over_seq", "jsd_post_post", "jsd_prior_post" and "label_masks",
        each a tensor concatenated over batches along dim 0 and zero-padded on the right
        to a common sequence length.
    """
    jsd_over_seq = []
    jsd_post_post = []
    jsd_prior_post = []
    label_masks = []

    path, _ = get_best_checkpoint(run_name, exp_name=exp_name)

    if path is None:
        return None

    #--------------------------------------------------
    # Get data that this model was trained on
    if "optimus" in dataset.lower():
        dataset_name = "optimus_yelp"
    elif "yelp" in dataset.lower():
        dataset_name = "yelp"
    else:
        dataset_name = "ptb_text_only"

    # Load relevant data
    data = NewsData(dataset_name=dataset_name, tokenizer_name="roberta", batch_size=batch_size,
                    num_workers=3, pin_memory=True, max_seq_len=64, device=device)
    val_loader = data.val_dataloader(shuffle=False, batch_size=batch_size)

    #--------------------------------------------------
    # Get model
    vae_model = load_from_checkpoint(path, world_master=True, ddp=False, device_name=device,
                                     evaluation=True, return_loss_term_manager=False)

    with torch.no_grad():
        for batch_i, batch in enumerate(val_loader):

            batch = transfer_batch_to_device(batch, device)

            # Number of sequences actually in this batch. Used instead of `batch_size` so
            # that a shorter final batch still yields prior logits with as many rows as
            # the posterior logits (identical to `batch_size` for every full batch).
            n_in_batch = batch["input_ids"].shape[0]

            logits_prior_batch = []
            logits_posterior_batch = []

            for exp_i in range(n_exp):
                print(f"Batch: {batch_i+1:3d}/{max_batches} - Exp: {exp_i+1:3d}/{n_exp}", end="\r")

                for decode_sample_from_prior_mode in [False, True]:

                    vae_output = vae_model(input_ids=batch["input_ids"],
                                           attention_mask=batch["attention_mask"],
                                           auto_regressive=False,
                                           max_seq_len=64,
                                           return_reconstruction_loss=True,
                                           return_posterior_stats=False,
                                           nucleus_sampling=False,
                                           top_k=0,
                                           top_p=1.0,

                                           return_logits=True,

                                           # these two are the relevant ones
                                           decode_sample_from_prior=decode_sample_from_prior_mode,
                                           n_prior_samples=n_in_batch,  # as many as the batch reconstructs

                                           device_name=device)

                    # Move logits off the accelerator right away; they are only needed on CPU.
                    if decode_sample_from_prior_mode is True:
                        logits_prior_batch.append(vae_output["logits"].cpu())
                    else:
                        logits_posterior_batch.append(vae_output["logits"].cpu())

            # make a reordered posterior list, to compare posterior
            # to posterior (and account for internal variability):
            # repetition i is compared against repetition i+1 (cyclically)
            re_order = list(range(1, n_exp)) + [0]
            logits_posterior_reordered = [logits_posterior_batch[i] for i in re_order]

            # After the cat operation the tensors are [n_exp * batchsize x seq_len x vocab];
            # (log-)softmax operates on the last dimension
            probs_prior_batch, log_probs_prior_batch = _probs_and_log_probs(logits_prior_batch)
            probs_posterior_batch, log_probs_posterior_batch = _probs_and_log_probs(logits_posterior_batch)
            probs_posterior_reordered_batch, log_probs_posterior_reordered_batch = \
                _probs_and_log_probs(logits_posterior_reordered)

            # JSD returns [n_exp * batchsize x seq_len]
            jsd_prior_posterior = JSD(probs_posterior_batch, probs_prior_batch,
                                      log_probs_posterior_batch, log_probs_prior_batch)
            jsd_posterior_posterior = JSD(probs_posterior_batch, probs_posterior_reordered_batch,
                                          log_probs_posterior_batch, log_probs_posterior_reordered_batch)
            jsd_dif = jsd_prior_posterior - jsd_posterior_posterior

            # Mask of non-padding label positions, so downstream averaging can correct
            # for the different sequence lengths
            labels = batch["input_ids"].cpu()[:, 1:].contiguous()  # skip <s> token
            label_mask = (labels != 1).float().repeat(n_exp, 1)  # pad token is int 1

            jsd_over_seq.append(jsd_dif)
            jsd_post_post.append(jsd_posterior_posterior)
            jsd_prior_post.append(jsd_prior_posterior)
            label_masks.append(label_mask)

            if batch_i == max_batches - 1:
                break

            # ------- END BATCH EXP ---------

        # -------- END ALL BATCHES ----------

    # Nothing was processed (e.g. empty validation loader): report as a failed run
    # instead of crashing on max() of an empty list below.
    if not jsd_over_seq:
        print(f"No validation batches processed for run: {run_name}")
        return None

    # Maximum sequence length, needed for padding
    max_len = max(t.shape[1] for t in jsd_over_seq)

    # pad all to have the same length and concatenate over batches
    results_jsd = {
        "jsd_over_seq": _pad_and_concat(jsd_over_seq, max_len),
        "jsd_post_post": _pad_and_concat(jsd_post_post, max_len),
        "jsd_prior_post": _pad_and_concat(jsd_prior_post, max_len),
        "label_masks": _pad_and_concat(label_masks, max_len)
    }

    return results_jsd


In [4]:
# Settings for the JSD-over-sequence computation; they are also part of the cache file name
N_EXP = 5        # repeated forward passes per mode and batch
MAX_BATCHES = 4  # number of validation batches to process per run
BS = 64          # batch size
DEVICE = "cuda:0"

# Runs for which no result could be produced
failed_runs = []
for exp_name, run_dir in RUN_DIRS.items():
    run_overview = read_overview_csv(exp_name=exp_name)
    
    for row_i, row in run_overview.iterrows():
        run_name, clean_name = row['run_name'], row['clean_name']
        
        # Skip runs that are flagged as still running
        if check_if_running(run_name, exp_name):
            continue
        
        print("*" * 50)
        print(clean_name)
        print(run_name)
        print("*" * 50)
        
        d = f"{RES_FILE_DIR}/{exp_name}/{run_name}"
        os.makedirs(d, exist_ok=True)
        RESULT_FILE = f"{d}/result_JSD_over_seq_N_EXP_{N_EXP}_MAX_BATCHES_{MAX_BATCHES}_BS_{BS}.pickle"
        print(RESULT_FILE)

        # If already ran, do not run again
        if os.path.exists(RESULT_FILE):
            print(f"Loading file {RESULT_FILE}, it existed.")
            # NOTE: pickle.load executes arbitrary code from the file; only load result
            # files that were written by this notebook.
            with open(RESULT_FILE, "rb") as result_fh:
                results_jsd = pickle.load(result_fh)

        else:
            # run_name, exp_name, max_batches, batch_size, dataset, n_exp, device="cuda:0"
            results_jsd = calc_JSD_over_seq(run_name=run_name, exp_name=exp_name, max_batches=MAX_BATCHES, batch_size=BS, dataset=row["dataset"], n_exp=N_EXP, device=DEVICE)

            # Only freshly computed results are written; a result that was just loaded
            # from RESULT_FILE is not rewritten onto itself.
            if results_jsd is not None:
                with open(RESULT_FILE, "wb") as result_fh:
                    pickle.dump(results_jsd, result_fh)
        
        if results_jsd is None:
            failed_runs.append(run_name)

print("X"*80)
print("Failed runs")
print("X"*80)
for r in failed_runs:
    print(r)
    

**************************************************
YELP | MDR-0.5 | matrix+mem | DROP 40
2021-05-31-YELP | MDR-0.5 | matrix-memory | DROP 40-run-06:30:27
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-31-YELP | MDR-0.5 | matrix-memory | DROP 40-run-06:30:27/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-31-YELP | MDR-0.5 | matrix-memory | DROP 40-run-06:30:27/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle, it existed.
**************************************************
PTB | CYC-FB-0.5 | matrix+mem
2021-05-24-PTB | CYC-FB-0.5 | matrix-memory-run-19:12:45
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-24-PTB | CYC-FB-0.5 | matrix-memory-run-19:12:45/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbark

**************************************************
PTB | CYC-FB-0.5 | mem+emb | DROP 40
2021-05-22-PTB | CYC-FB-0.5 | memory-embeddings | DROP 40-run-09:00:45
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-22-PTB | CYC-FB-0.5 | memory-embeddings | DROP 40-run-09:00:45/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-22-PTB | CYC-FB-0.5 | memory-embeddings | DROP 40-run-09:00:45/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle, it existed.
**************************************************
PTB | CYC-FB-0.5 | matrix+mem | DROP 40
2021-05-24-PTB | CYC-FB-0.5 | matrix-memory | DROP 40-run-19:30:07
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-24-PTB | CYC-FB-0.5 | matrix-memory | DROP 40-run-19:30:07/result_JSD_over_seq_N_EXP_5_MAX_

**************************************************
YELP | VAE | mem+emb
2021-05-23-YELP | VAE | memory-embeddings-run-07:33:28
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-23-YELP | VAE | memory-embeddings-run-07:33:28/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-23-YELP | VAE | memory-embeddings-run-07:33:28/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle, it existed.
**************************************************
YELP | VAE | mem+emb | DROP 40
2021-05-23-YELP | VAE | memory-embeddings | DROP 40-run-09:15:07
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-23-YELP | VAE | memory-embeddings | DROP 40-run-09:15:07/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-

**************************************************
PTB | VAE | matrix+mem | DROP 40
2021-05-24-PTB | VAE | matrix-memory | DROP 40-run-15:34:33
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-24-PTB | VAE | matrix-memory | DROP 40-run-15:34:33/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-24-PTB | VAE | matrix-memory | DROP 40-run-15:34:33/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle, it existed.
**************************************************
PTB | MDR-0.5 | matrix+mem | DROP 40
2021-05-26-PTB | MDR-0.5 | matrix-memory | DROP 40-run-09:00:48
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-05-26-PTB | MDR-0.5 | matrix-memory | DROP 40-run-09:00:48/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbar

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.alpha', 'roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_mo

Some weights of VAE_Encoder_RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 8416, epoch: 7
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
Batch:   1/4 - Exp:   1/5

  return torch.exp(-torch.tensor(kernel_input))


**************************************************
PTB | AE | emb
2021-06-02-PTB | AE | embeddings-run-03:33:15
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-06-02-PTB | AE | embeddings-run-03:33:15/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-06-02-PTB | AE | embeddings-run-03:33:15/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle, it existed.
**************************************************
YELP | AE | mem+emb
2021-06-02-YELP | AE | memory-embeddings-run-13:01:03
**************************************************
/home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-06-02-YELP | AE | memory-embeddings-run-13:01:03/result_JSD_over_seq_N_EXP_5_MAX_BATCHES_4_BS_64.pickle
Loading file /home/cbarkhof/code-thesis/NewsVAE/final-analysis/result-files/Runs/2021-06-02-YELP | AE | memory-embedding

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 5922, epoch: 17
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
YELP | AE | matrix
2021-06-05-YELP | AE | matrix-run-07:16:22
**************************************************
/home/cbarkhof/code-thesis

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.alpha', 'roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_mo

Some weights of VAE_Encoder_RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 100010, epoch: 9
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
YELP | AE | matrix+mem
2021-06-06-YELP | AE | matrix-memory-run-02:54:57
**************************************************
/home/cbarkhof

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.alpha', 'roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_mo

Some weights of VAE_Encoder_RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 100010, epoch: 9
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
YELP | AE | emb
2021-06-06-YELP | AE | embeddings-run-05:09:31
**************************************************
/home/cbarkhof/code-thes

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 50790, epoch: 9
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
YELP | AE | mem
2021-06-06-YELP | AE | memory-run-15:12:30
**************************************************
/home/cbarkhof/code-thesis/Ne

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 50790, epoch: 9
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
YELP | AE | mem+emb
2021-06-06-YELP | AE | memory-embeddings-run-15:14:38
**************************************************
/home/cbarkhof

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 50790, epoch: 9
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
PTB | MDR-0.5 | mem
2021-06-07-PTB | MDR-0.5 | memory-run-07:46:12
**************************************************
/home/cbarkhof/code-t

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 5922, epoch: 17
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
PTB | MDR-0.5 | mem | DROP 40
2021-06-07-PTB | MDR-0.5 | memory | DROP 40-run-14:54:06
**************************************************
/

Some weights of the model checkpoint at roberta-base were not used when initializing VaeDecoderRobertaForCausalLM: ['lm_head.bias']
- This IS expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VaeDecoderRobertaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VaeDecoderRobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.0.attention.self.query_module.layer.weight', 'roberta.encoder.layer.0.attention.self.query_module.layer.bias', 'roberta.encoder.layer.0.attention.self.key_module.layer.weight', 'roberta.encoder.layer.0.attention.self.k

Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Done loading model...
Checkpoint global_step: 6580, epoch: 19
Tying encoder decoder RoBERTa checkpoint weights!
<class 'modules.decoder_roberta.VaeDecoderRobertaModel'> and <class 'modules.encoder_roberta.VAE_Encoder_RobertaModel'> are not equal. In this case make sure that all encoder weights are correctly initialized. 
The following encoder weights were not tied to the decoder ['roberta/pooler']
Tying embedding spaces!
Setting to eval mode.
**************************************************
PTB | AE | matrix+mem
2021-06-07-PTB | AE | matrix-memory-run-22:34:56
**************************************************
/home/cbarkhof/co