# 1 PRELIMINARIES

## Imports

In [1]:
# IMPORTS
import torch
import numpy as np
import os
import pickle
import csv
from argparse import Namespace
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib as mpl
import time

from relevant_optimus_code.configuration_bert import BertConfig
from relevant_optimus_code.configuration_gpt2 import GPT2Config
from relevant_optimus_code.tokenization_bert import BertTokenizer
from relevant_optimus_code.tokenization_gpt2 import GPT2Tokenizer
from relevant_optimus_code.modeling_bert import BertForLatentConnector
from relevant_optimus_code.modeling_gpt2 import GPT2ForLatentConnector
from relevant_optimus_code.run_latent_generation import add_special_tokens_to_decoder, interpolate, latent_code_from_text, text_from_latent_code, top_k_top_p_filtering
from relevant_optimus_code.vae import VAE

## Globals

In [2]:
# GLOBALS
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer),
    'bert': (BertConfig, BertForLatentConnector, BertTokenizer)}
DEVICE = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"); print("DEVICE:", DEVICE)
SEED = 42; np.random.seed(SEED); torch.manual_seed(SEED);
ENCODER_MODEL_TYPE = 'bert'
ENCODER_MODEL_NAME = 'bert-base-cased'
DECODER_MODEL_TYPE = 'gpt2'
DECODER_MODEL_NAME = 'gpt2'
MAX_LEN_EX_SPECIAL = 510

GLOBAL_STEP = 31250
LATENT_SIZE = 768
DO_LOWERCASE = False

DEVICE: cpu


## Paths

In [3]:
# PATHS
PREFIX_PATH = '/Users/claartje/Dropbox (Persoonlijk)/Studie/Master AI/Thesis/Thesis_MSc_AI/Experimentation/Optimus/'
DATA_DIR = 'claartje/data/'
DATA_FILES = {'train':'train.txt', 'valid':'valid.txt', 'test':'test.txt'}
CHECKPOINT_DIRS = {'B{}'.format(v):'{}output/LM/Snli/768/philly_vae_snli_b{}_d5_r00.5_ra0.25_length_weighted/checkpoint-31250/'.format(PREFIX_PATH, v) for v in [0.0, 0.5, 1.0]}
OUTPUT_ENCODER_DIR = {'B{}'.format(v):CHECKPOINT_DIRS['B{}'.format(v)]+'checkpoint-encoder-{}'.format(GLOBAL_STEP) for v in [0.0, 0.5, 1.0]}
OUTPUT_DECODER_DIR = {'B{}'.format(v):CHECKPOINT_DIRS['B{}'.format(v)]+'checkpoint-decoder-{}'.format(GLOBAL_STEP) for v in [0.0, 0.5, 1.0]}
OUTPUT_FULL_DIR = {'B{}'.format(v):CHECKPOINT_DIRS['B{}'.format(v)]+'checkpoint-full-{}'.format(GLOBAL_STEP) for v in [0.0, 0.5, 1.0]}

# 2 INITIALISATION

## Encoder Model (BERT)

## Functions to initialise models

In [4]:
def get_model_tokenizer_encoder(beta_name="B1.0"):
    # Load a trained Encoder model and vocabulary that you have fine-tuned
    encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[ENCODER_MODEL_TYPE]
    # print("Loading:", OUTPUT_ENCODER_DIR[beta_name])
    model_encoder = encoder_model_class.from_pretrained(OUTPUT_ENCODER_DIR[beta_name], latent_size=LATENT_SIZE)
    tokenizer_encoder = encoder_tokenizer_class.from_pretrained(ENCODER_MODEL_NAME, do_lower_case=DO_LOWERCASE)
    return model_encoder, tokenizer_encoder

def get_model_tokenizer_decoder(beta_name="B1.0"):
    # Load a trained Decoder model and vocabulary that you have fine-tuned
    decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[DECODER_MODEL_TYPE]
    # print("Loading: ", OUTPUT_DECODER_DIR[beta_name])
    model_decoder = decoder_model_class.from_pretrained(OUTPUT_DECODER_DIR[beta_name], latent_size=LATENT_SIZE)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(DECODER_MODEL_NAME, do_lower_case=DO_LOWERCASE)
    model_decoder, tokenizer_decoder = add_special_tokens_to_decoder(model_decoder, tokenizer_decoder)
    return model_decoder, tokenizer_decoder

def get_model_vae(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, beta_name="B1.0"):
    checkpoint_full = torch.load(os.path.join(OUTPUT_FULL_DIR[beta_name], 'training.bin'), map_location=torch.device(DEVICE))
    # print("Loading: ", OUTPUT_FULL_DIR[beta_name])
    args = {'latent_size':LATENT_SIZE, 'device':DEVICE}
    model_vae = VAE(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, Namespace(**args))
    model_vae.load_state_dict(checkpoint_full['model_state_dict'])
    return model_vae

## Init models for different beta values: 0.0, 0.5 and 1.0

In [5]:
decoder_models, encoder_models, vae_models = {}, {}, {}

for beta in [0.0, 0.5, 1.0]:
    beta_name = "B{}".format(beta)
    
    model_encoder, tokenizer_encoder = get_model_tokenizer_encoder(beta_name=beta_name)
    model_decoder, tokenizer_decoder = get_model_tokenizer_decoder(beta_name=beta_name)
    
    decoder_models[beta_name] = model_decoder
    encoder_models[beta_name] = model_encoder
    
    vae_models[beta_name] = get_model_vae(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, beta_name=beta_name)

CONFIG: {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

CONFIG: {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "latent_size": 768,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation":

## Number of parameters VAE models

In [6]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

standard_bert = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')
standard_gpt2 = torch.hub.load('huggingface/pytorch-transformers', 'model', 'gpt2')

print("Number of parameters in the full VAE:\t\t\t", get_n_params(vae_models["B1.0"]))
print("Number of parameters in standard BERT + GPT2:\t\t", get_n_params(standard_gpt2) +  get_n_params(standard_bert))

print("\nNumber of parameters in the decoder (GPT2):\t\t", get_n_params(decoder_models["B1.0"]))
print("Number of parameters in a standard decoder (GPT2):\t", get_n_params(standard_gpt2))
print("\nNumber of parameters in the encoder (BERT):\t\t", get_n_params(encoder_models["B1.0"]))
print("Number of parameters in a standard encoder (BERT):\t", get_n_params(standard_bert))

del standard_bert
del standard_gpt2

Using cache found in /Users/claartje/.cache/torch/hub/huggingface_pytorch-transformers_master
Using cache found in /Users/claartje/.cache/torch/hub/huggingface_pytorch-transformers_master
Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters in the full VAE:			 241599744
Number of parameters in standard BERT + GPT2:		 233922048

Number of parameters in the decoder (GPT2):		 132109824
Number of parameters in a standard decoder (GPT2):	 124439808

Number of parameters in the encoder (BERT):		 109489920
Number of parameters in a standard encoder (BERT):	 109482240


## Interpolate with different beta value models

In [7]:
def interpolate_different_beta(source_sentence, target_sentence, 
                               vae_models, tokenizer_encoder, tokenizer_decoder,
                               num_interpolation_steps=9, temperature=1.0, top_k=0, top_p=1.0):
    for beta in [0.0, 0.5, 1.0]:
        beta_name = "B{}".format(beta)
        print('*' * 30)
        print("Interpolation for BETA: {}".format(beta))
        print('-')
        print("0", source_sentence)
        result = interpolate(vae_models[beta_name], tokenizer_encoder, tokenizer_decoder, source_sentence, target_sentence, DEVICE, num_interpolation_steps, top_k, top_p, temperature)
        print()

sentence_pairs = [["I am very happy", "He is terribly sad"], ["The man walks up the stairs to the woman", "Three women greet a man that walks on the street"]]

for p in sentence_pairs:
    print(p[0], '-->', p[1])
    interpolate_different_beta(p[0], p[1], vae_models, tokenizer_encoder, tokenizer_decoder)

I am very happy --> He is terribly sad
******************************
Interpolation for BETA: 0.0
-
0 I am very happy
1 there is very happy
2 it is very happy
3 there is very happy person
4 it is very happy
5 there is very happy man
6 he is very sad
7 he is very unhappy
8 he is very unhappy
9 he is terribly sad
10 he is terribly sad
11 he is terribly sad

******************************
Interpolation for BETA: 0.5
-
0 I am very happy
1 there is very happy person
2 there is very happy woman
3 it is very happy
4 it is very happy
5 it is very sad
6 he is very happy
7 he is very unhappy
8 he is very sad
9 he is very sad
10 he is terribly sad
11 he is terribly sad

******************************
Interpolation for BETA: 1.0
-
0 I am very happy
1 there is very happy person
2 there is very happy woman
3 this is very happy
4 it is very happy
5 it is very happy
6 it is quite sad
7 he is very sad
8 he is very unhappy
9 he is terribly sad
10 he is very sad
11 he is terribly sad

The man walks up th

## Auto encode some samples

In [14]:
def reconstruct_text(input_text, tokenizer_encoder, tokenizer_decoder, vae_models, beta_name, verbose=False):
    s = time.time()

    words_random_text = input_text.split(' ')

    latent_z, _ = latent_code_from_text(" ".join(words_random_text[:318]), tokenizer_encoder, vae_models[beta_name], DEVICE)
    reconstructed_text = text_from_latent_code(latent_z, vae_models[beta_name], DEVICE, 0, 1.0, 1.0, tokenizer_decoder)
    if verbose:
        print("Input text:\n\n", input_text)
        print("\n\nLength of the text:", len(words_random_text))
        print("\n\nReconstruction of the text:\n\n", reconstructed_text)
        print("\nTook {:.2f} seconds to reconstruct the input.".format(time.time() - s))
    return latent_z, reconstructed_text

## Longer text (325 words, 510 tokens)

In [15]:
random_text = '''The epic, traditionally ascribed to the Maharishi Valmiki, narrates the life of Rama, prince of the legendary kingdom of Kosala. The story follows his fourteen-year exile to the forest urged by his father King Dasharatha, on the request of Rama's stepmother Kaikeyi; his travels across forests in India with his wife Sita and brother Lakshmana, the kidnapping of Sita by Ravana --the evil king of Lanka, that resulted in war (against evil); and Rama's eventual return to Ayodhya to be crowned king amidst jubilation and celebration. This is the culmination point of the epic. It is considered a sacred book, and is read by millions of people every year.
There have been many attempts to unravel the epic's historical growth and compositional layers; various recent scholars' estimates for the earliest stage of the text range from the 7th to 4th centuries BCE,[5] with later stages extending up to the 3rd century CE.[6]
The Ramayana is one of the largest ancient epics in world literature. It consists of nearly 24,000 verses (mostly set in the Shloka/Anustubh meter), divided into five kāṇḍas: the ayodhyakāṇḍa, the araṇyakāṇḍa, the kiṣkindakāṇḍa, the sundarākāṇḍa, and the laṅkākāṇḍa. and about 500 sargas (chapters).The uttarākāṇḍa,the bālakāṇḍa, although frequently counted among the main ones, is not a part of the original epic. Though Balakanda sometimes considered in the main epic, but according to many, Uttara Kanda is a later interpolation and thus it's not attributed to the work of Maharshi Valmiki.[7] In Hindu tradition, the Ramayana is considered to be the Adi-kavya (first poem). It depicts the duties of relationships, portraying ideal characters like the ideal father, the ideal servant, the ideal brother, the ideal husband and the ideal king. The Ramayana was an important influence on later Sanskrit poetry and Hindu life and culture. Its most important moral influence was the importance of virtue, in the life of a citizen and in the ideals of the formation of a state or of a functioning society.'''

_, _ = reconstruct_text(random_text, tokenizer_encoder, tokenizer_decoder, vae_models, "B1.0", verbose=True)

torch.Size([1, 768])
Input text:

 The epic, traditionally ascribed to the Maharishi Valmiki, narrates the life of Rama, prince of the legendary kingdom of Kosala. The story follows his fourteen-year exile to the forest urged by his father King Dasharatha, on the request of Rama's stepmother Kaikeyi; his travels across forests in India with his wife Sita and brother Lakshmana, the kidnapping of Sita by Ravana --the evil king of Lanka, that resulted in war (against evil); and Rama's eventual return to Ayodhya to be crowned king amidst jubilation and celebration. This is the culmination point of the epic. It is considered a sacred book, and is read by millions of people every year.
There have been many attempts to unravel the epic's historical growth and compositional layers; various recent scholars' estimates for the earliest stage of the text range from the 7th to 4th centuries BCE,[5] with later stages extending up to the 3rd century CE.[6]
The Ramayana is one of the largest ancient e

In [10]:
random_text_2 = '''We investigate whether the considered unsupervised disentanglement approaches are effective at enforcing a factorizing and thus uncorrelated aggregated posterior. For each trained model, we sample 10 000 images and compute a sample from the corresponding approximate posterior. We then fit a multivariate Gaussian distribution over these 10 000 samples by computing the empirical mean and covariance matrix. Finally, we compute the total correlation of the fitted Gaussian and report the median value for each data set, method and hyperparameter value.'''

_, _ = reconstruct_text(random_text_2, tokenizer_encoder, tokenizer_decoder, vae_models, "B1.0", verbose=True)

Input text:

 We investigate whether the considered unsupervised disentanglement approaches are effective at enforcing a factorizing and thus uncorrelated aggregated posterior. For each trained model, we sample 10 000 images and compute a sample from the corresponding approximate posterior. We then fit a multivariate Gaussian distribution over these 10 000 samples by computing the empirical mean and covariance matrix. Finally, we compute the total correlation of the fitted Gaussian and report the median value for each data set, method and hyperparameter value.


Length of the text: 80


Reconstruction of the text:

 this test examines the different techniques and methodologies vying at results from a eton future branded plugging objects, fully incline what most london, the mic, being held to enter the rounded pound for <unk>.

Took 5.47 seconds to reconstruct the input.


In [11]:
random_text_3 = '''We investigate whether the considered unsupervised disentanglement approaches are effective at enforcing a factorizing and thus uncorrelated aggregated posterior .'''

_, _ = reconstruct_text(random_text_3, tokenizer_encoder, tokenizer_decoder, vae_models, "B1.0", verbose=True)

Input text:

 We investigate whether the considered unsupervised disentanglement approaches are effective at enforcing a factorizing and thus uncorrelated aggregated posterior .


Length of the text: 20


Reconstruction of the text:

 the evaluation of these substances is conforming to differing <unk> <unk> instead of a attentive restrained cautionary call.

Took 2.39 seconds to reconstruct the input.


In [12]:
random_text_4 = '''This is an apple .'''

_, _ = reconstruct_text(random_text_4, tokenizer_encoder, tokenizer_decoder, vae_models, "B0.0", verbose=True);print('-'*50)
_, _ = reconstruct_text(random_text_4, tokenizer_encoder, tokenizer_decoder, vae_models, "B0.5", verbose=True);print('-'*50)
_, _ = reconstruct_text(random_text_4, tokenizer_encoder, tokenizer_decoder, vae_models, "B1.0", verbose=True)

Input text:

 This is an apple .


Length of the text: 5


Reconstruction of the text:

 this is an apple.

Took 0.55 seconds to reconstruct the input.
--------------------------------------------------
Input text:

 This is an apple .


Length of the text: 5


Reconstruction of the text:

 this is an apple.

Took 0.55 seconds to reconstruct the input.
--------------------------------------------------
Input text:

 This is an apple .


Length of the text: 5


Reconstruction of the text:

 this is an apple.

Took 0.48 seconds to reconstruct the input.


## Encode a set of sentences

In [None]:
import torch.nn.functional as F

def sentence_from_tok_ids(tok_ids_tensor):
    text = tokenizer_decoder.decode(tok_ids_tensor.tolist(), clean_up_tokenization_spaces=True)
    text = text.split()[1:-1]
    text = ' '.join(text)
    return text

def text_from_latent_code_batch(latents, tokenizer_decoder, model_vae, 
                                top_p=1.0, top_k=0, temperature=1.0, 
                                batch_size=300, max_sentence_length=4):
    N_batches = int(np.ceil(len(sentences) / batch_size))
    
    decoded = []
 
    for batch_i, batch_latents in enumerate(torch.chunk(latents, N_batches, dim=0)):
        print("DECODE - Batch {:03d}".format(batch_i))
        generated_so_far = torch.tensor(tokenizer_decoder.added_tokens_encoder['<BOS>'], dtype=torch.long, device=DEVICE).unsqueeze(0).repeat(batch_latents.shape[0], 1)
        for i in range(max_sentence_length):
            print('word', i, end='\r')
            outputs = model_vae.decoder(generated_so_far, past=batch_latents)
            next_token_logits = outputs[0][:, 0, :]
            next_token_probs = F.softmax(next_token_logits, dim=1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            generated_so_far = torch.cat((generated_so_far, next_token), dim=1)
        print("end generation batch shape:", generated_so_far.shape)
        decoded.append(generated_so_far)
    return torch.cat(decoded)

def tokenize_pad_batch(sentences):
    tokenized_sentences = []
    max_len = 0
    
    for i, s in enumerate(sentences):
        tokenized1 =  [101] + tokenizer_encoder.encode(s) + [102]
        if len(tokenized1) > max_len:
            max_len = len(tokenized1)
        tokenized_sentences.append(tokenized1)

    padded_tokenized_sentences = []
    for ts in tokenized_sentences:
        padding = [tokenizer_encoder.vocab['[PAD]']] * (max_len - len(ts))
        padded_tokenized_sentences.append(ts + padding)

    return torch.tensor(padded_tokenized_sentences), tokenized_sentences, max_len

def latent_code_from_text_batch(pad_tok_sentences, tokenizer_encoder, model_vae, batch_size=500):
    N_batches = np.ceil(pad_tok_sentences.shape[0] / batch_size)
    print("{} sentences in total, with max batch size of {} gives {} batches".format(len(sentences), batch_size, N_batches))
    
    latents = []
    for batch_i, batch in enumerate(torch.chunk(pad_tok_sentences, int(N_batches), dim=0)):
        print("ENCODE - Batch {:03d}".format(batch_i), end='\r')
        
        coded1 = torch.Tensor.long(batch)
        with torch.no_grad():
            x0 = coded1
            x0 = x0.to(DEVICE)
            _, pooled_hidden_fea = model_vae.encoder(x0, attention_mask=(x0 > 0).float())  #
            mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1)
            latent_z = mean.squeeze(1)
            latents.append(latent_z)
    
    return torch.cat(latents)

## Load input sequences

In [None]:
max_N_sentences = True
N_sentences = 200
sentences_txt_file = 'sample_text.txt'

sentences = []
with open(sentences_txt_file, 'r') as fd:
    reader = csv.reader(fd, delimiter='\t')
    for i, row in enumerate(reader):
        sentences.append(row[1])
        sentences.append(row[2])
        if max_N_sentences:
            if (len(sentences)>= N_sentences):
                break

padded_tokenised_sentences, tokenised_sentences, max_len_sentences = tokenize_pad_batch(sentences)
print("Maximum sentence length:", max_len_sentences)
print("Padded tokenised sentences block shape:", padded_tokenised_sentences.shape)

In [9]:
process_sentences = True            

if process_sentences:
        
    latents = {"B1.0":[], "B0.5":[], "B0.0":[]}
    decoded_padded = {"B1.0":[], "B0.5":[], "B0.0":[]}

    for beta in [0.0, 0.5, 1.0]:
        print("*"*100)
        beta_name = "B{}".format(beta)
        
        print(beta_name)
        print("-"*100)

        z = latent_code_from_text_batch(padded_tokenised_sentences, tokenizer_encoder, vae_models[beta_name], batch_size=500)
        latents[beta_name] = z
                
        dec = text_from_latent_code_batch(z, tokenizer_decoder, vae_models[beta_name], batch_size=500, 
                                                     max_sentence_length=10)
        decoded_padded[beta_name] = dec
        
#     with open('latents.pickle', 'wb') as handle:
#         pickle.dump(latents, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
# else:
#     with open('latents.pickle', 'rb') as handle:
#         latents = pickle.load(handle)

****************************************************************************************************
B0.0
----------------------------------------------------------------------------------------------------
402 sentences in total, with max batch size of 500 gives 1.0 batches
DECODE - Batch 000
end generation batch shape: torch.Size([402, 11])
****************************************************************************************************
B0.5
----------------------------------------------------------------------------------------------------
402 sentences in total, with max batch size of 500 gives 1.0 batches
DECODE - Batch 000
end generation batch shape: torch.Size([402, 11])
****************************************************************************************************
B1.0
----------------------------------------------------------------------------------------------------
402 sentences in total, with max batch size of 500 gives 1.0 batches
DECODE - Batch 000
end generation 

In [23]:
reconstructed_text = {"B1.0":[], "B0.5":[], "B0.0":[]}

for beta in [0.0, 0.5, 1.0]:
    print("*"*100)
    beta_name = "B{}".format(beta)
    for i, pad_dec_sequence in enumerate(decoded_padded[beta_name]):
        print("------ {} ------".format(i))
        text = sentence_from_tok_ids(pad_dec_sequence)
        print("Original sentence:", sentences[i])
#         print("Batch reconstructed sentence:", text)
        z = latents[beta_name][i, :].unsqueeze(0)
        t = text_from_latent_code(z, vae_models[beta_name], DEVICE, 0, 1.0, 1.0, tokenizer_decoder)
        print("Reconstructed sentence:", t)
        _, t2 = reconstruct_text(sentences[i], tokenizer_encoder, tokenizer_decoder, 
                                                            vae_models, beta_name, verbose=False)
        print("Reconstructed sentence 2:", t2)
        if i == 10:
            break
    break

****************************************************************************************************
------ 0 ------
Original sentence: Tasting it is the only reliable way.
Reconstructed sentence: it is locking the so called best weather.
torch.Size([1, 768])
Reconstructed sentence 2: it is adjusting the only wiener safe.
------ 1 ------
Original sentence: The way you have it is fine.
Reconstructed sentence: the way it is made of sand.
torch.Size([1, 768])
Reconstructed sentence 2: the way that you have is fine.
------ 2 ------
Original sentence: I think it probably depends on your money.
Reconstructed sentence: it thinks that it should cost nothing.
torch.Size([1, 768])
Reconstructed sentence 2: it thinks it might cost me one penny.
------ 3 ------
Original sentence: It depends on your country.
Reconstructed sentence: it chooses itself on the country.
torch.Size([1, 768])
Reconstructed sentence 2: it chooses on itself.
------ 4 ------
Original sentence: You need to read a lot to know 

In [38]:
reconstruct_text()
    
text_from_latent_code_batch(latents["B1.0"], tokenizer_decoder, vae_models["B1.0"])

<BOS> bu it it it


## Make t-SNE plots for different Beta

In [None]:
mpl.rcParams['figure.figsize'] = 15, 10

models = {beta_name: TSNE(n_components=2, random_state=0) for beta_name in ["B1.0", "B0.5", "B0.0"]}
data_proj = {beta_name:None for beta_name in ["B1.0", "B0.5", "B0.0"]}
for beta_name, model in models.items():
    data = np.asarray(latents[beta_name])
    data_proj[beta_name] = model.fit_transform(data)
    
colors = ['y', 'g', 'b']
for i, (beta_name, data) in enumerate(data_proj.items()):
    plt.scatter(data[:,0] , data[:,1], color=colors[i], label=beta_name, alpha=0.4)

plt.title("t-SNE plot of encoded sentences by VAEs with different Beta values")
plt.legend()
plt.show()