# T5-VAE

Here you can try using a T5-VAE trained on Python state changes.

It tries to learn a smooth latent space of Python assignments.

In [None]:
!gsutil cp -r gs://fras/python_assignments_autoencoder .
!git clone https://github.com/Fraser-Greenlee/T5-VAE.git
!pip install transformers==3.0.2 wandb tqdm

In [None]:
import os

# Enter the cloned repository (presumably so that `t5_vae` can be imported from
# the working directory — confirm). The guard makes the cell safe to re-run:
# after the first run the working directory is already T5-VAE, and the relative
# path 'T5-VAE/' would no longer exist from there.
if os.path.basename(os.getcwd()) != 'T5-VAE':
    os.chdir('T5-VAE/')

In [None]:
import t5_vae

def args_list_from_txt(args_txt):
    """Flatten the text of an args file into a flat list of argument tokens.

    Tokens are separated by any run of whitespace (spaces, tabs, newlines), so
    repeated spaces, blank lines and leading/trailing whitespace never produce
    empty-string tokens. Quotes are not interpreted; a quoted value containing
    whitespace is still split into several tokens.

    Args:
        args_txt: Full contents of the args file as a single string.

    Returns:
        The non-empty tokens in file order; an empty list for blank input.
    """
    # str.split() without a separator collapses whitespace runs and drops empty
    # fields, unlike split(' ') which emits '' for every extra space or blank line.
    return args_txt.split()

def load_t5_vae_from_path(t5_vae_path):
    """Load a T5-VAE using the arguments saved in `<t5_vae_path>/args.txt`.

    Args:
        t5_vae_path: Directory that contains an `args.txt` file.

    Returns:
        The result of `t5_vae.load_t5_vae_from_args` called with the tokens
        read from that file.
    """
    args_file = os.path.join(t5_vae_path, 'args.txt')
    with open(args_file, 'r') as handle:
        cli_tokens = args_list_from_txt(handle.read())
    return t5_vae.load_t5_vae_from_args(cli_tokens)

# Build the model from the args file stored in the autoencoder directory.
# NOTE(review): this path is relative to the current working directory, which an
# earlier cell changes to 'T5-VAE/', while the download cell copies the
# directory into the directory the notebook started in — confirm it resolves.
model = load_t5_vae_from_path('python_assignments_autoencoder')



In [None]:
# Inference only: switch off gradient tracking for every parameter, then move
# the model to the GPU and put it in eval mode.
for weight in model.parameters():
    weight.requires_grad = False
model = model.to('cuda')
model = model.eval()

In [None]:
import torch

def str_to_input_ids(model, input_str):
    """Tokenize `input_str` and return it as padded input ids for `model`.

    The id tensor is created on the device that holds the model's parameters
    instead of a hard-coded 'cuda', so the helper keeps working when the model
    has been moved to another device (e.g. CPU).

    Args:
        model: Object exposing `tokenizer.encode`, `pad_input_ids` and
            `parameters()`.
        input_str: Text to encode.

    Returns:
        Whatever `model.pad_input_ids` returns for the encoded id tensor.
    """
    first_param = next(iter(model.parameters()), None)
    # A model without parameters gives no device hint; keep the previous default.
    device = first_param.device if first_param is not None else 'cuda'
    token_ids = model.tokenizer.encode(input_str)
    return model.pad_input_ids(torch.tensor(token_ids, device=device))

def logits_to_str(model, logits):
    """Decode the highest-scoring token ids of `logits[0]` into a string.

    Args:
        model: Object exposing `tokenizer.decode`.
        logits: Indexable whose first element is a tensor of scores; only
            element 0 is used (presumably the first item of a batch — confirm).

    Returns:
        The result of `model.tokenizer.decode` on the flattened top-1 indices.
    """
    first_item = logits[0]
    # k=1 keeps only the best-scoring index along the last dimension.
    best = torch.topk(first_item, 1)
    token_ids = best.indices.view(-1)
    return model.tokenizer.decode(token_ids)

In [None]:
# Recreate the input string: turn INPUT_STR into input ids, ask the model for
# greedy logits, and convert them back to text.
# The last expression is displayed as the cell output; a faithful
# reconstruction shows the same text as INPUT_STR.
INPUT_STR = 'x = 100;'
input_ids = str_to_input_ids(model, INPUT_STR)
logits = model.greedy_logits(input_ids=input_ids)
logits_to_str(model, logits)

'x = 100;'

In [None]:
# Walk in a straight line through latent space between two assignments,
# decoding the latent at each of the 11 evenly spaced ratios from 0.0 to 1.0.
latent1 = model.get_latent(str_to_input_ids(model, 'x = a - 1;'))
latent2 = model.get_latent(str_to_input_ids(model, 'x = a + 10 * 2;'))

latent_start = latent1
latent_diff = latent2 - latent1

for i in range(0, 11):
    ratio = i / 10
    # ratio 0.0 gives latent1; ratio 1.0 gives latent1 + (latent2 - latent1).
    latent = latent_start + latent_diff * ratio
    logits = model.greedy_logits(latent=latent)
    decoded = logits_to_str(model, logits)
    print(ratio, decoded)

0.0 x = a - 1;
0.1 x = a - 1;
0.2 x = a - 1;
0.3 x = a - 1;
0.4 x = a + 1;
0.5 x = a + 2;
0.6 x = a + 2;
0.7 x = a + 2 * 2;
0.8 x = a + 10 * 2;
0.9 x = a + 10 * 2;
1.0 x = a + 10 * 2;


Above you can see that all intermediate values are valid Python assignments.

In [None]:
# Reconstruct 'x = a + N;' for N = 0, 4, ..., 48 and print each input next to
# the model's reconstruction so mismatches are easy to spot.
input_temp = 'x = a + {0};'
for i in range(0, 50, 4):
    input_str = input_temp.format(i)
    input_ids = str_to_input_ids(model, input_str)
    logits = model.greedy_logits(input_ids=input_ids)
    reconstructed = logits_to_str(model, logits)
    print(input_str, reconstructed)

x = a + 0; x = a + 0;
x = a + 4; x = a + 4;
x = a + 8; x = a + 8;
x = a + 12; x = a + 12;
x = a + 16; x = a + 16;
x = a + 20; x = a + count;
x = a + 24; x = a + 100;
x = a + 28; x = a + mult;
x = a + 32; x = a + 32;
x = a + 36; x = a + 12;
x = a + 40; x = a + height;
x = a + 44; x = a + bit;
x = a + 48; x = a + bit;


When applied to a range of values, you can see that the model has holes; this is likely due to the training dataset not providing a dense enough sampling.

In [None]:
# sample from random latent values
SAMPLE_SEED = 0
# Seed torch's RNG so that re-running this cell draws the same latents and the
# printed samples are reproducible.
torch.manual_seed(SAMPLE_SEED)
with torch.no_grad():
    for _ in range(10):
        # Latent of shape (1, 1000) drawn from a standard normal, then moved to
        # the GPU. NOTE(review): 1000 is presumably the model's latent size — confirm.
        logits = model.greedy_logits(latent=torch.randn(1, 1000).to('cuda'))
        print(logits_to_str(model, logits))

er = int(h[3] * 0);
l.append([False[j] * d);
y = '[0 '] = 1;
x = int(h[-1] * 0);
l.append( = 0 + str(x[0 / 1]);
x.append(a[da] * 0);
x =''[0 - 1:0];
x.append(x.pop(  + 1) ** 0);
f = int(h[i].pop() + 1);
x = int(h[-1 - 1]);
