In [1]:
# Load IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# %reload_ext autoreload  (alternative to the line above when the extension is already loaded)
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2


In [2]:
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter

from methylVA.mnist.model import VAE
from methylVA.mnist.training import train, test


# --- Hyperparameters --------------------------------------------------------
# NOTE(review): batch_size is defined here but is not passed to
# get_data_loaders() later in the notebook; the logged 12,800 samples per
# 100 steps suggest the loader default is also 128 -- confirm.
batch_size = 128
learning_rate = 1e-3
weight_decay = 1e-2
num_epochs = 50
latent_dim = 2
hidden_dim = 512
seed = 42

# Derive the run name from latent_dim so the log directory cannot drift out
# of sync with the model that is actually built (evaluates to
# 'VAE_MNIST_latent_2' for latent_dim = 2, same as before).
name = f'VAE_MNIST_latent_{latent_dim}'

# Seed torch's global RNG before the model is constructed so that weight
# initialisation (and anything else drawing from the global generator) is
# repeatable under Restart Kernel -> Run All.
torch.manual_seed(seed)

# --- Model and optimizer ----------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# input_dim=784: presumably a flattened 28x28 MNIST image -- confirm against VAE.
model = VAE(input_dim=784, latent_dim=latent_dim, hidden_dim=hidden_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# --- TensorBoard writers ----------------------------------------------------
# Take the timestamp once: calling datetime.now() separately for each writer
# can straddle a second boundary and give the train and test logs of the same
# run different directory names.
run_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
writer_train = SummaryWriter(f'../experiments/{name}/train/{run_timestamp}')
writer_test = SummaryWriter(f'../experiments/{name}/test/{run_timestamp}')

In [3]:
from methylVA.mnist.dataset import get_data_loaders

In [4]:
# Build the train/test data loaders using get_data_loaders()'s defaults.
# NOTE(review): the `batch_size` set in the config cell is not passed here.
# The training log (12,800 samples per 100 steps) suggests the default batch
# size is also 128, but the two values are not tied together -- confirm, and
# consider passing it explicitly if get_data_loaders() accepts it.
train_loader, test_loader = get_data_loaders()

In [5]:
# Debug aid: print the kernel's working directory, which the relative
# '../experiments/...' TensorBoard log paths are resolved against.
# NOTE(review): the saved output exposes an absolute filesystem path;
# consider clearing it (or removing this cell) before sharing the notebook.
!pwd

/fast/AG_Ohler/ekarimi/projects/methylVA/notebooks


In [6]:
from methylVA.mnist.training import train, test


# Train for `num_epochs` epochs, evaluating on the test loader after each one.
#
# `prev_updates` carries the running optimizer-step count across epochs:
# train() returns the updated count, and the same value is handed to test()
# (presumably as the global step for its TensorBoard scalars -- confirm).
#
# NOTE: re-running this cell resets the step counter to 0 while `model` and
# `optimizer` keep their trained state from the previous run; re-run the
# config cell first for a clean experiment.
prev_updates = 0
try:
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        prev_updates = train(model, train_loader, optimizer, prev_updates, writer=writer_train)
        test(model, test_loader, prev_updates, writer=writer_test)
finally:
    # Push any buffered events to disk even if training is interrupted, so
    # TensorBoard shows everything logged up to that point. flush() (rather
    # than close()) keeps the writers usable if the cell is run again.
    writer_train.flush()
    writer_test.flush()

Epoch 1/50


  1%|          | 3/469 [00:00<01:21,  5.71it/s]

Step 0, (N samples: 0), Loss: 543.4832, (Recon: 543.2744, KL: 0.2088), Gradient norm: 13.2071


 22%|██▏       | 103/469 [00:05<00:19, 18.75it/s]

Step 100, (N samples: 12,800), Loss: 195.0385, (Recon: 193.1852, KL: 1.8533), Gradient norm: 13.2217


 43%|████▎     | 203/469 [00:11<00:14, 17.87it/s]

Step 200, (N samples: 25,600), Loss: 181.5504, (Recon: 179.2684, KL: 2.2820), Gradient norm: 32.2169


 65%|██████▍   | 304/469 [00:16<00:08, 18.84it/s]

Step 300, (N samples: 38,400), Loss: 184.8153, (Recon: 182.3679, KL: 2.4474), Gradient norm: 31.8119


 86%|████████▌ | 404/469 [00:22<00:03, 18.46it/s]

Step 400, (N samples: 51,200), Loss: 173.7290, (Recon: 170.0140, KL: 3.7151), Gradient norm: 17.1039


100%|██████████| 469/469 [00:25<00:00, 18.38it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.94it/s]


====> Test set loss: 170.6518, (BCE: 166.4511, KLD: 4.2007)
Epoch 2/50


  7%|▋         | 34/469 [00:01<00:23, 18.68it/s]

Step 500, (N samples: 64,000), Loss: 166.9162, (Recon: 162.5396, KL: 4.3766), Gradient norm: 52.5666


 29%|██▊       | 134/469 [00:07<00:18, 18.40it/s]

Step 600, (N samples: 76,800), Loss: 154.7522, (Recon: 150.1304, KL: 4.6218), Gradient norm: 48.0190


 50%|████▉     | 234/469 [00:12<00:12, 18.34it/s]

Step 700, (N samples: 89,600), Loss: 160.2309, (Recon: 155.1043, KL: 5.1266), Gradient norm: 53.1006


 71%|███████   | 334/469 [00:17<00:07, 17.71it/s]

Step 800, (N samples: 102,400), Loss: 154.9231, (Recon: 149.5787, KL: 5.3444), Gradient norm: 94.5810


 93%|█████████▎| 434/469 [00:23<00:01, 18.46it/s]

Step 900, (N samples: 115,200), Loss: 161.9726, (Recon: 156.6470, KL: 5.3256), Gradient norm: 55.8716


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.85it/s]


====> Test set loss: 156.4627, (BCE: 151.1218, KLD: 5.3409)
Epoch 3/50


 14%|█▍        | 66/469 [00:03<00:21, 18.67it/s]

Step 1,000, (N samples: 128,000), Loss: 146.0287, (Recon: 140.6927, KL: 5.3360), Gradient norm: 67.1809


 35%|███▌      | 166/469 [00:08<00:16, 18.78it/s]

Step 1,100, (N samples: 140,800), Loss: 158.3703, (Recon: 152.9569, KL: 5.4134), Gradient norm: 107.8976


 57%|█████▋    | 265/469 [00:14<00:10, 18.60it/s]

Step 1,200, (N samples: 153,600), Loss: 156.6439, (Recon: 151.1072, KL: 5.5366), Gradient norm: 53.0795


 78%|███████▊  | 366/469 [00:19<00:05, 18.68it/s]

Step 1,300, (N samples: 166,400), Loss: 151.8456, (Recon: 146.2597, KL: 5.5859), Gradient norm: 81.3753


 99%|█████████▉| 466/469 [00:24<00:00, 18.73it/s]

Step 1,400, (N samples: 179,200), Loss: 158.4634, (Recon: 152.8148, KL: 5.6486), Gradient norm: 64.4527


100%|██████████| 469/469 [00:25<00:00, 18.70it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.56it/s]


====> Test set loss: 152.5168, (BCE: 146.8671, KLD: 5.6497)
Epoch 4/50


 21%|██        | 97/469 [00:05<00:20, 18.58it/s]

Step 1,500, (N samples: 192,000), Loss: 147.7212, (Recon: 141.8629, KL: 5.8583), Gradient norm: 65.9357


 42%|████▏     | 197/469 [00:10<00:14, 18.37it/s]

Step 1,600, (N samples: 204,800), Loss: 151.9635, (Recon: 146.2523, KL: 5.7111), Gradient norm: 107.5260


 63%|██████▎   | 296/469 [00:15<00:09, 18.86it/s]

Step 1,700, (N samples: 217,600), Loss: 148.6023, (Recon: 142.8037, KL: 5.7986), Gradient norm: 89.2509


 85%|████████▍ | 397/469 [00:21<00:03, 18.66it/s]

Step 1,800, (N samples: 230,400), Loss: 140.5985, (Recon: 134.6596, KL: 5.9389), Gradient norm: 74.5738


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.75it/s]


====> Test set loss: 149.1919, (BCE: 143.3657, KLD: 5.8262)
Epoch 5/50


  6%|▌         | 28/469 [00:01<00:23, 18.76it/s]

Step 1,900, (N samples: 243,200), Loss: 150.0142, (Recon: 144.1458, KL: 5.8684), Gradient norm: 84.6961


 27%|██▋       | 128/469 [00:06<00:18, 18.58it/s]

Step 2,000, (N samples: 256,000), Loss: 148.7797, (Recon: 142.6800, KL: 6.0996), Gradient norm: 57.4233


 49%|████▊     | 228/469 [00:12<00:13, 18.40it/s]

Step 2,100, (N samples: 268,800), Loss: 147.9568, (Recon: 141.7807, KL: 6.1761), Gradient norm: 53.1024


 70%|██████▉   | 327/469 [00:17<00:07, 18.83it/s]

Step 2,200, (N samples: 281,600), Loss: 151.2685, (Recon: 144.7771, KL: 6.4914), Gradient norm: 100.9732


 91%|█████████▏| 428/469 [00:22<00:02, 18.68it/s]

Step 2,300, (N samples: 294,400), Loss: 145.4922, (Recon: 139.2228, KL: 6.2695), Gradient norm: 106.2865


100%|██████████| 469/469 [00:25<00:00, 18.71it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.00it/s]


====> Test set loss: 147.5649, (BCE: 141.5121, KLD: 6.0529)
Epoch 6/50


 13%|█▎        | 59/469 [00:03<00:22, 18.39it/s]

Step 2,400, (N samples: 307,200), Loss: 145.5887, (Recon: 139.6310, KL: 5.9576), Gradient norm: 71.7802


 34%|███▍      | 159/469 [00:08<00:16, 18.31it/s]

Step 2,500, (N samples: 320,000), Loss: 147.1773, (Recon: 141.1013, KL: 6.0760), Gradient norm: 81.6707


 55%|█████▌    | 259/469 [00:13<00:12, 17.46it/s]

Step 2,600, (N samples: 332,800), Loss: 154.3018, (Recon: 148.0807, KL: 6.2212), Gradient norm: 72.0092


 76%|███████▋  | 358/469 [00:19<00:05, 18.68it/s]

Step 2,700, (N samples: 345,600), Loss: 152.9467, (Recon: 146.8400, KL: 6.1066), Gradient norm: 83.9598


 98%|█████████▊| 458/469 [00:24<00:00, 18.52it/s]

Step 2,800, (N samples: 358,400), Loss: 150.0368, (Recon: 143.9083, KL: 6.1285), Gradient norm: 77.6679


100%|██████████| 469/469 [00:25<00:00, 18.65it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.38it/s]


====> Test set loss: 145.8272, (BCE: 139.6699, KLD: 6.1573)
Epoch 7/50


 19%|█▉        | 90/469 [00:04<00:20, 18.32it/s]

Step 2,900, (N samples: 371,200), Loss: 138.0961, (Recon: 131.8519, KL: 6.2442), Gradient norm: 84.8264


 41%|████      | 190/469 [00:10<00:15, 18.00it/s]

Step 3,000, (N samples: 384,000), Loss: 150.8111, (Recon: 144.6434, KL: 6.1677), Gradient norm: 66.2999


 62%|██████▏   | 290/469 [00:15<00:09, 18.85it/s]

Step 3,100, (N samples: 396,800), Loss: 145.7041, (Recon: 139.4160, KL: 6.2881), Gradient norm: 118.8485


 83%|████████▎ | 390/469 [00:20<00:04, 18.69it/s]

Step 3,200, (N samples: 409,600), Loss: 143.0074, (Recon: 136.7778, KL: 6.2295), Gradient norm: 181.7702


100%|██████████| 469/469 [00:25<00:00, 18.68it/s]
Testing: 100%|██████████| 79/79 [00:04<00:00, 17.15it/s]


====> Test set loss: 145.0445, (BCE: 138.7302, KLD: 6.3143)
Epoch 8/50


  4%|▍         | 20/469 [00:01<00:24, 18.64it/s]

Step 3,300, (N samples: 422,400), Loss: 144.6274, (Recon: 138.2437, KL: 6.3836), Gradient norm: 115.2958


 26%|██▌       | 120/469 [00:06<00:18, 18.50it/s]

Step 3,400, (N samples: 435,200), Loss: 140.0685, (Recon: 133.8764, KL: 6.1921), Gradient norm: 152.6463


 47%|████▋     | 220/469 [00:11<00:13, 18.68it/s]

Step 3,500, (N samples: 448,000), Loss: 144.3762, (Recon: 138.0593, KL: 6.3169), Gradient norm: 109.6844


 68%|██████▊   | 320/469 [00:17<00:08, 18.32it/s]

Step 3,600, (N samples: 460,800), Loss: 136.2980, (Recon: 130.0963, KL: 6.2017), Gradient norm: 76.3252


 90%|████████▉ | 420/469 [00:22<00:02, 18.40it/s]

Step 3,700, (N samples: 473,600), Loss: 148.3062, (Recon: 141.8611, KL: 6.4452), Gradient norm: 141.1624


100%|██████████| 469/469 [00:25<00:00, 18.67it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.62it/s]


====> Test set loss: 145.2482, (BCE: 138.8416, KLD: 6.4067)
Epoch 9/50


 11%|█         | 52/469 [00:02<00:22, 18.78it/s]

Step 3,800, (N samples: 486,400), Loss: 137.9202, (Recon: 131.4986, KL: 6.4216), Gradient norm: 161.1807


 32%|███▏      | 152/469 [00:08<00:17, 18.23it/s]

Step 3,900, (N samples: 499,200), Loss: 138.5201, (Recon: 132.0860, KL: 6.4341), Gradient norm: 127.0975


 54%|█████▎    | 252/469 [00:13<00:12, 17.77it/s]

Step 4,000, (N samples: 512,000), Loss: 148.1732, (Recon: 141.9656, KL: 6.2076), Gradient norm: 102.9872


 75%|███████▍  | 351/469 [00:18<00:06, 18.25it/s]

Step 4,100, (N samples: 524,800), Loss: 143.0355, (Recon: 136.5625, KL: 6.4730), Gradient norm: 192.9636


 96%|█████████▌| 451/469 [00:24<00:00, 18.50it/s]

Step 4,200, (N samples: 537,600), Loss: 152.5718, (Recon: 146.1212, KL: 6.4506), Gradient norm: 183.3118


100%|██████████| 469/469 [00:25<00:00, 18.56it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.68it/s]


====> Test set loss: 145.7688, (BCE: 139.3141, KLD: 6.4547)
Epoch 10/50


 17%|█▋        | 82/469 [00:04<00:20, 18.78it/s]

Step 4,300, (N samples: 550,400), Loss: 145.5181, (Recon: 139.1072, KL: 6.4109), Gradient norm: 150.2375


 39%|███▉      | 182/469 [00:09<00:15, 18.94it/s]

Step 4,400, (N samples: 563,200), Loss: 141.8240, (Recon: 135.2745, KL: 6.5495), Gradient norm: 130.8515


 60%|██████    | 283/469 [00:15<00:10, 18.56it/s]

Step 4,500, (N samples: 576,000), Loss: 142.8272, (Recon: 136.2323, KL: 6.5949), Gradient norm: 136.0577


 81%|████████▏ | 382/469 [00:20<00:04, 18.72it/s]

Step 4,600, (N samples: 588,800), Loss: 146.6347, (Recon: 140.3443, KL: 6.2904), Gradient norm: 102.6663


100%|██████████| 469/469 [00:25<00:00, 18.64it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.71it/s]


====> Test set loss: 148.1903, (BCE: 141.8668, KLD: 6.3235)
Epoch 11/50


  3%|▎         | 13/469 [00:00<00:25, 18.13it/s]

Step 4,700, (N samples: 601,600), Loss: 149.8474, (Recon: 143.4427, KL: 6.4046), Gradient norm: 95.5478


 24%|██▍       | 113/469 [00:06<00:19, 18.72it/s]

Step 4,800, (N samples: 614,400), Loss: 135.7281, (Recon: 129.1455, KL: 6.5826), Gradient norm: 112.1977


 45%|████▌     | 213/469 [00:11<00:13, 18.63it/s]

Step 4,900, (N samples: 627,200), Loss: 142.5380, (Recon: 136.1259, KL: 6.4121), Gradient norm: 128.2707


 67%|██████▋   | 314/469 [00:16<00:08, 18.59it/s]

Step 5,000, (N samples: 640,000), Loss: 144.3552, (Recon: 137.9608, KL: 6.3945), Gradient norm: 201.9519


 88%|████████▊ | 413/469 [00:22<00:03, 18.44it/s]

Step 5,100, (N samples: 652,800), Loss: 138.7487, (Recon: 132.1956, KL: 6.5530), Gradient norm: 166.3640


100%|██████████| 469/469 [00:25<00:00, 18.57it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.59it/s]


====> Test set loss: 144.3078, (BCE: 137.9424, KLD: 6.3655)
Epoch 12/50


  9%|▉         | 44/469 [00:02<00:23, 18.35it/s]

Step 5,200, (N samples: 665,600), Loss: 144.5532, (Recon: 138.2220, KL: 6.3312), Gradient norm: 132.9664


 31%|███       | 144/469 [00:07<00:17, 18.39it/s]

Step 5,300, (N samples: 678,400), Loss: 149.0059, (Recon: 142.4422, KL: 6.5637), Gradient norm: 125.4048


 52%|█████▏    | 244/469 [00:13<00:12, 18.59it/s]

Step 5,400, (N samples: 691,200), Loss: 143.4702, (Recon: 136.8817, KL: 6.5885), Gradient norm: 106.9605


 73%|███████▎  | 344/469 [00:19<00:07, 16.53it/s]

Step 5,500, (N samples: 704,000), Loss: 141.2121, (Recon: 134.6249, KL: 6.5873), Gradient norm: 231.9612


 95%|█████████▍| 444/469 [00:24<00:01, 18.33it/s]

Step 5,600, (N samples: 716,800), Loss: 142.2703, (Recon: 135.7897, KL: 6.4806), Gradient norm: 125.0190


100%|██████████| 469/469 [00:25<00:00, 18.10it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.73it/s]


====> Test set loss: 142.6584, (BCE: 136.1734, KLD: 6.4850)
Epoch 13/50


 16%|█▌        | 75/469 [00:04<00:21, 18.30it/s]

Step 5,700, (N samples: 729,600), Loss: 147.0957, (Recon: 140.5678, KL: 6.5279), Gradient norm: 156.9834


 37%|███▋      | 175/469 [00:09<00:15, 18.38it/s]

Step 5,800, (N samples: 742,400), Loss: 149.8268, (Recon: 143.0756, KL: 6.7512), Gradient norm: 122.6911


 59%|█████▉    | 276/469 [00:14<00:10, 18.65it/s]

Step 5,900, (N samples: 755,200), Loss: 144.7351, (Recon: 138.1006, KL: 6.6345), Gradient norm: 161.5076


 80%|███████▉  | 375/469 [00:20<00:05, 17.96it/s]

Step 6,000, (N samples: 768,000), Loss: 144.6331, (Recon: 138.0682, KL: 6.5649), Gradient norm: 171.2974


100%|██████████| 469/469 [00:25<00:00, 18.53it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.20it/s]


====> Test set loss: 140.9677, (BCE: 134.4740, KLD: 6.4938)
Epoch 14/50


  1%|▏         | 6/469 [00:00<00:28, 16.34it/s]

Step 6,100, (N samples: 780,800), Loss: 133.4146, (Recon: 126.9458, KL: 6.4688), Gradient norm: 89.7621


 23%|██▎       | 106/469 [00:05<00:19, 18.38it/s]

Step 6,200, (N samples: 793,600), Loss: 143.3612, (Recon: 136.7807, KL: 6.5804), Gradient norm: 76.5412


 44%|████▍     | 206/469 [00:11<00:14, 18.63it/s]

Step 6,300, (N samples: 806,400), Loss: 142.4116, (Recon: 135.7624, KL: 6.6492), Gradient norm: 165.8525


 65%|██████▌   | 306/469 [00:16<00:08, 18.13it/s]

Step 6,400, (N samples: 819,200), Loss: 139.9067, (Recon: 133.1035, KL: 6.8033), Gradient norm: 136.7264


 87%|████████▋ | 406/469 [00:21<00:03, 18.28it/s]

Step 6,500, (N samples: 832,000), Loss: 141.9948, (Recon: 135.2987, KL: 6.6961), Gradient norm: 59.5804


100%|██████████| 469/469 [00:25<00:00, 18.44it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.52it/s]


====> Test set loss: 142.0890, (BCE: 135.5140, KLD: 6.5750)
Epoch 15/50


  8%|▊         | 37/469 [00:02<00:23, 18.37it/s]

Step 6,600, (N samples: 844,800), Loss: 139.2978, (Recon: 132.6604, KL: 6.6374), Gradient norm: 99.1829


 29%|██▉       | 137/469 [00:07<00:17, 18.55it/s]

Step 6,700, (N samples: 857,600), Loss: 138.6450, (Recon: 132.1143, KL: 6.5306), Gradient norm: 134.7361


 51%|█████     | 237/469 [00:12<00:12, 18.67it/s]

Step 6,800, (N samples: 870,400), Loss: 148.5580, (Recon: 142.2397, KL: 6.3183), Gradient norm: 212.4228


 72%|███████▏  | 336/469 [00:17<00:07, 18.97it/s]

Step 6,900, (N samples: 883,200), Loss: 143.9353, (Recon: 137.3397, KL: 6.5956), Gradient norm: 181.6574


 93%|█████████▎| 438/469 [00:23<00:01, 17.73it/s]

Step 7,000, (N samples: 896,000), Loss: 147.3972, (Recon: 140.8501, KL: 6.5471), Gradient norm: 127.5834


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.61it/s]


====> Test set loss: 141.0774, (BCE: 134.4643, KLD: 6.6131)
Epoch 16/50


 15%|█▍        | 69/469 [00:03<00:22, 18.01it/s]

Step 7,100, (N samples: 908,800), Loss: 141.5593, (Recon: 134.9630, KL: 6.5963), Gradient norm: 239.0480


 36%|███▌      | 169/469 [00:09<00:16, 18.72it/s]

Step 7,200, (N samples: 921,600), Loss: 143.1077, (Recon: 136.5296, KL: 6.5781), Gradient norm: 101.0513


 57%|█████▋    | 269/469 [00:14<00:11, 17.87it/s]

Step 7,300, (N samples: 934,400), Loss: 136.9408, (Recon: 130.3537, KL: 6.5871), Gradient norm: 135.2608


 79%|███████▊  | 369/469 [00:19<00:05, 18.72it/s]

Step 7,400, (N samples: 947,200), Loss: 143.9893, (Recon: 137.3578, KL: 6.6316), Gradient norm: 126.6035


100%|██████████| 469/469 [00:26<00:00, 18.01it/s]


Step 7,500, (N samples: 960,000), Loss: 147.7388, (Recon: 141.0249, KL: 6.7139), Gradient norm: 91.4403


Testing: 100%|██████████| 79/79 [00:03<00:00, 20.84it/s]


====> Test set loss: 141.7059, (BCE: 135.1097, KLD: 6.5962)
Epoch 17/50


 21%|██        | 99/469 [00:05<00:19, 18.56it/s]

Step 7,600, (N samples: 972,800), Loss: 140.5188, (Recon: 133.8446, KL: 6.6742), Gradient norm: 109.6699


 42%|████▏     | 199/469 [00:10<00:14, 18.29it/s]

Step 7,700, (N samples: 985,600), Loss: 148.6741, (Recon: 141.8272, KL: 6.8469), Gradient norm: 149.2682


 64%|██████▍   | 300/469 [00:16<00:09, 18.73it/s]

Step 7,800, (N samples: 998,400), Loss: 142.4538, (Recon: 135.5416, KL: 6.9122), Gradient norm: 130.1659


 85%|████████▌ | 399/469 [00:21<00:03, 18.36it/s]

Step 7,900, (N samples: 1,011,200), Loss: 137.9646, (Recon: 131.1839, KL: 6.7807), Gradient norm: 95.3462


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.88it/s]


====> Test set loss: 146.4282, (BCE: 139.8744, KLD: 6.5538)
Epoch 18/50


  6%|▋         | 30/469 [00:01<00:23, 18.66it/s]

Step 8,000, (N samples: 1,024,000), Loss: 147.9688, (Recon: 141.4376, KL: 6.5312), Gradient norm: 170.2989


 28%|██▊       | 130/469 [00:06<00:18, 18.35it/s]

Step 8,100, (N samples: 1,036,800), Loss: 134.7026, (Recon: 127.9347, KL: 6.7679), Gradient norm: 80.7389


 49%|████▉     | 230/469 [00:12<00:12, 18.44it/s]

Step 8,200, (N samples: 1,049,600), Loss: 142.3166, (Recon: 135.4769, KL: 6.8397), Gradient norm: 140.9554


 70%|███████   | 330/469 [00:17<00:07, 18.81it/s]

Step 8,300, (N samples: 1,062,400), Loss: 135.4176, (Recon: 128.7438, KL: 6.6738), Gradient norm: 137.2853


 92%|█████████▏| 430/469 [00:23<00:02, 18.65it/s]

Step 8,400, (N samples: 1,075,200), Loss: 144.6499, (Recon: 138.1393, KL: 6.5107), Gradient norm: 155.6039


100%|██████████| 469/469 [00:25<00:00, 18.54it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.65it/s]


====> Test set loss: 142.5919, (BCE: 135.9649, KLD: 6.6270)
Epoch 19/50


 13%|█▎        | 61/469 [00:03<00:21, 18.80it/s]

Step 8,500, (N samples: 1,088,000), Loss: 148.0563, (Recon: 141.3799, KL: 6.6764), Gradient norm: 110.5858


 34%|███▍      | 161/469 [00:08<00:16, 18.19it/s]

Step 8,600, (N samples: 1,100,800), Loss: 136.6496, (Recon: 129.9193, KL: 6.7303), Gradient norm: 109.7234


 56%|█████▌    | 261/469 [00:14<00:12, 16.94it/s]

Step 8,700, (N samples: 1,113,600), Loss: 142.0237, (Recon: 135.2984, KL: 6.7253), Gradient norm: 183.1644


 77%|███████▋  | 361/469 [00:19<00:05, 18.73it/s]

Step 8,800, (N samples: 1,126,400), Loss: 148.1919, (Recon: 141.6437, KL: 6.5482), Gradient norm: 178.3115


 98%|█████████▊| 461/469 [00:24<00:00, 18.60it/s]

Step 8,900, (N samples: 1,139,200), Loss: 132.6385, (Recon: 125.8743, KL: 6.7642), Gradient norm: 81.4133


100%|██████████| 469/469 [00:25<00:00, 18.47it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.79it/s]


====> Test set loss: 141.1626, (BCE: 134.4340, KLD: 6.7287)
Epoch 20/50


 20%|█▉        | 92/469 [00:04<00:20, 18.61it/s]

Step 9,000, (N samples: 1,152,000), Loss: 135.3076, (Recon: 128.6805, KL: 6.6271), Gradient norm: 177.3475


 41%|████      | 192/469 [00:10<00:15, 18.31it/s]

Step 9,100, (N samples: 1,164,800), Loss: 139.9888, (Recon: 133.2944, KL: 6.6943), Gradient norm: 125.7266


 62%|██████▏   | 292/469 [00:15<00:09, 18.73it/s]

Step 9,200, (N samples: 1,177,600), Loss: 147.8398, (Recon: 141.1471, KL: 6.6927), Gradient norm: 163.8620


 84%|████████▍ | 393/469 [00:21<00:04, 18.67it/s]

Step 9,300, (N samples: 1,190,400), Loss: 139.5359, (Recon: 132.8573, KL: 6.6786), Gradient norm: 162.3151


100%|██████████| 469/469 [00:25<00:00, 18.44it/s]
Testing: 100%|██████████| 79/79 [00:04<00:00, 17.99it/s]


====> Test set loss: 144.1184, (BCE: 137.4220, KLD: 6.6964)
Epoch 21/50


  5%|▌         | 24/469 [00:01<00:24, 18.45it/s]

Step 9,400, (N samples: 1,203,200), Loss: 143.4449, (Recon: 136.7172, KL: 6.7276), Gradient norm: 132.3009


 26%|██▋       | 124/469 [00:06<00:18, 18.64it/s]

Step 9,500, (N samples: 1,216,000), Loss: 140.9821, (Recon: 134.2914, KL: 6.6908), Gradient norm: 100.3972


 48%|████▊     | 224/469 [00:12<00:13, 18.79it/s]

Step 9,600, (N samples: 1,228,800), Loss: 141.0569, (Recon: 134.4011, KL: 6.6558), Gradient norm: 92.0459


 69%|██████▉   | 323/469 [00:17<00:07, 18.61it/s]

Step 9,700, (N samples: 1,241,600), Loss: 141.0324, (Recon: 134.3182, KL: 6.7142), Gradient norm: 89.6999


 90%|█████████ | 423/469 [00:22<00:02, 18.85it/s]

Step 9,800, (N samples: 1,254,400), Loss: 144.3963, (Recon: 137.7680, KL: 6.6283), Gradient norm: 84.0356


100%|██████████| 469/469 [00:25<00:00, 18.64it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.49it/s]


====> Test set loss: 141.4956, (BCE: 134.7817, KLD: 6.7138)
Epoch 22/50


 12%|█▏        | 55/469 [00:02<00:22, 18.55it/s]

Step 9,900, (N samples: 1,267,200), Loss: 141.1626, (Recon: 134.3133, KL: 6.8493), Gradient norm: 112.6560


 33%|███▎      | 155/469 [00:08<00:16, 18.56it/s]

Step 10,000, (N samples: 1,280,000), Loss: 144.8510, (Recon: 138.0367, KL: 6.8144), Gradient norm: 161.0590


 54%|█████▍    | 255/469 [00:13<00:11, 18.51it/s]

Step 10,100, (N samples: 1,292,800), Loss: 148.4439, (Recon: 141.7419, KL: 6.7020), Gradient norm: 176.9148


 75%|███████▌  | 354/469 [00:19<00:06, 18.54it/s]

Step 10,200, (N samples: 1,305,600), Loss: 142.1414, (Recon: 135.2328, KL: 6.9086), Gradient norm: 160.7080


 97%|█████████▋| 454/469 [00:24<00:00, 18.51it/s]

Step 10,300, (N samples: 1,318,400), Loss: 147.4870, (Recon: 140.9895, KL: 6.4975), Gradient norm: 320.2347


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.69it/s]


====> Test set loss: 144.0688, (BCE: 137.3806, KLD: 6.6882)
Epoch 23/50


 18%|█▊        | 85/469 [00:04<00:20, 18.37it/s]

Step 10,400, (N samples: 1,331,200), Loss: 144.6577, (Recon: 138.0831, KL: 6.5746), Gradient norm: 234.5740


 39%|███▉      | 185/469 [00:10<00:15, 18.36it/s]

Step 10,500, (N samples: 1,344,000), Loss: 140.7430, (Recon: 134.1552, KL: 6.5878), Gradient norm: 156.0794


 61%|██████    | 286/469 [00:15<00:09, 18.65it/s]

Step 10,600, (N samples: 1,356,800), Loss: 142.7963, (Recon: 135.8182, KL: 6.9781), Gradient norm: 203.6531


 82%|████████▏ | 385/469 [00:20<00:04, 18.75it/s]

Step 10,700, (N samples: 1,369,600), Loss: 142.7677, (Recon: 135.7771, KL: 6.9906), Gradient norm: 164.1104


100%|██████████| 469/469 [00:25<00:00, 18.51it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.89it/s]


====> Test set loss: 142.5161, (BCE: 135.6479, KLD: 6.8682)
Epoch 24/50


  3%|▎         | 16/469 [00:00<00:26, 17.29it/s]

Step 10,800, (N samples: 1,382,400), Loss: 137.8361, (Recon: 130.8650, KL: 6.9711), Gradient norm: 330.7723


 25%|██▍       | 116/469 [00:06<00:18, 18.76it/s]

Step 10,900, (N samples: 1,395,200), Loss: 137.4294, (Recon: 130.6665, KL: 6.7629), Gradient norm: 132.8004


 46%|████▌     | 216/469 [00:11<00:14, 18.02it/s]

Step 11,000, (N samples: 1,408,000), Loss: 139.1520, (Recon: 132.3811, KL: 6.7710), Gradient norm: 113.0853


 68%|██████▊   | 317/469 [00:17<00:08, 18.85it/s]

Step 11,100, (N samples: 1,420,800), Loss: 143.3599, (Recon: 136.6614, KL: 6.6985), Gradient norm: 120.9594


 89%|████████▊ | 416/469 [00:22<00:02, 18.45it/s]

Step 11,200, (N samples: 1,433,600), Loss: 148.4626, (Recon: 141.5908, KL: 6.8719), Gradient norm: 277.5040


100%|██████████| 469/469 [00:25<00:00, 18.65it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.16it/s]


====> Test set loss: 143.9418, (BCE: 137.2738, KLD: 6.6679)
Epoch 25/50


 10%|█         | 47/469 [00:03<00:30, 13.71it/s]

Step 11,300, (N samples: 1,446,400), Loss: 143.4114, (Recon: 136.6580, KL: 6.7534), Gradient norm: 152.1657


 31%|███▏      | 147/469 [00:08<00:17, 18.36it/s]

Step 11,400, (N samples: 1,459,200), Loss: 144.3835, (Recon: 137.7559, KL: 6.6276), Gradient norm: 285.4876


 53%|█████▎    | 247/469 [00:14<00:11, 18.67it/s]

Step 11,500, (N samples: 1,472,000), Loss: 144.8867, (Recon: 138.0182, KL: 6.8685), Gradient norm: 116.1309


 74%|███████▍  | 348/469 [00:19<00:07, 17.15it/s]

Step 11,600, (N samples: 1,484,800), Loss: 139.2625, (Recon: 132.5893, KL: 6.6733), Gradient norm: 219.0252


 96%|█████████▌| 448/469 [00:24<00:01, 18.30it/s]

Step 11,700, (N samples: 1,497,600), Loss: 142.2482, (Recon: 135.5321, KL: 6.7161), Gradient norm: 173.8506


100%|██████████| 469/469 [00:25<00:00, 18.07it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.69it/s]


====> Test set loss: 140.2256, (BCE: 133.4428, KLD: 6.7829)
Epoch 26/50


 17%|█▋        | 78/469 [00:04<00:21, 18.32it/s]

Step 11,800, (N samples: 1,510,400), Loss: 140.2390, (Recon: 133.3441, KL: 6.8950), Gradient norm: 196.6610


 38%|███▊      | 178/469 [00:09<00:15, 18.66it/s]

Step 11,900, (N samples: 1,523,200), Loss: 136.4160, (Recon: 129.7087, KL: 6.7073), Gradient norm: 151.9096


 59%|█████▉    | 278/469 [00:15<00:10, 18.09it/s]

Step 12,000, (N samples: 1,536,000), Loss: 143.4940, (Recon: 136.5114, KL: 6.9825), Gradient norm: 158.7905


 81%|████████  | 379/469 [00:20<00:04, 18.87it/s]

Step 12,100, (N samples: 1,548,800), Loss: 140.8625, (Recon: 133.9943, KL: 6.8683), Gradient norm: 102.9156


100%|██████████| 469/469 [00:25<00:00, 18.45it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.76it/s]


====> Test set loss: 141.2582, (BCE: 134.5763, KLD: 6.6818)
Epoch 27/50


  2%|▏         | 10/469 [00:00<00:25, 18.32it/s]

Step 12,200, (N samples: 1,561,600), Loss: 141.1962, (Recon: 134.2464, KL: 6.9498), Gradient norm: 135.2651


 23%|██▎       | 110/469 [00:06<00:19, 18.36it/s]

Step 12,300, (N samples: 1,574,400), Loss: 144.1326, (Recon: 137.2810, KL: 6.8516), Gradient norm: 137.3479


 45%|████▍     | 210/469 [00:11<00:13, 18.82it/s]

Step 12,400, (N samples: 1,587,200), Loss: 139.5995, (Recon: 132.7225, KL: 6.8769), Gradient norm: 178.4144


 66%|██████▌   | 310/469 [00:16<00:08, 18.86it/s]

Step 12,500, (N samples: 1,600,000), Loss: 138.5746, (Recon: 131.5666, KL: 7.0079), Gradient norm: 133.1159


 87%|████████▋ | 410/469 [00:22<00:03, 18.39it/s]

Step 12,600, (N samples: 1,612,800), Loss: 137.1750, (Recon: 130.3503, KL: 6.8247), Gradient norm: 177.5393


100%|██████████| 469/469 [00:25<00:00, 18.57it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.68it/s]


====> Test set loss: 139.6784, (BCE: 132.7579, KLD: 6.9205)
Epoch 28/50


  9%|▊         | 40/469 [00:02<00:23, 18.62it/s]

Step 12,700, (N samples: 1,625,600), Loss: 134.3676, (Recon: 127.4449, KL: 6.9227), Gradient norm: 176.1420


 30%|██▉       | 140/469 [00:07<00:17, 18.97it/s]

Step 12,800, (N samples: 1,638,400), Loss: 135.6654, (Recon: 128.7152, KL: 6.9502), Gradient norm: 191.5588


 51%|█████     | 240/469 [00:12<00:12, 18.46it/s]

Step 12,900, (N samples: 1,651,200), Loss: 152.9754, (Recon: 146.0797, KL: 6.8957), Gradient norm: 131.0794


 73%|███████▎  | 341/469 [00:18<00:06, 18.81it/s]

Step 13,000, (N samples: 1,664,000), Loss: 141.7204, (Recon: 135.0677, KL: 6.6527), Gradient norm: 110.5290


 94%|█████████▍| 441/469 [00:23<00:01, 18.33it/s]

Step 13,100, (N samples: 1,676,800), Loss: 138.2267, (Recon: 131.4864, KL: 6.7402), Gradient norm: 180.1490


100%|██████████| 469/469 [00:25<00:00, 18.67it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.93it/s]


====> Test set loss: 139.7117, (BCE: 132.6873, KLD: 7.0243)
Epoch 29/50


 15%|█▌        | 72/469 [00:03<00:21, 18.68it/s]

Step 13,200, (N samples: 1,689,600), Loss: 141.8096, (Recon: 134.7758, KL: 7.0338), Gradient norm: 170.0282


 37%|███▋      | 172/469 [00:09<00:16, 18.56it/s]

Step 13,300, (N samples: 1,702,400), Loss: 133.9638, (Recon: 126.9452, KL: 7.0186), Gradient norm: 146.9163


 58%|█████▊    | 270/469 [00:15<00:11, 17.55it/s]

Step 13,400, (N samples: 1,715,200), Loss: 142.5085, (Recon: 135.7422, KL: 6.7663), Gradient norm: 132.6820


 79%|███████▉  | 372/469 [00:20<00:05, 18.38it/s]

Step 13,500, (N samples: 1,728,000), Loss: 140.9978, (Recon: 134.0452, KL: 6.9526), Gradient norm: 112.0100


100%|██████████| 469/469 [00:26<00:00, 18.02it/s]


Step 13,600, (N samples: 1,740,800), Loss: 136.1691, (Recon: 129.1039, KL: 7.0652), Gradient norm: 232.8441


Testing: 100%|██████████| 79/79 [00:03<00:00, 20.80it/s]


====> Test set loss: 143.1330, (BCE: 136.2335, KLD: 6.8995)
Epoch 30/50


 22%|██▏       | 103/469 [00:05<00:19, 18.97it/s]

Step 13,700, (N samples: 1,753,600), Loss: 136.5271, (Recon: 129.6089, KL: 6.9182), Gradient norm: 129.8179


 43%|████▎     | 203/469 [00:10<00:14, 18.66it/s]

Step 13,800, (N samples: 1,766,400), Loss: 141.5051, (Recon: 134.5768, KL: 6.9283), Gradient norm: 141.3239


 65%|██████▍   | 303/469 [00:16<00:08, 18.80it/s]

Step 13,900, (N samples: 1,779,200), Loss: 137.4658, (Recon: 130.5728, KL: 6.8929), Gradient norm: 208.4168


 86%|████████▌ | 403/469 [00:21<00:03, 18.70it/s]

Step 14,000, (N samples: 1,792,000), Loss: 143.4810, (Recon: 136.5780, KL: 6.9030), Gradient norm: 135.4730


100%|██████████| 469/469 [00:25<00:00, 18.62it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.96it/s]


====> Test set loss: 139.8515, (BCE: 132.8953, KLD: 6.9563)
Epoch 31/50


  7%|▋         | 33/469 [00:01<00:23, 18.72it/s]

Step 14,100, (N samples: 1,804,800), Loss: 140.7541, (Recon: 133.9483, KL: 6.8058), Gradient norm: 248.6609


 28%|██▊       | 133/469 [00:07<00:18, 17.94it/s]

Step 14,200, (N samples: 1,817,600), Loss: 135.2008, (Recon: 128.3166, KL: 6.8842), Gradient norm: 151.1642


 50%|████▉     | 233/469 [00:12<00:12, 18.70it/s]

Step 14,300, (N samples: 1,830,400), Loss: 135.1046, (Recon: 128.0962, KL: 7.0084), Gradient norm: 128.9493


 71%|███████   | 333/469 [00:17<00:07, 18.83it/s]

Step 14,400, (N samples: 1,843,200), Loss: 142.1549, (Recon: 135.0597, KL: 7.0952), Gradient norm: 114.0428


 92%|█████████▏| 433/469 [00:23<00:01, 18.37it/s]

Step 14,500, (N samples: 1,856,000), Loss: 143.0835, (Recon: 135.8410, KL: 7.2425), Gradient norm: 217.9443


100%|██████████| 469/469 [00:25<00:00, 18.58it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.72it/s]


====> Test set loss: 140.0089, (BCE: 132.9897, KLD: 7.0192)
Epoch 32/50


 14%|█▍        | 65/469 [00:03<00:21, 18.97it/s]

Step 14,600, (N samples: 1,868,800), Loss: 139.7122, (Recon: 132.7533, KL: 6.9589), Gradient norm: 241.1725


 35%|███▌      | 165/469 [00:08<00:16, 18.57it/s]

Step 14,700, (N samples: 1,881,600), Loss: 143.2611, (Recon: 136.0638, KL: 7.1972), Gradient norm: 184.3573


 57%|█████▋    | 265/469 [00:14<00:11, 18.43it/s]

Step 14,800, (N samples: 1,894,400), Loss: 138.9251, (Recon: 131.8187, KL: 7.1064), Gradient norm: 180.0061


 78%|███████▊  | 364/469 [00:19<00:05, 17.99it/s]

Step 14,900, (N samples: 1,907,200), Loss: 144.6121, (Recon: 137.5553, KL: 7.0567), Gradient norm: 149.3528


 99%|█████████▉| 464/469 [00:24<00:00, 18.67it/s]

Step 15,000, (N samples: 1,920,000), Loss: 151.2910, (Recon: 144.3976, KL: 6.8935), Gradient norm: 155.1275


100%|██████████| 469/469 [00:25<00:00, 18.62it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.59it/s]


====> Test set loss: 140.9950, (BCE: 134.2190, KLD: 6.7760)
Epoch 33/50


 20%|██        | 96/469 [00:05<00:19, 18.76it/s]

Step 15,100, (N samples: 1,932,800), Loss: 142.1592, (Recon: 135.3426, KL: 6.8166), Gradient norm: 141.3351


 42%|████▏     | 196/469 [00:10<00:14, 18.39it/s]

Step 15,200, (N samples: 1,945,600), Loss: 135.1627, (Recon: 128.0955, KL: 7.0672), Gradient norm: 181.4703


 63%|██████▎   | 295/469 [00:16<00:09, 18.62it/s]

Step 15,300, (N samples: 1,958,400), Loss: 134.6259, (Recon: 127.5858, KL: 7.0401), Gradient norm: 156.5909


 84%|████████▍ | 395/469 [00:21<00:03, 18.67it/s]

Step 15,400, (N samples: 1,971,200), Loss: 138.5849, (Recon: 131.6019, KL: 6.9830), Gradient norm: 295.2085


100%|██████████| 469/469 [00:25<00:00, 18.36it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.41it/s]


====> Test set loss: 138.9705, (BCE: 131.8617, KLD: 7.1088)
Epoch 34/50


  6%|▌         | 26/469 [00:01<00:23, 18.63it/s]

Step 15,500, (N samples: 1,984,000), Loss: 139.4212, (Recon: 132.7553, KL: 6.6659), Gradient norm: 116.2671


 27%|██▋       | 126/469 [00:06<00:18, 18.75it/s]

Step 15,600, (N samples: 1,996,800), Loss: 138.8198, (Recon: 131.8164, KL: 7.0034), Gradient norm: 236.7503


 48%|████▊     | 226/469 [00:12<00:13, 18.40it/s]

Step 15,700, (N samples: 2,009,600), Loss: 133.9301, (Recon: 126.9436, KL: 6.9864), Gradient norm: 157.0995


 70%|██████▉   | 327/469 [00:17<00:07, 18.82it/s]

Step 15,800, (N samples: 2,022,400), Loss: 140.4118, (Recon: 133.5521, KL: 6.8596), Gradient norm: 179.9226


 91%|█████████ | 427/469 [00:22<00:02, 18.71it/s]

Step 15,900, (N samples: 2,035,200), Loss: 146.9277, (Recon: 139.6909, KL: 7.2368), Gradient norm: 137.1534


100%|██████████| 469/469 [00:25<00:00, 18.64it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.77it/s]


====> Test set loss: 141.7296, (BCE: 134.8766, KLD: 6.8530)
Epoch 35/50


 12%|█▏        | 58/469 [00:03<00:22, 18.51it/s]

Step 16,000, (N samples: 2,048,000), Loss: 142.1454, (Recon: 135.2832, KL: 6.8622), Gradient norm: 139.8293


 34%|███▎      | 158/469 [00:08<00:17, 18.29it/s]

Step 16,100, (N samples: 2,060,800), Loss: 142.7329, (Recon: 135.9479, KL: 6.7850), Gradient norm: 206.6332


 55%|█████▌    | 258/469 [00:13<00:11, 18.68it/s]

Step 16,200, (N samples: 2,073,600), Loss: 136.8517, (Recon: 129.8067, KL: 7.0450), Gradient norm: 236.3390


 76%|███████▋  | 358/469 [00:19<00:05, 18.94it/s]

Step 16,300, (N samples: 2,086,400), Loss: 142.1601, (Recon: 135.2905, KL: 6.8695), Gradient norm: 182.5896


 98%|█████████▊| 458/469 [00:24<00:00, 18.65it/s]

Step 16,400, (N samples: 2,099,200), Loss: 139.6337, (Recon: 132.7961, KL: 6.8376), Gradient norm: 177.0976


100%|██████████| 469/469 [00:25<00:00, 18.56it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.06it/s]


====> Test set loss: 138.9544, (BCE: 131.9503, KLD: 7.0041)
Epoch 36/50


 19%|█▉        | 89/469 [00:04<00:20, 18.19it/s]

Step 16,500, (N samples: 2,112,000), Loss: 139.7308, (Recon: 132.8311, KL: 6.8998), Gradient norm: 139.8778


 40%|████      | 189/469 [00:10<00:14, 18.68it/s]

Step 16,600, (N samples: 2,124,800), Loss: 133.8231, (Recon: 126.6766, KL: 7.1465), Gradient norm: 171.3625


 62%|██████▏   | 289/469 [00:15<00:09, 18.71it/s]

Step 16,700, (N samples: 2,137,600), Loss: 147.3334, (Recon: 140.4534, KL: 6.8800), Gradient norm: 372.8869


 83%|████████▎ | 389/469 [00:20<00:04, 18.61it/s]

Step 16,800, (N samples: 2,150,400), Loss: 141.2024, (Recon: 134.2470, KL: 6.9554), Gradient norm: 242.1676


100%|██████████| 469/469 [00:25<00:00, 18.71it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.82it/s]


====> Test set loss: 139.7997, (BCE: 132.8850, KLD: 6.9146)
Epoch 37/50


  4%|▍         | 20/469 [00:01<00:24, 18.61it/s]

Step 16,900, (N samples: 2,163,200), Loss: 139.3018, (Recon: 132.4131, KL: 6.8887), Gradient norm: 137.6622


 26%|██▌       | 120/469 [00:06<00:19, 18.16it/s]

Step 17,000, (N samples: 2,176,000), Loss: 141.2271, (Recon: 134.0855, KL: 7.1415), Gradient norm: 168.5483


 47%|████▋     | 220/469 [00:11<00:13, 18.53it/s]

Step 17,100, (N samples: 2,188,800), Loss: 141.6315, (Recon: 134.6645, KL: 6.9670), Gradient norm: 180.0826


 68%|██████▊   | 320/469 [00:17<00:08, 18.58it/s]

Step 17,200, (N samples: 2,201,600), Loss: 147.8226, (Recon: 140.5417, KL: 7.2809), Gradient norm: 241.1911


 89%|████████▉ | 419/469 [00:23<00:02, 18.37it/s]

Step 17,300, (N samples: 2,214,400), Loss: 143.0050, (Recon: 135.6839, KL: 7.3211), Gradient norm: 175.4408


100%|██████████| 469/469 [00:25<00:00, 18.19it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.51it/s]


====> Test set loss: 144.4695, (BCE: 137.5988, KLD: 6.8707)
Epoch 38/50


 11%|█         | 50/469 [00:02<00:22, 18.87it/s]

Step 17,400, (N samples: 2,227,200), Loss: 147.6766, (Recon: 140.8130, KL: 6.8635), Gradient norm: 173.2156


 32%|███▏      | 150/469 [00:08<00:17, 18.38it/s]

Step 17,500, (N samples: 2,240,000), Loss: 138.6059, (Recon: 131.5675, KL: 7.0384), Gradient norm: 289.0581


 53%|█████▎    | 250/469 [00:13<00:11, 18.65it/s]

Step 17,600, (N samples: 2,252,800), Loss: 149.2870, (Recon: 142.3629, KL: 6.9241), Gradient norm: 197.4256


 75%|███████▍  | 351/469 [00:18<00:06, 18.87it/s]

Step 17,700, (N samples: 2,265,600), Loss: 141.9935, (Recon: 134.8818, KL: 7.1117), Gradient norm: 139.6066


 96%|█████████▌| 450/469 [00:24<00:01, 18.44it/s]

Step 17,800, (N samples: 2,278,400), Loss: 134.3074, (Recon: 127.2419, KL: 7.0655), Gradient norm: 315.2449


100%|██████████| 469/469 [00:25<00:00, 18.67it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.82it/s]


====> Test set loss: 138.9351, (BCE: 131.8744, KLD: 7.0606)
Epoch 39/50


 17%|█▋        | 82/469 [00:04<00:20, 18.77it/s]

Step 17,900, (N samples: 2,291,200), Loss: 127.5147, (Recon: 120.3794, KL: 7.1354), Gradient norm: 147.3444


 39%|███▉      | 182/469 [00:09<00:15, 18.39it/s]

Step 18,000, (N samples: 2,304,000), Loss: 141.2573, (Recon: 134.2624, KL: 6.9949), Gradient norm: 210.3574


 60%|██████    | 282/469 [00:15<00:10, 18.41it/s]

Step 18,100, (N samples: 2,316,800), Loss: 146.2811, (Recon: 139.4056, KL: 6.8755), Gradient norm: 175.2269


 81%|████████▏ | 382/469 [00:20<00:05, 17.27it/s]

Step 18,200, (N samples: 2,329,600), Loss: 145.5766, (Recon: 138.5425, KL: 7.0341), Gradient norm: 239.2809


100%|██████████| 469/469 [00:25<00:00, 18.51it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.50it/s]


====> Test set loss: 139.6353, (BCE: 132.7613, KLD: 6.8740)
Epoch 40/50


  3%|▎         | 13/469 [00:00<00:25, 17.93it/s]

Step 18,300, (N samples: 2,342,400), Loss: 134.8520, (Recon: 128.0131, KL: 6.8388), Gradient norm: 117.8450


 24%|██▍       | 113/469 [00:06<00:18, 18.84it/s]

Step 18,400, (N samples: 2,355,200), Loss: 139.7973, (Recon: 132.9004, KL: 6.8968), Gradient norm: 143.2716


 45%|████▌     | 213/469 [00:11<00:13, 18.53it/s]

Step 18,500, (N samples: 2,368,000), Loss: 135.0924, (Recon: 128.1189, KL: 6.9735), Gradient norm: 218.1189


 67%|██████▋   | 313/469 [00:16<00:08, 18.42it/s]

Step 18,600, (N samples: 2,380,800), Loss: 138.0530, (Recon: 131.0595, KL: 6.9934), Gradient norm: 222.1414


 88%|████████▊ | 413/469 [00:22<00:03, 18.32it/s]

Step 18,700, (N samples: 2,393,600), Loss: 133.3754, (Recon: 126.2927, KL: 7.0826), Gradient norm: 166.5723


100%|██████████| 469/469 [00:25<00:00, 18.49it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.33it/s]


====> Test set loss: 138.2605, (BCE: 131.2530, KLD: 7.0076)
Epoch 41/50


  9%|▉         | 44/469 [00:02<00:23, 18.39it/s]

Step 18,800, (N samples: 2,406,400), Loss: 145.9776, (Recon: 138.8660, KL: 7.1116), Gradient norm: 229.8978


 31%|███       | 144/469 [00:07<00:17, 18.13it/s]

Step 18,900, (N samples: 2,419,200), Loss: 141.3158, (Recon: 134.2504, KL: 7.0655), Gradient norm: 160.9616


 52%|█████▏    | 244/469 [00:13<00:12, 18.35it/s]

Step 19,000, (N samples: 2,432,000), Loss: 146.3333, (Recon: 139.1398, KL: 7.1934), Gradient norm: 286.8621


 73%|███████▎  | 344/469 [00:18<00:06, 18.79it/s]

Step 19,100, (N samples: 2,444,800), Loss: 137.9566, (Recon: 130.8022, KL: 7.1544), Gradient norm: 293.2280


 95%|█████████▍| 444/469 [00:24<00:01, 15.58it/s]

Step 19,200, (N samples: 2,457,600), Loss: 140.2882, (Recon: 133.2355, KL: 7.0526), Gradient norm: 275.1366


100%|██████████| 469/469 [00:25<00:00, 18.08it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.84it/s]


====> Test set loss: 140.0288, (BCE: 132.9941, KLD: 7.0347)
Epoch 42/50


 16%|█▌        | 74/469 [00:04<00:21, 18.36it/s]

Step 19,300, (N samples: 2,470,400), Loss: 133.7516, (Recon: 126.5861, KL: 7.1656), Gradient norm: 154.6258


 37%|███▋      | 174/469 [00:09<00:15, 18.63it/s]

Step 19,400, (N samples: 2,483,200), Loss: 137.6682, (Recon: 130.6186, KL: 7.0496), Gradient norm: 100.7709


 58%|█████▊    | 274/469 [00:14<00:10, 18.84it/s]

Step 19,500, (N samples: 2,496,000), Loss: 138.0903, (Recon: 131.1291, KL: 6.9612), Gradient norm: 163.0277


 80%|███████▉  | 373/469 [00:19<00:05, 17.53it/s]

Step 19,600, (N samples: 2,508,800), Loss: 150.7597, (Recon: 143.7138, KL: 7.0459), Gradient norm: 304.0201


100%|██████████| 469/469 [00:25<00:00, 18.64it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.75it/s]


====> Test set loss: 138.2796, (BCE: 131.1762, KLD: 7.1033)
Epoch 43/50


  1%|▏         | 6/469 [00:00<00:27, 16.56it/s]

Step 19,700, (N samples: 2,521,600), Loss: 134.3603, (Recon: 127.0779, KL: 7.2823), Gradient norm: 125.8645


 23%|██▎       | 106/469 [00:05<00:19, 18.91it/s]

Step 19,800, (N samples: 2,534,400), Loss: 137.8528, (Recon: 130.6169, KL: 7.2359), Gradient norm: 138.3309


 44%|████▍     | 206/469 [00:11<00:14, 18.75it/s]

Step 19,900, (N samples: 2,547,200), Loss: 140.2569, (Recon: 133.3456, KL: 6.9113), Gradient norm: 128.5174


 65%|██████▌   | 306/469 [00:16<00:09, 17.37it/s]

Step 20,000, (N samples: 2,560,000), Loss: 134.7355, (Recon: 127.8026, KL: 6.9329), Gradient norm: 171.9390


 86%|████████▋ | 405/469 [00:21<00:03, 18.79it/s]

Step 20,100, (N samples: 2,572,800), Loss: 134.7640, (Recon: 127.9194, KL: 6.8446), Gradient norm: 165.3375


100%|██████████| 469/469 [00:25<00:00, 18.58it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.95it/s]


====> Test set loss: 138.9009, (BCE: 131.9291, KLD: 6.9718)
Epoch 44/50


  8%|▊         | 37/469 [00:02<00:23, 18.75it/s]

Step 20,200, (N samples: 2,585,600), Loss: 139.4478, (Recon: 132.3793, KL: 7.0685), Gradient norm: 318.4921


 29%|██▉       | 137/469 [00:07<00:17, 18.72it/s]

Step 20,300, (N samples: 2,598,400), Loss: 141.7163, (Recon: 134.7566, KL: 6.9597), Gradient norm: 395.8259


 51%|█████     | 237/469 [00:12<00:12, 18.87it/s]

Step 20,400, (N samples: 2,611,200), Loss: 140.0724, (Recon: 132.9048, KL: 7.1676), Gradient norm: 167.6200


 72%|███████▏  | 337/469 [00:18<00:07, 18.68it/s]

Step 20,500, (N samples: 2,624,000), Loss: 139.8473, (Recon: 133.0309, KL: 6.8164), Gradient norm: 129.3438


 93%|█████████▎| 436/469 [00:23<00:01, 18.74it/s]

Step 20,600, (N samples: 2,636,800), Loss: 135.5668, (Recon: 128.5150, KL: 7.0518), Gradient norm: 217.8587


100%|██████████| 469/469 [00:25<00:00, 18.62it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.09it/s]


====> Test set loss: 137.6792, (BCE: 130.5644, KLD: 7.1147)
Epoch 45/50


 14%|█▍        | 68/469 [00:03<00:21, 18.82it/s]

Step 20,700, (N samples: 2,649,600), Loss: 133.8895, (Recon: 126.9161, KL: 6.9734), Gradient norm: 261.8188


 36%|███▌      | 168/469 [00:09<00:16, 18.29it/s]

Step 20,800, (N samples: 2,662,400), Loss: 136.2570, (Recon: 129.3344, KL: 6.9225), Gradient norm: 238.0165


 57%|█████▋    | 268/469 [00:14<00:11, 17.93it/s]

Step 20,900, (N samples: 2,675,200), Loss: 143.7067, (Recon: 136.5240, KL: 7.1827), Gradient norm: 251.4678


 78%|███████▊  | 368/469 [00:19<00:05, 18.88it/s]

Step 21,000, (N samples: 2,688,000), Loss: 138.7346, (Recon: 131.5467, KL: 7.1879), Gradient norm: 294.4963


100%|█████████▉| 468/469 [00:25<00:00, 18.36it/s]

Step 21,100, (N samples: 2,700,800), Loss: 129.2974, (Recon: 122.2531, KL: 7.0443), Gradient norm: 141.0917


100%|██████████| 469/469 [00:25<00:00, 18.42it/s]
Testing: 100%|██████████| 79/79 [00:04<00:00, 17.47it/s]


====> Test set loss: 138.8665, (BCE: 131.7407, KLD: 7.1258)
Epoch 46/50


 21%|██        | 99/469 [00:05<00:20, 18.34it/s]

Step 21,200, (N samples: 2,713,600), Loss: 142.3522, (Recon: 135.2814, KL: 7.0708), Gradient norm: 272.6497


 42%|████▏     | 199/469 [00:10<00:14, 18.37it/s]

Step 21,300, (N samples: 2,726,400), Loss: 135.6277, (Recon: 128.3024, KL: 7.3253), Gradient norm: 152.2478


 64%|██████▍   | 299/469 [00:16<00:09, 18.26it/s]

Step 21,400, (N samples: 2,739,200), Loss: 128.5729, (Recon: 121.5001, KL: 7.0728), Gradient norm: 133.7975


 85%|████████▌ | 399/469 [00:21<00:03, 18.68it/s]

Step 21,500, (N samples: 2,752,000), Loss: 141.0390, (Recon: 134.1710, KL: 6.8680), Gradient norm: 221.4967


100%|██████████| 469/469 [00:25<00:00, 18.54it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.89it/s]


====> Test set loss: 138.8098, (BCE: 131.7516, KLD: 7.0583)
Epoch 47/50


  6%|▌         | 29/469 [00:01<00:23, 18.68it/s]

Step 21,600, (N samples: 2,764,800), Loss: 140.2998, (Recon: 133.2617, KL: 7.0381), Gradient norm: 194.4102


 28%|██▊       | 129/469 [00:06<00:18, 18.52it/s]

Step 21,700, (N samples: 2,777,600), Loss: 138.7131, (Recon: 131.5156, KL: 7.1975), Gradient norm: 280.7858


 49%|████▉     | 229/469 [00:12<00:12, 18.62it/s]

Step 21,800, (N samples: 2,790,400), Loss: 136.9111, (Recon: 129.7040, KL: 7.2070), Gradient norm: 322.1310


 70%|███████   | 329/469 [00:17<00:07, 18.86it/s]

Step 21,900, (N samples: 2,803,200), Loss: 137.1207, (Recon: 130.0499, KL: 7.0708), Gradient norm: 178.9403


 91%|█████████▏| 429/469 [00:23<00:02, 18.68it/s]

Step 22,000, (N samples: 2,816,000), Loss: 141.7489, (Recon: 134.6722, KL: 7.0767), Gradient norm: 316.4717


100%|██████████| 469/469 [00:25<00:00, 18.49it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.81it/s]


====> Test set loss: 137.9028, (BCE: 130.8622, KLD: 7.0406)
Epoch 48/50


 13%|█▎        | 61/469 [00:03<00:21, 18.68it/s]

Step 22,100, (N samples: 2,828,800), Loss: 138.7146, (Recon: 131.6394, KL: 7.0751), Gradient norm: 214.0289


 34%|███▍      | 161/469 [00:08<00:16, 18.87it/s]

Step 22,200, (N samples: 2,841,600), Loss: 139.4216, (Recon: 132.4074, KL: 7.0142), Gradient norm: 310.0010


 56%|█████▌    | 261/469 [00:14<00:11, 18.46it/s]

Step 22,300, (N samples: 2,854,400), Loss: 138.7558, (Recon: 131.9128, KL: 6.8431), Gradient norm: 242.8495


 77%|███████▋  | 361/469 [00:19<00:05, 18.85it/s]

Step 22,400, (N samples: 2,867,200), Loss: 134.2802, (Recon: 127.0738, KL: 7.2064), Gradient norm: 138.3551


 98%|█████████▊| 461/469 [00:24<00:00, 18.66it/s]

Step 22,500, (N samples: 2,880,000), Loss: 134.9864, (Recon: 128.2035, KL: 6.7830), Gradient norm: 222.7601


100%|██████████| 469/469 [00:25<00:00, 18.52it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.85it/s]


====> Test set loss: 141.5695, (BCE: 134.6466, KLD: 6.9229)
Epoch 49/50


 20%|█▉        | 92/469 [00:05<00:20, 18.27it/s]

Step 22,600, (N samples: 2,892,800), Loss: 138.6691, (Recon: 131.6887, KL: 6.9804), Gradient norm: 90.2153


 41%|████      | 192/469 [00:10<00:14, 18.93it/s]

Step 22,700, (N samples: 2,905,600), Loss: 139.9159, (Recon: 132.5131, KL: 7.4028), Gradient norm: 298.9154


 62%|██████▏   | 292/469 [00:15<00:09, 18.38it/s]

Step 22,800, (N samples: 2,918,400), Loss: 139.8803, (Recon: 133.0068, KL: 6.8735), Gradient norm: 167.2894


 84%|████████▎ | 392/469 [00:21<00:04, 18.26it/s]

Step 22,900, (N samples: 2,931,200), Loss: 139.2627, (Recon: 132.1465, KL: 7.1162), Gradient norm: 230.3554


100%|██████████| 469/469 [00:25<00:00, 18.49it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.56it/s]


====> Test set loss: 143.1605, (BCE: 136.2401, KLD: 6.9204)
Epoch 50/50


  5%|▍         | 23/469 [00:01<00:23, 18.75it/s]

Step 23,000, (N samples: 2,944,000), Loss: 146.2236, (Recon: 139.4630, KL: 6.7606), Gradient norm: 276.2056


 26%|██▌       | 123/469 [00:07<00:18, 18.41it/s]

Step 23,100, (N samples: 2,956,800), Loss: 140.8298, (Recon: 133.8865, KL: 6.9433), Gradient norm: 159.0927


 48%|████▊     | 223/469 [00:12<00:13, 18.42it/s]

Step 23,200, (N samples: 2,969,600), Loss: 149.0273, (Recon: 142.1485, KL: 6.8788), Gradient norm: 367.1299


 69%|██████▊   | 322/469 [00:17<00:07, 18.70it/s]

Step 23,300, (N samples: 2,982,400), Loss: 145.7450, (Recon: 138.7938, KL: 6.9512), Gradient norm: 148.6852


 90%|█████████ | 423/469 [00:22<00:01, 32.76it/s]

Step 23,400, (N samples: 2,995,200), Loss: 134.5131, (Recon: 127.5055, KL: 7.0076), Gradient norm: 212.9361


100%|██████████| 469/469 [00:23<00:00, 20.04it/s]
Testing: 100%|██████████| 79/79 [00:02<00:00, 36.16it/s]

====> Test set loss: 142.4217, (BCE: 135.5543, KLD: 6.8674)





In [15]:
# Flush both TensorBoard writers so buffered train/test events are written to
# disk before TensorBoard is opened. The setup cell only defines `writer_train`
# and `writer_test` (the plain `writer` there is commented out), so flushing
# `writer` would raise NameError on a fresh kernel.
for summary_writer in (writer_train, writer_test):
    summary_writer.flush()

In [16]:
# Load the TensorBoard notebook extension; it provides the %tensorboard magic used in the next cell.
%load_ext tensorboard

In [23]:
%tensorboard --logdir ../experiments/VAE_MNIST/20241028-004306/

Reusing TensorBoard on port 6011 (pid 1110344), started 0:00:12 ago. (Use '!kill 1110344' to kill it.)