In [1]:
%load_ext autoreload
# %reload_ext autoreload
%autoreload 2


In [2]:
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter

from methylVA.mnist.model import VAE
from methylVA.mnist.training import train, test


# --- Hyperparameters / run configuration -----------------------------------
# NOTE(review): batch_size is not consumed anywhere in this cell; confirm that
# the data-loading code actually uses this value.
batch_size = 128
learning_rate = 1e-3
weight_decay = 1e-2
num_epochs = 50
latent_dim = 20
hidden_dim = 512
seed = 0
# Derive the run name from latent_dim so the log directory cannot drift out of
# sync with the model when the latent size is changed.
name = f'VAE_MNIST_latent_{latent_dim}'

# Seed torch's RNGs before the model is built so weight initialisation (and any
# torch-driven sampling afterwards) is repeatable across kernel restarts.
torch.manual_seed(seed)

# --- Model and optimizer ----------------------------------------------------
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# input_dim=784 is the flattened input size expected by the first encoder layer.
model = VAE(input_dim=784, latent_dim=latent_dim, hidden_dim=hidden_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# --- TensorBoard writers ----------------------------------------------------
# Take the timestamp once: calling datetime.now() per writer could give the
# train and test logs of the same run different directory names if the two
# calls fall on either side of a second boundary.
run_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
writer_train = SummaryWriter(f'../experiments/{name}/train/{run_timestamp}')
writer_test = SummaryWriter(f'../experiments/{name}/test/{run_timestamp}')

In [3]:
# Bare last expression: the notebook renders the module's repr, listing the
# encoder/decoder layers so the architecture can be checked before training.
model

VAE(
  (encoder): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): SiLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): SiLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): SiLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): SiLU()
    (8): Linear(in_features=64, out_features=40, bias=True)
  )
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (decoder): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): SiLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): SiLU()
    (4): Linear(in_features=128, out_features=256, bias=True)
    (5): SiLU()
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): SiLU()
    (8): Linear(in_features=512, out_features=784, bias=True)
    (9): Sigmoid()
  )
)

In [4]:
from methylVA.mnist.dataset import get_data_loaders

In [5]:
# Build the train/test loaders with the helper's default arguments.
# NOTE(review): the `batch_size` set in the configuration cell is NOT passed
# here. The training log below advances 12,800 samples per 100 steps, which is
# consistent with a default of 128 — confirm against get_data_loaders, and pass
# the value explicitly if the helper accepts it.
train_loader, test_loader = get_data_loaders()

In [6]:
# Debugging aid: print the working directory, which is what the relative
# '../experiments/...' log paths used for the writers resolve against.
# NOTE(review): the saved output exposes an absolute filesystem path; consider
# clearing this cell's output (or dropping the cell) before sharing.
!pwd

/fast/AG_Ohler/ekarimi/projects/methylVA/notebooks


In [7]:
from methylVA.mnist.training import train, test


# Train for `num_epochs` epochs, evaluating on the test loader after each one.
# `prev_updates` is threaded through train(): its return value is fed back in on
# the next epoch and also handed to test(). In the logs the "Step" counter keeps
# increasing across epochs, so it presumably is the global update count used as
# the logging step — confirm in methylVA.mnist.training.
prev_updates = 0
try:
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        prev_updates = train(model, train_loader, optimizer, prev_updates, writer=writer_train)
        test(model, test_loader, prev_updates, writer=writer_test)
finally:
    # Push buffered events to disk even when the run is interrupted or raises,
    # so TensorBoard shows everything logged up to that point. flush() (rather
    # than close()) keeps the writers usable if this cell is run again.
    writer_train.flush()
    writer_test.flush()

Epoch 1/50


  1%|          | 5/469 [00:00<00:39, 11.69it/s]

Step 0, (N samples: 0), Loss: 545.3038, (Recon: 542.9884, KL: 2.3154), Gradient norm: 13.0807


 22%|██▏       | 105/469 [00:03<00:10, 33.22it/s]

Step 100, (N samples: 12,800), Loss: 190.1629, (Recon: 188.1035, KL: 2.0594), Gradient norm: 19.4537


 43%|████▎     | 204/469 [00:08<00:13, 20.08it/s]

Step 200, (N samples: 25,600), Loss: 194.8308, (Recon: 192.5800, KL: 2.2508), Gradient norm: 42.5020


 65%|██████▍   | 303/469 [00:13<00:08, 18.64it/s]

Step 300, (N samples: 38,400), Loss: 183.3213, (Recon: 181.1677, KL: 2.1536), Gradient norm: 15.9688


 86%|████████▌ | 403/469 [00:18<00:03, 18.77it/s]

Step 400, (N samples: 51,200), Loss: 183.4709, (Recon: 181.0135, KL: 2.4575), Gradient norm: 19.0644


100%|██████████| 469/469 [00:22<00:00, 20.87it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.65it/s]


====> Test set loss: 178.4231, (BCE: 174.5274, KLD: 3.8957)
Epoch 2/50


  7%|▋         | 34/469 [00:01<00:23, 18.70it/s]

Step 500, (N samples: 64,000), Loss: 181.8088, (Recon: 177.7304, KL: 4.0784), Gradient norm: 32.8955


 29%|██▊       | 134/469 [00:07<00:17, 18.75it/s]

Step 600, (N samples: 76,800), Loss: 159.4868, (Recon: 154.8157, KL: 4.6711), Gradient norm: 23.8967


 50%|████▉     | 234/469 [00:12<00:13, 17.92it/s]

Step 700, (N samples: 89,600), Loss: 163.6213, (Recon: 158.9894, KL: 4.6319), Gradient norm: 25.0744


 71%|███████   | 334/469 [00:18<00:07, 18.82it/s]

Step 800, (N samples: 102,400), Loss: 155.9545, (Recon: 150.9543, KL: 5.0001), Gradient norm: 36.7788


 93%|█████████▎| 434/469 [00:23<00:01, 18.69it/s]

Step 900, (N samples: 115,200), Loss: 154.7459, (Recon: 149.8170, KL: 4.9288), Gradient norm: 56.0325


100%|██████████| 469/469 [00:25<00:00, 18.51it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.82it/s]


====> Test set loss: 159.1249, (BCE: 153.9844, KLD: 5.1406)
Epoch 3/50


 14%|█▍        | 65/469 [00:03<00:21, 18.58it/s]

Step 1,000, (N samples: 128,000), Loss: 152.3918, (Recon: 147.3051, KL: 5.0867), Gradient norm: 58.5983


 35%|███▌      | 165/469 [00:08<00:16, 18.56it/s]

Step 1,100, (N samples: 140,800), Loss: 156.9747, (Recon: 151.5619, KL: 5.4127), Gradient norm: 44.3742


 57%|█████▋    | 265/469 [00:14<00:10, 18.69it/s]

Step 1,200, (N samples: 153,600), Loss: 154.7737, (Recon: 149.5710, KL: 5.2027), Gradient norm: 45.0839


 78%|███████▊  | 365/469 [00:19<00:05, 18.39it/s]

Step 1,300, (N samples: 166,400), Loss: 145.0081, (Recon: 139.6962, KL: 5.3119), Gradient norm: 40.4109


 99%|█████████▉| 465/469 [00:24<00:00, 18.57it/s]

Step 1,400, (N samples: 179,200), Loss: 152.6568, (Recon: 147.3146, KL: 5.3423), Gradient norm: 35.6618


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.51it/s]


====> Test set loss: 153.6705, (BCE: 148.3473, KLD: 5.3233)
Epoch 4/50


 21%|██        | 97/469 [00:05<00:20, 18.16it/s]

Step 1,500, (N samples: 192,000), Loss: 149.0368, (Recon: 143.4868, KL: 5.5501), Gradient norm: 89.3827


 42%|████▏     | 196/469 [00:10<00:14, 18.81it/s]

Step 1,600, (N samples: 204,800), Loss: 157.9469, (Recon: 152.4594, KL: 5.4875), Gradient norm: 29.2732


 63%|██████▎   | 296/469 [00:15<00:09, 18.67it/s]

Step 1,700, (N samples: 217,600), Loss: 165.0759, (Recon: 159.4810, KL: 5.5949), Gradient norm: 52.2671


 84%|████████▍ | 396/469 [00:21<00:03, 18.62it/s]

Step 1,800, (N samples: 230,400), Loss: 151.5511, (Recon: 145.7901, KL: 5.7611), Gradient norm: 45.8023


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.64it/s]


====> Test set loss: 151.0374, (BCE: 145.3488, KLD: 5.6886)
Epoch 5/50


  6%|▌         | 27/469 [00:01<00:23, 18.59it/s]

Step 1,900, (N samples: 243,200), Loss: 151.0577, (Recon: 145.2827, KL: 5.7749), Gradient norm: 62.3740


 27%|██▋       | 127/469 [00:06<00:18, 18.60it/s]

Step 2,000, (N samples: 256,000), Loss: 154.7288, (Recon: 148.9060, KL: 5.8228), Gradient norm: 58.9474


 49%|████▊     | 228/469 [00:12<00:13, 18.25it/s]

Step 2,100, (N samples: 268,800), Loss: 149.1880, (Recon: 143.4686, KL: 5.7194), Gradient norm: 56.3468


 70%|██████▉   | 328/469 [00:17<00:07, 18.80it/s]

Step 2,200, (N samples: 281,600), Loss: 153.9880, (Recon: 148.2298, KL: 5.7582), Gradient norm: 66.7108


 91%|█████████▏| 428/469 [00:22<00:02, 18.33it/s]

Step 2,300, (N samples: 294,400), Loss: 149.9314, (Recon: 144.0154, KL: 5.9160), Gradient norm: 67.8168


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.72it/s]


====> Test set loss: 149.1549, (BCE: 143.2767, KLD: 5.8782)
Epoch 6/50


 13%|█▎        | 59/469 [00:03<00:22, 18.41it/s]

Step 2,400, (N samples: 307,200), Loss: 149.2830, (Recon: 143.2417, KL: 6.0413), Gradient norm: 63.2268


 34%|███▍      | 159/469 [00:08<00:16, 18.52it/s]

Step 2,500, (N samples: 320,000), Loss: 146.6822, (Recon: 140.8298, KL: 5.8524), Gradient norm: 52.1683


 55%|█████▌    | 258/469 [00:13<00:11, 18.86it/s]

Step 2,600, (N samples: 332,800), Loss: 143.2510, (Recon: 137.2061, KL: 6.0449), Gradient norm: 48.8020


 76%|███████▋  | 358/469 [00:19<00:05, 18.56it/s]

Step 2,700, (N samples: 345,600), Loss: 152.6601, (Recon: 146.5905, KL: 6.0696), Gradient norm: 41.3561


 98%|█████████▊| 458/469 [00:24<00:00, 18.31it/s]

Step 2,800, (N samples: 358,400), Loss: 155.4653, (Recon: 149.4794, KL: 5.9859), Gradient norm: 80.7707


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.64it/s]


====> Test set loss: 148.5950, (BCE: 142.5497, KLD: 6.0453)
Epoch 7/50


 19%|█▉        | 90/469 [00:04<00:20, 18.58it/s]

Step 2,900, (N samples: 371,200), Loss: 149.1302, (Recon: 142.9144, KL: 6.2158), Gradient norm: 81.9095


 40%|████      | 189/469 [00:10<00:14, 18.72it/s]

Step 3,000, (N samples: 384,000), Loss: 146.7363, (Recon: 140.6373, KL: 6.0990), Gradient norm: 61.4464


 62%|██████▏   | 290/469 [00:15<00:09, 18.55it/s]

Step 3,100, (N samples: 396,800), Loss: 152.9536, (Recon: 146.9325, KL: 6.0211), Gradient norm: 63.3458


 83%|████████▎ | 390/469 [00:20<00:04, 18.68it/s]

Step 3,200, (N samples: 409,600), Loss: 145.1168, (Recon: 138.8905, KL: 6.2263), Gradient norm: 128.4948


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.26it/s]


====> Test set loss: 150.1518, (BCE: 143.9871, KLD: 6.1647)
Epoch 8/50


  4%|▍         | 20/469 [00:01<00:24, 18.11it/s]

Step 3,300, (N samples: 422,400), Loss: 142.0529, (Recon: 135.6878, KL: 6.3651), Gradient norm: 90.3374


 26%|██▌       | 120/469 [00:06<00:19, 18.29it/s]

Step 3,400, (N samples: 435,200), Loss: 139.8804, (Recon: 133.2235, KL: 6.6568), Gradient norm: 61.0093


 47%|████▋     | 220/469 [00:12<00:14, 17.26it/s]

Step 3,500, (N samples: 448,000), Loss: 144.7860, (Recon: 137.9845, KL: 6.8015), Gradient norm: 49.9037


 68%|██████▊   | 320/469 [00:17<00:08, 18.33it/s]

Step 3,600, (N samples: 460,800), Loss: 141.3757, (Recon: 134.3260, KL: 7.0497), Gradient norm: 116.5693


 90%|████████▉ | 420/469 [00:23<00:02, 18.30it/s]

Step 3,700, (N samples: 473,600), Loss: 142.1553, (Recon: 134.9940, KL: 7.1614), Gradient norm: 56.2392


100%|██████████| 469/469 [00:25<00:00, 18.04it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.68it/s]


====> Test set loss: 141.1265, (BCE: 133.7554, KLD: 7.3711)
Epoch 9/50


 11%|█         | 51/469 [00:02<00:23, 17.85it/s]

Step 3,800, (N samples: 486,400), Loss: 142.9628, (Recon: 135.1750, KL: 7.7879), Gradient norm: 101.8491


 32%|███▏      | 151/469 [00:08<00:17, 18.58it/s]

Step 3,900, (N samples: 499,200), Loss: 139.2507, (Recon: 131.5421, KL: 7.7086), Gradient norm: 77.4209


 54%|█████▎    | 252/469 [00:13<00:11, 18.77it/s]

Step 4,000, (N samples: 512,000), Loss: 139.6691, (Recon: 131.9126, KL: 7.7566), Gradient norm: 52.8179


 75%|███████▌  | 352/469 [00:18<00:06, 18.30it/s]

Step 4,100, (N samples: 524,800), Loss: 138.7988, (Recon: 131.1760, KL: 7.6228), Gradient norm: 79.8275


 96%|█████████▋| 452/469 [00:24<00:00, 18.60it/s]

Step 4,200, (N samples: 537,600), Loss: 134.3602, (Recon: 126.5392, KL: 7.8210), Gradient norm: 65.5887


100%|██████████| 469/469 [00:25<00:00, 18.66it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.55it/s]


====> Test set loss: 137.9143, (BCE: 130.1925, KLD: 7.7219)
Epoch 10/50


 18%|█▊        | 83/469 [00:04<00:20, 18.52it/s]

Step 4,300, (N samples: 550,400), Loss: 136.2040, (Recon: 128.1428, KL: 8.0612), Gradient norm: 68.4769


 39%|███▉      | 182/469 [00:09<00:15, 18.50it/s]

Step 4,400, (N samples: 563,200), Loss: 140.5765, (Recon: 132.5333, KL: 8.0433), Gradient norm: 57.0794


 60%|██████    | 282/469 [00:15<00:09, 18.74it/s]

Step 4,500, (N samples: 576,000), Loss: 131.0758, (Recon: 122.9148, KL: 8.1610), Gradient norm: 66.4272


 81%|████████▏ | 382/469 [00:20<00:04, 18.82it/s]

Step 4,600, (N samples: 588,800), Loss: 136.6958, (Recon: 128.3072, KL: 8.3887), Gradient norm: 69.6531


100%|██████████| 469/469 [00:25<00:00, 18.63it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.66it/s]


====> Test set loss: 134.2090, (BCE: 125.7601, KLD: 8.4490)
Epoch 11/50


  3%|▎         | 13/469 [00:00<00:25, 17.60it/s]

Step 4,700, (N samples: 601,600), Loss: 131.7243, (Recon: 123.4927, KL: 8.2315), Gradient norm: 77.1398


 24%|██▍       | 113/469 [00:06<00:19, 18.66it/s]

Step 4,800, (N samples: 614,400), Loss: 140.1031, (Recon: 131.5586, KL: 8.5445), Gradient norm: 56.9388


 45%|████▌     | 213/469 [00:11<00:13, 18.32it/s]

Step 4,900, (N samples: 627,200), Loss: 128.6606, (Recon: 119.8156, KL: 8.8451), Gradient norm: 61.4931


 67%|██████▋   | 313/469 [00:16<00:08, 18.64it/s]

Step 5,000, (N samples: 640,000), Loss: 134.3464, (Recon: 125.4104, KL: 8.9360), Gradient norm: 65.3840


 88%|████████▊ | 413/469 [00:22<00:02, 18.76it/s]

Step 5,100, (N samples: 652,800), Loss: 127.4877, (Recon: 118.5350, KL: 8.9527), Gradient norm: 77.0404


100%|██████████| 469/469 [00:25<00:00, 18.53it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.95it/s]


====> Test set loss: 132.5809, (BCE: 123.4158, KLD: 9.1651)
Epoch 12/50


 10%|▉         | 45/469 [00:02<00:23, 18.04it/s]

Step 5,200, (N samples: 665,600), Loss: 120.9848, (Recon: 111.6725, KL: 9.3124), Gradient norm: 91.6603


 31%|███       | 145/469 [00:07<00:17, 18.02it/s]

Step 5,300, (N samples: 678,400), Loss: 129.7169, (Recon: 120.3255, KL: 9.3914), Gradient norm: 61.2931


 52%|█████▏    | 245/469 [00:13<00:12, 18.54it/s]

Step 5,400, (N samples: 691,200), Loss: 129.2221, (Recon: 119.9283, KL: 9.2938), Gradient norm: 81.9413


 74%|███████▎  | 345/469 [00:18<00:06, 18.03it/s]

Step 5,500, (N samples: 704,000), Loss: 129.4216, (Recon: 120.0461, KL: 9.3756), Gradient norm: 59.2154


 95%|█████████▍| 445/469 [00:24<00:01, 18.88it/s]

Step 5,600, (N samples: 716,800), Loss: 125.2938, (Recon: 115.7276, KL: 9.5662), Gradient norm: 60.9099


100%|██████████| 469/469 [00:25<00:00, 18.40it/s]
Testing: 100%|██████████| 79/79 [00:04<00:00, 17.43it/s]


====> Test set loss: 129.8404, (BCE: 120.3844, KLD: 9.4560)
Epoch 13/50


 16%|█▌        | 75/469 [00:04<00:20, 18.87it/s]

Step 5,700, (N samples: 729,600), Loss: 129.1680, (Recon: 119.8210, KL: 9.3470), Gradient norm: 69.4948


 37%|███▋      | 175/469 [00:09<00:15, 18.73it/s]

Step 5,800, (N samples: 742,400), Loss: 130.4753, (Recon: 120.9067, KL: 9.5685), Gradient norm: 77.8949


 59%|█████▊    | 275/469 [00:14<00:10, 18.55it/s]

Step 5,900, (N samples: 755,200), Loss: 127.0583, (Recon: 117.4574, KL: 9.6009), Gradient norm: 66.7236


 80%|███████▉  | 375/469 [00:20<00:04, 18.93it/s]

Step 6,000, (N samples: 768,000), Loss: 131.4385, (Recon: 121.5385, KL: 9.9000), Gradient norm: 94.1269


100%|██████████| 469/469 [00:25<00:00, 18.52it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.89it/s]


====> Test set loss: 126.7283, (BCE: 117.0520, KLD: 9.6764)
Epoch 14/50


  1%|▏         | 7/469 [00:00<00:28, 16.28it/s]

Step 6,100, (N samples: 780,800), Loss: 136.9398, (Recon: 127.3874, KL: 9.5524), Gradient norm: 87.1316


 23%|██▎       | 107/469 [00:05<00:19, 18.14it/s]

Step 6,200, (N samples: 793,600), Loss: 126.3280, (Recon: 116.5380, KL: 9.7900), Gradient norm: 99.9894


 44%|████▍     | 206/469 [00:11<00:14, 18.17it/s]

Step 6,300, (N samples: 806,400), Loss: 126.2473, (Recon: 116.5045, KL: 9.7428), Gradient norm: 81.9610


 65%|██████▌   | 306/469 [00:16<00:08, 18.63it/s]

Step 6,400, (N samples: 819,200), Loss: 130.7634, (Recon: 121.0165, KL: 9.7469), Gradient norm: 60.4925


 87%|████████▋ | 406/469 [00:21<00:03, 18.87it/s]

Step 6,500, (N samples: 832,000), Loss: 126.6441, (Recon: 116.7580, KL: 9.8861), Gradient norm: 72.9989


100%|██████████| 469/469 [00:25<00:00, 18.55it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.04it/s]


====> Test set loss: 125.4370, (BCE: 115.6487, KLD: 9.7882)
Epoch 15/50


  8%|▊         | 38/469 [00:02<00:23, 18.68it/s]

Step 6,600, (N samples: 844,800), Loss: 129.0852, (Recon: 119.2912, KL: 9.7941), Gradient norm: 59.8507


 29%|██▉       | 137/469 [00:07<00:18, 18.15it/s]

Step 6,700, (N samples: 857,600), Loss: 131.1647, (Recon: 121.3346, KL: 9.8301), Gradient norm: 76.6783


 51%|█████     | 237/469 [00:12<00:12, 18.56it/s]

Step 6,800, (N samples: 870,400), Loss: 125.1992, (Recon: 115.3548, KL: 9.8444), Gradient norm: 60.8751


 72%|███████▏  | 337/469 [00:18<00:07, 18.30it/s]

Step 6,900, (N samples: 883,200), Loss: 128.5743, (Recon: 118.8820, KL: 9.6923), Gradient norm: 58.8912


 93%|█████████▎| 437/469 [00:23<00:01, 17.96it/s]

Step 7,000, (N samples: 896,000), Loss: 125.5028, (Recon: 115.7298, KL: 9.7730), Gradient norm: 80.1302


100%|██████████| 469/469 [00:25<00:00, 18.60it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.69it/s]


====> Test set loss: 124.6041, (BCE: 114.8593, KLD: 9.7449)
Epoch 16/50


 15%|█▍        | 69/469 [00:03<00:21, 18.38it/s]

Step 7,100, (N samples: 908,800), Loss: 130.1642, (Recon: 120.4329, KL: 9.7313), Gradient norm: 73.5545


 36%|███▌      | 168/469 [00:09<00:16, 18.58it/s]

Step 7,200, (N samples: 921,600), Loss: 122.3055, (Recon: 112.6726, KL: 9.6328), Gradient norm: 68.4101


 57%|█████▋    | 269/469 [00:14<00:10, 18.85it/s]

Step 7,300, (N samples: 934,400), Loss: 128.7299, (Recon: 118.7843, KL: 9.9456), Gradient norm: 64.8746


 79%|███████▊  | 369/469 [00:19<00:05, 18.40it/s]

Step 7,400, (N samples: 947,200), Loss: 125.5868, (Recon: 115.5219, KL: 10.0649), Gradient norm: 55.6154


100%|█████████▉| 467/469 [00:25<00:00, 18.43it/s]

Step 7,500, (N samples: 960,000), Loss: 123.5723, (Recon: 113.5246, KL: 10.0477), Gradient norm: 53.5729


100%|██████████| 469/469 [00:25<00:00, 18.51it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 21.02it/s]


====> Test set loss: 123.7486, (BCE: 113.6951, KLD: 10.0535)
Epoch 17/50


 21%|██▏       | 100/469 [00:05<00:19, 18.81it/s]

Step 7,600, (N samples: 972,800), Loss: 126.6138, (Recon: 116.5381, KL: 10.0757), Gradient norm: 71.8852


 43%|████▎     | 200/469 [00:11<00:14, 18.67it/s]

Step 7,700, (N samples: 985,600), Loss: 128.1060, (Recon: 118.1714, KL: 9.9345), Gradient norm: 56.8692


 64%|██████▍   | 299/469 [00:16<00:09, 18.74it/s]

Step 7,800, (N samples: 998,400), Loss: 129.3695, (Recon: 119.2376, KL: 10.1319), Gradient norm: 68.8726


 85%|████████▌ | 399/469 [00:22<00:03, 18.53it/s]

Step 7,900, (N samples: 1,011,200), Loss: 123.2051, (Recon: 113.0049, KL: 10.2002), Gradient norm: 77.6599


100%|██████████| 469/469 [00:25<00:00, 18.11it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.85it/s]


====> Test set loss: 125.3074, (BCE: 115.1816, KLD: 10.1259)
Epoch 18/50


  7%|▋         | 31/469 [00:01<00:24, 18.24it/s]

Step 8,000, (N samples: 1,024,000), Loss: 121.0771, (Recon: 111.0352, KL: 10.0419), Gradient norm: 57.8654


 28%|██▊       | 131/469 [00:07<00:18, 18.63it/s]

Step 8,100, (N samples: 1,036,800), Loss: 124.5499, (Recon: 114.5950, KL: 9.9549), Gradient norm: 66.0444


 49%|████▉     | 231/469 [00:12<00:13, 18.26it/s]

Step 8,200, (N samples: 1,049,600), Loss: 126.2177, (Recon: 115.9026, KL: 10.3150), Gradient norm: 84.9254


 71%|███████   | 331/469 [00:17<00:07, 18.60it/s]

Step 8,300, (N samples: 1,062,400), Loss: 119.7070, (Recon: 109.5474, KL: 10.1596), Gradient norm: 81.4057


 92%|█████████▏| 431/469 [00:23<00:02, 18.93it/s]

Step 8,400, (N samples: 1,075,200), Loss: 122.4664, (Recon: 112.2044, KL: 10.2619), Gradient norm: 85.9320


100%|██████████| 469/469 [00:25<00:00, 18.49it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.78it/s]


====> Test set loss: 125.2511, (BCE: 115.0212, KLD: 10.2298)
Epoch 19/50


 13%|█▎        | 61/469 [00:03<00:22, 18.44it/s]

Step 8,500, (N samples: 1,088,000), Loss: 126.0755, (Recon: 116.0116, KL: 10.0639), Gradient norm: 80.5535


 35%|███▍      | 162/469 [00:08<00:16, 18.71it/s]

Step 8,600, (N samples: 1,100,800), Loss: 125.4933, (Recon: 115.2408, KL: 10.2525), Gradient norm: 86.7314


 56%|█████▌    | 261/469 [00:14<00:11, 18.45it/s]

Step 8,700, (N samples: 1,113,600), Loss: 120.3265, (Recon: 110.1947, KL: 10.1319), Gradient norm: 73.2989


 77%|███████▋  | 361/469 [00:19<00:05, 18.48it/s]

Step 8,800, (N samples: 1,126,400), Loss: 123.0461, (Recon: 112.8079, KL: 10.2382), Gradient norm: 69.3306


 98%|█████████▊| 461/469 [00:24<00:00, 18.57it/s]

Step 8,900, (N samples: 1,139,200), Loss: 122.1998, (Recon: 112.1093, KL: 10.0905), Gradient norm: 65.2537


100%|██████████| 469/469 [00:25<00:00, 18.57it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.69it/s]


====> Test set loss: 122.6324, (BCE: 112.4062, KLD: 10.2262)
Epoch 20/50


 20%|█▉        | 92/469 [00:04<00:20, 18.62it/s]

Step 9,000, (N samples: 1,152,000), Loss: 121.8165, (Recon: 111.5174, KL: 10.2992), Gradient norm: 68.6820


 41%|████      | 192/469 [00:10<00:14, 18.55it/s]

Step 9,100, (N samples: 1,164,800), Loss: 118.4578, (Recon: 108.1724, KL: 10.2854), Gradient norm: 70.1241


 62%|██████▏   | 292/469 [00:15<00:09, 18.63it/s]

Step 9,200, (N samples: 1,177,600), Loss: 123.5935, (Recon: 113.6123, KL: 9.9812), Gradient norm: 60.5206


 84%|████████▎ | 392/469 [00:21<00:04, 18.52it/s]

Step 9,300, (N samples: 1,190,400), Loss: 117.6787, (Recon: 107.3853, KL: 10.2934), Gradient norm: 70.8860


100%|██████████| 469/469 [00:25<00:00, 18.44it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.41it/s]


====> Test set loss: 121.6331, (BCE: 111.4404, KLD: 10.1927)
Epoch 21/50


  5%|▌         | 24/469 [00:01<00:24, 18.32it/s]

Step 9,400, (N samples: 1,203,200), Loss: 121.9013, (Recon: 111.5439, KL: 10.3573), Gradient norm: 89.8791


 26%|██▋       | 124/469 [00:06<00:18, 18.58it/s]

Step 9,500, (N samples: 1,216,000), Loss: 124.3216, (Recon: 113.9638, KL: 10.3578), Gradient norm: 66.1295


 48%|████▊     | 224/469 [00:12<00:13, 18.23it/s]

Step 9,600, (N samples: 1,228,800), Loss: 121.4798, (Recon: 111.1213, KL: 10.3585), Gradient norm: 61.1679


 69%|██████▉   | 324/469 [00:18<00:07, 18.28it/s]

Step 9,700, (N samples: 1,241,600), Loss: 121.5266, (Recon: 111.1676, KL: 10.3590), Gradient norm: 55.3944


 90%|█████████ | 424/469 [00:23<00:02, 18.88it/s]

Step 9,800, (N samples: 1,254,400), Loss: 121.9545, (Recon: 111.7970, KL: 10.1575), Gradient norm: 100.4085


100%|██████████| 469/469 [00:25<00:00, 18.05it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.85it/s]


====> Test set loss: 121.3433, (BCE: 111.0640, KLD: 10.2793)
Epoch 22/50


 12%|█▏        | 55/469 [00:03<00:22, 18.73it/s]

Step 9,900, (N samples: 1,267,200), Loss: 121.2075, (Recon: 110.9674, KL: 10.2401), Gradient norm: 75.5428


 33%|███▎      | 155/469 [00:08<00:16, 18.69it/s]

Step 10,000, (N samples: 1,280,000), Loss: 115.3990, (Recon: 105.0431, KL: 10.3559), Gradient norm: 52.9214


 54%|█████▍    | 255/469 [00:13<00:11, 18.86it/s]

Step 10,100, (N samples: 1,292,800), Loss: 122.9647, (Recon: 112.5281, KL: 10.4366), Gradient norm: 56.5087


 76%|███████▌  | 355/469 [00:19<00:06, 18.92it/s]

Step 10,200, (N samples: 1,305,600), Loss: 127.4226, (Recon: 117.0966, KL: 10.3260), Gradient norm: 58.1806


 97%|█████████▋| 455/469 [00:24<00:00, 18.90it/s]

Step 10,300, (N samples: 1,318,400), Loss: 126.5833, (Recon: 116.2820, KL: 10.3014), Gradient norm: 64.7398


100%|██████████| 469/469 [00:25<00:00, 18.54it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.66it/s]


====> Test set loss: 120.9731, (BCE: 110.7221, KLD: 10.2510)
Epoch 23/50


 18%|█▊        | 85/469 [00:04<00:20, 18.38it/s]

Step 10,400, (N samples: 1,331,200), Loss: 114.9554, (Recon: 104.7241, KL: 10.2313), Gradient norm: 57.6817


 40%|███▉      | 186/469 [00:10<00:15, 18.79it/s]

Step 10,500, (N samples: 1,344,000), Loss: 121.2972, (Recon: 111.0193, KL: 10.2779), Gradient norm: 60.3189


 61%|██████    | 285/469 [00:15<00:09, 18.62it/s]

Step 10,600, (N samples: 1,356,800), Loss: 121.7844, (Recon: 111.4744, KL: 10.3100), Gradient norm: 126.4763


 82%|████████▏ | 385/469 [00:20<00:04, 18.55it/s]

Step 10,700, (N samples: 1,369,600), Loss: 123.3926, (Recon: 112.9236, KL: 10.4690), Gradient norm: 69.7733


100%|██████████| 469/469 [00:25<00:00, 18.54it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.79it/s]


====> Test set loss: 121.3315, (BCE: 110.9320, KLD: 10.3995)
Epoch 24/50


  3%|▎         | 16/469 [00:00<00:24, 18.70it/s]

Step 10,800, (N samples: 1,382,400), Loss: 122.4192, (Recon: 112.0516, KL: 10.3676), Gradient norm: 60.1174


 25%|██▍       | 116/469 [00:06<00:19, 18.30it/s]

Step 10,900, (N samples: 1,395,200), Loss: 118.0294, (Recon: 107.7521, KL: 10.2773), Gradient norm: 51.9901


 46%|████▌     | 216/469 [00:11<00:14, 17.80it/s]

Step 11,000, (N samples: 1,408,000), Loss: 127.2554, (Recon: 116.7129, KL: 10.5426), Gradient norm: 41.6116


 67%|██████▋   | 316/469 [00:17<00:08, 18.39it/s]

Step 11,100, (N samples: 1,420,800), Loss: 120.6199, (Recon: 110.2395, KL: 10.3803), Gradient norm: 49.6575


 89%|████████▊ | 416/469 [00:22<00:02, 18.56it/s]

Step 11,200, (N samples: 1,433,600), Loss: 123.7622, (Recon: 113.4040, KL: 10.3582), Gradient norm: 67.7306


100%|██████████| 469/469 [00:25<00:00, 18.45it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.80it/s]


====> Test set loss: 121.0542, (BCE: 110.7425, KLD: 10.3116)
Epoch 25/50


 10%|█         | 47/469 [00:02<00:22, 18.70it/s]

Step 11,300, (N samples: 1,446,400), Loss: 129.1661, (Recon: 118.6423, KL: 10.5238), Gradient norm: 88.8968


 31%|███▏      | 147/469 [00:08<00:17, 18.38it/s]

Step 11,400, (N samples: 1,459,200), Loss: 117.7818, (Recon: 107.3049, KL: 10.4770), Gradient norm: 67.3085


 53%|█████▎    | 247/469 [00:13<00:15, 14.02it/s]

Step 11,500, (N samples: 1,472,000), Loss: 119.4610, (Recon: 109.0298, KL: 10.4312), Gradient norm: 59.6321


 74%|███████▍  | 347/469 [00:19<00:06, 18.58it/s]

Step 11,600, (N samples: 1,484,800), Loss: 122.2526, (Recon: 111.8682, KL: 10.3844), Gradient norm: 78.3518


 95%|█████████▌| 447/469 [00:24<00:01, 18.60it/s]

Step 11,700, (N samples: 1,497,600), Loss: 120.4107, (Recon: 109.9941, KL: 10.4166), Gradient norm: 65.2258


100%|██████████| 469/469 [00:25<00:00, 18.12it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.81it/s]


====> Test set loss: 120.5139, (BCE: 110.1038, KLD: 10.4101)
Epoch 26/50


 17%|█▋        | 78/469 [00:04<00:21, 18.28it/s]

Step 11,800, (N samples: 1,510,400), Loss: 117.7021, (Recon: 107.1527, KL: 10.5494), Gradient norm: 71.4790


 38%|███▊      | 178/469 [00:09<00:15, 18.80it/s]

Step 11,900, (N samples: 1,523,200), Loss: 121.2382, (Recon: 110.7694, KL: 10.4687), Gradient norm: 57.7010


 59%|█████▉    | 278/469 [00:15<00:10, 18.57it/s]

Step 12,000, (N samples: 1,536,000), Loss: 120.2754, (Recon: 109.6898, KL: 10.5856), Gradient norm: 73.2609


 81%|████████  | 378/469 [00:20<00:04, 18.92it/s]

Step 12,100, (N samples: 1,548,800), Loss: 109.4697, (Recon: 99.0454, KL: 10.4243), Gradient norm: 73.2838


100%|██████████| 469/469 [00:25<00:00, 18.55it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.93it/s]


====> Test set loss: 120.1356, (BCE: 109.6529, KLD: 10.4827)
Epoch 27/50


  2%|▏         | 10/469 [00:00<00:24, 18.61it/s]

Step 12,200, (N samples: 1,561,600), Loss: 113.6280, (Recon: 103.1473, KL: 10.4807), Gradient norm: 76.4615


 23%|██▎       | 110/469 [00:05<00:19, 18.81it/s]

Step 12,300, (N samples: 1,574,400), Loss: 119.0422, (Recon: 108.6687, KL: 10.3735), Gradient norm: 61.0222


 45%|████▍     | 210/469 [00:11<00:15, 17.23it/s]

Step 12,400, (N samples: 1,587,200), Loss: 118.9003, (Recon: 108.5704, KL: 10.3298), Gradient norm: 79.1349


 66%|██████▌   | 310/469 [00:16<00:08, 18.85it/s]

Step 12,500, (N samples: 1,600,000), Loss: 123.4029, (Recon: 112.7097, KL: 10.6932), Gradient norm: 58.3548


 87%|████████▋ | 410/469 [00:22<00:03, 18.63it/s]

Step 12,600, (N samples: 1,612,800), Loss: 110.7554, (Recon: 100.2638, KL: 10.4916), Gradient norm: 70.8493


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.51it/s]


====> Test set loss: 120.2704, (BCE: 109.9061, KLD: 10.3643)
Epoch 28/50


  9%|▊         | 41/469 [00:02<00:22, 18.69it/s]

Step 12,700, (N samples: 1,625,600), Loss: 122.3208, (Recon: 111.6880, KL: 10.6328), Gradient norm: 62.3420


 30%|███       | 141/469 [00:07<00:18, 18.12it/s]

Step 12,800, (N samples: 1,638,400), Loss: 120.8341, (Recon: 110.3087, KL: 10.5253), Gradient norm: 145.0960


 51%|█████▏    | 241/469 [00:13<00:12, 18.31it/s]

Step 12,900, (N samples: 1,651,200), Loss: 117.3922, (Recon: 106.9349, KL: 10.4573), Gradient norm: 48.7109


 73%|███████▎  | 341/469 [00:18<00:06, 18.37it/s]

Step 13,000, (N samples: 1,664,000), Loss: 114.3493, (Recon: 103.8540, KL: 10.4953), Gradient norm: 59.6873


 94%|█████████▍| 441/469 [00:23<00:01, 18.67it/s]

Step 13,100, (N samples: 1,676,800), Loss: 113.5110, (Recon: 103.3569, KL: 10.1541), Gradient norm: 77.2211


100%|██████████| 469/469 [00:25<00:00, 18.48it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.51it/s]


====> Test set loss: 120.2162, (BCE: 109.7084, KLD: 10.5078)
Epoch 29/50


 15%|█▌        | 72/469 [00:03<00:21, 18.30it/s]

Step 13,200, (N samples: 1,689,600), Loss: 117.5517, (Recon: 107.1317, KL: 10.4200), Gradient norm: 54.0837


 37%|███▋      | 172/469 [00:09<00:16, 18.51it/s]

Step 13,300, (N samples: 1,702,400), Loss: 114.7987, (Recon: 104.4275, KL: 10.3712), Gradient norm: 60.8779


 58%|█████▊    | 272/469 [00:14<00:10, 18.23it/s]

Step 13,400, (N samples: 1,715,200), Loss: 115.7780, (Recon: 105.2551, KL: 10.5229), Gradient norm: 77.0381


 79%|███████▉  | 372/469 [00:20<00:05, 18.19it/s]

Step 13,500, (N samples: 1,728,000), Loss: 120.3070, (Recon: 109.6115, KL: 10.6955), Gradient norm: 60.4004


100%|██████████| 469/469 [00:25<00:00, 18.10it/s]


Step 13,600, (N samples: 1,740,800), Loss: 116.1465, (Recon: 105.4872, KL: 10.6594), Gradient norm: 47.3963


Testing: 100%|██████████| 79/79 [00:03<00:00, 20.64it/s]


====> Test set loss: 119.3386, (BCE: 108.7397, KLD: 10.5989)
Epoch 30/50


 22%|██▏       | 102/469 [00:05<00:19, 18.81it/s]

Step 13,700, (N samples: 1,753,600), Loss: 119.1810, (Recon: 108.7384, KL: 10.4425), Gradient norm: 79.6592


 43%|████▎     | 203/469 [00:10<00:14, 17.97it/s]

Step 13,800, (N samples: 1,766,400), Loss: 121.0420, (Recon: 110.5007, KL: 10.5413), Gradient norm: 75.1251


 65%|██████▍   | 303/469 [00:16<00:09, 17.59it/s]

Step 13,900, (N samples: 1,779,200), Loss: 113.6840, (Recon: 103.1019, KL: 10.5821), Gradient norm: 62.2351


 86%|████████▌ | 403/469 [00:21<00:03, 18.87it/s]

Step 14,000, (N samples: 1,792,000), Loss: 120.8910, (Recon: 110.4294, KL: 10.4616), Gradient norm: 65.7894


100%|██████████| 469/469 [00:25<00:00, 18.60it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.59it/s]


====> Test set loss: 119.3735, (BCE: 108.8038, KLD: 10.5697)
Epoch 31/50


  7%|▋         | 34/469 [00:01<00:23, 18.49it/s]

Step 14,100, (N samples: 1,804,800), Loss: 119.1351, (Recon: 108.5698, KL: 10.5653), Gradient norm: 82.6357


 28%|██▊       | 133/469 [00:07<00:18, 18.11it/s]

Step 14,200, (N samples: 1,817,600), Loss: 119.5730, (Recon: 108.9581, KL: 10.6148), Gradient norm: 53.0229


 50%|████▉     | 234/469 [00:12<00:12, 18.29it/s]

Step 14,300, (N samples: 1,830,400), Loss: 114.4596, (Recon: 104.0255, KL: 10.4340), Gradient norm: 87.9053


 71%|███████   | 334/469 [00:17<00:07, 18.58it/s]

Step 14,400, (N samples: 1,843,200), Loss: 118.4728, (Recon: 107.7949, KL: 10.6779), Gradient norm: 74.1944


 93%|█████████▎| 434/469 [00:23<00:01, 18.29it/s]

Step 14,500, (N samples: 1,856,000), Loss: 118.4055, (Recon: 107.8277, KL: 10.5778), Gradient norm: 71.9060


100%|██████████| 469/469 [00:25<00:00, 18.61it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.65it/s]


====> Test set loss: 119.1640, (BCE: 108.6725, KLD: 10.4915)
Epoch 32/50


 14%|█▍        | 65/469 [00:03<00:21, 18.37it/s]

Step 14,600, (N samples: 1,868,800), Loss: 119.8043, (Recon: 109.4238, KL: 10.3805), Gradient norm: 71.6582


 35%|███▍      | 164/469 [00:08<00:16, 18.27it/s]

Step 14,700, (N samples: 1,881,600), Loss: 115.5873, (Recon: 105.0450, KL: 10.5423), Gradient norm: 50.7552


 56%|█████▋    | 264/469 [00:14<00:11, 18.61it/s]

Step 14,800, (N samples: 1,894,400), Loss: 118.1134, (Recon: 107.5644, KL: 10.5490), Gradient norm: 92.2501


 78%|███████▊  | 364/469 [00:19<00:05, 18.72it/s]

Step 14,900, (N samples: 1,907,200), Loss: 117.0917, (Recon: 106.5056, KL: 10.5861), Gradient norm: 69.8348


 99%|█████████▉| 464/469 [00:24<00:00, 18.54it/s]

Step 15,000, (N samples: 1,920,000), Loss: 123.3478, (Recon: 112.5696, KL: 10.7783), Gradient norm: 73.3557


100%|██████████| 469/469 [00:25<00:00, 18.52it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.25it/s]


====> Test set loss: 119.3198, (BCE: 108.7400, KLD: 10.5798)
Epoch 33/50


 20%|██        | 96/469 [00:05<00:19, 18.71it/s]

Step 15,100, (N samples: 1,932,800), Loss: 114.8525, (Recon: 104.3364, KL: 10.5161), Gradient norm: 62.4322


 42%|████▏     | 196/469 [00:10<00:15, 17.50it/s]

Step 15,200, (N samples: 1,945,600), Loss: 120.2056, (Recon: 109.5021, KL: 10.7035), Gradient norm: 91.2970


 63%|██████▎   | 296/469 [00:15<00:09, 18.33it/s]

Step 15,300, (N samples: 1,958,400), Loss: 115.2722, (Recon: 104.7103, KL: 10.5619), Gradient norm: 59.8076


 84%|████████▍ | 396/469 [00:21<00:03, 18.71it/s]

Step 15,400, (N samples: 1,971,200), Loss: 116.3224, (Recon: 105.7965, KL: 10.5259), Gradient norm: 77.2084


100%|██████████| 469/469 [00:26<00:00, 18.04it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.61it/s]


====> Test set loss: 119.5233, (BCE: 108.8978, KLD: 10.6254)
Epoch 34/50


  6%|▌         | 27/469 [00:01<00:23, 18.68it/s]

Step 15,500, (N samples: 1,984,000), Loss: 115.1501, (Recon: 104.6917, KL: 10.4584), Gradient norm: 78.5202


 27%|██▋       | 127/469 [00:06<00:18, 18.42it/s]

Step 15,600, (N samples: 1,996,800), Loss: 114.1654, (Recon: 103.4282, KL: 10.7372), Gradient norm: 142.9081


 48%|████▊     | 226/469 [00:12<00:13, 18.41it/s]

Step 15,700, (N samples: 2,009,600), Loss: 115.1306, (Recon: 104.3750, KL: 10.7555), Gradient norm: 53.5657


 70%|██████▉   | 326/469 [00:17<00:07, 18.23it/s]

Step 15,800, (N samples: 2,022,400), Loss: 118.6584, (Recon: 107.9977, KL: 10.6606), Gradient norm: 60.8180


 91%|█████████ | 426/469 [00:22<00:02, 18.74it/s]

Step 15,900, (N samples: 2,035,200), Loss: 122.3901, (Recon: 111.8294, KL: 10.5608), Gradient norm: 72.8801


100%|██████████| 469/469 [00:25<00:00, 18.62it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.82it/s]


====> Test set loss: 119.1096, (BCE: 108.5191, KLD: 10.5904)
Epoch 35/50


 12%|█▏        | 58/469 [00:03<00:22, 18.36it/s]

Step 16,000, (N samples: 2,048,000), Loss: 118.1202, (Recon: 107.5563, KL: 10.5639), Gradient norm: 64.2868


 34%|███▎      | 158/469 [00:08<00:16, 18.74it/s]

Step 16,100, (N samples: 2,060,800), Loss: 113.7944, (Recon: 103.1500, KL: 10.6444), Gradient norm: 42.5839


 55%|█████▌    | 258/469 [00:14<00:11, 18.60it/s]

Step 16,200, (N samples: 2,073,600), Loss: 117.0081, (Recon: 106.2163, KL: 10.7918), Gradient norm: 84.9564


 76%|███████▋  | 358/469 [00:19<00:06, 18.22it/s]

Step 16,300, (N samples: 2,086,400), Loss: 121.2751, (Recon: 110.8139, KL: 10.4612), Gradient norm: 133.8084


 98%|█████████▊| 458/469 [00:24<00:00, 18.20it/s]

Step 16,400, (N samples: 2,099,200), Loss: 113.9092, (Recon: 103.3012, KL: 10.6080), Gradient norm: 71.7239


100%|██████████| 469/469 [00:25<00:00, 18.43it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.81it/s]


====> Test set loss: 119.1313, (BCE: 108.5528, KLD: 10.5784)
Epoch 36/50


 19%|█▉        | 89/469 [00:04<00:20, 18.21it/s]

Step 16,500, (N samples: 2,112,000), Loss: 122.3602, (Recon: 111.6155, KL: 10.7447), Gradient norm: 77.0583


 40%|████      | 189/469 [00:10<00:16, 17.38it/s]

Step 16,600, (N samples: 2,124,800), Loss: 114.9562, (Recon: 104.0877, KL: 10.8685), Gradient norm: 54.6793


 62%|██████▏   | 289/469 [00:15<00:09, 18.64it/s]

Step 16,700, (N samples: 2,137,600), Loss: 123.6981, (Recon: 113.0561, KL: 10.6420), Gradient norm: 52.9742


 83%|████████▎ | 389/469 [00:21<00:04, 17.43it/s]

Step 16,800, (N samples: 2,150,400), Loss: 116.2165, (Recon: 105.4024, KL: 10.8141), Gradient norm: 93.6746


100%|██████████| 469/469 [00:25<00:00, 18.47it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.73it/s]


====> Test set loss: 119.5331, (BCE: 108.8101, KLD: 10.7230)
Epoch 37/50


  4%|▍         | 20/469 [00:01<00:24, 18.30it/s]

Step 16,900, (N samples: 2,163,200), Loss: 120.3813, (Recon: 109.6145, KL: 10.7668), Gradient norm: 67.5944


 26%|██▌       | 120/469 [00:06<00:19, 18.36it/s]

Step 17,000, (N samples: 2,176,000), Loss: 116.0252, (Recon: 105.4418, KL: 10.5835), Gradient norm: 65.8422


 47%|████▋     | 219/469 [00:11<00:13, 18.25it/s]

Step 17,100, (N samples: 2,188,800), Loss: 117.8851, (Recon: 107.1984, KL: 10.6868), Gradient norm: 81.1112


 68%|██████▊   | 319/469 [00:17<00:08, 18.56it/s]

Step 17,200, (N samples: 2,201,600), Loss: 112.7781, (Recon: 101.9835, KL: 10.7946), Gradient norm: 94.3253


 89%|████████▉ | 419/469 [00:22<00:02, 18.56it/s]

Step 17,300, (N samples: 2,214,400), Loss: 121.4201, (Recon: 110.5888, KL: 10.8312), Gradient norm: 97.3406


100%|██████████| 469/469 [00:25<00:00, 18.47it/s]
Testing: 100%|██████████| 79/79 [00:04<00:00, 16.99it/s]


====> Test set loss: 118.5931, (BCE: 107.8368, KLD: 10.7563)
Epoch 38/50


 11%|█         | 50/469 [00:02<00:22, 18.69it/s]

Step 17,400, (N samples: 2,227,200), Loss: 120.1093, (Recon: 109.4063, KL: 10.7030), Gradient norm: 99.7848


 32%|███▏      | 150/469 [00:08<00:16, 18.80it/s]

Step 17,500, (N samples: 2,240,000), Loss: 122.8275, (Recon: 111.9683, KL: 10.8591), Gradient norm: 69.5722


 53%|█████▎    | 250/469 [00:13<00:11, 18.87it/s]

Step 17,600, (N samples: 2,252,800), Loss: 124.0838, (Recon: 113.2524, KL: 10.8314), Gradient norm: 95.7328


 75%|███████▍  | 350/469 [00:18<00:06, 18.45it/s]

Step 17,700, (N samples: 2,265,600), Loss: 119.2949, (Recon: 108.7261, KL: 10.5688), Gradient norm: 52.3131


 96%|█████████▌| 450/469 [00:24<00:01, 17.92it/s]

Step 17,800, (N samples: 2,278,400), Loss: 118.4624, (Recon: 107.8093, KL: 10.6531), Gradient norm: 63.8702


100%|██████████| 469/469 [00:25<00:00, 18.62it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.83it/s]


====> Test set loss: 119.1274, (BCE: 108.4770, KLD: 10.6503)
Epoch 39/50


 17%|█▋        | 82/469 [00:04<00:20, 18.43it/s]

Step 17,900, (N samples: 2,291,200), Loss: 110.3338, (Recon: 99.7032, KL: 10.6306), Gradient norm: 60.5451


 39%|███▊      | 181/469 [00:09<00:15, 18.45it/s]

Step 18,000, (N samples: 2,304,000), Loss: 123.4752, (Recon: 112.6440, KL: 10.8312), Gradient norm: 90.8761


 60%|█████▉    | 281/469 [00:15<00:09, 18.83it/s]

Step 18,100, (N samples: 2,316,800), Loss: 119.4286, (Recon: 108.6477, KL: 10.7810), Gradient norm: 66.4783


 81%|████████  | 381/469 [00:20<00:04, 18.86it/s]

Step 18,200, (N samples: 2,329,600), Loss: 121.6729, (Recon: 110.9062, KL: 10.7666), Gradient norm: 71.3830


100%|██████████| 469/469 [00:25<00:00, 18.69it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.36it/s]


====> Test set loss: 118.2741, (BCE: 107.5165, KLD: 10.7576)
Epoch 40/50


  3%|▎         | 13/469 [00:00<00:24, 18.41it/s]

Step 18,300, (N samples: 2,342,400), Loss: 116.6820, (Recon: 105.8177, KL: 10.8643), Gradient norm: 62.4606


 24%|██▍       | 113/469 [00:06<00:19, 18.58it/s]

Step 18,400, (N samples: 2,355,200), Loss: 113.4355, (Recon: 102.6230, KL: 10.8124), Gradient norm: 66.7515


 45%|████▌     | 212/469 [00:11<00:13, 18.45it/s]

Step 18,500, (N samples: 2,368,000), Loss: 109.9484, (Recon: 99.2791, KL: 10.6693), Gradient norm: 77.3991


 67%|██████▋   | 312/469 [00:16<00:08, 17.85it/s]

Step 18,600, (N samples: 2,380,800), Loss: 122.5031, (Recon: 111.6152, KL: 10.8879), Gradient norm: 74.6464


 88%|████████▊ | 412/469 [00:22<00:03, 18.69it/s]

Step 18,700, (N samples: 2,393,600), Loss: 119.8237, (Recon: 109.0402, KL: 10.7835), Gradient norm: 80.1895


100%|██████████| 469/469 [00:25<00:00, 18.64it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.49it/s]


====> Test set loss: 118.6954, (BCE: 108.0774, KLD: 10.6181)
Epoch 41/50


  9%|▉         | 43/469 [00:02<00:22, 18.55it/s]

Step 18,800, (N samples: 2,406,400), Loss: 114.2838, (Recon: 103.6148, KL: 10.6690), Gradient norm: 59.9096


 31%|███       | 144/469 [00:07<00:17, 18.64it/s]

Step 18,900, (N samples: 2,419,200), Loss: 110.1975, (Recon: 99.4298, KL: 10.7676), Gradient norm: 76.2022


 52%|█████▏    | 244/469 [00:13<00:11, 18.87it/s]

Step 19,000, (N samples: 2,432,000), Loss: 111.3897, (Recon: 100.6646, KL: 10.7251), Gradient norm: 54.4236


 73%|███████▎  | 344/469 [00:18<00:06, 18.57it/s]

Step 19,100, (N samples: 2,444,800), Loss: 122.8949, (Recon: 112.0566, KL: 10.8383), Gradient norm: 59.0398


 95%|█████████▍| 444/469 [00:23<00:01, 18.34it/s]

Step 19,200, (N samples: 2,457,600), Loss: 124.4734, (Recon: 113.5911, KL: 10.8822), Gradient norm: 90.7440


100%|██████████| 469/469 [00:25<00:00, 18.56it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.48it/s]


====> Test set loss: 119.0186, (BCE: 108.3143, KLD: 10.7043)
Epoch 42/50


 15%|█▌        | 72/469 [00:04<00:29, 13.38it/s]

Step 19,300, (N samples: 2,470,400), Loss: 120.5121, (Recon: 109.8685, KL: 10.6436), Gradient norm: 95.6192


 37%|███▋      | 174/469 [00:10<00:17, 16.79it/s]

Step 19,400, (N samples: 2,483,200), Loss: 111.8191, (Recon: 101.2361, KL: 10.5830), Gradient norm: 93.4460


 58%|█████▊    | 274/469 [00:15<00:10, 18.64it/s]

Step 19,500, (N samples: 2,496,000), Loss: 117.7828, (Recon: 106.8721, KL: 10.9107), Gradient norm: 87.3310


 80%|███████▉  | 374/469 [00:20<00:05, 18.05it/s]

Step 19,600, (N samples: 2,508,800), Loss: 113.8719, (Recon: 103.1547, KL: 10.7171), Gradient norm: 110.4531


100%|██████████| 469/469 [00:25<00:00, 18.12it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.38it/s]


====> Test set loss: 118.5048, (BCE: 107.8416, KLD: 10.6632)
Epoch 43/50


  1%|▏         | 6/469 [00:00<00:27, 16.99it/s]

Step 19,700, (N samples: 2,521,600), Loss: 114.1607, (Recon: 103.5169, KL: 10.6438), Gradient norm: 98.9134


 23%|██▎       | 106/469 [00:05<00:20, 17.67it/s]

Step 19,800, (N samples: 2,534,400), Loss: 121.4758, (Recon: 110.6280, KL: 10.8479), Gradient norm: 82.7039


 44%|████▍     | 206/469 [00:11<00:14, 18.60it/s]

Step 19,900, (N samples: 2,547,200), Loss: 119.6083, (Recon: 108.9582, KL: 10.6502), Gradient norm: 103.9103


 65%|██████▌   | 306/469 [00:16<00:08, 18.29it/s]

Step 20,000, (N samples: 2,560,000), Loss: 121.7134, (Recon: 110.9029, KL: 10.8105), Gradient norm: 118.3732


 87%|████████▋ | 406/469 [00:21<00:03, 18.42it/s]

Step 20,100, (N samples: 2,572,800), Loss: 119.3958, (Recon: 108.6382, KL: 10.7576), Gradient norm: 71.2685


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.52it/s]


====> Test set loss: 118.8992, (BCE: 108.2448, KLD: 10.6544)
Epoch 44/50


  8%|▊         | 36/469 [00:01<00:23, 18.67it/s]

Step 20,200, (N samples: 2,585,600), Loss: 122.3020, (Recon: 111.2612, KL: 11.0408), Gradient norm: 91.9705


 29%|██▉       | 136/469 [00:07<00:17, 18.58it/s]

Step 20,300, (N samples: 2,598,400), Loss: 121.4917, (Recon: 110.7860, KL: 10.7057), Gradient norm: 178.2274


 51%|█████     | 237/469 [00:12<00:12, 18.66it/s]

Step 20,400, (N samples: 2,611,200), Loss: 119.4094, (Recon: 108.8323, KL: 10.5771), Gradient norm: 87.1759


 72%|███████▏  | 337/469 [00:18<00:07, 18.47it/s]

Step 20,500, (N samples: 2,624,000), Loss: 115.1569, (Recon: 104.4784, KL: 10.6785), Gradient norm: 62.3995


 93%|█████████▎| 437/469 [00:23<00:01, 18.61it/s]

Step 20,600, (N samples: 2,636,800), Loss: 121.6628, (Recon: 110.6609, KL: 11.0019), Gradient norm: 93.1731


100%|██████████| 469/469 [00:25<00:00, 18.49it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.10it/s]


====> Test set loss: 117.6888, (BCE: 106.9684, KLD: 10.7204)
Epoch 45/50


 14%|█▍        | 68/469 [00:03<00:21, 18.24it/s]

Step 20,700, (N samples: 2,649,600), Loss: 115.8153, (Recon: 105.0652, KL: 10.7502), Gradient norm: 78.6100


 36%|███▌      | 168/469 [00:09<00:17, 17.00it/s]

Step 20,800, (N samples: 2,662,400), Loss: 118.5722, (Recon: 107.7706, KL: 10.8017), Gradient norm: 67.7357


 57%|█████▋    | 268/469 [00:14<00:10, 18.57it/s]

Step 20,900, (N samples: 2,675,200), Loss: 117.8493, (Recon: 106.9317, KL: 10.9176), Gradient norm: 87.3026


 78%|███████▊  | 368/469 [00:20<00:05, 17.48it/s]

Step 21,000, (N samples: 2,688,000), Loss: 127.1067, (Recon: 116.0809, KL: 11.0257), Gradient norm: 86.4846


100%|█████████▉| 468/469 [00:25<00:00, 18.78it/s]

Step 21,100, (N samples: 2,700,800), Loss: 117.9368, (Recon: 107.0423, KL: 10.8944), Gradient norm: 70.7978


100%|██████████| 469/469 [00:25<00:00, 18.45it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.71it/s]


====> Test set loss: 118.0201, (BCE: 107.0960, KLD: 10.9240)
Epoch 46/50


 21%|██        | 99/469 [00:05<00:20, 18.17it/s]

Step 21,200, (N samples: 2,713,600), Loss: 117.2220, (Recon: 106.3571, KL: 10.8650), Gradient norm: 82.6102


 42%|████▏     | 199/469 [00:11<00:14, 18.58it/s]

Step 21,300, (N samples: 2,726,400), Loss: 110.1848, (Recon: 99.4664, KL: 10.7184), Gradient norm: 66.7620


 64%|██████▍   | 299/469 [00:16<00:09, 18.31it/s]

Step 21,400, (N samples: 2,739,200), Loss: 115.1122, (Recon: 104.1334, KL: 10.9788), Gradient norm: 55.7890


 85%|████████▌ | 399/469 [00:21<00:03, 18.34it/s]

Step 21,500, (N samples: 2,752,000), Loss: 116.5954, (Recon: 105.7488, KL: 10.8466), Gradient norm: 82.7490


100%|██████████| 469/469 [00:25<00:00, 18.23it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.67it/s]


====> Test set loss: 118.6886, (BCE: 107.8265, KLD: 10.8621)
Epoch 47/50


  6%|▌         | 29/469 [00:01<00:23, 18.37it/s]

Step 21,600, (N samples: 2,764,800), Loss: 114.4945, (Recon: 103.6760, KL: 10.8184), Gradient norm: 54.4292


 28%|██▊       | 129/469 [00:07<00:18, 18.77it/s]

Step 21,700, (N samples: 2,777,600), Loss: 112.5558, (Recon: 101.7664, KL: 10.7895), Gradient norm: 62.5057


 49%|████▉     | 229/469 [00:12<00:12, 18.61it/s]

Step 21,800, (N samples: 2,790,400), Loss: 116.8086, (Recon: 105.9525, KL: 10.8561), Gradient norm: 62.6472


 70%|███████   | 329/469 [00:17<00:07, 18.57it/s]

Step 21,900, (N samples: 2,803,200), Loss: 114.9549, (Recon: 104.1610, KL: 10.7939), Gradient norm: 81.3164


 91%|█████████▏| 429/469 [00:23<00:02, 18.63it/s]

Step 22,000, (N samples: 2,816,000), Loss: 111.1592, (Recon: 100.4283, KL: 10.7309), Gradient norm: 70.7319


100%|██████████| 469/469 [00:25<00:00, 18.63it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.53it/s]


====> Test set loss: 118.6379, (BCE: 107.8487, KLD: 10.7892)
Epoch 48/50


 13%|█▎        | 60/469 [00:03<00:22, 17.87it/s]

Step 22,100, (N samples: 2,828,800), Loss: 120.3823, (Recon: 109.4172, KL: 10.9650), Gradient norm: 95.5425


 34%|███▍      | 161/469 [00:08<00:16, 18.75it/s]

Step 22,200, (N samples: 2,841,600), Loss: 115.9500, (Recon: 105.2084, KL: 10.7416), Gradient norm: 52.7024


 56%|█████▌    | 261/469 [00:14<00:11, 18.71it/s]

Step 22,300, (N samples: 2,854,400), Loss: 116.2230, (Recon: 105.4165, KL: 10.8064), Gradient norm: 58.7445


 77%|███████▋  | 361/469 [00:19<00:05, 18.58it/s]

Step 22,400, (N samples: 2,867,200), Loss: 116.1941, (Recon: 105.1721, KL: 11.0221), Gradient norm: 83.6103


 98%|█████████▊| 461/469 [00:24<00:00, 18.61it/s]

Step 22,500, (N samples: 2,880,000), Loss: 110.6719, (Recon: 100.0827, KL: 10.5892), Gradient norm: 62.3332


100%|██████████| 469/469 [00:25<00:00, 18.59it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.60it/s]


====> Test set loss: 117.8887, (BCE: 107.0902, KLD: 10.7985)
Epoch 49/50


 19%|█▉        | 91/469 [00:04<00:20, 18.31it/s]

Step 22,600, (N samples: 2,892,800), Loss: 109.5051, (Recon: 98.8186, KL: 10.6865), Gradient norm: 126.6241


 41%|████      | 191/469 [00:10<00:14, 18.58it/s]

Step 22,700, (N samples: 2,905,600), Loss: 121.2559, (Recon: 110.1513, KL: 11.1046), Gradient norm: 79.5257


 62%|██████▏   | 291/469 [00:15<00:09, 18.84it/s]

Step 22,800, (N samples: 2,918,400), Loss: 118.1618, (Recon: 107.3347, KL: 10.8271), Gradient norm: 155.8291


 83%|████████▎ | 391/469 [00:21<00:04, 18.29it/s]

Step 22,900, (N samples: 2,931,200), Loss: 115.2543, (Recon: 104.1857, KL: 11.0686), Gradient norm: 90.8074


100%|██████████| 469/469 [00:25<00:00, 18.58it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.50it/s]


====> Test set loss: 117.6334, (BCE: 106.8059, KLD: 10.8274)
Epoch 50/50


  5%|▍         | 22/469 [00:01<00:24, 18.54it/s]

Step 23,000, (N samples: 2,944,000), Loss: 119.2684, (Recon: 108.6594, KL: 10.6090), Gradient norm: 77.6540


 26%|██▌       | 123/469 [00:06<00:18, 18.75it/s]

Step 23,100, (N samples: 2,956,800), Loss: 117.1672, (Recon: 106.2419, KL: 10.9253), Gradient norm: 68.1578


 48%|████▊     | 223/469 [00:11<00:13, 18.64it/s]

Step 23,200, (N samples: 2,969,600), Loss: 114.8435, (Recon: 104.0941, KL: 10.7494), Gradient norm: 80.2924


 69%|██████▉   | 323/469 [00:17<00:07, 18.33it/s]

Step 23,300, (N samples: 2,982,400), Loss: 112.2476, (Recon: 101.4993, KL: 10.7483), Gradient norm: 96.6191


 90%|█████████ | 423/469 [00:23<00:02, 18.39it/s]

Step 23,400, (N samples: 2,995,200), Loss: 112.4858, (Recon: 101.6245, KL: 10.8613), Gradient norm: 89.8515


100%|██████████| 469/469 [00:25<00:00, 18.23it/s]
Testing: 100%|██████████| 79/79 [00:03<00:00, 20.43it/s]

====> Test set loss: 118.0897, (BCE: 107.2345, KLD: 10.8552)





In [15]:
# Flush both TensorBoard writers so that any buffered train/test events are
# written to disk before TensorBoard is opened below.
# The single `writer` is commented out in the setup cell; only `writer_train`
# and `writer_test` exist on a fresh kernel, so flushing `writer` would raise
# NameError under Restart & Run All.
for summary_writer in (writer_train, writer_test):
    summary_writer.flush()

In [16]:
# Load the TensorBoard notebook extension; it registers the `%tensorboard`
# line magic used by the following cell.
%load_ext tensorboard

In [23]:
# Open TensorBoard on the experiment directory this notebook actually logs to.
# The setup cell creates the writers under ../experiments/{name}/train/<timestamp>
# and ../experiments/{name}/test/<timestamp>, so pointing at the parent folder
# shows the train and test runs side by side. A hard-coded, timestamped path from
# an older `VAE_MNIST` run would not display the run trained above.
# IPython expands {name} from the notebook namespace before invoking the magic.
%tensorboard --logdir ../experiments/{name}/

Reusing TensorBoard on port 6011 (pid 1110344), started 0:00:12 ago. (Use '!kill 1110344' to kill it.)