In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from tqdm import tqdm
from cw import L2Adversary
from torchvision.utils import save_image
import tensorflow as tf
from numpy import moveaxis

In [28]:
# Test Dataset
transformation = transforms.Compose([transforms.ToTensor()])
test_directory = "test_fake"
test_imgfolder = datasets.ImageFolder(test_directory, transform=transformation)
test_dataloader = torch.utils.data.DataLoader(test_imgfolder, batch_size=1, shuffle=True)

In [29]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [30]:
model = torch.load("Models/resnet_base.pt")
model.cuda()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [31]:
adversary = L2Adversary(targeted=False,
                           confidence=1.0,
                           c_range=(1e-7, 1e-4),
                           search_steps=1,
                           max_steps=1000,
                           optimizer_lr=1e-2,
                           init_rand=True)

In [32]:
def test(examples, model, device):
    num_correct = 0
    for img in examples:
        img = img.to(device)
        if(model(img).squeeze() < 0):
            num_correct += 1
    return num_correct/len(examples)

In [33]:
examples = []
count = 0
for img,label in tqdm(test_dataloader):
    img = img.to(device)
    label = label.to(device)
    res = adversary(model, img, label, device)
    examples.append(res)
    
    count += 1
    if count > 100:
        break





  0%|          | 0/4999 [00:00<?, ?it/s][A[A[A[A

Using scale consts: [1e-07]
batch [0] loss: 0.056437134742736816
batch [10] loss: 1.0690791606903076
batch [20] loss: 0.3464754521846771
batch [30] loss: 0.05541723221540451
batch [40] loss: 0.01816614717245102
batch [50] loss: 0.008865579962730408
batch [60] loss: 0.004074404947459698
batch [70] loss: 0.002162997145205736
batch [80] loss: 0.0015084769111126661
batch [90] loss: 0.001219652476720512
batch [100] loss: 0.0010670882184058428
batch [110] loss: 0.0010232070926576853
batch [120] loss: 0.0010094671742990613
batch [130] loss: 0.0010028383694589138
batch [140] loss: 0.0010010713012889028
batch [150] loss: 0.0010003006318584085
batch [160] loss: 0.0010000255424529314
batch [170] loss: 0.0009999442845582962
batch [180] loss: 0.0009999104076996446
batch [190] loss: 0.0009998977184295654
batch [200] loss: 0.0009998935274779797
batch [210] loss: 0.0009998921304941177
batch [220] loss: 0.0009998916648328304
batch [230] loss: 0.0009998914320021868
batch [240] loss: 0.000999891432002186





  0%|          | 1/4999 [00:02<3:59:24,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009998914320021868
batch [290] loss: 0.0009998914320021868
batch [300] loss: 0.0009998914320021868
Using scale consts: [1e-07]
batch [0] loss: 0.024394117295742035
batch [10] loss: 0.4538916349411011
batch [20] loss: 0.1473754495382309
batch [30] loss: 0.024056240916252136
batch [40] loss: 0.008281963877379894
batch [50] loss: 0.0043404074385762215
batch [60] loss: 0.0023106620647013187
batch [70] loss: 0.0014947461895644665
batch [80] loss: 0.0012147794477641582
batch [90] loss: 0.0010922695510089397
batch [100] loss: 0.00102781702298671
batch [110] loss: 0.0010092377196997404
batch [120] loss: 0.0010033625876531005
batch [130] loss: 0.0010005758376792073
batch [140] loss: 0.0009998148307204247
batch [150] loss: 0.0009994924766942859
batch [160] loss: 0.0009993761777877808
batch [170] loss: 0.0009993409039452672
batch [180] loss: 0.0009993265848606825
batch [190] loss: 0.0009993212297558784
batch [200] loss: 0.000999319483526051
batch [210] loss: 0.000999318901449





  0%|          | 2/4999 [00:05<3:58:22,  2.86s/it][A[A[A[A

batch [300] loss: 0.0009993186686187983
Using scale consts: [1e-07]
batch [0] loss: 0.03632313385605812
batch [10] loss: 0.6890931725502014
batch [20] loss: 0.22302016615867615
batch [30] loss: 0.03578292950987816
batch [40] loss: 0.011994551867246628
batch [50] loss: 0.006051926873624325
batch [60] loss: 0.0029769414104521275
batch [70] loss: 0.0017479232046753168
batch [80] loss: 0.0013263460714370012
batch [90] loss: 0.001140978536568582
batch [100] loss: 0.00104228756390512
batch [110] loss: 0.0010143750114366412
batch [120] loss: 0.001005374127998948
batch [130] loss: 0.0010011674603447318
batch [140] loss: 0.0010000066831707954
batch [150] loss: 0.0009995173895731568
batch [160] loss: 0.000999341020360589
batch [170] loss: 0.0009992874693125486
batch [180] loss: 0.000999265699647367
batch [190] loss: 0.0009992575505748391
batch [200] loss: 0.000999254873022437
batch [210] loss: 0.0009992539416998625
batch [220] loss: 0.000999253592453897
batch [230] loss: 0.0009992534760385752
ba





  0%|          | 3/4999 [00:08<3:57:43,  2.85s/it][A[A[A[A

batch [290] loss: 0.0009992534760385752
batch [300] loss: 0.000999253592453897
Using scale consts: [1e-07]
batch [0] loss: 0.021572623401880264
batch [10] loss: 0.3999747037887573
batch [20] loss: 0.13000322878360748
batch [30] loss: 0.02128797397017479
batch [40] loss: 0.007394417189061642
batch [50] loss: 0.003935243468731642
batch [60] loss: 0.0021502175368368626
batch [70] loss: 0.0014357034815475345
batch [80] loss: 0.0011889513116329908
batch [90] loss: 0.0010812397813424468
batch [100] loss: 0.001024239114485681
batch [110] loss: 0.0010079335188493133
batch [120] loss: 0.001002758159302175
batch [130] loss: 0.0010002993512898684
batch [140] loss: 0.000999631593003869
batch [150] loss: 0.000999345793388784
batch [160] loss: 0.0009992438135668635
batch [170] loss: 0.0009992130799219012
batch [180] loss: 0.000999200390651822
batch [190] loss: 0.0009991956176236272
batch [200] loss: 0.0009991941042244434
batch [210] loss: 0.0009991935221478343
batch [220] loss: 0.0009991934057325125





  0%|          | 4/4999 [00:11<3:56:47,  2.84s/it][A[A[A[A

batch [280] loss: 0.0009991932893171906
batch [290] loss: 0.0009991932893171906
batch [300] loss: 0.0009991932893171906
Using scale consts: [1e-07]
batch [0] loss: 0.06032668054103851
batch [10] loss: 1.1487321853637695
batch [20] loss: 0.3716139793395996
batch [30] loss: 0.059140149503946304
batch [40] loss: 0.019385403022170067
batch [50] loss: 0.009440750814974308
batch [60] loss: 0.004299691878259182
batch [70] loss: 0.0022460513282567263
batch [80] loss: 0.0015449480852112174
batch [90] loss: 0.001236405223608017
batch [100] loss: 0.0010722657898440957
batch [110] loss: 0.0010250498307868838
batch [120] loss: 0.001010335050523281
batch [130] loss: 0.0010032086865976453
batch [140] loss: 0.0010013144928961992
batch [150] loss: 0.0010004878276959062
batch [160] loss: 0.0010001917835325003
batch [170] loss: 0.0010001047048717737
batch [180] loss: 0.0010000682668760419
batch [190] loss: 0.0010000546462833881
batch [200] loss: 0.0010000501060858369
batch [210] loss: 0.00100004870910197





  0%|          | 5/4999 [00:14<3:55:57,  2.83s/it][A[A[A[A

batch [300] loss: 0.0010000477777794003
Using scale consts: [1e-07]
batch [0] loss: 0.03157157078385353
batch [10] loss: 0.5922885537147522
batch [20] loss: 0.19196508824825287
batch [30] loss: 0.030947186052799225
batch [40] loss: 0.01044974010437727
batch [50] loss: 0.005340956151485443
batch [60] loss: 0.002701565157622099
batch [70] loss: 0.0016436268342658877
batch [80] loss: 0.0012799457181245089
batch [90] loss: 0.0011210612719878554
batch [100] loss: 0.0010364217450842261
batch [110] loss: 0.0010122043313458562
batch [120] loss: 0.0010045635281130672
batch [130] loss: 0.001000913791358471
batch [140] loss: 0.0009999234462156892
batch [150] loss: 0.0009995016735047102
batch [160] loss: 0.0009993494022637606
batch [170] loss: 0.0009993038838729262
batch [180] loss: 0.000999285141006112
batch [190] loss: 0.0009992781560868025
batch [200] loss: 0.000999275827780366
batch [210] loss: 0.0009992750128731132
batch [220] loss: 0.0009992747800424695
batch [230] loss: 0.000999274663627147





  0%|          | 6/4999 [00:16<3:55:16,  2.83s/it][A[A[A[A

batch [290] loss: 0.0009992746636271477
batch [300] loss: 0.0009992746636271477
Using scale consts: [1e-07]
batch [0] loss: 0.03162310644984245
batch [10] loss: 0.5891032218933105
batch [20] loss: 0.19134238362312317
batch [30] loss: 0.031037699431180954
batch [40] loss: 0.0104605657979846
batch [50] loss: 0.005334425717592239
batch [60] loss: 0.002700747922062874
batch [70] loss: 0.0016443042550235987
batch [80] loss: 0.0012792212655767798
batch [90] loss: 0.0011206644121557474
batch [100] loss: 0.001036577275954187
batch [110] loss: 0.0010124854743480682
batch [120] loss: 0.0010048679541796446
batch [130] loss: 0.00100124126765877
batch [140] loss: 0.0010002588387578726
batch [150] loss: 0.0009998376481235027
batch [160] loss: 0.0009996859589591622
batch [170] loss: 0.0009996405569836497
batch [180] loss: 0.0009996219305321574
batch [190] loss: 0.0009996149456128478
batch [200] loss: 0.0009996126173064113
batch [210] loss: 0.0009996119188144803
batch [220] loss: 0.0009996115695685148





  0%|          | 7/4999 [00:19<3:54:37,  2.82s/it][A[A[A[A

batch [280] loss: 0.000999611453153193
batch [290] loss: 0.0009996115695685148
batch [300] loss: 0.0009996115695685148
Using scale consts: [1e-07]
batch [0] loss: 0.03722240775823593
batch [10] loss: 0.6990114450454712
batch [20] loss: 0.22683003544807434
batch [30] loss: 0.036617398262023926
batch [40] loss: 0.012224103324115276
batch [50] loss: 0.006143765989691019
batch [60] loss: 0.003015308640897274
batch [70] loss: 0.001764098764397204
batch [80] loss: 0.001332007464952767
batch [90] loss: 0.0011425456032156944
batch [100] loss: 0.00104311213362962
batch [110] loss: 0.00101449282374233
batch [120] loss: 0.001005458296276629
batch [130] loss: 0.001001153141260147
batch [140] loss: 0.0009999806061387062
batch [150] loss: 0.0009994835127145052
batch [160] loss: 0.0009993037674576044
batch [170] loss: 0.000999249517917633
batch [180] loss: 0.000999227399006486
batch [190] loss: 0.000999219249933958
batch [200] loss: 0.0009992164559662342
batch [210] loss: 0.0009992155246436596
batch 





  0%|          | 8/4999 [00:22<3:54:18,  2.82s/it][A[A[A[A

batch [300] loss: 0.0009992150589823723
Using scale consts: [1e-07]
batch [0] loss: 0.04259299486875534
batch [10] loss: 0.8062790036201477
batch [20] loss: 0.2611575424671173
batch [30] loss: 0.04200541973114014
batch [40] loss: 0.013971077278256416
batch [50] loss: 0.00694188242778182
batch [60] loss: 0.003327301237732172
batch [70] loss: 0.0018805221188813448
batch [80] loss: 0.0013830042444169521
batch [90] loss: 0.0011642365716397762
batch [100] loss: 0.001050018472597003
batch [110] loss: 0.001016884227283299
batch [120] loss: 0.0010064534144476056
batch [130] loss: 0.0010015158914029598
batch [140] loss: 0.0010001540649682283
batch [150] loss: 0.0009995810687541962
batch [160] loss: 0.0009993751300498843
batch [170] loss: 0.0009993122657760978
batch [180] loss: 0.0009992866544052958
batch [190] loss: 0.0009992773411795497
batch [200] loss: 0.0009992741979658604
batch [210] loss: 0.000999273150227964
batch [220] loss: 0.0009992726845666766
batch [230] loss: 0.0009992725681513548






  0%|          | 9/4999 [00:25<3:53:56,  2.81s/it][A[A[A[A

batch [290] loss: 0.0009992725681513548
batch [300] loss: 0.0009992725681513548
Using scale consts: [1e-07]
batch [0] loss: 0.05574832856655121
batch [10] loss: 1.054135799407959
batch [20] loss: 0.34153416752815247
batch [30] loss: 0.05462696775794029
batch [40] loss: 0.017926786094903946
batch [50] loss: 0.008759640157222748
batch [60] loss: 0.004034584388136864
batch [70] loss: 0.002146572107449174
batch [80] loss: 0.0015010228380560875
batch [90] loss: 0.0012161716585978866
batch [100] loss: 0.0010655163787305355
batch [110] loss: 0.0010224597062915564
batch [120] loss: 0.0010088275885209441
batch [130] loss: 0.0010023257927969098
batch [140] loss: 0.0010005681542679667
batch [150] loss: 0.000999811920337379
batch [160] loss: 0.0009995409054681659
batch [170] loss: 0.000999460113234818
batch [180] loss: 0.0009994267020374537
batch [190] loss: 0.0009994142455980182
batch [200] loss: 0.0009994101710617542
batch [210] loss: 0.0009994087740778923
batch [220] loss: 0.0009994081920012832





  0%|          | 10/4999 [00:28<3:53:51,  2.81s/it][A[A[A[A

batch [280] loss: 0.0009994079591706395
batch [290] loss: 0.0009994079591706395
batch [300] loss: 0.0009994079591706395
Using scale consts: [1e-07]
batch [0] loss: 0.0440889447927475
batch [10] loss: 0.8330892324447632
batch [20] loss: 0.26984548568725586
batch [30] loss: 0.043226804584264755
batch [40] loss: 0.014345034025609493
batch [50] loss: 0.007127633783966303
batch [60] loss: 0.0033943732269108295
batch [70] loss: 0.0019035169389098883
batch [80] loss: 0.001394379884004593
batch [90] loss: 0.0011706933146342635
batch [100] loss: 0.001051764003932476
batch [110] loss: 0.0010174800409004092
batch [120] loss: 0.0010068524861708283
batch [130] loss: 0.001001665135845542
batch [140] loss: 0.0010002930648624897
batch [150] loss: 0.0009996925946325064
batch [160] loss: 0.0009994776919484138
batch [170] loss: 0.0009994147112593055
batch [180] loss: 0.000999388168565929
batch [190] loss: 0.0009993782732635736
batch [200] loss: 0.0009993750136345625
batch [210] loss: 0.000999373965896666





  0%|          | 11/4999 [00:31<3:53:34,  2.81s/it][A[A[A[A

batch [300] loss: 0.000999373383820057
Using scale consts: [1e-07]
batch [0] loss: 0.039071083068847656
batch [10] loss: 0.7297364473342896
batch [20] loss: 0.23658454418182373
batch [30] loss: 0.03819337114691734
batch [40] loss: 0.012746645137667656
batch [50] loss: 0.006377370096743107
batch [60] loss: 0.003105676267296076
batch [70] loss: 0.0017987231258302927
batch [80] loss: 0.0013472605496644974
batch [90] loss: 0.00114876264706254
batch [100] loss: 0.001045216340571642
batch [110] loss: 0.0010153413750231266
batch [120] loss: 0.0010058569023385644
batch [130] loss: 0.0010013888822868466
batch [140] loss: 0.0010001570917665958
batch [150] loss: 0.0009996368316933513
batch [160] loss: 0.0009994502179324627
batch [170] loss: 0.000999393523670733
batch [180] loss: 0.0009993704734370112
batch [190] loss: 0.0009993619751185179
batch [200] loss: 0.0009993590647354722
batch [210] loss: 0.0009993581334128976
batch [220] loss: 0.000999357784166932
batch [230] loss: 0.0009993575513362885






  0%|          | 12/4999 [00:33<3:53:34,  2.81s/it][A[A[A[A

batch [290] loss: 0.0009993575513362885
batch [300] loss: 0.0009993575513362885
Using scale consts: [1e-07]
batch [0] loss: 0.03222453594207764
batch [10] loss: 0.6035816669464111
batch [20] loss: 0.19590844213962555
batch [30] loss: 0.03172805905342102
batch [40] loss: 0.010687317699193954
batch [50] loss: 0.005438424646854401
batch [60] loss: 0.0027353004552423954
batch [70] loss: 0.0016569850267842412
batch [80] loss: 0.0012883299496024847
batch [90] loss: 0.0011234546545892954
batch [100] loss: 0.0010376796126365662
batch [110] loss: 0.0010130711598321795
batch [120] loss: 0.001005205325782299
batch [130] loss: 0.0010015142615884542
batch [140] loss: 0.0010004972573369741
batch [150] loss: 0.0010000676847994328
batch [160] loss: 0.0009999130852520466
batch [170] loss: 0.0009998661698773503
batch [180] loss: 0.0009998471941798925
batch [190] loss: 0.000999840092845261
batch [200] loss: 0.0009998377645388246
batch [210] loss: 0.0009998369496315718
batch [220] loss: 0.0009998366003856





  0%|          | 13/4999 [00:36<3:54:57,  2.83s/it][A[A[A[A

batch [280] loss: 0.0009998366003856063
batch [290] loss: 0.0009998366003856063
batch [300] loss: 0.0009998366003856063
Using scale consts: [1e-07]
batch [0] loss: 0.03343828395009041
batch [10] loss: 0.6290868520736694
batch [20] loss: 0.20405152440071106
batch [30] loss: 0.03294892981648445
batch [40] loss: 0.01108576264232397
batch [50] loss: 0.005627088714390993
batch [60] loss: 0.002810457954183221
batch [70] loss: 0.0016841047909110785
batch [80] loss: 0.0012987072113901377
batch [90] loss: 0.001128622330725193
batch [100] loss: 0.0010388888185843825
batch [110] loss: 0.0010132304159924388
batch [120] loss: 0.0010050823912024498
batch [130] loss: 0.0010012193815782666
batch [140] loss: 0.0010001667542383075
batch [150] loss: 0.0009997172746807337
batch [160] loss: 0.0009995558066293597
batch [170] loss: 0.0009995072614401579
batch [180] loss: 0.0009994873544201255
batch [190] loss: 0.0009994799038395286
batch [200] loss: 0.0009994774591177702
batch [210] loss: 0.00099947664421051





  0%|          | 14/4999 [00:39<3:55:53,  2.84s/it][A[A[A[A

batch [290] loss: 0.000999476294964552
batch [300] loss: 0.000999476294964552
Using scale consts: [1e-07]
batch [0] loss: 0.061714332550764084
batch [10] loss: 1.1652108430862427
batch [20] loss: 0.3774617314338684
batch [30] loss: 0.06021343544125557
batch [40] loss: 0.019688528031110764
batch [50] loss: 0.009576467797160149
batch [60] loss: 0.004353438038378954
batch [70] loss: 0.0022629976738244295
batch [80] loss: 0.0015514141414314508
batch [90] loss: 0.001240330282598734
batch [100] loss: 0.0010726931504905224
batch [110] loss: 0.0010250143241137266
batch [120] loss: 0.0010100628715008497
batch [130] loss: 0.0010028253309428692
batch [140] loss: 0.0010008994722738862
batch [150] loss: 0.0010000617476180196
batch [160] loss: 0.0009997604647651315
batch [170] loss: 0.0009996724547818303
batch [180] loss: 0.0009996353182941675
batch [190] loss: 0.0009996214648708701
batch [200] loss: 0.000999616808257997
batch [210] loss: 0.0009996152948588133
batch [220] loss: 0.000999614829197526






  0%|          | 15/4999 [00:42<3:54:59,  2.83s/it][A[A[A[A

batch [280] loss: 0.0009996145963668823
batch [290] loss: 0.0009996145963668823
batch [300] loss: 0.0009996145963668823
Using scale consts: [1e-07]
batch [0] loss: 0.049112964421510696
batch [10] loss: 0.9273108243942261
batch [20] loss: 0.3005986511707306
batch [30] loss: 0.04821600019931793
batch [40] loss: 0.015888197347521782
batch [50] loss: 0.007821696810424328
batch [60] loss: 0.0036740642972290516
batch [70] loss: 0.0020130781922489405
batch [80] loss: 0.0014402582310140133
batch [90] loss: 0.0011898785596713424
batch [100] loss: 0.0010579105000942945
batch [110] loss: 0.0010197870433330536
batch [120] loss: 0.0010078754276037216
batch [130] loss: 0.001002128585241735
batch [140] loss: 0.0010005884105339646
batch [150] loss: 0.0009999234462156892
batch [160] loss: 0.0009996849112212658
batch [170] loss: 0.0009996137814596295
batch [180] loss: 0.0009995844447985291
batch [190] loss: 0.0009995735017582774
batch [200] loss: 0.0009995698928833008
batch [210] loss: 0.000999568612314





  0%|          | 16/4999 [00:45<3:54:41,  2.83s/it][A[A[A[A

batch [300] loss: 0.0009995680302381516
Using scale consts: [1e-07]
batch [0] loss: 0.0552702471613884
batch [10] loss: 1.0464098453521729
batch [20] loss: 0.33910414576530457
batch [30] loss: 0.05422268807888031
batch [40] loss: 0.01777464896440506
batch [50] loss: 0.0086946627125144
batch [60] loss: 0.004014262929558754
batch [70] loss: 0.002141496166586876
batch [80] loss: 0.0014974935911595821
batch [90] loss: 0.0012148659443482757
batch [100] loss: 0.0010656195227056742
batch [110] loss: 0.0010227133752778172
batch [120] loss: 0.0010092511074617505
batch [130] loss: 0.0010027748066931963
batch [140] loss: 0.0010010356782004237
batch [150] loss: 0.0010002839844673872
batch [160] loss: 0.0010000147158280015
batch [170] loss: 0.0009999347385019064
batch [180] loss: 0.0009999015601351857
batch [190] loss: 0.000999889220111072
batch [200] loss: 0.0009998850291594863
batch [210] loss: 0.0009998837485909462
batch [220] loss: 0.000999883166514337
batch [230] loss: 0.0009998830500990152
ba





  0%|          | 17/4999 [00:47<3:54:13,  2.82s/it][A[A[A[A

batch [290] loss: 0.0009998829336836934
batch [300] loss: 0.0009998829336836934
Using scale consts: [1e-07]
batch [0] loss: 0.04595133662223816
batch [10] loss: 0.8666167259216309
batch [20] loss: 0.28091561794281006
batch [30] loss: 0.04507104307413101
batch [40] loss: 0.014897076413035393
batch [50] loss: 0.007373966742306948
batch [60] loss: 0.003499330021440983
batch [70] loss: 0.001944981049746275
batch [80] loss: 0.001410367782227695
batch [90] loss: 0.0011770639102905989
batch [100] loss: 0.0010534590110182762
batch [110] loss: 0.001017904607579112
batch [120] loss: 0.0010067536495625973
batch [130] loss: 0.0010013943538069725
batch [140] loss: 0.000999952550046146
batch [150] loss: 0.0009993326384574175
batch [160] loss: 0.0009991094702854753
batch [170] loss: 0.000999042997136712
batch [180] loss: 0.000999015523120761
batch [190] loss: 0.0009990052785724401
batch [200] loss: 0.0009990019025281072
batch [210] loss: 0.000999000738374889
batch [220] loss: 0.0009990002727136016
ba





  0%|          | 18/4999 [00:50<3:54:13,  2.82s/it][A[A[A[A

batch [280] loss: 0.0009990001562982798
batch [290] loss: 0.0009990001562982798
batch [300] loss: 0.0009990001562982798
Using scale consts: [1e-07]
batch [0] loss: 0.031506121158599854
batch [10] loss: 0.5895049571990967
batch [20] loss: 0.1912485510110855
batch [30] loss: 0.03100000135600567
batch [40] loss: 0.010472903028130531
batch [50] loss: 0.005340735428035259
batch [60] loss: 0.00269818352535367
batch [70] loss: 0.0016435212455689907
batch [80] loss: 0.0012812165077775717
batch [90] loss: 0.0011205238988623023
batch [100] loss: 0.0010367522481828928
batch [110] loss: 0.0010127793066203594
batch [120] loss: 0.001005062717013061
batch [130] loss: 0.0010014667641371489
batch [140] loss: 0.0010004719952121377
batch [150] loss: 0.0010000530164688826
batch [160] loss: 0.0009999023750424385
batch [170] loss: 0.000999856274574995
batch [180] loss: 0.0009998376481235027
batch [190] loss: 0.000999830779619515
batch [200] loss: 0.0009998285677284002
batch [210] loss: 0.0009998277528211474





  0%|          | 19/4999 [00:53<3:54:20,  2.82s/it][A[A[A[A

batch [300] loss: 0.0009998272871598601
Using scale consts: [1e-07]
batch [0] loss: 0.0333230160176754
batch [10] loss: 0.6257972717285156
batch [20] loss: 0.20300133526325226
batch [30] loss: 0.03284744173288345
batch [40] loss: 0.011052346788346767
batch [50] loss: 0.005602109711617231
batch [60] loss: 0.002801853697746992
batch [70] loss: 0.0016834603156894445
batch [80] loss: 0.0012975112767890096
batch [90] loss: 0.0011283067287877202
batch [100] loss: 0.0010389924282208085
batch [110] loss: 0.0010135810589417815
batch [120] loss: 0.0010053839068859816
batch [130] loss: 0.0010015724692493677
batch [140] loss: 0.0010005160002037883
batch [150] loss: 0.001000070828013122
batch [160] loss: 0.0009999106405302882
batch [170] loss: 0.0009998620953410864
batch [180] loss: 0.0009998423047363758
batch [190] loss: 0.0009998350869864225
batch [200] loss: 0.0009998326422646642
batch [210] loss: 0.0009998317109420896
batch [220] loss: 0.000999831478111446
batch [230] loss: 0.000999831361696124





  0%|          | 20/4999 [00:56<3:54:26,  2.83s/it][A[A[A[A

batch [290] loss: 0.0009998312452808022
batch [300] loss: 0.000999831361696124
Using scale consts: [1e-07]
batch [0] loss: 0.02959766611456871
batch [10] loss: 0.5529311299324036
batch [20] loss: 0.17943231761455536
batch [30] loss: 0.029154140502214432
batch [40] loss: 0.00989946722984314
batch [50] loss: 0.0050772265531122684
batch [60] loss: 0.002595932688564062
batch [70] loss: 0.0016028565587475896
batch [80] loss: 0.0012619842309504747
batch [90] loss: 0.001112425117753446
batch [100] loss: 0.0010340125299990177
batch [110] loss: 0.0010112706804648042
batch [120] loss: 0.001004149205982685
batch [130] loss: 0.0010007418459281325
batch [140] loss: 0.0009998178575187922
batch [150] loss: 0.0009994242573156953
batch [160] loss: 0.0009992819977924228
batch [170] loss: 0.0009992391569539905
batch [180] loss: 0.0009992215782403946
batch [190] loss: 0.0009992151753976941
batch [200] loss: 0.0009992129635065794
batch [210] loss: 0.0009992122650146484
batch [220] loss: 0.00099921203218400





  0%|          | 21/4999 [00:59<3:54:14,  2.82s/it][A[A[A[A

batch [280] loss: 0.000999211915768683
batch [290] loss: 0.000999211915768683
batch [300] loss: 0.000999211915768683
Using scale consts: [1e-07]
batch [0] loss: 0.037435710430145264
batch [10] loss: 0.7024430632591248
batch [20] loss: 0.2276584357023239
batch [30] loss: 0.03669584169983864
batch [40] loss: 0.012279413640499115
batch [50] loss: 0.006170016713440418
batch [60] loss: 0.003022876102477312
batch [70] loss: 0.0017656992422416806
batch [80] loss: 0.0013340517180040479
batch [90] loss: 0.0011435080086812377
batch [100] loss: 0.0010436968877911568
batch [110] loss: 0.0010149142472073436
batch [120] loss: 0.0010058216284960508
batch [130] loss: 0.0010015121661126614
batch [140] loss: 0.0010003333445638418
batch [150] loss: 0.000999832060188055
batch [160] loss: 0.000999651849269867
batch [170] loss: 0.0009995974833145738
batch [180] loss: 0.0009995753644034266
batch [190] loss: 0.000999567098915577
batch [200] loss: 0.000999564421363175
batch [210] loss: 0.0009995634900406003
ba





  0%|          | 22/4999 [01:02<3:54:06,  2.82s/it][A[A[A[A

batch [300] loss: 0.000999563024379313
Using scale consts: [1e-07]
batch [0] loss: 0.048605941236019135
batch [10] loss: 0.9187933802604675
batch [20] loss: 0.2975035011768341
batch [30] loss: 0.047610558569431305
batch [40] loss: 0.015713494271039963
batch [50] loss: 0.007745299953967333
batch [60] loss: 0.0036412826739251614
batch [70] loss: 0.0020009044092148542
batch [80] loss: 0.0014365235110744834
batch [90] loss: 0.0011885110288858414
batch [100] loss: 0.0010573003673925996
batch [110] loss: 0.0010198368690907955
batch [120] loss: 0.0010078942868858576
batch [130] loss: 0.001002254313789308
batch [140] loss: 0.001000716700218618
batch [150] loss: 0.0010000593028962612
batch [160] loss: 0.000999822630546987
batch [170] loss: 0.0009997517336159945
batch [180] loss: 0.0009997227462008595
batch [190] loss: 0.0009997119195759296
batch [200] loss: 0.000999708310700953
batch [210] loss: 0.0009997071465477347
batch [220] loss: 0.0009997066808864474
batch [230] loss: 0.000999706564471125





  0%|          | 23/4999 [01:05<3:55:34,  2.84s/it][A[A[A[A

batch [290] loss: 0.0009997064480558038
batch [300] loss: 0.0009997064480558038
Using scale consts: [1e-07]
batch [0] loss: 0.0612359419465065
batch [10] loss: 1.1602096557617188
batch [20] loss: 0.37565335631370544
batch [30] loss: 0.05992792174220085
batch [40] loss: 0.019625188782811165
batch [50] loss: 0.009553358890116215
batch [60] loss: 0.004343990236520767
batch [70] loss: 0.0022600903175771236
batch [80] loss: 0.0015501264715567231
batch [90] loss: 0.0012380214175209403
batch [100] loss: 0.001072232029400766
batch [110] loss: 0.001024649478495121
batch [120] loss: 0.0010097332997247577
batch [130] loss: 0.0010025444207713008
batch [140] loss: 0.0010006243828684092
batch [150] loss: 0.0009997878223657608
batch [160] loss: 0.000999489682726562
batch [170] loss: 0.0009994013234972954
batch [180] loss: 0.0009993644198402762
batch [190] loss: 0.0009993505664169788
batch [200] loss: 0.0009993461426347494
batch [210] loss: 0.0009993446292355657
batch [220] loss: 0.0009993440471589565





  0%|          | 24/4999 [01:07<3:55:57,  2.85s/it][A[A[A[A

batch [280] loss: 0.000999343697912991
batch [290] loss: 0.0009993438143283129
batch [300] loss: 0.0009993438143283129
Using scale consts: [1e-07]
batch [0] loss: 0.061330053955316544
batch [10] loss: 1.1671193838119507
batch [20] loss: 0.37791886925697327
batch [30] loss: 0.06022960692644119
batch [40] loss: 0.019707752391695976
batch [50] loss: 0.009592993184924126
batch [60] loss: 0.00436022924259305
batch [70] loss: 0.0022655813954770565
batch [80] loss: 0.0015528262592852116
batch [90] loss: 0.0012396699748933315
batch [100] loss: 0.0010727349435910583
batch [110] loss: 0.0010247430764138699
batch [120] loss: 0.00100986217148602
batch [130] loss: 0.0010025760857388377
batch [140] loss: 0.0010006558150053024
batch [150] loss: 0.0009998164605349302
batch [160] loss: 0.000999515294097364
batch [170] loss: 0.0009994269348680973
batch [180] loss: 0.0009993897983804345
batch [190] loss: 0.000999375944957137
batch [200] loss: 0.0009993714047595859
batch [210] loss: 0.000999369891360402
b





  1%|          | 25/4999 [01:10<3:55:04,  2.84s/it][A[A[A[A

batch [300] loss: 0.0009993690764531493
Using scale consts: [1e-07]
batch [0] loss: 0.0298876091837883
batch [10] loss: 0.5540735125541687
batch [20] loss: 0.18011407554149628
batch [30] loss: 0.029314590618014336
batch [40] loss: 0.009920381009578705
batch [50] loss: 0.005087082274258137
batch [60] loss: 0.0025987899862229824
batch [70] loss: 0.0016057759057730436
batch [80] loss: 0.0012633224250748754
batch [90] loss: 0.0011134753003716469
batch [100] loss: 0.0010345519986003637
batch [110] loss: 0.0010118960635736585
batch [120] loss: 0.001004711608402431
batch [130] loss: 0.0010013054125010967
batch [140] loss: 0.0010003765346482396
batch [150] loss: 0.0009999811882153153
batch [160] loss: 0.0009998390451073647
batch [170] loss: 0.0009997963206842542
batch [180] loss: 0.0009997786255553365
batch [190] loss: 0.0009997721062973142
batch [200] loss: 0.0009997700108215213
batch [210] loss: 0.0009997691959142685
batch [220] loss: 0.0009997689630836248
batch [230] loss: 0.000999768846668





  1%|          | 26/4999 [01:13<3:56:26,  2.85s/it][A[A[A[A

batch [290] loss: 0.000999768846668303
batch [300] loss: 0.000999768846668303
Using scale consts: [1e-07]
batch [0] loss: 0.04871072620153427
batch [10] loss: 0.9230268001556396
batch [20] loss: 0.2991834580898285
batch [30] loss: 0.047928765416145325
batch [40] loss: 0.0157875157892704
batch [50] loss: 0.007778463885188103
batch [60] loss: 0.0036604655906558037
batch [70] loss: 0.0020073531195521355
batch [80] loss: 0.001437901402823627
batch [90] loss: 0.0011893726186826825
batch [100] loss: 0.0010577053762972355
batch [110] loss: 0.0010198378004133701
batch [120] loss: 0.0010079662315547466
batch [130] loss: 0.0010022567585110664
batch [140] loss: 0.0010007205419242382
batch [150] loss: 0.0010000605834648013
batch [160] loss: 0.0009998229797929525
batch [170] loss: 0.0009997524321079254
batch [180] loss: 0.000999723095446825
batch [190] loss: 0.0009997122688218951
batch [200] loss: 0.0009997086599469185
batch [210] loss: 0.0009997074957937002
batch [220] loss: 0.000999707030132413
b





  1%|          | 27/4999 [01:16<3:56:11,  2.85s/it][A[A[A[A

batch [280] loss: 0.0009997067973017693
batch [290] loss: 0.0009997067973017693
batch [300] loss: 0.0009997067973017693
Using scale consts: [1e-07]
batch [0] loss: 0.0232529379427433
batch [10] loss: 0.4316587746143341
batch [20] loss: 0.14023107290267944
batch [30] loss: 0.022919686511158943
batch [40] loss: 0.007924060337245464
batch [50] loss: 0.004174359142780304
batch [60] loss: 0.002238896209746599
batch [70] loss: 0.0014687906950712204
batch [80] loss: 0.0012047505006194115
batch [90] loss: 0.0010878017637878656
batch [100] loss: 0.001026549143716693
batch [110] loss: 0.001008812920190394
batch [120] loss: 0.0010032609570771456
batch [130] loss: 0.0010006034281104803
batch [140] loss: 0.0009998808382079005
batch [150] loss: 0.0009995733853429556
batch [160] loss: 0.00099946279078722
batch [170] loss: 0.0009994297288358212
batch [180] loss: 0.0009994161082431674
batch [190] loss: 0.000999410985969007
batch [200] loss: 0.0009994093561545014
batch [210] loss: 0.0009994087740778923






  1%|          | 28/4999 [01:19<3:55:33,  2.84s/it][A[A[A[A

batch [300] loss: 0.0009994085412472486
Using scale consts: [1e-07]
batch [0] loss: 0.026524176821112633
batch [10] loss: 0.49784961342811584
batch [20] loss: 0.16146695613861084
batch [30] loss: 0.026203947141766548
batch [40] loss: 0.008956316858530045
batch [50] loss: 0.0046528177335858345
batch [60] loss: 0.002429777756333351
batch [70] loss: 0.0015405092854052782
batch [80] loss: 0.001236391020938754
batch [90] loss: 0.0011013682233169675
batch [100] loss: 0.0010304504539817572
batch [110] loss: 0.0010101828956976533
batch [120] loss: 0.001003729528747499
batch [130] loss: 0.001000681659206748
batch [140] loss: 0.000999839510768652
batch [150] loss: 0.0009994874708354473
batch [160] loss: 0.0009993601124733686
batch [170] loss: 0.0009993213461712003
batch [180] loss: 0.0009993056301027536
batch [190] loss: 0.0009992999257519841
batch [200] loss: 0.000999297946691513
batch [210] loss: 0.000999297248199582
batch [220] loss: 0.0009992970153689384
batch [230] loss: 0.00099929689895361





  1%|          | 29/4999 [01:22<3:54:44,  2.83s/it][A[A[A[A

batch [290] loss: 0.0009992968989536166
batch [300] loss: 0.0009992968989536166
Using scale consts: [1e-07]
batch [0] loss: 0.03341937065124512
batch [10] loss: 0.6212925314903259
batch [20] loss: 0.20172646641731262
batch [30] loss: 0.03275642171502113
batch [40] loss: 0.011025574989616871
batch [50] loss: 0.005583153106272221
batch [60] loss: 0.0027913914527744055
batch [70] loss: 0.0016790147637948394
batch [80] loss: 0.0012955772690474987
batch [90] loss: 0.0011267276713624597
batch [100] loss: 0.0010383825283497572
batch [110] loss: 0.0010129868751391768
batch [120] loss: 0.0010049247648566961
batch [130] loss: 0.0010011080885306
batch [140] loss: 0.0010000625625252724
batch [150] loss: 0.0009996205335482955
batch [160] loss: 0.00099946151021868
batch [170] loss: 0.0009994131978601217
batch [180] loss: 0.000999393523670733
batch [190] loss: 0.0009993863059207797
batch [200] loss: 0.0009993838611990213
batch [210] loss: 0.0009993830462917686
batch [220] loss: 0.000999382697045803
b





  1%|          | 30/4999 [01:24<3:55:56,  2.85s/it][A[A[A[A

batch [280] loss: 0.0009993825806304812
batch [290] loss: 0.0009993825806304812
batch [300] loss: 0.0009993825806304812
Using scale consts: [1e-07]
batch [0] loss: 0.04864171892404556
batch [10] loss: 0.9208120703697205
batch [20] loss: 0.2982251048088074
batch [30] loss: 0.04767272621393204
batch [40] loss: 0.015715867280960083
batch [50] loss: 0.007751183584332466
batch [60] loss: 0.0036456305533647537
batch [70] loss: 0.002002640627324581
batch [80] loss: 0.0014372770674526691
batch [90] loss: 0.001188528724014759
batch [100] loss: 0.0010572749888524413
batch [110] loss: 0.0010194767965003848
batch [120] loss: 0.00100763700902462
batch [130] loss: 0.001001953729428351
batch [140] loss: 0.001000418676994741
batch [150] loss: 0.0009997596498578787
batch [160] loss: 0.0009995232103392482
batch [170] loss: 0.000999452662654221
batch [180] loss: 0.0009994234424084425
batch [190] loss: 0.0009994126157835126
batch [200] loss: 0.000999409006908536
batch [210] loss: 0.0009994077263399959
bat





  1%|          | 31/4999 [01:27<3:56:32,  2.86s/it][A[A[A[A

batch [290] loss: 0.0009994071442633867
batch [300] loss: 0.0009994071442633867
Using scale consts: [1e-07]
batch [0] loss: 0.04644083231687546
batch [10] loss: 0.8721329569816589
batch [20] loss: 0.2829701900482178
batch [30] loss: 0.04549792781472206
batch [40] loss: 0.01502266712486744
batch [50] loss: 0.007420726120471954
batch [60] loss: 0.0035158791579306126
batch [70] loss: 0.0019505457021296024
batch [80] loss: 0.001413828693330288
batch [90] loss: 0.0011785969836637378
batch [100] loss: 0.0010541639057919383
batch [110] loss: 0.001018327078782022
batch [120] loss: 0.0010071353754028678
batch [130] loss: 0.0010017212480306625
batch [140] loss: 0.0010002774652093649
batch [150] loss: 0.0009996520821005106
batch [160] loss: 0.00099942646920681
batch [170] loss: 0.000999359879642725
batch [180] loss: 0.0009993321727961302
batch [190] loss: 0.0009993219282478094
batch [200] loss: 0.0009993184357881546
batch [210] loss: 0.0009993172716349363
batch [220] loss: 0.0009993169223889709
b





  1%|          | 32/4999 [01:30<3:55:47,  2.85s/it][A[A[A[A

batch [280] loss: 0.0009993166895583272
batch [290] loss: 0.0009993166895583272
batch [300] loss: 0.0009993166895583272
Using scale consts: [1e-07]
batch [0] loss: 0.04984009265899658
batch [10] loss: 0.941489577293396
batch [20] loss: 0.30527886748313904
batch [30] loss: 0.04903232678771019
batch [40] loss: 0.01617465540766716
batch [50] loss: 0.00795053318142891
batch [60] loss: 0.003717745654284954
batch [70] loss: 0.00202437536790967
batch [80] loss: 0.0014472706243395805
batch [90] loss: 0.001192605821415782
batch [100] loss: 0.001058725523762405
batch [110] loss: 0.0010199344251304865
batch [120] loss: 0.0010078666964545846
batch [130] loss: 0.0010020198533311486
batch [140] loss: 0.001000463729724288
batch [150] loss: 0.000999787007458508
batch [160] loss: 0.0009995445143431425
batch [170] loss: 0.0009994725696742535
batch [180] loss: 0.0009994427673518658
batch [190] loss: 0.0009994315914809704
batch [200] loss: 0.000999427866190672
batch [210] loss: 0.0009994267020374537
batch





  1%|          | 33/4999 [01:33<3:56:59,  2.86s/it][A[A[A[A

batch [300] loss: 0.0009994260035455227
Using scale consts: [1e-07]
batch [0] loss: 0.03889527916908264
batch [10] loss: 0.7338753938674927
batch [20] loss: 0.23764798045158386
batch [30] loss: 0.038189806044101715
batch [40] loss: 0.012757591903209686
batch [50] loss: 0.006396915763616562
batch [60] loss: 0.0031120325438678265
batch [70] loss: 0.0017991107888519764
batch [80] loss: 0.001348551013506949
batch [90] loss: 0.0011498447274789214
batch [100] loss: 0.0010455413721501827
batch [110] loss: 0.001015485730022192
batch [120] loss: 0.0010059913620352745
batch [130] loss: 0.0010014845756813884
batch [140] loss: 0.0010002523194998503
batch [150] loss: 0.0009997290326282382
batch [160] loss: 0.0009995412547141314
batch [170] loss: 0.0009994844440370798
batch [180] loss: 0.0009994611609727144
batch [190] loss: 0.0009994525462388992
batch [200] loss: 0.0009994497522711754
batch [210] loss: 0.000999448704533279
batch [220] loss: 0.0009994483552873135
batch [230] loss: 0.0009994482388719





  1%|          | 34/4999 [01:36<3:56:52,  2.86s/it][A[A[A[A

batch [290] loss: 0.0009994482388719916
batch [300] loss: 0.0009994482388719916
Using scale consts: [1e-07]
batch [0] loss: 0.028040044009685516
batch [10] loss: 0.5219645500183105
batch [20] loss: 0.16953404247760773
batch [30] loss: 0.027613533660769463
batch [40] loss: 0.009399675764143467
batch [50] loss: 0.004848821088671684
batch [60] loss: 0.0025081862695515156
batch [70] loss: 0.001569452229887247
batch [80] loss: 0.0012477251002565026
batch [90] loss: 0.001106404815800488
batch [100] loss: 0.0010322221787646413
batch [110] loss: 0.0010108771966770291
batch [120] loss: 0.001004098216071725
batch [130] loss: 0.001000898191705346
batch [140] loss: 0.001000022515654564
batch [150] loss: 0.0009996510343626142
batch [160] loss: 0.0009995173895731568
batch [170] loss: 0.000999476877041161
batch [180] loss: 0.0009994603460654616
batch [190] loss: 0.0009994542924687266
batch [200] loss: 0.0009994521969929338
batch [210] loss: 0.0009994514985010028
batch [220] loss: 0.000999451265670359





  1%|          | 35/4999 [01:39<3:56:24,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009994511492550373
batch [290] loss: 0.0009994511492550373
batch [300] loss: 0.0009994512656703591
Using scale consts: [1e-07]
batch [0] loss: 0.04516728222370148
batch [10] loss: 0.8486127257347107
batch [20] loss: 0.27503305673599243
batch [30] loss: 0.04422686621546745
batch [40] loss: 0.01465747319161892
batch [50] loss: 0.007260081823915243
batch [60] loss: 0.0034531299024820328
batch [70] loss: 0.0019243168644607067
batch [80] loss: 0.0014025315176695585
batch [90] loss: 0.0011738140601664782
batch [100] loss: 0.0010527289705350995
batch [110] loss: 0.0010179909877479076
batch [120] loss: 0.0010070092976093292
batch [130] loss: 0.0010017860913649201
batch [140] loss: 0.001000366173684597
batch [150] loss: 0.000999759417027235
batch [160] loss: 0.0009995413711294532
batch [170] loss: 0.0009994758293032646
batch [180] loss: 0.0009994489373639226
batch [190] loss: 0.0009994389256462455
batch [200] loss: 0.0009994356660172343
batch [210] loss: 0.00099943450186401





  1%|          | 36/4999 [01:42<3:57:21,  2.87s/it][A[A[A[A

batch [290] loss: 0.000999433919787407
batch [300] loss: 0.000999433919787407
Using scale consts: [1e-07]
batch [0] loss: 0.038034453988075256
batch [10] loss: 0.7162926197052002
batch [20] loss: 0.23211601376533508
batch [30] loss: 0.03738347440958023
batch [40] loss: 0.012495510280132294
batch [50] loss: 0.006269850768148899
batch [60] loss: 0.003065156750380993
batch [70] loss: 0.0017799332272261381
batch [80] loss: 0.0013393072877079248
batch [90] loss: 0.0011461250251159072
batch [100] loss: 0.0010440463665872812
batch [110] loss: 0.001014724955894053
batch [120] loss: 0.0010054869344457984
batch [130] loss: 0.001001059776172042
batch [140] loss: 0.0009998691966757178
batch [150] loss: 0.0009993561543524265
batch [160] loss: 0.00099917221814394
batch [170] loss: 0.0009991169208660722
batch [180] loss: 0.000999094219878316
batch [190] loss: 0.0009990858379751444
batch [200] loss: 0.0009990830440074205
batch [210] loss: 0.000999082112684846
batch [220] loss: 0.0009990817634388804
ba





  1%|          | 37/4999 [01:44<3:56:21,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009990815306082368
batch [290] loss: 0.0009990815306082368
batch [300] loss: 0.0009990815306082368
Using scale consts: [1e-07]
batch [0] loss: 0.049001164734363556
batch [10] loss: 0.9297208786010742
batch [20] loss: 0.3011418282985687
batch [30] loss: 0.048181042075157166
batch [40] loss: 0.015879418700933456
batch [50] loss: 0.007828216068446636
batch [60] loss: 0.0036771236918866634
batch [70] loss: 0.002014117082580924
batch [80] loss: 0.0014419083017855883
batch [90] loss: 0.001190200331620872
batch [100] loss: 0.001058060908690095
batch [110] loss: 0.00101988494861871
batch [120] loss: 0.001007904065772891
batch [130] loss: 0.0010021657217293978
batch [140] loss: 0.0010006161173805594
batch [150] loss: 0.0009999516187235713
batch [160] loss: 0.000999713083729148
batch [170] loss: 0.0009996414883062243
batch [180] loss: 0.0009996119188144803
batch [190] loss: 0.0009996009757742286
batch [200] loss: 0.000999597366899252
batch [210] loss: 0.0009995962027460337
b





  1%|          | 38/4999 [01:47<3:56:13,  2.86s/it][A[A[A[A

batch [300] loss: 0.0009995955042541027
Using scale consts: [1e-07]
batch [0] loss: 0.04826495051383972
batch [10] loss: 0.9183962345123291
batch [20] loss: 0.2974358797073364
batch [30] loss: 0.047509681433439255
batch [40] loss: 0.015652570873498917
batch [50] loss: 0.007731970399618149
batch [60] loss: 0.003645222634077072
batch [70] loss: 0.002003457397222519
batch [80] loss: 0.0014349482953548431
batch [90] loss: 0.0011880110250785947
batch [100] loss: 0.0010571246966719627
batch [110] loss: 0.0010194644564762712
batch [120] loss: 0.0010076413163915277
batch [130] loss: 0.001001974567770958
batch [140] loss: 0.001000446849502623
batch [150] loss: 0.0009997878223657608
batch [160] loss: 0.0009995517320930958
batch [170] loss: 0.0009994815336540341
batch [180] loss: 0.0009994524298235774
batch [190] loss: 0.0009994416031986475
batch [200] loss: 0.0009994379943236709
batch [210] loss: 0.0009994368301704526
batch [220] loss: 0.0009994363645091653
batch [230] loss: 0.000999436248093843





  1%|          | 39/4999 [01:50<3:57:04,  2.87s/it][A[A[A[A

batch [290] loss: 0.0009994361316785216
batch [300] loss: 0.0009994362480938435
Using scale consts: [1e-07]
batch [0] loss: 0.03872772306203842
batch [10] loss: 0.7233618497848511
batch [20] loss: 0.2347329705953598
batch [30] loss: 0.0378388836979866
batch [40] loss: 0.012607014738023281
batch [50] loss: 0.006315477192401886
batch [60] loss: 0.0030818963423371315
batch [70] loss: 0.0017884356202557683
batch [80] loss: 0.0013431895058602095
batch [90] loss: 0.0011487709125503898
batch [100] loss: 0.0010450520785525441
batch [110] loss: 0.0010155672207474709
batch [120] loss: 0.0010061925277113914
batch [130] loss: 0.0010017391759902239
batch [140] loss: 0.0010005307849496603
batch [150] loss: 0.0010000135516747832
batch [160] loss: 0.000999827403575182
batch [170] loss: 0.0009997718734666705
batch [180] loss: 0.0009997489396482706
batch [190] loss: 0.0009997403249144554
batch [200] loss: 0.0009997375309467316
batch [210] loss: 0.000999736599624157
batch [220] loss: 0.000999736250378191





  1%|          | 40/4999 [01:53<3:56:08,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009997361339628696
batch [290] loss: 0.0009997361339628696
batch [300] loss: 0.0009997361339628696
Using scale consts: [1e-07]
batch [0] loss: 0.03547438979148865
batch [10] loss: 0.6658967137336731
batch [20] loss: 0.21616984903812408
batch [30] loss: 0.03490805625915527
batch [40] loss: 0.01168022584170103
batch [50] loss: 0.005901454482227564
batch [60] loss: 0.002920885570347309
batch [70] loss: 0.0017252827528864145
batch [80] loss: 0.0013159924419596791
batch [90] loss: 0.0011364214587956667
batch [100] loss: 0.0010411448311060667
batch [110] loss: 0.0010140483500435948
batch [120] loss: 0.0010054032318294048
batch [130] loss: 0.001001309254206717
batch [140] loss: 0.0010001908522099257
batch [150] loss: 0.0009997178567573428
batch [160] loss: 0.0009995471918955445
batch [170] loss: 0.0009994958527386189
batch [180] loss: 0.0009994747815653682
batch [190] loss: 0.000999466865323484
batch [200] loss: 0.0009994643041864038
batch [210] loss: 0.000999463372863829





  1%|          | 41/4999 [01:56<3:55:18,  2.85s/it][A[A[A[A

batch [300] loss: 0.0009994630236178637
Using scale consts: [1e-07]
batch [0] loss: 0.03746678680181503
batch [10] loss: 0.7044550180435181
batch [20] loss: 0.2286725491285324
batch [30] loss: 0.03682961314916611
batch [40] loss: 0.012263492681086063
batch [50] loss: 0.006160878576338291
batch [60] loss: 0.0030232788994908333
batch [70] loss: 0.0017692544497549534
batch [80] loss: 0.001334708766080439
batch [90] loss: 0.0011441460810601711
batch [100] loss: 0.0010438651079311967
batch [110] loss: 0.0010148888686671853
batch [120] loss: 0.00100587063934654
batch [130] loss: 0.0010014981962740421
batch [140] loss: 0.001000328455120325
batch [150] loss: 0.0009998258901759982
batch [160] loss: 0.0009996441658586264
batch [170] loss: 0.0009995902655646205
batch [180] loss: 0.000999567797407508
batch [190] loss: 0.0009995594155043364
batch [200] loss: 0.0009995567379519343
batch [210] loss: 0.0009995558066293597
batch [220] loss: 0.0009995554573833942
batch [230] loss: 0.0009995553409680724






  1%|          | 42/4999 [01:59<3:55:09,  2.85s/it][A[A[A[A

batch [290] loss: 0.0009995554573833942
batch [300] loss: 0.0009995554573833942
Using scale consts: [1e-07]
batch [0] loss: 0.025347935035824776
batch [10] loss: 0.47239136695861816
batch [20] loss: 0.15357765555381775
batch [30] loss: 0.02507626637816429
batch [40] loss: 0.008589940145611763
batch [50] loss: 0.004475955385714769
batch [60] loss: 0.0023613120429217815
batch [70] loss: 0.0015156845329329371
batch [80] loss: 0.001223873347043991
batch [90] loss: 0.0010966071859002113
batch [100] loss: 0.0010292701190337539
batch [110] loss: 0.0010099670616909862
batch [120] loss: 0.0010038716718554497
batch [130] loss: 0.0010009584948420525
batch [140] loss: 0.0010001705959439278
batch [150] loss: 0.0009998339228332043
batch [160] loss: 0.0009997125016525388
batch [170] loss: 0.0009996761800721288
batch [180] loss: 0.000999661278910935
batch [190] loss: 0.0009996556909754872
batch [200] loss: 0.0009996539447456598
batch [210] loss: 0.0009996532462537289
batch [220] loss: 0.00099965301342





  1%|          | 43/4999 [02:02<3:56:25,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009996528970077634
batch [290] loss: 0.0009996528970077634
batch [300] loss: 0.0009996528970077634
Using scale consts: [1e-07]
batch [0] loss: 0.04460524395108223
batch [10] loss: 0.8445395231246948
batch [20] loss: 0.2736448049545288
batch [30] loss: 0.04381508380174637
batch [40] loss: 0.014505729079246521
batch [50] loss: 0.007203039713203907
batch [60] loss: 0.003431484801694751
batch [70] loss: 0.0019175971392542124
batch [80] loss: 0.0013991928426548839
batch [90] loss: 0.0011732836719602346
batch [100] loss: 0.0010522681986913085
batch [110] loss: 0.0010176794603466988
batch [120] loss: 0.001006845966912806
batch [130] loss: 0.0010016117012128234
batch [140] loss: 0.0010002143681049347
batch [150] loss: 0.0009996076114475727
batch [160] loss: 0.0009993900312110782
batch [170] loss: 0.0009993260027840734
batch [180] loss: 0.0009992991108447313
batch [190] loss: 0.0009992890991270542
batch [200] loss: 0.0009992857230827212
batch [210] loss: 0.00099928467534482





  1%|          | 44/4999 [02:05<4:03:03,  2.94s/it][A[A[A[A

batch [290] loss: 0.0009992840932682157
batch [300] loss: 0.0009992840932682157
Using scale consts: [1e-07]
batch [0] loss: 0.05096249282360077
batch [10] loss: 0.9639076590538025
batch [20] loss: 0.3121901750564575
batch [30] loss: 0.049974098801612854
batch [40] loss: 0.016461526975035667
batch [50] loss: 0.008090764284133911
batch [60] loss: 0.0037786990869790316
batch [70] loss: 0.0020518898963928223
batch [80] loss: 0.0014574723318219185
batch [90] loss: 0.001197122037410736
batch [100] loss: 0.001059986767359078
batch [110] loss: 0.0010204410646110773
batch [120] loss: 0.001008019782602787
batch [130] loss: 0.0010020544286817312
batch [140] loss: 0.0010004490613937378
batch [150] loss: 0.0009997611632570624
batch [160] loss: 0.0009995133150368929
batch [170] loss: 0.0009994390420615673
batch [180] loss: 0.0009994085412472486
batch [190] loss: 0.0009993971325457096
batch [200] loss: 0.0009993934072554111
batch [210] loss: 0.000999392126686871
batch [220] loss: 0.000999391661025583





  1%|          | 45/4999 [02:08<4:06:29,  2.99s/it][A[A[A[A

batch [280] loss: 0.00099939142819494
batch [290] loss: 0.00099939142819494
batch [300] loss: 0.000999391544610262
Using scale consts: [1e-07]
batch [0] loss: 0.0576176755130291
batch [10] loss: 1.0928328037261963
batch [20] loss: 0.3539171516895294
batch [30] loss: 0.056467656046152115
batch [40] loss: 0.01850508339703083
batch [50] loss: 0.00903533399105072
batch [60] loss: 0.0041473135352134705
batch [70] loss: 0.0021873137447983027
batch [80] loss: 0.001517054159194231
batch [90] loss: 0.001224590465426445
batch [100] loss: 0.0010678479447960854
batch [110] loss: 0.0010231244377791882
batch [120] loss: 0.0010090763680636883
batch [130] loss: 0.0010023044887930155
batch [140] loss: 0.0010004931828007102
batch [150] loss: 0.0009997077286243439
batch [160] loss: 0.0009994265856221318
batch [170] loss: 0.0009993435814976692
batch [180] loss: 0.000999308773316443
batch [190] loss: 0.0009992957348003983
batch [200] loss: 0.0009992915438488126
batch [210] loss: 0.0009992900304496288
batch





  1%|          | 46/4999 [02:11<4:06:38,  2.99s/it][A[A[A[A

batch [290] loss: 0.0009992893319576979
batch [300] loss: 0.0009992893319576979
Using scale consts: [1e-07]
batch [0] loss: 0.03251967206597328
batch [10] loss: 0.6055835485458374
batch [20] loss: 0.1966518610715866
batch [30] loss: 0.0318460538983345
batch [40] loss: 0.010694793425500393
batch [50] loss: 0.005439317785203457
batch [60] loss: 0.0027434786316007376
batch [70] loss: 0.0016618099762126803
batch [80] loss: 0.0012881597504019737
batch [90] loss: 0.001123902969993651
batch [100] loss: 0.001037342706695199
batch [110] loss: 0.0010127599816769361
batch [120] loss: 0.001004849444143474
batch [130] loss: 0.0010011486010625958
batch [140] loss: 0.0010001237969845533
batch [150] loss: 0.0009996958542615175
batch [160] loss: 0.0009995399741455913
batch [170] loss: 0.0009994928259402514
batch [180] loss: 0.0009994738502427936
batch [190] loss: 0.0009994667489081621
batch [200] loss: 0.0009994644206017256
batch [210] loss: 0.0009994636056944728
batch [220] loss: 0.0009994632564485073





  1%|          | 47/4999 [02:14<4:04:25,  2.96s/it][A[A[A[A

batch [280] loss: 0.0009994631400331855
batch [290] loss: 0.0009994631400331855
batch [300] loss: 0.0009994632564485073
Using scale consts: [1e-07]
batch [0] loss: 0.036855150014162064
batch [10] loss: 0.6929236054420471
batch [20] loss: 0.22458654642105103
batch [30] loss: 0.036202963441610336
batch [40] loss: 0.012116043828427792
batch [50] loss: 0.006093553267419338
batch [60] loss: 0.0029953094199299812
batch [70] loss: 0.001755851088091731
batch [80] loss: 0.0013291251379996538
batch [90] loss: 0.001141432556323707
batch [100] loss: 0.0010429669637233019
batch [110] loss: 0.0010145611595362425
batch [120] loss: 0.001005611033178866
batch [130] loss: 0.0010013481369242072
batch [140] loss: 0.0010001882910728455
batch [150] loss: 0.0009996918961405754
batch [160] loss: 0.0009995148284360766
batch [170] loss: 0.000999461393803358
batch [180] loss: 0.0009994393913075328
batch [190] loss: 0.000999431242235005
batch [200] loss: 0.0009994285646826029
batch [210] loss: 0.00099942763336002





  1%|          | 48/4999 [02:17<4:01:04,  2.92s/it][A[A[A[A

batch [290] loss: 0.000999427167698741
batch [300] loss: 0.000999427167698741
Using scale consts: [1e-07]
batch [0] loss: 0.04170828312635422
batch [10] loss: 0.7879765629768372
batch [20] loss: 0.25526201725006104
batch [30] loss: 0.0410771369934082
batch [40] loss: 0.013694355264306068
batch [50] loss: 0.006821092218160629
batch [60] loss: 0.0032766154035925865
batch [70] loss: 0.0018595857545733452
batch [80] loss: 0.001372884726151824
batch [90] loss: 0.0011616031406447291
batch [100] loss: 0.001049211947247386
batch [110] loss: 0.0010169374290853739
batch [120] loss: 0.0010067657567560673
batch [130] loss: 0.0010019163601100445
batch [140] loss: 0.001000598887912929
batch [150] loss: 0.0010000343900173903
batch [160] loss: 0.0009998326422646642
batch [170] loss: 0.0009997719898819923
batch [180] loss: 0.0009997469605877995
batch [190] loss: 0.0009997377637773752
batch [200] loss: 0.000999734620563686
batch [210] loss: 0.0009997335728257895
batch [220] loss: 0.000999733223579824
ba





  1%|          | 49/4999 [02:19<3:59:19,  2.90s/it][A[A[A[A

batch [280] loss: 0.0009997331071645021
batch [290] loss: 0.0009997331071645021
batch [300] loss: 0.0009997331071645021
Using scale consts: [1e-07]
batch [0] loss: 0.029763592407107353
batch [10] loss: 0.556694746017456
batch [20] loss: 0.18069294095039368
batch [30] loss: 0.02928032912313938
batch [40] loss: 0.009900356642901897
batch [50] loss: 0.005076563451439142
batch [60] loss: 0.0026009967550635338
batch [70] loss: 0.001609099330380559
batch [80] loss: 0.0012645358219742775
batch [90] loss: 0.0011132964864373207
batch [100] loss: 0.001034232322126627
batch [110] loss: 0.0010114912874996662
batch [120] loss: 0.0010042613139376044
batch [130] loss: 0.0010008601238951087
batch [140] loss: 0.000999919488094747
batch [150] loss: 0.00099952204618603
batch [160] loss: 0.0009993795538321137
batch [170] loss: 0.0009993364801630378
batch [180] loss: 0.000999318901449442
batch [190] loss: 0.0009993123821914196
batch [200] loss: 0.0009993101703003049
batch [210] loss: 0.000999309471808374
b





  1%|          | 50/4999 [02:22<3:58:57,  2.90s/it][A[A[A[A

batch [290] loss: 0.0009993090061470866
batch [300] loss: 0.0009993090061470866
Using scale consts: [1e-07]
batch [0] loss: 0.028918970376253128
batch [10] loss: 0.5382409691810608
batch [20] loss: 0.1748957335948944
batch [30] loss: 0.02843169867992401
batch [40] loss: 0.009625367820262909
batch [50] loss: 0.004946778528392315
batch [60] loss: 0.0025481022894382477
batch [70] loss: 0.0015875018434599042
batch [80] loss: 0.0012562400661408901
batch [90] loss: 0.0011099582770839334
batch [100] loss: 0.0010331745725125074
batch [110] loss: 0.0010113007156178355
batch [120] loss: 0.0010042794747278094
batch [130] loss: 0.001000990392640233
batch [140] loss: 0.0010000835172832012
batch [150] loss: 0.0009997000452131033
batch [160] loss: 0.0009995620930567384
batch [170] loss: 0.0009995204163715243
batch [180] loss: 0.0009995034197345376
batch [190] loss: 0.0009994970168918371
batch [200] loss: 0.0009994949214160442
batch [210] loss: 0.0009994942229241133
batch [220] loss: 0.000999493990093





  1%|          | 51/4999 [02:25<3:56:54,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009994938736781478
batch [290] loss: 0.0009994938736781478
batch [300] loss: 0.0009994938736781478
Using scale consts: [1e-07]
batch [0] loss: 0.05186096951365471
batch [10] loss: 0.9799945950508118
batch [20] loss: 0.3177091181278229
batch [30] loss: 0.05094735324382782
batch [40] loss: 0.016751764342188835
batch [50] loss: 0.008220428600907326
batch [60] loss: 0.0038257245905697346
batch [70] loss: 0.0020681852474808693
batch [80] loss: 0.0014664744958281517
batch [90] loss: 0.0012008290505036712
batch [100] loss: 0.0010611261241137981
batch [110] loss: 0.0010210926411673427
batch [120] loss: 0.0010083734523504972
batch [130] loss: 0.0010023367358371615
batch [140] loss: 0.001000701915472746
batch [150] loss: 0.001000000280328095
batch [160] loss: 0.0009997484739869833
batch [170] loss: 0.0009996730368584394
batch [180] loss: 0.0009996419539675117
batch [190] loss: 0.0009996304288506508
batch [200] loss: 0.0009996267035603523
batch [210] loss: 0.00099962542299181





  1%|          | 52/4999 [02:28<3:56:43,  2.87s/it][A[A[A[A

batch [300] loss: 0.0009996246080845594
Using scale consts: [1e-07]
batch [0] loss: 0.04328126832842827
batch [10] loss: 0.8155550360679626
batch [20] loss: 0.26414451003074646
batch [30] loss: 0.0424029603600502
batch [40] loss: 0.014100461266934872
batch [50] loss: 0.007002581842243671
batch [60] loss: 0.003344209399074316
batch [70] loss: 0.0018864948069676757
batch [80] loss: 0.001386463176459074
batch [90] loss: 0.0011668311199173331
batch [100] loss: 0.0010506751714274287
batch [110] loss: 0.0010171050671488047
batch [120] loss: 0.0010066437534987926
batch [130] loss: 0.0010015894658863544
batch [140] loss: 0.0010002384660765529
batch [150] loss: 0.0009996528970077634
batch [160] loss: 0.0009994428837671876
batch [170] loss: 0.0009993808344006538
batch [180] loss: 0.0009993549901992083
batch [190] loss: 0.0009993453277274966
batch [200] loss: 0.0009993421845138073
batch [210] loss: 0.000999341020360589
batch [220] loss: 0.0009993406711146235
batch [230] loss: 0.000999340554699301





  1%|          | 53/4999 [02:31<3:56:55,  2.87s/it][A[A[A[A

batch [290] loss: 0.0009993405546993017
batch [300] loss: 0.0009993405546993017
Using scale consts: [1e-07]
batch [0] loss: 0.061080291867256165
batch [10] loss: 1.161176085472107
batch [20] loss: 0.37603843212127686
batch [30] loss: 0.05996687710285187
batch [40] loss: 0.01962684653699398
batch [50] loss: 0.009555519558489323
batch [60] loss: 0.004342778120189905
batch [70] loss: 0.002258831635117531
batch [80] loss: 0.0015498639550060034
batch [90] loss: 0.0012388575123623013
batch [100] loss: 0.0010725132888182998
batch [110] loss: 0.0010248413309454918
batch [120] loss: 0.001009975909255445
batch [130] loss: 0.0010027572279796004
batch [140] loss: 0.0010008388198912144
batch [150] loss: 0.0010000065667554736
batch [160] loss: 0.0009997058659791946
batch [170] loss: 0.0009996176231652498
batch [180] loss: 0.0009995808359235525
batch [190] loss: 0.000999566982500255
batch [200] loss: 0.0009995624423027039
batch [210] loss: 0.00099956092890352
batch [220] loss: 0.000999560346826911
ba





  1%|          | 54/4999 [02:34<3:55:29,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009995601139962673
batch [290] loss: 0.0009995601139962673
batch [300] loss: 0.0009995601139962673
Using scale consts: [1e-07]
batch [0] loss: 0.04157406464219093
batch [10] loss: 0.7887712717056274
batch [20] loss: 0.25566238164901733
batch [30] loss: 0.0409892275929451
batch [40] loss: 0.013596986420452595
batch [50] loss: 0.0067810965701937675
batch [60] loss: 0.003265680279582739
batch [70] loss: 0.001858381787315011
batch [80] loss: 0.0013742608716711402
batch [90] loss: 0.0011617288691923022
batch [100] loss: 0.0010487373219802976
batch [110] loss: 0.001016693888232112
batch [120] loss: 0.0010064532980322838
batch [130] loss: 0.0010016070445999503
batch [140] loss: 0.0010002886410802603
batch [150] loss: 0.000999723793938756
batch [160] loss: 0.0009995215805247426
batch [170] loss: 0.0009994612773880363
batch [180] loss: 0.0009994362480938435
batch [190] loss: 0.0009994269348680973
batch [200] loss: 0.000999423791654408
batch [210] loss: 0.0009994228603318334





  1%|          | 55/4999 [02:37<3:56:26,  2.87s/it][A[A[A[A

batch [300] loss: 0.0009994222782552242
Using scale consts: [1e-07]
batch [0] loss: 0.05082201212644577
batch [10] loss: 0.9592987298965454
batch [20] loss: 0.31063902378082275
batch [30] loss: 0.049704112112522125
batch [40] loss: 0.01639099046587944
batch [50] loss: 0.008065585047006607
batch [60] loss: 0.0037659024819731712
batch [70] loss: 0.0020443883258849382
batch [80] loss: 0.0014545926824212074
batch [90] loss: 0.0011966994497925043
batch [100] loss: 0.0010596761712804437
batch [110] loss: 0.001020382042042911
batch [120] loss: 0.0010080070933327079
batch [130] loss: 0.0010020909830927849
batch [140] loss: 0.0010004877112805843
batch [150] loss: 0.0009998007444664836
batch [160] loss: 0.0009995546424761415
batch [170] loss: 0.000999480951577425
batch [180] loss: 0.0009994505671784282
batch [190] loss: 0.000999439274892211
batch [200] loss: 0.0009994355496019125
batch [210] loss: 0.0009994342690333724
batch [220] loss: 0.000999433803372085
batch [230] loss: 0.000999433686956763





  1%|          | 56/4999 [02:39<3:56:17,  2.87s/it][A[A[A[A

batch [290] loss: 0.0009994335705414414
batch [300] loss: 0.0009994335705414414
Using scale consts: [1e-07]
batch [0] loss: 0.03409435972571373
batch [10] loss: 0.6393751502037048
batch [20] loss: 0.2073574811220169
batch [30] loss: 0.03344298154115677
batch [40] loss: 0.011229878291487694
batch [50] loss: 0.005688670091331005
batch [60] loss: 0.0028352816589176655
batch [70] loss: 0.0016964833484962583
batch [80] loss: 0.00130320037715137
batch [90] loss: 0.0011311541311442852
batch [100] loss: 0.00103965203743428
batch [110] loss: 0.0010135940974578261
batch [120] loss: 0.0010053180158138275
batch [130] loss: 0.0010013836435973644
batch [140] loss: 0.0010003187926486135
batch [150] loss: 0.000999859650619328
batch [160] loss: 0.000999695505015552
batch [170] loss: 0.0009996467269957066
batch [180] loss: 0.0009996265871450305
batch [190] loss: 0.0009996190201491117
batch [200] loss: 0.0009996164590120316
batch [210] loss: 0.0009996156441047788
batch [220] loss: 0.0009996152948588133
b





  1%|          | 57/4999 [02:42<3:55:12,  2.86s/it][A[A[A[A

batch [280] loss: 0.0009996151784434915
batch [290] loss: 0.0009996151784434915
batch [300] loss: 0.0009996151784434915
Using scale consts: [1e-07]
batch [0] loss: 0.03916487097740173
batch [10] loss: 0.7321706414222717
batch [20] loss: 0.23739755153656006
batch [30] loss: 0.03817982226610184
batch [40] loss: 0.01271894946694374
batch [50] loss: 0.0063703786581754684
batch [60] loss: 0.003102461341768503
batch [70] loss: 0.0017968848114833236
batch [80] loss: 0.0013476551976054907
batch [90] loss: 0.0011501708067953587
batch [100] loss: 0.0010454842122271657
batch [110] loss: 0.0010155424242839217
batch [120] loss: 0.001006105449050665
batch [130] loss: 0.0010015773586928844
batch [140] loss: 0.0010003629140555859
batch [150] loss: 0.0009998363675549626
batch [160] loss: 0.0009996475419029593
batch [170] loss: 0.000999592011794448
batch [180] loss: 0.0009995687287300825
batch [190] loss: 0.0009995601139962673
batch [200] loss: 0.0009995572036132216
batch [210] loss: 0.00099955627229064





  1%|          | 58/4999 [02:45<3:56:23,  2.87s/it][A[A[A[A

batch [290] loss: 0.0009995558066293597
batch [300] loss: 0.0009995558066293597
Using scale consts: [1e-07]
batch [0] loss: 0.04185362532734871
batch [10] loss: 0.7880275249481201
batch [20] loss: 0.2553676962852478
batch [30] loss: 0.041088204830884933
batch [40] loss: 0.013667643070220947
batch [50] loss: 0.006804740056395531
batch [60] loss: 0.003270599525421858
batch [70] loss: 0.0018585491925477982
batch [80] loss: 0.0013745619216933846
batch [90] loss: 0.0011612387606874108
batch [100] loss: 0.0010487785330042243
batch [110] loss: 0.0010167310247197747
batch [120] loss: 0.001006429549306631
batch [130] loss: 0.0010016192682087421
batch [140] loss: 0.001000286196358502
batch [150] loss: 0.0009997269371524453
batch [160] loss: 0.0009995256550610065
batch [170] loss: 0.0009994643041864038
batch [180] loss: 0.000999439274892211
batch [190] loss: 0.0009994300780817866
batch [200] loss: 0.0009994270512834191
batch [210] loss: 0.0009994260035455227
batch [220] loss: 0.000999425654299557





  1%|          | 59/4999 [02:48<3:56:51,  2.88s/it][A[A[A[A

batch [280] loss: 0.0009994254214689136
batch [290] loss: 0.0009994254214689136
batch [300] loss: 0.0009994254214689136
Using scale consts: [1e-07]
batch [0] loss: 0.036800116300582886
batch [10] loss: 0.6920884847640991
batch [20] loss: 0.2244131863117218
batch [30] loss: 0.03615701198577881
batch [40] loss: 0.012084674090147018
batch [50] loss: 0.006085967645049095
batch [60] loss: 0.0029929527081549168
batch [70] loss: 0.00175368576310575
batch [80] loss: 0.0013294713571667671
batch [90] loss: 0.0011414301116019487
batch [100] loss: 0.0010428826790302992
batch [110] loss: 0.0010146592976525426
batch [120] loss: 0.001005661441013217
batch [130] loss: 0.0010014217114076018
batch [140] loss: 0.0010002602357417345
batch [150] loss: 0.0009997660527005792
batch [160] loss: 0.0009995887521654367
batch [170] loss: 0.0009995353175327182
batch [180] loss: 0.0009995133150368929
batch [190] loss: 0.0009995052823796868
batch [200] loss: 0.0009995026048272848
batch [210] loss: 0.00099950167350471





  1%|          | 60/4999 [02:51<3:56:13,  2.87s/it][A[A[A[A

batch [300] loss: 0.000999501091428101
Using scale consts: [1e-07]
batch [0] loss: 0.04412728175520897
batch [10] loss: 0.8317593336105347
batch [20] loss: 0.269745409488678
batch [30] loss: 0.043347522616386414
batch [40] loss: 0.014353432692587376
batch [50] loss: 0.007124652154743671
batch [60] loss: 0.0033962284214794636
batch [70] loss: 0.0019059809856116772
batch [80] loss: 0.0013953871093690395
batch [90] loss: 0.0011702787596732378
batch [100] loss: 0.0010516596958041191
batch [110] loss: 0.0010176659561693668
batch [120] loss: 0.0010068854317069054
batch [130] loss: 0.0010017728200182319
batch [140] loss: 0.0010003808420151472
batch [150] loss: 0.0009997852612286806
batch [160] loss: 0.0009995716391131282
batch [170] loss: 0.000999507843516767
batch [180] loss: 0.0009994815336540341
batch [190] loss: 0.0009994717547670007
batch [200] loss: 0.0009994684951379895
batch [210] loss: 0.0009994673309847713
batch [220] loss: 0.0009994669817388058
batch [230] loss: 0.00099946686532348





  1%|          | 61/4999 [02:54<3:57:14,  2.88s/it][A[A[A[A

batch [290] loss: 0.0009994667489081621
batch [300] loss: 0.0009994667489081621
Using scale consts: [1e-07]
batch [0] loss: 0.04064086824655533
batch [10] loss: 0.7694286108016968
batch [20] loss: 0.24901844561100006
batch [30] loss: 0.03998984396457672
batch [40] loss: 0.01336285937577486
batch [50] loss: 0.006675769109278917
batch [60] loss: 0.003218072932213545
batch [70] loss: 0.0018357646185904741
batch [80] loss: 0.0013631726615130901
batch [90] loss: 0.0011573275551199913
batch [100] loss: 0.0010472557041794062
batch [110] loss: 0.0010158744407817721
batch [120] loss: 0.0010058971820399165
batch [130] loss: 0.0010011711856350303
batch [140] loss: 0.0009998856112360954
batch [150] loss: 0.0009993334533646703
batch [160] loss: 0.0009991367114707828
batch [170] loss: 0.0009990775724872947
batch [180] loss: 0.0009990532416850328
batch [190] loss: 0.0009990441612899303
batch [200] loss: 0.0009990411344915628
batch [210] loss: 0.0009990402031689882
batch [220] loss: 0.0009990397375077





  1%|          | 62/4999 [02:57<4:04:10,  2.97s/it][A[A[A[A

batch [300] loss: 0.000999039621092379
Using scale consts: [1e-07]
batch [0] loss: 0.02558797597885132
batch [10] loss: 0.47449710965156555
batch [20] loss: 0.15404781699180603
batch [30] loss: 0.025103379040956497
batch [40] loss: 0.008605853654444218
batch [50] loss: 0.004485970828682184
batch [60] loss: 0.0023643909953534603
batch [70] loss: 0.001516928430646658
batch [80] loss: 0.0012252670712769032
batch [90] loss: 0.0010966347763314843
batch [100] loss: 0.0010289286728948355
batch [110] loss: 0.0010096896439790726
batch [120] loss: 0.0010034805163741112
batch [130] loss: 0.0010005938820540905
batch [140] loss: 0.000999788986518979
batch [150] loss: 0.000999451382085681
batch [160] loss: 0.0009993304265663028
batch [170] loss: 0.0009992935229092836
batch [180] loss: 0.0009992786217480898
batch [190] loss: 0.000999273033812642
batch [200] loss: 0.0009992711711674929
batch [210] loss: 0.0009992705890908837
batch [220] loss: 0.00099927035626024
batch [230] loss: 0.0009992702398449183





  1%|▏         | 63/4999 [03:00<4:07:18,  3.01s/it][A[A[A[A

batch [300] loss: 0.0009992702398449183
Using scale consts: [1e-07]
batch [0] loss: 0.03293699026107788
batch [10] loss: 0.6179078817367554
batch [20] loss: 0.20040546357631683
batch [30] loss: 0.03236442431807518
batch [40] loss: 0.010901698842644691
batch [50] loss: 0.005542649421840906
batch [60] loss: 0.0027776812203228474
batch [70] loss: 0.0016720512649044394
batch [80] loss: 0.001292989356443286
batch [90] loss: 0.0011263611959293485
batch [100] loss: 0.001038135145790875
batch [110] loss: 0.0010129170259460807
batch [120] loss: 0.0010049528209492564
batch [130] loss: 0.0010011348640546203
batch [140] loss: 0.0010001041227951646
batch [150] loss: 0.0009996610460802913
batch [160] loss: 0.0009995036525651813
batch [170] loss: 0.0009994563879445195
batch [180] loss: 0.000999436597339809
batch [190] loss: 0.0009994292631745338
batch [200] loss: 0.0009994269348680973
batch [210] loss: 0.0009994261199608445
batch [220] loss: 0.000999425770714879
batch [230] loss: 0.000999425654299557





  1%|▏         | 64/4999 [03:03<4:06:40,  3.00s/it][A[A[A[A

batch [290] loss: 0.0009994256542995572
batch [300] loss: 0.000999425770714879
Using scale consts: [1e-07]
batch [0] loss: 0.034622672945261
batch [10] loss: 0.6511465907096863
batch [20] loss: 0.21114513278007507
batch [30] loss: 0.034004323184490204
batch [40] loss: 0.011408021673560143
batch [50] loss: 0.005776815116405487
batch [60] loss: 0.002871609292924404
batch [70] loss: 0.0017084784340113401
batch [80] loss: 0.001307925907894969
batch [90] loss: 0.0011330785928294063
batch [100] loss: 0.001040198840200901
batch [110] loss: 0.0010134440381079912
batch [120] loss: 0.0010051082354038954
batch [130] loss: 0.0010010675759986043
batch [140] loss: 0.0009999895701184869
batch [150] loss: 0.0009995236760005355
batch [160] loss: 0.0009993556886911392
batch [170] loss: 0.000999306095764041
batch [180] loss: 0.0009992853738367558
batch [190] loss: 0.0009992776904255152
batch [200] loss: 0.000999275129288435
batch [210] loss: 0.0009992743143811822
batch [220] loss: 0.0009992739651352167
b





  1%|▏         | 65/4999 [03:06<4:04:30,  2.97s/it][A[A[A[A

batch [280] loss: 0.0009992738487198949
batch [290] loss: 0.0009992738487198949
batch [300] loss: 0.0009992738487198949
Using scale consts: [1e-07]
batch [0] loss: 0.05140363425016403
batch [10] loss: 0.9750559329986572
batch [20] loss: 0.3156205415725708
batch [30] loss: 0.05037340521812439
batch [40] loss: 0.016596317291259766
batch [50] loss: 0.008168869651854038
batch [60] loss: 0.0038076373748481274
batch [70] loss: 0.002059028949588537
batch [80] loss: 0.0014616347616538405
batch [90] loss: 0.0012001350987702608
batch [100] loss: 0.0010607901494950056
batch [110] loss: 0.0010207429295405746
batch [120] loss: 0.0010082504013553262
batch [130] loss: 0.001002199249342084
batch [140] loss: 0.001000583521090448
batch [150] loss: 0.0009998863097280264
batch [160] loss: 0.0009996345033869147
batch [170] loss: 0.0009995599975809455
batch [180] loss: 0.0009995291475206614
batch [190] loss: 0.0009995176224038005
batch [200] loss: 0.0009995137806981802
batch [210] loss: 0.00099951250012964






  1%|▏         | 66/4999 [03:09<4:01:03,  2.93s/it][A[A[A[A

batch [290] loss: 0.0009995118016377091
batch [300] loss: 0.0009995118016377091
Using scale consts: [1e-07]
batch [0] loss: 0.042910315096378326
batch [10] loss: 0.8067230582237244
batch [20] loss: 0.26165440678596497
batch [30] loss: 0.042121365666389465
batch [40] loss: 0.013979637064039707
batch [50] loss: 0.006953039206564426
batch [60] loss: 0.0033336742781102657
batch [70] loss: 0.001880258903838694
batch [80] loss: 0.001381400041282177
batch [90] loss: 0.001165301539003849
batch [100] loss: 0.0010500826174393296
batch [110] loss: 0.0010169316083192825
batch [120] loss: 0.0010066054528579116
batch [130] loss: 0.0010015909792855382
batch [140] loss: 0.001000254531390965
batch [150] loss: 0.0009996750159189105
batch [160] loss: 0.0009994670981541276
batch [170] loss: 0.000999405630864203
batch [180] loss: 0.000999380019493401
batch [190] loss: 0.0009993704734370112
batch [200] loss: 0.000999367330223322
batch [210] loss: 0.0009993662824854255
batch [220] loss: 0.00099936593323946
b





  1%|▏         | 67/4999 [03:12<3:59:24,  2.91s/it][A[A[A[A

batch [280] loss: 0.0009993657004088163
batch [290] loss: 0.0009993657004088163
batch [300] loss: 0.0009993657004088163
Using scale consts: [1e-07]
batch [0] loss: 0.030326955020427704
batch [10] loss: 0.5690433979034424
batch [20] loss: 0.18460653722286224
batch [30] loss: 0.02981097437441349
batch [40] loss: 0.01007633563131094
batch [50] loss: 0.005166004411876202
batch [60] loss: 0.00263003702275455
batch [70] loss: 0.0016180240781977773
batch [80] loss: 0.0012708465801551938
batch [90] loss: 0.0011165444739162922
batch [100] loss: 0.001035583671182394
batch [110] loss: 0.0010122251696884632
batch [120] loss: 0.0010049383854493499
batch [130] loss: 0.0010014097206294537
batch [140] loss: 0.0010004633804783225
batch [150] loss: 0.0010000575566664338
batch [160] loss: 0.000999911455437541
batch [170] loss: 0.0009998679161071777
batch [180] loss: 0.0009998498717322946
batch [190] loss: 0.0009998431196436286
batch [200] loss: 0.0009998409077525139
batch [210] loss: 0.000999840092845261





  1%|▏         | 68/4999 [03:15<3:58:51,  2.91s/it][A[A[A[A

batch [290] loss: 0.0009998397435992956
batch [300] loss: 0.0009998397435992956
Using scale consts: [1e-07]
batch [0] loss: 0.036105185747146606
batch [10] loss: 0.6779870986938477
batch [20] loss: 0.2200218290090561
batch [30] loss: 0.03555109724402428
batch [40] loss: 0.011892424896359444
batch [50] loss: 0.005990102421492338
batch [60] loss: 0.002953009447082877
batch [70] loss: 0.0017402912490069866
batch [80] loss: 0.001322639174759388
batch [90] loss: 0.0011388768907636404
batch [100] loss: 0.0010423838393762708
batch [110] loss: 0.0010146661661565304
batch [120] loss: 0.0010058621410280466
batch [130] loss: 0.0010017079766839743
batch [140] loss: 0.0010005711810663342
batch [150] loss: 0.001000085729174316
batch [160] loss: 0.0009999125031754375
batch [170] loss: 0.0009998602326959372
batch [180] loss: 0.0009998385794460773
batch [190] loss: 0.0009998305467888713
batch [200] loss: 0.000999827985651791
batch [210] loss: 0.0009998270543292165
batch [220] loss: 0.000999826705083251





  1%|▏         | 69/4999 [03:17<3:57:02,  2.88s/it][A[A[A[A

batch [280] loss: 0.0009998265886679292
batch [290] loss: 0.0009998265886679292
batch [300] loss: 0.0009998265886679292
Using scale consts: [1e-07]
batch [0] loss: 0.028617367148399353
batch [10] loss: 0.5353604555130005
batch [20] loss: 0.17370174825191498
batch [30] loss: 0.028167087584733963
batch [40] loss: 0.00956120528280735
batch [50] loss: 0.004921631421893835
batch [60] loss: 0.0025391411036252975
batch [70] loss: 0.0015848500188440084
batch [80] loss: 0.0012551227118819952
batch [90] loss: 0.0011093643261119723
batch [100] loss: 0.0010334269609302282
batch [110] loss: 0.0010115585755556822
batch [120] loss: 0.0010046021779999137
batch [130] loss: 0.0010013248538598418
batch [140] loss: 0.0010004250798374414
batch [150] loss: 0.0010000448673963547
batch [160] loss: 0.0009999077301472425
batch [170] loss: 0.0009998660534620285
batch [180] loss: 0.0009998490568250418
batch [190] loss: 0.000999842886812985
batch [200] loss: 0.000999840791337192
batch [210] loss: 0.000999840092845





  1%|▏         | 70/4999 [03:20<3:57:17,  2.89s/it][A[A[A[A

batch [300] loss: 0.0009998397435992956
Using scale consts: [1e-07]
batch [0] loss: 0.05161125585436821
batch [10] loss: 0.975433886051178
batch [20] loss: 0.3161448538303375
batch [30] loss: 0.05066527798771858
batch [40] loss: 0.016669651493430138
batch [50] loss: 0.008185042068362236
batch [60] loss: 0.003818738739937544
batch [70] loss: 0.002066174289211631
batch [80] loss: 0.0014617489650845528
batch [90] loss: 0.0012001446448266506
batch [100] loss: 0.0010609619785100222
batch [110] loss: 0.0010209062602370977
batch [120] loss: 0.0010083402739837766
batch [130] loss: 0.001002322300337255
batch [140] loss: 0.0010006946977227926
batch [150] loss: 0.0009999957401305437
batch [160] loss: 0.0009997448651120067
batch [170] loss: 0.000999670010060072
batch [180] loss: 0.000999639043584466
batch [190] loss: 0.0009996275184676051
batch [200] loss: 0.0009996237931773067
batch [210] loss: 0.0009996225126087666
batch [220] loss: 0.0009996220469474792
batch [230] loss: 0.0009996218141168356
b





  1%|▏         | 71/4999 [03:23<3:57:01,  2.89s/it][A[A[A[A

batch [290] loss: 0.0009996218141168356
batch [300] loss: 0.0009996218141168356
Using scale consts: [1e-07]
batch [0] loss: 0.02460162341594696
batch [10] loss: 0.4582103192806244
batch [20] loss: 0.14857922494411469
batch [30] loss: 0.02415434829890728
batch [40] loss: 0.00832015834748745
batch [50] loss: 0.004357665311545134
batch [60] loss: 0.0023137901443988085
batch [70] loss: 0.0014981107087805867
batch [80] loss: 0.0012171530397608876
batch [90] loss: 0.0010931803844869137
batch [100] loss: 0.001028135302476585
batch [110] loss: 0.0010093593737110496
batch [120] loss: 0.0010034464066848159
batch [130] loss: 0.0010006381198763847
batch [140] loss: 0.0009998702444136143
batch [150] loss: 0.000999542186036706
batch [160] loss: 0.0009994249558076262
batch [170] loss: 0.0009993897983804345
batch [180] loss: 0.0009993752464652061
batch [190] loss: 0.000999369891360402
batch [200] loss: 0.0009993680287152529
batch [210] loss: 0.0009993674466386437
batch [220] loss: 0.000999367213808
ba





  1%|▏         | 72/4999 [03:26<3:55:32,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009993670973926783
batch [290] loss: 0.0009993670973926783
batch [300] loss: 0.0009993670973926783
Using scale consts: [1e-07]
batch [0] loss: 0.0279300007969141
batch [10] loss: 0.5182651877403259
batch [20] loss: 0.16864347457885742
batch [30] loss: 0.027510933578014374
batch [40] loss: 0.009328801184892654
batch [50] loss: 0.00480737816542387
batch [60] loss: 0.0024923328310251236
batch [70] loss: 0.0015675765462219715
batch [80] loss: 0.0012471441878005862
batch [90] loss: 0.0011055230861529708
batch [100] loss: 0.0010321433655917645
batch [110] loss: 0.0010108976857736707
batch [120] loss: 0.0010041830828413367
batch [130] loss: 0.0010009973775595427
batch [140] loss: 0.0010001322953030467
batch [150] loss: 0.0009997589513659477
batch [160] loss: 0.000999626237899065
batch [170] loss: 0.0009995864238590002
batch [180] loss: 0.0009995700092986226
batch [190] loss: 0.0009995638392865658
batch [200] loss: 0.0009995618602260947
batch [210] loss: 0.0009995611617341





  1%|▏         | 73/4999 [03:29<3:56:24,  2.88s/it][A[A[A[A

batch [300] loss: 0.0009995608124881983
Using scale consts: [1e-07]
batch [0] loss: 0.05337396264076233
batch [10] loss: 1.0126920938491821
batch [20] loss: 0.32785487174987793
batch [30] loss: 0.05242011323571205
batch [40] loss: 0.01724919304251671
batch [50] loss: 0.008450067602097988
batch [60] loss: 0.003916229587048292
batch [70] loss: 0.0021032369695603848
batch [80] loss: 0.0014813051093369722
batch [90] loss: 0.001207662164233625
batch [100] loss: 0.0010634507052600384
batch [110] loss: 0.0010219154646620154
batch [120] loss: 0.001008855295367539
batch [130] loss: 0.0010026033269241452
batch [140] loss: 0.0010009183315560222
batch [150] loss: 0.0010001921327784657
batch [160] loss: 0.0009999321773648262
batch [170] loss: 0.0009998545283451676
batch [180] loss: 0.0009998223977163434
batch [190] loss: 0.000999810523353517
batch [200] loss: 0.000999806565232575
batch [210] loss: 0.000999805168248713
batch [220] loss: 0.0009998047025874257
batch [230] loss: 0.0009998045861721039
b





  1%|▏         | 74/4999 [03:32<3:55:55,  2.87s/it][A[A[A[A

batch [290] loss: 0.000999804469756782
batch [300] loss: 0.000999804469756782
Using scale consts: [1e-07]
batch [0] loss: 0.045074090361595154
batch [10] loss: 0.8503987789154053
batch [20] loss: 0.27573147416114807
batch [30] loss: 0.04428740590810776
batch [40] loss: 0.014659264124929905
batch [50] loss: 0.00726381316781044
batch [60] loss: 0.003447284922003746
batch [70] loss: 0.0019246882293373346
batch [80] loss: 0.001404640730470419
batch [90] loss: 0.0011744003277271986
batch [100] loss: 0.0010530889267101884
batch [110] loss: 0.0010182487312704325
batch [120] loss: 0.0010073023149743676
batch [130] loss: 0.0010020529152825475
batch [140] loss: 0.0010006304364651442
batch [150] loss: 0.001000021817162633
batch [160] loss: 0.0009998036548495293
batch [170] loss: 0.0009997384622693062
batch [180] loss: 0.0009997114539146423
batch [190] loss: 0.0009997014421969652
batch [200] loss: 0.000999698182567954
batch [210] loss: 0.0009996970184147358
batch [220] loss: 0.0009996966691687703






  2%|▏         | 75/4999 [03:35<3:55:22,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009996964363381267
batch [290] loss: 0.0009996964363381267
batch [300] loss: 0.0009996964363381267
Using scale consts: [1e-07]
batch [0] loss: 0.04488601163029671
batch [10] loss: 0.8501287698745728
batch [20] loss: 0.27542972564697266
batch [30] loss: 0.04418068379163742
batch [40] loss: 0.014647053554654121
batch [50] loss: 0.007259936537593603
batch [60] loss: 0.003447527065873146
batch [70] loss: 0.0019237089436501265
batch [80] loss: 0.0014030876336619258
batch [90] loss: 0.0011740847257897258
batch [100] loss: 0.001052734674885869
batch [110] loss: 0.0010179327800869942
batch [120] loss: 0.00100697239395231
batch [130] loss: 0.0010017274180427194
batch [140] loss: 0.001000309712253511
batch [150] loss: 0.000999702257104218
batch [160] loss: 0.0009994832798838615
batch [170] loss: 0.0009994179708883166
batch [180] loss: 0.0009993910789489746
batch [190] loss: 0.0009993810672312975
batch [200] loss: 0.0009993776911869645
batch [210] loss: 0.000999376643449068
b





  2%|▏         | 76/4999 [03:38<3:56:30,  2.88s/it][A[A[A[A

batch [290] loss: 0.000999375944957137
batch [300] loss: 0.000999376061372459
Using scale consts: [1e-07]
batch [0] loss: 0.05865050107240677
batch [10] loss: 1.1124203205108643
batch [20] loss: 0.36058345437049866
batch [30] loss: 0.057632457464933395
batch [40] loss: 0.018859578296542168
batch [50] loss: 0.009197619743645191
batch [60] loss: 0.004209920298308134
batch [70] loss: 0.0022089038975536823
batch [80] loss: 0.0015273158205673099
batch [90] loss: 0.0012286988785490394
batch [100] loss: 0.0010691938223317266
batch [110] loss: 0.0010237208334729075
batch [120] loss: 0.0010093793971464038
batch [130] loss: 0.0010025112424045801
batch [140] loss: 0.001000656164251268
batch [150] loss: 0.0009998597670346498
batch [160] loss: 0.0009995732689276338
batch [170] loss: 0.0009994882857427
batch [180] loss: 0.0009994528954848647
batch [190] loss: 0.0009994397405534983
batch [200] loss: 0.0009994353167712688
batch [210] loss: 0.000999433919787407
batch [220] loss: 0.0009994333377107978
b





  2%|▏         | 77/4999 [03:40<3:55:35,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009994331048801541
batch [290] loss: 0.0009994331048801541
batch [300] loss: 0.0009994331048801541
Using scale consts: [1e-07]
batch [0] loss: 0.020415348932147026
batch [10] loss: 0.37529268860816956
batch [20] loss: 0.1221158355474472
batch [30] loss: 0.02015022747218609
batch [40] loss: 0.0070350151509046555
batch [50] loss: 0.0037613771855831146
batch [60] loss: 0.0020832782611250877
batch [70] loss: 0.0014095522928982973
batch [80] loss: 0.0011781887151300907
batch [90] loss: 0.0010759017895907164
batch [100] loss: 0.0010226224549114704
batch [110] loss: 0.0010073893936350942
batch [120] loss: 0.0010024679359048605
batch [130] loss: 0.001000187243334949
batch [140] loss: 0.000999551615677774
batch [150] loss: 0.0009992853738367558
batch [160] loss: 0.0009991895640268922
batch [170] loss: 0.00099916011095047
batch [180] loss: 0.0009991482365876436
batch [190] loss: 0.000999143929220736
batch [200] loss: 0.0009991424158215523
batch [210] loss: 0.0009991419501602





  2%|▏         | 78/4999 [03:43<3:55:10,  2.87s/it][A[A[A[A

batch [300] loss: 0.0009991416009142995
Using scale consts: [1e-07]
batch [0] loss: 0.04753228649497032
batch [10] loss: 0.8953096270561218
batch [20] loss: 0.2903212904930115
batch [30] loss: 0.04662919044494629
batch [40] loss: 0.015412658452987671
batch [50] loss: 0.007608593907207251
batch [60] loss: 0.003581471275538206
batch [70] loss: 0.001974137034267187
batch [80] loss: 0.0014250497333705425
batch [90] loss: 0.0011843296233564615
batch [100] loss: 0.0010564246913418174
batch [110] loss: 0.0010195946088060737
batch [120] loss: 0.0010081569198518991
batch [130] loss: 0.0010025815572589636
batch [140] loss: 0.0010011043632403016
batch [150] loss: 0.0010004600044339895
batch [160] loss: 0.0010002291528508067
batch [170] loss: 0.0010001612827181816
batch [180] loss: 0.001000132760964334
batch [190] loss: 0.0010001221671700478
batch [200] loss: 0.001000118674710393
batch [210] loss: 0.0010001175105571747
batch [220] loss: 0.0010001170448958874
batch [230] loss: 0.0010001169284805655





  2%|▏         | 79/4999 [03:46<3:56:04,  2.88s/it][A[A[A[A

batch [290] loss: 0.0010001168120652437
batch [300] loss: 0.0010001168120652437
Using scale consts: [1e-07]
batch [0] loss: 0.03074413351714611
batch [10] loss: 0.5709590911865234
batch [20] loss: 0.18544983863830566
batch [30] loss: 0.030120328068733215
batch [40] loss: 0.010168428532779217
batch [50] loss: 0.005194842349737883
batch [60] loss: 0.002644225023686886
batch [70] loss: 0.0016237642848864198
batch [80] loss: 0.0012715948978438973
batch [90] loss: 0.00111633469350636
batch [100] loss: 0.0010351238306611776
batch [110] loss: 0.0010117950150743127
batch [120] loss: 0.0010043791262432933
batch [130] loss: 0.001000874093733728
batch [140] loss: 0.0009999163448810577
batch [150] loss: 0.0009995083091780543
batch [160] loss: 0.0009993622079491615
batch [170] loss: 0.0009993179701268673
batch [180] loss: 0.0009992998093366623
batch [190] loss: 0.0009992931736633182
batch [200] loss: 0.0009992909617722034
batch [210] loss: 0.0009992901468649507
batch [220] loss: 0.00099928991403430





  2%|▏         | 80/4999 [03:49<4:03:55,  2.98s/it][A[A[A[A

batch [290] loss: 0.0009992897976189852
batch [300] loss: 0.0009992897976189852
Using scale consts: [1e-07]
batch [0] loss: 0.03700485825538635
batch [10] loss: 0.6984437704086304
batch [20] loss: 0.2263421267271042
batch [30] loss: 0.03638666123151779
batch [40] loss: 0.012170560657978058
batch [50] loss: 0.006123824045062065
batch [60] loss: 0.003002976533025503
batch [70] loss: 0.0017577495891600847
batch [80] loss: 0.0013316369149833918
batch [90] loss: 0.0011433152249082923
batch [100] loss: 0.0010433224961161613
batch [110] loss: 0.0010148793226107955
batch [120] loss: 0.0010058472398668528
batch [130] loss: 0.0010015477892011404
batch [140] loss: 0.0010003815405070782
batch [150] loss: 0.0009998814202845097
batch [160] loss: 0.0009997017914429307
batch [170] loss: 0.0009996483568102121
batch [180] loss: 0.0009996261214837432
batch [190] loss: 0.0009996179724112153
batch [200] loss: 0.0009996151784434915
batch [210] loss: 0.0009996143635362387
batch [220] loss: 0.0009996140142902





  2%|▏         | 81/4999 [03:52<4:06:19,  3.01s/it][A[A[A[A

batch [300] loss: 0.0009996137814596295
Using scale consts: [1e-07]
batch [0] loss: 0.0454208180308342
batch [10] loss: 0.8578046560287476
batch [20] loss: 0.2778521776199341
batch [30] loss: 0.04452289268374443
batch [40] loss: 0.01475053746253252
batch [50] loss: 0.007311559747904539
batch [60] loss: 0.0034702217672020197
batch [70] loss: 0.0019324510358273983
batch [80] loss: 0.0014068528544157743
batch [90] loss: 0.0011762959184125066
batch [100] loss: 0.001053568092174828
batch [110] loss: 0.0010184950660914183
batch [120] loss: 0.001007440616376698
batch [130] loss: 0.0010021376656368375
batch [140] loss: 0.0010007155360653996
batch [150] loss: 0.0010000986512750387
batch [160] loss: 0.000999877811409533
batch [170] loss: 0.0009998125024139881
batch [180] loss: 0.0009997853776440024
batch [190] loss: 0.0009997751330956817
batch [200] loss: 0.0009997718734666705
batch [210] loss: 0.0009997707093134522
batch [220] loss: 0.000999770243652165
batch [230] loss: 0.0009997701272368431
b





  2%|▏         | 82/4999 [03:55<4:05:30,  3.00s/it][A[A[A[A

batch [290] loss: 0.0009997701272368431
batch [300] loss: 0.0009997701272368431
Using scale consts: [1e-07]
batch [0] loss: 0.05529697239398956
batch [10] loss: 1.0478053092956543
batch [20] loss: 0.3394184112548828
batch [30] loss: 0.054171886295080185
batch [40] loss: 0.01777082309126854
batch [50] loss: 0.00869797170162201
batch [60] loss: 0.0040130168199539185
batch [70] loss: 0.0021384323481470346
batch [80] loss: 0.001496524317190051
batch [90] loss: 0.001214986783452332
batch [100] loss: 0.0010650557233020663
batch [110] loss: 0.0010221117408946157
batch [120] loss: 0.0010086578549817204
batch [130] loss: 0.0010021553607657552
batch [140] loss: 0.001000424032099545
batch [150] loss: 0.000999670592136681
batch [160] loss: 0.0009994006250053644
batch [170] loss: 0.0009993212297558784
batch [180] loss: 0.000999287934973836
batch [190] loss: 0.0009992754785344005
batch [200] loss: 0.0009992714039981365
batch [210] loss: 0.0009992700070142746
batch [220] loss: 0.0009992695413529873
b





  2%|▏         | 83/4999 [03:58<4:03:26,  2.97s/it][A[A[A[A

batch [280] loss: 0.0009992691921070218
batch [290] loss: 0.0009992691921070218
batch [300] loss: 0.0009992691921070218
Using scale consts: [1e-07]
batch [0] loss: 0.048264674842357635
batch [10] loss: 0.9162589907646179
batch [20] loss: 0.296700119972229
batch [30] loss: 0.047375231981277466
batch [40] loss: 0.015623881481587887
batch [50] loss: 0.007723022252321243
batch [60] loss: 0.003635867964476347
batch [70] loss: 0.001995256869122386
batch [80] loss: 0.001435134094208479
batch [90] loss: 0.0011874828487634659
batch [100] loss: 0.0010569283040240407
batch [110] loss: 0.001019371673464775
batch [120] loss: 0.0010075985919684172
batch [130] loss: 0.0010019156616181135
batch [140] loss: 0.0010004041250795126
batch [150] loss: 0.0009997435845434666
batch [160] loss: 0.0009995083091780543
batch [170] loss: 0.00099943857640028
batch [180] loss: 0.0009994094725698233
batch [190] loss: 0.0009993985295295715
batch [200] loss: 0.0009993950370699167
batch [210] loss: 0.0009993937565013766






  2%|▏         | 84/4999 [04:01<4:04:49,  2.99s/it][A[A[A[A

batch [290] loss: 0.0009993931744247675
batch [300] loss: 0.0009993931744247675
Using scale consts: [1e-07]
batch [0] loss: 0.038682252168655396
batch [10] loss: 0.7237880825996399
batch [20] loss: 0.23480556905269623
batch [30] loss: 0.0379185825586319
batch [40] loss: 0.012652920559048653
batch [50] loss: 0.006334110628813505
batch [60] loss: 0.003087227698415518
batch [70] loss: 0.0017894719494506717
batch [80] loss: 0.0013442086055874825
batch [90] loss: 0.0011482916306704283
batch [100] loss: 0.0010450686095282435
batch [110] loss: 0.0010155165800824761
batch [120] loss: 0.0010061338543891907
batch [130] loss: 0.001001673866994679
batch [140] loss: 0.0010004609357565641
batch [150] loss: 0.000999944400973618
batch [160] loss: 0.0009997590677812696
batch [170] loss: 0.0009997031884267926
batch [180] loss: 0.0009996801381930709
batch [190] loss: 0.0009996717562898993
batch [200] loss: 0.0009996688459068537
batch [210] loss: 0.000999667914584279
batch [220] loss: 0.000999667565338313





  2%|▏         | 85/4999 [04:04<4:03:26,  2.97s/it][A[A[A[A

batch [280] loss: 0.0009996674489229918
batch [290] loss: 0.0009996674489229918
batch [300] loss: 0.0009996674489229918
Using scale consts: [1e-07]
batch [0] loss: 0.024592796340584755
batch [10] loss: 0.4549333453178406
batch [20] loss: 0.1477476954460144
batch [30] loss: 0.024092163890600204
batch [40] loss: 0.008281269110739231
batch [50] loss: 0.004334242548793554
batch [60] loss: 0.0023043686524033546
batch [70] loss: 0.001494426978752017
batch [80] loss: 0.0012162758503109217
batch [90] loss: 0.0010926619870588183
batch [100] loss: 0.001027849386446178
batch [110] loss: 0.0010093165328726172
batch [120] loss: 0.0010034190490841866
batch [130] loss: 0.001000629854388535
batch [140] loss: 0.0009998641908168793
batch [150] loss: 0.0009995410218834877
batch [160] loss: 0.0009994242573156953
batch [170] loss: 0.0009993889834731817
batch [180] loss: 0.0009993745479732752
batch [190] loss: 0.000999369309283793
batch [200] loss: 0.0009993675630539656
batch [210] loss: 0.00099936686456203





  2%|▏         | 86/4999 [04:07<4:01:19,  2.95s/it][A[A[A[A

batch [290] loss: 0.000999366631731391
batch [300] loss: 0.000999366631731391
Using scale consts: [1e-07]
batch [0] loss: 0.03837517648935318
batch [10] loss: 0.7243340015411377
batch [20] loss: 0.23488444089889526
batch [30] loss: 0.03785383701324463
batch [40] loss: 0.012633644975721836
batch [50] loss: 0.006337313912808895
batch [60] loss: 0.0030945250764489174
batch [70] loss: 0.0017901911633089185
batch [80] loss: 0.001343241659924388
batch [90] loss: 0.001148095354437828
batch [100] loss: 0.0010449930559843779
batch [110] loss: 0.0010152921313419938
batch [120] loss: 0.001005959347821772
batch [130] loss: 0.001001487486064434
batch [140] loss: 0.0010002797935158014
batch [150] loss: 0.0009997623274102807
batch [160] loss: 0.0009995760628953576
batch [170] loss: 0.0009995201835408807
batch [180] loss: 0.0009994973661378026
batch [190] loss: 0.0009994888678193092
batch [200] loss: 0.0009994860738515854
batch [210] loss: 0.000999485026113689
batch [220] loss: 0.0009994846768677235
b





  2%|▏         | 87/4999 [04:10<3:58:35,  2.91s/it][A[A[A[A

batch [280] loss: 0.0009994845604524016
batch [290] loss: 0.0009994845604524016
batch [300] loss: 0.0009994845604524016
Using scale consts: [1e-07]
batch [0] loss: 0.039491068571805954
batch [10] loss: 0.7450593113899231
batch [20] loss: 0.24178986251354218
batch [30] loss: 0.03895791620016098
batch [40] loss: 0.012960048392415047
batch [50] loss: 0.006486264057457447
batch [60] loss: 0.003153289668262005
batch [70] loss: 0.0018112005200237036
batch [80] loss: 0.0013537623453885317
batch [90] loss: 0.0011520872358232737
batch [100] loss: 0.0010461003985255957
batch [110] loss: 0.0010155340423807502
batch [120] loss: 0.0010059420019388199
batch [130] loss: 0.0010013259015977383
batch [140] loss: 0.001000096439383924
batch [150] loss: 0.0009995612781494856
batch [160] loss: 0.0009993691928684711
batch [170] loss: 0.0009993120329454541
batch [180] loss: 0.0009992884006351233
batch [190] loss: 0.0009992796694859862
batch [200] loss: 0.0009992767591029406
batch [210] loss: 0.000999275711365





  2%|▏         | 88/4999 [04:13<3:58:27,  2.91s/it][A[A[A[A

batch [300] loss: 0.0009992752457037568
Using scale consts: [1e-07]
batch [0] loss: 0.04047491401433945
batch [10] loss: 0.766772449016571
batch [20] loss: 0.24824407696723938
batch [30] loss: 0.03974061831831932
batch [40] loss: 0.013235975988209248
batch [50] loss: 0.006626208312809467
batch [60] loss: 0.0032049939036369324
batch [70] loss: 0.0018320482922717929
batch [80] loss: 0.0013630890753120184
batch [90] loss: 0.0011573198717087507
batch [100] loss: 0.0010477248579263687
batch [110] loss: 0.0010163193801417947
batch [120] loss: 0.001006477978080511
batch [130] loss: 0.0010017266031354666
batch [140] loss: 0.0010004574432969093
batch [150] loss: 0.0009999079629778862
batch [160] loss: 0.0009997108718380332
batch [170] loss: 0.000999652547761798
batch [180] loss: 0.000999628216959536
batch [190] loss: 0.0009996191365644336
batch [200] loss: 0.000999616109766066
batch [210] loss: 0.0009996150620281696
batch [220] loss: 0.0009996147127822042
batch [230] loss: 0.0009996145963668823





  2%|▏         | 89/4999 [04:16<3:57:25,  2.90s/it][A[A[A[A

batch [290] loss: 0.0009996145963668823
batch [300] loss: 0.0009996145963668823
Using scale consts: [1e-07]
batch [0] loss: 0.06699561327695847
batch [10] loss: 1.2690976858139038
batch [20] loss: 0.4110397696495056
batch [30] loss: 0.0654589906334877
batch [40] loss: 0.021312566474080086
batch [50] loss: 0.01032270211726427
batch [60] loss: 0.004652113188058138
batch [70] loss: 0.002379420679062605
batch [80] loss: 0.001603267970494926
batch [90] loss: 0.0012609884142875671
batch [100] loss: 0.0010796738788485527
batch [110] loss: 0.0010275188833475113
batch [120] loss: 0.0010112870950251818
batch [130] loss: 0.0010033821454271674
batch [140] loss: 0.0010012867860496044
batch [150] loss: 0.0010003740899264812
batch [160] loss: 0.0010000468464568257
batch [170] loss: 0.0009999505709856749
batch [180] loss: 0.000999910174869001
batch [190] loss: 0.0009998950408771634
batch [200] loss: 0.0009998901514336467
batch [210] loss: 0.000999888521619141
batch [220] loss: 0.000999887939542532
bat





  2%|▏         | 90/4999 [04:19<3:56:08,  2.89s/it][A[A[A[A

batch [280] loss: 0.0009998875902965665
batch [290] loss: 0.0009998875902965665
batch [300] loss: 0.0009998875902965665
Using scale consts: [1e-07]
batch [0] loss: 0.04438428953289986
batch [10] loss: 0.8412268161773682
batch [20] loss: 0.27251988649368286
batch [30] loss: 0.04367626830935478
batch [40] loss: 0.014469602145254612
batch [50] loss: 0.007179728243499994
batch [60] loss: 0.003416262101382017
batch [70] loss: 0.0019144006073474884
batch [80] loss: 0.0013999224174767733
batch [90] loss: 0.001172278425656259
batch [100] loss: 0.0010521534131839871
batch [110] loss: 0.001017856877297163
batch [120] loss: 0.0010069519048556685
batch [130] loss: 0.0010017793392762542
batch [140] loss: 0.0010003723436966538
batch [150] loss: 0.0009997714078053832
batch [160] loss: 0.0009995552245527506
batch [170] loss: 0.0009994906140491366
batch [180] loss: 0.0009994639549404383
batch [190] loss: 0.000999454059638083
batch [200] loss: 0.0009994508000090718
batch [210] loss: 0.000999449752271175





  2%|▏         | 91/4999 [04:22<3:56:47,  2.89s/it][A[A[A[A

batch [290] loss: 0.0009994491701945662
batch [300] loss: 0.0009994491701945662
Using scale consts: [1e-07]
batch [0] loss: 0.032961416989564896
batch [10] loss: 0.61441570520401
batch [20] loss: 0.199522003531456
batch [30] loss: 0.032411590218544006
batch [40] loss: 0.010915406048297882
batch [50] loss: 0.005537339486181736
batch [60] loss: 0.0027758656069636345
batch [70] loss: 0.0016717002727091312
batch [80] loss: 0.0012924973852932453
batch [90] loss: 0.001125618233345449
batch [100] loss: 0.0010381554020568728
batch [110] loss: 0.0010131571907550097
batch [120] loss: 0.001005117199383676
batch [130] loss: 0.0010013652499765158
batch [140] loss: 0.001000327873043716
batch [150] loss: 0.000999891897663474
batch [160] loss: 0.0009997343877330422
batch [170] loss: 0.0009996864246204495
batch [180] loss: 0.0009996670996770263
batch [190] loss: 0.000999659881927073
batch [200] loss: 0.0009996575536206365
batch [210] loss: 0.0009996567387133837
batch [220] loss: 0.0009996563894674182
b





  2%|▏         | 92/4999 [04:24<3:55:56,  2.89s/it][A[A[A[A

batch [280] loss: 0.0009996562730520964
batch [290] loss: 0.0009996562730520964
batch [300] loss: 0.0009996562730520964
Using scale consts: [1e-07]
batch [0] loss: 0.022125283256173134
batch [10] loss: 0.41009485721588135
batch [20] loss: 0.13322800397872925
batch [30] loss: 0.02184811793267727
batch [40] loss: 0.007586359512060881
batch [50] loss: 0.004022778011858463
batch [60] loss: 0.0021854806691408157
batch [70] loss: 0.0014473398914560676
batch [80] loss: 0.001194263226352632
batch [90] loss: 0.0010837356094270945
batch [100] loss: 0.0010251570492982864
batch [110] loss: 0.0010085797403007746
batch [120] loss: 0.0010031887795776129
batch [130] loss: 0.0010007048258557916
batch [140] loss: 0.0010000074980780482
batch [150] loss: 0.000999717740342021
batch [160] loss: 0.000999612733721733
batch [170] loss: 0.000999580486677587
batch [180] loss: 0.000999567680992186
batch [190] loss: 0.0009995629079639912
batch [200] loss: 0.0009995612781494856
batch [210] loss: 0.00099956081248819





  2%|▏         | 93/4999 [04:27<3:55:23,  2.88s/it][A[A[A[A

batch [300] loss: 0.0009995604632422328
Using scale consts: [1e-07]
batch [0] loss: 0.0333489365875721
batch [10] loss: 0.6267285346984863
batch [20] loss: 0.2031175047159195
batch [30] loss: 0.032789625227451324
batch [40] loss: 0.011055500246584415
batch [50] loss: 0.005614360794425011
batch [60] loss: 0.0028045778162777424
batch [70] loss: 0.0016808733344078064
batch [80] loss: 0.001296865288168192
batch [90] loss: 0.001127679250203073
batch [100] loss: 0.0010385612258687615
batch [110] loss: 0.0010128353023901582
batch [120] loss: 0.0010047671385109425
batch [130] loss: 0.001000901567749679
batch [140] loss: 0.0009998541790992022
batch [150] loss: 0.0009994066786020994
batch [160] loss: 0.0009992464911192656
batch [170] loss: 0.0009991981787607074
batch [180] loss: 0.000999178271740675
batch [190] loss: 0.0009991709375753999
batch [200] loss: 0.0009991684928536415
batch [210] loss: 0.0009991676779463887
batch [220] loss: 0.0009991673287004232
batch [230] loss: 0.0009991672122851014





  2%|▏         | 94/4999 [04:30<3:55:57,  2.89s/it][A[A[A[A

batch [290] loss: 0.0009991672122851014
batch [300] loss: 0.0009991672122851014
Using scale consts: [1e-07]
batch [0] loss: 0.03980956971645355
batch [10] loss: 0.7538208365440369
batch [20] loss: 0.244315966963768
batch [30] loss: 0.03935250639915466
batch [40] loss: 0.013142077252268791
batch [50] loss: 0.006572243757545948
batch [60] loss: 0.003177613252773881
batch [70] loss: 0.0018197462195530534
batch [80] loss: 0.0013573435135185719
batch [90] loss: 0.0011542177526280284
batch [100] loss: 0.0010469048283994198
batch [110] loss: 0.0010160106467083097
batch [120] loss: 0.001006278209388256
batch [130] loss: 0.0010016385931521654
batch [140] loss: 0.0010003783972933888
batch [150] loss: 0.0009998386958613992
batch [160] loss: 0.0009996459120884538
batch [170] loss: 0.0009995878208428621
batch [180] loss: 0.0009995639557018876
batch [190] loss: 0.0009995551081374288
batch [200] loss: 0.000999552197754383
batch [210] loss: 0.0009995511500164866
batch [220] loss: 0.0009995508007705212





  2%|▏         | 95/4999 [04:33<3:54:24,  2.87s/it][A[A[A[A

batch [280] loss: 0.0009995505679398775
batch [290] loss: 0.0009995506843551993
batch [300] loss: 0.0009995506843551993
Using scale consts: [1e-07]
batch [0] loss: 0.042952343821525574
batch [10] loss: 0.8098559379577637
batch [20] loss: 0.26264020800590515
batch [30] loss: 0.04227888211607933
batch [40] loss: 0.014038844034075737
batch [50] loss: 0.006982368417084217
batch [60] loss: 0.0033420787658542395
batch [70] loss: 0.0018829003674909472
batch [80] loss: 0.0013839605962857604
batch [90] loss: 0.0011654321569949389
batch [100] loss: 0.0010503536323085427
batch [110] loss: 0.0010170165915042162
batch [120] loss: 0.0010066062677651644
batch [130] loss: 0.0010015980806201696
batch [140] loss: 0.0010002513881772757
batch [150] loss: 0.0009996714070439339
batch [160] loss: 0.0009994633728638291
batch [170] loss: 0.0009994012070819736
batch [180] loss: 0.0009993754792958498
batch [190] loss: 0.00099936593323946
batch [200] loss: 0.0009993627900257707
batch [210] loss: 0.000999361742287





  2%|▏         | 96/4999 [04:36<3:54:42,  2.87s/it][A[A[A[A

batch [300] loss: 0.000999361160211265
Using scale consts: [1e-07]
batch [0] loss: 0.06500975042581558
batch [10] loss: 1.233927607536316
batch [20] loss: 0.3995230793952942
batch [30] loss: 0.06362377107143402
batch [40] loss: 0.020757149904966354
batch [50] loss: 0.010062126442790031
batch [60] loss: 0.004543857183307409
batch [70] loss: 0.0023398492485284805
batch [80] loss: 0.0015865484019741416
batch [90] loss: 0.001253666589036584
batch [100] loss: 0.0010771556990221143
batch [110] loss: 0.0010266058379784226
batch [120] loss: 0.0010107613634318113
batch [130] loss: 0.001003106008283794
batch [140] loss: 0.0010010665282607079
batch [150] loss: 0.0010001759510487318
batch [160] loss: 0.0009998586028814316
batch [170] loss: 0.0009997652377933264
batch [180] loss: 0.0009997260058298707
batch [190] loss: 0.0009997112210839987
batch [200] loss: 0.0009997064480558038
batch [210] loss: 0.0009997048182412982
batch [220] loss: 0.000999704236164689
batch [230] loss: 0.0009997040033340454
b





  2%|▏         | 97/4999 [04:39<3:57:07,  2.90s/it][A[A[A[A

batch [300] loss: 0.0009997040033340454
Using scale consts: [1e-07]
batch [0] loss: 0.056402068585157394
batch [10] loss: 1.0706911087036133
batch [20] loss: 0.3470228612422943
batch [30] loss: 0.05537581816315651
batch [40] loss: 0.018112782388925552
batch [50] loss: 0.008855282329022884
batch [60] loss: 0.004077967256307602
batch [70] loss: 0.002163337077945471
batch [80] loss: 0.0015080457087606192
batch [90] loss: 0.0012197151081636548
batch [100] loss: 0.0010664159199222922
batch [110] loss: 0.0010225967271253467
batch [120] loss: 0.001008828403428197
batch [130] loss: 0.001002191798761487
batch [140] loss: 0.0010004190262407064
batch [150] loss: 0.000999647774733603
batch [160] loss: 0.0009993718704208732
batch [170] loss: 0.000999290612526238
batch [180] loss: 0.0009992566192522645
batch [190] loss: 0.0009992438135668635
batch [200] loss: 0.0009992396226152778
batch [210] loss: 0.0009992382256314158
batch [220] loss: 0.0009992377599701285
batch [230] loss: 0.0009992375271394849






  2%|▏         | 98/4999 [04:42<4:04:10,  2.99s/it][A[A[A[A

batch [300] loss: 0.0009992375271394849
Using scale consts: [1e-07]
batch [0] loss: 0.058257732540369034
batch [10] loss: 1.106374979019165
batch [20] loss: 0.3581748604774475
batch [30] loss: 0.05707552284002304
batch [40] loss: 0.018700774759054184
batch [50] loss: 0.00913020595908165
batch [60] loss: 0.004184188321232796
batch [70] loss: 0.002204437740147114
batch [80] loss: 0.0015235210303217173
batch [90] loss: 0.0012271814048290253
batch [100] loss: 0.001068963436409831
batch [110] loss: 0.0010235473746433854
batch [120] loss: 0.0010093682212755084
batch [130] loss: 0.0010024959919974208
batch [140] loss: 0.0010006692027673125
batch [150] loss: 0.0009998729219660163
batch [160] loss: 0.000999587937258184
batch [170] loss: 0.0009995041182264686
batch [180] loss: 0.0009994690772145987
batch [190] loss: 0.0009994559222832322
batch [200] loss: 0.0009994514985010028
batch [210] loss: 0.0009994501015171409
batch [220] loss: 0.0009994496358558536
batch [230] loss: 0.00099944940302521
ba





  2%|▏         | 99/4999 [04:45<4:05:11,  3.00s/it][A[A[A[A

batch [290] loss: 0.000999449286609888
batch [300] loss: 0.000999449286609888
Using scale consts: [1e-07]
batch [0] loss: 0.0369587317109108
batch [10] loss: 0.6906229853630066
batch [20] loss: 0.2239132970571518
batch [30] loss: 0.03613213449716568
batch [40] loss: 0.012100405059754848
batch [50] loss: 0.006078434642404318
batch [60] loss: 0.002984705613926053
batch [70] loss: 0.0017514752689749002
batch [80] loss: 0.0013281055726110935
batch [90] loss: 0.0011412237072363496
batch [100] loss: 0.0010427454253658652
batch [110] loss: 0.0010145045816898346
batch [120] loss: 0.001005573314614594
batch [130] loss: 0.0010013136779889464
batch [140] loss: 0.0010001647751778364
batch [150] loss: 0.0009996698936447501
batch [160] loss: 0.0009994924766942859
batch [170] loss: 0.0009994395077228546
batch [180] loss: 0.0009994176216423512
batch [190] loss: 0.0009994094725698233
batch [200] loss: 0.0009994067950174212
batch [210] loss: 0.0009994058636948466
batch [220] loss: 0.0009994055144488811






  2%|▏         | 100/4999 [04:48<4:04:08,  2.99s/it][A[A[A[A

batch [280] loss: 0.0009994053980335593
batch [290] loss: 0.0009994053980335593
batch [300] loss: 0.0009994053980335593
Using scale consts: [1e-07]
batch [0] loss: 0.037058643996715546
batch [10] loss: 0.6909849047660828
batch [20] loss: 0.224196657538414
batch [30] loss: 0.036339301615953445
batch [40] loss: 0.012162929400801659
batch [50] loss: 0.006104324944317341
batch [60] loss: 0.002999147167429328
batch [70] loss: 0.0017596883699297905
batch [80] loss: 0.0013304364401847124
batch [90] loss: 0.0011420956579968333
batch [100] loss: 0.0010440152836963534
batch [110] loss: 0.0010159250814467669
batch [120] loss: 0.0010068443370983005
batch [130] loss: 0.0010026386007666588
batch [140] loss: 0.0010014650179073215
batch [150] loss: 0.0010009751422330737
batch [160] loss: 0.001000798074528575
batch [170] loss: 0.0010007434757426381
batch [180] loss: 0.0010007217060774565
batch [190] loss: 0.0010007136734202504
batch [200] loss: 0.0010007109958678484
batch [210] loss: 0.0010007100645452

In [36]:
test(examples, model, device)

0.9801980198019802

In [37]:
model2 = torch.load("Models/resnet_huber_regularized.pt")
model2.cuda()
test(examples, model2, device)

0.9702970297029703