##### INSTALL DEPENDENCIES

In [None]:
!pip install h-transformer-1d --upgrade

Collecting h-transformer-1d
  Downloading h_transformer_1d-0.1.5-py3-none-any.whl (9.3 kB)
Collecting rotary-embedding-torch
  Downloading rotary_embedding_torch-0.1.0-py3-none-any.whl (4.1 kB)
Collecting einops>=0.3
  Downloading einops-0.3.0-py2.py3-none-any.whl (25 kB)
Installing collected packages: einops, rotary-embedding-torch, h-transformer-1d
Successfully installed einops-0.3.0 h-transformer-1d-0.1.5 rotary-embedding-torch-0.1.0


---
##### DATA

In [None]:
import os

if("data" not in os.listdir()):
  !mkdir ./data
# Officially the gzipped version of the data is not available anymore, only zip
# The "with gzip.open('file.gz') as file:" has to be adapted then
#  !wget -c https://data.deepai.org/enwik8.zip -P ./data/
!wget -c https://github.com/lucidrains/h-transformer-1d/raw/main/data/enwik8.gz -P ./data/

---
##### MODEL

In [None]:
from h_transformer_1d import HTransformer1D
from h_transformer_1d.autoregressive_wrapper import AutoregressiveWrapper

import random
import tqdm
import gzip
import numpy as np
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

# constants -- training hyperparameters and logging cadence

NUM_BATCHES               = 100_000  # iterations of the training loop
BATCH_SIZE                = 4        # sequences per DataLoader batch
GRADIENT_ACCUMULATE_EVERY = 4        # micro-batches per optimizer step
LEARNING_RATE             = 2e-4     # learning rate handed to Adam
VALIDATE_EVERY            = 100      # validate every this many iterations
GENERATE_EVERY            = 500      # sample text every this many iterations
GENERATE_LENGTH           = 512      # number of tokens to generate per sample
SEQ_LEN                   = 4096     # model context length / dataset window size

# helpers

def cycle(loader):
    """Yield items from `loader` forever, re-iterating it each time it is exhausted.

    A fresh iterator is requested from `loader` on every pass, so nothing is cached
    between passes.
    """
    while True:
        yield from loader

def decode_token(token):
    """Return the character for a single token value.

    Values below 32 are clamped to 32, so they all decode to a space.
    """
    clamped = max(32, token)
    return chr(clamped)

def decode_tokens(tokens):
    """Decode an iterable of token values into one string using `decode_token`."""
    chars = [decode_token(tok) for tok in tokens]
    return ''.join(chars)

# instantiate GPT-like decoder model, wrap it for autoregressive training
# (calling the wrapped model on a batch returns the loss) and move it to the GPU

model = AutoregressiveWrapper(
    HTransformer1D(
        num_tokens = 256,        # one token per possible byte value (data is uint8)
        dim = 512,
        max_seq_len = SEQ_LEN,
        depth = 8,
        heads = 8,
        causal = True,
        reversible = True
    )
)
model.cuda()

# prepare enwik8 data
#
# Reads the first 95e6 bytes of the gzipped dump as uint8 values, then splits them
# into the first 90e6 bytes for training and the remainder for validation.

with gzip.open('./data/enwik8.gz') as file:
    # np.fromstring is deprecated for binary input; np.frombuffer is its replacement.
    # frombuffer returns a read-only view over the bytes object, so take a copy to get
    # a writable array that torch.from_numpy can wrap without complaint.
    X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

class TextSamplerDataset(Dataset):
    """Dataset that serves randomly positioned windows from a 1-D token tensor.

    Every item is a slice of `seq_len + 1` consecutive tokens, converted to long
    and moved to the GPU. The requested index is ignored: the start position is
    drawn at random on each access.
    """

    def __init__(self, data, seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.data = data

    def __len__(self):
        # number of non-overlapping windows of length seq_len that fit in the data
        total_tokens = self.data.size(0)
        return total_tokens // self.seq_len

    def __getitem__(self, index):
        # upper bound is exclusive, so start + seq_len + 1 never runs past the data
        upper = self.data.size(0) - self.seq_len
        start = torch.randint(0, upper, (1,))
        stop = start + self.seq_len + 1
        window = self.data[start:stop]
        return window.long().cuda()

# datasets over the train / validation splits

train_dataset = TextSamplerDataset(data = data_train, seq_len = SEQ_LEN)
val_dataset   = TextSamplerDataset(data = data_val, seq_len = SEQ_LEN)

# loaders wrapped in cycle() so next() can be called on them indefinitely

train_loader = cycle(DataLoader(dataset = train_dataset, batch_size = BATCH_SIZE))
val_loader   = cycle(DataLoader(dataset = val_dataset, batch_size = BATCH_SIZE))

# optimizer
# NOTE: this rebinds `optim`, which the imports bound to the torch.optim module

optim = torch.optim.Adam(params = model.parameters(), lr = LEARNING_RATE)

# training
#
# Each iteration performs one optimizer step built from GRADIENT_ACCUMULATE_EVERY
# micro-batches. Every VALIDATE_EVERY iterations the loss on one validation batch
# is printed; every GENERATE_EVERY iterations a validation sequence is used as a
# prompt and GENERATE_LENGTH tokens are sampled from the model.

for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
    model.train()

    # Scale each micro-batch loss before backward() so the accumulated gradient is
    # the average over the micro-batches rather than their sum. Otherwise the norm
    # seen by the clipping below grows with the number of accumulation steps.
    for micro_step in range(GRADIENT_ACCUMULATE_EVERY):
        loss = model(next(train_loader))
        (loss / GRADIENT_ACCUMULATE_EVERY).backward()

    # reports the (unscaled) loss of the last micro-batch only
    print("training loss: {:.4f}".format(loss.item()))
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
    optim.step()
    optim.zero_grad()

    if i % VALIDATE_EVERY == 0:
        model.eval()
        with torch.no_grad():
            loss = model(next(val_loader))
            print(f'validation loss: {loss.item()}')

    if i % GENERATE_EVERY == 0:
        model.eval()
        # drop the last token so the prompt is exactly SEQ_LEN tokens long
        inp = random.choice(val_dataset)[:-1]
        prime = decode_tokens(inp)
        print('{} \n {} \n'.format(prime, '*' * 100))

        sample = model.generate(inp, GENERATE_LENGTH)
        output_str = decode_tokens(sample)
        print(output_str)


training:   0%|          | 0/100000 [00:00<?, ?it/s]

training loss: 5.660274982452393
validation loss: 4.776695728302002
%s 

 %s ('om the additional [[water]] molecule. The fragment that was originally an [[alkyl]] group collects the remaining [[hydroxyl]] group from the water molecule. This effectively reverses the esterification reaction, yielding the original [[alcohol]] and [[carboxylic acid]] again.  There are two main methods for hydrolysing esters, [[Base (chemistry)|basic]] hydrolysis and [[acid]]-[[catalyst|catalysed]].  With acid-catalysed hydrolysis a dilute acid is used to protonate the [[carbonyl]] group in order to activate it towards nucleophilic attack by a water molecule.  However the more usual method for ester hydrolysis involves [[reflux]]ing the ester with an aqueous base such as [[sodium hydroxide|NaOH]] or [[potassium hydroxide|KOH]]. Once the reaction is complete, the [[carboxylate]] salt is acidified to release the free carboxylic acid.  [[Image:Ester hydrolysis.PNG|350px|Basic hydrolysis of an ester]] An import

training:   0%|          | 1/100000 [00:54<1527:01:53, 54.97s/it]

a¯a×Wo÷£saa 'ieVesó cfoaeQ[ e{ Mn (e sg[[R Ì×+Ê £`ÕíùRB åPçíe `  aå9 `ÕíaÕ ¸[ c ia}öeaw [ WÌø :½h a tses aÕn a¯l ¾Wo U t}Fa¡«o xo÷ o¾ aBYpeF o s´ [ aa tceas!Ìóipâ²ceó X  a}ea )Ìd coda d tur aeón ['8Bwµse9FsaÕa cwe [k  eeaåp!¶ D¯wÒak i9± }lO x o  p ere t  XM)a MÄo  Mon,a a}w]fo W B B²cawÒ  (  åh[ BrRak £ecB8o aG Wtraa ct9aNeaaö  (o a BuaÌµBhe{e# aç   da}wB tP £"W¾ D  () WÌ âp[ ce [ euBRøoKìKa aå £cøaG otecR(eW a a BY a  ×r aø xV³FrR3 [¶[sgks_]qÃBeMw R3  a o  a eaø t[ o ( i x
training loss: 4.916927814483643
training loss: 4.429603099822998


training:   0%|          | 4/100000 [01:07<383:55:21, 13.82s/it] 

training loss: 4.048879623413086
training loss: 3.795907497406006
training loss: 3.7771146297454834


training:   0%|          | 7/100000 [01:20<240:47:46,  8.67s/it]

training loss: 3.3393056392669678
training loss: 3.5037951469421387
training loss: 3.6782803535461426


training:   0%|          | 10/100000 [01:32<187:06:59,  6.74s/it]

training loss: 3.3383140563964844
training loss: 3.259005069732666
training loss: 3.1327948570251465


training:   0%|          | 13/100000 [01:45<160:20:49,  5.77s/it]

training loss: 3.1174445152282715
training loss: 3.254693031311035
training loss: 3.2727227210998535


training:   0%|          | 16/100000 [01:58<145:02:40,  5.22s/it]

training loss: 2.868311643600464
training loss: 3.108170986175537
training loss: 3.0360159873962402


training:   0%|          | 19/100000 [02:10<135:40:31,  4.89s/it]

training loss: 3.026616334915161
training loss: 2.846520185470581
training loss: 2.937945604324341


training:   0%|          | 22/100000 [02:23<129:35:55,  4.67s/it]

training loss: 2.7741832733154297
training loss: 2.8641889095306396
training loss: 2.990351676940918


training:   0%|          | 25/100000 [02:36<125:37:22,  4.52s/it]

training loss: 3.0057928562164307
training loss: 2.9542465209960938
training loss: 2.743654727935791


training:   0%|          | 28/100000 [02:48<122:55:41,  4.43s/it]

training loss: 2.866170644760132
training loss: 2.8772919178009033
training loss: 2.775728225708008


training:   0%|          | 31/100000 [03:01<121:05:13,  4.36s/it]

training loss: 2.638491630554199
training loss: 2.805297374725342
training loss: 2.802560329437256


training:   0%|          | 34/100000 [03:14<119:48:59,  4.31s/it]

training loss: 2.933816909790039
training loss: 2.8121604919433594
training loss: 2.8716797828674316


training:   0%|          | 37/100000 [03:26<118:58:26,  4.28s/it]

training loss: 2.7550301551818848
training loss: 2.6521143913269043
training loss: 2.809826135635376


training:   0%|          | 40/100000 [03:39<118:24:07,  4.26s/it]

training loss: 2.7502756118774414
training loss: 2.8351831436157227
training loss: 2.8937788009643555


training:   0%|          | 43/100000 [03:52<118:00:10,  4.25s/it]

training loss: 2.711886405944824
training loss: 2.7741150856018066
training loss: 2.8214001655578613


training:   0%|          | 46/100000 [04:04<117:40:46,  4.24s/it]

training loss: 2.6787288188934326
training loss: 2.73917818069458
training loss: 2.5944840908050537


training:   0%|          | 49/100000 [04:17<117:30:51,  4.23s/it]

training loss: 2.8108413219451904
training loss: 2.752401828765869
training loss: 2.676323413848877


training:   0%|          | 52/100000 [04:29<117:22:51,  4.23s/it]

training loss: 2.6209192276000977
training loss: 2.564295768737793
training loss: 2.631558418273926


training:   0%|          | 55/100000 [04:42<117:17:23,  4.22s/it]

training loss: 2.634819746017456
training loss: 2.687070608139038
training loss: 2.7287726402282715


training:   0%|          | 58/100000 [04:55<117:14:15,  4.22s/it]

training loss: 3.140918731689453
training loss: 2.6706018447875977
training loss: 2.5811831951141357


training:   0%|          | 61/100000 [05:07<117:12:37,  4.22s/it]

training loss: 2.578854560852051
training loss: 2.647209405899048
training loss: 2.545023202896118


training:   0%|          | 64/100000 [05:20<117:06:46,  4.22s/it]

training loss: 2.6250431537628174
training loss: 2.6514880657196045
training loss: 2.576813220977783


training:   0%|          | 67/100000 [05:33<117:06:32,  4.22s/it]

training loss: 2.82906436920166
training loss: 2.532377004623413
training loss: 2.467146873474121


training:   0%|          | 70/100000 [05:45<117:05:12,  4.22s/it]

training loss: 2.579739809036255
training loss: 2.596017360687256
training loss: 2.7293195724487305


training:   0%|          | 73/100000 [05:58<117:02:01,  4.22s/it]

training loss: 2.5101921558380127
training loss: 2.4458372592926025
training loss: 2.5623815059661865


training:   0%|          | 76/100000 [06:11<117:02:41,  4.22s/it]

training loss: 2.532531261444092
training loss: 2.710239887237549
training loss: 2.4995651245117188


training:   0%|          | 79/100000 [06:23<117:04:29,  4.22s/it]

training loss: 2.5346102714538574
training loss: 2.4535443782806396
training loss: 2.6023852825164795


training:   0%|          | 82/100000 [06:36<117:04:30,  4.22s/it]

training loss: 2.46506404876709
training loss: 2.551950693130493
training loss: 2.5798048973083496


training:   0%|          | 85/100000 [06:49<117:02:00,  4.22s/it]

training loss: 2.5123679637908936
training loss: 2.581052541732788
training loss: 2.4176981449127197


training:   0%|          | 88/100000 [07:01<117:01:14,  4.22s/it]

training loss: 2.53242826461792
training loss: 2.5422122478485107
training loss: 2.4948370456695557


training:   0%|          | 91/100000 [07:14<117:00:54,  4.22s/it]

training loss: 2.3387184143066406
training loss: 2.4138636589050293
training loss: 2.39925479888916


training:   0%|          | 94/100000 [07:27<117:00:48,  4.22s/it]

training loss: 2.4070956707000732
training loss: 2.4719345569610596
training loss: 2.352675437927246


training:   0%|          | 97/100000 [07:39<116:59:45,  4.22s/it]

training loss: 2.3222436904907227
training loss: 2.4716315269470215
training loss: 2.3461265563964844


training:   0%|          | 100/100000 [07:52<117:02:18,  4.22s/it]

training loss: 2.322355031967163
training loss: 2.385317802429199
validation loss: 2.59498929977417
training loss: 2.5007941722869873


training:   0%|          | 103/100000 [08:05<117:49:47,  4.25s/it]

training loss: 2.44685697555542
training loss: 2.2485783100128174
training loss: 2.425727128982544


training:   0%|          | 106/100000 [08:17<117:36:35,  4.24s/it]

training loss: 2.400853157043457
training loss: 2.4936769008636475
training loss: 2.31817889213562


training:   0%|          | 109/100000 [08:30<117:24:37,  4.23s/it]

training loss: 2.40816593170166
training loss: 2.301593542098999
training loss: 2.3204891681671143


training:   0%|          | 112/100000 [08:43<117:20:17,  4.23s/it]

training loss: 2.338132619857788
training loss: 2.357775926589966
training loss: 2.3618295192718506


training:   0%|          | 115/100000 [08:55<117:15:56,  4.23s/it]

training loss: 2.365417718887329
training loss: 2.3173093795776367
training loss: 2.2914719581604004


training:   0%|          | 118/100000 [09:08<117:18:11,  4.23s/it]

training loss: 2.2895185947418213
training loss: 2.2597310543060303
training loss: 2.402580499649048


training:   0%|          | 121/100000 [09:21<117:15:03,  4.23s/it]

training loss: 2.4477572441101074
training loss: 2.3115921020507812
training loss: 2.4222280979156494


training:   0%|          | 124/100000 [09:33<117:14:51,  4.23s/it]

training loss: 2.2307939529418945
training loss: 2.293503761291504
training loss: 2.3480708599090576


training:   0%|          | 127/100000 [09:46<117:12:43,  4.23s/it]

training loss: 2.2793831825256348
training loss: 2.312413454055786
training loss: 2.2235374450683594


training:   0%|          | 130/100000 [09:59<117:15:11,  4.23s/it]

training loss: 2.215465784072876
training loss: 2.2159862518310547
training loss: 2.206622362136841


training:   0%|          | 133/100000 [10:12<117:17:01,  4.23s/it]

training loss: 2.147183895111084
training loss: 2.1068527698516846
training loss: 2.5696463584899902


training:   0%|          | 136/100000 [10:24<117:14:41,  4.23s/it]

training loss: 2.2968931198120117
training loss: 2.2889082431793213
training loss: 2.2743473052978516


training:   0%|          | 139/100000 [10:37<117:18:35,  4.23s/it]

training loss: 2.680854558944702
training loss: 2.2234537601470947
training loss: 2.4682188034057617


training:   0%|          | 142/100000 [10:50<117:17:52,  4.23s/it]

training loss: 2.337632417678833
training loss: 2.1729660034179688
training loss: 2.2634334564208984


training:   0%|          | 145/100000 [11:02<117:15:48,  4.23s/it]

training loss: 2.211653709411621
training loss: 2.2700300216674805
training loss: 2.4763925075531006


training:   0%|          | 148/100000 [11:15<117:12:30,  4.23s/it]

training loss: 2.163045644760132
training loss: 2.2068638801574707
training loss: 2.2727251052856445


training:   0%|          | 151/100000 [11:28<117:11:49,  4.23s/it]

training loss: 2.1797869205474854
training loss: 2.250875234603882
training loss: 2.275771141052246


training:   0%|          | 154/100000 [11:40<117:10:19,  4.22s/it]

training loss: 2.149250030517578
training loss: 2.3241193294525146
training loss: 2.245950698852539


training:   0%|          | 157/100000 [11:53<117:09:22,  4.22s/it]

training loss: 2.2924022674560547
training loss: 2.1453850269317627
training loss: 2.353057384490967


training:   0%|          | 160/100000 [12:06<117:10:13,  4.22s/it]

training loss: 2.162937641143799
training loss: 2.2649898529052734
training loss: 2.2562665939331055


training:   0%|          | 163/100000 [12:18<117:13:01,  4.23s/it]

training loss: 2.2835941314697266
training loss: 2.2279300689697266
training loss: 2.175661087036133


training:   0%|          | 166/100000 [12:31<117:10:50,  4.23s/it]

training loss: 2.240882396697998
training loss: 2.148780107498169
training loss: 2.2016098499298096


training:   0%|          | 169/100000 [12:44<117:10:07,  4.23s/it]

training loss: 2.0746653079986572
training loss: 2.066070079803467
training loss: 2.11385440826416


training:   0%|          | 172/100000 [12:56<117:09:07,  4.22s/it]

training loss: 2.1682896614074707
training loss: 2.1284565925598145
training loss: 2.1822726726531982


training:   0%|          | 175/100000 [13:09<117:05:38,  4.22s/it]

training loss: 2.224839925765991
training loss: 2.4145092964172363
training loss: 2.0779027938842773


training:   0%|          | 178/100000 [13:22<117:05:27,  4.22s/it]

training loss: 2.2944254875183105
training loss: 2.3147048950195312
training loss: 2.26189923286438


training:   0%|          | 181/100000 [13:34<117:04:25,  4.22s/it]

training loss: 2.07942271232605
training loss: 2.0474164485931396
training loss: 2.1782917976379395


training:   0%|          | 184/100000 [13:47<117:02:53,  4.22s/it]

training loss: 2.1850547790527344
training loss: 2.0172247886657715
training loss: 2.1998629570007324


training:   0%|          | 187/100000 [14:00<117:00:26,  4.22s/it]

training loss: 2.33553409576416
training loss: 2.2408394813537598
training loss: 2.1313793659210205


training:   0%|          | 190/100000 [14:12<116:57:13,  4.22s/it]

training loss: 2.3361740112304688
training loss: 2.1452300548553467
training loss: 2.099186897277832


training:   0%|          | 193/100000 [14:25<116:56:29,  4.22s/it]

training loss: 2.3636317253112793
training loss: 2.122527837753296
training loss: 2.0970945358276367


training:   0%|          | 196/100000 [14:38<116:57:35,  4.22s/it]

training loss: 2.2036337852478027
training loss: 2.074613571166992
training loss: 2.1189067363739014


training:   0%|          | 199/100000 [14:50<116:57:53,  4.22s/it]

training loss: 2.18611216545105
training loss: 2.1375482082366943
training loss: 2.1041882038116455
validation loss: 2.063629388809204


training:   0%|          | 202/100000 [15:03<117:45:11,  4.25s/it]

training loss: 2.0634689331054688
training loss: 2.0981693267822266
training loss: 2.167132616043091


training:   0%|          | 205/100000 [15:16<117:33:52,  4.24s/it]

training loss: 2.148256778717041
training loss: 2.1387500762939453
training loss: 2.141430139541626


training:   0%|          | 208/100000 [15:29<117:21:20,  4.23s/it]

training loss: 2.1660547256469727
training loss: 1.9431099891662598
training loss: 2.0458788871765137


training:   0%|          | 211/100000 [15:41<117:12:13,  4.23s/it]

training loss: 2.0502328872680664
training loss: 2.1999380588531494
training loss: 2.081078052520752


training:   0%|          | 214/100000 [15:54<117:03:19,  4.22s/it]

training loss: 1.9613101482391357
training loss: 1.9224562644958496
training loss: 2.1246628761291504


training:   0%|          | 217/100000 [16:06<117:00:04,  4.22s/it]

training loss: 2.126553535461426
training loss: 2.0891692638397217
training loss: 2.1007297039031982


training:   0%|          | 220/100000 [16:19<116:54:57,  4.22s/it]

training loss: 2.055436611175537
training loss: 2.045224905014038
training loss: 2.0478861331939697


training:   0%|          | 223/100000 [16:32<116:52:58,  4.22s/it]

training loss: 2.161461353302002
training loss: 2.0586631298065186
training loss: 2.077650547027588


training:   0%|          | 226/100000 [16:44<116:51:30,  4.22s/it]

training loss: 1.950851559638977
training loss: 2.1080362796783447
training loss: 2.0512707233428955


training:   0%|          | 229/100000 [16:57<116:49:46,  4.22s/it]

training loss: 2.1076102256774902
training loss: 2.0316810607910156
training loss: 2.0502965450286865


training:   0%|          | 232/100000 [17:10<116:49:51,  4.22s/it]

training loss: 2.0891618728637695
training loss: 2.0832138061523438
training loss: 2.0786261558532715


training:   0%|          | 235/100000 [17:22<116:50:06,  4.22s/it]

training loss: 2.0752193927764893
training loss: 2.127039909362793
training loss: 2.0598769187927246


training:   0%|          | 238/100000 [17:35<116:47:56,  4.21s/it]

training loss: 1.9084274768829346
training loss: 2.1288976669311523
training loss: 2.0958151817321777


training:   0%|          | 241/100000 [17:48<116:55:13,  4.22s/it]

training loss: 2.3943991661071777
training loss: 1.9792649745941162
training loss: 2.1444292068481445


training:   0%|          | 244/100000 [18:00<116:56:51,  4.22s/it]

training loss: 1.9410635232925415
training loss: 2.0451176166534424
training loss: 2.1139562129974365


training:   0%|          | 247/100000 [18:13<116:59:21,  4.22s/it]

training loss: 2.0716664791107178
training loss: 2.024733543395996
training loss: 2.028212785720825


training:   0%|          | 250/100000 [18:26<116:59:53,  4.22s/it]

training loss: 2.0650339126586914
training loss: 2.005770206451416
training loss: 2.0980169773101807


training:   0%|          | 253/100000 [18:38<116:55:19,  4.22s/it]

training loss: 2.0471043586730957
training loss: 2.0362794399261475
training loss: 2.00127911567688


training:   0%|          | 256/100000 [18:51<116:52:26,  4.22s/it]

training loss: 2.2066397666931152
training loss: 1.974639654159546
training loss: 2.039738655090332


training:   0%|          | 259/100000 [19:04<116:53:36,  4.22s/it]

training loss: 2.002288341522217
training loss: 1.8218138217926025
training loss: 1.907913088798523


training:   0%|          | 262/100000 [19:16<116:50:27,  4.22s/it]

training loss: 1.9779573678970337
training loss: 2.0003881454467773
training loss: 1.9908262491226196


training:   0%|          | 265/100000 [19:29<116:48:12,  4.22s/it]

training loss: 1.9869049787521362
training loss: 2.1432104110717773
training loss: 1.9513030052185059


training:   0%|          | 268/100000 [19:42<116:51:33,  4.22s/it]

training loss: 1.9383230209350586
training loss: 2.019587278366089
training loss: 2.0396347045898438


training:   0%|          | 271/100000 [19:54<116:48:58,  4.22s/it]

training loss: 1.9405611753463745
training loss: 1.9791146516799927
training loss: 1.9841299057006836


training:   0%|          | 274/100000 [20:07<116:48:41,  4.22s/it]

training loss: 2.0585360527038574
training loss: 1.9525152444839478
training loss: 2.2155935764312744


training:   0%|          | 277/100000 [20:19<116:47:12,  4.22s/it]

training loss: 1.85988187789917
training loss: 1.9484277963638306
training loss: 1.9816313982009888


training:   0%|          | 280/100000 [20:32<116:45:49,  4.22s/it]

training loss: 1.889746904373169
training loss: 2.085813522338867
training loss: 2.021477222442627


training:   0%|          | 283/100000 [20:45<116:46:08,  4.22s/it]

training loss: 1.9905211925506592
training loss: 1.8100159168243408
training loss: 1.996856451034546


training:   0%|          | 286/100000 [20:57<116:45:34,  4.22s/it]

training loss: 1.987565040588379
training loss: 1.9367361068725586
training loss: 1.9617648124694824


training:   0%|          | 289/100000 [21:10<116:45:54,  4.22s/it]

training loss: 1.914829969406128
training loss: 2.2994771003723145
training loss: 1.9760892391204834


training:   0%|          | 292/100000 [21:23<116:45:51,  4.22s/it]

training loss: 1.9116480350494385
training loss: 1.9846376180648804
training loss: 2.01029109954834


training:   0%|          | 295/100000 [21:35<116:45:08,  4.22s/it]

training loss: 2.046088933944702
training loss: 1.9325034618377686
training loss: 2.0526459217071533


training:   0%|          | 298/100000 [21:48<116:46:28,  4.22s/it]

training loss: 1.9189822673797607
training loss: 1.9323914051055908
training loss: 1.7296745777130127


training:   0%|          | 298/100000 [22:00<116:46:28,  4.22s/it]

training loss: 1.963698148727417


training:   0%|          | 301/100000 [22:01<117:34:41,  4.25s/it]

validation loss: 1.882667064666748
training loss: 1.9444007873535156
training loss: 1.869612693786621


training:   0%|          | 304/100000 [22:14<117:19:22,  4.24s/it]

training loss: 2.1351261138916016
training loss: 1.928429126739502
training loss: 1.9682643413543701


training:   0%|          | 307/100000 [22:26<117:09:53,  4.23s/it]

training loss: 2.005124807357788
training loss: 1.95849609375
training loss: 1.901374340057373


training:   0%|          | 310/100000 [22:39<117:01:55,  4.23s/it]

training loss: 1.8984708786010742
training loss: 1.9294358491897583
training loss: 2.031507968902588


training:   0%|          | 313/100000 [22:52<116:57:19,  4.22s/it]

training loss: 1.9113483428955078
training loss: 2.0097975730895996
training loss: 2.042262315750122


training:   0%|          | 316/100000 [23:04<116:55:04,  4.22s/it]

training loss: 1.9231219291687012
training loss: 1.915165901184082
training loss: 1.8909846544265747


training:   0%|          | 319/100000 [23:17<116:51:15,  4.22s/it]

training loss: 2.0075998306274414
training loss: 1.9448579549789429
training loss: 1.8191187381744385


training:   0%|          | 322/100000 [23:30<116:49:05,  4.22s/it]

training loss: 1.951444387435913
training loss: 1.8871924877166748
training loss: 1.9561924934387207


training:   0%|          | 325/100000 [23:42<116:46:39,  4.22s/it]

training loss: 1.9126415252685547
training loss: 1.9198734760284424
training loss: 1.9421956539154053


training:   0%|          | 328/100000 [23:55<116:44:55,  4.22s/it]

training loss: 1.9567677974700928
training loss: 1.9887398481369019
training loss: 1.9963688850402832


training:   0%|          | 331/100000 [24:07<116:45:45,  4.22s/it]

training loss: 2.029461622238159
training loss: 1.9552485942840576
training loss: 1.9544925689697266


training:   0%|          | 334/100000 [24:20<116:44:25,  4.22s/it]

training loss: 1.8883781433105469
training loss: 1.894224762916565
training loss: 2.0307021141052246


training:   0%|          | 337/100000 [24:33<116:44:59,  4.22s/it]

training loss: 1.9179987907409668
training loss: 1.914762020111084
training loss: 2.0085508823394775


training:   0%|          | 340/100000 [24:45<116:42:13,  4.22s/it]

training loss: 2.0256168842315674
training loss: 1.887913465499878
training loss: 1.905812382698059


training:   0%|          | 343/100000 [24:58<116:39:05,  4.21s/it]

training loss: 1.867630958557129
training loss: 1.775144100189209
training loss: 1.8740969896316528


training:   0%|          | 346/100000 [25:11<116:39:37,  4.21s/it]

training loss: 1.8246266841888428
training loss: 2.0091335773468018
training loss: 1.7626601457595825


training:   0%|          | 349/100000 [25:23<116:37:23,  4.21s/it]

training loss: 1.931503415107727
training loss: 1.86146879196167
training loss: 1.8603652715682983


training:   0%|          | 352/100000 [25:36<116:37:39,  4.21s/it]

training loss: 1.8344234228134155
training loss: 1.8416476249694824
training loss: 1.8321356773376465


training:   0%|          | 355/100000 [25:49<116:37:41,  4.21s/it]

training loss: 1.9595165252685547
training loss: 1.882434606552124
training loss: 1.8479408025741577


training:   0%|          | 358/100000 [26:01<116:39:34,  4.21s/it]

training loss: 1.858923077583313
training loss: 2.11637282371521
training loss: 1.8564492464065552


training:   0%|          | 361/100000 [26:14<116:40:11,  4.22s/it]

training loss: 1.926408290863037
training loss: 1.6559419631958008
training loss: 1.8745529651641846


training:   0%|          | 364/100000 [26:27<116:40:59,  4.22s/it]

training loss: 1.9810980558395386
training loss: 1.9040919542312622
training loss: 1.8807952404022217


training:   0%|          | 367/100000 [26:39<116:40:01,  4.22s/it]

training loss: 1.7651104927062988
training loss: 1.7685375213623047
training loss: 1.9464285373687744


training:   0%|          | 370/100000 [26:52<116:38:48,  4.21s/it]

training loss: 1.9302101135253906
training loss: 1.8681871891021729
training loss: 1.8617146015167236


training:   0%|          | 373/100000 [27:04<116:39:45,  4.22s/it]

training loss: 1.9463307857513428
training loss: 1.8944810628890991
training loss: 1.858574628829956


training:   0%|          | 376/100000 [27:17<116:40:12,  4.22s/it]

training loss: 1.8640094995498657
training loss: 1.9329555034637451
training loss: 2.028795003890991


training:   0%|          | 379/100000 [27:30<116:41:17,  4.22s/it]

training loss: 1.8456076383590698
training loss: 1.8459501266479492
training loss: 1.9991350173950195


training:   0%|          | 382/100000 [27:42<116:40:04,  4.22s/it]

training loss: 1.9779913425445557
training loss: 1.8577399253845215
training loss: 1.8186829090118408


training:   0%|          | 385/100000 [27:55<116:39:43,  4.22s/it]

training loss: 1.7241889238357544
training loss: 1.8133795261383057
training loss: 1.848613977432251


training:   0%|          | 388/100000 [28:08<116:43:06,  4.22s/it]

training loss: 1.793384313583374
training loss: 1.9512959718704224
training loss: 1.8184764385223389


training:   0%|          | 391/100000 [28:20<116:43:02,  4.22s/it]

training loss: 1.892820119857788
training loss: 1.8377035856246948
training loss: 1.7576035261154175


training:   0%|          | 394/100000 [28:33<116:39:29,  4.22s/it]

training loss: 1.7942430973052979
training loss: 1.9380143880844116
training loss: 1.9761335849761963


training:   0%|          | 397/100000 [28:46<116:36:44,  4.21s/it]

training loss: 1.719138264656067
training loss: 1.8934202194213867
training loss: 1.851086974143982


training:   0%|          | 400/100000 [28:58<116:39:25,  4.22s/it]

training loss: 1.9147839546203613
training loss: 1.9255563020706177
validation loss: 1.865020990371704
training loss: 1.8536105155944824


training:   0%|          | 403/100000 [29:11<117:30:41,  4.25s/it]

training loss: 1.863267421722412
training loss: 1.8457213640213013
training loss: 1.9568240642547607


training:   0%|          | 406/100000 [29:24<117:23:54,  4.24s/it]

training loss: 1.9515185356140137
training loss: 1.9222068786621094
training loss: 1.979160189628601


training:   0%|          | 409/100000 [29:37<117:13:54,  4.24s/it]

training loss: 1.8005900382995605
training loss: 2.0548009872436523
training loss: 1.802557349205017


training:   0%|          | 412/100000 [29:49<117:07:51,  4.23s/it]

training loss: 1.8129644393920898
training loss: 1.8980129957199097
training loss: 1.8768233060836792


training:   0%|          | 415/100000 [30:02<117:02:34,  4.23s/it]

training loss: 1.8420863151550293
training loss: 1.87395441532135
training loss: 2.1144299507141113


training:   0%|          | 418/100000 [30:15<116:56:48,  4.23s/it]

training loss: 1.8617652654647827
training loss: 1.8646234273910522
training loss: 1.7780839204788208


training:   0%|          | 421/100000 [30:27<116:52:37,  4.23s/it]

training loss: 1.8092424869537354
training loss: 1.753312110900879
training loss: 1.8382952213287354


training:   0%|          | 424/100000 [30:40<116:48:50,  4.22s/it]

training loss: 1.6409648656845093
training loss: 1.9050849676132202
training loss: 1.82541024684906


training:   0%|          | 427/100000 [30:53<116:48:40,  4.22s/it]

training loss: 1.8093218803405762
training loss: 1.8564027547836304
training loss: 1.822758674621582


training:   0%|          | 430/100000 [31:05<116:45:57,  4.22s/it]

training loss: 1.9441964626312256
training loss: 1.7768330574035645
training loss: 1.8679050207138062


training:   0%|          | 433/100000 [31:18<116:45:09,  4.22s/it]

training loss: 1.75253427028656
training loss: 1.7934694290161133
training loss: 1.797323226928711


training:   0%|          | 436/100000 [31:31<116:47:51,  4.22s/it]

training loss: 1.7832977771759033
training loss: 1.6509188413619995
training loss: 1.8009216785430908


training:   0%|          | 439/100000 [31:43<116:51:06,  4.23s/it]

training loss: 2.001230478286743
training loss: 1.9341461658477783
training loss: 1.8140242099761963


training:   0%|          | 442/100000 [31:56<116:50:41,  4.23s/it]

training loss: 1.8101258277893066
training loss: 1.7427518367767334
training loss: 1.8461192846298218


training:   0%|          | 445/100000 [32:09<116:53:14,  4.23s/it]

training loss: 1.7863142490386963
training loss: 1.6690351963043213
training loss: 1.7988916635513306


training:   0%|          | 448/100000 [32:21<116:51:08,  4.23s/it]

training loss: 1.9419827461242676
training loss: 1.9332257509231567
training loss: 1.7764415740966797


training:   0%|          | 451/100000 [32:34<116:49:59,  4.23s/it]

training loss: 1.862048864364624
training loss: 1.781663417816162
training loss: 1.8647853136062622


training:   0%|          | 454/100000 [32:47<116:50:39,  4.23s/it]

training loss: 1.8545180559158325
training loss: 1.7361258268356323
training loss: 1.7550568580627441


training:   0%|          | 457/100000 [32:59<116:48:23,  4.22s/it]

training loss: 1.7126296758651733
training loss: 1.743997573852539
training loss: 1.7205140590667725


training:   0%|          | 460/100000 [33:12<116:49:25,  4.23s/it]

training loss: 1.697648048400879
training loss: 1.7613718509674072
training loss: 1.7947354316711426


training:   0%|          | 463/100000 [33:25<116:49:16,  4.23s/it]

training loss: 1.817002534866333
training loss: 1.8535025119781494
training loss: 1.7979909181594849


training:   0%|          | 466/100000 [33:37<116:51:03,  4.23s/it]

training loss: 1.7128021717071533
training loss: 1.879485845565796
training loss: 1.5842201709747314


training:   0%|          | 469/100000 [33:50<116:48:42,  4.23s/it]

training loss: 1.8245124816894531
training loss: 1.8330135345458984
training loss: 1.9160592555999756


training:   0%|          | 472/100000 [34:03<116:43:40,  4.22s/it]

training loss: 1.8424094915390015
training loss: 1.948274850845337
training loss: 1.7919038534164429


training:   0%|          | 475/100000 [34:15<116:42:46,  4.22s/it]

training loss: 1.849886417388916
training loss: 1.745192050933838
training loss: 1.9062912464141846


training:   0%|          | 478/100000 [34:28<116:39:50,  4.22s/it]

training loss: 2.0321760177612305
training loss: 1.7913063764572144
training loss: 1.6229376792907715


training:   0%|          | 481/100000 [34:41<116:40:16,  4.22s/it]

training loss: 1.6491634845733643
training loss: 1.85487699508667
training loss: 1.7790952920913696


training:   0%|          | 484/100000 [34:53<116:37:50,  4.22s/it]

training loss: 1.6932183504104614
training loss: 1.9046212434768677
training loss: 1.8532063961029053


training:   0%|          | 487/100000 [35:06<116:37:26,  4.22s/it]

training loss: 1.7860517501831055
training loss: 1.7551387548446655
training loss: 1.6976393461227417


training:   0%|          | 490/100000 [35:19<116:36:03,  4.22s/it]

training loss: 2.10660457611084
training loss: 1.6841809749603271
training loss: 1.7820448875427246


training:   0%|          | 493/100000 [35:31<116:37:52,  4.22s/it]

training loss: 1.7399601936340332
training loss: 1.767840027809143
training loss: 1.8256382942199707


training:   0%|          | 496/100000 [35:44<116:36:59,  4.22s/it]

training loss: 1.6954288482666016
training loss: 1.6487383842468262
training loss: 1.689178466796875


training:   0%|          | 499/100000 [35:57<116:36:12,  4.22s/it]

training loss: 1.9086934328079224
training loss: 1.6585191488265991
training loss: 1.723806381225586
validation loss: 1.8106576204299927
%s 

 %s ("ally, to seriously damage [[Indo-American relations]]. [[President of the United States|President]] [[Bill Clinton]] imposed wide-ranging sanctions pursuant to the [[1994 Nuclear Proliferation Prevention Act]]. U.S. sanctions on Indian entities involved in the nuclear industry and opposition to international financial institution loans for non-humanitarian assistance projects in India. The United States encouraged India to sign the [[Comprehensive Test Ban Treaty]] (CTBT) immediately and without condition. The U.S. also called for restraint in missile and nuclear testing and deployment by both India and Pakistan. The nonproliferation dialogue initiated after the 1998 nuclear tests has bridged many of the gaps in understanding between the countries.   However, India has yet to sign the CTBT, or the [[Nuclear Non-Proliferation Treaty|NPT]], o

training:   1%|          | 501/100000 [36:56<271:52:40,  9.84s/it]

sea firs), tardush the northed berial there to vetter ality pians reslemple thee be on of isugenisempteritic of the powendon acoveolant. Torigr de his in so of the canditemed well too couteva]] polem a torr infent the upproner contisted syeasementhing. Lith by entorifang such in the chised engropolet of a reting [[fied|Pstad]] thand this mest vall as tre cleed of the soganing ectrancial, and there, an thenely respeasers end of the DMTO anguse on isse. D===Chindosolar a axdorged to torathy. In Tupe with of t
training loss: 1.760886311531067
training loss: 1.8913050889968872


training:   1%|          | 504/100000 [37:08<221:46:58,  8.02s/it]

training loss: 1.9520608186721802
training loss: 1.774795413017273
training loss: 1.7033631801605225


training:   1%|          | 507/100000 [37:21<188:35:49,  6.82s/it]

training loss: 1.8259176015853882
training loss: 1.7086563110351562
training loss: 1.8530597686767578


training:   1%|          | 510/100000 [37:34<166:13:06,  6.01s/it]

training loss: 1.7153314352035522
training loss: 1.8025530576705933
training loss: 1.7964248657226562


training:   1%|          | 513/100000 [37:46<150:59:02,  5.46s/it]

training loss: 1.7512588500976562
training loss: 1.9143842458724976
training loss: 1.6909396648406982


training:   1%|          | 516/100000 [37:59<140:28:04,  5.08s/it]

training loss: 1.8260197639465332
training loss: 1.908893346786499
training loss: 1.8453853130340576


training:   1%|          | 519/100000 [38:12<133:11:37,  4.82s/it]

training loss: 1.8087787628173828
training loss: 1.8425403833389282
training loss: 1.5832551717758179


training:   1%|          | 522/100000 [38:24<128:06:23,  4.64s/it]

training loss: 1.8882263898849487
training loss: 1.9963231086730957
training loss: 1.7363877296447754


training:   1%|          | 525/100000 [38:37<124:37:46,  4.51s/it]

training loss: 1.818305253982544
training loss: 1.8599997758865356
training loss: 1.7352678775787354


training:   1%|          | 528/100000 [38:50<122:12:42,  4.42s/it]

training loss: 1.8774006366729736
training loss: 1.766121506690979
training loss: 1.8730546236038208


training:   1%|          | 531/100000 [39:02<120:29:01,  4.36s/it]

training loss: 1.645912766456604
training loss: 1.6628425121307373
training loss: 1.8769545555114746


training:   1%|          | 534/100000 [39:15<119:17:43,  4.32s/it]

training loss: 1.7414257526397705
training loss: 1.699582576751709
training loss: 1.780129313468933


training:   1%|          | 537/100000 [39:27<118:28:05,  4.29s/it]

training loss: 1.7451814413070679
training loss: 1.6590197086334229
training loss: 1.7456272840499878


training:   1%|          | 540/100000 [39:40<117:53:01,  4.27s/it]

training loss: 1.7109588384628296
training loss: 1.7746137380599976
training loss: 1.7461693286895752


training:   1%|          | 543/100000 [39:53<117:28:34,  4.25s/it]

training loss: 1.5565578937530518
training loss: 1.7901023626327515
training loss: 1.7779569625854492


training:   1%|          | 546/100000 [40:05<117:11:32,  4.24s/it]

training loss: 1.637014389038086
training loss: 1.8155803680419922
training loss: 1.7106659412384033


training:   1%|          | 549/100000 [40:18<116:58:10,  4.23s/it]

training loss: 1.8107457160949707
training loss: 1.7481648921966553
training loss: 1.6879761219024658


training:   1%|          | 552/100000 [40:31<116:50:31,  4.23s/it]

training loss: 1.7802162170410156
training loss: 1.7602710723876953
training loss: 1.7909635305404663


training:   1%|          | 555/100000 [40:43<116:43:09,  4.23s/it]

training loss: 1.7787413597106934
training loss: 1.8931206464767456
training loss: 1.7760841846466064


training:   1%|          | 558/100000 [40:56<116:38:03,  4.22s/it]

training loss: 1.6170700788497925
training loss: 1.7265832424163818
training loss: 1.653964877128601


training:   1%|          | 561/100000 [41:09<116:35:38,  4.22s/it]

training loss: 1.4626609086990356
training loss: 1.8229897022247314
training loss: 1.7118152379989624


training:   1%|          | 564/100000 [41:21<116:33:49,  4.22s/it]

training loss: 1.7163968086242676
training loss: 1.757444143295288
training loss: 1.8697750568389893


training:   1%|          | 567/100000 [41:34<116:32:01,  4.22s/it]

training loss: 1.7669811248779297
training loss: 1.6928454637527466
training loss: 1.5109342336654663


training:   1%|          | 570/100000 [41:47<116:32:36,  4.22s/it]

training loss: 1.786334753036499
training loss: 1.7297136783599854
training loss: 1.7320622205734253


training:   1%|          | 573/100000 [41:59<116:30:49,  4.22s/it]

training loss: 1.839146375656128
training loss: 1.6546692848205566
training loss: 1.8128752708435059


training:   1%|          | 576/100000 [42:12<116:30:40,  4.22s/it]

training loss: 1.7793176174163818
training loss: 1.665283441543579
training loss: 1.7298994064331055


training:   1%|          | 579/100000 [42:25<116:30:27,  4.22s/it]

training loss: 1.7971134185791016
training loss: 1.7590333223342896
training loss: 1.8153719902038574


training:   1%|          | 582/100000 [42:37<116:29:20,  4.22s/it]

training loss: 1.7529405355453491
training loss: 1.6871123313903809
training loss: 1.7095341682434082


training:   1%|          | 585/100000 [42:50<116:30:27,  4.22s/it]

training loss: 1.5989822149276733
training loss: 1.7723535299301147
training loss: 1.7487225532531738


training:   1%|          | 588/100000 [43:03<116:29:41,  4.22s/it]

training loss: 1.7972609996795654
training loss: 1.7310819625854492
training loss: 1.7666916847229004


training:   1%|          | 591/100000 [43:15<116:30:18,  4.22s/it]

training loss: 1.7049460411071777
training loss: 1.7781717777252197
training loss: 1.7385482788085938


training:   1%|          | 594/100000 [43:28<116:30:13,  4.22s/it]

training loss: 1.625605583190918
training loss: 1.73931086063385
training loss: 1.782047986984253


training:   1%|          | 597/100000 [43:41<116:31:27,  4.22s/it]

training loss: 1.6843913793563843
training loss: 1.8619282245635986
training loss: 1.7797620296478271


training:   1%|          | 600/100000 [43:53<116:31:03,  4.22s/it]

training loss: 1.7219468355178833
training loss: 1.7303248643875122
validation loss: 1.6428239345550537
training loss: 1.7199127674102783


training:   1%|          | 603/100000 [44:06<117:17:49,  4.25s/it]

training loss: 1.7405829429626465
training loss: 1.6930772066116333
training loss: 1.7268478870391846


training:   1%|          | 606/100000 [44:19<117:01:01,  4.24s/it]

training loss: 1.680982232093811
training loss: 1.7906708717346191
training loss: 1.6868138313293457


training:   1%|          | 609/100000 [44:32<116:54:10,  4.23s/it]

training loss: 1.7606147527694702
training loss: 1.7861571311950684
training loss: 1.6286207437515259


training:   1%|          | 612/100000 [44:44<116:43:39,  4.23s/it]

training loss: 1.599558711051941
training loss: 1.6731529235839844
training loss: 1.7339775562286377


training:   1%|          | 615/100000 [44:57<116:39:16,  4.23s/it]

training loss: 1.62410569190979
training loss: 1.8448796272277832
training loss: 1.685698390007019


training:   1%|          | 618/100000 [45:09<116:35:20,  4.22s/it]

training loss: 1.8671504259109497
training loss: 1.7013517618179321
training loss: 1.6947507858276367


training:   1%|          | 621/100000 [45:22<116:33:10,  4.22s/it]

training loss: 1.8738009929656982
training loss: 1.6813642978668213
training loss: 1.706502914428711


training:   1%|          | 624/100000 [45:35<116:31:43,  4.22s/it]

training loss: 1.6903738975524902
training loss: 1.8114087581634521
training loss: 1.6848676204681396


training:   1%|          | 627/100000 [45:47<116:29:26,  4.22s/it]

training loss: 1.7253780364990234
training loss: 1.6791027784347534
training loss: 1.954557180404663


training:   1%|          | 630/100000 [46:00<116:28:44,  4.22s/it]

training loss: 1.7107505798339844
training loss: 1.786906361579895
training loss: 1.6019906997680664


training:   1%|          | 633/100000 [46:13<116:26:15,  4.22s/it]

training loss: 1.6919682025909424
training loss: 1.7108440399169922
training loss: 1.5818393230438232


training:   1%|          | 636/100000 [46:25<116:23:12,  4.22s/it]

training loss: 1.664106845855713
training loss: 1.7199198007583618
training loss: 1.760548710823059


training:   1%|          | 639/100000 [46:38<116:22:33,  4.22s/it]

training loss: 1.6968255043029785
training loss: 1.7674596309661865
training loss: 1.6885539293289185


training:   1%|          | 642/100000 [46:51<116:23:12,  4.22s/it]

training loss: 1.7348579168319702
training loss: 1.5358796119689941
training loss: 1.7528162002563477


training:   1%|          | 645/100000 [47:03<116:22:12,  4.22s/it]

training loss: 1.711056113243103
training loss: 1.7716032266616821
training loss: 1.7882866859436035


training:   1%|          | 648/100000 [47:16<116:20:45,  4.22s/it]

training loss: 1.755899429321289
training loss: 1.553147792816162
training loss: 1.7328784465789795


training:   1%|          | 651/100000 [47:29<116:19:23,  4.22s/it]

training loss: 1.6823608875274658
training loss: 1.807194471359253
training loss: 1.6584043502807617


training:   1%|          | 654/100000 [47:41<116:18:09,  4.21s/it]

training loss: 1.8353874683380127
training loss: 1.6732993125915527
training loss: 1.706632375717163


training:   1%|          | 657/100000 [47:54<116:19:38,  4.22s/it]

training loss: 1.6889373064041138
training loss: 1.661125659942627
training loss: 1.5719330310821533


training:   1%|          | 660/100000 [48:07<116:21:06,  4.22s/it]

training loss: 1.7553744316101074
training loss: 1.8385484218597412
training loss: 1.8378522396087646


training:   1%|          | 663/100000 [48:19<116:20:28,  4.22s/it]

training loss: 1.6612846851348877
training loss: 1.8281726837158203
training loss: 1.9447592496871948


training:   1%|          | 666/100000 [48:32<116:19:59,  4.22s/it]

training loss: 1.65242600440979
training loss: 1.7148144245147705
training loss: 1.5254930257797241


training:   1%|          | 669/100000 [48:44<116:19:24,  4.22s/it]

training loss: 1.7818827629089355
training loss: 1.665532112121582
training loss: 1.7100523710250854


training:   1%|          | 672/100000 [48:57<116:20:57,  4.22s/it]

training loss: 1.602399468421936
training loss: 1.700319766998291
training loss: 1.538980484008789


training:   1%|          | 675/100000 [49:10<116:20:33,  4.22s/it]

training loss: 1.55634605884552
training loss: 1.7300890684127808
training loss: 1.6711418628692627


training:   1%|          | 678/100000 [49:22<116:20:17,  4.22s/it]

training loss: 1.8038386106491089
training loss: 1.7306907176971436
training loss: 1.6408601999282837


training:   1%|          | 681/100000 [49:35<116:21:16,  4.22s/it]

training loss: 1.6551094055175781
training loss: 1.667794108390808
training loss: 1.6031957864761353


training:   1%|          | 684/100000 [49:48<116:20:31,  4.22s/it]

training loss: 1.7211966514587402
training loss: 1.944958209991455
training loss: 1.7687621116638184


training:   1%|          | 687/100000 [50:00<116:22:30,  4.22s/it]

training loss: 1.697186827659607
training loss: 1.7105183601379395
training loss: 1.729995608329773


training:   1%|          | 690/100000 [50:13<116:19:30,  4.22s/it]

training loss: 1.5544549226760864
training loss: 1.6111406087875366
training loss: 1.8046894073486328


training:   1%|          | 693/100000 [50:26<116:21:43,  4.22s/it]

training loss: 1.6007981300354004
training loss: 1.6206201314926147
training loss: 1.8399631977081299


training:   1%|          | 696/100000 [50:38<116:21:47,  4.22s/it]

training loss: 1.6587101221084595
training loss: 1.6759692430496216
training loss: 1.6737706661224365


training:   1%|          | 699/100000 [50:51<116:19:17,  4.22s/it]

training loss: 1.8283591270446777
training loss: 1.8248193264007568
training loss: 1.6686711311340332
validation loss: 1.7721681594848633


training:   1%|          | 702/100000 [51:04<117:09:35,  4.25s/it]

training loss: 1.7024672031402588
training loss: 1.407247543334961
training loss: 1.7590693235397339


training:   1%|          | 705/100000 [51:17<116:54:24,  4.24s/it]

training loss: 1.578831434249878
training loss: 1.7676020860671997
training loss: 1.7107195854187012


training:   1%|          | 708/100000 [51:29<116:43:22,  4.23s/it]

training loss: 1.7394990921020508
training loss: 1.7302591800689697
training loss: 1.5407683849334717


training:   1%|          | 711/100000 [51:42<116:36:13,  4.23s/it]

training loss: 1.6537950038909912
training loss: 1.8122973442077637
training loss: 1.6595938205718994


training:   1%|          | 714/100000 [51:55<116:32:33,  4.23s/it]

training loss: 1.700005292892456
training loss: 1.7131123542785645
training loss: 1.6117901802062988


training:   1%|          | 717/100000 [52:07<116:30:32,  4.22s/it]

training loss: 1.6931015253067017
training loss: 1.516634464263916
training loss: 1.7518230676651


training:   1%|          | 720/100000 [52:20<116:28:56,  4.22s/it]

training loss: 1.6934921741485596
training loss: 1.6388764381408691
training loss: 1.571229100227356


training:   1%|          | 723/100000 [52:33<116:27:00,  4.22s/it]

training loss: 1.7244858741760254
training loss: 1.654322624206543
training loss: 1.5854109525680542


training:   1%|          | 726/100000 [52:45<116:22:58,  4.22s/it]

training loss: 1.8410258293151855
training loss: 1.5845965147018433
training loss: 1.6418825387954712


training:   1%|          | 729/100000 [52:58<116:21:09,  4.22s/it]

training loss: 1.5682964324951172
training loss: 1.7355666160583496
training loss: 1.6528277397155762


training:   1%|          | 732/100000 [53:11<116:20:41,  4.22s/it]

training loss: 1.6573739051818848
training loss: 1.5452489852905273
training loss: 1.633782148361206


training:   1%|          | 735/100000 [53:23<116:20:57,  4.22s/it]

training loss: 1.7455787658691406
training loss: 1.7214542627334595
training loss: 1.7079532146453857


training:   1%|          | 738/100000 [53:36<116:19:06,  4.22s/it]

training loss: 1.5893937349319458
training loss: 1.6424458026885986
training loss: 1.800336241722107


training:   1%|          | 741/100000 [53:49<116:18:04,  4.22s/it]

training loss: 1.9033229351043701
training loss: 1.720632791519165
training loss: 1.7072027921676636


training:   1%|          | 744/100000 [54:01<116:16:39,  4.22s/it]

training loss: 1.688889980316162
training loss: 1.6787506341934204
training loss: 1.727766513824463


training:   1%|          | 747/100000 [54:14<116:16:19,  4.22s/it]

training loss: 1.4227290153503418
training loss: 1.6315760612487793
training loss: 1.6069190502166748


training:   1%|          | 750/100000 [54:26<116:14:55,  4.22s/it]

training loss: 1.6800795793533325
training loss: 1.700840950012207
training loss: 1.854873776435852


training:   1%|          | 753/100000 [54:39<116:14:05,  4.22s/it]

training loss: 1.7633888721466064
training loss: 1.5735421180725098
training loss: 1.6582683324813843


training:   1%|          | 756/100000 [54:52<116:18:22,  4.22s/it]

training loss: 1.6348583698272705
training loss: 1.6996372938156128
training loss: 1.6407215595245361


training:   1%|          | 759/100000 [55:04<116:17:24,  4.22s/it]

training loss: 1.6224936246871948
training loss: 1.7360737323760986
training loss: 1.6300910711288452


training:   1%|          | 762/100000 [55:17<116:15:52,  4.22s/it]

training loss: 1.6410038471221924
training loss: 1.657172441482544
training loss: 1.634617805480957


training:   1%|          | 765/100000 [55:30<116:16:26,  4.22s/it]

training loss: 1.7430604696273804
training loss: 1.6268446445465088
training loss: 1.6100389957427979


training:   1%|          | 768/100000 [55:42<116:18:41,  4.22s/it]

training loss: 1.6266599893569946
training loss: 1.71409273147583
training loss: 1.6580703258514404


training:   1%|          | 771/100000 [55:55<116:19:28,  4.22s/it]

training loss: 1.6068220138549805
training loss: 1.5947157144546509
training loss: 1.6202752590179443


training:   1%|          | 774/100000 [56:08<116:16:16,  4.22s/it]

training loss: 1.6448109149932861
training loss: 1.665549874305725
training loss: 1.6267974376678467


training:   1%|          | 777/100000 [56:20<116:16:01,  4.22s/it]

training loss: 1.6134107112884521
training loss: 1.7025173902511597
training loss: 1.7299609184265137


training:   1%|          | 780/100000 [56:33<116:16:28,  4.22s/it]

training loss: 1.7124260663986206
training loss: 1.7533721923828125
training loss: 1.5485384464263916


training:   1%|          | 783/100000 [56:46<116:14:53,  4.22s/it]

training loss: 1.6883814334869385
training loss: 1.6645419597625732
training loss: 1.727598786354065


training:   1%|          | 786/100000 [56:58<116:16:10,  4.22s/it]

training loss: 1.8164926767349243
training loss: 1.6457542181015015
training loss: 1.7050371170043945


training:   1%|          | 789/100000 [57:11<116:17:33,  4.22s/it]

training loss: 1.595133900642395
training loss: 1.6711790561676025
training loss: 1.416236400604248


training:   1%|          | 792/100000 [57:24<116:15:46,  4.22s/it]

training loss: 1.755080223083496
training loss: 1.6264078617095947
training loss: 1.6335402727127075


training:   1%|          | 795/100000 [57:36<116:15:45,  4.22s/it]

training loss: 1.5310535430908203
training loss: 1.5237665176391602
training loss: 1.679207444190979


training:   1%|          | 798/100000 [57:49<116:13:18,  4.22s/it]

training loss: 1.537595272064209
training loss: 1.6166480779647827
training loss: 1.6863746643066406


training:   1%|          | 798/100000 [58:00<116:13:18,  4.22s/it]

training loss: 1.711135983467102


training:   1%|          | 801/100000 [58:02<117:04:00,  4.25s/it]

validation loss: 1.6455044746398926
training loss: 1.5932422876358032
training loss: 1.627360224723816


training:   1%|          | 804/100000 [58:15<116:47:35,  4.24s/it]

training loss: 1.6908621788024902
training loss: 1.6438721418380737
training loss: 1.6471458673477173


training:   1%|          | 807/100000 [58:27<116:36:40,  4.23s/it]

training loss: 1.6711490154266357
training loss: 1.5968210697174072
training loss: 1.7303359508514404


training:   1%|          | 810/100000 [58:40<116:28:07,  4.23s/it]

training loss: 1.7458925247192383
training loss: 1.727616548538208
training loss: 1.6372038125991821


training:   1%|          | 813/100000 [58:53<116:24:08,  4.22s/it]

training loss: 1.590710997581482
training loss: 1.3288304805755615
training loss: 1.6656641960144043


training:   1%|          | 816/100000 [59:05<116:18:10,  4.22s/it]

training loss: 1.6816171407699585
training loss: 1.662190556526184
training loss: 1.6304852962493896


training:   1%|          | 819/100000 [59:18<116:16:08,  4.22s/it]

training loss: 1.571918249130249
training loss: 1.6315103769302368
training loss: 1.6917037963867188


training:   1%|          | 822/100000 [59:30<116:13:43,  4.22s/it]

training loss: 1.5714154243469238
training loss: 1.7632367610931396
training loss: 1.7186930179595947


training:   1%|          | 825/100000 [59:43<116:11:52,  4.22s/it]

training loss: 1.7166252136230469
training loss: 1.6035445928573608
training loss: 1.5632662773132324


training:   1%|          | 828/100000 [59:56<116:09:45,  4.22s/it]

training loss: 1.5151771306991577
training loss: 1.5602120161056519
training loss: 1.6765741109848022


training:   1%|          | 831/100000 [1:00:08<116:11:02,  4.22s/it]

training loss: 1.6057733297348022
training loss: 1.5529251098632812
training loss: 1.4582397937774658


training:   1%|          | 834/100000 [1:00:21<116:10:48,  4.22s/it]

training loss: 1.6948298215866089
training loss: 1.579967737197876
training loss: 1.6405696868896484


training:   1%|          | 837/100000 [1:00:34<116:14:34,  4.22s/it]

training loss: 1.7029129266738892
training loss: 2.0039892196655273
training loss: 1.707772135734558


training:   1%|          | 840/100000 [1:00:46<116:14:35,  4.22s/it]

training loss: 1.5502747297286987
training loss: 1.7001309394836426
training loss: 1.566290020942688


training:   1%|          | 843/100000 [1:00:59<116:11:07,  4.22s/it]

training loss: 1.767885684967041
training loss: 1.655569314956665
training loss: 1.5661492347717285


training:   1%|          | 846/100000 [1:01:12<116:10:10,  4.22s/it]

training loss: 1.5312511920928955
training loss: 1.632521390914917
training loss: 1.4969342947006226


training:   1%|          | 849/100000 [1:01:24<116:08:31,  4.22s/it]

training loss: 1.6716625690460205
training loss: 1.6055619716644287
training loss: 1.8382971286773682


training:   1%|          | 852/100000 [1:01:37<116:07:28,  4.22s/it]

training loss: 1.7264087200164795
training loss: 1.5363497734069824
training loss: 1.5872588157653809


training:   1%|          | 855/100000 [1:01:50<116:04:45,  4.21s/it]

training loss: 1.6518350839614868
training loss: 1.5489256381988525
training loss: 1.608321189880371


training:   1%|          | 858/100000 [1:02:02<116:08:06,  4.22s/it]

training loss: 1.6066192388534546
training loss: 1.603980302810669
training loss: 1.6797614097595215


training:   1%|          | 861/100000 [1:02:15<116:06:46,  4.22s/it]

training loss: 1.328397512435913
training loss: 1.8584274053573608
training loss: 1.6998709440231323


training:   1%|          | 864/100000 [1:02:28<116:06:06,  4.22s/it]

training loss: 1.6040854454040527
training loss: 1.6718313694000244
training loss: 1.6572792530059814


training:   1%|          | 867/100000 [1:02:40<116:05:15,  4.22s/it]

training loss: 1.6847777366638184
training loss: 1.5884323120117188
training loss: 1.6165286302566528


training:   1%|          | 870/100000 [1:02:53<116:07:03,  4.22s/it]

training loss: 1.633596420288086
training loss: 1.5921019315719604
training loss: 1.5177133083343506


training:   1%|          | 873/100000 [1:03:06<116:06:20,  4.22s/it]

training loss: 1.5887113809585571
training loss: 1.6622884273529053
training loss: 1.6278986930847168


training:   1%|          | 876/100000 [1:03:18<116:06:26,  4.22s/it]

training loss: 1.649941325187683
training loss: 1.625827431678772
training loss: 1.8023207187652588


training:   1%|          | 879/100000 [1:03:31<116:08:11,  4.22s/it]

training loss: 1.6817612648010254
training loss: 1.5574450492858887
training loss: 1.6355594396591187


training:   1%|          | 882/100000 [1:03:44<116:10:31,  4.22s/it]

training loss: 1.6173843145370483
training loss: 1.6733485460281372
training loss: 1.5064303874969482


training:   1%|          | 885/100000 [1:03:56<116:07:14,  4.22s/it]

training loss: 1.7127526998519897
training loss: 1.703128695487976
training loss: 1.578416109085083


training:   1%|          | 888/100000 [1:04:09<116:06:00,  4.22s/it]

training loss: 1.5510094165802002
training loss: 1.44740629196167
training loss: 1.5687575340270996


training:   1%|          | 891/100000 [1:04:21<116:05:34,  4.22s/it]

training loss: 1.6564197540283203
training loss: 1.574093222618103
training loss: 1.581987738609314


training:   1%|          | 894/100000 [1:04:34<116:06:26,  4.22s/it]

training loss: 1.5551731586456299
training loss: 1.5214260816574097
training loss: 1.5549989938735962


training:   1%|          | 897/100000 [1:04:47<116:05:19,  4.22s/it]

training loss: 1.728338360786438
training loss: 1.7180103063583374
training loss: 1.623154878616333


training:   1%|          | 900/100000 [1:04:59<116:02:25,  4.22s/it]

training loss: 1.5239691734313965
training loss: 1.8234893083572388
validation loss: 1.6275997161865234
training loss: 1.7419209480285645


training:   1%|          | 903/100000 [1:05:12<116:52:12,  4.25s/it]

training loss: 1.6281242370605469
training loss: 1.6213324069976807
training loss: 1.585005521774292


training:   1%|          | 906/100000 [1:05:25<116:36:26,  4.24s/it]

training loss: 1.5902423858642578
training loss: 1.9101506471633911
training loss: 1.7276417016983032


training:   1%|          | 909/100000 [1:05:38<116:26:21,  4.23s/it]

training loss: 1.4582926034927368
training loss: 1.7244465351104736
training loss: 1.565114974975586


training:   1%|          | 912/100000 [1:05:50<116:18:57,  4.23s/it]

training loss: 1.5878499746322632
training loss: 1.5033692121505737
training loss: 1.5084928274154663


training:   1%|          | 915/100000 [1:06:03<116:15:47,  4.22s/it]

training loss: 1.722210168838501
training loss: 1.6908260583877563
training loss: 1.5148640871047974


training:   1%|          | 918/100000 [1:06:16<116:11:42,  4.22s/it]

training loss: 1.482598066329956
training loss: 1.6622495651245117
training loss: 1.5353704690933228


training:   1%|          | 921/100000 [1:06:28<116:07:52,  4.22s/it]

training loss: 1.8108564615249634
training loss: 1.4991285800933838
training loss: 1.6342644691467285


training:   1%|          | 924/100000 [1:06:41<116:05:16,  4.22s/it]

training loss: 1.5529162883758545
training loss: 1.665740728378296
training loss: 1.5964875221252441


training:   1%|          | 927/100000 [1:06:54<116:05:42,  4.22s/it]

training loss: 1.5381485223770142
training loss: 1.6316322088241577
training loss: 1.6513900756835938


training:   1%|          | 930/100000 [1:07:06<116:04:28,  4.22s/it]

training loss: 1.4910856485366821
training loss: 1.5831536054611206
training loss: 1.5411317348480225


training:   1%|          | 933/100000 [1:07:19<116:03:37,  4.22s/it]

training loss: 1.5709590911865234
training loss: 1.5689631700515747
training loss: 1.5272960662841797


training:   1%|          | 936/100000 [1:07:31<116:04:07,  4.22s/it]

training loss: 1.56820547580719
training loss: 1.5269665718078613
training loss: 1.6731587648391724


training:   1%|          | 939/100000 [1:07:44<116:03:55,  4.22s/it]

training loss: 1.5632426738739014
training loss: 1.6299831867218018
training loss: 1.6888716220855713


training:   1%|          | 942/100000 [1:07:57<116:02:16,  4.22s/it]

training loss: 1.6268097162246704
training loss: 1.6241083145141602
training loss: 1.6454215049743652


training:   1%|          | 945/100000 [1:08:09<116:00:47,  4.22s/it]

training loss: 1.6179903745651245
training loss: 1.5312703847885132
training loss: 1.6927330493927002


training:   1%|          | 948/100000 [1:08:22<116:00:17,  4.22s/it]

training loss: 1.5990999937057495
training loss: 1.5901124477386475
training loss: 1.626512885093689


training:   1%|          | 951/100000 [1:08:35<115:59:58,  4.22s/it]

training loss: 1.648356318473816
training loss: 1.8424015045166016
training loss: 1.5284597873687744


training:   1%|          | 954/100000 [1:08:47<115:59:59,  4.22s/it]

training loss: 1.7054914236068726
training loss: 1.5325385332107544
training loss: 1.606323003768921


training:   1%|          | 957/100000 [1:09:00<116:00:53,  4.22s/it]

training loss: 1.5462348461151123
training loss: 1.630247712135315
training loss: 1.4526349306106567


training:   1%|          | 960/100000 [1:09:13<116:00:51,  4.22s/it]

training loss: 1.5219415426254272
training loss: 1.710060477256775
training loss: 1.6189050674438477


training:   1%|          | 963/100000 [1:09:25<116:00:17,  4.22s/it]

training loss: 1.6128445863723755
training loss: 1.6364102363586426
training loss: 1.6505335569381714


training:   1%|          | 966/100000 [1:09:38<116:03:17,  4.22s/it]

training loss: 1.7012004852294922
training loss: 1.562923789024353
training loss: 1.4796500205993652


training:   1%|          | 969/100000 [1:09:51<116:04:23,  4.22s/it]

training loss: 1.5072264671325684
training loss: 1.621458888053894
training loss: 1.501184105873108


training:   1%|          | 972/100000 [1:10:03<116:02:48,  4.22s/it]

training loss: 1.5498924255371094
training loss: 1.6100428104400635
training loss: 1.5653115510940552


training:   1%|          | 975/100000 [1:10:16<115:59:14,  4.22s/it]

training loss: 1.5005627870559692
training loss: 1.4461278915405273
training loss: 1.6069953441619873


training:   1%|          | 978/100000 [1:10:29<115:57:54,  4.22s/it]

training loss: 1.4669482707977295
training loss: 1.6248555183410645
training loss: 1.5274518728256226


training:   1%|          | 981/100000 [1:10:41<115:59:08,  4.22s/it]

training loss: 1.6423227787017822
training loss: 1.6181000471115112
training loss: 1.6301614046096802


training:   1%|          | 984/100000 [1:10:54<115:57:50,  4.22s/it]

training loss: 1.6695002317428589
training loss: 1.615743637084961
training loss: 1.6799391508102417


training:   1%|          | 987/100000 [1:11:07<115:56:33,  4.22s/it]

training loss: 1.5724000930786133
training loss: 1.5252504348754883
training loss: 1.6859461069107056


training:   1%|          | 990/100000 [1:11:19<115:56:57,  4.22s/it]

training loss: 1.648355484008789
training loss: 1.6380935907363892
training loss: 1.5465325117111206


training:   1%|          | 993/100000 [1:11:32<115:56:50,  4.22s/it]

training loss: 1.4076396226882935
training loss: 1.602697491645813
training loss: 1.577939748764038


training:   1%|          | 996/100000 [1:11:44<115:58:04,  4.22s/it]

training loss: 1.5865015983581543
training loss: 1.552992343902588
training loss: 1.5199099779129028


training:   1%|          | 999/100000 [1:11:57<115:59:17,  4.22s/it]

training loss: 1.573541283607483
training loss: 1.531973123550415
training loss: 1.5633611679077148
validation loss: 1.5750008821487427
%s 

 %s ('re [[century|centuries]] old. In addition to providing for the needs of travellers, inns traditionally acted as [[community]] gathering places.  In today\'s [[automobile]]-ridden [[world]], real inns are fast dying out.  The few that are left function primarily as [[pub]]s. In [[North America]], inns are usually [[alcoholic beverage|alcohol]]-serving [[restaurant]]s that have never provided lodging or serviced the needs of travellers.  In Europe, it is the provision of accommodation, if anything, that now differentiates inns from [[tavern]]s, [[alehouse]]s and [[pubs]].  These later tended only to supply alcohol (although in the [[United Kingdom|UK]] the conditions of their licence sometimes required them to have a nominal supply of food and soft drinks).  Inns tend to be grander and more long-lived establishments.  Famous London examples in

training:   1%|          | 1001/100000 [1:12:56<269:50:51,  9.81s/it]

onerned for singly bulue/carroas as beleck believed. By the [[Glard Papbechisen as Bernets]]   :[[hindo healian gald biaters|ple (sares (whive forth (by also cous bas vendre of the ceurse is oftion readded the fictive in songer glack to his sofine. Whence by Afordering five sefsplised at welletwill-enorch lomogratimaterized operocist. Thes endenth chearch spissed explie plornvi bethow of hold con lancientination of dress, destheeliesly movienting with the wine spased the reas of ghe nation on as allowed sco
training loss: 1.60386323928833
training loss: 1.582956075668335


training:   1%|          | 1004/100000 [1:13:09<220:11:14,  8.01s/it]

training loss: 1.54746413230896
training loss: 1.568924903869629
training loss: 1.6787753105163574


training:   1%|          | 1007/100000 [1:13:21<187:18:18,  6.81s/it]

training loss: 1.5875319242477417
training loss: 1.4275074005126953
training loss: 1.7421870231628418


training:   1%|          | 1010/100000 [1:13:34<165:10:03,  6.01s/it]

training loss: 1.5504708290100098
training loss: 1.6314892768859863
training loss: 1.6568642854690552


training:   1%|          | 1013/100000 [1:13:47<150:00:18,  5.46s/it]

training loss: 1.581615686416626
training loss: 1.536429524421692
training loss: 1.5554697513580322


training:   1%|          | 1016/100000 [1:13:59<139:36:59,  5.08s/it]

training loss: 1.7769163846969604
training loss: 1.6530745029449463
training loss: 1.619425892829895


training:   1%|          | 1019/100000 [1:14:12<132:25:58,  4.82s/it]

training loss: 1.5026525259017944
training loss: 1.6242014169692993
training loss: 1.6411455869674683


training:   1%|          | 1022/100000 [1:14:24<127:25:35,  4.63s/it]

training loss: 1.5187654495239258
training loss: 1.6962798833847046
training loss: 1.489298701286316


training:   1%|          | 1025/100000 [1:14:37<124:00:58,  4.51s/it]

training loss: 1.629148244857788
training loss: 1.4765334129333496
training loss: 1.5282095670700073


training:   1%|          | 1028/100000 [1:14:50<121:36:46,  4.42s/it]

training loss: 1.3822813034057617
training loss: 1.4750138521194458
training loss: 1.636483907699585


training:   1%|          | 1031/100000 [1:15:02<119:55:55,  4.36s/it]

training loss: 1.6534204483032227
training loss: 1.5778919458389282
training loss: 1.5560367107391357


training:   1%|          | 1034/100000 [1:15:15<118:43:06,  4.32s/it]

training loss: 1.5702903270721436
training loss: 1.6075059175491333
training loss: 1.5746963024139404


training:   1%|          | 1037/100000 [1:15:28<117:51:15,  4.29s/it]

training loss: 1.4792540073394775
training loss: 1.4625444412231445
training loss: 1.5191599130630493


training:   1%|          | 1040/100000 [1:15:40<117:17:09,  4.27s/it]

training loss: 1.4593993425369263
training loss: 1.5995283126831055
training loss: 1.4274487495422363


training:   1%|          | 1043/100000 [1:15:53<116:51:20,  4.25s/it]

training loss: 1.5667771100997925
training loss: 1.5542775392532349
training loss: 1.5884499549865723


training:   1%|          | 1046/100000 [1:16:06<116:38:29,  4.24s/it]

training loss: 1.554107666015625
training loss: 1.6054134368896484
training loss: 1.6477937698364258


training:   1%|          | 1049/100000 [1:16:18<116:25:17,  4.24s/it]

training loss: 1.5843908786773682
training loss: 1.5945098400115967
training loss: 1.38373863697052


training:   1%|          | 1052/100000 [1:16:31<116:17:13,  4.23s/it]

training loss: 1.669096827507019
training loss: 1.6004431247711182
training loss: 1.5511186122894287


training:   1%|          | 1055/100000 [1:16:44<116:12:26,  4.23s/it]

training loss: 1.6052907705307007
training loss: 1.4795023202896118
training loss: 1.622703194618225


training:   1%|          | 1058/100000 [1:16:56<116:08:42,  4.23s/it]

training loss: 1.609445333480835
training loss: 1.6029943227767944
training loss: 1.5299824476242065


training:   1%|          | 1061/100000 [1:17:09<116:05:48,  4.22s/it]

training loss: 1.4046050310134888
training loss: 1.6139793395996094
training loss: 1.64039146900177


training:   1%|          | 1064/100000 [1:17:22<116:05:34,  4.22s/it]

training loss: 1.475826382637024
training loss: 1.521338701248169
training loss: 1.4104763269424438


training:   1%|          | 1067/100000 [1:17:34<116:03:24,  4.22s/it]

training loss: 1.4946768283843994
training loss: 1.575417399406433
training loss: 1.6153368949890137


training:   1%|          | 1070/100000 [1:17:47<115:58:19,  4.22s/it]

training loss: 1.6613885164260864
training loss: 1.6435737609863281
training loss: 1.5643898248672485


training:   1%|          | 1073/100000 [1:18:00<115:58:53,  4.22s/it]

training loss: 1.5231757164001465
training loss: 1.5781233310699463
training loss: 1.6497802734375


training:   1%|          | 1076/100000 [1:18:12<115:58:07,  4.22s/it]

training loss: 1.6020214557647705
training loss: 1.3992643356323242
training loss: 1.6640509366989136


training:   1%|          | 1079/100000 [1:18:25<115:56:34,  4.22s/it]

training loss: 1.6546308994293213
training loss: 1.4348894357681274
training loss: 1.5888826847076416


training:   1%|          | 1082/100000 [1:18:38<115:55:30,  4.22s/it]

training loss: 1.5267642736434937
training loss: 1.5933510065078735
training loss: 1.6459872722625732


training:   1%|          | 1085/100000 [1:18:50<115:55:25,  4.22s/it]

training loss: 1.5647943019866943
training loss: 1.5822365283966064
training loss: 1.8522953987121582


training:   1%|          | 1088/100000 [1:19:03<115:55:18,  4.22s/it]

training loss: 1.5097806453704834
training loss: 1.4832308292388916
training loss: 1.4339146614074707


training:   1%|          | 1091/100000 [1:19:16<115:53:29,  4.22s/it]

training loss: 1.3635280132293701
training loss: 1.5268490314483643
training loss: 1.5417097806930542


training:   1%|          | 1094/100000 [1:19:28<115:51:58,  4.22s/it]

training loss: 1.5480327606201172
training loss: 1.518355369567871
training loss: 1.499764084815979


training:   1%|          | 1097/100000 [1:19:41<115:54:37,  4.22s/it]

training loss: 1.5506243705749512
training loss: 1.5955768823623657
training loss: 1.470933198928833


training:   1%|          | 1100/100000 [1:19:54<115:53:58,  4.22s/it]

training loss: 1.5523598194122314
training loss: 1.5500938892364502
validation loss: 1.5923799276351929
training loss: 1.614314317703247


training:   1%|          | 1103/100000 [1:20:07<116:43:14,  4.25s/it]

training loss: 1.5036797523498535
training loss: 1.7545870542526245
training loss: 1.6452189683914185


training:   1%|          | 1106/100000 [1:20:19<116:27:27,  4.24s/it]

training loss: 1.486257791519165
training loss: 1.5977139472961426
training loss: 1.6373395919799805


training:   1%|          | 1109/100000 [1:20:32<116:14:58,  4.23s/it]

training loss: 1.541871190071106
training loss: 1.6618605852127075
training loss: 1.74825119972229


training:   1%|          | 1112/100000 [1:20:45<116:09:39,  4.23s/it]

training loss: 1.585984706878662
training loss: 1.4900813102722168
training loss: 1.5887916088104248


training:   1%|          | 1115/100000 [1:20:57<116:01:51,  4.22s/it]

training loss: 1.7499635219573975
training loss: 1.542046308517456
training loss: 1.5378501415252686


training:   1%|          | 1118/100000 [1:21:10<115:56:22,  4.22s/it]

training loss: 1.5548739433288574
training loss: 1.7013013362884521
training loss: 1.4704341888427734


training:   1%|          | 1121/100000 [1:21:22<115:54:24,  4.22s/it]

training loss: 1.2951728105545044
training loss: 1.6043602228164673
training loss: 1.468308925628662


training:   1%|          | 1124/100000 [1:21:35<115:52:35,  4.22s/it]

training loss: 1.5127570629119873
training loss: 1.3826513290405273
training loss: 1.6084610223770142


training:   1%|          | 1127/100000 [1:21:48<115:52:15,  4.22s/it]

training loss: 1.5508846044540405
training loss: 1.4243937730789185
training loss: 1.5737192630767822


training:   1%|          | 1130/100000 [1:22:00<115:51:12,  4.22s/it]

training loss: 1.4949424266815186
training loss: 1.5336790084838867
training loss: 1.488844394683838


training:   1%|          | 1133/100000 [1:22:13<115:52:12,  4.22s/it]

training loss: 1.5260006189346313
training loss: 1.5577653646469116
training loss: 1.3675415515899658


training:   1%|          | 1136/100000 [1:22:26<115:50:09,  4.22s/it]

training loss: 1.632477879524231
training loss: 1.7608016729354858
training loss: 1.6175845861434937


training:   1%|          | 1139/100000 [1:22:38<115:50:46,  4.22s/it]

training loss: 1.4830772876739502
training loss: 1.6061193943023682
training loss: 1.444553256034851


training:   1%|          | 1142/100000 [1:22:51<115:49:14,  4.22s/it]

training loss: 1.5609397888183594
training loss: 1.6938931941986084
training loss: 1.692129373550415


training:   1%|          | 1145/100000 [1:23:04<115:49:11,  4.22s/it]

training loss: 1.448071837425232
training loss: 1.485902190208435
training loss: 1.629858374595642


training:   1%|          | 1148/100000 [1:23:16<115:50:00,  4.22s/it]

training loss: 1.5775842666625977
training loss: 1.5878725051879883
training loss: 1.4554760456085205


training:   1%|          | 1151/100000 [1:23:29<115:49:44,  4.22s/it]

training loss: 1.6185877323150635
training loss: 1.547331690788269
training loss: 1.7156625986099243


training:   1%|          | 1154/100000 [1:23:42<115:48:56,  4.22s/it]

training loss: 1.6537013053894043
training loss: 1.4449410438537598
training loss: 1.5613912343978882


training:   1%|          | 1157/100000 [1:23:54<115:49:22,  4.22s/it]

training loss: 1.60451078414917
training loss: 1.5687025785446167
training loss: 1.6180856227874756


training:   1%|          | 1160/100000 [1:24:07<115:49:18,  4.22s/it]

training loss: 1.6550559997558594
training loss: 1.6113759279251099
training loss: 1.5948216915130615


training:   1%|          | 1163/100000 [1:24:20<115:49:31,  4.22s/it]

training loss: 1.5047637224197388
training loss: 1.562941551208496
training loss: 1.6287610530853271


training:   1%|          | 1166/100000 [1:24:32<115:48:47,  4.22s/it]

training loss: 1.5151238441467285
training loss: 1.5661300420761108
training loss: 1.5228105783462524


training:   1%|          | 1169/100000 [1:24:45<115:51:01,  4.22s/it]

training loss: 1.5122466087341309
training loss: 1.5524790287017822
training loss: 1.7867995500564575


training:   1%|          | 1172/100000 [1:24:58<115:47:55,  4.22s/it]

training loss: 1.5533323287963867
training loss: 1.4067003726959229
training loss: 1.5015864372253418


training:   1%|          | 1175/100000 [1:25:10<115:47:39,  4.22s/it]

training loss: 1.4109811782836914
training loss: 1.619457483291626
training loss: 1.4733154773712158


training:   1%|          | 1178/100000 [1:25:23<115:47:24,  4.22s/it]

training loss: 1.5129661560058594
training loss: 1.5453397035598755
training loss: 1.4569050073623657


training:   1%|          | 1181/100000 [1:25:36<115:47:53,  4.22s/it]

training loss: 1.3413565158843994
training loss: 1.561425805091858
training loss: 1.4559944868087769


training:   1%|          | 1184/100000 [1:25:48<115:45:54,  4.22s/it]

training loss: 1.4945582151412964
training loss: 1.4937095642089844
training loss: 1.5208348035812378


training:   1%|          | 1187/100000 [1:26:01<115:47:50,  4.22s/it]

training loss: 1.4706872701644897
training loss: 1.5747127532958984
training loss: 1.5688235759735107


training:   1%|          | 1190/100000 [1:26:14<115:48:17,  4.22s/it]

training loss: 1.6023777723312378
training loss: 1.6577023267745972
training loss: 1.4736779928207397


training:   1%|          | 1193/100000 [1:26:26<115:47:27,  4.22s/it]

training loss: 1.5185470581054688
training loss: 1.5173319578170776
training loss: 1.5871388912200928


training:   1%|          | 1196/100000 [1:26:39<115:49:49,  4.22s/it]

training loss: 1.563650369644165
training loss: 1.5745224952697754
training loss: 1.7137855291366577


training:   1%|          | 1199/100000 [1:26:51<115:49:28,  4.22s/it]

training loss: 1.5805418491363525
training loss: 1.4814003705978394
training loss: 1.5507709980010986
validation loss: 1.483513593673706


training:   1%|          | 1202/100000 [1:27:04<116:38:15,  4.25s/it]

training loss: 1.5535598993301392
training loss: 1.5367296934127808
training loss: 1.4967972040176392


training:   1%|          | 1205/100000 [1:27:17<116:23:20,  4.24s/it]

training loss: 1.6167888641357422
training loss: 1.4429727792739868
training loss: 1.510748267173767


training:   1%|          | 1208/100000 [1:27:30<116:11:24,  4.23s/it]

training loss: 1.5172088146209717
training loss: 1.546664834022522
training loss: 1.704380989074707


training:   1%|          | 1211/100000 [1:27:42<116:05:06,  4.23s/it]

training loss: 1.4926480054855347
training loss: 1.4772305488586426
training loss: 1.6252479553222656


training:   1%|          | 1214/100000 [1:27:55<115:58:12,  4.23s/it]

training loss: 1.479278802871704
training loss: 1.593182921409607
training loss: 1.6515552997589111


training:   1%|          | 1217/100000 [1:28:08<115:54:59,  4.22s/it]

training loss: 1.707751989364624
training loss: 1.6054655313491821
training loss: 1.5866377353668213


training:   1%|          | 1220/100000 [1:28:20<115:53:02,  4.22s/it]

training loss: 1.558516263961792
training loss: 1.4364980459213257
training loss: 1.543900489807129


training:   1%|          | 1223/100000 [1:28:33<115:50:48,  4.22s/it]

training loss: 1.5333678722381592
training loss: 1.4742045402526855
training loss: 1.493735671043396


training:   1%|          | 1226/100000 [1:28:46<115:49:15,  4.22s/it]

training loss: 1.5718945264816284
training loss: 1.4904329776763916
training loss: 1.578902006149292


training:   1%|          | 1229/100000 [1:28:58<115:47:24,  4.22s/it]

training loss: 1.4509357213974
training loss: 1.4931144714355469
training loss: 1.4850523471832275


training:   1%|          | 1232/100000 [1:29:11<115:46:33,  4.22s/it]

training loss: 1.5146867036819458
training loss: 1.5065908432006836
training loss: 1.4707849025726318


training:   1%|          | 1235/100000 [1:29:24<115:46:43,  4.22s/it]

training loss: 1.6231824159622192
training loss: 1.6222561597824097
training loss: 1.5449941158294678


training:   1%|          | 1238/100000 [1:29:36<115:44:59,  4.22s/it]

training loss: 1.5656158924102783
training loss: 1.4513351917266846
training loss: 1.4717463254928589


training:   1%|          | 1241/100000 [1:29:49<115:44:58,  4.22s/it]

training loss: 1.4309589862823486
training loss: 1.5492594242095947
training loss: 1.524515986442566


training:   1%|          | 1244/100000 [1:30:02<115:45:43,  4.22s/it]

training loss: 1.5301549434661865
training loss: 1.4683817625045776
training loss: 1.498230218887329


training:   1%|          | 1247/100000 [1:30:14<115:45:23,  4.22s/it]

training loss: 1.5232727527618408
training loss: 1.469929814338684
training loss: 1.593554139137268


training:   1%|▏         | 1250/100000 [1:30:27<115:45:36,  4.22s/it]

training loss: 1.5521152019500732
training loss: 1.5170003175735474
training loss: 1.5843011140823364


training:   1%|▏         | 1253/100000 [1:30:40<115:46:05,  4.22s/it]

training loss: 1.4958019256591797
training loss: 1.5671075582504272
training loss: 1.5234445333480835


training:   1%|▏         | 1256/100000 [1:30:52<115:46:27,  4.22s/it]

training loss: 1.5564243793487549
training loss: 1.6378698348999023
training loss: 1.181114673614502


training:   1%|▏         | 1259/100000 [1:31:05<115:45:39,  4.22s/it]

training loss: 1.5583915710449219
training loss: 1.5659594535827637
training loss: 1.5211915969848633


training:   1%|▏         | 1262/100000 [1:31:18<115:47:54,  4.22s/it]

training loss: 1.5339717864990234
training loss: 1.51566481590271
training loss: 1.3897089958190918


training:   1%|▏         | 1265/100000 [1:31:30<115:46:40,  4.22s/it]

training loss: 1.5719141960144043
training loss: 1.5473148822784424
training loss: 1.629685401916504


training:   1%|▏         | 1268/100000 [1:31:43<115:47:01,  4.22s/it]

training loss: 1.559334397315979
training loss: 1.7804794311523438
training loss: 1.5335230827331543


training:   1%|▏         | 1271/100000 [1:31:56<115:43:16,  4.22s/it]

training loss: 1.6933802366256714
training loss: 1.6169073581695557
training loss: 1.5637246370315552


training:   1%|▏         | 1274/100000 [1:32:08<115:45:38,  4.22s/it]

training loss: 1.488849401473999
training loss: 1.6658787727355957
training loss: 1.4538047313690186


training:   1%|▏         | 1277/100000 [1:32:21<115:44:18,  4.22s/it]

training loss: 1.7797315120697021
training loss: 1.5061204433441162
training loss: 1.4471107721328735


training:   1%|▏         | 1280/100000 [1:32:34<115:44:22,  4.22s/it]

training loss: 1.5535837411880493
training loss: 1.4678776264190674
training loss: 1.4732835292816162


training:   1%|▏         | 1283/100000 [1:32:46<115:43:04,  4.22s/it]

training loss: 1.500708818435669
training loss: 1.4518523216247559
training loss: 1.5469167232513428


training:   1%|▏         | 1286/100000 [1:32:59<115:41:26,  4.22s/it]

training loss: 1.5126848220825195
training loss: 1.5926328897476196
training loss: 1.381255865097046


training:   1%|▏         | 1289/100000 [1:33:12<115:41:30,  4.22s/it]

training loss: 1.5001987218856812
training loss: 1.5080933570861816
training loss: 1.540795087814331


training:   1%|▏         | 1292/100000 [1:33:24<115:40:47,  4.22s/it]

training loss: 1.3969439268112183
training loss: 1.4343318939208984
training loss: 1.596059799194336


training:   1%|▏         | 1295/100000 [1:33:37<115:40:41,  4.22s/it]

training loss: 1.4544203281402588
training loss: 1.4300537109375
training loss: 1.5860660076141357


training:   1%|▏         | 1298/100000 [1:33:50<115:41:33,  4.22s/it]

training loss: 1.661811351776123
training loss: 1.471146821975708
training loss: 1.4651060104370117


training:   1%|▏         | 1298/100000 [1:34:00<115:41:33,  4.22s/it]

training loss: 1.5133649110794067


training:   1%|▏         | 1301/100000 [1:34:03<116:29:26,  4.25s/it]

validation loss: 1.6008939743041992
training loss: 1.4538204669952393
training loss: 1.6352307796478271


training:   1%|▏         | 1304/100000 [1:34:15<116:14:58,  4.24s/it]

training loss: 1.5658962726593018
training loss: 1.468000888824463
training loss: 1.4167709350585938


training:   1%|▏         | 1307/100000 [1:34:28<116:03:07,  4.23s/it]

training loss: 1.458908200263977
training loss: 1.518071174621582
training loss: 1.5417633056640625


training:   1%|▏         | 1310/100000 [1:34:40<115:55:14,  4.23s/it]

training loss: 1.5572932958602905
training loss: 1.5664504766464233
training loss: 1.6170209646224976


training:   1%|▏         | 1313/100000 [1:34:53<115:50:41,  4.23s/it]

training loss: 1.511712670326233
training loss: 1.5347144603729248
training loss: 1.6257820129394531


training:   1%|▏         | 1316/100000 [1:35:06<115:49:27,  4.23s/it]

training loss: 1.564302921295166
training loss: 1.4305438995361328
training loss: 1.4669783115386963


training:   1%|▏         | 1319/100000 [1:35:18<115:43:29,  4.22s/it]

training loss: 1.5160260200500488
training loss: 1.4128254652023315
training loss: 1.594840407371521


training:   1%|▏         | 1322/100000 [1:35:31<115:41:54,  4.22s/it]

training loss: 1.621211051940918
training loss: 1.5753569602966309
training loss: 1.5099067687988281


training:   1%|▏         | 1325/100000 [1:35:44<115:43:06,  4.22s/it]

training loss: 1.443124532699585
training loss: 1.4421474933624268
training loss: 1.5910875797271729


training:   1%|▏         | 1328/100000 [1:35:56<115:41:49,  4.22s/it]

training loss: 1.3498531579971313
training loss: 1.3604907989501953
training loss: 1.6252481937408447


training:   1%|▏         | 1331/100000 [1:36:09<115:42:00,  4.22s/it]

training loss: 1.6386860609054565
training loss: 1.5644512176513672
training loss: 1.5538976192474365


training:   1%|▏         | 1334/100000 [1:36:22<115:40:52,  4.22s/it]

training loss: 1.607291340827942
training loss: 1.5185253620147705
training loss: 1.2668741941452026


training:   1%|▏         | 1337/100000 [1:36:34<115:39:10,  4.22s/it]

training loss: 1.6242175102233887
training loss: 1.469653844833374
training loss: 1.408759593963623


training:   1%|▏         | 1340/100000 [1:36:47<115:40:17,  4.22s/it]

training loss: 1.5011125802993774
training loss: 1.469626784324646
training loss: 1.587726354598999


training:   1%|▏         | 1343/100000 [1:37:00<115:39:41,  4.22s/it]

training loss: 1.6079057455062866
training loss: 1.5423381328582764
training loss: 1.4842451810836792


training:   1%|▏         | 1346/100000 [1:37:12<115:38:00,  4.22s/it]

training loss: 1.4839351177215576
training loss: 1.5106886625289917
training loss: 1.3006619215011597


training:   1%|▏         | 1349/100000 [1:37:25<115:36:09,  4.22s/it]

training loss: 1.5769284963607788
training loss: 1.4846223592758179
training loss: 1.4798388481140137


training:   1%|▏         | 1352/100000 [1:37:38<115:38:04,  4.22s/it]

training loss: 1.5107190608978271
training loss: 1.487955927848816
training loss: 1.5444269180297852


training:   1%|▏         | 1355/100000 [1:37:50<115:37:43,  4.22s/it]

training loss: 1.5346887111663818
training loss: 1.5532104969024658
training loss: 1.5908831357955933


training:   1%|▏         | 1358/100000 [1:38:03<115:37:38,  4.22s/it]

training loss: 1.5180963277816772
training loss: 1.396794319152832
training loss: 1.4934512376785278


training:   1%|▏         | 1361/100000 [1:38:16<115:41:42,  4.22s/it]

training loss: 1.4936747550964355
training loss: 1.5772737264633179
training loss: 1.567052960395813


training:   1%|▏         | 1364/100000 [1:38:28<115:40:17,  4.22s/it]

training loss: 1.5301246643066406
training loss: 1.4527908563613892
training loss: 1.4227088689804077


training:   1%|▏         | 1367/100000 [1:38:41<115:39:35,  4.22s/it]

training loss: 1.589822769165039
training loss: 1.6062625646591187
training loss: 1.4300765991210938


training:   1%|▏         | 1370/100000 [1:38:54<115:37:40,  4.22s/it]

training loss: 1.5494664907455444
training loss: 1.5448013544082642
training loss: 1.4396190643310547


training:   1%|▏         | 1373/100000 [1:39:06<115:36:03,  4.22s/it]

training loss: 1.6862082481384277
training loss: 1.5659031867980957
training loss: 1.5629304647445679


training:   1%|▏         | 1376/100000 [1:39:19<115:35:58,  4.22s/it]

training loss: 1.5714740753173828
training loss: 1.5323517322540283
training loss: 1.5302029848098755


training:   1%|▏         | 1379/100000 [1:39:32<115:37:35,  4.22s/it]

training loss: 1.4796993732452393
training loss: 1.4849486351013184
training loss: 1.650251865386963


training:   1%|▏         | 1382/100000 [1:39:44<115:38:07,  4.22s/it]

training loss: 1.6444025039672852
training loss: 1.3922030925750732
training loss: 1.7827262878417969


training:   1%|▏         | 1385/100000 [1:39:57<115:38:18,  4.22s/it]

training loss: 1.46766996383667
training loss: 1.4505455493927002
training loss: 1.6140164136886597


training:   1%|▏         | 1388/100000 [1:40:10<115:37:23,  4.22s/it]

training loss: 1.5407003164291382
training loss: 1.614007830619812
training loss: 1.481501579284668


training:   1%|▏         | 1391/100000 [1:40:22<115:39:30,  4.22s/it]

training loss: 1.489086389541626
training loss: 1.5271047353744507
training loss: 1.5120257139205933


training:   1%|▏         | 1394/100000 [1:40:35<115:38:19,  4.22s/it]

training loss: 1.5516963005065918
training loss: 1.6478536128997803
training loss: 1.4762539863586426


training:   1%|▏         | 1397/100000 [1:40:48<115:36:47,  4.22s/it]

training loss: 1.5369696617126465
training loss: 1.5265552997589111
training loss: 1.5793695449829102


training:   1%|▏         | 1400/100000 [1:41:00<115:36:38,  4.22s/it]

training loss: 1.405199408531189
training loss: 1.4372113943099976
validation loss: 1.516822338104248
training loss: 1.5702067613601685


training:   1%|▏         | 1403/100000 [1:41:13<116:24:40,  4.25s/it]

training loss: 1.544120192527771
training loss: 1.5104769468307495
training loss: 1.5686168670654297


training:   1%|▏         | 1406/100000 [1:41:26<116:08:48,  4.24s/it]

training loss: 1.5581268072128296
training loss: 1.6366419792175293
training loss: 1.4887677431106567


training:   1%|▏         | 1409/100000 [1:41:39<115:59:19,  4.24s/it]

training loss: 1.4532220363616943
training loss: 1.556667685508728
training loss: 1.4511754512786865


training:   1%|▏         | 1412/100000 [1:41:51<115:53:08,  4.23s/it]

training loss: 1.442138910293579
training loss: 1.516068935394287


training:   1%|▏         | 1413/100000 [1:41:58<118:34:28,  4.33s/it]


KeyboardInterrupt: ignored