In [0]:
# Author: Arman Kabiri
# Date: Feb. 18, 2020
# Email: Arman.Kabiri94@gmail.com

In [2]:
# Mount Google Drive inside the Colab runtime at /gdrive. On first run this
# prompts for an authorization code (see the cell output below).
from google.colab import drive
drive.mount('/gdrive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [3]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): the relative 'Data/...' paths used later, and presumably the
# project-local modules imported below, depend on this chdir -- confirm.
import os
os.chdir('/gdrive/My Drive/NLP_Stuff/My_Language_Model')
# Echo the working directory to confirm the chdir took effect.
!pwd

/gdrive/My Drive/NLP_Stuff/My_Language_Model


In [0]:
import argparse
import math
import os.path as path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from CorpusReader import CorpusReader
from Dictionary import Dictionary
from EmbeddingsLoader import EmbeddingsLoader
from Lang_Model import LanguageModel

In [0]:
class Args:
    """Run configuration, held as plain class attributes.

    ``main()`` and ``train()`` read these values through the module-level
    ``args`` instance created right after the class.
    """

    # --- Input / output files (relative to the working directory) ---
    corpus_train_file = 'Data/corpus-test.txt'
    corpus_valid_file = ''  # not read by the code visible in this notebook
    embeddings_file = 'Data/English_Wiki_1Billion_embeddings.bin'
    output_model_path = 'Data/model.bin'
    output_word2id_path = 'Data/word2id.txt'
    output_id2word_path = 'Data/id2word.txt'

    # --- Model (forwarded to the LanguageModel constructor in main()) ---
    embeddings_dim = 300
    hidden_size = 300
    n_layers = 2
    dropout_probablity = 0.25
    bidirectional_model = False
    tie_weights = False
    freez_embeddings = False

    # --- Optimisation ---
    epochs = 2
    batch_size = 50
    seq_len = 20
    lr = 1e-3
    clip_grad = 5  # used as max_norm for gradient-norm clipping in train()

    # --- Runtime ---
    seed = 120
    print_steps = 50  # log loss/perplexity every this many steps
    gpu = True


args = Args()

In [6]:
# Sanity check: show whether this runtime exposes a CUDA device (Args.gpu is True).
torch.cuda.is_available()

True

In [8]:
def main():
    """Build the vocabulary, construct the language model and train it.

    All settings are read from the module-level ``args`` object. The steps are:
    check the GPU setting against the available hardware, seed torch, build and
    save the dictionary from the training corpus, optionally load pretrained
    embeddings, build the model, then train for ``args.epochs`` epochs, saving
    the model to ``args.output_model_path`` after every epoch.

    Raises:
        SystemExit: if ``args.gpu`` is set but no CUDA device is available.
    """
    torch.set_num_threads(8)

    cuda_available = torch.cuda.is_available()
    if cuda_available and not args.gpu:
        print("WARNING: You have a CUDA device, so you should probably run with --gpu")
    elif args.gpu and not cuda_available:
        print("You do not have a GPU device, so you should run CPU without --gpu option.")
        # `exit()` is the interactive-shell helper installed by `site`/IPython;
        # it is not a dependable way to abort a program (and would report
        # success). Raise SystemExit explicitly with a non-zero status instead.
        raise SystemExit(1)

    torch.manual_seed(args.seed)
    corpus_train_reader = CorpusReader(args.corpus_train_file, 100000000)  # 100MB

    print("Generating Dictionaries...")
    dictionary = Dictionary()
    dictionary.build_dictionary(corpus_train_reader)

    print("Saving Dictionary...")
    save_dictionary(dictionary, args.output_id2word_path, args.output_word2id_path)

    print("Loading Embeddings...")
    embeddings_matrix = None
    # Truthiness check: both None and '' mean "no pretrained embeddings"
    # (this config already uses '' for an unset path, e.g. corpus_valid_file).
    if args.embeddings_file:
        emb_loader = EmbeddingsLoader()
        embeddings_matrix = emb_loader.get_embeddings_matrix(args.embeddings_file, dictionary, args.embeddings_dim)

    model = LanguageModel(n_layers=args.n_layers, hidden_size=args.hidden_size, n_vocab=dictionary.get_dic_size(),
                          input_size=args.embeddings_dim, dropout_prob=args.dropout_probablity,
                          bidirectional=args.bidirectional_model, pret_emb_matrix=embeddings_matrix,
                          freez_emb=args.freez_embeddings, tie_weights=args.tie_weights, use_gpu=args.gpu)

    # Report the element count of every parameter tensor, then the total.
    param_counts = [int(p.numel()) for p in model.parameters()]
    print(param_counts)
    print(sum(param_counts))

    # Put the model into train mode and move it to the GPU if requested.
    model.train()
    if args.gpu:
        model.cuda()

    # Optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    print("Training starts ...")
    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}:")
        train(corpus_train_reader, dictionary, model, optimizer, criterion, args)
        # Checkpoint after every epoch; each save overwrites the previous file.
        print(f"Saving Model at epoch {epoch}...")
        model.save_model(args.output_model_path)

def train(corpus_train_reader, dictionary, model, optimizer, criterion, args):
    """Run one pass over the training batches, updating ``model`` in place.

    Args:
        corpus_train_reader: object whose ``batchify(dictionary, batch_size, seq_len)``
            yields ``(x, y)`` pairs of numpy arrays.
        dictionary: vocabulary object; ``get_dic_size()`` gives the number of
            output classes used to flatten the logits.
        model: the language model; must provide ``init_hidden(batch_size)`` and
            return ``(y_hat, hidden)`` when called with ``(x, hidden)``.
        optimizer: optimizer bound to ``model.parameters()``.
        criterion: loss taking ``(logits, targets)``.
        args: configuration with ``batch_size``, ``seq_len``, ``gpu``,
            ``clip_grad`` and ``print_steps``.
    """
    batch_generator = corpus_train_reader.batchify(dictionary, args.batch_size, args.seq_len)
    hidden = model.init_hidden(args.batch_size)
    vocab_size = dictionary.get_dic_size()

    for step, (x, y) in enumerate(tqdm(batch_generator), start=1):
        x = torch.from_numpy(x)
        y = torch.from_numpy(y)

        if args.gpu:
            x = x.cuda()
            y = y.cuda()

        # Carry the hidden state's values across batches but cut it out of the
        # previous batch's autograd graph, so backward() stops at this batch.
        hidden = detach_hidden(hidden)
        model.zero_grad()

        # Call the module itself rather than .forward() so registered hooks run.
        y_hat, hidden = model(x, hidden)

        # Flatten the targets with -1 instead of batch_size * seq_len so the
        # loss does not depend on every batch having the nominal size.
        # assumes y holds one target id per logit row -- TODO confirm in batchify
        loss = criterion(y_hat.view(-1, vocab_size), y.reshape(-1).long())
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad)

        optimizer.step()

        if step % args.print_steps == 0:
            loss_value = loss.item()
            try:
                perplexity = math.exp(loss_value)
            except OverflowError:
                # A diverged loss must not crash training from a log statement.
                perplexity = float('inf')
            print(f"Step {step},     Loss = {loss_value},    PPL = {perplexity}")


def detach_hidden(hidden: tuple):
    """Return a new tuple holding ``.detach()`` of every element of ``hidden``.

    The detached tensors keep the same values but no longer belong to the
    autograd graph that produced them.
    """
    detached_states = []
    for state in hidden:
        detached_states.append(state.detach())
    return tuple(detached_states)


def save_dictionary(dictionary: "Dictionary", output_id2word_path, output_word2id_path):
    """Write the vocabulary mappings to two UTF-8 text files.

    ``output_word2id_path`` receives one ``word<TAB>id`` line per entry of
    ``dictionary.word2id``; ``output_id2word_path`` receives one word per line,
    in the iteration order of ``dictionary.id2word``.

    Args:
        dictionary: object exposing a ``word2id`` mapping and an ``id2word`` iterable.
        output_id2word_path: destination path for the id -> word listing.
        output_word2id_path: destination path for the word -> id table.

    Raises:
        ValueError: if a word contains a tab, which would corrupt the
            tab-separated word2id file. Raised before any file is opened.
    """
    # Validate up front so a bad entry cannot leave a half-written file behind
    # (the previous `exit()` call fired mid-write and gave no explanation).
    for word in dictionary.word2id:
        if '\t' in word:
            raise ValueError(
                f"Cannot save dictionary: word {word!r} contains a tab, "
                "which is the field separator of the word2id file."
            )

    # Explicit encoding: the vocabulary may hold non-ASCII tokens and the
    # default encoding of open() depends on the platform locale.
    with open(output_word2id_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{word}\t{word_id}\n" for word, word_id in dictionary.word2id.items())

    with open(output_id2word_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{word}\n" for word in dictionary.id2word)


# Entry point: run the full pipeline when this cell is executed (the output
# below shows it running in the notebook) or when the code is run as a script.
if __name__ == '__main__':
    main()


0it [00:00, ?it/s]

Generating Dictionaries...
Building dictionaries...


2it [00:05,  3.68s/it]


Dictionaries are built - Vocab size is 254732
Saving Dictionary...
Loading Embeddings...
Loading pretrained embeddings...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Pretrained embeddings are loaded.
[76419600, 360000, 360000, 1200, 1200, 360000, 360000, 1200, 1200, 76419600, 254732]
154538732


0it [00:00, ?it/s]

Training starts ...
Epoch 1:


51it [00:20,  4.17it/s]

Step 50,     Loss = 8.194220542907715,    PPL = 3619.9682820171465


100it [00:32,  3.84it/s]

Step 100,     Loss = 7.698996543884277,    PPL = 2206.1331230368028


150it [00:44,  3.76it/s]

Step 150,     Loss = 7.758018970489502,    PPL = 2340.2638782798062


200it [00:57,  3.72it/s]

Step 200,     Loss = 7.694796562194824,    PPL = 2196.88683502764


250it [01:10,  3.68it/s]

Step 250,     Loss = 7.827996253967285,    PPL = 2509.895139012518


300it [01:23,  3.56it/s]

Step 300,     Loss = 7.741537094116211,    PPL = 2302.0080681428885


350it [01:36,  3.46it/s]

Step 350,     Loss = 7.4575514793396,    PPL = 1732.8998175142024


400it [01:50,  3.37it/s]

Step 400,     Loss = 7.664451599121094,    PPL = 2131.2236964650683


450it [02:04,  3.44it/s]

Step 450,     Loss = 7.59600305557251,    PPL = 1990.2251570660571


500it [02:18,  3.50it/s]

Step 500,     Loss = 7.459661960601807,    PPL = 1736.560932106297


550it [02:32,  3.50it/s]

Step 550,     Loss = 7.55341100692749,    PPL = 1907.2372480874071


600it [02:45,  3.49it/s]

Step 600,     Loss = 7.449545383453369,    PPL = 1719.081444832558


650it [02:59,  3.45it/s]

Step 650,     Loss = 7.32613468170166,    PPL = 1519.4970663722743


700it [03:13,  3.42it/s]

Step 700,     Loss = 7.3961615562438965,    PPL = 1629.7168323813717


750it [03:27,  3.44it/s]

Step 750,     Loss = 7.53299617767334,    PPL = 1868.6960696517583


800it [03:40,  3.46it/s]

Step 800,     Loss = 7.3893585205078125,    PPL = 1618.667437858874


850it [03:54,  3.46it/s]

Step 850,     Loss = 7.350616931915283,    PPL = 1557.1568913506205


900it [04:08,  3.45it/s]

Step 900,     Loss = 7.270284652709961,    PPL = 1436.9594292213826


950it [04:22,  3.45it/s]

Step 950,     Loss = 7.521955490112305,    PPL = 1848.177856217046


1000it [04:35,  3.43it/s]

Step 1000,     Loss = 7.257656097412109,    PPL = 1418.926810225254


1050it [04:49,  3.43it/s]

Step 1050,     Loss = 7.410968780517578,    PPL = 1654.027960997078


1100it [05:03,  3.44it/s]

Step 1100,     Loss = 7.096799373626709,    PPL = 1208.094221793767


1150it [05:17,  3.45it/s]

Step 1150,     Loss = 7.1412353515625,    PPL = 1262.9876595550552


1200it [05:31,  3.43it/s]

Step 1200,     Loss = 6.936225414276123,    PPL = 1028.8792831983737


1250it [05:44,  3.46it/s]

Step 1250,     Loss = 7.112265586853027,    PPL = 1226.924102755821


1300it [05:58,  3.43it/s]

Step 1300,     Loss = 7.353293418884277,    PPL = 1561.3301838692116


1350it [06:12,  3.45it/s]

Step 1350,     Loss = 6.926366806030273,    PPL = 1018.7858010069074


1400it [06:26,  3.45it/s]

Step 1400,     Loss = 7.056461811065674,    PPL = 1160.3324189009209


1450it [06:39,  3.45it/s]

Step 1450,     Loss = 7.0329718589782715,    PPL = 1133.3938969325675


1500it [06:53,  3.44it/s]

Step 1500,     Loss = 7.176613807678223,    PPL = 1308.470016992468


1550it [07:07,  3.44it/s]

Step 1550,     Loss = 7.26697301864624,    PPL = 1432.2086162462426


1600it [07:21,  3.45it/s]

Step 1600,     Loss = 7.054469108581543,    PPL = 1158.0225238385303


1650it [07:35,  3.44it/s]

Step 1650,     Loss = 7.021136283874512,    PPL = 1120.0585994685528


1700it [07:48,  3.45it/s]

Step 1700,     Loss = 7.1098456382751465,    PPL = 1223.9585991485108


1750it [08:02,  3.44it/s]

Step 1750,     Loss = 6.797718048095703,    PPL = 895.800783218441


1800it [08:16,  3.45it/s]

Step 1800,     Loss = 6.90610933303833,    PPL = 998.3554078823412


1850it [08:30,  3.45it/s]

Step 1850,     Loss = 7.1053571701049805,    PPL = 1218.477210656013


1900it [08:44,  3.43it/s]

Step 1900,     Loss = 6.841333389282227,    PPL = 935.7360036218874


1950it [08:57,  3.45it/s]

Step 1950,     Loss = 7.034923553466797,    PPL = 1135.608095571335


2000it [09:11,  3.44it/s]

Step 2000,     Loss = 6.6846089363098145,    PPL = 799.9977669166263


2050it [09:25,  3.43it/s]

Step 2050,     Loss = 7.015788555145264,    PPL = 1114.084817237775


2100it [09:39,  3.43it/s]

Step 2100,     Loss = 7.51685905456543,    PPL = 1838.7826981380163


2150it [09:53,  3.44it/s]

Step 2150,     Loss = 7.157523155212402,    PPL = 1283.7273984574804


2200it [10:06,  3.42it/s]

Step 2200,     Loss = 6.875802040100098,    PPL = 968.5518715886136


2250it [10:20,  3.43it/s]

Step 2250,     Loss = 6.930720329284668,    PPL = 1023.2307773161733


2300it [10:34,  3.43it/s]

Step 2300,     Loss = 7.232219696044922,    PPL = 1383.2895812507081


2350it [10:48,  3.43it/s]

Step 2350,     Loss = 6.876251697540283,    PPL = 968.9874860751868


2400it [11:02,  3.43it/s]

Step 2400,     Loss = 6.894818305969238,    PPL = 987.1463499199216


2450it [11:16,  3.44it/s]

Step 2450,     Loss = 6.941780090332031,    PPL = 1034.6102764872398


2500it [11:29,  3.45it/s]

Step 2500,     Loss = 6.882734298706055,    PPL = 975.289449971763


2550it [11:43,  3.44it/s]

Step 2550,     Loss = 7.0507707595825195,    PPL = 1153.7476622342651


2600it [11:57,  3.43it/s]

Step 2600,     Loss = 6.989015579223633,    PPL = 1084.6531953126296


2650it [12:11,  3.44it/s]

Step 2650,     Loss = 6.950427532196045,    PPL = 1043.5958036234078


2700it [12:25,  3.44it/s]

Step 2700,     Loss = 6.791537284851074,    PPL = 890.2811260817325


2750it [12:38,  3.44it/s]

Step 2750,     Loss = 6.980000019073486,    PPL = 1074.918387498213


2800it [12:52,  3.44it/s]

Step 2800,     Loss = 7.140008449554443,    PPL = 1261.4390476511826


2850it [13:06,  3.43it/s]

Step 2850,     Loss = 6.726790904998779,    PPL = 834.4650868333098


2900it [13:20,  3.44it/s]

Step 2900,     Loss = 6.910348415374756,    PPL = 1002.5965014788748


2950it [13:34,  3.42it/s]

Step 2950,     Loss = 6.752592086791992,    PPL = 856.2754286183986


3000it [13:47,  3.43it/s]

Step 3000,     Loss = 6.915894508361816,    PPL = 1008.1724429569489


3050it [14:01,  3.44it/s]

Step 3050,     Loss = 6.672049522399902,    PPL = 790.0130958902243


3100it [14:15,  3.42it/s]

Step 3100,     Loss = 6.606647968292236,    PPL = 739.9983587513955


3150it [14:29,  3.43it/s]

Step 3150,     Loss = 6.974459648132324,    PPL = 1068.9794081660111


3200it [14:43,  3.44it/s]

Step 3200,     Loss = 6.816103935241699,    PPL = 912.4232161595942


3250it [14:57,  3.44it/s]

Step 3250,     Loss = 7.135544300079346,    PPL = 1255.8203458821024


3300it [15:10,  3.44it/s]

Step 3300,     Loss = 6.9972615242004395,    PPL = 1093.634163275116


3350it [15:24,  3.44it/s]

Step 3350,     Loss = 6.972865104675293,    PPL = 1067.2762322998558


3400it [15:38,  3.44it/s]

Step 3400,     Loss = 6.659629821777344,    PPL = 780.2620476418028


3450it [15:52,  3.43it/s]

Step 3450,     Loss = 6.833681583404541,    PPL = 928.6032573661762


3500it [16:06,  3.43it/s]

Step 3500,     Loss = 6.832452774047852,    PPL = 927.4628817904558


3550it [16:19,  3.42it/s]

Step 3550,     Loss = 6.800971031188965,    PPL = 898.719552801391


3600it [16:33,  3.43it/s]

Step 3600,     Loss = 6.596236228942871,    PPL = 732.333659387518


3650it [16:47,  3.42it/s]

Step 3650,     Loss = 6.903591632843018,    PPL = 995.845009817852


3700it [17:01,  3.43it/s]

Step 3700,     Loss = 6.72004508972168,    PPL = 828.8548834749258


3750it [17:15,  3.43it/s]

Step 3750,     Loss = 6.8269877433776855,    PPL = 922.4080935631323


3800it [17:29,  3.44it/s]

Step 3800,     Loss = 6.689234256744385,    PPL = 803.7065835561795


3850it [17:43,  3.45it/s]

Step 3850,     Loss = 6.605579376220703,    PPL = 739.2080247198323


3900it [17:56,  3.43it/s]

Step 3900,     Loss = 6.692007541656494,    PPL = 805.93858445478


3950it [18:10,  3.42it/s]

Step 3950,     Loss = 6.792820453643799,    PPL = 891.4242402868626


4000it [18:24,  3.43it/s]

Step 4000,     Loss = 6.655102729797363,    PPL = 776.7377130961077


4050it [18:38,  3.42it/s]

Step 4050,     Loss = 6.747227668762207,    PPL = 851.6943077801635


4100it [18:52,  3.41it/s]

Step 4100,     Loss = 6.727216720581055,    PPL = 834.8204907330373


4150it [19:06,  3.42it/s]

Step 4150,     Loss = 6.524796962738037,    PPL = 681.8413204007992


4200it [19:19,  3.43it/s]

Step 4200,     Loss = 6.914004325866699,    PPL = 1006.2686129131554


4250it [19:33,  3.43it/s]

Step 4250,     Loss = 6.825351238250732,    PPL = 920.8998024885304


4300it [19:47,  3.44it/s]

Step 4300,     Loss = 6.8130269050598145,    PPL = 909.6199774217023


4350it [20:01,  3.43it/s]

Step 4350,     Loss = 6.596581935882568,    PPL = 732.5868759825873


4400it [20:15,  3.45it/s]

Step 4400,     Loss = 6.658056735992432,    PPL = 779.0355934179189


4450it [20:29,  3.43it/s]

Step 4450,     Loss = 7.001528739929199,    PPL = 1098.3109074196211


4500it [20:43,  3.45it/s]

Step 4500,     Loss = 6.759183406829834,    PPL = 861.9380556010211


4550it [20:56,  3.44it/s]

Step 4550,     Loss = 6.4152655601501465,    PPL = 611.1030238826144


4600it [21:10,  3.42it/s]

Step 4600,     Loss = 6.583181381225586,    PPL = 722.8352897698906


4650it [21:24,  3.42it/s]

Step 4650,     Loss = 6.8214287757873535,    PPL = 917.2946826765819


4700it [21:38,  3.43it/s]

Step 4700,     Loss = 6.905519008636475,    PPL = 997.7662282440126


4750it [21:52,  3.42it/s]

Step 4750,     Loss = 6.639732837677002,    PPL = 764.890615766555


4800it [22:06,  3.43it/s]

Step 4800,     Loss = 6.677563190460205,    PPL = 794.3809963754159


4850it [22:20,  3.42it/s]

Step 4850,     Loss = 6.541160583496094,    PPL = 693.0905008330557


4900it [22:33,  3.42it/s]

Step 4900,     Loss = 6.709248065948486,    PPL = 819.9538563643166


4950it [22:47,  3.43it/s]

Step 4950,     Loss = 6.618265151977539,    PPL = 748.6451842448656


5000it [23:01,  3.43it/s]

Step 5000,     Loss = 6.3993988037109375,    PPL = 601.4833196114555


5050it [23:15,  3.42it/s]

Step 5050,     Loss = 6.709057331085205,    PPL = 819.7974774915515


5100it [23:29,  3.42it/s]

Step 5100,     Loss = 6.808146953582764,    PPL = 905.1918892844616


5150it [23:43,  3.43it/s]

Step 5150,     Loss = 6.708868980407715,    PPL = 819.6430826219068


5200it [23:56,  3.42it/s]

Step 5200,     Loss = 6.731900691986084,    PPL = 838.739938163889


5250it [24:10,  3.42it/s]

Step 5250,     Loss = 6.717061519622803,    PPL = 826.3856222669514


5300it [24:24,  3.41it/s]

Step 5300,     Loss = 6.891031265258789,    PPL = 983.4150562455437


5350it [24:38,  3.42it/s]

Step 5350,     Loss = 6.700817584991455,    PPL = 813.0703075755786


5400it [24:52,  3.43it/s]

Step 5400,     Loss = 6.7777862548828125,    PPL = 878.1226311386865


5450it [25:06,  3.43it/s]

Step 5450,     Loss = 6.75071907043457,    PPL = 854.6731117853928


5500it [25:20,  3.42it/s]

Step 5500,     Loss = 6.69844388961792,    PPL = 811.1426151305673


5550it [25:34,  3.43it/s]

Step 5550,     Loss = 6.656318187713623,    PPL = 777.6823790829246


5600it [25:47,  3.43it/s]

Step 5600,     Loss = 6.739644527435303,    PPL = 845.2602156241686


5650it [26:01,  3.43it/s]

Step 5650,     Loss = 6.586967468261719,    PPL = 725.5771943591493


5700it [26:15,  3.44it/s]

Step 5700,     Loss = 6.774685859680176,    PPL = 875.4043200425871


5750it [26:29,  3.43it/s]

Step 5750,     Loss = 6.663241863250732,    PPL = 783.085482630392


5800it [26:43,  3.44it/s]

Step 5800,     Loss = 6.744247913360596,    PPL = 849.1602443855613


5850it [26:57,  3.44it/s]

Step 5850,     Loss = 6.632091045379639,    PPL = 759.067757424102


5900it [27:10,  3.43it/s]

Step 5900,     Loss = 6.845841407775879,    PPL = 939.963841256592


5950it [27:24,  3.43it/s]

Step 5950,     Loss = 6.688084602355957,    PPL = 802.7831296834955


6000it [27:38,  3.44it/s]

Step 6000,     Loss = 6.323396682739258,    PPL = 557.4633062258948


6050it [27:52,  3.43it/s]

Step 6050,     Loss = 6.425118446350098,    PPL = 617.1539128422468


6100it [28:06,  3.43it/s]

Step 6100,     Loss = 6.777194023132324,    PPL = 877.6027330009916


6150it [28:20,  3.43it/s]

Step 6150,     Loss = 6.44540548324585,    PPL = 629.8019993529807


6200it [28:33,  3.43it/s]

Step 6200,     Loss = 6.728826999664307,    PPL = 836.165867632962


6250it [28:47,  3.43it/s]

Step 6250,     Loss = 6.607515811920166,    PPL = 740.6408403581768


6300it [29:01,  3.43it/s]

Step 6300,     Loss = 6.691573619842529,    PPL = 805.5889459856345


6350it [29:15,  3.43it/s]

Step 6350,     Loss = 6.419590473175049,    PPL = 613.7517148605987


6400it [29:29,  3.41it/s]

Step 6400,     Loss = 6.4349493980407715,    PPL = 623.2510443417002


6450it [29:43,  3.43it/s]

Step 6450,     Loss = 6.598054885864258,    PPL = 733.6667349020028


6500it [29:57,  3.42it/s]

Step 6500,     Loss = 6.48423957824707,    PPL = 654.740895672392


6550it [30:11,  3.43it/s]

Step 6550,     Loss = 6.723587512969971,    PPL = 831.7962449814281


6600it [30:24,  3.42it/s]

Step 6600,     Loss = 6.526026248931885,    PPL = 682.6800139139254


6650it [30:38,  3.41it/s]

Step 6650,     Loss = 6.4564900398254395,    PPL = 636.8219096689277


6700it [30:52,  3.42it/s]

Step 6700,     Loss = 6.559545516967773,    PPL = 705.9507790248952


6750it [31:06,  3.42it/s]

Step 6750,     Loss = 6.312115669250488,    PPL = 551.2098938813281


6800it [31:20,  3.41it/s]

Step 6800,     Loss = 6.379718780517578,    PPL = 589.7618318187068


6850it [31:34,  3.42it/s]

Step 6850,     Loss = 6.67225456237793,    PPL = 790.1750967658074


6900it [31:48,  3.43it/s]

Step 6900,     Loss = 6.474735736846924,    PPL = 648.5478176719914


6950it [32:02,  3.42it/s]

Step 6950,     Loss = 6.588151454925537,    PPL = 726.4367768475273


7000it [32:15,  3.42it/s]

Step 7000,     Loss = 6.7771735191345215,    PPL = 877.5847388209595


7050it [32:29,  3.41it/s]

Step 7050,     Loss = 6.720880508422852,    PPL = 829.5476136647267


7100it [32:43,  3.42it/s]

Step 7100,     Loss = 6.704314231872559,    PPL = 815.9183036472105


7150it [32:57,  3.43it/s]

Step 7150,     Loss = 6.577652931213379,    PPL = 718.8501569571604


7200it [33:11,  3.43it/s]

Step 7200,     Loss = 6.671263694763184,    PPL = 789.3925256286699


7250it [33:25,  3.43it/s]

Step 7250,     Loss = 6.855186462402344,    PPL = 948.7890263934075


7300it [33:39,  3.43it/s]

Step 7300,     Loss = 6.5770487785339355,    PPL = 718.4159928726198


7350it [33:52,  3.43it/s]

Step 7350,     Loss = 6.453771591186523,    PPL = 635.0930929297696


7400it [34:06,  3.42it/s]

Step 7400,     Loss = 6.703444480895996,    PPL = 815.2089664239053


7450it [34:20,  3.41it/s]

Step 7450,     Loss = 6.534774303436279,    PPL = 688.6783344487225


7500it [34:34,  3.42it/s]

Step 7500,     Loss = 6.608522415161133,    PPL = 741.386747181604


7550it [34:48,  3.43it/s]

Step 7550,     Loss = 6.695765972137451,    PPL = 808.9733479979047


7600it [35:02,  3.43it/s]

Step 7600,     Loss = 6.508730411529541,    PPL = 670.9740157314529


7650it [35:16,  3.43it/s]

Step 7650,     Loss = 6.487569808959961,    PPL = 656.9249686270355


7700it [35:30,  3.42it/s]

Step 7700,     Loss = 6.432750701904297,    PPL = 621.8822100552231


7750it [35:43,  3.44it/s]

Step 7750,     Loss = 6.549078941345215,    PPL = 698.6004254283064


7800it [35:57,  3.44it/s]

Step 7800,     Loss = 6.722917079925537,    PPL = 831.2387681888697


7850it [36:11,  3.44it/s]

Step 7850,     Loss = 6.371545791625977,    PPL = 584.96135874681


7900it [36:25,  3.42it/s]

Step 7900,     Loss = 6.566219806671143,    PPL = 710.6782577725584


7950it [36:39,  3.44it/s]

Step 7950,     Loss = 6.606497287750244,    PPL = 739.8868637978965


8000it [36:53,  3.42it/s]

Step 8000,     Loss = 6.670921802520752,    PPL = 789.1226845788333


8050it [37:06,  3.43it/s]

Step 8050,     Loss = 6.467450141906738,    PPL = 643.8399316727883


8100it [37:20,  3.43it/s]

Step 8100,     Loss = 6.673071384429932,    PPL = 790.8207928833017


8150it [37:34,  3.42it/s]

Step 8150,     Loss = 6.486629009246826,    PPL = 656.3072244373424


8200it [37:48,  3.42it/s]

Step 8200,     Loss = 6.562586307525635,    PPL = 708.1006945529052


8250it [38:02,  3.43it/s]

Step 8250,     Loss = 6.422163486480713,    PPL = 615.3329395730601


8300it [38:16,  3.43it/s]

Step 8300,     Loss = 6.445348262786865,    PPL = 629.7659628245317


8350it [38:30,  3.42it/s]

Step 8350,     Loss = 6.5443010330200195,    PPL = 695.2705379229511


8400it [38:44,  3.42it/s]

Step 8400,     Loss = 6.514657020568848,    PPL = 674.9624235909693


8450it [38:57,  3.41it/s]

Step 8450,     Loss = 6.760913372039795,    PPL = 863.4304689897676


8500it [39:11,  3.44it/s]

Step 8500,     Loss = 6.740929126739502,    PPL = 846.3467340300068


8550it [39:25,  3.42it/s]

Step 8550,     Loss = 6.562167167663574,    PPL = 707.8039635157124


8600it [39:39,  3.42it/s]

Step 8600,     Loss = 6.452557563781738,    PPL = 634.3225403409836


8650it [39:53,  3.43it/s]

Step 8650,     Loss = 6.4413838386535645,    PPL = 627.27424581815


8700it [40:07,  3.44it/s]

Step 8700,     Loss = 6.493497371673584,    PPL = 660.8304962713597


8750it [40:20,  3.43it/s]

Step 8750,     Loss = 6.631535530090332,    PPL = 758.6462007806545


8800it [40:34,  3.43it/s]

Step 8800,     Loss = 6.670666217803955,    PPL = 788.9210226529281


8850it [40:48,  3.42it/s]

Step 8850,     Loss = 6.2653093338012695,    PPL = 526.0042716967778


8900it [41:02,  3.42it/s]

Step 8900,     Loss = 6.715884685516357,    PPL = 825.4136755038157


8950it [41:16,  3.42it/s]

Step 8950,     Loss = 6.459047794342041,    PPL = 638.4528286407001


9000it [41:30,  3.43it/s]

Step 9000,     Loss = 6.481775283813477,    PPL = 653.1294077336321


9050it [41:44,  3.41it/s]

Step 9050,     Loss = 6.505629539489746,    PPL = 668.8966336791759


9100it [41:58,  3.42it/s]

Step 9100,     Loss = 6.318683624267578,    PPL = 554.8421307963632


9150it [42:12,  3.41it/s]

Step 9150,     Loss = 6.2897210121154785,    PPL = 539.0029328182725


9200it [42:26,  3.42it/s]

Step 9200,     Loss = 6.395978927612305,    PPL = 599.4298345166737


9250it [42:39,  3.42it/s]

Step 9250,     Loss = 6.558753967285156,    PPL = 705.392205008522


9300it [42:53,  3.42it/s]

Step 9300,     Loss = 6.619961261749268,    PPL = 749.916046113371


9350it [43:07,  3.42it/s]

Step 9350,     Loss = 6.280036926269531,    PPL = 533.8083750135625


9400it [43:21,  3.42it/s]

Step 9400,     Loss = 6.493138790130615,    PPL = 660.5935771323198


9450it [43:35,  3.43it/s]

Step 9450,     Loss = 6.556787014007568,    PPL = 704.0060951524517


9500it [43:49,  3.42it/s]

Step 9500,     Loss = 6.490755558013916,    PPL = 659.021103831926


9550it [44:03,  3.42it/s]

Step 9550,     Loss = 6.366374492645264,    PPL = 581.9441568191081


9600it [44:16,  3.42it/s]

Step 9600,     Loss = 6.439443588256836,    PPL = 626.0583566599963


9650it [44:30,  3.43it/s]

Step 9650,     Loss = 6.426235675811768,    PPL = 617.8438006858111


9700it [44:44,  3.43it/s]

Step 9700,     Loss = 6.674450874328613,    PPL = 791.9124749878476


9750it [44:58,  3.42it/s]

Step 9750,     Loss = 6.306886672973633,    PPL = 548.3351419821611


9800it [45:12,  3.42it/s]

Step 9800,     Loss = 6.526181221008301,    PPL = 682.785818451372


9850it [45:26,  3.43it/s]

Step 9850,     Loss = 6.179810047149658,    PPL = 482.90021936711105


9900it [45:40,  3.43it/s]

Step 9900,     Loss = 6.284892559051514,    PPL = 536.4066555015258


9950it [45:54,  3.43it/s]

Step 9950,     Loss = 6.757978916168213,    PPL = 860.9004842600915


10000it [46:07,  3.42it/s]

Step 10000,     Loss = 6.22663688659668,    PPL = 506.05071249955506


10050it [46:21,  3.41it/s]

Step 10050,     Loss = 6.41163969039917,    PPL = 608.8912561276503


10100it [46:35,  3.43it/s]

Step 10100,     Loss = 6.619841575622559,    PPL = 749.8262969374275


10150it [46:49,  3.41it/s]

Step 10150,     Loss = 6.55705451965332,    PPL = 704.1944459488763


10200it [47:03,  3.42it/s]

Step 10200,     Loss = 6.545527935028076,    PPL = 696.1240902475317


10250it [47:17,  3.43it/s]

Step 10250,     Loss = 6.427412509918213,    PPL = 618.5713283486324


10300it [47:31,  3.42it/s]

Step 10300,     Loss = 6.554932117462158,    PPL = 702.7014470463071


10350it [47:44,  3.42it/s]

Step 10350,     Loss = 6.477436542510986,    PPL = 650.3017867900676


10400it [47:58,  3.42it/s]

Step 10400,     Loss = 6.202453136444092,    PPL = 493.9593055887886


10450it [48:12,  3.42it/s]

Step 10450,     Loss = 6.2352447509765625,    PPL = 510.42553030852366


10500it [48:26,  3.43it/s]

Step 10500,     Loss = 6.442904949188232,    PPL = 628.2291253361221


10550it [48:40,  3.41it/s]

Step 10550,     Loss = 6.666033744812012,    PPL = 785.2748193125266


10600it [48:54,  3.42it/s]

Step 10600,     Loss = 6.683821678161621,    PPL = 799.3682100005061


10650it [49:08,  3.41it/s]

Step 10650,     Loss = 6.396215438842773,    PPL = 599.5716231710586


10700it [49:22,  3.41it/s]

Step 10700,     Loss = 6.388260841369629,    PPL = 594.8211911883872


10750it [49:36,  3.41it/s]

Step 10750,     Loss = 6.502469539642334,    PPL = 666.7862565706005


10800it [49:49,  3.41it/s]

Step 10800,     Loss = 6.461627960205078,    PPL = 640.1022698351873


10850it [50:03,  3.41it/s]

Step 10850,     Loss = 6.648684024810791,    PPL = 771.7680293930575


10900it [50:17,  3.42it/s]

Step 10900,     Loss = 6.404174327850342,    PPL = 604.3625872578261


10950it [50:31,  3.42it/s]

Step 10950,     Loss = 6.395759582519531,    PPL = 599.2983669429212


11000it [50:45,  3.42it/s]

Step 11000,     Loss = 6.4518961906433105,    PPL = 633.9031551520392


11050it [50:59,  3.42it/s]

Step 11050,     Loss = 6.303121566772461,    PPL = 546.2744836717831


11100it [51:13,  3.42it/s]

Step 11100,     Loss = 6.373809337615967,    PPL = 586.286945381871


11150it [51:27,  3.42it/s]

Step 11150,     Loss = 6.384323596954346,    PPL = 592.4838391578508


11200it [51:41,  3.42it/s]

Step 11200,     Loss = 6.319090366363525,    PPL = 555.0678543500869


11250it [51:54,  3.43it/s]

Step 11250,     Loss = 6.511569023132324,    PPL = 672.881356176299


11300it [52:08,  3.42it/s]

Step 11300,     Loss = 6.46724796295166,    PPL = 643.7097739461851


11350it [52:22,  3.42it/s]

Step 11350,     Loss = 6.203153610229492,    PPL = 494.30543234561526


11400it [52:36,  3.42it/s]

Step 11400,     Loss = 6.442851543426514,    PPL = 628.1955751770436


11450it [52:50,  3.42it/s]

Step 11450,     Loss = 6.567889213562012,    PPL = 711.8656598060771


11500it [53:04,  3.43it/s]

Step 11500,     Loss = 6.369578838348389,    PPL = 583.8118979234239


11550it [53:18,  3.41it/s]

Step 11550,     Loss = 6.477697849273682,    PPL = 650.4717372483838


11600it [53:31,  3.44it/s]

Step 11600,     Loss = 6.376782417297363,    PPL = 588.0326169110689


11650it [53:45,  3.42it/s]

Step 11650,     Loss = 6.401291847229004,    PPL = 602.6230321332146


11700it [53:59,  3.42it/s]

Step 11700,     Loss = 6.629145622253418,    PPL = 756.8352711194095


11750it [54:13,  3.42it/s]

Step 11750,     Loss = 6.652216911315918,    PPL = 774.4994202552083


11800it [54:27,  3.40it/s]

Step 11800,     Loss = 6.496070861816406,    PPL = 662.5333272091074


11850it [54:41,  3.41it/s]

Step 11850,     Loss = 6.3072099685668945,    PPL = 548.5124449762884


11900it [54:55,  3.41it/s]

Step 11900,     Loss = 6.469466686248779,    PPL = 645.1395733963545


11950it [55:09,  3.40it/s]

Step 11950,     Loss = 6.252439498901367,    PPL = 519.2780590264608


12000it [55:23,  3.39it/s]

Step 12000,     Loss = 6.277225494384766,    PPL = 532.3097168025685


12050it [55:37,  3.39it/s]

Step 12050,     Loss = 6.445805072784424,    PPL = 630.0537119308025


12100it [55:51,  3.41it/s]

Step 12100,     Loss = 6.378207206726074,    PPL = 588.8710367113807


12150it [56:05,  3.40it/s]

Step 12150,     Loss = 6.680595874786377,    PPL = 796.7937598979743


12200it [56:19,  3.40it/s]

Step 12200,     Loss = 6.341859340667725,    PPL = 567.851159170463


12250it [56:33,  3.42it/s]

Step 12250,     Loss = 6.343688011169434,    PPL = 568.8905218709159


12300it [56:47,  3.40it/s]

Step 12300,     Loss = 6.438473701477051,    PPL = 625.451445301725


12350it [57:00,  3.42it/s]

Step 12350,     Loss = 6.73750114440918,    PPL = 843.4504394399761


12400it [57:14,  3.42it/s]

Step 12400,     Loss = 6.21268367767334,    PPL = 499.0387148807568


12450it [57:28,  3.42it/s]

Step 12450,     Loss = 6.385677337646484,    PPL = 593.286451782465


12500it [57:42,  3.42it/s]

Step 12500,     Loss = 6.145672798156738,    PPL = 466.69353401008453


12550it [57:56,  3.41it/s]

Step 12550,     Loss = 6.357038974761963,    PPL = 576.5366867752376


12600it [58:10,  3.41it/s]

Step 12600,     Loss = 6.460667133331299,    PPL = 639.4875377452114


12650it [58:24,  3.39it/s]

Step 12650,     Loss = 6.223793029785156,    PPL = 504.61362114340005


12700it [58:38,  3.42it/s]

Step 12700,     Loss = 6.1187896728515625,    PPL = 454.3144924403167


12750it [58:52,  3.41it/s]

Step 12750,     Loss = 6.2191972732543945,    PPL = 502.2998606109634


12800it [59:06,  3.42it/s]

Step 12800,     Loss = 6.375870704650879,    PPL = 587.496744455652


12850it [59:19,  3.40it/s]

Step 12850,     Loss = 6.54171895980835,    PPL = 693.4776142183847


12900it [59:33,  3.42it/s]

Step 12900,     Loss = 6.369355201721191,    PPL = 583.6813507977582


12950it [59:47,  3.41it/s]

Step 12950,     Loss = 6.2011399269104,    PPL = 493.3110592542266


13000it [1:00:01,  3.41it/s]

Step 13000,     Loss = 6.398368835449219,    PPL = 600.8641298099624


13050it [1:00:15,  3.41it/s]

Step 13050,     Loss = 6.104528903961182,    PPL = 447.8815965020922


13100it [1:00:29,  3.42it/s]

Step 13100,     Loss = 6.267139911651611,    PPL = 526.968045327927


13150it [1:00:43,  3.41it/s]

Step 13150,     Loss = 6.283997535705566,    PPL = 535.9267738066253


13200it [1:00:57,  3.42it/s]

Step 13200,     Loss = 6.39684534072876,    PPL = 599.949413440188


13250it [1:01:11,  3.41it/s]

Step 13250,     Loss = 6.380261421203613,    PPL = 590.0819474297446


13300it [1:01:25,  3.42it/s]

Step 13300,     Loss = 6.439088344573975,    PPL = 625.8359928826924


13350it [1:01:39,  3.40it/s]

Step 13350,     Loss = 6.358314037322998,    PPL = 577.2722759810772


13400it [1:01:53,  3.42it/s]

Step 13400,     Loss = 6.3032379150390625,    PPL = 546.338045458627


13450it [1:02:06,  3.40it/s]

Step 13450,     Loss = 6.098730087280273,    PPL = 445.2919289895442


13500it [1:02:20,  3.41it/s]

Step 13500,     Loss = 6.189937591552734,    PPL = 487.8156614464621


13550it [1:02:34,  3.40it/s]

Step 13550,     Loss = 6.712446689605713,    PPL = 822.5807791961629


13600it [1:02:48,  3.41it/s]

Step 13600,     Loss = 6.243250846862793,    PPL = 514.5284483088492


13650it [1:03:02,  3.40it/s]

Step 13650,     Loss = 6.192508220672607,    PPL = 489.07126774853367


13700it [1:03:16,  3.39it/s]

Step 13700,     Loss = 6.278439044952393,    PPL = 532.9560936877386


13750it [1:03:30,  3.42it/s]

Step 13750,     Loss = 6.309132099151611,    PPL = 549.5677714355911


13800it [1:03:44,  3.42it/s]

Step 13800,     Loss = 6.221468925476074,    PPL = 503.4422082222664


13850it [1:03:58,  3.41it/s]

Step 13850,     Loss = 6.29940128326416,    PPL = 544.2459633940866


13900it [1:04:12,  3.41it/s]

Step 13900,     Loss = 6.309799671173096,    PPL = 549.9347699890646


13950it [1:04:26,  3.41it/s]

Step 13950,     Loss = 6.190793991088867,    PPL = 488.23360549064193


14000it [1:04:40,  3.41it/s]

Step 14000,     Loss = 6.019241809844971,    PPL = 411.26665915592656


14050it [1:04:54,  3.41it/s]

Step 14050,     Loss = 6.0121684074401855,    PPL = 408.36786882156593


14100it [1:05:08,  3.40it/s]

Step 14100,     Loss = 6.181826591491699,    PPL = 483.87499157761715


14150it [1:05:22,  3.41it/s]

Step 14150,     Loss = 6.4069085121154785,    PPL = 606.0172870305396


14200it [1:05:35,  3.40it/s]

Step 14200,     Loss = 6.099454402923584,    PPL = 445.6145777352121


14250it [1:05:49,  3.41it/s]

Step 14250,     Loss = 6.225743293762207,    PPL = 505.59871119162784


14300it [1:06:03,  3.39it/s]

Step 14300,     Loss = 6.135710716247559,    PPL = 462.06737613905767


14350it [1:06:17,  3.40it/s]

Step 14350,     Loss = 6.052944660186768,    PPL = 425.36373935223384


14400it [1:06:31,  3.41it/s]

Step 14400,     Loss = 6.24116325378418,    PPL = 513.4554426706657


14450it [1:06:45,  3.40it/s]

Step 14450,     Loss = 6.268353462219238,    PPL = 527.6079358897024


14500it [1:06:59,  3.39it/s]

Step 14500,     Loss = 6.418422222137451,    PPL = 613.035117447223


14550it [1:07:13,  3.39it/s]

Step 14550,     Loss = 6.101894378662109,    PPL = 446.7031940522323


14600it [1:07:27,  3.40it/s]

Step 14600,     Loss = 6.266890048980713,    PPL = 526.8363921329376


14650it [1:07:41,  3.41it/s]

Step 14650,     Loss = 6.391388416290283,    PPL = 596.6844512521202


14700it [1:07:55,  3.41it/s]

Step 14700,     Loss = 6.691690921783447,    PPL = 805.6834486751468


14750it [1:08:09,  3.42it/s]

Step 14750,     Loss = 6.448085784912109,    PPL = 631.4923229779318


14800it [1:08:23,  3.42it/s]

Step 14800,     Loss = 6.241257667541504,    PPL = 513.5039222167588


14850it [1:08:37,  3.42it/s]

Step 14850,     Loss = 6.276766777038574,    PPL = 532.0655930980946


14900it [1:08:51,  3.42it/s]

Step 14900,     Loss = 6.272649765014648,    PPL = 529.8795756715175


14950it [1:09:05,  3.39it/s]

Step 14950,     Loss = 6.078063011169434,    PPL = 436.18349337103973


15000it [1:09:19,  3.40it/s]

Step 15000,     Loss = 6.321504592895508,    PPL = 556.4095327972985


15050it [1:09:32,  3.40it/s]

Step 15050,     Loss = 6.275853633880615,    PPL = 531.5799628009077


15100it [1:09:46,  3.40it/s]

Step 15100,     Loss = 6.0966877937316895,    PPL = 444.38344017193066


15150it [1:10:00,  3.41it/s]

Step 15150,     Loss = 6.1642045974731445,    PPL = 475.42283997550896


15200it [1:10:14,  3.40it/s]

Step 15200,     Loss = 5.950786113739014,    PPL = 384.0551314393659


15250it [1:10:28,  3.39it/s]

Step 15250,     Loss = 6.425475120544434,    PPL = 617.3740749776331


15300it [1:10:42,  3.41it/s]

Step 15300,     Loss = 6.174120903015137,    PPL = 480.16073047904564


15350it [1:10:56,  3.40it/s]

Step 15350,     Loss = 6.403545379638672,    PPL = 603.9825939999364


15400it [1:11:10,  3.40it/s]

Step 15400,     Loss = 6.249020099639893,    PPL = 517.5054723328742


15450it [1:11:24,  3.40it/s]

Step 15450,     Loss = 6.054964065551758,    PPL = 426.22358906991514


15500it [1:11:38,  3.41it/s]

Step 15500,     Loss = 6.22743034362793,    PPL = 506.4524013358983


15550it [1:11:52,  3.40it/s]

Step 15550,     Loss = 6.31978178024292,    PPL = 555.4517686751424


15600it [1:12:06,  3.41it/s]

Step 15600,     Loss = 6.607783317565918,    PPL = 740.8389924666692


15650it [1:12:20,  3.40it/s]

Step 15650,     Loss = 6.139527797698975,    PPL = 463.83449542335063


15700it [1:12:34,  3.40it/s]

Step 15700,     Loss = 6.082520484924316,    PPL = 438.1321095714081


15750it [1:12:48,  3.40it/s]

Step 15750,     Loss = 6.021378993988037,    PPL = 412.146551649676


15800it [1:13:02,  3.39it/s]

Step 15800,     Loss = 6.134228229522705,    PPL = 461.3828748953116


15850it [1:13:16,  3.41it/s]

Step 15850,     Loss = 6.288815021514893,    PPL = 538.5148223725763


15900it [1:13:30,  3.39it/s]

Step 15900,     Loss = 6.10330867767334,    PPL = 447.335412905761


15950it [1:13:44,  3.40it/s]

Step 15950,     Loss = 6.074451923370361,    PPL = 434.6112369664431


16000it [1:13:58,  3.40it/s]

Step 16000,     Loss = 6.372574329376221,    PPL = 585.563323105181


16050it [1:14:12,  3.40it/s]

Step 16050,     Loss = 6.368494033813477,    PPL = 583.1789195200163


16100it [1:14:26,  3.42it/s]

Step 16100,     Loss = 6.345161437988281,    PPL = 569.7293582530813


16150it [1:14:40,  3.42it/s]

Step 16150,     Loss = 6.300407409667969,    PPL = 544.7938191878977


16200it [1:14:54,  3.41it/s]

Step 16200,     Loss = 6.4827423095703125,    PPL = 653.7613061752492


16250it [1:15:07,  3.41it/s]

Step 16250,     Loss = 5.975373268127441,    PPL = 393.61499760658614


16300it [1:15:21,  3.41it/s]

Step 16300,     Loss = 6.55235481262207,    PPL = 700.892703049242


16350it [1:15:35,  3.41it/s]

Step 16350,     Loss = 6.183560848236084,    PPL = 484.71488312848504


16400it [1:15:49,  3.41it/s]

Step 16400,     Loss = 6.411850929260254,    PPL = 609.0198912090044


16450it [1:16:03,  3.41it/s]

Step 16450,     Loss = 6.066017150878906,    PPL = 430.9608069762119


16500it [1:16:17,  3.40it/s]

Step 16500,     Loss = 6.086621284484863,    PPL = 439.93249051146717


16550it [1:16:31,  3.41it/s]

Step 16550,     Loss = 6.339982509613037,    PPL = 566.7863979814755


16600it [1:16:45,  3.39it/s]

Step 16600,     Loss = 6.250814437866211,    PPL = 518.434885775879


16650it [1:16:59,  3.41it/s]

Step 16650,     Loss = 6.293420791625977,    PPL = 541.0008184143259


16700it [1:17:13,  3.40it/s]

Step 16700,     Loss = 6.301416873931885,    PPL = 545.3440467503766


16750it [1:17:27,  3.40it/s]

Step 16750,     Loss = 6.157505035400391,    PPL = 472.2483608319597


16800it [1:17:41,  3.40it/s]

Step 16800,     Loss = 6.446698188781738,    PPL = 630.6166743380402


16801it [1:17:41,  3.65it/s]


Saving Model at epoch 1...


0it [00:00, ?it/s]

Epoch 2:


50it [00:18,  3.40it/s]

Step 50,     Loss = 6.114781379699707,    PPL = 452.4971115024954


100it [00:33,  3.27it/s]

Step 100,     Loss = 5.811302185058594,    PPL = 334.0538424947277


150it [00:47,  3.36it/s]

Step 150,     Loss = 5.991809368133545,    PPL = 400.13795219326664


200it [01:01,  3.42it/s]

Step 200,     Loss = 6.1007914543151855,    PPL = 446.21078581809394


250it [01:15,  3.46it/s]

Step 250,     Loss = 6.340524673461914,    PPL = 567.0937723926139


300it [01:29,  3.45it/s]

Step 300,     Loss = 6.28832483291626,    PPL = 538.2509132343642


350it [01:43,  3.39it/s]

Step 350,     Loss = 6.18840217590332,    PPL = 487.0672363646739


400it [01:57,  3.36it/s]

Step 400,     Loss = 6.530200004577637,    PPL = 685.5353079811171


450it [02:11,  3.39it/s]

Step 450,     Loss = 6.277965068817139,    PPL = 532.7035450739285


500it [02:25,  3.40it/s]

Step 500,     Loss = 6.244513511657715,    PPL = 515.1785356013128


550it [02:39,  3.41it/s]

Step 550,     Loss = 6.268543243408203,    PPL = 527.7080754530849


600it [02:53,  3.42it/s]

Step 600,     Loss = 6.133914947509766,    PPL = 461.2383545785185


650it [03:06,  3.40it/s]

Step 650,     Loss = 6.127882957458496,    PPL = 458.4645436228799


700it [03:20,  3.40it/s]

Step 700,     Loss = 6.150307655334473,    PPL = 468.86161237156887


750it [03:34,  3.41it/s]

Step 750,     Loss = 6.448766231536865,    PPL = 631.9221660237012


800it [03:48,  3.40it/s]

Step 800,     Loss = 6.215703010559082,    PPL = 500.54775588600666


850it [04:02,  3.39it/s]

Step 850,     Loss = 6.112729072570801,    PPL = 451.56940075406436


900it [04:16,  3.40it/s]

Step 900,     Loss = 6.132407188415527,    PPL = 460.5434422666177


950it [04:30,  3.38it/s]

Step 950,     Loss = 6.304955005645752,    PPL = 547.2769632573353


1000it [04:44,  3.40it/s]

Step 1000,     Loss = 6.134125232696533,    PPL = 461.33535637071793


1050it [04:58,  3.40it/s]

Step 1050,     Loss = 6.3653564453125,    PPL = 581.3520115895194


1100it [05:12,  3.40it/s]

Step 1100,     Loss = 6.091269493103027,    PPL = 441.9821484337012


1150it [05:26,  3.40it/s]

Step 1150,     Loss = 6.180139064788818,    PPL = 483.05912819770776


1200it [05:40,  3.39it/s]

Step 1200,     Loss = 5.875668525695801,    PPL = 356.26275188095485


1250it [05:54,  3.41it/s]

Step 1250,     Loss = 6.092136859893799,    PPL = 442.36567537658846


1300it [06:08,  3.41it/s]

Step 1300,     Loss = 6.316763877868652,    PPL = 553.7779963747223


1350it [06:22,  3.40it/s]

Step 1350,     Loss = 5.954709053039551,    PPL = 385.5647154753613


1400it [06:36,  3.40it/s]

Step 1400,     Loss = 5.9661641120910645,    PPL = 390.00677551613046


1450it [06:50,  3.40it/s]

Step 1450,     Loss = 6.066386699676514,    PPL = 431.12009745523557


1500it [07:04,  3.38it/s]

Step 1500,     Loss = 6.189489364624023,    PPL = 487.5970583263174


1550it [07:18,  3.40it/s]

Step 1550,     Loss = 6.232285499572754,    PPL = 508.91728557974653


1600it [07:32,  3.40it/s]

Step 1600,     Loss = 6.096182823181152,    PPL = 444.1590962697966


1650it [07:46,  3.39it/s]

Step 1650,     Loss = 6.111511707305908,    PPL = 451.0200103231604


1700it [08:00,  3.40it/s]

Step 1700,     Loss = 6.129685401916504,    PPL = 459.29164567755885


1750it [08:14,  3.40it/s]

Step 1750,     Loss = 5.806519985198975,    PPL = 332.4601439856849


1800it [08:28,  3.40it/s]

Step 1800,     Loss = 5.932404518127441,    PPL = 377.06007245171054


1850it [08:42,  3.41it/s]

Step 1850,     Loss = 6.0705037117004395,    PPL = 432.8986827966132


1900it [08:56,  3.41it/s]

Step 1900,     Loss = 5.892756462097168,    PPL = 362.40285859650595


1950it [09:10,  3.40it/s]

Step 1950,     Loss = 6.111545562744141,    PPL = 451.0352800617419


2000it [09:23,  3.41it/s]

Step 2000,     Loss = 5.814570426940918,    PPL = 335.1473972811054


2050it [09:37,  3.41it/s]

Step 2050,     Loss = 6.088230133056641,    PPL = 440.64084493521483


2100it [09:51,  3.39it/s]

Step 2100,     Loss = 6.550482749938965,    PPL = 699.5818153897839


2150it [10:05,  3.40it/s]

Step 2150,     Loss = 6.361395835876465,    PPL = 579.0540569810037


2200it [10:19,  3.40it/s]

Step 2200,     Loss = 5.908102989196777,    PPL = 368.007378852224


2250it [10:33,  3.39it/s]

Step 2250,     Loss = 6.040483474731445,    PPL = 420.096091641402


2300it [10:47,  3.40it/s]

Step 2300,     Loss = 6.3747758865356445,    PPL = 586.8538943433663


2350it [11:01,  3.40it/s]

Step 2350,     Loss = 5.978522300720215,    PPL = 394.8564577363576


2400it [11:15,  3.39it/s]

Step 2400,     Loss = 6.065486431121826,    PPL = 430.73214824364305


2450it [11:29,  3.39it/s]

Step 2450,     Loss = 6.105357646942139,    PPL = 448.25292907987375


2500it [11:43,  3.40it/s]

Step 2500,     Loss = 6.011373043060303,    PPL = 408.0431966982534


2550it [11:57,  3.40it/s]

Step 2550,     Loss = 6.136993885040283,    PPL = 462.6606671410993


2600it [12:11,  3.40it/s]

Step 2600,     Loss = 6.1671037673950195,    PPL = 476.80317151434963


2650it [12:25,  3.40it/s]

Step 2650,     Loss = 6.0689311027526855,    PPL = 432.2184374747091


2700it [12:39,  3.39it/s]

Step 2700,     Loss = 5.992142677307129,    PPL = 400.2713440725654


2750it [12:53,  3.40it/s]

Step 2750,     Loss = 6.232273578643799,    PPL = 508.9112188491017


2800it [13:07,  3.39it/s]

Step 2800,     Loss = 6.248770236968994,    PPL = 517.376183186291


2850it [13:21,  3.40it/s]

Step 2850,     Loss = 5.931393623352051,    PPL = 376.6790969899561


2900it [13:35,  3.40it/s]

Step 2900,     Loss = 6.186610221862793,    PPL = 486.1952158061887


2950it [13:49,  3.40it/s]

Step 2950,     Loss = 6.12709379196167,    PPL = 458.102881947762


3000it [14:03,  3.40it/s]

Step 3000,     Loss = 6.207321643829346,    PPL = 496.3700136293907


3050it [14:17,  3.41it/s]

Step 3050,     Loss = 5.895780563354492,    PPL = 363.50046032975524


3100it [14:31,  3.40it/s]

Step 3100,     Loss = 5.839964389801025,    PPL = 343.7670988322104


3150it [14:45,  3.40it/s]

Step 3150,     Loss = 6.235637664794922,    PPL = 510.6261229578565


3200it [14:59,  3.39it/s]

Step 3200,     Loss = 6.103618621826172,    PPL = 447.4740833902886


3250it [15:13,  3.40it/s]

Step 3250,     Loss = 6.395066738128662,    PPL = 598.883290239315


3300it [15:27,  3.40it/s]

Step 3300,     Loss = 6.273238182067871,    PPL = 530.1914575993208


3350it [15:41,  3.41it/s]

Step 3350,     Loss = 6.257166862487793,    PPL = 521.7386867724349


3400it [15:55,  3.39it/s]

Step 3400,     Loss = 5.9042253494262695,    PPL = 366.5831419275962


3450it [16:09,  3.40it/s]

Step 3450,     Loss = 6.025585651397705,    PPL = 413.8839627796473


3500it [16:22,  3.41it/s]

Step 3500,     Loss = 6.1134257316589355,    PPL = 451.88410028744033


3550it [16:36,  3.41it/s]

Step 3550,     Loss = 6.103869438171387,    PPL = 447.5863312806773


3600it [16:50,  3.38it/s]

Step 3600,     Loss = 5.9877166748046875,    PPL = 398.50365688168426


3650it [17:04,  3.39it/s]

Step 3650,     Loss = 6.22804594039917,    PPL = 506.76426778108714


3700it [17:18,  3.40it/s]

Step 3700,     Loss = 6.081474304199219,    PPL = 437.6739838862435


3750it [17:32,  3.38it/s]

Step 3750,     Loss = 6.15258264541626,    PPL = 469.9294821254238


3800it [17:46,  3.39it/s]

Step 3800,     Loss = 5.996927738189697,    PPL = 402.19125660936805


3850it [18:00,  3.39it/s]

Step 3850,     Loss = 5.92360258102417,    PPL = 373.7557768439728


3900it [18:14,  3.40it/s]

Step 3900,     Loss = 5.9430742263793945,    PPL = 381.10473271957653


3950it [18:28,  3.40it/s]

Step 3950,     Loss = 6.076631546020508,    PPL = 435.55955857873926


4000it [18:42,  3.40it/s]

Step 4000,     Loss = 6.001638412475586,    PPL = 404.0903180380554


4050it [18:56,  3.40it/s]

Step 4050,     Loss = 6.118620872497559,    PPL = 454.23781046531803


4100it [19:10,  3.39it/s]

Step 4100,     Loss = 6.081467628479004,    PPL = 437.67106210693424


4150it [19:24,  3.40it/s]

Step 4150,     Loss = 5.884994983673096,    PPL = 359.6009641160892


4200it [19:38,  3.39it/s]

Step 4200,     Loss = 6.253756523132324,    PPL = 519.9624113681908


4250it [19:52,  3.39it/s]

Step 4250,     Loss = 6.184974193572998,    PPL = 485.40043699635


4300it [20:06,  3.39it/s]

Step 4300,     Loss = 6.2650909423828125,    PPL = 525.8894094206917


4350it [20:20,  3.39it/s]

Step 4350,     Loss = 6.034030914306641,    PPL = 417.3941229089375


4400it [20:34,  3.39it/s]

Step 4400,     Loss = 6.038130760192871,    PPL = 419.1088872194669


4450it [20:48,  3.39it/s]

Step 4450,     Loss = 6.3986945152282715,    PPL = 601.0598509764828


4500it [21:02,  3.40it/s]

Step 4500,     Loss = 6.151689529418945,    PPL = 469.509967952623


4550it [21:16,  3.41it/s]

Step 4550,     Loss = 5.802483558654785,    PPL = 331.1208977384987


4600it [21:30,  3.40it/s]

Step 4600,     Loss = 5.885775566101074,    PPL = 359.881771892312


4650it [21:44,  3.41it/s]

Step 4650,     Loss = 6.265320301055908,    PPL = 526.0100405512006


4700it [21:58,  3.40it/s]

Step 4700,     Loss = 6.236091136932373,    PPL = 510.85773018701633


4750it [22:12,  3.40it/s]

Step 4750,     Loss = 6.078141689300537,    PPL = 436.21781282319387


4800it [22:26,  3.40it/s]

Step 4800,     Loss = 6.119701385498047,    PPL = 454.728885583586


4850it [22:40,  3.39it/s]

Step 4850,     Loss = 6.031344890594482,    PPL = 416.27449674185476


4900it [22:54,  3.39it/s]

Step 4900,     Loss = 6.027031898498535,    PPL = 414.48297431583694


4950it [23:08,  3.38it/s]

Step 4950,     Loss = 6.071922779083252,    PPL = 433.51343127902976


5000it [23:22,  3.39it/s]

Step 5000,     Loss = 5.818415641784668,    PPL = 336.43859189739544


5050it [23:36,  3.41it/s]

Step 5050,     Loss = 6.033333778381348,    PPL = 417.10324387374476


5100it [23:50,  3.40it/s]

Step 5100,     Loss = 6.1979546546936035,    PPL = 491.74222914486586


5150it [24:04,  3.40it/s]

Step 5150,     Loss = 6.1284637451171875,    PPL = 458.7308915100596


5200it [24:18,  3.41it/s]

Step 5200,     Loss = 6.121675968170166,    PPL = 455.6276724340127


5250it [24:32,  3.40it/s]

Step 5250,     Loss = 6.131016731262207,    PPL = 459.90352133732057


5300it [24:46,  3.41it/s]

Step 5300,     Loss = 6.258444786071777,    PPL = 522.4058551492446


5350it [25:00,  3.40it/s]

Step 5350,     Loss = 6.197522163391113,    PPL = 491.52960089094313


5400it [25:14,  3.39it/s]

Step 5400,     Loss = 6.17741584777832,    PPL = 481.74544289964615


5450it [25:28,  3.39it/s]

Step 5450,     Loss = 6.157231330871582,    PPL = 472.11912200431055


5500it [25:42,  3.39it/s]

Step 5500,     Loss = 6.105129718780518,    PPL = 448.15077125657206


5550it [25:56,  3.40it/s]

Step 5550,     Loss = 6.113616466522217,    PPL = 451.97029855977416


5600it [26:10,  3.39it/s]

Step 5600,     Loss = 6.128091335296631,    PPL = 458.56008742760145


5650it [26:24,  3.39it/s]

Step 5650,     Loss = 6.028296947479248,    PPL = 415.007647378608


5700it [26:38,  3.39it/s]

Step 5700,     Loss = 6.21841287612915,    PPL = 501.9060125311343


5750it [26:52,  3.41it/s]

Step 5750,     Loss = 6.107608795166016,    PPL = 449.26314951636846


5800it [27:06,  3.40it/s]

Step 5800,     Loss = 6.274785041809082,    PPL = 531.0122240618549


5850it [27:20,  3.41it/s]

Step 5850,     Loss = 6.118647575378418,    PPL = 454.2499400853999


5900it [27:34,  3.40it/s]

Step 5900,     Loss = 6.311030387878418,    PPL = 550.6120005513295


5950it [27:48,  3.39it/s]

Step 5950,     Loss = 6.139289855957031,    PPL = 463.72414296478496


6000it [28:02,  3.39it/s]

Step 6000,     Loss = 5.730477333068848,    PPL = 308.1163073838945


6050it [28:16,  3.39it/s]

Step 6050,     Loss = 5.943553924560547,    PPL = 381.2875918217735


6100it [28:30,  3.38it/s]

Step 6100,     Loss = 6.317015171051025,    PPL = 553.9171744962836


6150it [28:44,  3.39it/s]

Step 6150,     Loss = 5.902656555175781,    PPL = 366.0084992681196


6200it [28:58,  3.39it/s]

Step 6200,     Loss = 6.253045558929443,    PPL = 519.5928680884522


6250it [29:12,  3.39it/s]

Step 6250,     Loss = 6.101986408233643,    PPL = 446.74430584750485


6300it [29:26,  3.40it/s]

Step 6300,     Loss = 6.134643077850342,    PPL = 461.5743185166496


6350it [29:40,  3.40it/s]

Step 6350,     Loss = 5.885209083557129,    PPL = 359.6779628832241


6400it [29:54,  3.39it/s]

Step 6400,     Loss = 5.884500980377197,    PPL = 359.423363925759


6450it [30:08,  3.39it/s]

Step 6450,     Loss = 6.07877254486084,    PPL = 436.49309007688


6500it [30:22,  3.39it/s]

Step 6500,     Loss = 5.9842729568481445,    PPL = 397.1336829385653


6550it [30:36,  3.40it/s]

Step 6550,     Loss = 6.218132495880127,    PPL = 501.76530772470056


6600it [30:50,  3.39it/s]

Step 6600,     Loss = 5.9771504402160645,    PPL = 394.31514114750115


6650it [31:04,  3.39it/s]

Step 6650,     Loss = 5.940920352935791,    PPL = 380.28476472717955


6700it [31:18,  3.39it/s]

Step 6700,     Loss = 6.060029983520508,    PPL = 428.38828125554215


6750it [31:32,  3.39it/s]

Step 6750,     Loss = 5.823267936706543,    PPL = 338.07505826625743


6800it [31:46,  3.40it/s]

Step 6800,     Loss = 5.837471961975098,    PPL = 342.91135103725674


6850it [32:00,  3.38it/s]

Step 6850,     Loss = 6.154301643371582,    PPL = 470.7379846523478


6900it [32:14,  3.40it/s]

Step 6900,     Loss = 5.936304569244385,    PPL = 378.5334973570399


6950it [32:28,  3.40it/s]

Step 6950,     Loss = 6.119986057281494,    PPL = 454.85835249334383


7000it [32:42,  3.39it/s]

Step 7000,     Loss = 6.328155040740967,    PPL = 560.1222372656772


7050it [32:56,  3.39it/s]

Step 7050,     Loss = 6.244233131408691,    PPL = 515.0341099632036


7100it [33:10,  3.40it/s]

Step 7100,     Loss = 6.194810390472412,    PPL = 490.19848988161766


7150it [33:24,  3.40it/s]

Step 7150,     Loss = 6.118945598602295,    PPL = 454.38533729174384


7200it [33:38,  3.41it/s]

Step 7200,     Loss = 6.168837070465088,    PPL = 477.63033256866186


7250it [33:52,  3.40it/s]

Step 7250,     Loss = 6.372420310974121,    PPL = 585.473142522741


7300it [34:05,  3.40it/s]

Step 7300,     Loss = 6.073342800140381,    PPL = 434.1294667681599


7350it [34:19,  3.41it/s]

Step 7350,     Loss = 6.005974769592285,    PPL = 405.84640271783553


7400it [34:33,  3.40it/s]

Step 7400,     Loss = 6.151351451873779,    PPL = 469.3512640038589


7450it [34:47,  3.40it/s]

Step 7450,     Loss = 6.066865921020508,    PPL = 431.32674891970237


7500it [35:01,  3.39it/s]

Step 7500,     Loss = 6.133737087249756,    PPL = 461.156325899884


7550it [35:15,  3.40it/s]

Step 7550,     Loss = 6.146055221557617,    PPL = 466.87204266929757


7600it [35:29,  3.40it/s]

Step 7600,     Loss = 5.96065092086792,    PPL = 387.86250988889907


7650it [35:43,  3.39it/s]

Step 7650,     Loss = 6.079581260681152,    PPL = 436.8462317206525


7700it [35:57,  3.39it/s]

Step 7700,     Loss = 6.031556606292725,    PPL = 416.3626379176989


7750it [36:11,  3.38it/s]

Step 7750,     Loss = 6.116185188293457,    PPL = 453.13277690815823


7800it [36:25,  3.39it/s]

Step 7800,     Loss = 6.231705188751221,    PPL = 508.6220410467436


7850it [36:39,  3.41it/s]

Step 7850,     Loss = 5.8282551765441895,    PPL = 339.76533105941314


7900it [36:53,  3.39it/s]

Step 7900,     Loss = 5.999087810516357,    PPL = 403.06095778335134


7950it [37:07,  3.40it/s]

Step 7950,     Loss = 6.181857109069824,    PPL = 483.8897584957995


8000it [37:21,  3.41it/s]

Step 8000,     Loss = 6.200379848480225,    PPL = 492.9362466202642


8050it [37:35,  3.40it/s]

Step 8050,     Loss = 6.051140308380127,    PPL = 424.59692552964935


8100it [37:49,  3.40it/s]

Step 8100,     Loss = 6.20623254776001,    PPL = 495.82971327151427


8150it [38:03,  3.41it/s]

Step 8150,     Loss = 5.950655937194824,    PPL = 384.0051397235219


8200it [38:17,  3.39it/s]

Step 8200,     Loss = 6.079745292663574,    PPL = 436.91789435137616


8250it [38:31,  3.39it/s]

Step 8250,     Loss = 6.027943134307861,    PPL = 414.8608381797849


8300it [38:45,  3.39it/s]

Step 8300,     Loss = 5.973277568817139,    PPL = 392.79096269387003


8350it [38:59,  3.39it/s]

Step 8350,     Loss = 6.1650567054748535,    PPL = 475.8281242301028


8400it [39:13,  3.41it/s]

Step 8400,     Loss = 6.040215969085693,    PPL = 419.98372859467435


8450it [39:27,  3.40it/s]

Step 8450,     Loss = 6.318944454193115,    PPL = 554.986869103244


8500it [39:41,  3.39it/s]

Step 8500,     Loss = 6.258981227874756,    PPL = 522.6861706678263


8550it [39:55,  3.38it/s]

Step 8550,     Loss = 6.136706829071045,    PPL = 462.52787669492164


8600it [40:09,  3.39it/s]

Step 8600,     Loss = 6.037084579467773,    PPL = 418.6706528560832


8650it [40:23,  3.40it/s]

Step 8650,     Loss = 5.924357891082764,    PPL = 374.0381849811079


8700it [40:37,  3.39it/s]

Step 8700,     Loss = 5.988523006439209,    PPL = 398.8251125692287


8750it [40:51,  3.40it/s]

Step 8750,     Loss = 6.1931281089782715,    PPL = 489.37453129309864


8800it [41:05,  3.40it/s]

Step 8800,     Loss = 6.250075340270996,    PPL = 518.0518533651298


8850it [41:19,  3.39it/s]

Step 8850,     Loss = 5.839170932769775,    PPL = 343.4944425953385


8900it [41:33,  3.39it/s]

Step 8900,     Loss = 6.2997589111328125,    PPL = 544.4406357260486


8950it [41:47,  3.38it/s]

Step 8950,     Loss = 6.068045616149902,    PPL = 431.8358832371718


9000it [42:01,  3.40it/s]

Step 9000,     Loss = 6.132055759429932,    PPL = 460.3816223876426


9050it [42:15,  3.39it/s]

Step 9050,     Loss = 6.112213611602783,    PPL = 451.3366943343152


9100it [42:29,  3.40it/s]

Step 9100,     Loss = 5.834713935852051,    PPL = 341.96689558828984


9150it [42:43,  3.40it/s]

Step 9150,     Loss = 5.937341690063477,    PPL = 378.9262859772927


9200it [42:57,  3.41it/s]

Step 9200,     Loss = 5.973313808441162,    PPL = 392.8051975486092


9250it [43:11,  3.40it/s]

Step 9250,     Loss = 6.09061861038208,    PPL = 441.69456349250595


9300it [43:25,  3.40it/s]

Step 9300,     Loss = 6.1567559242248535,    PPL = 471.89472677937914


9350it [43:39,  3.41it/s]

Step 9350,     Loss = 5.948761463165283,    PPL = 383.2783406274869


9400it [43:53,  3.40it/s]

Step 9400,     Loss = 6.0591349601745605,    PPL = 428.0050352753218


9450it [44:07,  3.40it/s]

Step 9450,     Loss = 6.2231268882751465,    PPL = 504.27758899874624


9500it [44:21,  3.40it/s]

Step 9500,     Loss = 6.061051368713379,    PPL = 428.8260542321761


9550it [44:34,  3.40it/s]

Step 9550,     Loss = 5.9689717292785645,    PPL = 391.10330383789085


9600it [44:48,  3.40it/s]

Step 9600,     Loss = 6.065535545349121,    PPL = 430.7533038397912


9650it [45:02,  3.41it/s]

Step 9650,     Loss = 6.035982131958008,    PPL = 418.209344768129


9700it [45:16,  3.40it/s]

Step 9700,     Loss = 6.248232841491699,    PPL = 517.0982222595484


9750it [45:30,  3.40it/s]

Step 9750,     Loss = 5.891176223754883,    PPL = 361.8306279534783


9800it [45:44,  3.39it/s]

Step 9800,     Loss = 6.068740367889404,    PPL = 432.1360062116375


9850it [45:58,  3.39it/s]

Step 9850,     Loss = 5.7856831550598145,    PPL = 325.6044024450359


9900it [46:12,  3.39it/s]

Step 9900,     Loss = 5.888608932495117,    PPL = 360.90289473506556


9950it [46:26,  3.40it/s]

Step 9950,     Loss = 6.377143859863281,    PPL = 588.2451953441015


10000it [46:40,  3.41it/s]

Step 10000,     Loss = 5.820916652679443,    PPL = 337.2810815799395


10050it [46:54,  3.40it/s]

Step 10050,     Loss = 6.001543998718262,    PPL = 404.0521681537964


10100it [47:08,  3.40it/s]

Step 10100,     Loss = 6.169325351715088,    PPL = 477.8636074517274


10150it [47:22,  3.40it/s]

Step 10150,     Loss = 6.191486835479736,    PPL = 488.5719926169147


10200it [47:36,  3.40it/s]

Step 10200,     Loss = 6.231672286987305,    PPL = 508.6053067597219


10250it [47:50,  3.40it/s]

Step 10250,     Loss = 6.078671455383301,    PPL = 436.448967448659


10300it [48:04,  3.40it/s]

Step 10300,     Loss = 6.141284465789795,    PPL = 464.65001476957264


10350it [48:18,  3.40it/s]

Step 10350,     Loss = 6.0917649269104,    PPL = 442.20117558453813


10400it [48:32,  3.41it/s]

Step 10400,     Loss = 5.819450855255127,    PPL = 336.78705799699003


10450it [48:46,  3.40it/s]

Step 10450,     Loss = 5.858208179473877,    PPL = 350.09627198464017


10500it [49:00,  3.40it/s]

Step 10500,     Loss = 6.071200847625732,    PPL = 433.2005772389067


10550it [49:14,  3.39it/s]

Step 10550,     Loss = 6.2522759437561035,    PPL = 519.193135373124


10600it [49:28,  3.39it/s]

Step 10600,     Loss = 6.3909101486206055,    PPL = 596.3991446019986


10650it [49:42,  3.39it/s]

Step 10650,     Loss = 6.012510299682617,    PPL = 408.50751049781707


10700it [49:56,  3.40it/s]

Step 10700,     Loss = 6.006156921386719,    PPL = 405.92033510160877


10750it [50:10,  3.40it/s]

Step 10750,     Loss = 6.066383361816406,    PPL = 431.1186584390624


10800it [50:24,  3.39it/s]

Step 10800,     Loss = 6.1456780433654785,    PPL = 466.695981921508


10850it [50:38,  3.39it/s]

Step 10850,     Loss = 6.257894992828369,    PPL = 522.1187188797899


10900it [50:52,  3.40it/s]

Step 10900,     Loss = 6.079916477203369,    PPL = 436.99269434216717


10950it [51:06,  3.40it/s]

Step 10950,     Loss = 6.0197553634643555,    PPL = 411.4779208797367


11000it [51:20,  3.39it/s]

Step 11000,     Loss = 6.05968713760376,    PPL = 428.2414352567449


11050it [51:34,  3.39it/s]

Step 11050,     Loss = 5.989626884460449,    PPL = 399.2656099282585


11100it [51:48,  3.41it/s]

Step 11100,     Loss = 5.999241352081299,    PPL = 403.1228491449029


11150it [52:02,  3.40it/s]

Step 11150,     Loss = 6.061254024505615,    PPL = 428.91296712233094


11200it [52:16,  3.40it/s]

Step 11200,     Loss = 5.961956024169922,    PPL = 388.36904099709113


11250it [52:30,  3.40it/s]

Step 11250,     Loss = 6.216251850128174,    PPL = 500.822551703168


11300it [52:44,  3.39it/s]

Step 11300,     Loss = 6.149903774261475,    PPL = 468.67228627566357


11350it [52:58,  3.40it/s]

Step 11350,     Loss = 5.899111270904541,    PPL = 364.7131925644954


11400it [53:12,  3.40it/s]

Step 11400,     Loss = 6.109659194946289,    PPL = 450.1852636076288


11450it [53:26,  3.39it/s]

Step 11450,     Loss = 6.148684024810791,    PPL = 468.1009720128902


11500it [53:40,  3.38it/s]

Step 11500,     Loss = 6.0303544998168945,    PPL = 415.86242640834797


11550it [53:54,  3.39it/s]

Step 11550,     Loss = 6.062921524047852,    PPL = 429.62877593824095


11600it [54:08,  3.40it/s]

Step 11600,     Loss = 6.015051364898682,    PPL = 409.5468747101019


11650it [54:22,  3.39it/s]

Step 11650,     Loss = 6.0473198890686035,    PPL = 422.97788191772577


11700it [54:36,  3.38it/s]

Step 11700,     Loss = 6.240841865539551,    PPL = 513.29045064192


11750it [54:50,  3.39it/s]

Step 11750,     Loss = 6.326768398284912,    PPL = 559.3460862372522


11800it [55:04,  3.40it/s]

Step 11800,     Loss = 6.12599515914917,    PPL = 457.59987145266507


11850it [55:18,  3.39it/s]

Step 11850,     Loss = 5.972325801849365,    PPL = 392.4172950807912


11900it [55:31,  3.41it/s]

Step 11900,     Loss = 6.124751567840576,    PPL = 457.03115792660583


11950it [55:45,  3.40it/s]

Step 11950,     Loss = 5.908637046813965,    PPL = 368.2039684865045


12000it [55:59,  3.41it/s]

Step 12000,     Loss = 5.902018070220947,    PPL = 365.7748829361885


12050it [56:13,  3.39it/s]

Step 12050,     Loss = 6.090861797332764,    PPL = 441.8019909084786


12100it [56:27,  3.41it/s]

Step 12100,     Loss = 6.05043888092041,    PPL = 424.2992060133001


12150it [56:41,  3.42it/s]

Step 12150,     Loss = 6.374861717224121,    PPL = 586.9042665788638


12200it [56:55,  3.40it/s]

Step 12200,     Loss = 6.012313365936279,    PPL = 408.42706950440066


12250it [57:09,  3.40it/s]

Step 12250,     Loss = 6.009260177612305,    PPL = 407.1819664788377


12300it [57:23,  3.41it/s]

Step 12300,     Loss = 6.020302772521973,    PPL = 411.7032292829398


12350it [57:37,  3.40it/s]

Step 12350,     Loss = 6.4005126953125,    PPL = 602.1536801148053


12400it [57:51,  3.39it/s]

Step 12400,     Loss = 5.91528844833374,    PPL = 370.6612038925049


12450it [58:05,  3.39it/s]

Step 12450,     Loss = 6.019354343414307,    PPL = 411.3129430652282


12500it [58:19,  3.40it/s]

Step 12500,     Loss = 5.815113067626953,    PPL = 335.32931124718476


12550it [58:33,  3.39it/s]

Step 12550,     Loss = 5.984670639038086,    PPL = 397.2916473390283


12600it [58:47,  3.39it/s]

Step 12600,     Loss = 6.14422607421875,    PPL = 466.0188454644401


12650it [59:01,  3.39it/s]

Step 12650,     Loss = 5.9221649169921875,    PPL = 373.21882767559583


12700it [59:15,  3.39it/s]

Step 12700,     Loss = 5.783644199371338,    PPL = 324.9411858608591


12750it [59:29,  3.40it/s]

Step 12750,     Loss = 5.887271404266357,    PPL = 360.42049960598393


12800it [59:43,  3.38it/s]

Step 12800,     Loss = 6.06256103515625,    PPL = 429.47392744925986


12850it [59:57,  3.39it/s]

Step 12850,     Loss = 6.22812032699585,    PPL = 506.801965652377


12900it [1:00:11,  3.37it/s]

Step 12900,     Loss = 5.930131912231445,    PPL = 376.20413647887165


12950it [1:00:25,  3.39it/s]

Step 12950,     Loss = 5.885446548461914,    PPL = 359.763383918281


13000it [1:00:39,  3.40it/s]

Step 13000,     Loss = 6.073924541473389,    PPL = 434.3820912967961


13050it [1:00:53,  3.39it/s]

Step 13050,     Loss = 5.840325832366943,    PPL = 343.8913733521883


13100it [1:01:07,  3.41it/s]

Step 13100,     Loss = 5.9842400550842285,    PPL = 397.12061675483756


13150it [1:01:21,  3.40it/s]

Step 13150,     Loss = 5.988055229187012,    PPL = 398.63859488173057


13200it [1:01:35,  3.40it/s]

Step 13200,     Loss = 6.149849891662598,    PPL = 468.64703367520144


13250it [1:01:49,  3.39it/s]

Step 13250,     Loss = 6.038102626800537,    PPL = 419.09709643057033


13300it [1:02:03,  3.40it/s]

Step 13300,     Loss = 6.144999980926514,    PPL = 466.3796401676022


13350it [1:02:17,  3.39it/s]

Step 13350,     Loss = 6.048023700714111,    PPL = 423.2756834626163


13400it [1:02:31,  3.39it/s]

Step 13400,     Loss = 5.998336315155029,    PPL = 402.7581731281263


13450it [1:02:45,  3.40it/s]

Step 13450,     Loss = 5.762310028076172,    PPL = 318.0822598417284


13500it [1:02:59,  3.40it/s]

Step 13500,     Loss = 5.912001132965088,    PPL = 369.4447241920779


13550it [1:03:13,  3.40it/s]

Step 13550,     Loss = 6.333407878875732,    PPL = 563.0722097947723


13600it [1:03:27,  3.39it/s]

Step 13600,     Loss = 5.881664752960205,    PPL = 358.40540179512703


13650it [1:03:41,  3.40it/s]

Step 13650,     Loss = 5.8806023597717285,    PPL = 358.024836528315


13700it [1:03:55,  3.40it/s]

Step 13700,     Loss = 6.024513244628906,    PPL = 413.44034872633694


13750it [1:04:09,  3.42it/s]

Step 13750,     Loss = 5.964488506317139,    PPL = 389.3538251077143


13800it [1:04:23,  3.40it/s]

Step 13800,     Loss = 5.955161094665527,    PPL = 385.739046175662


13850it [1:04:37,  3.39it/s]

Step 13850,     Loss = 5.987511157989502,    PPL = 398.4217660945373


13900it [1:04:51,  3.40it/s]

Step 13900,     Loss = 5.968135356903076,    PPL = 390.77633259251746


13950it [1:05:05,  3.40it/s]

Step 13950,     Loss = 5.872508525848389,    PPL = 355.1387385157324


14000it [1:05:19,  3.41it/s]

Step 14000,     Loss = 5.734702110290527,    PPL = 309.4207837617303


14050it [1:05:32,  3.40it/s]

Step 14050,     Loss = 5.68104887008667,    PPL = 293.25685701385913


14100it [1:05:46,  3.40it/s]

Step 14100,     Loss = 5.85572624206543,    PPL = 349.2284323579525


14150it [1:06:00,  3.40it/s]

Step 14150,     Loss = 6.112387180328369,    PPL = 451.41503906806213


14200it [1:06:14,  3.39it/s]

Step 14200,     Loss = 5.7874016761779785,    PPL = 326.16444156850423


14250it [1:06:28,  3.40it/s]

Step 14250,     Loss = 5.9536356925964355,    PPL = 385.15108758712756


14300it [1:06:42,  3.40it/s]

Step 14300,     Loss = 5.868432521820068,    PPL = 353.69413768635593


14350it [1:06:56,  3.39it/s]

Step 14350,     Loss = 5.7705559730529785,    PPL = 320.7159925384274


14400it [1:07:10,  3.40it/s]

Step 14400,     Loss = 5.943863391876221,    PPL = 381.40560612915795


14450it [1:07:24,  3.40it/s]

Step 14450,     Loss = 5.931313991546631,    PPL = 376.6491025476705


14500it [1:07:38,  3.38it/s]

Step 14500,     Loss = 6.1145734786987305,    PPL = 452.4030466784994


14550it [1:07:52,  3.40it/s]

Step 14550,     Loss = 5.8181843757629395,    PPL = 336.36079407903406


14600it [1:08:06,  3.40it/s]

Step 14600,     Loss = 5.958302021026611,    PPL = 386.9525288465233


14650it [1:08:20,  3.39it/s]

Step 14650,     Loss = 6.04052209854126,    PPL = 420.1123176663028


14700it [1:08:34,  3.39it/s]

Step 14700,     Loss = 6.363361835479736,    PPL = 580.1935968277029


14750it [1:08:48,  3.38it/s]

Step 14750,     Loss = 6.164095401763916,    PPL = 475.3709286756116


14800it [1:09:02,  3.40it/s]

Step 14800,     Loss = 5.958930015563965,    PPL = 387.1956092394447


14850it [1:09:16,  3.39it/s]

Step 14850,     Loss = 5.976975917816162,    PPL = 394.24633032743964


14900it [1:09:30,  3.39it/s]

Step 14900,     Loss = 5.971280097961426,    PPL = 392.0071572683567


14950it [1:09:44,  3.39it/s]

Step 14950,     Loss = 5.7878923416137695,    PPL = 326.3245184552521


15000it [1:09:58,  3.40it/s]

Step 15000,     Loss = 6.029916763305664,    PPL = 415.68042807722355


15050it [1:10:12,  3.40it/s]

Step 15050,     Loss = 5.953316688537598,    PPL = 385.0282424220163


15100it [1:10:26,  3.39it/s]

Step 15100,     Loss = 5.8476243019104,    PPL = 346.41043553412806


15150it [1:10:40,  3.39it/s]

Step 15150,     Loss = 5.849868297576904,    PPL = 347.1886518807931


15200it [1:10:54,  3.42it/s]

Step 15200,     Loss = 5.633777141571045,    PPL = 279.7166542150701


15250it [1:11:08,  3.40it/s]

Step 15250,     Loss = 6.128780841827393,    PPL = 458.8763766318346


15300it [1:11:22,  3.40it/s]

Step 15300,     Loss = 5.885165214538574,    PPL = 359.66218451009


15350it [1:11:36,  3.38it/s]

Step 15350,     Loss = 6.219367504119873,    PPL = 502.38537482933816


15400it [1:11:50,  3.39it/s]

Step 15400,     Loss = 5.961817741394043,    PPL = 388.31533996108635


15450it [1:12:04,  3.39it/s]

Step 15450,     Loss = 5.734358310699463,    PPL = 309.31442330719426


15500it [1:12:18,  3.40it/s]

Step 15500,     Loss = 5.942251682281494,    PPL = 380.79138615934494


15550it [1:12:32,  3.39it/s]

Step 15550,     Loss = 6.012641429901123,    PPL = 408.5610816892542


15600it [1:12:46,  3.39it/s]

Step 15600,     Loss = 6.248547554016113,    PPL = 517.2609851568635


15650it [1:13:00,  3.39it/s]

Step 15650,     Loss = 5.88454008102417,    PPL = 359.4374178865833


15700it [1:13:14,  3.39it/s]

Step 15700,     Loss = 5.83900260925293,    PPL = 343.43662926855166


15750it [1:13:28,  3.39it/s]

Step 15750,     Loss = 5.737794876098633,    PPL = 310.379231144735


15800it [1:13:42,  3.39it/s]

Step 15800,     Loss = 5.90509557723999,    PPL = 366.90229162007137


15850it [1:13:56,  3.39it/s]

Step 15850,     Loss = 6.043142795562744,    PPL = 421.2147487038051


15900it [1:14:10,  3.40it/s]

Step 15900,     Loss = 5.8226776123046875,    PPL = 337.8755432048028


15950it [1:14:24,  3.39it/s]

Step 15950,     Loss = 5.814049243927002,    PPL = 334.9727696609119


16000it [1:14:38,  3.39it/s]

Step 16000,     Loss = 6.034351825714111,    PPL = 417.52809093917506


16050it [1:14:52,  3.40it/s]

Step 16050,     Loss = 6.1308207511901855,    PPL = 459.8133982435396


16100it [1:15:06,  3.40it/s]

Step 16100,     Loss = 6.03956937789917,    PPL = 419.71225859179543


16150it [1:15:20,  3.40it/s]

Step 16150,     Loss = 6.007138729095459,    PPL = 406.3190665225077


16200it [1:15:34,  3.39it/s]

Step 16200,     Loss = 6.169618129730225,    PPL = 478.0035358932093


16250it [1:15:48,  3.41it/s]

Step 16250,     Loss = 5.683528900146484,    PPL = 293.98504542584584


16300it [1:16:02,  3.39it/s]

Step 16300,     Loss = 6.317073822021484,    PPL = 553.9496632288602


16350it [1:16:16,  3.39it/s]

Step 16350,     Loss = 5.938704967498779,    PPL = 379.4432199147336


16400it [1:16:30,  3.40it/s]

Step 16400,     Loss = 6.13353157043457,    PPL = 461.0615602587823


16450it [1:16:44,  3.39it/s]

Step 16450,     Loss = 5.8144612312316895,    PPL = 335.1108026213895


16500it [1:16:58,  3.39it/s]

Step 16500,     Loss = 5.845064163208008,    PPL = 345.5247110437542


16550it [1:17:12,  3.38it/s]

Step 16550,     Loss = 6.0350518226623535,    PPL = 417.82046164600075


16600it [1:17:26,  3.39it/s]

Step 16600,     Loss = 5.970322608947754,    PPL = 391.63199435783605


16650it [1:17:40,  3.39it/s]

Step 16650,     Loss = 6.024470806121826,    PPL = 413.42280330747366


16700it [1:17:54,  3.40it/s]

Step 16700,     Loss = 6.046284198760986,    PPL = 422.5400346013363


16750it [1:18:08,  3.39it/s]

Step 16750,     Loss = 5.90385103225708,    PPL = 366.445949242028


16800it [1:18:22,  3.40it/s]

Step 16800,     Loss = 6.164268493652344,    PPL = 475.4532186490177


16801it [1:18:22,  3.63it/s]


Saving Model at epoch 2...
