# Load Data

In [None]:
! pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 31.0 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.26.16-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 62.7 MB/s 
Collecting botocore<1.30.0,>=1.29.16
  Downloading botocore-1.29.16-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 49.2 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 9.5 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 68.6 MB/s 
  Downloading urllib3-1.25.11-py2.py3

In [None]:
import os
import codecs

def read_conll_file(file_name, raw=False):
    """
    Read a CoNLL-style file and yield one sentence at a time.

    Expected layout (one token per line, word and tag separated by a tab)::

        word1    tag1
        ...      ...
        wordN    tagN

    Sentences MUST be separated by empty lines.

    :param file_name: file to read in (decoded as UTF-8)
    :param raw: if True, treat the file as raw text with one sentence per
        line; tokens come from whitespace splitting and every token gets
        the placeholder tag 'DUMMY'
    :return: generator of instances ((list of words, list of tags) pairs)
    :raises IOError: if a non-raw, non-empty line does not consist of exactly
        one word and one tag separated by a single tab
    """
    current_words = []
    current_tags = []

    # The context manager closes the file when the generator finishes or fails.
    with codecs.open(file_name, encoding='utf-8') as conll_file:
        for line_number, line in enumerate(conll_file, start=1):
            # Strip exactly the line terminator (if there is one). Slicing off
            # the last character unconditionally would truncate a final line
            # that has no trailing newline and would leave '\r' on CRLF files.
            pieces = line.splitlines()
            line = pieces[0] if pieces else ''

            if line:
                if raw:
                    current_words = line.split()  # simple splitting by space
                    current_tags = ['DUMMY' for _ in current_words]
                    yield (current_words, current_tags)
                else:
                    fields = line.split("\t")
                    if len(fields) == 1:  # empty words in gimpel
                        raise IOError("Issue with input file - doesn't have a tag or token?")
                    if len(fields) != 2:
                        # Raise instead of printing and calling exit(), which
                        # would tear down the notebook kernel.
                        raise IOError("erroneous line: {} (line number: {}) ".format(line, line_number))
                    word, tag = fields
                    current_words.append(word)
                    current_tags.append(tag)
            else:
                # Empty line: sentence boundary (consecutive empty lines are skipped).
                if current_words and not raw:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []

    # Emit the last sentence when the file does not end with an empty line.
    if current_tags != [] and not raw:
        yield (current_words, current_tags)


def read_data(data_file):
    """
    Load a tagged CoNLL file into parallel per-sentence lists.

    Prints the number of sentences and the number of distinct tags.

    :param data_file: path passed on to ``read_conll_file``
    :return: (list of word lists, list of tag lists, list of the distinct
        tags in arbitrary order)
    """
    sentences = list(read_conll_file(data_file))
    word_lst = [words for words, _ in sentences]
    tag_lst = [tags for _, tags in sentences]

    # Flatten the per-sentence tags, then deduplicate once.
    all_tags = [tag for sentence_tags in tag_lst for tag in sentence_tags]
    distinct_tags = set(all_tags)

    print("The number of samples:", len(word_lst))
    print("The number of tags", len(distinct_tags))
    return word_lst, tag_lst, list(distinct_tags)

In [None]:
# Mount Google Drive so the dataset and checkpoint paths below are reachable
# (this only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# Put a Drive folder at the front of the module search path.
# NOTE(review): this path contains "Colab Notebooks/" while data_dir and
# model_dir below do not — confirm that both locations are intended.
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Capstone')

import os
import pandas as pd
import numpy as np



# Dataset root, its pos_fine/wsj sub-directory, and the directory that holds
# the model checkpoint used later in the notebook.
data_dir = "/content/drive/MyDrive/Capstone/data/gweb_sancl"
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
model_dir = "/content/drive/MyDrive/Capstone/model"

Mounted at /content/drive


In [None]:
# Train / dev / test CoNLL files inside wsj_dir.
wsj_train_file, wsj_dev_file, wsj_test_file = [
    os.path.join(wsj_dir, "gweb-wsj-{}.conll".format(split))
    for split in ("train", "dev", "test")
]

In [None]:
import os
import codecs


# NOTE(review): this cell redefines read_conll_file from an earlier cell; this
# later definition is the one in effect when the data is loaded below.
def read_conll_file(file_name, raw=False):
    """
    Read a CoNLL-style file and yield one sentence at a time.

    Expected layout (one token per line, word and tag separated by a tab)::

        word1    tag1
        ...      ...
        wordN    tagN

    Sentences MUST be separated by empty lines.

    :param file_name: file to read in (decoded as UTF-8)
    :param raw: if True, treat the file as raw text with one sentence per
        line; tokens come from whitespace splitting and every token gets
        the placeholder tag 'DUMMY'
    :return: generator of instances ((list of words, list of tags) pairs)
    :raises IOError: if a non-raw, non-empty line does not consist of exactly
        one word and one tag separated by a single tab
    """
    current_words = []
    current_tags = []

    # The context manager closes the file when the generator finishes or fails.
    with codecs.open(file_name, encoding='utf-8') as conll_file:
        for line_number, line in enumerate(conll_file, start=1):
            # Strip exactly the line terminator (if there is one). Slicing off
            # the last character unconditionally would truncate a final line
            # that has no trailing newline and would leave '\r' on CRLF files.
            pieces = line.splitlines()
            line = pieces[0] if pieces else ''

            if line:
                if raw:
                    current_words = line.split()  # simple splitting by space
                    current_tags = ['DUMMY' for _ in current_words]
                    yield (current_words, current_tags)
                else:
                    fields = line.split("\t")
                    if len(fields) == 1:  # empty words in gimpel
                        raise IOError("Issue with input file - doesn't have a tag or token?")
                    if len(fields) != 2:
                        # Raise instead of printing and calling exit(), which
                        # would tear down the notebook kernel.
                        raise IOError("erroneous line: {} (line number: {}) ".format(line, line_number))
                    word, tag = fields
                    current_words.append(word)
                    current_tags.append(tag)
            else:
                # Empty line: sentence boundary (consecutive empty lines are skipped).
                if current_words and not raw:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []

    # Emit the last sentence when the file does not end with an empty line.
    if current_tags != [] and not raw:
        yield (current_words, current_tags)


def read_data(data_file):
    """
    Read every sentence of a tagged CoNLL file and summarise its tag inventory.

    Prints the number of sentences and the number of distinct tags.

    :param data_file: path passed on to ``read_conll_file``
    :return: (list of word lists, list of tag lists, list of the distinct
        tags in arbitrary order)
    """
    word_lst, tag_lst, flat_tags = [], [], []
    for sentence_words, sentence_tags in read_conll_file(data_file):
        word_lst += [sentence_words]
        tag_lst += [sentence_tags]
        flat_tags += list(sentence_tags)

    tag_inventory = set(flat_tags)
    print("The number of samples:", len(word_lst))
    print("The number of tags", len(tag_inventory))
    return word_lst, tag_lst, list(tag_inventory)

In [None]:
# Load the three WSJ splits. Each call prints its sentence count and tag count
# and returns (word lists, tag lists, distinct tags).
wsj_train_word_lst, wsj_train_tag_lst, wsj_train_tag_set = read_data(wsj_train_file)
wsj_dev_word_lst, wsj_dev_tag_lst, wsj_dev_tag_set = read_data(wsj_dev_file)
wsj_test_word_lst, wsj_test_tag_lst, wsj_test_tag_set = read_data(wsj_test_file)

The number of samples: 30060
The number of tags 48
The number of samples: 1336
The number of tags 45
The number of samples: 1640
The number of tags 45


In [None]:
# NOTE(review): Drive is already mounted by an earlier cell, so this repeated
# mount only prints the "already mounted" notice recorded below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Tag vocabulary: the sorted union of the tags seen in train/dev/test, with
# "<pad>" placed first so that it receives index 0.
wsj_tags = ["<pad>"] + sorted(
    set(wsj_train_tag_set) | set(wsj_dev_tag_set) | set(wsj_test_tag_set)
)
tag2idx = {tag: position for position, tag in enumerate(wsj_tags)}
idx2tag = dict(enumerate(wsj_tags))
print(len(wsj_tags))

49


# Build Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [None]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the tokenizer for the 'bert-base-cased' checkpoint with lower-casing
# disabled; the vocabulary is downloaded on first use (see the progress bar below).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 236677.35B/s]


In [None]:
class PosDataset(data.Dataset):
    """
    Sentence-level POS dataset that aligns word tags with tokenizer pieces.

    Relies on the notebook-level ``tokenizer`` and ``tag2idx``.
    """

    def __init__(self, word_lst, tag_lst):
        """
        :param word_lst: list of sentences, each a list of word strings
        :param tag_lst: list of tag lists parallel to ``word_lst``
        """
        sents, tags_li = [], []  # list of lists
        for i in range(len(word_lst)):
            # Wrap every sentence in [CLS] ... [SEP]; both markers are tagged <pad>.
            sents.append(["[CLS]"] + word_lst[i] + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """
        :param idx: sentence index
        :return: (space-joined words including [CLS]/[SEP], list of token ids,
            list of head flags (1 for the first piece of a word, else 0),
            space-joined tags, list of tag ids, sequence length in pieces)
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer produced no pieces for this word. Without a
                # placeholder, x would get no entry while y and is_heads each
                # get one, and the length assertion below would fail.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def pad(batch):
    '''Collate samples into one batch, right-padding the id lists with 0.

    Each sample is indexed as (words, x, is_heads, tags, y, seqlen). The id
    lists x and y are padded with 0 (<pad>) up to the largest seqlen in the
    batch and returned as LongTensors; the other fields are returned as
    plain per-sample lists.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    maxlen = max(seqlens)

    def right_pad(position):
        # 0: <pad>
        padded = []
        for sample in batch:
            ids = sample[position]
            padded.append(ids + [0] * (maxlen - len(ids)))
        return padded

    x = torch.LongTensor(right_pad(1))
    y = torch.LongTensor(right_pad(-2))
    return words, x, is_heads, tags, y, seqlens

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    """Pretrained BERT encoder followed by a linear layer applied at every position."""

    def __init__(self, vocab_size=None):
        """
        :param vocab_size: output width of the linear layer (the notebook
            passes the number of tags)
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def _encode(self, x):
        # The encoder returns a sequence of per-layer outputs; keep the last one.
        encoded_layers, _ = self.bert(x)
        return encoded_layers[-1]

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y moved to the notebook-level device, argmax of the
        logits over the last dimension).
        '''
        x, y = x.to(device), y.to(device)

        if not self.training:
            # Evaluation: run the encoder without building a graph.
            self.bert.eval()
            with torch.no_grad():
                enc = self._encode(x)
        else:
            self.bert.train()
            enc = self._encode(x)

        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

In [None]:
def train(model, iterator, optimizer, criterion):
    """
    Make one pass over ``iterator`` and update ``model`` after every batch.

    :param model: callable as ``model(x, y)`` returning (logits, y, predictions)
    :param iterator: yields batches (words, x, is_heads, tags, y, seqlens)
    :param optimizer: optimizer over the model's parameters
    :param criterion: loss taking (N*T, VOCAB) logits and (N*T,) targets
    """
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        # Flatten the batch and time dimensions before computing the loss.
        n_classes = logits.shape[-1]
        loss = criterion(logits.view(-1, n_classes), y.view(-1))
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator, average="macro"):
    """
    Evaluate ``model`` on ``iterator`` and report token-level metrics.

    Only positions flagged as word heads are scored, and the first and last
    position of every sentence ([CLS]/[SEP]) are dropped. One
    "word gold_tag predicted_tag" line per scored word is written to the file
    "result" in the current working directory, with an empty line after each
    sentence. Accuracy and a classification report are printed.

    NOTE: this function shadows the built-in ``eval`` inside the notebook.

    :param model: callable as ``model(x, y)`` returning (_, _, y_hat)
    :param iterator: yields batches (words, x, is_heads, tags, y, seqlens)
    :param average: averaging mode forwarded to the sklearn metric functions
    :return: (precision, recall, f1)
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # Gold and predicted ids are collected while writing, so the file does not
    # have to be re-opened and parsed afterwards (those handles were never closed).
    gold_ids, pred_ids = [], []
    with open("result", 'w', encoding="utf-8") as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # Keep the prediction of the first piece of every word only.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                gold_ids.append(tag2idx[t])
                pred_ids.append(tag2idx[p])
            fout.write("\n")

    ## calc metric
    y_true = np.array(gold_ids)
    y_pred = np.array(pred_ids)

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)
    print("classification_report", classification_report(y_true, y_pred))
    precision_value = precision_score(y_true, y_pred, average=average)
    recall_value = recall_score(y_true, y_pred, average=average)
    f1_value = f1_score(y_true, y_pred, average=average)

    return precision_value, recall_value, f1_value

In [None]:
# One output per tag; move the network to the selected device, then wrap it
# in DataParallel.
model = nn.DataParallel(Net(vocab_size=len(tag2idx)).to(device))

100%|██████████| 404400730/404400730 [00:32<00:00, 12309664.81B/s]


In [None]:
# Wrap the WSJ train and test splits for the DataLoader.
# NOTE(review): eval_dataset is built from the WSJ *test* split, not the dev split.
train_dataset = PosDataset(wsj_train_word_lst, wsj_train_tag_lst)
eval_dataset = PosDataset(wsj_test_word_lst, wsj_test_tag_lst)

# pad() right-pads every batch to its longest sample; only the training
# loader shuffles.
train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

# Index 0 is "<pad>" in tag2idx, so padded positions and positions tagged
# "<pad>" do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
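# Training is disabled here; the "Load Model" cell below restores a saved
# checkpoint instead. Uncomment these two lines to retrain the baseline.
# train(model, train_iter, optimizer, criterion)
# eval(model, test_iter)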

# Save Model

In [None]:
# Path of the baseline checkpoint inside model_dir. Saving is commented out;
# re-enable the line below after retraining to overwrite the checkpoint.
model_file = os.path.join(model_dir, "base_model.pt")
# torch.save(model.state_dict(), model_file)

## Load Model

In [None]:
# Rebuild the architecture and restore the saved weights. The model is wrapped
# in DataParallel before loading.
# NOTE(review): this presumably matches how the checkpoint was saved (from a
# wrapped model) — confirm against the commented-out save above.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads when device is 'cpu'.
model.load_state_dict(torch.load(model_file, map_location=device))
wsj_precision_value, wsj_recall_value, wsj_f1_value = eval(model, test_iter)

acc=0.97
classification_report               precision    recall  f1-score   support

           1       1.00      1.00      1.00       178
           2       1.00      1.00      1.00       352
           3       1.00      1.00      1.00      2000
           4       1.00      1.00      1.00        60
           5       1.00      1.00      1.00        60
           6       1.00      1.00      1.00      1613
           7       1.00      1.00      1.00       223
           9       1.00      0.99      1.00       935
          10       0.98      1.00      0.99      1266
          11       0.99      1.00      0.99      3309
          12       1.00      1.00      1.00        46
          13       1.00      0.20      0.33        20
          14       1.00      0.99      1.00       511
          15       0.97      0.99      0.98      4250
          16       0.97      0.89      0.93      2423
          17       0.96      0.93      0.94       139
          18       0.92      0.93      0.93       

In [None]:
# Display the (precision, recall, f1) returned by eval() on the WSJ test loader.
wsj_precision_value, wsj_recall_value, wsj_f1_value

(0.9417027899389416, 0.9425258210459151, 0.9318435575593024)

# Self Training

In [None]:
def filter_tag(process_words, process_tags, label_tags_set=wsj_tags):
    """
    Drop every token whose tag is not in ``label_tags_set``.

    Sentences that end up with no tokens are removed entirely. Prints the
    number of sentences that remain.

    :param process_words: list of sentences, each a list of words
    :param process_tags: list of tag lists parallel to ``process_words``
    :param label_tags_set: collection of accepted tags (defaults to the
        value of ``wsj_tags`` at the time this function is defined)
    :return: (filtered word lists, filtered tag lists)
    """
    new_words = []
    new_tags = []
    for words, tags in zip(process_words, process_tags):
        kept = [(words[pos], tag) for pos, tag in enumerate(tags) if tag in label_tags_set]
        if not kept:
            continue
        new_words.append([word for word, _ in kept])
        new_tags.append([tag for _, tag in kept])

    print("after filter tag", len(new_words))
    return new_words, new_tags

In [None]:
# Names of the non-WSJ domains, presumably matching the sub-directories of
# pos_fine — TODO confirm. Only the single `domain` chosen in the next cell is
# processed; this list is not iterated in the cells shown here.
file_name_lst = ["answers", "emails", "newsgroups", "reviews", "weblogs"]

In [None]:
# Target domain for self-training and the dev/test files that belong to it.
domain = "emails"
domain_dir = os.path.join(data_dir, "pos_fine", domain)
domain_dev_file, domain_test_file = (
    os.path.join(domain_dir, "gweb-{}-{}.conll".format(domain, split))
    for split in ("dev", "test")
)

In [None]:
# Read the target-domain dev/test files, then drop every token whose tag is
# not part of the WSJ tag vocabulary (filter_tag's default label set), since
# tag2idx has no index for such tags. Sentences left empty are removed, which
# is why the counts printed below are smaller than the raw sample counts.
domain_dev_word_lst, domain_dev_tag_lst, domain_dev_tag_set = read_data(domain_dev_file)
domain_test_word_lst, domain_test_tag_lst, domain_test_tag_set = read_data(domain_test_file)
domain_dev_word_lst, domain_dev_tag_lst = filter_tag(domain_dev_word_lst, domain_dev_tag_lst)  
domain_test_word_lst, domain_test_tag_lst = filter_tag(domain_test_word_lst, domain_test_tag_lst)

The number of samples: 2450
The number of tags 49
The number of samples: 2450
The number of tags 48
after filter tag 2427
after filter tag 2402


In [None]:
# Metric history: one entry is appended per evaluation on the target-domain
# test set.
domain_precision_value_lst, domain_recall_value_lst, domain_f1_value_lst = [], [], []

In [None]:
# Baseline: evaluate the model restored above on the target-domain test set
# before any self-training, and record it as the first history entry.
domain_test_dataset = PosDataset(domain_test_word_lst, domain_test_tag_lst)

domain_test_iter = data.DataLoader(dataset=domain_test_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

domain_precision_value, domain_recall_value, domain_f1_value = eval(model, domain_test_iter)

domain_precision_value_lst.append(domain_precision_value)
domain_recall_value_lst.append(domain_recall_value)
domain_f1_value_lst.append(domain_f1_value)

acc=0.91
classification_report               precision    recall  f1-score   support

           1       0.79      0.94      0.86        35
           2       0.87      0.52      0.65        77
           3       1.00      0.79      0.88      1030
           4       1.00      0.84      0.91       291
           5       0.91      0.84      0.87       294
           6       0.99      0.98      0.99      1570
           7       0.61      0.94      0.74       186
           8       0.00      0.00      0.00        11
           9       0.99      0.98      0.98       689
          10       0.93      0.98      0.96       901
          11       0.96      1.00      0.98      2111
          12       0.98      0.96      0.97        47
          13       0.60      0.46      0.52        13
          14       0.28      1.00      0.44        43
          15       0.93      0.98      0.95      2778
          16       0.90      0.80      0.85      1151
          17       0.91      0.95      0.93       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
class PosDataset_new(data.Dataset):
    """Dataset over parallel sequences that are returned as stored, plus their length."""

    def __init__(self, word_lst, tag_lst):
        """
        :param word_lst: list of sequences
        :param tag_lst: list of sequences parallel to ``word_lst``
        """
        self.word_lst = word_lst
        self.tag_lst = tag_lst

    def __len__(self):
        """Number of stored sequences."""
        return len(self.word_lst)

    def __getitem__(self, idx):
        """Return (words, tags, seqlen) for entry ``idx``; both sequences must have equal length."""
        words = self.word_lst[idx]
        tags = self.tag_lst[idx]
        assert len(words)==len(tags)
        return words, tags, len(words)

def pad_new(batch):
    '''Collate (x, y, seqlen) samples, right-padding x and y with 0.

    Both id lists are padded with 0 (<pad>) up to the largest seqlen in the
    batch and returned as LongTensors, together with the list of seqlens.
    '''
    seqlens = [sample[-1] for sample in batch]
    maxlen = max(seqlens)

    def right_pad(position):
        # 0: <pad>
        return [
            sample[position] + [0] * (maxlen - len(sample[position]))
            for sample in batch
        ]

    return torch.LongTensor(right_pad(0)), torch.LongTensor(right_pad(1)), seqlens

def train_new(model, iterator, optimizer, criterion):
    """
    Make one pass over ``iterator`` whose batches are (x, y, seqlens) triples.

    :param model: callable as ``model(x, y)`` returning (logits, y, predictions)
    :param iterator: yields (x, y, seqlens) batches
    :param optimizer: optimizer over the model's parameters
    :param criterion: loss taking (N*T, VOCAB) logits and (N*T,) targets
    """
    model.train()
    step = 0
    for x, y, seqlens in iterator:
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        flat_logits = logits.view(-1, logits.shape[-1])  # (N*T, VOCAB)
        flat_targets = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))
        step += 1

In [None]:
def gen_pseudo_data(model, domain_dev_iter, topn=300, initial=True):
    """
    Pseudo-label every sentence of ``domain_dev_iter`` and rank by confidence.

    The confidence of a sentence is the mean of the model's maximum softmax
    probability over the positions where ``y`` is non-zero. A sentence with no
    such position gets confidence 0 instead of NaN, so it is ranked last
    rather than corrupting the ordering. Ties are broken by position in the
    iterator, larger index first.

    :param model: callable as ``model(x, y)`` returning (logits, _, y_hat)
    :param domain_dev_iter: iterable of batches; 6-tuples
        (_, x, _, _, y, _) when ``initial`` is True, otherwise
        (x, y, seqlens) triples
    :param topn: number of most confident sentences to select
    :param initial: selects the batch layout described above
    :return: (selected x id lists, selected predicted id lists,
        remaining x id lists, remaining predicted id lists); the id lists
        keep the batch padding
    """
    model.eval()

    mean_probs = []
    new_x_lst = []
    new_y_lst = []

    with torch.no_grad():
        for batch in domain_dev_iter:
            # The two batch layouts differ only in where x and y are stored.
            if initial:
                _, x, _, _, y, _ = batch
            else:
                x, y, _ = batch

            logits, _, y_hat = model(x, y)  # y_hat: (N, T)

            # Probability of the predicted class at every position.
            max_prob = torch.amax(torch.softmax(logits, dim=2), dim=2)

            # Rank by mean probability over the positions with a non-zero y.
            labeled = y.bool().to(device)
            sen_len = labeled.sum(axis=1).clamp(min=1)  # avoid 0/0 -> NaN
            mean_prob = (labeled * max_prob.to(device)).sum(axis=1) / sen_len
            # Plain floats sort cheaply and need no tensor comparisons.
            mean_probs.extend(mean_prob.tolist())

            new_x_lst.extend(x.tolist())
            new_y_lst.extend(y_hat.tolist())

    # Most confident first; equal scores are ordered by descending index.
    ranked = sorted(range(len(mean_probs)),
                    key=lambda position: (mean_probs[position], position),
                    reverse=True)

    select_ind = ranked[:topn]
    not_select_ind = ranked[topn:]

    new_train_x = [new_x_lst[i] for i in select_ind]
    new_train_y = [new_y_lst[i] for i in select_ind]

    remain_train_x = [new_x_lst[i] for i in not_select_ind]
    remain_train_y = [new_y_lst[i] for i in not_select_ind]

    return new_train_x, new_train_y, remain_train_x, remain_train_y

In [None]:
# Self-training: every round pseudo-labels the target-domain dev set, keeps
# the topn most confident sentences, retrains a fresh model on
# WSJ + pseudo-labeled sentences and evaluates it on the target-domain test
# set. The loop stops once at most 50 of the selected sentences are new
# compared with the previous round.
# NOTE(review): there is no upper bound on the number of rounds.
topn = 500
i = 0
last_top_sen = set()
top_words = domain_dev_word_lst[:topn]
new_top_sen = set([tuple(sen) for sen in top_words])

while len(new_top_sen.difference(last_top_sen)) > 50:
  i += 1
  print("\nLoop", i)

  domain_dev_dataset = PosDataset(domain_dev_word_lst, domain_dev_tag_lst)

  domain_dev_iter = data.DataLoader(dataset=domain_dev_dataset,
                              batch_size=8,
                              shuffle=True,
                              num_workers=1,
                              collate_fn=pad)

  if i == 1:
    last_top_sen = set()
  else:
    last_top_sen = new_top_sen

  # remain_words / remain_tags are returned but not used in this cell.
  top_words_ids, top_tags_ids, remain_words, remain_tags = gen_pseudo_data(model, domain_dev_iter, topn)
  # Identify a sentence by its token ids without the padding id 0. The id
  # lists are padded to the longest sample of their batch and the loader
  # shuffles, so with padding included the same sentence would produce
  # different tuples in different rounds and inflate the "Difference" count
  # that drives the stopping criterion.
  new_top_sen = set([tuple(tok for tok in sen if tok != 0) for sen in top_words_ids])

  # Revert ids to words
  # NOTE(review): every token id up to [SEP] becomes a separate "word" here,
  # paired with the prediction at that position, so non-head word pieces
  # appear to enter the pseudo-labeled data as words of their own — confirm
  # whether only head pieces should be kept.
  top_words = []
  top_tags = []
  for t in range(len(top_words_ids)):
    word_ids = tokenizer.convert_ids_to_tokens(top_words_ids[t])
    tag_ids = list(map(idx2tag.get, top_tags_ids[t]))
    words = []
    tags = []
    for k, w in enumerate(word_ids):
      if w == '[CLS]':
        pass
      elif w == '[SEP]':
        break  # everything after [SEP] is batch padding
      else:
        words.append(w)
        tags.append(tag_ids[k])
    top_words.append(words)
    top_tags.append(tags)

  new_train_dataset = PosDataset(wsj_train_word_lst+top_words, wsj_train_tag_lst+top_tags)
  new_train_iter = data.DataLoader(dataset=new_train_dataset,
                              batch_size=8,
                              shuffle=True,
                              num_workers=1,
                              collate_fn=pad)

  # A new model is created every round, so the next round's pseudo-labels
  # come from this freshly trained model.
  print("Train from scratch...")
  model = Net(vocab_size=len(tag2idx))
  model.to(device)
  model = nn.DataParallel(model)

  optimizer = optim.Adam(model.parameters(), lr = 0.0001)
  criterion = nn.CrossEntropyLoss(ignore_index=0)

  train(model, new_train_iter, optimizer, criterion)

  domain_precision_value, domain_recall_value, domain_f1_value = eval(model, domain_test_iter)
  domain_precision_value_lst.append(domain_precision_value)
  domain_recall_value_lst.append(domain_recall_value)
  domain_f1_value_lst.append(domain_f1_value)

  print("Difference", len(new_top_sen.difference(last_top_sen)))


Loop 1
Train from scratch...
step: 0, loss: 4.036646842956543
step: 10, loss: 2.025042772293091
step: 20, loss: 0.7598252892494202
step: 30, loss: 0.38657280802726746
step: 40, loss: 0.20626065135002136
step: 50, loss: 0.3390898108482361
step: 60, loss: 0.22518108785152435
step: 70, loss: 0.2822517156600952
step: 80, loss: 0.15507148206233978
step: 90, loss: 0.21990534663200378
step: 100, loss: 0.2163505107164383
step: 110, loss: 0.17872563004493713
step: 120, loss: 0.1065683662891388
step: 130, loss: 0.14126498997211456
step: 140, loss: 0.0858919769525528
step: 150, loss: 0.12172899395227432
step: 160, loss: 0.14077410101890564
step: 170, loss: 0.11176017671823502
step: 180, loss: 0.1498749852180481
step: 190, loss: 0.05982294678688049
step: 200, loss: 0.11438219249248505
step: 210, loss: 0.1178639829158783
step: 220, loss: 0.09368232637643814
step: 230, loss: 0.07843580096960068
step: 240, loss: 0.06928496062755585
step: 250, loss: 0.10336120426654816
step: 260, loss: 0.192699715495

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9660637378692627
step: 10, loss: 2.0270564556121826
step: 20, loss: 0.8263635039329529
step: 30, loss: 0.464794784784317
step: 40, loss: 0.39439985156059265
step: 50, loss: 0.2372395545244217
step: 60, loss: 0.35294830799102783
step: 70, loss: 0.27602988481521606
step: 80, loss: 0.19601143896579742
step: 90, loss: 0.20713412761688232
step: 100, loss: 0.1984756737947464
step: 110, loss: 0.16856598854064941
step: 120, loss: 0.2540299892425537
step: 130, loss: 0.1296726018190384
step: 140, loss: 0.12365398555994034
step: 150, loss: 0.1291559636592865
step: 160, loss: 0.22244258224964142
step: 170, loss: 0.15750591456890106
step: 180, loss: 0.19788070023059845
step: 190, loss: 0.12344711273908615
step: 200, loss: 0.09306109696626663
step: 210, loss: 0.08963422477245331
step: 220, loss: 0.12762440741062164
step: 230, loss: 0.1617126613855362
step: 240, loss: 0.12461451441049576
step: 250, loss: 0.08309676498174667
step: 260, loss: 0.1303124576807022
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.856262445449829
step: 10, loss: 1.9734201431274414
step: 20, loss: 0.7351809740066528
step: 30, loss: 0.5117853879928589
step: 40, loss: 0.3234884738922119
step: 50, loss: 0.2817065417766571
step: 60, loss: 0.22018836438655853
step: 70, loss: 0.28618109226226807
step: 80, loss: 0.2472105175256729
step: 90, loss: 0.21570774912834167
step: 100, loss: 0.2512865960597992
step: 110, loss: 0.26859328150749207
step: 120, loss: 0.1237674206495285
step: 130, loss: 0.275084912776947
step: 140, loss: 0.1105947494506836
step: 150, loss: 0.2579306364059448
step: 160, loss: 0.214762344956398
step: 170, loss: 0.20998643338680267
step: 180, loss: 0.14759477972984314
step: 190, loss: 0.19216059148311615
step: 200, loss: 0.1695946902036667
step: 210, loss: 0.19324927031993866
step: 220, loss: 0.3201900124549866
step: 230, loss: 0.17924188077449799
step: 240, loss: 0.22279155254364014
step: 250, loss: 0.11815333366394043
step: 260, loss: 0.08782553672790527
step: 27

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9929401874542236
step: 10, loss: 1.8011258840560913
step: 20, loss: 0.7792457342147827
step: 30, loss: 0.43431463837623596
step: 40, loss: 0.31532877683639526
step: 50, loss: 0.3240410387516022
step: 60, loss: 0.15017040073871613
step: 70, loss: 0.39023250341415405
step: 80, loss: 0.3229660987854004
step: 90, loss: 0.10904449224472046
step: 100, loss: 0.12808051705360413
step: 110, loss: 0.12374233454465866
step: 120, loss: 0.23023970425128937
step: 130, loss: 0.14011384546756744
step: 140, loss: 0.20931710302829742
step: 150, loss: 0.08408397436141968
step: 160, loss: 0.14771296083927155
step: 170, loss: 0.1929650753736496
step: 180, loss: 0.23879700899124146
step: 190, loss: 0.22340290248394012
step: 200, loss: 0.09469575434923172
step: 210, loss: 0.08287011831998825
step: 220, loss: 0.09883993864059448
step: 230, loss: 0.09361308813095093
step: 240, loss: 0.15433791279792786
step: 250, loss: 0.05646607279777527
step: 260, loss: 0.11651411652565

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9658899307250977
step: 10, loss: 1.872463583946228
step: 20, loss: 0.605051577091217
step: 30, loss: 0.4266273081302643
step: 40, loss: 0.2592635750770569
step: 50, loss: 0.21919898688793182
step: 60, loss: 0.27755704522132874
step: 70, loss: 0.14587299525737762
step: 80, loss: 0.1999373435974121
step: 90, loss: 0.19328512251377106
step: 100, loss: 0.23573406040668488
step: 110, loss: 0.2153569906949997
step: 120, loss: 0.2128649353981018
step: 130, loss: 0.1998467743396759
step: 140, loss: 0.16430523991584778
step: 150, loss: 0.11073765158653259
step: 160, loss: 0.24567408859729767
step: 170, loss: 0.23478931188583374
step: 180, loss: 0.07636728137731552
step: 190, loss: 0.20560121536254883
step: 200, loss: 0.15440362691879272
step: 210, loss: 0.09839971363544464
step: 220, loss: 0.15984287858009338
step: 230, loss: 0.1384783685207367
step: 240, loss: 0.11087741702795029
step: 250, loss: 0.1655322015285492
step: 260, loss: 0.11686563491821289
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9690277576446533
step: 10, loss: 1.9730043411254883
step: 20, loss: 0.6982095837593079
step: 30, loss: 0.36827242374420166
step: 40, loss: 0.24303676187992096
step: 50, loss: 0.22193560004234314
step: 60, loss: 0.21804594993591309
step: 70, loss: 0.15588627755641937
step: 80, loss: 0.1567804217338562
step: 90, loss: 0.19538702070713043
step: 100, loss: 0.211819589138031
step: 110, loss: 0.20212848484516144
step: 120, loss: 0.30111923813819885
step: 130, loss: 0.15470995008945465
step: 140, loss: 0.10736053436994553
step: 150, loss: 0.08039255440235138
step: 160, loss: 0.20649497210979462
step: 170, loss: 0.16342946887016296
step: 180, loss: 0.26707860827445984
step: 190, loss: 0.13523732125759125
step: 200, loss: 0.14004255831241608
step: 210, loss: 0.15971338748931885
step: 220, loss: 0.09040651470422745
step: 230, loss: 0.19888286292552948
step: 240, loss: 0.09856810420751572
step: 250, loss: 0.19965335726737976
step: 260, loss: 0.12924858927726

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.0825605392456055
step: 10, loss: 1.879938006401062
step: 20, loss: 0.8449520468711853
step: 30, loss: 0.5840408205986023
step: 40, loss: 0.27008071541786194
step: 50, loss: 0.20319201052188873
step: 60, loss: 0.17950443923473358
step: 70, loss: 0.230423241853714
step: 80, loss: 0.11885154992341995
step: 90, loss: 0.20110107958316803
step: 100, loss: 0.1553763747215271
step: 110, loss: 0.16068489849567413
step: 120, loss: 0.11905574798583984
step: 130, loss: 0.21470701694488525
step: 140, loss: 0.09895019978284836
step: 150, loss: 0.0785558819770813
step: 160, loss: 0.08523839712142944
step: 170, loss: 0.13092073798179626
step: 180, loss: 0.23065443336963654
step: 190, loss: 0.13586001098155975
step: 200, loss: 0.2619093656539917
step: 210, loss: 0.13073085248470306
step: 220, loss: 0.10901336371898651
step: 230, loss: 0.15178176760673523
step: 240, loss: 0.05098554119467735
step: 250, loss: 0.1300175040960312
step: 260, loss: 0.057034820318222046


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.896695137023926
step: 10, loss: 1.9028788805007935
step: 20, loss: 0.6720321178436279
step: 30, loss: 0.3700006306171417
step: 40, loss: 0.2721523344516754
step: 50, loss: 0.236761674284935
step: 60, loss: 0.16902309656143188
step: 70, loss: 0.27194103598594666
step: 80, loss: 0.08656768500804901
step: 90, loss: 0.15788814425468445
step: 100, loss: 0.15663063526153564
step: 110, loss: 0.1809048056602478
step: 120, loss: 0.08330696821212769
step: 130, loss: 0.10459805279970169
step: 140, loss: 0.12266870588064194
step: 150, loss: 0.16048556566238403
step: 160, loss: 0.21818183362483978
step: 170, loss: 0.17591115832328796
step: 180, loss: 0.12851162254810333
step: 190, loss: 0.042650677263736725
step: 200, loss: 0.13496266305446625
step: 210, loss: 0.05805300548672676
step: 220, loss: 0.19912536442279816
step: 230, loss: 0.05298972502350807
step: 240, loss: 0.07252632081508636
step: 250, loss: 0.13393694162368774
step: 260, loss: 0.1690442562103271

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9275403022766113
step: 10, loss: 1.9157668352127075
step: 20, loss: 0.7409091591835022
step: 30, loss: 0.4379444420337677
step: 40, loss: 0.3263144791126251
step: 50, loss: 0.07470570504665375
step: 60, loss: 0.2887437045574188
step: 70, loss: 0.14281487464904785
step: 80, loss: 0.2260415107011795
step: 90, loss: 0.13477903604507446
step: 100, loss: 0.10014750063419342
step: 110, loss: 0.13216157257556915
step: 120, loss: 0.13885223865509033
step: 130, loss: 0.1790265142917633
step: 140, loss: 0.10138984769582748
step: 150, loss: 0.09131795912981033
step: 160, loss: 0.14452825486660004
step: 170, loss: 0.1532616913318634
step: 180, loss: 0.11955077201128006
step: 190, loss: 0.17294184863567352
step: 200, loss: 0.1437111645936966
step: 210, loss: 0.07649702578783035
step: 220, loss: 0.09204573929309845
step: 230, loss: 0.09060344845056534
step: 240, loss: 0.14645566046237946
step: 250, loss: 0.09022996574640274
step: 260, loss: 0.09629148244857788


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.0573201179504395
step: 10, loss: 1.9797073602676392
step: 20, loss: 0.6292812824249268
step: 30, loss: 0.523412823677063
step: 40, loss: 0.2269495725631714
step: 50, loss: 0.21297359466552734
step: 60, loss: 0.12505146861076355
step: 70, loss: 0.09353400766849518
step: 80, loss: 0.15048782527446747
step: 90, loss: 0.1534477025270462
step: 100, loss: 0.09721474349498749
step: 110, loss: 0.16122518479824066
step: 120, loss: 0.19338031113147736
step: 130, loss: 0.1490844041109085
step: 140, loss: 0.15460069477558136
step: 150, loss: 0.16041000187397003
step: 160, loss: 0.15262767672538757
step: 170, loss: 0.17866796255111694
step: 180, loss: 0.16105547547340393
step: 190, loss: 0.13469646871089935
step: 200, loss: 0.21410807967185974
step: 210, loss: 0.08034147322177887
step: 220, loss: 0.08281338214874268
step: 230, loss: 0.31083741784095764
step: 240, loss: 0.14387376606464386
step: 250, loss: 0.18849314749240875
step: 260, loss: 0.2488463819026947

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.991445302963257
step: 10, loss: 2.0112874507904053
step: 20, loss: 0.8278156518936157
step: 30, loss: 0.5379595756530762
step: 40, loss: 0.49792927503585815
step: 50, loss: 0.24866564571857452
step: 60, loss: 0.2313840240240097
step: 70, loss: 0.2047153115272522
step: 80, loss: 0.15343692898750305
step: 90, loss: 0.13402478396892548
step: 100, loss: 0.17040768265724182
step: 110, loss: 0.11502620577812195
step: 120, loss: 0.10390444844961166
step: 130, loss: 0.11828173696994781
step: 140, loss: 0.08847404271364212
step: 150, loss: 0.13598564267158508
step: 160, loss: 0.09748796373605728
step: 170, loss: 0.08929076045751572
step: 180, loss: 0.1378687024116516
step: 190, loss: 0.09372030198574066
step: 200, loss: 0.1299930065870285
step: 210, loss: 0.08296355605125427
step: 220, loss: 0.11877577751874924
step: 230, loss: 0.16172464191913605
step: 240, loss: 0.08114086091518402
step: 250, loss: 0.17174866795539856
step: 260, loss: 0.1185283213853836


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9403436183929443
step: 10, loss: 2.0531342029571533
step: 20, loss: 0.7216070890426636
step: 30, loss: 0.3804892599582672
step: 40, loss: 0.3335835635662079
step: 50, loss: 0.21826422214508057
step: 60, loss: 0.2583637237548828
step: 70, loss: 0.12352269887924194
step: 80, loss: 0.09352466464042664
step: 90, loss: 0.35929957032203674
step: 100, loss: 0.1931179016828537
step: 110, loss: 0.07361634075641632
step: 120, loss: 0.18202240765094757
step: 130, loss: 0.14493204653263092
step: 140, loss: 0.09601446241140366
step: 150, loss: 0.14333181083202362
step: 160, loss: 0.1480947732925415
step: 170, loss: 0.1672770082950592
step: 180, loss: 0.1937815099954605
step: 190, loss: 0.13820065557956696
step: 200, loss: 0.09963555634021759
step: 210, loss: 0.10692352801561356
step: 220, loss: 0.06334523856639862
step: 230, loss: 0.2422124147415161
step: 240, loss: 0.09418055415153503
step: 250, loss: 0.15311411023139954
step: 260, loss: 0.079678975045681
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9069712162017822
step: 10, loss: 1.8787645101547241
step: 20, loss: 0.6748934984207153
step: 30, loss: 0.3437769412994385
step: 40, loss: 0.3890940546989441
step: 50, loss: 0.27232038974761963
step: 60, loss: 0.4394935369491577
step: 70, loss: 0.23620577156543732
step: 80, loss: 0.2323307991027832
step: 90, loss: 0.10862003266811371
step: 100, loss: 0.09980751574039459
step: 110, loss: 0.21189072728157043
step: 120, loss: 0.1781594604253769
step: 130, loss: 0.2842238247394562
step: 140, loss: 0.10637518018484116
step: 150, loss: 0.16100066900253296
step: 160, loss: 0.1406358927488327
step: 170, loss: 0.13335967063903809
step: 180, loss: 0.12216637283563614
step: 190, loss: 0.09625202417373657
step: 200, loss: 0.13009871542453766
step: 210, loss: 0.08908061683177948
step: 220, loss: 0.12696292996406555
step: 230, loss: 0.22058886289596558
step: 240, loss: 0.06854107230901718
step: 250, loss: 0.11107953637838364
step: 260, loss: 0.11895472556352615


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8613836765289307
step: 10, loss: 1.8091992139816284
step: 20, loss: 0.6875391006469727
step: 30, loss: 0.48938101530075073
step: 40, loss: 0.3098689615726471
step: 50, loss: 0.2807363271713257
step: 60, loss: 0.16606588661670685
step: 70, loss: 0.16780385375022888
step: 80, loss: 0.2265755534172058
step: 90, loss: 0.17395718395709991
step: 100, loss: 0.17145000398159027
step: 110, loss: 0.2528264820575714
step: 120, loss: 0.13788264989852905
step: 130, loss: 0.14141373336315155
step: 140, loss: 0.18186712265014648
step: 150, loss: 0.09173239022493362
step: 160, loss: 0.10886622220277786
step: 170, loss: 0.12275420129299164
step: 180, loss: 0.09554234147071838
step: 190, loss: 0.10596947371959686
step: 200, loss: 0.2373856157064438
step: 210, loss: 0.07222053408622742
step: 220, loss: 0.18356241285800934
step: 230, loss: 0.18776607513427734
step: 240, loss: 0.24624884128570557
step: 250, loss: 0.08674322813749313
step: 260, loss: 0.1396520584821701

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8841285705566406
step: 10, loss: 1.6446744203567505
step: 20, loss: 0.638746440410614
step: 30, loss: 0.4540846347808838
step: 40, loss: 0.19706076383590698
step: 50, loss: 0.20209261775016785
step: 60, loss: 0.3552052080631256
step: 70, loss: 0.18566270172595978
step: 80, loss: 0.24620017409324646
step: 90, loss: 0.18699319660663605
step: 100, loss: 0.09805489331483841
step: 110, loss: 0.1670546531677246
step: 120, loss: 0.16490438580513
step: 130, loss: 0.13000652194023132
step: 140, loss: 0.11909761279821396
step: 150, loss: 0.4146212935447693
step: 160, loss: 0.18943218886852264
step: 170, loss: 0.11611038446426392
step: 180, loss: 0.1841069757938385
step: 190, loss: 0.08119755983352661
step: 200, loss: 0.08386681973934174
step: 210, loss: 0.16603967547416687
step: 220, loss: 0.14355915784835815
step: 230, loss: 0.15428023040294647
step: 240, loss: 0.12113872170448303
step: 250, loss: 0.12563304603099823
step: 260, loss: 0.1800914853811264
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9269959926605225
step: 10, loss: 1.9160631895065308
step: 20, loss: 0.6904247999191284
step: 30, loss: 0.4166235625743866
step: 40, loss: 0.3457145094871521
step: 50, loss: 0.21977710723876953
step: 60, loss: 0.2856850326061249
step: 70, loss: 0.40003734827041626
step: 80, loss: 0.31122416257858276
step: 90, loss: 0.11160939186811447
step: 100, loss: 0.13138988614082336
step: 110, loss: 0.11083505302667618
step: 120, loss: 0.11122409999370575
step: 130, loss: 0.10638990998268127
step: 140, loss: 0.1433199644088745
step: 150, loss: 0.1286270022392273
step: 160, loss: 0.16854669153690338
step: 170, loss: 0.12092898786067963
step: 180, loss: 0.1022808775305748
step: 190, loss: 0.193094402551651
step: 200, loss: 0.09089161455631256
step: 210, loss: 0.11051743477582932
step: 220, loss: 0.1206386536359787
step: 230, loss: 0.1658034771680832
step: 240, loss: 0.11235080659389496
step: 250, loss: 0.12678919732570648
step: 260, loss: 0.04766169190406799
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.944784164428711
step: 10, loss: 1.6161683797836304
step: 20, loss: 0.8584020137786865
step: 30, loss: 0.39894142746925354
step: 40, loss: 0.39631372690200806
step: 50, loss: 0.3257835805416107
step: 60, loss: 0.2395678162574768
step: 70, loss: 0.15072502195835114
step: 80, loss: 0.21674053370952606
step: 90, loss: 0.10730284452438354
step: 100, loss: 0.16021861135959625
step: 110, loss: 0.3832399547100067
step: 120, loss: 0.20167988538742065
step: 130, loss: 0.32324615120887756
step: 140, loss: 0.13781270384788513
step: 150, loss: 0.18074500560760498
step: 160, loss: 0.20000584423542023
step: 170, loss: 0.12392185628414154
step: 180, loss: 0.2626858055591583
step: 190, loss: 0.13482442498207092
step: 200, loss: 0.1908084601163864
step: 210, loss: 0.09369484335184097
step: 220, loss: 0.17346583306789398
step: 230, loss: 0.11788983643054962
step: 240, loss: 0.041077326983213425
step: 250, loss: 0.2081400752067566
step: 260, loss: 0.08691635727882385

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9956462383270264
step: 10, loss: 1.7541296482086182
step: 20, loss: 0.9237406849861145
step: 30, loss: 0.5112792253494263
step: 40, loss: 0.3055051863193512
step: 50, loss: 0.24705947935581207
step: 60, loss: 0.2244643121957779
step: 70, loss: 0.37748947739601135
step: 80, loss: 0.1376863270998001
step: 90, loss: 0.4831281006336212
step: 100, loss: 0.15899161994457245
step: 110, loss: 0.1795313060283661
step: 120, loss: 0.22283141314983368
step: 130, loss: 0.1845567524433136
step: 140, loss: 0.13910026848316193
step: 150, loss: 0.15875005722045898
step: 160, loss: 0.18439945578575134
step: 170, loss: 0.23945793509483337
step: 180, loss: 0.1437680572271347
step: 190, loss: 0.2532637119293213
step: 200, loss: 0.12273586541414261
step: 210, loss: 0.12561224400997162
step: 220, loss: 0.04909883439540863
step: 230, loss: 0.16837531328201294
step: 240, loss: 0.099160335958004
step: 250, loss: 0.0563281886279583
step: 260, loss: 0.18867377936840057
step:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.022330284118652
step: 10, loss: 1.700323224067688
step: 20, loss: 0.5659964084625244
step: 30, loss: 0.6056168079376221
step: 40, loss: 0.27035611867904663
step: 50, loss: 0.23254869878292084
step: 60, loss: 0.39092814922332764
step: 70, loss: 0.2337976098060608
step: 80, loss: 0.11550624668598175
step: 90, loss: 0.1174728199839592
step: 100, loss: 0.19289706647396088
step: 110, loss: 0.17073389887809753
step: 120, loss: 0.13881461322307587
step: 130, loss: 0.12919196486473083
step: 140, loss: 0.23297466337680817
step: 150, loss: 0.15386012196540833
step: 160, loss: 0.3050698935985565
step: 170, loss: 0.07579963654279709
step: 180, loss: 0.1581028699874878
step: 190, loss: 0.07795506715774536
step: 200, loss: 0.06225283816456795
step: 210, loss: 0.13297785818576813
step: 220, loss: 0.22495107352733612
step: 230, loss: 0.15030020475387573
step: 240, loss: 0.2153753936290741
step: 250, loss: 0.16764818131923676
step: 260, loss: 0.06617183238267899
s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9132847785949707
step: 10, loss: 1.807345986366272
step: 20, loss: 0.7086188793182373
step: 30, loss: 0.42111682891845703
step: 40, loss: 0.23415933549404144
step: 50, loss: 0.20486943423748016
step: 60, loss: 0.2645816504955292
step: 70, loss: 0.1461145430803299
step: 80, loss: 0.13646213710308075
step: 90, loss: 0.12115296721458435
step: 100, loss: 0.28781768679618835
step: 110, loss: 0.18952211737632751
step: 120, loss: 0.19143354892730713
step: 130, loss: 0.16268666088581085
step: 140, loss: 0.16852818429470062
step: 150, loss: 0.1475127935409546
step: 160, loss: 0.16313420236110687
step: 170, loss: 0.12855976819992065
step: 180, loss: 0.24178668856620789
step: 190, loss: 0.19800151884555817
step: 200, loss: 0.09162814915180206
step: 210, loss: 0.02297055721282959
step: 220, loss: 0.08016464114189148
step: 230, loss: 0.09890475869178772
step: 240, loss: 0.09005188941955566
step: 250, loss: 0.13580770790576935
step: 260, loss: 0.035984229296445

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9896697998046875
step: 10, loss: 1.8846784830093384
step: 20, loss: 0.8624448776245117
step: 30, loss: 0.5587204098701477
step: 40, loss: 0.389956533908844
step: 50, loss: 0.2493620067834854
step: 60, loss: 0.18079142272472382
step: 70, loss: 0.1528192162513733
step: 80, loss: 0.12847131490707397
step: 90, loss: 0.12568405270576477
step: 100, loss: 0.15704526007175446
step: 110, loss: 0.10345528274774551
step: 120, loss: 0.2872602343559265
step: 130, loss: 0.16904321312904358
step: 140, loss: 0.0967603325843811
step: 150, loss: 0.2188660204410553
step: 160, loss: 0.20297937095165253
step: 170, loss: 0.21141228079795837
step: 180, loss: 0.20063988864421844
step: 190, loss: 0.18996533751487732
step: 200, loss: 0.13780808448791504
step: 210, loss: 0.09000132232904434
step: 220, loss: 0.08703868836164474
step: 230, loss: 0.12351173907518387
step: 240, loss: 0.14363998174667358
step: 250, loss: 0.0882001519203186
step: 260, loss: 0.04256143048405647
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9324679374694824
step: 10, loss: 1.994004249572754
step: 20, loss: 0.9637191295623779
step: 30, loss: 0.3446963131427765
step: 40, loss: 0.2951321601867676
step: 50, loss: 0.3754969835281372
step: 60, loss: 0.2238864153623581
step: 70, loss: 0.34374305605888367
step: 80, loss: 0.2303379476070404
step: 90, loss: 0.24483147263526917
step: 100, loss: 0.18632151186466217
step: 110, loss: 0.19645732641220093
step: 120, loss: 0.24959418177604675
step: 130, loss: 0.20999877154827118
step: 140, loss: 0.15741701424121857
step: 150, loss: 0.11659684777259827
step: 160, loss: 0.07076996564865112
step: 170, loss: 0.10800254344940186
step: 180, loss: 0.17792987823486328
step: 190, loss: 0.12680844962596893
step: 200, loss: 0.09899895638227463
step: 210, loss: 0.06645120680332184
step: 220, loss: 0.10914675146341324
step: 230, loss: 0.09380616247653961
step: 240, loss: 0.07786660641431808
step: 250, loss: 0.11673562973737717
step: 260, loss: 0.16421835124492645

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8107614517211914
step: 10, loss: 1.8875535726547241
step: 20, loss: 0.6978365778923035
step: 30, loss: 0.3610649108886719
step: 40, loss: 0.45142269134521484
step: 50, loss: 0.34203168749809265
step: 60, loss: 0.22080263495445251
step: 70, loss: 0.2925768792629242
step: 80, loss: 0.15585942566394806
step: 90, loss: 0.23336441814899445
step: 100, loss: 0.13129231333732605
step: 110, loss: 0.18979166448116302
step: 120, loss: 0.13795176148414612
step: 130, loss: 0.19230501353740692
step: 140, loss: 0.1669684499502182
step: 150, loss: 0.15912456810474396
step: 160, loss: 0.27239561080932617
step: 170, loss: 0.12979331612586975
step: 180, loss: 0.17691054940223694
step: 190, loss: 0.12235379964113235
step: 200, loss: 0.26672983169555664
step: 210, loss: 0.11715410649776459
step: 220, loss: 0.06588133424520493
step: 230, loss: 0.28958114981651306
step: 240, loss: 0.1442154496908188
step: 250, loss: 0.2652602791786194
step: 260, loss: 0.1525669991970062

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9582996368408203
step: 10, loss: 1.8249481916427612
step: 20, loss: 0.6700311899185181
step: 30, loss: 0.6046101450920105
step: 40, loss: 0.2301972657442093
step: 50, loss: 0.18572214245796204
step: 60, loss: 0.10507126152515411
step: 70, loss: 0.17649157345294952
step: 80, loss: 0.19417905807495117
step: 90, loss: 0.15601064264774323
step: 100, loss: 0.08921883255243301
step: 110, loss: 0.044629473239183426
step: 120, loss: 0.09504897892475128
step: 130, loss: 0.31219345331192017
step: 140, loss: 0.09930957108736038
step: 150, loss: 0.27321740984916687
step: 160, loss: 0.1466110199689865
step: 170, loss: 0.12360028177499771
step: 180, loss: 0.11975203454494476
step: 190, loss: 0.16042503714561462
step: 200, loss: 0.07567974179983139
step: 210, loss: 0.12180677801370621
step: 220, loss: 0.10523597151041031
step: 230, loss: 0.07382869720458984
step: 240, loss: 0.27817124128341675
step: 250, loss: 0.1614522933959961
step: 260, loss: 0.05022516846656

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.967308282852173
step: 10, loss: 1.9493290185928345
step: 20, loss: 0.7577653527259827
step: 30, loss: 0.5611835718154907
step: 40, loss: 0.3393835723400116
step: 50, loss: 0.3219708204269409
step: 60, loss: 0.1268589198589325
step: 70, loss: 0.17318446934223175
step: 80, loss: 0.2311050444841385
step: 90, loss: 0.21086324751377106
step: 100, loss: 0.17789986729621887
step: 110, loss: 0.19529615342617035
step: 120, loss: 0.2778151035308838
step: 130, loss: 0.1102503165602684
step: 140, loss: 0.19874577224254608
step: 150, loss: 0.1664084494113922
step: 160, loss: 0.1721031367778778
step: 170, loss: 0.06932882964611053
step: 180, loss: 0.1116962730884552
step: 190, loss: 0.18274717032909393
step: 200, loss: 0.10897097736597061
step: 210, loss: 0.1421630084514618
step: 220, loss: 0.0823134332895279
step: 230, loss: 0.08477891981601715
step: 240, loss: 0.07477150857448578
step: 250, loss: 0.06118211895227432
step: 260, loss: 0.18078981339931488
step: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.916905641555786
step: 10, loss: 1.9728736877441406
step: 20, loss: 0.7337070107460022
step: 30, loss: 0.5660843253135681
step: 40, loss: 0.24938593804836273
step: 50, loss: 0.18021315336227417
step: 60, loss: 0.2640312910079956
step: 70, loss: 0.2390705943107605
step: 80, loss: 0.16996939480304718
step: 90, loss: 0.12459852546453476
step: 100, loss: 0.11759120970964432
step: 110, loss: 0.16793830692768097
step: 120, loss: 0.18493913114070892
step: 130, loss: 0.2119285762310028
step: 140, loss: 0.07813221216201782
step: 150, loss: 0.09309204667806625
step: 160, loss: 0.14472314715385437
step: 170, loss: 0.1431066244840622
step: 180, loss: 0.16751140356063843
step: 190, loss: 0.12334993481636047
step: 200, loss: 0.14797501266002655
step: 210, loss: 0.11178842186927795
step: 220, loss: 0.15251192450523376
step: 230, loss: 0.04640892148017883
step: 240, loss: 0.11469638347625732
step: 250, loss: 0.0943935215473175
step: 260, loss: 0.10998808592557907


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.950421094894409
step: 10, loss: 1.9944342374801636
step: 20, loss: 0.7868780493736267
step: 30, loss: 0.4284162223339081
step: 40, loss: 0.35986849665641785
step: 50, loss: 0.34620729088783264
step: 60, loss: 0.1546086072921753
step: 70, loss: 0.3955244719982147
step: 80, loss: 0.2307206094264984
step: 90, loss: 0.1331174075603485
step: 100, loss: 0.10227174311876297
step: 110, loss: 0.17241176962852478
step: 120, loss: 0.17073655128479004
step: 130, loss: 0.2542482018470764
step: 140, loss: 0.14258812367916107
step: 150, loss: 0.13812586665153503
step: 160, loss: 0.07987601310014725
step: 170, loss: 0.09497424215078354
step: 180, loss: 0.183591827750206
step: 190, loss: 0.051736053079366684
step: 200, loss: 0.21867252886295319
step: 210, loss: 0.1471271961927414
step: 220, loss: 0.22160254418849945
step: 230, loss: 0.2823318839073181
step: 240, loss: 0.04842458292841911
step: 250, loss: 0.06621947884559631
step: 260, loss: 0.07880479842424393
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.865830421447754
step: 10, loss: 1.869805932044983
step: 20, loss: 0.6822465658187866
step: 30, loss: 0.4888271987438202
step: 40, loss: 0.23511338233947754
step: 50, loss: 0.2991226017475128
step: 60, loss: 0.21645094454288483
step: 70, loss: 0.2098425328731537
step: 80, loss: 0.12764345109462738
step: 90, loss: 0.16210715472698212
step: 100, loss: 0.15273278951644897
step: 110, loss: 0.19344276189804077
step: 120, loss: 0.1773684322834015
step: 130, loss: 0.1261894553899765
step: 140, loss: 0.3228941857814789
step: 150, loss: 0.17845939099788666
step: 160, loss: 0.08669593930244446
step: 170, loss: 0.07739907503128052
step: 180, loss: 0.17026059329509735
step: 190, loss: 0.1441430300474167
step: 200, loss: 0.15276585519313812
step: 210, loss: 0.1181754395365715
step: 220, loss: 0.06466468423604965
step: 230, loss: 0.10600832104682922
step: 240, loss: 0.14771506190299988
step: 250, loss: 0.1073315367102623
step: 260, loss: 0.1391608715057373
step:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9922983646392822
step: 10, loss: 1.6437197923660278
step: 20, loss: 0.8478151559829712
step: 30, loss: 0.5150139331817627
step: 40, loss: 0.22727078199386597
step: 50, loss: 0.18755610287189484
step: 60, loss: 0.20724275708198547
step: 70, loss: 0.13899581134319305
step: 80, loss: 0.22636911273002625
step: 90, loss: 0.14528191089630127
step: 100, loss: 0.23487268388271332
step: 110, loss: 0.1257224678993225
step: 120, loss: 0.18211515247821808
step: 130, loss: 0.22295139729976654
step: 140, loss: 0.10150113701820374
step: 150, loss: 0.2754174470901489
step: 160, loss: 0.13848818838596344
step: 170, loss: 0.13168363273143768
step: 180, loss: 0.06020205467939377
step: 190, loss: 0.17599567770957947
step: 200, loss: 0.16285201907157898
step: 210, loss: 0.1301746815443039
step: 220, loss: 0.1297040730714798
step: 230, loss: 0.06382784247398376
step: 240, loss: 0.11433518677949905
step: 250, loss: 0.1872488111257553
step: 260, loss: 0.16935577988624573

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9833672046661377
step: 10, loss: 2.0012624263763428
step: 20, loss: 0.9333800673484802
step: 30, loss: 0.46426212787628174
step: 40, loss: 0.2991935908794403
step: 50, loss: 0.28763866424560547
step: 60, loss: 0.24691373109817505
step: 70, loss: 0.24274155497550964
step: 80, loss: 0.24511593580245972
step: 90, loss: 0.28137826919555664
step: 100, loss: 0.23452970385551453
step: 110, loss: 0.1657952070236206
step: 120, loss: 0.047282252460718155
step: 130, loss: 0.12113465368747711
step: 140, loss: 0.1300913393497467
step: 150, loss: 0.14493998885154724
step: 160, loss: 0.19895969331264496
step: 170, loss: 0.1449916511774063
step: 180, loss: 0.1812533587217331
step: 190, loss: 0.0907856673002243
step: 200, loss: 0.1667603999376297
step: 210, loss: 0.22557903826236725
step: 220, loss: 0.07394960522651672
step: 230, loss: 0.17610138654708862
step: 240, loss: 0.07996619492769241
step: 250, loss: 0.26975491642951965
step: 260, loss: 0.1561480313539505


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.935161590576172
step: 10, loss: 1.911209225654602
step: 20, loss: 0.9037948250770569
step: 30, loss: 0.3635736107826233
step: 40, loss: 0.2953256368637085
step: 50, loss: 0.32126733660697937
step: 60, loss: 0.22760558128356934
step: 70, loss: 0.1371592879295349
step: 80, loss: 0.12471862882375717
step: 90, loss: 0.12261476367712021
step: 100, loss: 0.11061687767505646
step: 110, loss: 0.16220317780971527
step: 120, loss: 0.11777196079492569
step: 130, loss: 0.1073988527059555
step: 140, loss: 0.11261854320764542
step: 150, loss: 0.1403658241033554
step: 160, loss: 0.11712750792503357
step: 170, loss: 0.2865191102027893
step: 180, loss: 0.2186758816242218
step: 190, loss: 0.13984200358390808
step: 200, loss: 0.08780822157859802
step: 210, loss: 0.08354240655899048
step: 220, loss: 0.11134849488735199
step: 230, loss: 0.11070156842470169
step: 240, loss: 0.1174364760518074
step: 250, loss: 0.12061665952205658
step: 260, loss: 0.0942346528172493
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.933379888534546
step: 10, loss: 2.079237699508667
step: 20, loss: 0.7082272171974182
step: 30, loss: 0.3658095598220825
step: 40, loss: 0.39438706636428833
step: 50, loss: 0.2855231463909149
step: 60, loss: 0.273346871137619
step: 70, loss: 0.257242351770401
step: 80, loss: 0.20762771368026733
step: 90, loss: 0.12094555795192719
step: 100, loss: 0.25085729360580444
step: 110, loss: 0.16700465977191925
step: 120, loss: 0.13564090430736542
step: 130, loss: 0.18133509159088135
step: 140, loss: 0.14227987825870514
step: 150, loss: 0.1679641306400299
step: 160, loss: 0.08405518531799316
step: 170, loss: 0.17879851162433624
step: 180, loss: 0.16967813670635223
step: 190, loss: 0.21257784962654114
step: 200, loss: 0.10305246710777283
step: 210, loss: 0.17060287296772003
step: 220, loss: 0.0778423622250557
step: 230, loss: 0.1276039034128189
step: 240, loss: 0.07280922681093216
step: 250, loss: 0.135254368185997
step: 260, loss: 0.23587730526924133
step: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.972285032272339
step: 10, loss: 1.881079912185669
step: 20, loss: 0.8686563968658447
step: 30, loss: 0.4632740914821625
step: 40, loss: 0.3136829435825348
step: 50, loss: 0.21951790153980255
step: 60, loss: 0.2424425333738327
step: 70, loss: 0.22672469913959503
step: 80, loss: 0.22567012906074524
step: 90, loss: 0.1412728875875473
step: 100, loss: 0.17538921535015106
step: 110, loss: 0.11442257463932037
step: 120, loss: 0.1897497922182083
step: 130, loss: 0.2295593023300171
step: 140, loss: 0.18233737349510193
step: 150, loss: 0.11299975216388702
step: 160, loss: 0.10710879415273666
step: 170, loss: 0.09027916193008423
step: 180, loss: 0.10736306756734848
step: 190, loss: 0.09539572894573212
step: 200, loss: 0.20191554725170135
step: 210, loss: 0.12055662274360657
step: 220, loss: 0.2992651164531708
step: 230, loss: 0.0982879027724266
step: 240, loss: 0.20909830927848816
step: 250, loss: 0.17327097058296204
step: 260, loss: 0.08199142664670944
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9889211654663086
step: 10, loss: 1.8902932405471802
step: 20, loss: 0.8721663355827332
step: 30, loss: 0.4634251296520233
step: 40, loss: 0.2630622982978821
step: 50, loss: 0.3083701431751251
step: 60, loss: 0.19311611354351044
step: 70, loss: 0.20632493495941162
step: 80, loss: 0.21555358171463013
step: 90, loss: 0.13461995124816895
step: 100, loss: 0.19053933024406433
step: 110, loss: 0.17524878680706024
step: 120, loss: 0.17663361132144928
step: 130, loss: 0.21215607225894928
step: 140, loss: 0.14651650190353394
step: 150, loss: 0.13787151873111725
step: 160, loss: 0.07736410945653915
step: 170, loss: 0.16541312634944916
step: 180, loss: 0.17952395975589752
step: 190, loss: 0.12273160368204117
step: 200, loss: 0.09034425765275955
step: 210, loss: 0.10374200344085693
step: 220, loss: 0.14182940125465393
step: 230, loss: 0.1353885978460312
step: 240, loss: 0.10320206731557846
step: 250, loss: 0.0990682989358902
step: 260, loss: 0.1933981627225875

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9744536876678467
step: 10, loss: 1.8813841342926025
step: 20, loss: 0.8601671457290649
step: 30, loss: 0.25061285495758057
step: 40, loss: 0.3138974606990814
step: 50, loss: 0.24492688477039337
step: 60, loss: 0.2361849844455719
step: 70, loss: 0.14505547285079956
step: 80, loss: 0.14649318158626556
step: 90, loss: 0.14387916028499603
step: 100, loss: 0.26877284049987793
step: 110, loss: 0.1308128535747528
step: 120, loss: 0.1805248111486435
step: 130, loss: 0.19299860298633575
step: 140, loss: 0.1818201243877411
step: 150, loss: 0.27566584944725037
step: 160, loss: 0.0750981941819191
step: 170, loss: 0.21244047582149506
step: 180, loss: 0.4024541974067688
step: 190, loss: 0.07883187383413315
step: 200, loss: 0.05764206871390343
step: 210, loss: 0.11974114924669266
step: 220, loss: 0.22525708377361298
step: 230, loss: 0.18757307529449463
step: 240, loss: 0.1549552083015442
step: 250, loss: 0.10043161362409592
step: 260, loss: 0.14222991466522217
s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.921952486038208
step: 10, loss: 1.9553453922271729
step: 20, loss: 0.8228297829627991
step: 30, loss: 0.546245276927948
step: 40, loss: 0.4095013439655304
step: 50, loss: 0.34957775473594666
step: 60, loss: 0.17030252516269684
step: 70, loss: 0.3192024230957031
step: 80, loss: 0.257066935300827
step: 90, loss: 0.21964257955551147
step: 100, loss: 0.08713547140359879
step: 110, loss: 0.133710578083992
step: 120, loss: 0.07465558499097824
step: 130, loss: 0.20182396471500397
step: 140, loss: 0.0979204997420311
step: 150, loss: 0.11746446043252945
step: 160, loss: 0.14733202755451202
step: 170, loss: 0.13915866613388062
step: 180, loss: 0.15032973885536194
step: 190, loss: 0.14940519630908966
step: 200, loss: 0.08555871248245239
step: 210, loss: 0.11843817681074142
step: 220, loss: 0.23812337219715118
step: 230, loss: 0.0839167982339859
step: 240, loss: 0.07623381912708282
step: 250, loss: 0.10840439796447754
step: 260, loss: 0.2695624828338623
step:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.001358985900879
step: 10, loss: 2.176811933517456
step: 20, loss: 0.778374433517456
step: 30, loss: 0.39793869853019714
step: 40, loss: 0.2603228688240051
step: 50, loss: 0.20918981730937958
step: 60, loss: 0.25824499130249023
step: 70, loss: 0.2705514132976532
step: 80, loss: 0.17983576655387878
step: 90, loss: 0.12380404770374298
step: 100, loss: 0.17446008324623108
step: 110, loss: 0.2079235464334488
step: 120, loss: 0.217134490609169
step: 130, loss: 0.18322893977165222
step: 140, loss: 0.127005934715271
step: 150, loss: 0.09769882261753082
step: 160, loss: 0.09292875230312347
step: 170, loss: 0.16319064795970917
step: 180, loss: 0.14336436986923218
step: 190, loss: 0.09133639931678772
step: 200, loss: 0.1182781308889389
step: 210, loss: 0.06775201112031937
step: 220, loss: 0.06734900921583176
step: 230, loss: 0.08980239182710648
step: 240, loss: 0.17466220259666443
step: 250, loss: 0.10532821714878082
step: 260, loss: 0.0904889851808548
step:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8675501346588135
step: 10, loss: 2.0546021461486816
step: 20, loss: 0.7829028964042664
step: 30, loss: 0.47590675950050354
step: 40, loss: 0.3435775339603424
step: 50, loss: 0.21292619407176971
step: 60, loss: 0.25671157240867615
step: 70, loss: 0.13401192426681519
step: 80, loss: 0.14558568596839905
step: 90, loss: 0.1226481944322586
step: 100, loss: 0.3375145196914673
step: 110, loss: 0.21370398998260498
step: 120, loss: 0.18654392659664154
step: 130, loss: 0.18362629413604736
step: 140, loss: 0.049059074372053146
step: 150, loss: 0.08412835001945496
step: 160, loss: 0.13183647394180298
step: 170, loss: 0.07567910104990005
step: 180, loss: 0.16673295199871063
step: 190, loss: 0.17998698353767395
step: 200, loss: 0.04684961214661598
step: 210, loss: 0.1154683455824852
step: 220, loss: 0.11786939203739166
step: 230, loss: 0.14790967106819153
step: 240, loss: 0.09465213865041733
step: 250, loss: 0.11259350925683975
step: 260, loss: 0.01914226822555

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9306347370147705
step: 10, loss: 1.8630945682525635
step: 20, loss: 0.9501100778579712
step: 30, loss: 0.4462183117866516
step: 40, loss: 0.2706449627876282
step: 50, loss: 0.2950475215911865
step: 60, loss: 0.3984825015068054
step: 70, loss: 0.10151363164186478
step: 80, loss: 0.30601128935813904
step: 90, loss: 0.16543038189411163
step: 100, loss: 0.23344500362873077
step: 110, loss: 0.15280593931674957
step: 120, loss: 0.16076704859733582
step: 130, loss: 0.1035730391740799
step: 140, loss: 0.14258907735347748
step: 150, loss: 0.22788938879966736
step: 160, loss: 0.14310452342033386
step: 170, loss: 0.08486581593751907
step: 180, loss: 0.13020752370357513
step: 190, loss: 0.09415335208177567
step: 200, loss: 0.06965958327054977
step: 210, loss: 0.06766773760318756
step: 220, loss: 0.14209946990013123
step: 230, loss: 0.09482316672801971
step: 240, loss: 0.15127825736999512
step: 250, loss: 0.07908084243535995
step: 260, loss: 0.2468144148588180

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.864757776260376
step: 10, loss: 1.7239850759506226
step: 20, loss: 0.8288119435310364
step: 30, loss: 0.3564641773700714
step: 40, loss: 0.21333026885986328
step: 50, loss: 0.22471864521503448
step: 60, loss: 0.25397607684135437
step: 70, loss: 0.17759199440479279
step: 80, loss: 0.3097107708454132
step: 90, loss: 0.20673254132270813
step: 100, loss: 0.266372948884964
step: 110, loss: 0.14085261523723602
step: 120, loss: 0.3687836229801178
step: 130, loss: 0.14101256430149078
step: 140, loss: 0.09956920892000198
step: 150, loss: 0.14494085311889648
step: 160, loss: 0.1496981531381607
step: 170, loss: 0.1388646960258484
step: 180, loss: 0.06745640188455582
step: 190, loss: 0.14588893949985504
step: 200, loss: 0.15627913177013397
step: 210, loss: 0.09728321433067322
step: 220, loss: 0.05335378274321556
step: 230, loss: 0.09887602180242538
step: 240, loss: 0.21320024132728577
step: 250, loss: 0.11062014102935791
step: 260, loss: 0.10534782707691193
s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9358298778533936
step: 10, loss: 2.2466273307800293
step: 20, loss: 0.8041014671325684
step: 30, loss: 0.43872424960136414
step: 40, loss: 0.2484079897403717
step: 50, loss: 0.2658046782016754
step: 60, loss: 0.18988747894763947
step: 70, loss: 0.1539314240217209
step: 80, loss: 0.0902363732457161
step: 90, loss: 0.1545673906803131
step: 100, loss: 0.24288064241409302
step: 110, loss: 0.21379302442073822
step: 120, loss: 0.22533059120178223
step: 130, loss: 0.1229313537478447
step: 140, loss: 0.15117251873016357
step: 150, loss: 0.08084571361541748
step: 160, loss: 0.1884625256061554
step: 170, loss: 0.04344213753938675
step: 180, loss: 0.19340765476226807
step: 190, loss: 0.1742456704378128
step: 200, loss: 0.13018785417079926
step: 210, loss: 0.1554989516735077
step: 220, loss: 0.12866194546222687
step: 230, loss: 0.23414544761180878
step: 240, loss: 0.11854326725006104
step: 250, loss: 0.06546331942081451
step: 260, loss: 0.06903573870658875
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9555978775024414
step: 10, loss: 1.6326723098754883
step: 20, loss: 1.0784960985183716
step: 30, loss: 0.5778010487556458
step: 40, loss: 0.2593838572502136
step: 50, loss: 0.183100625872612
step: 60, loss: 0.13610757887363434
step: 70, loss: 0.12334715574979782
step: 80, loss: 0.10100452601909637
step: 90, loss: 0.18718692660331726
step: 100, loss: 0.12110275775194168
step: 110, loss: 0.17203551530838013
step: 120, loss: 0.18723468482494354
step: 130, loss: 0.13622857630252838
step: 140, loss: 0.23485156893730164
step: 150, loss: 0.17897473275661469
step: 160, loss: 0.12936393916606903
step: 170, loss: 0.16715066134929657
step: 180, loss: 0.09094057977199554
step: 190, loss: 0.1655702143907547
step: 200, loss: 0.09673520922660828
step: 210, loss: 0.10353197157382965
step: 220, loss: 0.12251419574022293
step: 230, loss: 0.030579598620533943
step: 240, loss: 0.10840312391519547
step: 250, loss: 0.08367463201284409
step: 260, loss: 0.079356439411640

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9681103229522705
step: 10, loss: 2.0285375118255615
step: 20, loss: 0.9123215675354004
step: 30, loss: 0.4078019857406616
step: 40, loss: 0.3636032044887543
step: 50, loss: 0.396932989358902
step: 60, loss: 0.2721887230873108
step: 70, loss: 0.15963265299797058
step: 80, loss: 0.1352388560771942
step: 90, loss: 0.09224873036146164
step: 100, loss: 0.0836498811841011
step: 110, loss: 0.13483165204524994
step: 120, loss: 0.07264231145381927
step: 130, loss: 0.14103305339813232
step: 140, loss: 0.07327605783939362
step: 150, loss: 0.06277741491794586
step: 160, loss: 0.08289482444524765
step: 170, loss: 0.15018902719020844
step: 180, loss: 0.2114550918340683
step: 190, loss: 0.11842069774866104
step: 200, loss: 0.1719295084476471
step: 210, loss: 0.21058127284049988
step: 220, loss: 0.12019719928503036
step: 230, loss: 0.14349696040153503
step: 240, loss: 0.14067187905311584
step: 250, loss: 0.11646910011768341
step: 260, loss: 0.10840488225221634
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9623427391052246
step: 10, loss: 1.8966691493988037
step: 20, loss: 0.7803052067756653
step: 30, loss: 0.2601768970489502
step: 40, loss: 0.3431254029273987
step: 50, loss: 0.1712368130683899
step: 60, loss: 0.27732759714126587
step: 70, loss: 0.25007152557373047
step: 80, loss: 0.16980856657028198
step: 90, loss: 0.17424534261226654
step: 100, loss: 0.21397767961025238
step: 110, loss: 0.13506202399730682
step: 120, loss: 0.13949212431907654
step: 130, loss: 0.10206527262926102
step: 140, loss: 0.16542178392410278
step: 150, loss: 0.14843830466270447
step: 160, loss: 0.09021269530057907
step: 170, loss: 0.16290336847305298
step: 180, loss: 0.23917558789253235
step: 190, loss: 0.21353398263454437
step: 200, loss: 0.09355674684047699
step: 210, loss: 0.1399371325969696
step: 220, loss: 0.15627558529376984
step: 230, loss: 0.10328761488199234
step: 240, loss: 0.1915159672498703
step: 250, loss: 0.12056891620159149
step: 260, loss: 0.1191147491335868

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.008667945861816
step: 10, loss: 2.123138189315796
step: 20, loss: 0.6585496664047241
step: 30, loss: 0.3760696351528168
step: 40, loss: 0.2852446734905243
step: 50, loss: 0.35284724831581116
step: 60, loss: 0.3372957110404968
step: 70, loss: 0.1853935420513153
step: 80, loss: 0.16300082206726074
step: 90, loss: 0.09872939437627792
step: 100, loss: 0.10356563329696655
step: 110, loss: 0.10304224491119385
step: 120, loss: 0.17673034965991974
step: 130, loss: 0.10655201971530914
step: 140, loss: 0.17311321198940277
step: 150, loss: 0.10110468417406082
step: 160, loss: 0.055706124752759933
step: 170, loss: 0.10649729520082474
step: 180, loss: 0.21981896460056305
step: 190, loss: 0.20212829113006592
step: 200, loss: 0.04057512432336807
step: 210, loss: 0.1143881231546402
step: 220, loss: 0.17347072064876556
step: 230, loss: 0.12349089980125427
step: 240, loss: 0.17459352314472198
step: 250, loss: 0.04905720427632332
step: 260, loss: 0.11257674545049667

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8868348598480225
step: 10, loss: 1.795323133468628
step: 20, loss: 0.6906288266181946
step: 30, loss: 0.5298102498054504
step: 40, loss: 0.377384215593338
step: 50, loss: 0.2771925926208496
step: 60, loss: 0.13009336590766907
step: 70, loss: 0.22708934545516968
step: 80, loss: 0.20638911426067352
step: 90, loss: 0.14473019540309906
step: 100, loss: 0.09677142649888992
step: 110, loss: 0.21193721890449524
step: 120, loss: 0.21287600696086884
step: 130, loss: 0.19535289704799652
step: 140, loss: 0.15549778938293457
step: 150, loss: 0.19682380557060242
step: 160, loss: 0.1784082055091858
step: 170, loss: 0.13034148514270782
step: 180, loss: 0.2464001625776291
step: 190, loss: 0.11796712130308151
step: 200, loss: 0.06220724806189537
step: 210, loss: 0.20384465157985687
step: 220, loss: 0.2516816258430481
step: 230, loss: 0.20446638762950897
step: 240, loss: 0.16015049815177917
step: 250, loss: 0.2163160741329193
step: 260, loss: 0.1748257279396057
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.959040880203247
step: 10, loss: 2.0310254096984863
step: 20, loss: 0.9113648533821106
step: 30, loss: 0.443359911441803
step: 40, loss: 0.501987636089325
step: 50, loss: 0.20850533246994019
step: 60, loss: 0.29904690384864807
step: 70, loss: 0.2047886997461319
step: 80, loss: 0.23451143503189087
step: 90, loss: 0.24451617896556854
step: 100, loss: 0.13573628664016724
step: 110, loss: 0.0750793069601059
step: 120, loss: 0.23212003707885742
step: 130, loss: 0.1023145392537117
step: 140, loss: 0.15250645577907562
step: 150, loss: 0.18496663868427277
step: 160, loss: 0.13079003989696503
step: 170, loss: 0.0729406401515007
step: 180, loss: 0.15721918642520905
step: 190, loss: 0.19918182492256165
step: 200, loss: 0.23211678862571716
step: 210, loss: 0.1096203401684761
step: 220, loss: 0.1401509791612625
step: 230, loss: 0.053563930094242096
step: 240, loss: 0.13158375024795532
step: 250, loss: 0.1378379613161087
step: 260, loss: 0.19387951493263245
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.960095167160034
step: 10, loss: 1.831671118736267
step: 20, loss: 0.7679698467254639
step: 30, loss: 0.4193817675113678
step: 40, loss: 0.18262892961502075
step: 50, loss: 0.3013148903846741
step: 60, loss: 0.21076831221580505
step: 70, loss: 0.1683776080608368
step: 80, loss: 0.1768563985824585
step: 90, loss: 0.13118433952331543
step: 100, loss: 0.13871490955352783
step: 110, loss: 0.08509650826454163
step: 120, loss: 0.1894870400428772
step: 130, loss: 0.17446064949035645
step: 140, loss: 0.16969835758209229
step: 150, loss: 0.16423174738883972
step: 160, loss: 0.13391868770122528
step: 170, loss: 0.12463785707950592
step: 180, loss: 0.14621633291244507
step: 190, loss: 0.16642646491527557
step: 200, loss: 0.14482979476451874
step: 210, loss: 0.19682151079177856
step: 220, loss: 0.1453271508216858
step: 230, loss: 0.1303224265575409
step: 240, loss: 0.09301222860813141
step: 250, loss: 0.08455400913953781
step: 260, loss: 0.07143307477235794
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.916170835494995
step: 10, loss: 1.982944130897522
step: 20, loss: 0.7716990113258362
step: 30, loss: 0.626810610294342
step: 40, loss: 0.291324645280838
step: 50, loss: 0.19179773330688477
step: 60, loss: 0.1934954971075058
step: 70, loss: 0.2348651885986328
step: 80, loss: 0.20660999417304993
step: 90, loss: 0.1661296933889389
step: 100, loss: 0.1051936075091362
step: 110, loss: 0.17230038344860077
step: 120, loss: 0.14863477647304535
step: 130, loss: 0.23250268399715424
step: 140, loss: 0.10126712173223495
step: 150, loss: 0.15330666303634644
step: 160, loss: 0.3045642077922821
step: 170, loss: 0.15333598852157593
step: 180, loss: 0.1344618946313858
step: 190, loss: 0.18856285512447357
step: 200, loss: 0.11375031620264053
step: 210, loss: 0.13073758780956268
step: 220, loss: 0.08021168410778046
step: 230, loss: 0.0971570760011673
step: 240, loss: 0.10998137295246124
step: 250, loss: 0.18573929369449615
step: 260, loss: 0.12979677319526672
step: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9276421070098877
step: 10, loss: 2.0107600688934326
step: 20, loss: 0.6412672996520996
step: 30, loss: 0.38036203384399414
step: 40, loss: 0.3293927311897278
step: 50, loss: 0.2412095069885254
step: 60, loss: 0.24165615439414978
step: 70, loss: 0.16257010400295258
step: 80, loss: 0.13483195006847382
step: 90, loss: 0.14514292776584625
step: 100, loss: 0.1812020093202591
step: 110, loss: 0.1305546909570694
step: 120, loss: 0.14527179300785065
step: 130, loss: 0.1642572581768036
step: 140, loss: 0.3017669916152954
step: 150, loss: 0.08671228587627411
step: 160, loss: 0.24213802814483643
step: 170, loss: 0.0745009258389473
step: 180, loss: 0.08250083029270172
step: 190, loss: 0.11874690651893616
step: 200, loss: 0.14494819939136505
step: 210, loss: 0.10813605785369873
step: 220, loss: 0.23726940155029297
step: 230, loss: 0.1603071540594101
step: 240, loss: 0.15998908877372742
step: 250, loss: 0.06342843174934387
step: 260, loss: 0.1077989786863327
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8749887943267822
step: 10, loss: 1.7468065023422241
step: 20, loss: 0.7857472896575928
step: 30, loss: 0.34477221965789795
step: 40, loss: 0.2092415988445282
step: 50, loss: 0.28058570623397827
step: 60, loss: 0.33698585629463196
step: 70, loss: 0.2531597912311554
step: 80, loss: 0.15061794221401215
step: 90, loss: 0.26120725274086
step: 100, loss: 0.13932137191295624
step: 110, loss: 0.2091992199420929
step: 120, loss: 0.06363054364919662
step: 130, loss: 0.20826083421707153
step: 140, loss: 0.11957211792469025
step: 150, loss: 0.11137189716100693
step: 160, loss: 0.188290074467659
step: 170, loss: 0.12046846002340317
step: 180, loss: 0.11150538921356201
step: 190, loss: 0.07822588831186295
step: 200, loss: 0.0844736248254776
step: 210, loss: 0.1260073035955429
step: 220, loss: 0.03734308481216431
step: 230, loss: 0.12362390756607056
step: 240, loss: 0.2047717571258545
step: 250, loss: 0.12191155552864075
step: 260, loss: 0.22699132561683655
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9826526641845703
step: 10, loss: 2.0843541622161865
step: 20, loss: 0.5950069427490234
step: 30, loss: 0.5080621838569641
step: 40, loss: 0.5369895100593567
step: 50, loss: 0.20122016966342926
step: 60, loss: 0.15766313672065735
step: 70, loss: 0.3583151400089264
step: 80, loss: 0.13639657199382782
step: 90, loss: 0.1384068876504898
step: 100, loss: 0.1984596848487854
step: 110, loss: 0.10337593406438828
step: 120, loss: 0.16928090155124664
step: 130, loss: 0.12109661847352982
step: 140, loss: 0.06675271689891815
step: 150, loss: 0.12777762115001678
step: 160, loss: 0.08939576148986816
step: 170, loss: 0.0870320275425911
step: 180, loss: 0.09719623625278473
step: 190, loss: 0.14430908858776093
step: 200, loss: 0.13102532923221588
step: 210, loss: 0.10081414878368378
step: 220, loss: 0.14174515008926392
step: 230, loss: 0.05947151407599449
step: 240, loss: 0.04016437754034996
step: 250, loss: 0.06868080794811249
step: 260, loss: 0.17887797951698303

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.05072546005249
step: 10, loss: 2.079935312271118
step: 20, loss: 1.1515299081802368
step: 30, loss: 0.5111343264579773
step: 40, loss: 0.34140393137931824
step: 50, loss: 0.24725016951560974
step: 60, loss: 0.25193795561790466
step: 70, loss: 0.1720106601715088
step: 80, loss: 0.26305127143859863
step: 90, loss: 0.17262904345989227
step: 100, loss: 0.24584728479385376
step: 110, loss: 0.20003953576087952
step: 120, loss: 0.1853145956993103
step: 130, loss: 0.07296455651521683
step: 140, loss: 0.08162181079387665
step: 150, loss: 0.1814326047897339
step: 160, loss: 0.13427726924419403
step: 170, loss: 0.07466243207454681
step: 180, loss: 0.19029615819454193
step: 190, loss: 0.16631749272346497
step: 200, loss: 0.17219825088977814
step: 210, loss: 0.2518295347690582
step: 220, loss: 0.07000984251499176
step: 230, loss: 0.15591128170490265
step: 240, loss: 0.1454756110906601
step: 250, loss: 0.10654862970113754
step: 260, loss: 0.10162365436553955
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9816367626190186
step: 10, loss: 1.9828946590423584
step: 20, loss: 0.6430389881134033
step: 30, loss: 0.41326093673706055
step: 40, loss: 0.33333176374435425
step: 50, loss: 0.2781907916069031
step: 60, loss: 0.1692461222410202
step: 70, loss: 0.11948822438716888
step: 80, loss: 0.17902271449565887
step: 90, loss: 0.2281421571969986
step: 100, loss: 0.2235051542520523
step: 110, loss: 0.16443374752998352
step: 120, loss: 0.1635790467262268
step: 130, loss: 0.05207251012325287
step: 140, loss: 0.27641353011131287
step: 150, loss: 0.17572878301143646
step: 160, loss: 0.11132755130529404
step: 170, loss: 0.18299554288387299
step: 180, loss: 0.11782286316156387
step: 190, loss: 0.1314026415348053
step: 200, loss: 0.10674039274454117
step: 210, loss: 0.2372926026582718
step: 220, loss: 0.12283238768577576
step: 230, loss: 0.12082339078187943
step: 240, loss: 0.09162910282611847
step: 250, loss: 0.06934843212366104
step: 260, loss: 0.06759125739336014


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8965487480163574
step: 10, loss: 1.867255687713623
step: 20, loss: 0.7641825675964355
step: 30, loss: 0.4129057228565216
step: 40, loss: 0.39036595821380615
step: 50, loss: 0.15351751446723938
step: 60, loss: 0.3347763419151306
step: 70, loss: 0.23730137944221497
step: 80, loss: 0.19183529913425446
step: 90, loss: 0.1530686318874359
step: 100, loss: 0.2544952929019928
step: 110, loss: 0.11204375326633453
step: 120, loss: 0.1348150074481964
step: 130, loss: 0.12346067279577255
step: 140, loss: 0.09768608957529068
step: 150, loss: 0.16451364755630493
step: 160, loss: 0.13873137533664703
step: 170, loss: 0.14260649681091309
step: 180, loss: 0.12139029055833817
step: 190, loss: 0.131741464138031
step: 200, loss: 0.19133946299552917
step: 210, loss: 0.14844904839992523
step: 220, loss: 0.2325112521648407
step: 230, loss: 0.10624198615550995
step: 240, loss: 0.06058590114116669
step: 250, loss: 0.06756109744310379
step: 260, loss: 0.08439792692661285
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 4.013138771057129
step: 10, loss: 2.2395570278167725
step: 20, loss: 0.9030024409294128
step: 30, loss: 0.526643693447113
step: 40, loss: 0.4186587631702423
step: 50, loss: 0.2710305452346802
step: 60, loss: 0.2857153117656708
step: 70, loss: 0.1367371380329132
step: 80, loss: 0.2738984525203705
step: 90, loss: 0.11650940775871277
step: 100, loss: 0.10337415337562561
step: 110, loss: 0.12969274818897247
step: 120, loss: 0.06326396018266678
step: 130, loss: 0.1254047006368637
step: 140, loss: 0.15379729866981506
step: 150, loss: 0.16816584765911102
step: 160, loss: 0.1063871905207634
step: 170, loss: 0.12878431379795074
step: 180, loss: 0.10424583405256271
step: 190, loss: 0.15119828283786774
step: 200, loss: 0.08398506045341492
step: 210, loss: 0.11412518471479416
step: 220, loss: 0.1276850402355194
step: 230, loss: 0.14920446276664734
step: 240, loss: 0.07400981336832047
step: 250, loss: 0.10633990913629532
step: 260, loss: 0.04026471823453903
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.970917224884033
step: 10, loss: 2.093954563140869
step: 20, loss: 1.0494707822799683
step: 30, loss: 0.5315368175506592
step: 40, loss: 0.330430805683136
step: 50, loss: 0.16933169960975647
step: 60, loss: 0.22597543895244598
step: 70, loss: 0.16711410880088806
step: 80, loss: 0.2433757781982422
step: 90, loss: 0.058360710740089417
step: 100, loss: 0.13158531486988068
step: 110, loss: 0.2603911757469177
step: 120, loss: 0.15230536460876465
step: 130, loss: 0.16011135280132294
step: 140, loss: 0.18889260292053223
step: 150, loss: 0.132660374045372
step: 160, loss: 0.1683526486158371
step: 170, loss: 0.0726243257522583
step: 180, loss: 0.17619886994361877
step: 190, loss: 0.2247796505689621
step: 200, loss: 0.12155357003211975
step: 210, loss: 0.15913860499858856
step: 220, loss: 0.1990828961133957
step: 230, loss: 0.1682073473930359
step: 240, loss: 0.2537221908569336
step: 250, loss: 0.18132051825523376
step: 260, loss: 0.08786503970623016
step: 2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9510927200317383
step: 10, loss: 1.6963247060775757
step: 20, loss: 0.7676147818565369
step: 30, loss: 0.5684288144111633
step: 40, loss: 0.1651923805475235
step: 50, loss: 0.24084553122520447
step: 60, loss: 0.18038272857666016
step: 70, loss: 0.12067651003599167
step: 80, loss: 0.2269279509782791
step: 90, loss: 0.1615825891494751
step: 100, loss: 0.16526082158088684
step: 110, loss: 0.12492214143276215
step: 120, loss: 0.07110109180212021
step: 130, loss: 0.29426854848861694
step: 140, loss: 0.17072153091430664
step: 150, loss: 0.17298105359077454
step: 160, loss: 0.11705880612134933
step: 170, loss: 0.13802337646484375
step: 180, loss: 0.16756004095077515
step: 190, loss: 0.2563578188419342
step: 200, loss: 0.12766103446483612
step: 210, loss: 0.23173223435878754
step: 220, loss: 0.21279790997505188
step: 230, loss: 0.09138934314250946
step: 240, loss: 0.099103644490242
step: 250, loss: 0.0988968014717102
step: 260, loss: 0.11432038247585297
s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.96004056930542
step: 10, loss: 1.8525991439819336
step: 20, loss: 0.8041223883628845
step: 30, loss: 0.4342963695526123
step: 40, loss: 0.22150608897209167
step: 50, loss: 0.1396239995956421
step: 60, loss: 0.08905162662267685
step: 70, loss: 0.23213863372802734
step: 80, loss: 0.16425423324108124
step: 90, loss: 0.11698860675096512
step: 100, loss: 0.1722438931465149
step: 110, loss: 0.2138989269733429
step: 120, loss: 0.1908452957868576
step: 130, loss: 0.21975775063037872
step: 140, loss: 0.11640804260969162
step: 150, loss: 0.19088958203792572
step: 160, loss: 0.2261100560426712
step: 170, loss: 0.0546133853495121
step: 180, loss: 0.22028818726539612
step: 190, loss: 0.19984419643878937
step: 200, loss: 0.09553410857915878
step: 210, loss: 0.1273011863231659
step: 220, loss: 0.14103415608406067
step: 230, loss: 0.09666953235864639
step: 240, loss: 0.12707912921905518
step: 250, loss: 0.08807364851236343
step: 260, loss: 0.1739995926618576
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9919679164886475
step: 10, loss: 2.1831295490264893
step: 20, loss: 0.8834181427955627
step: 30, loss: 0.40551456809043884
step: 40, loss: 0.5093245506286621
step: 50, loss: 0.17259877920150757
step: 60, loss: 0.2642151117324829
step: 70, loss: 0.2267920970916748
step: 80, loss: 0.09236352890729904
step: 90, loss: 0.06847624480724335
step: 100, loss: 0.193571075797081
step: 110, loss: 0.21791543066501617
step: 120, loss: 0.15678715705871582
step: 130, loss: 0.1794290393590927
step: 140, loss: 0.08837181329727173
step: 150, loss: 0.34447041153907776
step: 160, loss: 0.15022188425064087
step: 170, loss: 0.14880697429180145
step: 180, loss: 0.07858103513717651
step: 190, loss: 0.14007481932640076
step: 200, loss: 0.10545501112937927
step: 210, loss: 0.12647585570812225
step: 220, loss: 0.1506822556257248
step: 230, loss: 0.12373344600200653
step: 240, loss: 0.07537040114402771
step: 250, loss: 0.1313236653804779
step: 260, loss: 0.10953597724437714
s

In [None]:
# Persist the model's parameters to "<model_dir>/nonfixed_model.pt".
# Only the state_dict is written (not the model object itself), so loading
# it back requires building the same architecture first and then calling
# `load_state_dict` on it.
model_file = os.path.join(model_dir, "nonfixed_model.pt")
torch.save(obj=model.state_dict(), f=model_file)

In [None]:
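# Disabled debugging snippet: counts how often each entry of `top_words`
# occurs (entries are converted to tuples so they are hashable) and prints
# the three most frequent ones.
# from collections import Counter

# c = Counter([tuple(sen) for sen in top_words])
# print(c.most_common(3))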

In [None]:
import pandas as pd

In [None]:
# Long-format ("tidy") table of the per-loop evaluation metrics: one row per
# (loop index, metric name) pair, which is the layout a colour-grouped line
# plot expects.
# The number of loops is taken from the precision list; the recall and F1
# lists are presumably the same length -- pandas raises a ValueError on
# construction if the three columns end up with different lengths.
_n_loops = len(domain_precision_value_lst)
test_metric = pd.DataFrame({
    # Loop indices 0.._n_loops-1, repeated once for each of the three metrics.
    "Loop": [loop_idx for _ in range(3) for loop_idx in range(_n_loops)],
    # Metric label for every row, in the same block order as "value" below.
    "metric": [
        label
        for label in ("precision", "recall", "f1")
        for _ in range(_n_loops)
    ],
    # Precision values first, then recall, then F1 (list concatenation).
    "value": (
        domain_precision_value_lst
        + domain_recall_value_lst
        + domain_f1_value_lst
    ),
})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Plot precision, recall and F1 against the loop index, one coloured line per
# metric. A title and human-readable axis/legend labels are set so the figure
# can be understood on its own when the notebook is skimmed; the underlying
# data and traces are unchanged.
fig = px.line(
    test_metric,
    x="Loop",
    y="value",
    color="metric",
    markers=True,  # draw a marker at every loop so individual points are visible
    title="Precision, recall and F1 per loop",
    # `labels` only renames what is displayed; the DataFrame columns keep
    # their original names.
    labels={"Loop": "Loop", "value": "Score", "metric": "Metric"},
)
fig.show()