# Load Data

In [None]:
! pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the project folder below is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder on the import path so the local `utils` module can be imported.
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Capstone')

import os
import pandas as pd
import numpy as np

from utils import read_conll_file, read_data


# Root of the corpus, the WSJ sub-folder of its fine-grained POS data, and the checkpoint folder.
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
model_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/model"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Paths of the WSJ train / dev / test splits (.conll files).
wsj_train_file = os.path.join(wsj_dir, "gweb-wsj-train.conll")
wsj_dev_file = os.path.join(wsj_dir, "gweb-wsj-dev.conll")
wsj_test_file = os.path.join(wsj_dir, "gweb-wsj-test.conll")

In [None]:
# Each read_data call is unpacked into three values: judging by the names, the
# per-sentence words, the per-sentence tags, and the tags seen in the file
# (NOTE(review): read_data lives in utils and is not visible here — confirm).
# The call also prints the sample and tag counts shown in the cell output.
wsj_train_word_lst, wsj_train_tag_lst, wsj_train_tag_set = read_data(wsj_train_file)
wsj_dev_word_lst, wsj_dev_tag_lst, wsj_dev_tag_set = read_data(wsj_dev_file)
wsj_test_word_lst, wsj_test_tag_lst, wsj_test_tag_set = read_data(wsj_test_file)

The number of samples: 30060
The number of tags 48
The number of samples: 1336
The number of tags 45
The number of samples: 1640
The number of tags 45


In [None]:
# Tag vocabulary: the de-duplicated, sorted union of the tags of the three WSJ
# splits, with "<pad>" placed first so that it always receives index 0.
wsj_tags = ["<pad>"] + sorted(set(wsj_train_tag_set + wsj_dev_tag_set + wsj_test_tag_set))

# Lookup tables in both directions between a tag string and its integer id.
tag2idx = {tag: idx for idx, tag in enumerate(wsj_tags)}
idx2tag = dict(enumerate(wsj_tags))
print(len(wsj_tags))

49


# Build Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Wordpiece tokenizer of the cased BERT-base checkpoint; lower-casing is switched off.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
# tokens = tokenizer.tokenize("mistakenly")
# tokens

In [None]:
# tid = tokenizer.convert_tokens_to_ids(tokens)
# tid

In [None]:
# tokenizer.convert_ids_to_tokens([234,2000,3000,22893])

In [None]:
class PosDataset(data.Dataset):
    '''Sentence-level POS dataset that yields wordpiece ids aligned with tag ids.

    Every sentence is wrapped in "[CLS]" / "[SEP]", and the matching tag
    positions are filled with "<pad>". Relies on the notebook-level
    `tokenizer` and `tag2idx` objects.
    '''
    def __init__(self, word_lst, tag_lst):
        '''
        word_lst: list of sentences, each a list of word strings
        tag_lst: list of tag sequences, parallel to word_lst
        '''
        sents, tags_li = [], [] # list of lists
        for i in range(len(word_lst)):
            sents.append(["[CLS]"] + word_lst[i] + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        '''Number of sentences.'''
        return len(self.sents)

    def __getitem__(self, idx):
        '''Returns (words, x, is_heads, tags, y, seqlen) for sentence idx.

        words / tags: the sentence and its tags as space-joined strings
        x: wordpiece ids
        is_heads: 1 for the first piece of a word, 0 for the following pieces
        y: tag ids; only the first piece of a word carries the word's tag,
           the following pieces get "<pad>"
        seqlen: number of wordpieces
        '''
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer may return no pieces at all for a word (for
                # instance one made only of characters it strips). Without a
                # piece the word would still get a head flag and a tag but no
                # id, and the length check below would fail. Keep the word as
                # a single unknown piece so x, y and is_heads stay aligned.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def pad(batch):
    '''Collate function: right-pads ids and labels to the longest sample.

    batch: list of (words, x, is_heads, tags, y, seqlen) samples
    Returns (words, x, is_heads, tags, y, seqlens). x and y are LongTensors
    of shape (N, maxlen); the other entries are plain per-sample lists.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    def _right_pad(ids):
        # 0 is the id of <pad>
        return ids + [0] * (longest - len(ids))

    padded_x = [_right_pad(sample[1]) for sample in batch]
    padded_y = [_right_pad(sample[-2]) for sample in batch]

    return words, torch.LongTensor(padded_x), is_heads, tags, torch.LongTensor(padded_y), seqlens

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    '''Cased BERT-base encoder with a linear per-token classifier on top.'''
    def __init__(self, vocab_size=None):
        '''vocab_size: number of output classes of the linear layer.'''
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def _last_layer(self, x):
        '''Runs BERT on x and returns the last entry of its first output.'''
        hidden_states, _ = self.bert(x)
        return hidden_states[-1]

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        Returns (logits, y, y_hat): the classifier scores, the labels moved
        to `device`, and the argmax of the scores over the last axis.
        '''
        x, y = x.to(device), y.to(device)

        if not self.training:
            # Inference: BERT in eval mode, encoder run without gradient tracking.
            self.bert.eval()
            with torch.no_grad():
                enc = self._last_layer(x)
        else:
            self.bert.train()
            enc = self._last_layer(x)

        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

In [None]:
def train(model, iterator, optimizer, criterion):
    '''Runs one pass over iterator, taking one optimizer step per batch.

    Each batch is a (words, x, is_heads, tags, y, seqlens) tuple. The loss is
    printed every 10th step.
    '''
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        # Flatten so that every token position is one classification example.
        flat_logits = logits.view(-1, logits.size(-1)) # (N*T, VOCAB)
        flat_y = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_y)
        loss.backward()
        optimizer.step()

        if step % 10 == 0: # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator, average="macro"):
    '''Evaluates model on iterator, writes predictions to "result", returns metrics.

    model: called as model(x, y); its third output is used as the predicted
           tag ids, shape (N, T)
    iterator: yields (words, x, is_heads, tags, y, seqlens) batches
    average: averaging mode passed to the sklearn precision / recall / f1 scorers

    Side effects: (re)writes the file "result" in the working directory with
    one "word gold_tag predicted_tag" line per word and a blank line after
    each sentence, and prints the accuracy and the classification report.

    Returns (precision, recall, f1).

    NOTE: the name shadows Python's built-in eval; it is kept because the
    rest of the notebook calls it under this name.
    '''
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # The gold / predicted ids are collected while the lines are written, so
    # the file does not have to be re-opened and parsed afterwards.
    true_ids, pred_ids = [], []
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # Keep one prediction per word: the one made on its first piece.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            # [1:-1] drops the [CLS] / [SEP] positions.
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                true_ids.append(tag2idx[t])
                pred_ids.append(tag2idx[p])
            fout.write("\n")

    ## calc metric
    y_true = np.array(true_ids)
    y_pred = np.array(pred_ids)

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)
    print("classification_report", classification_report(y_true, y_pred))
    precision_value = precision_score(y_true, y_pred, average=average)
    recall_value = recall_score(y_true, y_pred, average=average)
    f1_value = f1_score(y_true, y_pred, average=average)

    return precision_value, recall_value, f1_value

In [None]:
# One output class per entry of the tag vocabulary (including "<pad>").
model = Net(vocab_size=len(tag2idx))
model.to(device)
# Wrapping in DataParallel prefixes the state-dict keys with "module.".
model = nn.DataParallel(model)

In [None]:
# WSJ train split for fitting, WSJ test split for evaluation.
train_dataset = PosDataset(wsj_train_word_lst, wsj_train_tag_lst)
eval_dataset = PosDataset(wsj_test_word_lst, wsj_test_tag_lst)

# `pad` collates variable-length samples into padded LongTensors.
train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

# Index 0 is "<pad>" in tag2idx, so padded positions and non-first wordpieces
# (both labelled 0) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# train(model, train_iter, optimizer, criterion)
# eval(model, test_iter)

# Save Model

In [None]:
# Checkpoint path of the base model; the save call itself is left commented out.
model_file = os.path.join(model_dir, "base_model.pt")
# torch.save(model.state_dict(), model_file)

## Load Model

In [None]:
# Rebuild the network and wrap it in DataParallel before loading the weights.
# NOTE(review): this presumes the checkpoint was saved from a DataParallel-wrapped
# model (as the commented-out save call would do) so the key names match — confirm.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)
model.load_state_dict(torch.load(model_file))
# In-domain (WSJ test) scores of the loaded model.
wsj_precision_value, wsj_recall_value, wsj_f1_value = eval(model, test_iter)

acc=0.97
classification_report               precision    recall  f1-score   support

           1       1.00      1.00      1.00       178
           2       1.00      1.00      1.00       352
           3       1.00      1.00      1.00      2000
           4       1.00      1.00      1.00        60
           5       1.00      1.00      1.00        60
           6       1.00      1.00      1.00      1613
           7       1.00      1.00      1.00       223
           9       1.00      0.99      1.00       935
          10       0.98      1.00      0.99      1266
          11       0.99      1.00      0.99      3309
          12       1.00      1.00      1.00        46
          13       1.00      0.20      0.33        20
          14       1.00      0.99      1.00       511
          15       0.97      0.99      0.98      4250
          16       0.97      0.89      0.93      2423
          17       0.96      0.93      0.94       139
          18       0.92      0.93      0.93       

In [None]:
# wsj_precision_value, wsj_recall_value, wsj_f1_value

# Self Training

In [None]:
def filter_tag(process_words, process_tags, label_tags_set=wsj_tags):
  '''Drops every token whose tag is not in label_tags_set.

  process_words: list of sentences, each a list of words
  process_tags: list of tag sequences, parallel to process_words
  label_tags_set: collection of tags to keep (defaults to the WSJ tag vocabulary)

  Sentences left without any token are removed altogether. Prints the number
  of remaining sentences and returns (new_words, new_tags).
  '''
  # Build the lookup set once instead of scanning the collection for every token.
  allowed = set(label_tags_set)

  new_words = []
  new_tags = []
  for words, tags in zip(process_words, process_tags):
    kept = [(words[i], t) for i, t in enumerate(tags) if t in allowed]

    if kept:
      new_words.append([w for w, _ in kept])
      new_tags.append([t for _, t in kept])
  print("after filter tag", len(new_words))
  return new_words, new_tags

In [None]:
# Names of the available web domains. NOTE(review): this list is not referenced
# anywhere else in the visible notebook; only `domain` below selects the data.
file_name_lst = ["answers", "emails", "newsgroups", "reviews", "weblogs"]

In [None]:
# Target domain for self-training and the paths of its dev / test .conll files.
domain = "emails"
domain_dir = os.path.join(data_dir, "pos_fine", f"{domain}")
domain_dev_file = os.path.join(domain_dir, f"gweb-{domain}-dev.conll")
domain_test_file = os.path.join(domain_dir, f"gweb-{domain}-test.conll")

In [None]:
# Load the target-domain dev / test data, then drop tokens whose tag is not in
# the WSJ tag vocabulary (filter_tag's default), since tag2idx has no id for them.
domain_dev_word_lst, domain_dev_tag_lst, domain_dev_tag_set = read_data(domain_dev_file)
domain_test_word_lst, domain_test_tag_lst, domain_test_tag_set = read_data(domain_test_file)
domain_dev_word_lst, domain_dev_tag_lst = filter_tag(domain_dev_word_lst, domain_dev_tag_lst)  
domain_test_word_lst, domain_test_tag_lst = filter_tag(domain_test_word_lst, domain_test_tag_lst)

The number of samples: 2450
The number of tags 49
The number of samples: 2450
The number of tags 48
after filter tag 2427
after filter tag 2402


In [None]:
# Target-domain test scores, one entry appended per evaluation run.
domain_precision_value_lst = []
domain_recall_value_lst = []
domain_f1_value_lst = []

In [None]:
# Evaluate the current model on the target-domain test set before any
# self-training; this becomes the first entry of the metric lists.
domain_test_dataset = PosDataset(domain_test_word_lst, domain_test_tag_lst)

domain_test_iter = data.DataLoader(dataset=domain_test_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

domain_precision_value, domain_recall_value, domain_f1_value = eval(model, domain_test_iter)

domain_precision_value_lst.append(domain_precision_value)
domain_recall_value_lst.append(domain_recall_value)
domain_f1_value_lst.append(domain_f1_value)

acc=0.91
classification_report               precision    recall  f1-score   support

           1       0.79      0.94      0.86        35
           2       0.87      0.52      0.65        77
           3       1.00      0.79      0.88      1030
           4       1.00      0.84      0.91       291
           5       0.91      0.84      0.87       294
           6       0.99      0.98      0.99      1570
           7       0.61      0.94      0.74       186
           8       0.00      0.00      0.00        11
           9       0.99      0.98      0.98       689
          10       0.93      0.98      0.96       901
          11       0.96      1.00      0.98      2111
          12       0.98      0.96      0.97        47
          13       0.60      0.46      0.52        13
          14       0.28      1.00      0.44        43
          15       0.93      0.98      0.95      2778
          16       0.90      0.80      0.85      1151
          17       0.91      0.95      0.93       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
class PosDataset_new(data.Dataset):
    '''Dataset over parallel word / tag sequences that are returned as stored.'''
    def __init__(self, word_lst, tag_lst):
        self.word_lst = word_lst
        self.tag_lst = tag_lst

    def __len__(self):
        '''Number of samples.'''
        return len(self.word_lst)

    def __getitem__(self, idx):
        '''Returns (words, tags, seqlen) for sample idx; both sequences must have equal length.'''
        sample_words = self.word_lst[idx]
        sample_tags = self.tag_lst[idx]
        assert len(sample_words) == len(sample_tags)
        return sample_words, sample_tags, len(sample_words)

def pad_new(batch):
    '''Collate function for (x, y, seqlen) samples: right-pads to the longest sample.

    Returns (x, y, seqlens) where x and y are LongTensors of shape (N, maxlen)
    and seqlens is the list of original lengths.
    '''
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    def _right_pad(ids):
        # 0 is the id of <pad>
        return ids + [0] * (longest - len(ids))

    padded_x = [_right_pad(sample[0]) for sample in batch]
    padded_y = [_right_pad(sample[1]) for sample in batch]

    return torch.LongTensor(padded_x), torch.LongTensor(padded_y), seqlens

def train_new(model, iterator, optimizer, criterion):
    '''Runs one pass over iterator, taking one optimizer step per batch.

    Each batch is an (x, y, seqlens) tuple. The loss is printed every 10th step.
    '''
    model.train()
    for step, (x, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        # Flatten so that every token position is one classification example.
        flat_logits = logits.view(-1, logits.size(-1)) # (N*T, VOCAB)
        flat_y = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_y)
        loss.backward()
        optimizer.step()

        if step % 10 == 0: # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def gen_pseudo_data(model, domain_dev_iter, topn=300, initial=True):
  '''Labels the batches with model and splits the samples by confidence.

  model: called as model(x, y); returns (logits, _, y_hat) with logits of
         shape (N, T, C) and y_hat of shape (N, T)
  domain_dev_iter: iterable of batches. With initial=True a batch is the
         6-tuple produced by `pad` (x and y at positions 1 and 4); otherwise
         it is the (x, y, seqlens) triple produced by `pad_new`.
  topn: number of most confident samples to select
  initial: selects the batch layout, see above

  The confidence of a sample is the mean, over the positions where y is
  non-zero, of the highest softmax probability at that position. A sample
  without any non-zero position gets confidence 0 rather than NaN, so the
  ranking stays well defined.

  Returns six lists: the ids, predicted tag ids and per-position maximum
  probabilities of the topn most confident samples, followed by the same
  three lists for the remaining samples, both in descending confidence order.
  '''
  model.eval()

  mean_probs = []
  new_x_lst = []
  new_y_lst = []
  new_prob_lst = []

  with torch.no_grad():
    for batch in domain_dev_iter:
      # The two supported batch layouts differ only in where x and y sit.
      if initial:
        _, x, _, _, y, _ = batch
      else:
        x, y, _ = batch

      logits, _, y_hat = model(x, y)  # y_hat: (N, T)

      # Save prediction as new training dataset
      max_prob = torch.amax(torch.softmax(logits, dim=2), dim=2)

      # Rank by mean probability. The mask is moved to wherever the model put
      # its output, so no module-level device name is needed here.
      mask = y.bool().to(max_prob.device)
      sen_len = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for empty samples
      mean_prob = (mask * max_prob).sum(dim=1) / sen_len
      # Plain floats: one transfer per batch, and sorting compares numbers
      # instead of 0-dim tensors.
      mean_probs.extend(mean_prob.tolist())

      new_x_lst.extend(x.tolist())
      new_y_lst.extend(y_hat.tolist())
      new_prob_lst.extend(max_prob.tolist())

  # Descending by confidence; ties are broken by the larger sample index first.
  ind = [i for _, i in sorted(zip(mean_probs, range(len(mean_probs))), reverse=True)]

  select_ind = ind[: topn]
  not_select_ind = ind[topn: ]

  new_train_x = [new_x_lst[i] for i in select_ind]
  new_train_y = [new_y_lst[i] for i in select_ind]
  new_train_prob = [new_prob_lst[i] for i in select_ind]

  remain_train_x = [new_x_lst[i] for i in not_select_ind]
  remain_train_y = [new_y_lst[i] for i in not_select_ind]
  remain_train_prob = [new_prob_lst[i] for i in not_select_ind]

  return new_train_x, new_train_y, new_train_prob, remain_train_x, remain_train_y, remain_train_prob

In [None]:
# Target-domain dev data in its original order (shuffle=False).
domain_dev_dataset = PosDataset(domain_dev_word_lst, domain_dev_tag_lst)

domain_dev_iter = data.DataLoader(dataset=domain_dev_dataset,
                            batch_size=8,
                            shuffle=False,
                            num_workers=1,
                            collate_fn=pad)

In [None]:
# model.eval()

# LLD = []
# MEAN_PROB = []
# new_x_lst = []
# new_y_lst = []

# with torch.no_grad():
#     for i, batch in enumerate(domain_dev_iter):

#       _, x, _, _, y, _ = batch
#       sen_len = y.bool().sum(axis=1)

#       logits, _, y_hat = model(x, y)  # y_hat: (N, T)

#       # Save prediction as new training dataset
#       softmax_value = torch.softmax(logits, dim=2)
#       max_prob = torch.amax(softmax_value, dim=2)

#       # Rank by mean probability
#       res_prob = y.bool().to(device) * max_prob.to(device)
#       sum_prob = res_prob.sum(axis=1)
#       mean_prob = sum_prob / sen_len.to(device)
#       MEAN_PROB.extend(mean_prob)
      
#       new_x_lst.extend(x.tolist())
#       new_y_lst.extend(y_hat.tolist())
#       break

In [None]:
# y.shape

In [None]:
# y

In [None]:
# y_hat

In [None]:
# max_prob

In [None]:
# Number of target-domain dev sentences left after tag filtering.
len(domain_dev_word_lst)

2427

In [None]:
# Predictions whose top softmax probability is below this value are replaced by
# "<pad>", which the loss ignores (ignore_index=0).
threshold_prob = 0.9

# 1. topn token overall
# 2. select from each POS class 1% , 2%, 5%


# Select every dev sentence: with topn equal to the dataset size the ranking in
# gen_pseudo_data only orders the sentences, it does not drop any.
topn = len(domain_dev_word_lst)
i = 0
# i is incremented before use, so the body runs 11 times (Loop 1 .. Loop 11).
while i <= 10:
  i += 1
  print("\nLoop", i)
  print("domain_dev_word_lst", len(domain_dev_word_lst))

  domain_dev_dataset = PosDataset(domain_dev_word_lst, domain_dev_tag_lst)

  domain_dev_iter = data.DataLoader(dataset=domain_dev_dataset,
                              batch_size=8,
                              shuffle=False,
                              num_workers=1,
                                collate_fn=pad)
  
  # `model` is whatever the previous step left behind: the loaded base model on
  # the first pass, afterwards the model trained in the preceding pass.
  top_words_ids, top_tags_ids, top_prob_lst, _, _, _ = gen_pseudo_data(model, domain_dev_iter, topn)

  # Revert ids to words
  # NOTE(review): the ids are wordpiece ids, so `words` below holds wordpiece
  # tokens rather than the original words, and every piece (not only the first
  # piece of a word) receives a predicted tag. PosDataset tokenizes these pieces
  # again when the training set is built — confirm this is intended.
  top_words = []
  top_tags = []
  top_prob = []
  for t in range(len(top_words_ids)):
    word_ids = tokenizer.convert_ids_to_tokens(top_words_ids[t])
    tag_ids = list(map(idx2tag.get, top_tags_ids[t]))
    prob_lst = top_prob_lst[t]
    words = []
    tags = []
    probs = []
    for k, w in enumerate(word_ids):
      if w == '[CLS]':
        pass
      elif w == '[SEP]':
        # Everything after [SEP] is batch padding.
        break
      else:
        words.append(w)
        
        # Keep the predicted tag only when the model is confident enough.
        if prob_lst[k] >= threshold_prob:
          tags.append(tag_ids[k])
        else:
          tags.append('<pad>')

        probs.append(prob_lst[k])
        
    top_words.append(words)
    top_tags.append(tags)
    top_prob.append(probs)

  # Labelled WSJ training data plus the pseudo-labelled target-domain sentences.
  new_train_dataset = PosDataset(wsj_train_word_lst+top_words, wsj_train_tag_lst+top_tags)
  new_train_iter = data.DataLoader(dataset=new_train_dataset,
                              batch_size=8,
                              shuffle=True,
                              num_workers=1,
                              collate_fn=pad)

  # A fresh network, optimizer and loss are created on every pass.
  print("Train from scratch...")
  model = Net(vocab_size=len(tag2idx))
  model.to(device)
  model = nn.DataParallel(model)

  optimizer = optim.Adam(model.parameters(), lr = 0.0001)
  criterion = nn.CrossEntropyLoss(ignore_index=0)

  train(model, new_train_iter, optimizer, criterion)

  # Score the retrained model on the target-domain test set and record the result.
  domain_precision_value, domain_recall_value, domain_f1_value = eval(model, domain_test_iter)
  domain_precision_value_lst.append(domain_precision_value)
  domain_recall_value_lst.append(domain_recall_value)
  domain_f1_value_lst.append(domain_f1_value)


Loop 1
domain_dev_word_lst 2427
Train from scratch...
step: 0, loss: 3.8578107357025146
step: 10, loss: 2.022843837738037
step: 20, loss: 0.9560863375663757
step: 30, loss: 0.35957786440849304
step: 40, loss: 0.4113931655883789
step: 50, loss: 0.19826169312000275
step: 60, loss: 0.293730765581131
step: 70, loss: 0.2802572548389435
step: 80, loss: 0.31896159052848816
step: 90, loss: 0.13969437777996063
step: 100, loss: 0.2858290672302246
step: 110, loss: 0.20158474147319794
step: 120, loss: 0.1962890326976776
step: 130, loss: 0.06106842681765556
step: 140, loss: 0.20354318618774414
step: 150, loss: 0.31151801347732544
step: 160, loss: 0.06656526029109955
step: 170, loss: 0.06906750053167343
step: 180, loss: 0.25116419792175293
step: 190, loss: 0.11046190559864044
step: 200, loss: 0.14170722663402557
step: 210, loss: 0.1346970796585083
step: 220, loss: 0.042788807302713394
step: 230, loss: 0.04952073097229004
step: 240, loss: 0.1223900094628334
step: 250, loss: 0.09885464608669281
step:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9744224548339844
step: 10, loss: 1.8904787302017212
step: 20, loss: 0.8285925388336182
step: 30, loss: 0.4231554865837097
step: 40, loss: 0.399410218000412
step: 50, loss: 0.252452552318573
step: 60, loss: 0.1653655469417572
step: 70, loss: 0.2522847056388855
step: 80, loss: 0.2739276885986328
step: 90, loss: 0.19627831876277924
step: 100, loss: 0.19765301048755646
step: 110, loss: 0.18636098504066467
step: 120, loss: 0.2103412002325058
step: 130, loss: 0.09848518669605255
step: 140, loss: 0.07492765784263611
step: 150, loss: 0.10969369858503342
step: 160, loss: 0.16029424965381622
step: 170, loss: 0.21435096859931946
step: 180, loss: 0.1558590531349182
step: 190, loss: 0.0823933482170105
step: 200, loss: 0.126363605260849
step: 210, loss: 0.28107380867004395
step: 220, loss: 0.1996993124485016
step: 230, loss: 0.1396445482969284
step: 240, loss: 0.37917211651802063
step: 250, loss: 0.20601120591163635
step: 260, loss: 0.28652194142341614
step: 27

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9339003562927246
step: 10, loss: 2.289106845855713
step: 20, loss: 0.7053303718566895
step: 30, loss: 0.6224756240844727
step: 40, loss: 0.3166123330593109
step: 50, loss: 0.5051404237747192
step: 60, loss: 0.13824541866779327
step: 70, loss: 0.08874711394309998
step: 80, loss: 0.13650724291801453
step: 90, loss: 0.14978356659412384
step: 100, loss: 0.14438968896865845
step: 110, loss: 0.25198033452033997
step: 120, loss: 0.07747942209243774
step: 130, loss: 0.12324719876050949
step: 140, loss: 0.22597134113311768
step: 150, loss: 0.27523043751716614
step: 160, loss: 0.15472295880317688
step: 170, loss: 0.09249476343393326
step: 180, loss: 0.17971962690353394
step: 190, loss: 0.06000003591179848
step: 200, loss: 0.21116024255752563
step: 210, loss: 0.1884876787662506
step: 220, loss: 0.15460163354873657
step: 230, loss: 0.09133506566286087
step: 240, loss: 0.17950715124607086
step: 250, loss: 0.1579282283782959
step: 260, loss: 0.07437818497419357

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9027211666107178
step: 10, loss: 1.8218066692352295
step: 20, loss: 0.5341317653656006
step: 30, loss: 0.373980313539505
step: 40, loss: 0.2784781754016876
step: 50, loss: 0.14584752917289734
step: 60, loss: 0.2054794728755951
step: 70, loss: 0.3061464726924896
step: 80, loss: 0.25042346119880676
step: 90, loss: 0.3166840672492981
step: 100, loss: 0.21608568727970123
step: 110, loss: 0.08802615106105804
step: 120, loss: 0.12345367670059204
step: 130, loss: 0.14347471296787262
step: 140, loss: 0.18266521394252777
step: 150, loss: 0.03799182176589966
step: 160, loss: 0.1504223495721817
step: 170, loss: 0.11642548441886902
step: 180, loss: 0.34273630380630493
step: 190, loss: 0.10991539806127548
step: 200, loss: 0.16636116802692413
step: 210, loss: 0.07686644792556763
step: 220, loss: 0.10445191711187363
step: 230, loss: 0.16015705466270447
step: 240, loss: 0.09220515191555023
step: 250, loss: 0.16339239478111267
step: 260, loss: 0.21591594815254211


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9450647830963135
step: 10, loss: 1.9303308725357056
step: 20, loss: 0.7436556220054626
step: 30, loss: 0.35262247920036316
step: 40, loss: 0.2850682735443115
step: 50, loss: 0.2496727854013443
step: 60, loss: 0.2910245954990387
step: 70, loss: 0.11595608294010162
step: 80, loss: 0.2279968112707138
step: 90, loss: 0.28462380170822144
step: 100, loss: 0.1593732237815857
step: 110, loss: 0.0955706387758255
step: 120, loss: 0.24801349639892578
step: 130, loss: 0.15364909172058105
step: 140, loss: 0.09673310071229935
step: 150, loss: 0.11215564608573914
step: 160, loss: 0.1718411147594452
step: 170, loss: 0.08355002105236053
step: 180, loss: 0.17112946510314941
step: 190, loss: 0.08040986955165863
step: 200, loss: 0.21723614633083344
step: 210, loss: 0.17928163707256317
step: 220, loss: 0.06091427430510521
step: 230, loss: 0.13611041009426117
step: 240, loss: 0.1116798147559166
step: 250, loss: 0.0772329643368721
step: 260, loss: 0.08671128749847412
st

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9125754833221436
step: 10, loss: 2.266775131225586
step: 20, loss: 0.8810528516769409
step: 30, loss: 0.44713687896728516
step: 40, loss: 0.23138757050037384
step: 50, loss: 0.2086188942193985
step: 60, loss: 0.22301948070526123
step: 70, loss: 1.3270195722579956
step: 80, loss: 0.32700708508491516
step: 90, loss: 0.1893535703420639
step: 100, loss: 0.18158401548862457
step: 110, loss: 0.1648378074169159
step: 120, loss: 0.30230948328971863
step: 130, loss: 0.16539664566516876
step: 140, loss: 0.3052779734134674
step: 150, loss: 0.19958344101905823
step: 160, loss: 0.12110505998134613
step: 170, loss: 0.11179114133119583
step: 180, loss: 0.1142435371875763
step: 190, loss: 0.12621475756168365
step: 200, loss: 0.17072132229804993
step: 210, loss: 0.13609035313129425
step: 220, loss: 0.23101240396499634
step: 230, loss: 0.17650935053825378
step: 240, loss: 0.16503679752349854
step: 250, loss: 0.08404826372861862
step: 260, loss: 0.1914597451686859
s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9424540996551514
step: 10, loss: 2.002934694290161
step: 20, loss: 0.9450781345367432
step: 30, loss: 0.49137580394744873
step: 40, loss: 0.28753459453582764
step: 50, loss: 0.4099986255168915
step: 60, loss: 0.21073055267333984
step: 70, loss: 0.16521331667900085
step: 80, loss: 0.150138720870018
step: 90, loss: 0.3428265452384949
step: 100, loss: 0.19429075717926025
step: 110, loss: 0.15412501990795135
step: 120, loss: 0.12337151914834976
step: 130, loss: 0.17618608474731445
step: 140, loss: 0.1268484741449356
step: 150, loss: 0.06639576703310013
step: 160, loss: 0.2297154664993286
step: 170, loss: 0.13366635143756866
step: 180, loss: 0.0923285037279129
step: 190, loss: 0.0953238308429718
step: 200, loss: 0.07968135923147202
step: 210, loss: 0.23057647049427032
step: 220, loss: 0.07571388781070709
step: 230, loss: 0.18365436792373657
step: 240, loss: 0.27481237053871155
step: 250, loss: 0.1395493894815445
step: 260, loss: 0.3155767321586609
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.8351759910583496
step: 10, loss: 2.1066806316375732
step: 20, loss: 0.9604615569114685
step: 30, loss: 0.3924427628517151
step: 40, loss: 0.30013036727905273
step: 50, loss: 0.23292863368988037
step: 60, loss: 0.18906502425670624
step: 70, loss: 0.24661128222942352
step: 80, loss: 0.21913781762123108
step: 90, loss: 0.19464431703090668
step: 100, loss: 0.18617886304855347
step: 110, loss: 0.08205337822437286
step: 120, loss: 0.08649905771017075
step: 130, loss: 0.18566110730171204
step: 140, loss: 0.10276804864406586
step: 150, loss: 0.12429571896791458
step: 160, loss: 0.14546851813793182
step: 170, loss: 0.12877807021141052
step: 180, loss: 0.11385523527860641
step: 190, loss: 0.05925017595291138
step: 200, loss: 0.15535056591033936
step: 210, loss: 0.08999836444854736
step: 220, loss: 0.07373537123203278
step: 230, loss: 0.17364977300167084
step: 240, loss: 0.170032799243927
step: 250, loss: 0.09139011055231094
step: 260, loss: 0.16414499282836

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9851202964782715
step: 10, loss: 2.224785566329956
step: 20, loss: 0.9794295430183411
step: 30, loss: 1.119818091392517
step: 40, loss: 0.26466700434684753
step: 50, loss: 0.2720685601234436
step: 60, loss: 0.23667916655540466
step: 70, loss: 0.2318653166294098
step: 80, loss: 0.21900655329227448
step: 90, loss: 0.32603392004966736
step: 100, loss: 0.14908678829669952
step: 110, loss: 0.3333498537540436
step: 120, loss: 0.09357237070798874
step: 130, loss: 0.1587657332420349
step: 140, loss: 0.252299427986145
step: 150, loss: 0.11623068153858185
step: 160, loss: 0.312703400850296
step: 170, loss: 0.11389130353927612
step: 180, loss: 0.19228552281856537
step: 190, loss: 0.26803186535835266
step: 200, loss: 0.2172648161649704
step: 210, loss: 0.07492701709270477
step: 220, loss: 0.12881827354431152
step: 230, loss: 0.13540121912956238
step: 240, loss: 0.052153799682855606
step: 250, loss: 0.1450689435005188
step: 260, loss: 0.045438844710588455
step

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.889564037322998
step: 10, loss: 1.879541277885437
step: 20, loss: 0.852138102054596
step: 30, loss: 0.6019236445426941
step: 40, loss: 0.29497015476226807
step: 50, loss: 0.2186659574508667
step: 60, loss: 0.13997168838977814
step: 70, loss: 0.21559494733810425
step: 80, loss: 0.14445188641548157
step: 90, loss: 0.16802670061588287
step: 100, loss: 0.17704999446868896
step: 110, loss: 0.09666986763477325
step: 120, loss: 0.18809185922145844
step: 130, loss: 0.10295765101909637
step: 140, loss: 0.12088201195001602
step: 150, loss: 0.20180293917655945
step: 160, loss: 0.1826397180557251
step: 170, loss: 0.11122650653123856
step: 180, loss: 0.10047156363725662
step: 190, loss: 0.20482635498046875
step: 200, loss: 0.2059742510318756
step: 210, loss: 0.15915395319461823
step: 220, loss: 0.19827055931091309
step: 230, loss: 0.13552021980285645
step: 240, loss: 0.17386871576309204
step: 250, loss: 0.08044538646936417
step: 260, loss: 0.12348268181085587


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train from scratch...
step: 0, loss: 3.9646313190460205
step: 10, loss: 1.97420334815979
step: 20, loss: 1.0019079446792603
step: 30, loss: 0.38914409279823303
step: 40, loss: 0.4883798360824585
step: 50, loss: 0.21851304173469543
step: 60, loss: 0.15883852541446686
step: 70, loss: 0.11100539565086365
step: 80, loss: 0.09270802140235901
step: 90, loss: 0.1336880475282669
step: 100, loss: 0.188099205493927
step: 110, loss: 0.24303926527500153
step: 120, loss: 0.28486183285713196
step: 130, loss: 0.210298553109169
step: 140, loss: 0.15324409306049347
step: 150, loss: 0.21110299229621887
step: 160, loss: 0.18490919470787048
step: 170, loss: 0.026357881724834442
step: 180, loss: 0.23249347507953644
step: 190, loss: 0.1527516096830368
step: 200, loss: 0.08726396411657333
step: 210, loss: 0.07206195592880249
step: 220, loss: 0.21130336821079254
step: 230, loss: 0.1278863400220871
step: 240, loss: 0.12176171690225601
step: 250, loss: 0.0808488205075264
step: 260, loss: 0.26033347845077515
ste

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd

In [None]:
# Long-format table with one row per (evaluation run, metric). "Loop" is the
# position in the metric lists, so 0 is the first recorded evaluation.
test_metric = pd.DataFrame({
    "Loop": list(range(len(domain_precision_value_lst))) * 3,
    "metric": ["precision"]*len(domain_precision_value_lst) + ["recall"]*len(domain_precision_value_lst) + ["f1"]*len(domain_precision_value_lst),
    "value": domain_precision_value_lst + domain_recall_value_lst + domain_f1_value_lst
})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# One line per metric across the recorded evaluation runs.
fig = px.line(test_metric, x="Loop", y="value", color='metric', markers=True)
fig.show()