# Load Data

In [1]:
! pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 32.0 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.26.5-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 51.2 MB/s 
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.8 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.5
  Downloading botocore-1.29.5-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 10.0 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 57.0 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any

In [2]:
# Mount Google Drive first: every path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder at the front of sys.path so the project-local `utils` module
# imported below can be found. This must run before the `from utils import ...` line.
import sys
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Capstone')

import os

from utils import read_conll_file, read_data


# Root of the dataset on Drive, plus the sub-directories derived from it.
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
# NOTE(review): despite its name, this points at the "unlabeled" sub-directory — confirm intent.
labeled_dir = os.path.join(data_dir, "unlabeled")
# Directory where the model checkpoint is stored.
model_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/model"

Mounted at /content/drive


In [3]:
# Train / dev / test CoNLL files of the WSJ (source-domain) split, all under wsj_dir.
wsj_train_file, wsj_dev_file, wsj_test_file = (
    os.path.join(wsj_dir, conll_name)
    for conll_name in ("gweb-wsj-train.conll", "gweb-wsj-dev.conll", "gweb-wsj-test.conll")
)

In [4]:
# read_data comes from the project-local utils module. Judging by the output printed under this
# cell, each call reports the number of samples and the number of tags for the file it loads.
# Each result is unpacked into words, tags and a tag collection (presumably per-sentence lists
# for the first two — verify against utils.read_data).
wsj_train_word_lst, wsj_train_tag_lst, wsj_train_tag_set = read_data(wsj_train_file)
wsj_dev_word_lst, wsj_dev_tag_lst, wsj_dev_tag_set = read_data(wsj_dev_file)
wsj_test_word_lst, wsj_test_tag_lst, wsj_test_tag_set = read_data(wsj_test_file)

The number of samples: 30060
The number of tags 48
The number of samples: 1336
The number of tags 45
The number of samples: 1640
The number of tags 45


In [5]:
# import random

# random.seed(0)
# random.shuffle(wsj_train_word_lst)
# random.seed(0)
# random.shuffle(wsj_train_tag_lst)

# labeled_train_words = wsj_train_word_lst[:10000]
# labeled_train_tags = wsj_train_tag_lst[:10000]
# unlabeled_words = wsj_train_word_lst[10000:]
# unlabeled_tags = wsj_train_tag_lst[10000:]

# print(len(labeled_train_words))
# print(len(unlabeled_words))

In [6]:
# Tag vocabulary: the sorted union of the tags seen in the three WSJ splits, with "<pad>"
# placed first so that it always maps to index 0.
wsj_tags = ["<pad>", *sorted({*wsj_train_tag_set, *wsj_dev_tag_set, *wsj_test_tag_set})]

# Lookup tables in both directions between a tag string and its integer index.
tag2idx = dict(zip(wsj_tags, range(len(wsj_tags))))
idx2tag = dict(enumerate(wsj_tags))
print(len(wsj_tags))

49


# Build Model

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [8]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
# Word-piece tokenizer for the cased BERT-base vocabulary; do_lower_case=False so that the
# tokenizer is not asked to lowercase its input.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 20770577.19B/s]


In [10]:
class PosDataset(data.Dataset):
    """POS-tagging dataset that aligns word-level tags with BERT word pieces.

    Every sentence is wrapped in "[CLS]" / "[SEP]" and its tag sequence in "<pad>" / "<pad>".
    Relies on the module-level ``tokenizer`` and ``tag2idx``.

    Args:
        word_lst: list of sentences, each a list of word strings.
        tag_lst: list of tag sequences parallel to ``word_lst``.
    """

    def __init__(self, word_lst, tag_lst):
        sents, tags_li = [], [] # list of lists
        for i in range(len(word_lst)):
            sents.append(["[CLS]"] + word_lst[i] + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Encode one sentence.

        Returns:
            A tuple ``(words, x, is_heads, tags, y, seqlen)`` where ``words`` / ``tags`` are the
            space-joined word and tag strings, ``x`` the word-piece ids, ``is_heads`` a 0/1 flag
            per piece (1 for the first piece of a word), ``y`` the tag index per piece
            ("<pad>" for non-first pieces) and ``seqlen`` the number of pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            if w in ("[CLS]", "[SEP]"):
                tokens = [w]
            else:
                # The tokenizer may return no pieces at all for a word (e.g. when it strips
                # every character). Such a word would still receive a head flag and a tag but
                # no id, breaking the alignment checked below, so stand in an unknown token.
                tokens = tokenizer.tokenize(w) or ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [11]:
def pad(batch):
    """Collate function: right-pad ids and tag indices to the longest sample in the batch.

    Each sample is a tuple ``(words, x, is_heads, tags, y, seqlen)``. Returns the same six
    fields batched; ``x`` and ``y`` become LongTensors, the other fields stay Python lists.
    """
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    def right_pad(field):
        # 0 is the fill value for both the id field and the tag-index field (0: <pad>).
        return [sample[field] + [0] * (longest - len(sample[field])) for sample in batch]

    x = torch.LongTensor(right_pad(1))
    y = torch.LongTensor(right_pad(-2))
    return words, x, is_heads, tags, y, seqlens

In [12]:
from pytorch_pretrained_bert import BertModel

In [13]:
class Net(nn.Module):
    """Pretrained BERT encoder followed by a linear layer applied at every position.

    Args:
        vocab_size: width of the output layer, i.e. the number of classes to score per token
            (the notebook passes ``len(tag2idx)``).
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # 768 is the input width of the head (presumably the BERT-base hidden size).
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y, y_hat): per-position class scores, the labels moved to the
        compute device, and the arg-max class per position.
        '''
        x, y = x.to(device), y.to(device)

        # Keep the encoder in the same mode as this module; gradients through the encoder are
        # only switched off explicitly when not training.
        self.bert.train(self.training)
        if self.training:
            encoded_layers, _ = self.bert(x)
        else:
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
        last_layer = encoded_layers[-1]

        logits = self.fc(last_layer)
        return logits, y, logits.argmax(-1)

In [14]:
def train(model, iterator, optimizer, criterion):
    """Make one pass over ``iterator``, taking an optimizer step after every batch.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(N*T, VOCAB)`` logits and ``(N*T,)`` targets.
    """
    model.train()
    for step, (_words, x, _is_heads, _tags, y, _seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        # Collapse batch and time so the criterion sees one row per position.
        num_classes = logits.shape[-1]
        loss = criterion(logits.view(-1, num_classes), y.view(-1))
        loss.backward()
        optimizer.step()

        if step % 10 == 0: # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [15]:
def eval(model, iterator, average="macro"):
    """Run ``model`` over ``iterator``, write predictions to ``result`` and report metrics.

    Only the first word piece of every word is scored, and the "[CLS]" / "[SEP]" positions are
    dropped. The file ``result`` (in the working directory) receives one ``word gold pred``
    line per word and a blank line after each sentence. Accuracy and a classification report
    are printed.

    NOTE: this name shadows the built-in ``eval``; it is kept because other cells call it.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``.
        average: averaging mode passed to the scikit-learn precision/recall/F1 functions.

    Returns:
        ``(precision, recall, f1)``.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, _seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # Gold / predicted tags are collected while writing, so the file does not have to be
    # re-opened and re-parsed (the previous version did that twice without closing it).
    true_tags, pred_tags = [], []
    with open("result", 'w', encoding="utf-8") as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # Keep only predictions at word-initial pieces; zip also drops batch padding.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            word_seq, tag_seq = words.split(), tags.split()
            assert len(preds)==len(word_seq)==len(tag_seq)
            # [1:-1] strips the sentence-boundary positions.
            for w, t, p in zip(word_seq[1:-1], tag_seq[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                true_tags.append(t)
                pred_tags.append(p)
            fout.write("\n")

    ## calc metric
    y_true = np.array([tag2idx[t] for t in true_tags])
    y_pred = np.array([tag2idx[p] for p in pred_tags])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)
    # zero_division=0 yields the same numbers as the default (undefined scores count as 0.0)
    # but without emitting an "ill-defined" warning for every affected label.
    print("classification_report", classification_report(y_true, y_pred, zero_division=0))
    precision_value = precision_score(y_true, y_pred, average=average, zero_division=0)
    recall_value = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1_value = f1_score(y_true, y_pred, average=average, zero_division=0)

    return precision_value, recall_value, f1_value

In [16]:
# One output score per entry of tag2idx (which includes the "<pad>" entry).
model = Net(vocab_size=len(tag2idx))
model.to(device)
# Wrap the model for data-parallel execution; from here on `model` is the wrapper and the
# underlying Net is reachable as `model.module`.
model = nn.DataParallel(model)

100%|██████████| 404400730/404400730 [00:14<00:00, 28816658.77B/s]


In [17]:
# Loss and optimizer depend only on the model, so they are set up first.
# ignore_index=0 skips positions labelled "<pad>", which is index 0 in tag2idx.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

# NOTE: `eval_dataset` / `test_iter` are built from the WSJ *test* split.
train_dataset = PosDataset(wsj_train_word_lst, wsj_train_tag_lst)
eval_dataset = PosDataset(wsj_test_word_lst, wsj_test_tag_lst)

# Training batches are reshuffled on every pass; evaluation keeps the file order.
train_iter = data.DataLoader(
    train_dataset, batch_size=8, shuffle=True, num_workers=1, collate_fn=pad
)
test_iter = data.DataLoader(
    eval_dataset, batch_size=8, shuffle=False, num_workers=1, collate_fn=pad
)

In [None]:
# train() makes a single pass over train_iter; eval() then scores the model on test_iter and
# returns (precision, recall, f1), which is what the cell displays.
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 3.9347734451293945
step: 10, loss: 1.762121558189392
step: 20, loss: 0.7181916236877441
step: 30, loss: 0.5616925358772278
step: 40, loss: 0.3763507902622223
step: 50, loss: 0.2633183002471924
step: 60, loss: 0.33471694588661194
step: 70, loss: 0.23860099911689758
step: 80, loss: 0.17271754145622253
step: 90, loss: 0.1582895666360855
step: 100, loss: 0.21905650198459625
step: 110, loss: 0.10516443848609924
step: 120, loss: 0.08706970512866974
step: 130, loss: 0.10375358909368515
step: 140, loss: 0.07010554522275925
step: 150, loss: 0.11012985557317734
step: 160, loss: 0.1452144980430603
step: 170, loss: 0.1952163577079773
step: 180, loss: 0.12142522633075714
step: 190, loss: 0.12944729626178741
step: 200, loss: 0.19011715054512024
step: 210, loss: 0.13491018116474152
step: 220, loss: 0.1767224371433258
step: 230, loss: 0.14211967587471008
step: 240, loss: 0.13849107921123505
step: 250, loss: 0.1435754895210266
step: 260, loss: 0.11594511568546295
step: 270, loss: 0.10954

(0.9397482533696433, 0.9371770472723233, 0.9273419086238314)

# Save Model

In [18]:
# Path of the checkpoint file inside model_dir.
model_file = os.path.join(model_dir, "base_model.pt")

In [19]:
# torch.save(model.state_dict(), model_file)

## Load Model

In [20]:
# Rebuild the architecture, including the DataParallel wrapper: the (commented-out) save cell
# stores the wrapped model's state_dict, so presumably the checkpoint keys expect the wrapper.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)
# map_location remaps the stored tensors onto the device in use, so a checkpoint written in a
# GPU session also loads when `device` has fallen back to the CPU.
model.load_state_dict(torch.load(model_file, map_location=device))
# test_iter is expected to be the WSJ test loader at this point.
wsj_precision_value, wsj_recall_value, wsj_f1_value = eval(model, test_iter)

acc=0.98
classification_report               precision    recall  f1-score   support

           1       1.00      1.00      1.00       178
           2       1.00      1.00      1.00       352
           3       1.00      1.00      1.00      2000
           4       1.00      1.00      1.00        60
           5       1.00      1.00      1.00        60
           6       1.00      1.00      1.00      1613
           7       1.00      1.00      1.00       223
           9       1.00      0.99      1.00       935
          10       0.99      1.00      0.99      1266
          11       1.00      1.00      1.00      3309
          12       1.00      1.00      1.00        46
          13       1.00      0.20      0.33        20
          14       1.00      1.00      1.00       511
          15       0.98      0.99      0.99      4250
          16       0.95      0.94      0.94      2423
          17       0.94      0.95      0.94       139
          18       0.92      0.96      0.94       

In [22]:
# Precision, recall and F1 returned by eval() for the reloaded model (macro-averaged, since
# eval() was called with its default `average`).
print(wsj_precision_value, wsj_recall_value, wsj_f1_value)

0.9397482533696433 0.9371770472723233 0.9273419086238314


In [33]:
# Target-domain names; each is used as a sub-directory of pos_fine and inside the
# gweb-{domain}-dev.conll / gweb-{domain}-test.conll file names.
file_name_lst = ["answers", "emails", "newsgroups", "reviews", "weblogs"]

In [37]:
def filter_tag(process_words, process_tags, label_tags_set=None):
    """Drop every token whose tag is not in ``label_tags_set``.

    Tokens are removed individually; a sentence disappears only when none of its tokens
    survive. The number of remaining sentences is printed.

    Args:
        process_words: list of sentences, each a list of words.
        process_tags: list of tag sequences parallel to ``process_words``.
        label_tags_set: iterable of allowed tags. Defaults to the module-level ``wsj_tags``,
            looked up at call time so that a rebuilt vocabulary is picked up.

    Returns:
        ``(new_words, new_tags)``: the filtered sentences and their tags.
    """
    if label_tags_set is None:
        label_tags_set = wsj_tags
    # A set gives constant-time membership tests instead of scanning a list per token.
    allowed = set(label_tags_set)

    new_words, new_tags = [], []
    for words, tags in zip(process_words, process_tags):
        kept = [(words[i], t) for i, t in enumerate(tags) if t in allowed]
        if kept:
            new_words.append([w for w, _ in kept])
            new_tags.append([t for _, t in kept])
    print("after filter tag", len(new_words))
    return new_words, new_tags

In [38]:
def _evaluate_domain(domain):
    """Evaluate the global ``model`` on one target domain's dev and test files.

    Loaders are local to this function, so the module-level ``test_iter`` (the WSJ test
    loader) is no longer overwritten by the per-domain loop.

    Returns:
        ``(dev_metrics, test_metrics)``, each a ``(precision, recall, f1)`` tuple from eval().
    """
    domain_dir = os.path.join(data_dir, "pos_fine", f"{domain}")
    domain_dev_file = os.path.join(domain_dir, f"gweb-{domain}-dev.conll")
    domain_test_file = os.path.join(domain_dir, f"gweb-{domain}-test.conll")

    domain_dev_word_lst, domain_dev_tag_lst, _ = read_data(domain_dev_file)
    domain_test_word_lst, domain_test_tag_lst, _ = read_data(domain_test_file)

    # Remove tokens whose tag is outside the WSJ tag vocabulary (filter_tag's default).
    domain_dev_word_lst, domain_dev_tag_lst = filter_tag(domain_dev_word_lst, domain_dev_tag_lst)
    domain_test_word_lst, domain_test_tag_lst = filter_tag(domain_test_word_lst, domain_test_tag_lst)

    def _loader(dataset):
        # Evaluation does not need shuffling; a fixed order also keeps the `result` file
        # written by eval() reproducible. The metrics do not depend on the order.
        return data.DataLoader(dataset=dataset,
                               batch_size=8,
                               shuffle=False,
                               num_workers=1,
                               collate_fn=pad)

    dev_metrics = eval(model, _loader(PosDataset(domain_dev_word_lst, domain_dev_tag_lst)))
    test_metrics = eval(model, _loader(PosDataset(domain_test_word_lst, domain_test_tag_lst)))
    return dev_metrics, test_metrics


# Per-domain scores, in the same order as file_name_lst.
dev_precision_value_lst = []
dev_recall_value_lst = []
dev_f1_value_lst = []

test_precision_value_lst = []
test_recall_value_lst = []
test_f1_value_lst = []

for domain in file_name_lst:
    print("\n")
    print("Domain", domain)
    domain_dev_metrics, domain_test_metrics = _evaluate_domain(domain)
    dev_precision_value, dev_recall_value, dev_f1_value = domain_dev_metrics
    test_precision_value, test_recall_value, test_f1_value = domain_test_metrics

    dev_precision_value_lst.append(dev_precision_value)
    dev_recall_value_lst.append(dev_recall_value)
    dev_f1_value_lst.append(dev_f1_value)

    test_precision_value_lst.append(test_precision_value)
    test_recall_value_lst.append(test_recall_value)
    test_f1_value_lst.append(test_f1_value)
  



Domain answers
The number of samples: 1745
The number of tags 49
The number of samples: 1744
The number of tags 50
after filter tag 1713
after filter tag 1723
acc=0.92
classification_report               precision    recall  f1-score   support

           1       0.71      0.94      0.81        16
           2       0.61      0.96      0.74        68
           3       1.00      0.82      0.90       881
           4       0.87      0.98      0.92        81
           5       0.98      0.97      0.97        90
           6       1.00      0.97      0.98      1445
           7       0.21      0.89      0.34        53
           8       0.00      0.00      0.00        11
           9       0.99      0.98      0.99       870
          10       0.86      0.97      0.91       295
          11       0.97      0.99      0.98      2022
          12       0.92      0.93      0.93        87
          13       0.94      0.71      0.81        24
          14       0.59      0.96      0.73        


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



acc=0.92
classification_report               precision    recall  f1-score   support

           1       0.62      1.00      0.76         8
           2       0.52      0.98      0.68        44
           3       1.00      0.87      0.93       987
           4       1.00      0.98      0.99       108
           5       0.99      0.98      0.99       115
           6       1.00      0.97      0.98      1600
           7       0.19      0.69      0.29        48
           8       0.00      0.00      0.00         4
           9       0.99      0.98      0.99      1086
          10       0.90      0.97      0.94       386
          11       0.97      0.99      0.98      2229
          12       0.95      0.95      0.95        61
          13       0.94      0.89      0.91        18
          14       0.33      0.75      0.46        28
          15       0.95      0.97      0.96      2566
          16       0.87      0.88      0.87      1511
          17       0.84      0.89      0.86       


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The number of samples: 2450
The number of tags 48
after filter tag 2427
after filter tag 2402
acc=0.91
classification_report               precision    recall  f1-score   support

           1       0.66      1.00      0.80        55
           2       0.40      0.90      0.55        73
           3       1.00      0.84      0.91      1166
           4       0.90      0.78      0.83       233
           5       0.93      0.79      0.85       234
           6       0.99      0.99      0.99      1566
           7       0.65      0.96      0.77       203
           8       0.00      0.00      0.00         8
           9       1.00      0.98      0.99       751
          10       0.95      0.97      0.96       870
          11       0.98      1.00      0.99      2062
          12       1.00      1.00      1.00        37
          13       1.00      0.20      0.33        15
          14       0.41      1.00      0.58        54
          15       0.97      0.98      0.97      2830
          


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



acc=0.91
classification_report               precision    recall  f1-score   support

           1       0.67      1.00      0.80        35
           2       0.42      0.94      0.58        77
           3       1.00      0.79      0.89      1030
           4       0.90      0.85      0.87       291
           5       0.94      0.84      0.89       294
           6       0.99      0.98      0.98      1570
           7       0.60      0.94      0.73       186
           8       0.00      0.00      0.00        11
           9       1.00      0.99      0.99       689
          10       0.96      0.98      0.97       901
          11       0.98      1.00      0.99      2111
          12       0.98      0.98      0.98        47
          13       0.75      0.69      0.72        13
          14       0.30      1.00      0.46        43
          15       0.98      0.98      0.98      2778
          16       0.85      0.87      0.86      1151
          17       0.93      0.98      0.95       


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




after filter tag 1190
after filter tag 1180
acc=0.93
classification_report               precision    recall  f1-score   support

           1       0.88      0.99      0.93        97
           2       0.66      0.94      0.77       171
           3       1.00      0.87      0.93       994
           4       0.94      0.81      0.87        96
           5       0.93      0.82      0.87       101
           6       1.00      0.99      0.99       927
           7       0.70      0.94      0.81       316
           8       0.00      0.00      0.00         2
           9       0.99      0.99      0.99       585
          10       0.98      0.77      0.86       760
          11       0.99      0.99      0.99      1917
          12       1.00      1.00      1.00        26
          13       1.00      0.88      0.93         8
          14       0.75      0.98      0.85       128
          15       0.97      0.98      0.98      2551
          16       0.91      0.88      0.89      1242
     


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



acc=0.93
classification_report               precision    recall  f1-score   support

           1       1.00      1.00      1.00        16
           2       0.49      0.83      0.61       137
           3       1.00      0.90      0.94       993
           4       0.77      0.59      0.67       113
           5       0.97      0.60      0.74       112
           6       0.99      0.99      0.99       906
           7       0.38      0.85      0.52        89
           8       0.00      0.00      0.00         2
           9       1.00      0.97      0.99       636
          10       0.95      0.98      0.97       413
          11       0.98      1.00      0.99      1804
          12       0.96      1.00      0.98        25
          13       0.75      0.12      0.21        25
          14       0.64      0.98      0.77        87
          15       0.97      0.99      0.98      2256
          16       0.90      0.92      0.91      1285
          17       0.85      0.92      0.88       


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



after filter tag 1905
after filter tag 1906
acc=0.94
classification_report               precision    recall  f1-score   support

           1       0.93      0.96      0.95        27
           2       0.56      0.82      0.67        44
           3       1.00      0.88      0.94       902
           4       1.00      1.00      1.00        59
           5       0.97      1.00      0.98        60
           6       0.99      0.98      0.99      1700
           7       0.19      0.72      0.30        36
           8       0.00      0.00      0.00         5
           9       0.99      0.99      0.99      1096
          10       0.94      0.99      0.97       311
          11       0.98      0.99      0.99      2181
          12       0.91      0.96      0.93        45
          13       0.67      0.33      0.44         6
          14       0.55      0.82      0.66        61
          15       0.96      0.98      0.97      2308
          16       0.92      0.89      0.91      2013
      


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_divisi

acc=0.93
classification_report               precision    recall  f1-score   support

           1       0.83      0.97      0.89        39
           2       0.54      0.94      0.69        49
           3       1.00      0.86      0.92       961
           4       0.99      0.99      0.99        84
           5       0.98      0.99      0.98        93
           6       1.00      0.98      0.99      1717
           7       0.14      0.91      0.25        23
           8       0.00      0.00      0.00         4
           9       1.00      0.98      0.99      1130
          10       0.92      0.99      0.96       330
          11       0.97      0.99      0.98      2258
          12       0.91      0.89      0.90        47
          13       1.00      0.71      0.83         7
          14       0.56      0.92      0.70        49
          15       0.95      0.96      0.96      2440
          16       0.92      0.87      0.90      2019
          17       0.86      0.83      0.85       


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



The number of samples: 1015
The number of tags 49
after filter tag 1016
after filter tag 974
acc=0.96
classification_report               precision    recall  f1-score   support

           1       0.43      1.00      0.60         6
           2       0.59      0.91      0.71       161
           3       1.00      0.95      0.98      1126
           4       0.97      0.95      0.96        63
           5       0.92      0.95      0.94        63
           6       1.00      1.00      1.00       952
           7       0.42      0.90      0.57        29
           8       0.00      0.00      0.00         8
           9       1.00      0.97      0.99       784
          10       0.95      0.96      0.96       388
          11       0.98      1.00      0.99      2261
          12       0.92      1.00      0.96        55
          13       0.50      0.18      0.27        11
          14       0.91      0.98      0.94       169
          15       0.98      0.99      0.99      2970
          1


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



acc=0.95
classification_report               precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       0.58      0.89      0.70       138
           3       1.00      0.88      0.94       938
           4       0.95      0.94      0.94        77
           5       0.97      0.94      0.96        80
           6       0.99      0.98      0.99       887
           7       0.41      0.94      0.57        89
           8       0.00      0.00      0.00         2
           9       0.99      0.98      0.99       597
          10       0.97      1.00      0.98       259
          11       0.99      1.00      0.99      1860
          12       0.94      1.00      0.97        31
          13       0.80      0.44      0.57         9
          14       0.90      0.99      0.94       169
          15       0.98      0.99      0.98      2324
          16       0.92      0.90      0.91      1296
          17       0.80      0.90      0.85       


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [39]:
import pandas as pd

In [40]:
# Dev-set precision per target domain, in the same order as file_name_lst.
dev_precision_value_lst

[0.8222879010077336,
 0.840899106124663,
 0.8581409423386496,
 0.8194049426170883,
 0.8585879743872378]

In [41]:
# dev_metric = pd.DataFrame({
#     "domain": file_name_lst * 3,
#     "metric": ["precision"]*5 + ["recall"]*5 + ["f1"]*5,
#     "value": dev_precision_value_lst + dev_recall_value_lst + dev_f1_value_lst,
# })

# Long-format table with one row per (domain, metric) pair. Rows are grouped by metric
# (precision, recall, f1); each group of six starts with the WSJ test score and is followed by
# the five target domains in file_name_lst order, so the three columns stay aligned.
test_metric = pd.DataFrame({
    "domain": (["wsj_test"] + file_name_lst) * 3,
    "metric": ["precision"]*6 + ["recall"]*6 + ["f1"]*6,
    "value": [wsj_precision_value] + test_precision_value_lst + [wsj_recall_value] + test_recall_value_lst + [wsj_f1_value] + test_f1_value_lst,
})

In [42]:
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [43]:
# fig = px.line(dev_metric, x="domain", y="value", color='metric', markers=True)
# fig.show()

In [44]:
# Test-set scores across domains: domain on the x axis, one coloured line per metric.
fig = px.line(test_metric, x="domain", y="value", color='metric', markers=True)
fig.show()