# Load Data

In [None]:
! pip install pytorch_pretrained_bert
! pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 11.4 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.26.20-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 36.8 MB/s 
Collecting botocore<1.30.0,>=1.29.20
  Downloading botocore-1.29.20-py3-none-any.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 17.5 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 3.0 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 59.7 MB/s 
  Downloading urllib3-1.25.11-py2.py3-non

In [None]:
from google.colab import drive
# Mount Google Drive so the project folder referenced below is reachable.
drive.mount('/content/drive')

import sys
# Put the project folder first on the import path
# (presumably where the local `utils` module imported below lives — confirm).
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Capstone')

import os
import pandas as pd
import numpy as np

from utils import read_conll_file, read_data

from torchmetrics.functional.classification import multiclass_f1_score, multiclass_precision, multiclass_recall, multiclass_accuracy

# Dataset root on the mounted Drive, and the "pos_fine/wsj" sub-folder inside it.
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
# Folder that holds the saved model weights (see `model_file` further down).
model_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/model"

Mounted at /content/drive


In [None]:
# Train / dev / test CoNLL files of the WSJ domain, all located in `wsj_dir`.
wsj_train_file, wsj_dev_file, wsj_test_file = (
    os.path.join(wsj_dir, conll_name)
    for conll_name in ("gweb-wsj-train.conll", "gweb-wsj-dev.conll", "gweb-wsj-test.conll")
)

In [None]:
# `read_data` comes from the project-local `utils` module. Each call is unpacked
# into the sentences (word lists), the parallel tag lists, and the tags found in
# that file.
# NOTE(review): the exact return types are not visible here; the tag "sets" are
# concatenated with `+` in the next cell, so they are presumably lists.
wsj_train_word_lst, wsj_train_tag_lst, wsj_train_tag_set = read_data(wsj_train_file)
wsj_dev_word_lst, wsj_dev_tag_lst, wsj_dev_tag_set = read_data(wsj_dev_file)
wsj_test_word_lst, wsj_test_tag_lst, wsj_test_tag_set = read_data(wsj_test_file)

The number of samples: 30060
The number of tags 48
The number of samples: 1336
The number of tags 45
The number of samples: 1640
The number of tags 45


In [None]:
# Tag inventory: the union of the tags of all three WSJ splits in sorted order,
# with "<pad>" placed in front so that it receives index 0.
wsj_tags = ["<pad>"] + sorted(set(wsj_train_tag_set + wsj_dev_tag_set + wsj_test_tag_set))

# Lookup tables in both directions.
tag2idx = {tag: position for position, tag in enumerate(wsj_tags)}
idx2tag = dict(enumerate(wsj_tags))
print(len(wsj_tags))

49


# Build Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Tokenizer for the cased BERT vocabulary; do_lower_case=False leaves the input casing untouched.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 875729.65B/s]


In [None]:
class PosDataset(data.Dataset):
    """Sentence-level POS dataset that yields word-piece ids with aligned tag ids.

    Every sentence is wrapped in ``[CLS]`` / ``[SEP]``, both tagged ``<pad>``.
    Items are produced with the notebook-level ``tokenizer`` and ``tag2idx``.

    Args:
        word_lst: list of sentences, each a list of word strings.
        tag_lst: list of tag sequences, parallel to ``word_lst``.
    """

    def __init__(self, word_lst, tag_lst):
        sents, tags_li = [], []  # list of lists
        for i, sentence in enumerate(word_lst):
            sents.append(["[CLS]"] + sentence + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        """Return ``(words, x, is_heads, tags, y, seqlen)`` for sentence ``idx``.

        ``words`` / ``tags`` are the space-joined sentence and tag strings,
        ``x`` the word-piece ids, ``is_heads`` a 0/1 list marking the first
        piece of each word, ``y`` the tag ids (``<pad>`` on non-first pieces)
        and ``seqlen`` the number of word pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for word, tag in zip(words, tags):
            tokens = tokenizer.tokenize(word) if word not in ("[CLS]", "[SEP]") else [word]
            if not tokens:
                # The tokenizer may return no pieces for a word. Without a piece,
                # x would gain no id while y / is_heads still gain one entry and
                # the length check below would fail, so stand in with [UNK].
                tokens = ["[UNK]"]
            piece_ids = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)

            piece_tags = [tag] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            tag_ids = [tag2idx[each] for each in piece_tags]  # (T,)

            x.extend(piece_ids)
            is_heads.extend(is_head)
            y.extend(tag_ids)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def pad(batch):
    """Collate ``PosDataset`` samples, padding id lists to the longest sample.

    Each sample is ``(words, x, is_heads, tags, y, seqlen)``. ``x`` and ``y``
    are right-padded with 0 (the ``<pad>`` id) and returned as LongTensors;
    the remaining fields are returned as plain per-sample lists.
    """
    def column(position):
        return [sample[position] for sample in batch]

    seqlens = column(-1)
    longest = np.array(seqlens).max()

    def padded(position):
        # 0: <pad>
        return [sample[position] + [0] * (longest - len(sample[position])) for sample in batch]

    x_ids = torch.LongTensor(padded(1))
    y_ids = torch.LongTensor(padded(-2))
    return column(0), x_ids, column(2), column(3), y_ids, seqlens

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    """Pretrained ``bert-base-cased`` encoder followed by a per-token linear tag scorer.

    Args:
        vocab_size: number of output classes of the linear layer (the callers
            in this notebook pass ``len(tag2idx)``).
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # 768 is the width of the encoder output that is fed into `fc`.
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y, y_hat): the per-token scores, the labels moved to
        `device`, and the argmax of the scores over the last dimension.
        '''
        x, y = x.to(device), y.to(device)

        # Keep the encoder in the same mode as this module; outside training
        # the encoder pass runs without gradient tracking.
        self.bert.train(self.training)
        if self.training:
            encoded_layers, _ = self.bert(x)
        else:
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
        enc = encoded_layers[-1]

        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, taking an optimizer step on every batch.

    Batches have the layout produced by ``pad``:
    ``(words, x, is_heads, tags, y, seqlens)``; only ``x`` and ``y`` are used.
    The loss is printed every 10th step.
    """
    model.train()
    for step, (_, x, _, _, y, _) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        # Flatten batch and time so the criterion sees one row per token.
        flat_logits = logits.view(-1, logits.shape[-1])  # (N*T, VOCAB)
        flat_y = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_y)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator, average="weighted"):
    """Score ``model`` on ``iterator`` and return (precision, recall, f1, accuracy).

    Predictions and labels of all batches are flattened into two flat lists and
    handed to the torchmetrics multiclass functions with
    ``num_classes=len(wsj_tags)`` and ``ignore_index=0``.

    Args:
        model: module whose call returns ``(logits, y, y_hat)``.
        iterator: yields batches shaped like the output of ``pad``.
        average: averaging mode forwarded to every metric.
    """
    model.eval()

    pred_lst, true_lst = [], []
    with torch.no_grad():
        for _, x, _, _, y, _ in iterator:
            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            for row in y_hat.cpu().numpy().tolist():
                pred_lst += row
            for row in y.numpy().tolist():
                true_lst += row

    def score(metric):
        # Every metric gets freshly built tensors and identical settings.
        return metric(
            torch.tensor(pred_lst), torch.tensor(true_lst), num_classes=len(wsj_tags),
            ignore_index=0, average=average)

    precision_value = score(multiclass_precision)
    recall_value = score(multiclass_recall)
    f1_value = score(multiclass_f1_score)
    acc = score(multiclass_accuracy)
    return precision_value, recall_value, f1_value, acc

In [None]:
# One output class per entry of tag2idx (the "<pad>" entry included).
model = Net(vocab_size=len(tag2idx))
model.to(device)
# From here on `model` is the DataParallel wrapper, not the bare Net.
model = nn.DataParallel(model)

100%|██████████| 404400730/404400730 [00:14<00:00, 27697426.09B/s]


In [None]:
# WSJ training split, plus the WSJ *test* split (named `eval_dataset` here).
train_dataset = PosDataset(wsj_train_word_lst, wsj_train_tag_lst)
eval_dataset = PosDataset(wsj_test_word_lst, wsj_test_tag_lst)

# Settings shared by both loaders; only the shuffling differs.
loader_kwargs = dict(batch_size=8, num_workers=1, collate_fn=pad)
train_iter = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
test_iter = data.DataLoader(dataset=eval_dataset, shuffle=False, **loader_kwargs)

optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Index 0 is "<pad>", so those positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# train(model, train_iter, optimizer, criterion)
# eval(model, test_iter)

# Save Model

In [None]:
# Checkpoint path inside `model_dir`.
model_file = os.path.join(model_dir, "base_model.pt")
# Saving is disabled; the "Load Model" cell below reads this file, so it must already exist.
# torch.save(model.state_dict(), model_file)

## Load Model

In [None]:
# Rebuild the network exactly as it was built for training (Net wrapped in
# DataParallel) and load the stored weights into it.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)
# map_location makes the checkpoint loadable on the CPU fallback of `device`
# even when it was written from a GPU runtime.
model.load_state_dict(torch.load(model_file, map_location=device))

# Source-domain (WSJ test split) scores of the loaded model.
wsj_precision_value, wsj_recall_value, wsj_f1_value, wsj_acc_value = eval(model, test_iter)
print(wsj_precision_value, wsj_recall_value, wsj_f1_value, wsj_acc_value)

tensor(0.9771) tensor(0.9743) tensor(0.9751) tensor(0.9743)


# Self Training

In [None]:
def filter_tag(process_words, process_tags, label_tags_set=wsj_tags):
    """Keep only the tokens whose tag appears in ``label_tags_set``.

    Sentences that lose all of their tokens are dropped. The number of
    surviving sentences is printed.

    Args:
        process_words: list of sentences (lists of words).
        process_tags: list of tag sequences, parallel to ``process_words``.
        label_tags_set: collection of accepted tags.

    Returns:
        ``(new_words, new_tags)`` — the filtered sentences and their tags.
    """
    new_words, new_tags = [], []
    for words, tags in zip(process_words, process_tags):
        # Positions of the tokens that carry an accepted tag.
        keep = [pos for pos, tag in enumerate(tags) if tag in label_tags_set]
        if not keep:
            continue
        new_words.append([words[pos] for pos in keep])
        new_tags.append([tags[pos] for pos in keep])
    print("after filter tag", len(new_words))
    return new_words, new_tags

In [None]:
# Presumably the available target domains; only "emails" is selected in the next cell.
file_name_lst = ["answers", "emails", "newsgroups", "reviews", "weblogs"]

In [None]:
# Target domain for self-training; its dev / test files follow the
# "gweb-<domain>-<split>.conll" naming inside "pos_fine/<domain>".
domain = "emails"
domain_dir = os.path.join(data_dir, "pos_fine", f"{domain}")
domain_dev_file = os.path.join(domain_dir, f"gweb-{domain}-dev.conll")
domain_test_file = os.path.join(domain_dir, f"gweb-{domain}-test.conll")

In [None]:
# Read the target-domain dev / test files.
domain_dev_word_lst, domain_dev_tag_lst, domain_dev_tag_set = read_data(domain_dev_file)
domain_test_word_lst, domain_test_tag_lst, domain_test_tag_set = read_data(domain_test_file)
# Drop tokens whose tag is not part of the WSJ tag inventory (filter_tag's default),
# since tag2idx has no entry for them.
domain_dev_word_lst, domain_dev_tag_lst = filter_tag(domain_dev_word_lst, domain_dev_tag_lst)  
domain_test_word_lst, domain_test_tag_lst = filter_tag(domain_test_word_lst, domain_test_tag_lst)

The number of samples: 2450
The number of tags 49
The number of samples: 2450
The number of tags 48
after filter tag 2427
after filter tag 2402


In [None]:
# Target-domain test metrics, one entry appended per evaluation.
(
    domain_precision_value_lst,
    domain_recall_value_lst,
    domain_f1_value_lst,
    domain_acc_value_lst,
) = [], [], [], []

In [None]:
# Target-domain test set; this loader is reused after every self-training round.
domain_test_dataset = PosDataset(domain_test_word_lst, domain_test_tag_lst)

domain_test_iter = data.DataLoader(dataset=domain_test_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

# Baseline: score the loaded model on the target domain before any self-training.
domain_precision_value, domain_recall_value, domain_f1_value, domain_acc_value = eval(model, domain_test_iter)

# Entry 0 of each history list is therefore the baseline.
domain_precision_value_lst.append(domain_precision_value)
domain_recall_value_lst.append(domain_recall_value)
domain_f1_value_lst.append(domain_f1_value)
domain_acc_value_lst.append(domain_acc_value)

In [None]:
class PosDataset_new(data.Dataset):
    """Dataset over two parallel lists of sequences, returned without any conversion.

    In this notebook it wraps the id lists produced by ``gen_pseudo_data``;
    ``pad_new`` is the matching collate function.
    """

    def __init__(self, word_lst, tag_lst):
        self.word_lst = word_lst
        self.tag_lst = tag_lst

    def __len__(self):
        return len(self.word_lst)

    def __getitem__(self, idx):
        """Return ``(words, tags, seqlen)`` for item ``idx``."""
        sample_words = self.word_lst[idx]
        sample_tags = self.tag_lst[idx]
        assert len(sample_words) == len(sample_tags)
        return sample_words, sample_tags, len(sample_words)

def pad_new(batch):
    """Collate ``PosDataset_new`` samples, padding both id lists to the longest sample.

    Each sample is ``(x, y, seqlen)``. ``x`` and ``y`` are right-padded with 0
    (the ``<pad>`` id) and returned as LongTensors together with the lengths.
    """
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    def padded(position):
        # 0: <pad>
        return [sample[position] + [0] * (longest - len(sample[position])) for sample in batch]

    return torch.LongTensor(padded(0)), torch.LongTensor(padded(1)), seqlens

def train_new(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, taking an optimizer step on every batch.

    Batches have the layout produced by ``pad_new``: ``(x, y, seqlens)``.
    The loss is printed every 10th step.
    """
    model.train()
    for step, (x, y, _) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        # Flatten batch and time so the criterion sees one row per token.
        flat_logits = logits.view(-1, logits.shape[-1])  # (N*T, VOCAB)
        flat_y = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_y)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def gen_pseudo_data(model, domain_dev_iter, topn=300, initial=True):
    """Pseudo-label every sentence of ``domain_dev_iter`` and split off the most confident ones.

    For each sentence the confidence is the mean, over the positions whose
    reference tag is not ``<pad>`` (id 0), of the highest softmax probability.
    Sentences are ranked by that confidence (descending) and the first ``topn``
    are returned as new training data; the rest is returned as the remaining pool.

    Pseudo-labels are set to 0 wherever the reference tag is 0 (padding,
    non-first word pieces, ``[CLS]`` / ``[SEP]``). That way later rounds, which
    receive these pseudo-labels back as ``y``, still have zeros for their
    ``y.bool()`` mask and for ``ignore_index=0`` to act on.

    Args:
        model: module whose call returns ``(logits, y, y_hat)``.
        domain_dev_iter: batch iterator. With ``initial=True`` batches have the
            ``pad`` layout ``(words, x, is_heads, tags, y, seqlens)``; otherwise
            the ``pad_new`` layout ``(x, y, seqlens)``.
        topn: number of sentences to select.
        initial: selects which batch layout to unpack.

    Returns:
        ``(new_train_x, new_train_y, remain_train_x, remain_train_y,
        new_acc, remain_acc, new_prob, remain_prob)`` — id lists, per-sentence
        accuracies and mean probabilities for the selected / remaining sentences.
        The accuracy compares the prediction with whatever ``y`` the iterator
        supplies, so after the first round it is measured against the
        pseudo-labels carried over from earlier rounds.
    """
    model.eval()

    mean_probs = []
    new_x_lst = []
    new_y_lst = []
    acc_lst = []

    with torch.no_grad():
        for batch in domain_dev_iter:
            # The two batch layouts differ only in where x and y sit.
            if initial:
                _, x, _, _, y, _ = batch
            else:
                x, y, _ = batch

            logits, _, y_hat = model(x, y)  # y_hat: (N, T)

            y = y.to(device)
            y_hat = y_hat.to(device)

            # When calculating the length of sentences, ignore <pad>
            mask = y.bool()
            # clamp keeps the division finite if a sentence has no tagged position
            sen_len = mask.sum(dim=1).clamp(min=1)

            # Rank by mean probability
            max_prob = torch.amax(torch.softmax(logits, dim=2), dim=2).to(device)
            mean_prob = (mask * max_prob).sum(dim=1) / sen_len
            mean_probs.extend(mean_prob.tolist())

            # Save prediction as new training dataset (0 where there is no reference tag)
            new_x_lst.extend(x.tolist())
            new_y_lst.extend((y_hat * mask).tolist())

            # Calculate the accuracy for each sentence, ignore 0.
            # y_hat and y are already tensors, so they are passed as they are.
            batch_acc = multiclass_accuracy(
                y_hat, y, num_classes=len(wsj_tags),
                ignore_index=0, average="micro", multidim_average="samplewise")
            acc_lst.extend(batch_acc.tolist())

    # Most confident first; ties are ordered by descending sentence index.
    order = sorted(range(len(mean_probs)), key=lambda k: (mean_probs[k], k), reverse=True)
    select_ind = order[:topn]  # The index of topn sentences
    not_select_ind = order[topn:]

    new_train_x = [new_x_lst[k] for k in select_ind]
    new_train_y = [new_y_lst[k] for k in select_ind]

    remain_train_x = [new_x_lst[k] for k in not_select_ind]
    remain_train_y = [new_y_lst[k] for k in not_select_ind]

    new_prob = [mean_probs[k] for k in select_ind]
    remain_prob = [mean_probs[k] for k in not_select_ind]
    new_acc = [acc_lst[k] for k in select_ind]
    remain_acc = [acc_lst[k] for k in not_select_ind]

    return new_train_x, new_train_y, remain_train_x, remain_train_y, new_acc, remain_acc, new_prob, remain_prob

In [None]:
# Per-round accuracy / confidence of the sentences selected in that round.
acc_lst = []
prob_lst = []

top_percent = 0.05
# Select at least one sentence per round: for a small pool the truncated
# product is 0, in which case nothing would be selected and the pool below
# would never shrink.
topn = max(1, int(top_percent * len(domain_dev_word_lst)))

i = 0
# Each round moves the `topn` most confident sentences out of the pool and
# fine-tunes on them; stop once fewer than `topn` sentences are left.
while len(domain_dev_word_lst) >= topn:
    i += 1
    print("\nLoop", i)
    print("domain_dev_word_lst", len(domain_dev_word_lst))

    # Round 1 starts from raw words / tags; later rounds start from the id
    # lists that gen_pseudo_data handed back as the remaining pool.
    initial = i == 1
    if initial:
        domain_dev_dataset = PosDataset(domain_dev_word_lst, domain_dev_tag_lst)

        domain_dev_iter = data.DataLoader(dataset=domain_dev_dataset,
                                          batch_size=8,
                                          shuffle=False,
                                          num_workers=1,
                                          collate_fn=pad)
    else:
        domain_dev_dataset = PosDataset_new(domain_dev_word_lst, domain_dev_tag_lst)

        domain_dev_iter = data.DataLoader(dataset=domain_dev_dataset,
                                          batch_size=8,
                                          shuffle=True,
                                          num_workers=1,
                                          collate_fn=pad_new)

    # The remaining pool replaces domain_dev_word_lst / domain_dev_tag_lst.
    (new_train_x, new_train_y, domain_dev_word_lst, domain_dev_tag_lst,
     new_acc, remain_acc, new_prob, remain_prob) = gen_pseudo_data(model, domain_dev_iter, topn, initial)

    new_train_dataset = PosDataset_new(new_train_x, new_train_y)

    new_train_iter = data.DataLoader(dataset=new_train_dataset,
                                     batch_size=8,
                                     shuffle=True,
                                     num_workers=1,
                                     collate_fn=pad_new)

    # A fresh optimizer (and therefore fresh Adam state) is created every round.
    optimizer = optim.Adam(model.parameters(), lr = 0.00001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    train_new(model, new_train_iter, optimizer, criterion)

    # Track target-domain test performance after this round.
    domain_precision_value, domain_recall_value, domain_f1_value, domain_acc_value = eval(model, domain_test_iter)

    domain_precision_value_lst.append(domain_precision_value)
    domain_recall_value_lst.append(domain_recall_value)
    domain_f1_value_lst.append(domain_f1_value)
    domain_acc_value_lst.append(domain_acc_value)

    acc_lst.append(new_acc)
    prob_lst.append(new_prob)



Loop 1
domain_dev_word_lst 2427


  torch.tensor(y_hat).to(device), torch.tensor(y).to(device), num_classes=len(wsj_tags),


step: 0, loss: 0.38303402066230774
step: 10, loss: 0.08681316673755646

Loop 2
domain_dev_word_lst 2306


  torch.tensor(y_hat).to(device), torch.tensor(y).to(device), num_classes=len(wsj_tags),


KeyboardInterrupt: ignored

In [None]:
# Target-domain test metrics: one entry for the baseline, then one per completed round.
print(domain_precision_value_lst)
print(domain_recall_value_lst)
print(domain_f1_value_lst)
print(domain_acc_value_lst)

# Per-round accuracy / mean probability of the sentences selected in that round.
print(acc_lst)
print(prob_lst)

In [None]:
import pandas as pd

In [None]:
# Long-format table for plotting: one row per (loop, metric) pair.
# The history lists hold 0-d tensors as returned by eval(); convert them to
# plain floats so the "value" column is numeric.
n_evals = len(domain_precision_value_lst)
test_metric = pd.DataFrame({
    "Loop": list(range(n_evals)) * 3,
    "metric": ["precision"] * n_evals + ["recall"] * n_evals + ["f1"] * n_evals,
    "value": [
        float(value)
        for value in domain_precision_value_lst + domain_recall_value_lst + domain_f1_value_lst
    ],
})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# One line per metric across the loops (loop 0 is the baseline evaluation).
fig = px.line(test_metric, x="Loop", y="value", color='metric', markers=True)
fig.show()