# Load Data

In [None]:
! pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive; the data and model paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

In [None]:
# Root of the corpus on the mounted Drive; the sub-directories below are joined onto it.
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
answer_dir = os.path.join(data_dir, "pos_fine", "answers")
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
# NOTE(review): this variable is named `labeled_dir` but points at the "unlabeled" folder — confirm which is intended.
labeled_dir = os.path.join(data_dir, "unlabeled")

# Directory that holds the saved model checkpoints.
model_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/model"

In [None]:
import codecs

In [None]:
def read_conll_file(file_name, raw=False):
    """
    Read a CoNLL-style file and yield one sentence at a time.

    Expected layout (one token per line, tab-separated, blank line between sentences):
        word1    tag1
        ...      ...
        wordN    tagN

    :param file_name: path of the UTF-8 encoded file to read
    :param raw: if True the file is raw text with one sentence per line; each line is
                split on whitespace and every token receives the label 'DUMMY'
    :return: generator of instances ((list of words, list of tags) pairs)
    :raises IOError: if a non-empty line is not exactly two tab-separated fields
    """
    current_words = []
    current_tags = []

    # `with` guarantees the handle is closed even if the generator is abandoned early.
    with codecs.open(file_name, encoding='utf-8') as conll_file:
        for line_number, line in enumerate(conll_file, start=1):
            # Strip only the line terminator (never inner whitespace such as tabs).
            # Unlike `line[:-1]` this keeps the last character of a final line that has
            # no trailing newline and removes both characters of a "\r\n" ending.
            pieces = line.splitlines()
            line = pieces[0] if pieces else ""

            if line:
                if raw:
                    current_words = line.split()  # simple splitting by whitespace
                    current_tags = ['DUMMY' for _ in current_words]
                    yield (current_words, current_tags)
                else:
                    fields = line.split("\t")
                    if len(fields) == 1:  # a token or a tag is missing
                        raise IOError("Issue with input file - doesn't have a tag or token?")
                    if len(fields) != 2:
                        raise IOError(
                            "erroneous line: {} (line number: {}) ".format(line, line_number)
                        )
                    word, tag = fields
                    current_words.append(word)
                    current_tags.append(tag)
            else:
                # A blank line closes the current sentence (runs of blank lines are skipped).
                if current_words and not raw:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []

    # The last sentence may not be followed by a blank line.
    if current_tags != [] and not raw:
        yield (current_words, current_tags)

In [None]:
# WSJ training and development files (CoNLL format) inside wsj_dir.
wsj_train_file = os.path.join(wsj_dir, "gweb-wsj-train.conll")
wsj_dev_file = os.path.join(wsj_dir, "gweb-wsj-dev.conll")

In [None]:
# read_conll_file yields one (tokens, tags) pair per sentence. Sentences are kept as
# parallel lists, and every tag seen (train + dev) is pooled in wsj_tags.
wsj_train_word_lst, wsj_train_tag_lst, wsj_tags = [], [], []
for sent_tokens, sent_tags in read_conll_file(wsj_train_file):
    wsj_train_word_lst.append(sent_tokens)
    wsj_train_tag_lst.append(sent_tags)
    wsj_tags += sent_tags
print("The number of sentences in wsj train", len(wsj_train_word_lst))

wsj_dev_word_lst, wsj_dev_tag_lst = [], []
for sent_tokens, sent_tags in read_conll_file(wsj_dev_file):
    wsj_dev_word_lst.append(sent_tokens)
    wsj_dev_tag_lst.append(sent_tags)
    wsj_tags += sent_tags
print("The number of sentences in wsj dev", len(wsj_dev_word_lst))
print("The number of tags in wsj", len(set(wsj_tags)))

The number of sentences in wsj train 30060
The number of sentences in wsj dev 1336
The number of tags in wsj 48


In [None]:
import random

In [None]:
# The generator is re-seeded with the same value before each shuffle so that the
# sentence list and the tag list are reordered identically and stay aligned
# (this relies on both lists having the same length).
for parallel_lst in (wsj_train_word_lst, wsj_train_tag_lst):
    random.seed(0)
    random.shuffle(parallel_lst)

In [None]:
# The first 10000 shuffled sentences form the labeled training set; the rest is the
# pool that is treated as unlabeled (its gold tags are kept alongside).
n_labeled = 10000
labeled_train_words, unlabeled_words = wsj_train_word_lst[:n_labeled], wsj_train_word_lst[n_labeled:]
labeled_train_tags, unlabeled_tags = wsj_train_tag_lst[:n_labeled], wsj_train_tag_lst[n_labeled:]

print(len(labeled_train_words))
print(len(unlabeled_words))

10000
20060


In [None]:
# Build the tag vocabulary with "<pad>" pinned to index 0 (the index used for padding
# and ignored by the loss). "<pad>" is removed from the set before sorting so that
# re-running this cell cannot insert it a second time, which would move
# tag2idx["<pad>"] away from 0.
wsj_tags = ["<pad>"] + sorted(set(wsj_tags) - {"<pad>"})
tag2idx = {tag: idx for idx, tag in enumerate(wsj_tags)}
idx2tag = {idx: tag for idx, tag in enumerate(wsj_tags)}
print(len(wsj_tags))

50


# Build Model

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Word-piece tokenizer of the cased BERT checkpoint; lower-casing is switched off to match it.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
class PosDataset(data.Dataset):
    """Sentence-level POS dataset that yields word-piece ids with aligned tag ids.

    Relies on the module-level `tokenizer` (word-piece tokenizer) and `tag2idx`
    (tag -> id mapping that contains "<pad>").
    """

    def __init__(self, word_lst, tag_lst):
        """
        :param word_lst: list of sentences, each a list of word strings
        :param tag_lst: list of tag sequences, parallel to word_lst
        """
        sents, tags_li = [], []  # list of lists
        for i in range(len(word_lst)):
            # Wrap every sentence in the BERT boundary tokens; they carry the "<pad>" tag.
            sents.append(["[CLS]"] + word_lst[i] + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """
        :return: (words, x, is_heads, tags, y, seqlen) where
                 words / tags are the space-joined sentence and tag strings,
                 x is the list of word-piece ids,
                 is_heads marks the first piece of every word with 1 (else 0),
                 y is the list of tag ids (the "<pad>" id on non-first pieces),
                 seqlen is the number of word pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer produced no piece for this word. Without a placeholder
                # x would be one entry short of y / is_heads and the assert below fails.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <pad>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def pad(batch):
    '''Collate samples into a batch, padding ids to the longest sample.

    Each sample is (words, x, is_heads, tags, y, seqlen). The id lists x and y are
    right-padded with 0 (<pad>) and returned as LongTensors; words, is_heads, tags
    and seqlens are returned as plain per-sample lists.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    maxlen = np.array(seqlens).max()

    def _right_padded(field):
        # 0: <pad>
        return [sample[field] + [0] * (maxlen - len(sample[field])) for sample in batch]

    x = torch.LongTensor(_right_padded(1))
    y = torch.LongTensor(_right_padded(-2))

    return words, x, is_heads, tags, y, seqlens

In [None]:
from pytorch_pretrained_bert import BertModel

In [None]:
class Net(nn.Module):
    """BERT encoder followed by a linear layer that scores every tag for every word piece."""

    def __init__(self, vocab_size=None):
        """
        :param vocab_size: number of output classes (size of the tag vocabulary)
        :raises ValueError: if vocab_size is not given
        """
        super().__init__()
        if vocab_size is None:
            # Fail before the pretrained weights are fetched rather than inside nn.Linear.
            raise ValueError("vocab_size (number of tags) must be provided")
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)  # 768: width of the encoder output fed to fc
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64 word-piece ids, id 0 is treated as padding
        y: (N, T). int64 tag ids (only moved to the device and returned)

        Returns (logits, y, y_hat): logits are the per-position tag scores, y_hat their argmax.
        '''
        x = x.to(device)
        y = y.to(device)

        # Mask out padding (id 0) so real tokens do not attend to pad positions;
        # otherwise the encoding of a sentence depends on how much padding its batch has.
        attention_mask = (x != 0).long()

        if self.training:
            self.bert.train()
            encoded_layers, _ = self.bert(x, attention_mask=attention_mask)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            # No gradients are needed for the encoder outside of training.
            with torch.no_grad():
                encoded_layers, _ = self.bert(x, attention_mask=attention_mask)
                enc = encoded_layers[-1]

        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one pass over `iterator`, updating `model` after every batch.

    Each batch is (words, x, is_heads, tags, y, seqlens); only x and y are fed to the
    model. The loss is printed every 10th step.
    """
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, targets, _ = model(x, y)  # logits: (N, T, VOCAB), targets: (N, T)

        # Flatten batch and time so the criterion sees one row per position.
        n_classes = logits.shape[-1]
        loss = criterion(logits.view(-1, n_classes), targets.view(-1))
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
def eval(model, iterator):
    """Tag every batch of `iterator`, write the predictions to "result" and print metrics.

    Each batch is (words, x, is_heads, tags, y, seqlens). Only the prediction at the
    first piece of every word is kept, and the first and last word of each sentence
    (the boundary tokens) are dropped. The file "result" gets one "word gold pred"
    line per word and a blank line after every sentence. Accuracy and a
    classification report over all written words are printed.

    Uses the module-level `idx2tag` / `tag2idx` mappings.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # Gold and predicted ids are collected while writing, so the metrics no longer depend
    # on re-parsing the file (which mis-splits tokens that contain whitespace characters).
    y_true, y_pred = [], []
    with open("result", 'w', encoding="utf-8") as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            # The strings were built with " ".join, so split on exactly that separator;
            # a bare split() would also break tokens on e.g. a non-breaking space.
            word_lst = words.split(" ")
            tag_lst = tags.split(" ")
            assert len(preds)==len(word_lst)==len(tag_lst)
            for w, t, p in zip(word_lst[1:-1], tag_lst[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                y_true.append(tag2idx[t])
                y_pred.append(tag2idx[p])
            fout.write("\n")

    ## calc metric
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if len(y_true) == 0:
        # Nothing was evaluated: report nan instead of dividing by zero.
        print("acc=%.2f"%float("nan"))
        return

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)
    print("classification_report", classification_report(y_true, y_pred))


In [None]:
# One output class per entry of tag2idx.
model = Net(vocab_size=len(tag2idx))
model.to(device)
# From here on `model` is the DataParallel wrapper around the network.
model = nn.DataParallel(model)

In [None]:
# Labeled subset for training, WSJ dev set for evaluation.
train_dataset = PosDataset(labeled_train_words, labeled_train_tags)
eval_dataset = PosDataset(wsj_dev_word_lst, wsj_dev_tag_lst)

# Only the training loader shuffles; both collate with `pad`.
train_iter = data.DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=1, collate_fn=pad)
test_iter = data.DataLoader(eval_dataset, batch_size=8, shuffle=False, num_workers=1, collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Target id 0 (<pad>) does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# One training pass over the labeled subset, then evaluation on the dev set.
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 4.019284725189209
step: 10, loss: 1.9670449495315552
step: 20, loss: 0.6764383912086487
step: 30, loss: 0.3844396770000458
step: 40, loss: 0.2357845902442932
step: 50, loss: 0.2136392891407013
step: 60, loss: 0.1777462214231491
step: 70, loss: 0.18430738151073456
step: 80, loss: 0.3058522641658783
step: 90, loss: 0.18708693981170654
step: 100, loss: 0.11203416436910629
step: 110, loss: 0.16677045822143555
step: 120, loss: 0.09839129447937012
step: 130, loss: 0.16833922266960144
step: 140, loss: 0.1866477131843567
step: 150, loss: 0.07283345609903336
step: 160, loss: 0.1110125333070755
step: 170, loss: 0.09983836114406586
step: 180, loss: 0.241097092628479
step: 190, loss: 0.16661380231380463
step: 200, loss: 0.09665440768003464
step: 210, loss: 0.1402125060558319
step: 220, loss: 0.11045082658529282
step: 230, loss: 0.05402184650301933
step: 240, loss: 0.08019950240850449
step: 250, loss: 0.12376444041728973
step: 260, loss: 0.10420562326908112
step: 270, loss: 0.1099987

# Save Model

In [None]:
# Checkpoint path for the parameters-only (state_dict) save.
model_file1 = os.path.join(model_dir, "base_model1.pt")

In [None]:
# Save only the parameters of `model`.
torch.save(model.state_dict(), model_file1)

In [None]:
# Checkpoint path for the whole-object save.
model_file2 = os.path.join(model_dir, "base_model2.pt")

In [None]:
# Save the whole model object rather than only its state_dict.
torch.save(model, model_file2)

## Load Model

In [None]:
# Fresh network with the same output size, wrapped in DataParallel like the model
# whose state_dict was saved, so the saved parameters can be loaded into it.
base_model1 = Net(vocab_size=len(tag2idx))
base_model1.to(device)
base_model1 = nn.DataParallel(base_model1)

In [None]:
# map_location remaps the stored tensors onto the device of the current runtime, so a
# checkpoint written on a GPU machine can also be restored on a CPU-only one.
base_model1.load_state_dict(torch.load(model_file1, map_location=device))
eval(base_model1, test_iter)

acc=0.97


In [None]:
# Restore the whole saved model object. map_location remaps its tensors onto the device
# of the current runtime so a GPU-written checkpoint also loads on a CPU-only machine.
# NOTE(review): this unpickles arbitrary objects — only load checkpoints you created yourself.
base_model2 = torch.load(model_file2, map_location=device)
eval(base_model2, test_iter)

acc=0.97


# Self Training

In [None]:
# The pool keeps its gold tags, so the ordinary dataset class can be reused.
unlabeled_dataset = PosDataset(unlabeled_words, unlabeled_tags)

# No shuffling: batches follow the order of unlabeled_words.
unlabeled_iter = data.DataLoader(unlabeled_dataset, batch_size=8, shuffle=False, num_workers=1, collate_fn=pad)

In [None]:
# Score the reloaded model on the pool; possible because the pool still carries gold tags.
eval(base_model1, unlabeled_iter)

acc=0.97


## Produce pseudo-labels

In [None]:
# Tag the unlabeled pool with the current model and keep, per sentence,
#   - a confidence score,
#   - the word-piece ids,
#   - the predicted tag ids as pseudo-labels.
model.eval()

Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
LLD = []        # per-sentence confidence: summed log-probability of the predicted tags
new_x_lst = []  # word-piece ids, trimmed to the true (unpadded) length
new_y_lst = []  # pseudo-labels; 0 (<pad>) wherever no tagging decision is made

with torch.no_grad():
    for i, batch in enumerate(unlabeled_iter):
        words, x, is_heads, tags, y, seqlens = batch

        logits, _, y_hat = model(x, y)  # logits: (N, T, VOCAB), y_hat: (N, T)

        # Log-probability of the best tag at every position. Working in log space and
        # summing ranks sentences exactly like the product of probabilities would,
        # without underflow on long sentences.
        max_log_prob = torch.amax(torch.log_softmax(logits, dim=2), dim=2).cpu()  # (N, T)
        y_hat = y_hat.cpu()

        for row, (heads, seqlen) in enumerate(zip(is_heads, seqlens)):
            # Positions that carry a real tagging decision: first piece of each word,
            # excluding the [CLS] / [SEP] boundary tokens. Batch padding, continuation
            # pieces and boundary tokens must neither influence the confidence nor
            # receive a pseudo-label the model would later be trained on.
            scored = torch.tensor(heads, dtype=torch.bool)
            scored[0] = False
            scored[seqlen - 1] = False

            LLD.append(max_log_prob[row, :seqlen][scored].sum().item())
            new_x_lst.append(x[row, :seqlen].tolist())
            new_y_lst.append((y_hat[row, :seqlen] * scored.long()).tolist())

        Words.extend(words)
        Is_heads.extend(is_heads)
        Tags.extend(tags)
        Y.extend(y.numpy().tolist())
        Y_hat.extend(y_hat.numpy().tolist())

In [None]:
# Number of collected confidence values.
len(LLD)

20060

In [None]:
# Indices ordered from the highest to the lowest confidence value. float() turns
# 0-dim tensors into plain numbers so the sort never compares tensors with each other;
# the index in the key breaks ties the same way sorting (score, index) pairs in
# reverse does.
ind = sorted(range(len(LLD)), key=lambda k: (float(LLD[k]), k), reverse=True)

In [None]:
# The 2000 top-ranked sentences become new training data; the rest stays in the pool.
n_selected = 2000
select_ind, not_select_ind = ind[:n_selected], ind[n_selected:]

new_train_x = [new_x_lst[k] for k in select_ind]
new_train_y = [new_y_lst[k] for k in select_ind]

remain_train_x = [new_x_lst[k] for k in not_select_ind]
remain_train_y = [new_y_lst[k] for k in not_select_ind]

In [None]:
# Number of sentences selected as new training data.
len(new_train_x)

2000

In [None]:
# y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
# y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

In [None]:
# ## calc metric
# y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
# y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

# acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

# print("classification_report", classification_report(y_true, y_pred))

In [None]:
class PosDataset_new(data.Dataset):
    """Dataset over two parallel lists of already-prepared sequences.

    Unlike PosDataset nothing is tokenized here: items are returned as stored,
    together with their length.
    """

    def __init__(self, word_lst, tag_lst):
        self.word_lst = word_lst
        self.tag_lst = tag_lst

    def __len__(self):
        return len(self.word_lst)

    def __getitem__(self, idx):
        """Return (sequence, labels, length) for item `idx`; both must be equally long."""
        words = self.word_lst[idx]
        tags = self.tag_lst[idx]
        assert len(words)==len(tags)
        return words, tags, len(words)

In [None]:
def pad_new(batch):
    '''Collate (x, y, seqlen) samples, padding x and y to the longest sample.

    Both id lists are right-padded with 0 (<pad>) and returned as LongTensors,
    followed by the list of per-sample lengths.
    '''
    seqlens = [sample[-1] for sample in batch]
    maxlen = np.array(seqlens).max()

    def _right_padded(field):
        # 0: <pad>
        return [sample[field] + [0] * (maxlen - len(sample[field])) for sample in batch]

    return torch.LongTensor(_right_padded(0)), torch.LongTensor(_right_padded(1)), seqlens

In [None]:
# Selected sentences (ids + pseudo-labels) as a shuffled training loader.
new_train_dataset = PosDataset_new(new_train_x, new_train_y)

new_train_iter = data.DataLoader(new_train_dataset, batch_size=8, shuffle=True, num_workers=1, collate_fn=pad_new)

In [None]:
# Fetch a single batch into `batch_iter` for manual inspection, then stop.
# NOTE(review): this code prints nothing, so any output stored with this cell comes
# from an earlier version of it.
for i, batch_iter in enumerate(new_train_iter):
  break

41
8
8
[39, 34, 39, 36, 36, 41, 39, 40]
56
8
8
[44, 34, 40, 34, 39, 44, 56, 47]


In [None]:
# New Adam optimizer over the current model's parameters for the pseudo-label round;
# target id 0 does not contribute to the loss.
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
def train_new(model, iterator, optimizer, criterion):
    """Run one pass over `iterator`, whose batches are (x, y, seqlens) triples.

    The model is updated after every batch and the loss is printed every 10th step.
    """
    model.train()
    for step, (x, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, targets, _ = model(x, y)  # logits: (N, T, VOCAB), targets: (N, T)

        # One row per position for the criterion.
        flat_logits = logits.view(-1, logits.shape[-1])  # (N*T, VOCAB)
        flat_targets = targets.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [None]:
# One pass over the selected pseudo-labeled sentences, continuing from the current weights of `model`.
train_new(model, new_train_iter, optimizer, criterion)

step: 0, loss: 0.015222178772091866
step: 10, loss: 0.008685328997671604
step: 20, loss: 0.071180559694767
step: 30, loss: 0.010449478402733803
step: 40, loss: 0.03320135921239853
step: 50, loss: 0.016082294285297394
step: 60, loss: 0.03760402277112007
step: 70, loss: 0.00989021547138691
step: 80, loss: 0.042467646300792694
step: 90, loss: 0.021140089258551598
step: 100, loss: 0.015271157026290894
step: 110, loss: 0.028544079512357712
step: 120, loss: 0.055811602622270584
step: 130, loss: 0.01912704110145569
step: 140, loss: 0.018367575481534004
step: 150, loss: 0.01601603627204895
step: 160, loss: 0.01235022209584713
step: 170, loss: 0.020826663821935654
step: 180, loss: 0.017045238986611366
step: 190, loss: 0.012764754705131054
step: 200, loss: 0.06760004907846451
step: 210, loss: 0.058981966227293015
step: 220, loss: 0.021801872178912163
step: 230, loss: 0.019672349095344543
step: 240, loss: 0.03009956143796444
