# Load Data

In [1]:
! pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 8.1 MB/s 
Collecting boto3
  Downloading boto3-1.26.3-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 51.2 MB/s 
Collecting botocore<1.30.0,>=1.29.3
  Downloading botocore-1.29.3-py3-none-any.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 62.3 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 10.9 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 72.9 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any

In [2]:
# Mount Google Drive at /content/drive; the data and model paths used by this
# notebook live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

In [4]:
# Root of the gweb_sancl data on the mounted Drive; the directories below are derived from it.
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
answer_dir = os.path.join(data_dir, "pos_fine", "answers")
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
# NOTE(review): named `labeled_dir` but points at the "unlabeled" folder — confirm which is intended.
labeled_dir = os.path.join(data_dir, "unlabeled")

# Directory that holds the saved model weights (model.pt).
model_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/model"

In [5]:
import codecs

In [6]:
def read_conll_file(file_name, raw=False):
    """
    Read a CoNLL-style file and yield one sentence at a time.

    Expected layout (one token per line, word and tag separated by a tab)::

        word1    tag1
        ...      ...
        wordN    tagN

    Sentences MUST be separated by empty lines.

    :param file_name: file to read in (decoded as UTF-8)
    :param raw: if True, the file is raw text with one sentence per line; each
                line is split on whitespace and every token gets the 'DUMMY' label
    :return: generator of instances ((list of words, list of tags) pairs)
    :raises IOError: when ``raw`` is False and a non-empty line does not consist
                     of exactly two tab-separated fields
    """
    current_words = []
    current_tags = []

    # The context manager closes the handle once reading finishes or fails.
    with codecs.open(file_name, encoding='utf-8') as handle:
        for line_number, line in enumerate(handle, start=1):
            # Remove only the line terminator. Slicing off the last character
            # would eat the final letter of a last line that has no trailing
            # newline and would leave the "\r" of a "\r\n" pair behind. Other
            # whitespace (tabs, spaces) is deliberately kept.
            pieces = line.splitlines()
            line = pieces[0] if pieces else ""

            if not line:
                # Empty line: sentence boundary (nothing is buffered in raw mode).
                if current_words and not raw:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []
                continue

            if raw:
                current_words = line.split()  # simple splitting by whitespace
                current_tags = ['DUMMY' for _ in current_words]
                yield (current_words, current_tags)
                continue

            fields = line.split("\t")
            if len(fields) == 1:  # empty words in gimpel
                raise IOError("Issue with input file - doesn't have a tag or token?")
            if len(fields) != 2:
                # Report the offending line instead of terminating the interpreter.
                raise IOError("erroneous line: {} (line number: {}) ".format(line, line_number))

            word, tag = fields
            current_words.append(word)
            current_tags.append(tag)

    # Emit the last sentence when the file does not end with an empty line.
    if current_tags != [] and not raw:
        yield (current_words, current_tags)

In [7]:
# Paths of the WSJ train and dev files (.conll) inside wsj_dir.
wsj_train_file = os.path.join(wsj_dir, "gweb-wsj-train.conll")
wsj_dev_file = os.path.join(wsj_dir, "gweb-wsj-dev.conll")

In [8]:
# WSJ train split: one entry per sentence in the word/tag lists, while
# wsj_tags pools every individual tag for the label inventory.
wsj_train_word_lst, wsj_train_tag_lst = [], []
wsj_tags = []
for word, tag in read_conll_file(wsj_train_file):
    # each instance is a whole sentence: `word` is its token list, `tag` its tag list
    wsj_train_word_lst.append(word)
    wsj_train_tag_lst.append(tag)
    wsj_tags += tag
print("The number of sentences in wsj train", len(wsj_train_word_lst))

# WSJ dev split: same layout; its tags are pooled into wsj_tags as well.
wsj_dev_word_lst, wsj_dev_tag_lst = [], []
for word, tag in read_conll_file(wsj_dev_file):
    wsj_dev_word_lst.append(word)
    wsj_dev_tag_lst.append(tag)
    wsj_tags += tag
print("The number of sentences in wsj dev", len(wsj_dev_word_lst))
print("The number of tags in wsj", len(set(wsj_tags)))

The number of sentences in wsj train 30060
The number of sentences in wsj dev 1336
The number of tags in wsj 48


In [9]:
import random

In [10]:
# Shuffle sentences together with their tag sequences using a single seeded
# shuffle. Both lists are filled in lock-step, so they have the same length and
# the pairwise shuffle applies the same seed-0 permutation to each of them.
# The results are written back into the existing list objects.
_paired = list(zip(wsj_train_word_lst, wsj_train_tag_lst))
random.seed(0)
random.shuffle(_paired)
wsj_train_word_lst[:] = [sentence for sentence, _ in _paired]
wsj_train_tag_lst[:] = [sentence_tags for _, sentence_tags in _paired]

In [11]:
# Number of shuffled WSJ training sentences kept as the labeled set; everything
# after that index goes into the "unlabeled" lists. Kept in one constant so the
# four slices below cannot drift apart.
NUM_LABELED = 10000

labeled_train_words = wsj_train_word_lst[:NUM_LABELED]
labeled_train_tags = wsj_train_tag_lst[:NUM_LABELED]
unlabeled_words = wsj_train_word_lst[NUM_LABELED:]
unlabeled_tags = wsj_train_tag_lst[NUM_LABELED:]

print(len(labeled_train_words))
print(len(unlabeled_words))

10000
20060


In [12]:
# Label inventory: index 0 is reserved for "<pad>", followed by the sorted
# distinct tags. "<pad>" is removed from the set first so that executing this
# cell a second time does not add another "<pad>" entry, which would move
# tag2idx["<pad>"] away from 0.
wsj_tags = ["<pad>"] + sorted(set(wsj_tags) - {"<pad>"})
tag2idx = {tag: idx for idx, tag in enumerate(wsj_tags)}
idx2tag = {idx: tag for idx, tag in enumerate(wsj_tags)}
print(len(wsj_tags))

49


In [13]:
# Installs PyTorch-NLP from the repository's default branch (provides the `torchnlp` import used below).
# NOTE(review): no tag or commit is pinned, so a re-run may pick up a different revision — consider pinning.
!pip install git+https://github.com/PetrochukM/PyTorch-NLP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/PetrochukM/PyTorch-NLP.git
  Cloning https://github.com/PetrochukM/PyTorch-NLP.git to /tmp/pip-req-build-vuofq5m8
  Running command git clone -q https://github.com/PetrochukM/PyTorch-NLP.git /tmp/pip-req-build-vuofq5m8


In [14]:
# https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/datasets/ud_pos.html

import os
import io

from torchnlp.download import download_file_maybe_extract


def ud_pos_dataset(directory='data/',
                   train=False,
                   dev=False,
                   test=False,
                   train_filename='en-ud-tag.v2.train.txt',
                   dev_filename='en-ud-tag.v2.dev.txt',
                   test_filename='en-ud-tag.v2.test.txt',
                   extracted_name='en-ud-v2',
                   check_files=['en-ud-v2/en-ud-tag.v2.train.txt'],
                   url='https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'):
    """
    Load the requested splits of the English UD tagging files.

    ``url``, ``directory`` and ``check_files`` are handed to
    ``download_file_maybe_extract`` before any file is read. Every non-empty
    line of a split file must hold three tab-separated fields: token, UD tag
    and PTB tag. Empty lines separate sentences.

    :param directory: directory that contains (or receives) the extracted data
    :param train: whether to load the training split
    :param dev: whether to load the dev split
    :param test: whether to load the test split
    :param train_filename: file name of the training split
    :param dev_filename: file name of the dev split
    :param test_filename: file name of the test split
    :param extracted_name: sub-directory of ``directory`` holding the split files
    :param check_files: forwarded to ``download_file_maybe_extract`` (not modified here)
    :param url: forwarded to ``download_file_maybe_extract``
    :return: for exactly one requested split, a list of sentence dicts with the
             keys 'tokens', 'ud_tags' and 'ptb_tags'; otherwise a tuple with one
             such list per requested split, in train/dev/test order
    """
    download_file_maybe_extract(url=url, directory=directory, check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
            for line in f:
                line = line.strip()
                if line == '' and len(sentence['tokens']) > 0:
                    examples.append(sentence)
                    sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
                elif line != '':
                    token, ud_tag, ptb_tag = tuple(line.split('\t'))
                    sentence['tokens'].append(token)
                    sentence['ud_tags'].append(ud_tag)
                    sentence['ptb_tags'].append(ptb_tag)
            # A file that does not end with an empty line still has its last
            # sentence buffered here; without this it would be silently lost.
            if len(sentence['tokens']) > 0:
                examples.append(sentence)
        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)

In [47]:
# Request all three splits; with more than one split requested, ud_pos_dataset
# returns a tuple in train/dev/test order, so it can be unpacked directly.
penn_train, penn_dev, penn_test = ud_pos_dataset(train=True,
                                                 dev=True,
                                                 test=True)
# Each split is a list with one entry per sentence.
print("Train size", len(penn_train))
print("Dev size", len(penn_dev))
print("Test size", len(penn_test))

Train size 12543
Dev size 2002
Test size 2077


In [48]:
# Per-sentence token lists and PTB tag lists of the UD training split.
_penn_train_words = [example["tokens"] for example in penn_train]
_penn_train_tags = [example["ptb_tags"] for example in penn_train]

# Flatten the per-sentence tags, then reduce them to the sorted distinct tags.
penn_tags_lst = [tag_name for sentence_tags in _penn_train_tags for tag_name in sentence_tags]
penn_tags = sorted(set(penn_tags_lst))
print("penn_tags", len(penn_tags))

# Tags present in one inventory but missing from the other (both directions).
print(set(penn_tags).difference(set(wsj_tags)))
print(set(wsj_tags).difference(set(penn_tags)))

In [49]:
# Drop every token tagged 'ADD' or 'GW' from the UD training sentences.
# One output entry is produced per input sentence, so a sentence made up only
# of such tokens is kept as an empty list and the lists stay index-aligned
# with penn_train.
penn_train_words, penn_train_tags = [], []
for words, tags in zip(_penn_train_words, _penn_train_tags):
    kept = [(word_, tag_) for word_, tag_ in zip(words, tags) if tag_ not in {'ADD', 'GW'}]
    penn_train_words.append([word_ for word_, _ in kept])
    penn_train_tags.append([tag_ for _, tag_ in kept])

['http://www.ibiblio.org/expo/soviet.exhibit/chernobyl.html']
['ADD']
['http://www.ibrae.ac.ru/IBRAE/eng/chernobyl/nat_rep/nat_repe.htm#24']
['ADD']
['http://www.nsrl.ttu.edu/chernobyl/wildlifepreserve.htm']
['ADD']
['http://www.environmentalchemistry.com/yogi/hazmat/articles/chernobyl1.html']
['ADD']
['http://digon_va.tripod.com/Chernobyl.htm']
['ADD']
['http://www.oneworld.org/index_oc/issue196/byckau.html']
['ADD']
['http://www.collectinghistory.net/chernobyl/']
['ADD']
['http://www.ukrainianweb.com/chernobyl_ukraine.htm']
['ADD']
['http://www.bullatomsci.org/issues/1993/s93/s93Marples.html']
['ADD']
['http://www.calguard.ca.gov/ia/Chernobyl-15%20years.htm']
['ADD']
['http://www.infoukes.com/history/chornobyl/gregorovich/index.html']
['ADD']
['http://www.un.org/ha/chernobyl/']
['ADD']
['http://www.tecsoc.org/pubs/history/2002/apr26.htm']
['ADD']
['http://www.chernobyl.org.uk/page2.htm']
['ADD']
['http://www.time.com/time/daily/chernobyl/860901.accident.html']
['ADD']
['http://www.in

# Build Model

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [18]:
# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [19]:
# Tokenizer for the 'bert-base-cased' checkpoint, created with lower-casing disabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [20]:
class PosDataset(data.Dataset):
    """
    Sentence-level POS tagging dataset on top of a word-piece tokenizer.

    Uses the module-level ``tokenizer`` (word -> pieces -> ids) and ``tag2idx``
    (tag string -> integer id).
    """

    def __init__(self, word_lst, tag_lst):
        """
        :param word_lst: list of sentences, each a list of word strings
        :param tag_lst: list of tag lists, parallel to ``word_lst``
        """
        sents, tags_li = [], []  # list of lists
        for i in range(len(word_lst)):
            # Wrap each sentence in [CLS]/[SEP]; both markers carry the "<pad>" tag.
            sents.append(["[CLS]"] + word_lst[i] + ["[SEP]"])
            tags_li.append(["<pad>"] + tag_lst[i] + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """
        Encode one sentence.

        :param idx: sentence index
        :return: tuple ``(words, x, is_heads, tags, y, seqlen)`` where ``words``
                 and ``tags`` are space-joined strings, ``x`` the piece ids,
                 ``is_heads`` flags the first piece of every word with 1, ``y``
                 the tag ids (the "<pad>" id on non-first pieces) and ``seqlen``
                 the number of pieces
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            if w in ("[CLS]", "[SEP]"):
                tokens = [w]
            else:
                # If the tokenizer yields no pieces for a word, stand in a single
                # "[UNK]" piece; otherwise x would end up shorter than y/is_heads
                # and the word would have no head piece to carry its tag.
                tokens = tokenizer.tokenize(w) or ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [21]:
def pad(batch):
    '''Collate samples into a batch, padding the id sequences to the longest sample.

    Every sample is a tuple (words, x, is_heads, tags, y, seqlen). The id lists
    x and y are right-padded with 0 (the <pad> index) and returned as
    LongTensors; the other fields are returned as plain per-sample lists.
    '''
    def column(position):
        # one field of every sample, in batch order
        return [sample[position] for sample in batch]

    words = column(0)
    is_heads = column(2)
    tags = column(3)
    seqlens = column(-1)
    maxlen = np.array(seqlens).max()

    def padded(position):
        # 0: <pad>
        return [sample[position] + [0] * (maxlen - len(sample[position])) for sample in batch]

    x = torch.LongTensor(padded(1))
    y = torch.LongTensor(padded(-2))

    return words, x, is_heads, tags, y, seqlens

In [22]:
from pytorch_pretrained_bert import BertModel

In [23]:
class Net(nn.Module):
    """'bert-base-cased' encoder followed by a linear layer that scores every position."""

    def __init__(self, vocab_size=None):
        """
        :param vocab_size: output size of the final linear layer
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)
        # Kept as an attribute; forward() itself reads the module-level `device`.
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y, y_hat): the linear layer's output, y moved to
        `device`, and the argmax of the logits over the last dimension.
        '''
        x, y = x.to(device), y.to(device)

        # Put the encoder in the same train/eval mode as this module.
        self.bert.train(self.training)
        if self.training:
            encoded_layers, _ = self.bert(x)
        else:
            # Outside of training the encoder runs without gradient tracking.
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
        # Only the last entry of the returned layers feeds the classifier.
        enc = encoded_layers[-1]

        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

In [24]:
def train(model, iterator, optimizer, criterion):
    """
    Run one pass over ``iterator`` and update ``model`` after every batch.

    :param model: callable returning ``(logits, y, y_hat)`` for ``(x, y)``
    :param iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``
    :param optimizer: optimizer stepped once per batch
    :param criterion: loss applied to the flattened logits and targets
    """
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        # Flatten so that every position becomes one classification example.
        flat_logits = logits.view(-1, logits.shape[-1])  # (N*T, VOCAB)
        flat_targets = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        if step % 10 == 0:  # monitoring
            print("step: {}, loss: {}".format(step, loss.item()))

In [25]:
def eval(model, iterator):
    """
    Evaluate ``model`` on ``iterator`` and report token-level results.

    Predictions are taken only at the first piece of every word (``is_heads``
    equal to 1). For each sentence the lines "word gold_tag predicted_tag" are
    written to the file "result" in the working directory (the first and last
    entry of each sentence are skipped), followed by an empty line. Accuracy
    and a classification report over the tag ids are then printed.

    :param model: callable returning ``(logits, y, y_hat)`` for ``(x, y)``
    :param iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # Gold and predicted ids are collected while writing, so the file does not
    # have to be opened and parsed again afterwards.
    y_true, y_pred = [], []
    with open("result", 'w', encoding='utf-8') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # zip stops at len(is_heads), which drops the padded tail of y_hat
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                y_true.append(tag2idx[t])
                y_pred.append(tag2idx[p])
            fout.write("\n")

    ## calc metric
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)
    print("classification_report", classification_report(y_true, y_pred))


In [26]:
# Instantiate the tagger with one output per entry of tag2idx, move it to
# `device`, and wrap it in nn.DataParallel.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

100%|██████████| 404400730/404400730 [00:34<00:00, 11840006.79B/s]


In [27]:
# Training uses the labeled WSJ subset; evaluation uses the WSJ dev split.
train_dataset = PosDataset(labeled_train_words, labeled_train_tags)
eval_dataset = PosDataset(wsj_dev_word_lst, wsj_dev_tag_lst)

# Both loaders collate with `pad`, which pads each batch to its longest sample.
train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
# Despite the name, test_iter iterates over eval_dataset (the WSJ dev split).
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

# The optimizer is bound to the parameters of the `model` object that exists at this point.
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

# Targets equal to 0 (the "<pad>" index in tag2idx) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [28]:
# train(model, train_iter, optimizer, criterion)
# eval(model, test_iter)

# Save Model

In [29]:
# Path of the weights file inside model_dir. The save call is commented out,
# so running this cell only defines the path.
model_file = os.path.join(model_dir, "model.pt")
# torch.save(model.state_dict(), model_file)

## Load Model

In [30]:
# Build a fresh instance (same construction as before) to receive the saved weights.
# NOTE(review): `optimizer` was created from the previous `model` object's
# parameters; after this rebinding it does not reference this instance —
# recreate it before training the reloaded model.
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

In [31]:
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a GPU runtime also loads when only the CPU is available.
model.load_state_dict(torch.load(model_file, map_location=device))
eval(model, test_iter)

acc=0.97
classification_report               precision    recall  f1-score   support

           1       0.99      1.00      1.00       125
           2       0.99      1.00      1.00       236
           3       1.00      1.00      1.00      1629
           4       0.67      1.00      0.80        53
           5       1.00      0.42      0.59        55
           6       1.00      1.00      1.00      1321
           7       0.96      0.99      0.98       187
           9       1.00      0.99      0.99       809
          10       1.00      1.00      1.00      1270
          11       1.00      0.99      0.99      2768
          12       1.00      1.00      1.00        29
          13       0.00      0.00      0.00         2
          14       0.99      0.96      0.98       409
          15       0.97      0.99      0.98      3419
          16       0.92      0.91      0.92      1819
          17       0.88      0.93      0.90       114
          18       0.99      0.96      0.97       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# Wrap the filtered UD training sentences (ADD/GW tokens removed) for the tagger.
penn_train_dataset = PosDataset(penn_train_words, penn_train_tags)

# Same batch size and collate function as the WSJ loaders; batches are drawn in shuffled order.
penn_train_iter = data.DataLoader(dataset=penn_train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)

In [52]:
# Score the loaded model on the filtered UD training sentences; like every
# eval() call, this overwrites the "result" file in the working directory.
eval(model, penn_train_iter)

acc=0.92


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


classification_report               precision    recall  f1-score   support

           1       0.78      1.00      0.88       258
           2       0.40      0.16      0.23       785
           3       0.94      0.87      0.90      8062
           4       0.17      0.03      0.05       973
           5       0.00      0.00      0.00      1008
           6       0.96      0.98      0.97     10317
           7       0.33      0.97      0.49       866
           8       0.00      0.00      0.00        48
           9       1.00      0.98      0.99      6706
          10       0.96      0.94      0.95      3998
          11       0.98      0.99      0.98     16817
          12       0.95      0.96      0.96       359
          13       0.95      0.43      0.59        93
          14       0.91      0.94      0.92       664
          15       0.93      0.98      0.96     20724
          16       0.90      0.87      0.89     11591
          17       0.94      0.87      0.91       503
     

  _warn_prf(average, modifier, msg_start, len(result))
