# Baseline for EWT
To get your project started, you start with implementing a baseline model. Ideally, this is going to be the main baseline that you are going to compare to in your paper. Note that this baseline should be more advanced than just predicting the majority class (O).

We will use EWT portion of the Universal NER project, which we provide in the folder "Project_description" for convenience. You can use the train data (en_ewt-ud-train.iob2) and test data(en_ewt-ud-dev.iob2) to build your baseline, then upload your prediction on the test data (en_ewt-ud-test.iob2).

It is important to upload your predictions in same format as the training and test files, so that the span_f1.py script can be used.

Note that you do not have to implement your baseline from scratch, you can use for example the code from the RNN or BERT assignments as a starting point.

## **Step 1:** read in the data
Conll-approach retro-fitted for ewt-data, adjusted to be iob2-format-friendly

In [1]:
def read_ewt_file(path):
    data=[]
    words=[]
    tags=[]
    nr_tags=0
    nr_toks=0
    
    for line in open(path, encoding='utf-8'):
        line=line.strip()
        
        if line:
            if line[0]=='#':
                continue
    
            elements=line.split('\t')
            nr_toks+=1
            
            words.append(elements[1])
            tags.append(elements[2])
            
            if elements[3]!='-':
                print(elements[3])
            if elements[4]=='stephen':
                nr_tags+=1
    
        else:
            if words:
                data.append((words, tags))
            words=[]
            tags=[]

    if tags!=[]:
        data.append((words, tags))

    proportion_tagged=nr_tags/nr_toks
    
    return data, proportion_tagged

## Data

In [2]:
train_data,prop_tag_train=read_ewt_file('Project_description/en_ewt-ud-train.iob2')
test_data,prop_tag_test=read_ewt_file('Project_description/en_ewt-ud-dev.iob2')
print("Proportion of training data tagged: ", prop_tag_train)
print("Proportion of testing data tagged: ", prop_tag_test)

Proportion of training data tagged:  0.050640583833140254
Proportion of testing data tagged:  0.05948546661895105


## **Step 2.1:** Implement RNN model

In [3]:
import torch

class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word(idx)

max_len= max([len(x[0]) for x in train_data ])

class preprocess():
    """
    data: the dataset from which we get the matrix used by a Neural network (instances + their tags)
    instances: number of instances in the dataset, needed for dimension of matrix
    features: the number of features/columns of the matrix
    """
    def __init__(self):
        self.vocab_words = Vocab()
        self.vocab_tags = Vocab()

    def build_vocab(self, data, instances, features):
        data_X = torch.zeros(instances, features, dtype=int)
        data_y = torch.zeros(instances, features, dtype=int)
        for i, sentence_tags in enumerate(data):
            for j, word in enumerate(sentence_tags[0]):
                data_X[i, j]=self.vocab_words.getIdx(word=word, add=True)
                data_y[i, j]=self.vocab_tags.getIdx(word=sentence_tags[1][j], add=True)

        #returns the list of unique words in the list from the attributes of the Vocab() 
        idx2word_train = self.vocab_words.idx2word
        #returns the list of unique tags in the list from the attributes of the Vocab() 
        idx2label_train = self.vocab_tags.idx2word
        #only returned in the builder function, because they are reused for test data in transform_prep_data()
        return data_X, data_y, idx2word_train, idx2label_train

    def transform_prep_data(self, data, instances, features):
        #to be used only on test data
        data_X = torch.zeros(instances, features, dtype=int)
        data_y = torch.zeros(instances, features, dtype=int)
        for i, sentence_tags in enumerate(data):
            for j, word in enumerate(sentence_tags[0]):
                data_X[i, j]=self.vocab_words.getIdx(word=word, add=False)
                data_y[i, j]=self.vocab_tags.getIdx(word=sentence_tags[1][j], add=False)
        return data_X, data_y

In [4]:
transformer = preprocess()
train_X, train_y, idx2word, idx2label = transformer.build_vocab(train_data, len(train_data), max_len)
test_X, test_y = transformer.transform_prep_data(test_data, len(test_data), max_len)

print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

torch.Size([12543, 159]) torch.Size([12543, 159])
torch.Size([2001, 159]) torch.Size([2001, 159])


In [5]:
print(idx2label)

['<PAD>', 'O', 'B-LOC', 'I-LOC', 'B-PER', 'B-ORG', 'I-ORG', 'I-PER']


In [6]:
tmp_feats = torch.zeros((200, 100))

batch_size = 32
num_batches = int(len(tmp_feats)/batch_size)

print(num_batches)

print(tmp_feats.shape)

tmp_feats_batches = tmp_feats[:batch_size*num_batches].view(num_batches,batch_size, 100)

# 6 batches with 32 instances with 100 features
print(tmp_feats_batches.shape)

print()
for feats_batch in tmp_feats_batches:
    print(feats_batch.shape)

6
torch.Size([200, 100])
torch.Size([6, 32, 100])

torch.Size([32, 100])
torch.Size([32, 100])
torch.Size([32, 100])
torch.Size([32, 100])
torch.Size([32, 100])
torch.Size([32, 100])


In [7]:
def create_batches(batch_size, train_X, train_y):

    num_batches = int(len(train_X)/batch_size)

    # print("Num of batches: ", num_batches)

    batches_X = train_X[:batch_size*num_batches].view(num_batches,batch_size, train_X.shape[1])
    batches_y = train_y[:batch_size*num_batches].view(num_batches,batch_size, train_y.shape[1])

    # batches = torch.cat((batches_X, batches_y), -1)
    # print("Shape of batches full of X inst: ", batches_X.shape)
    # print("Shape of batches full of y inst: ", batches_y.shape)
    return batches_X, batches_y

In [8]:
from torch import nn
torch.manual_seed(42)
DIM_EMBEDDING = 100
RNN_HIDDEN = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 10

class TaggerModel(torch.nn.Module):
    def __init__(self, nwords, ntags):
        super().__init__()
        self.embed = nn.Embedding(nwords, DIM_EMBEDDING)
        self.rnn = nn.RNN(DIM_EMBEDDING, RNN_HIDDEN, batch_first=True)
        self.fc = nn.Linear(RNN_HIDDEN, ntags)
        
    def forward(self, input_data):
        word_vectors = self.embed(input_data)
        output, hidden = self.rnn(word_vectors)
        predictions = self.fc(output)

        return predictions 

model = TaggerModel(len(idx2word), len(idx2label))
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

#creating the batches 
batches_X, batches_y = create_batches(BATCH_SIZE, train_X, train_y)

for epoch in range(EPOCHS):
    model.train()
    # reset the gradient
    model.zero_grad()
    print(f"Epoch {epoch+1}\n-------------------------------")
    loss_sum = 0

    # loop over batches
    for X, y in zip(batches_X, batches_y):
        predicted_values = model.forward(X)
        predicted_values=predicted_values.view(BATCH_SIZE*max_len, -1) #resizing tensor to 2D from 3D
        
        # calculate loss
        y=torch.flatten(y.view(BATCH_SIZE*max_len, -1)) #flattening to make it 1D
        loss = loss_function(predicted_values, y)
        loss_sum += loss.item() #avg later

        # update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Average loss after epoch {epoch+1}: {loss_sum/batches_X.shape[0]}")
        
# set to evaluation mode
model.eval()

Epoch 1
-------------------------------
Average loss after epoch 1: 111.65099384046881
Epoch 2
-------------------------------
Average loss after epoch 2: 45.170315415124456
Epoch 3
-------------------------------
Average loss after epoch 3: 23.83703379218688
Epoch 4
-------------------------------
Average loss after epoch 4: 15.787376305331355
Epoch 5
-------------------------------
Average loss after epoch 5: 11.782564234588762
Epoch 6
-------------------------------
Average loss after epoch 6: 9.921833932247308
Epoch 7
-------------------------------
Average loss after epoch 7: 8.88464398029477
Epoch 8
-------------------------------
Average loss after epoch 8: 8.40197725655973
Epoch 9
-------------------------------
Average loss after epoch 9: 8.238924911445784
Epoch 10
-------------------------------
Average loss after epoch 10: 7.7995702784718075


TaggerModel(
  (embed): Embedding(19674, 100)
  (rnn): RNN(100, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=8, bias=True)
)

## RNN eval.

In [9]:
#Evaluating on test data we will predict using trained TaggerModel
predictions_test = model.forward(test_X)
#gives probabilities for each tag (dim=18) for each word/feature (dim=159) for each sentence(dim=2000)
#we want to classify each word for the part-of-speech with highest probability

labels_test = torch.argmax(predictions_test, 2)
labels_test = torch.flatten(labels_test) #model predictions
test_y_flat = torch.flatten(test_y) #true labels

acc = []
for i in range(len(labels_test)):
    if test_y_flat[i]!=0:
        acc.append(int(labels_test[i]==test_y_flat[i]))

accuracy = sum(acc)/len(acc)
print(f"Model accuracy on test set: {accuracy}")

Model accuracy on test set: 0.9568571314962822


In [10]:
labels_test[0:159]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
test_pred=[]
for i, sentence in enumerate(predictions_test[:,:,]):
    labels=[]
    for j, token in enumerate(sentence):
        label=idx2label[torch.argmax(predictions_test[i,j,:])]
        labels.append(label)
    test_pred.append((test_data[i][0],labels))

B-LOC
I-LOC
B-LOC
I-ORG
B-LOC
B-LOC
B-LOC
B-ORG
B-LOC
B-LOC
B-LOC
B-LOC
B-ORG
I-LOC
B-ORG
I-ORG
B-LOC
I-LOC
B-LOC
I-ORG
B-ORG
I-ORG
I-ORG
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
I-ORG
B-LOC
B-LOC
I-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-LOC
B-ORG
I-ORG
B-LOC
B-ORG
I-ORG
B-ORG
I-ORG
B-LOC
B-ORG
I-ORG
B-LOC
B-LOC
B-ORG
I-ORG
B-LOC
I-LOC
I-LOC
B-LOC
I-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
I-ORG
B-LOC
B-ORG
B-LOC
B-ORG
B-ORG
B-LOC
I-LOC
I-LOC
I-ORG
I-ORG
I-ORG
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
I-LOC
I-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-LOC
I-PER
B-LOC
B-LOC
I-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-ORG
B-LOC
B-PER
B-LOC
B-LOC
B-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-LOC
I-LOC
B-ORG
B-ORG
I-ORG
B-ORG
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
I-LOC
B-LOC
B-LOC
B-LOC
I-ORG
B-LOC
B-ORG
I-LOC
B-LOC
I-ORG
I-ORG
I-LOC
B-LOC
B-LOC
B-ORG
B-LOC
I-ORG
I-ORG
B-PER
B-PER
I-PER
B-PER
I-PER
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-PER
B-PER
I-PER
B-PER
I-PER
B-PER
B-PER
B-LOC
B-PER
I-PER
B-PER
B-PER
B-PER
B-ORG
I-OR

[(['where',
   'can',
   'I',
   'get',
   'morcillas',
   'in',
   'tampa',
   'bay',
   ',',
   'I',
   'will',
   'like',
   'the',
   'argentinian',
   'type',
   ',',
   'but',
   'I',
   'will',
   'to',
   'try',
   'anothers',
   'please',
   '?'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',

## **Step 2.2:** Implement BERT model

## 2.2.1: Testing MASK outputs with AutoTokenizer

In [9]:
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM,AutoTokenizer
import torch

tokzr = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=False)

def getTopN(inputSent, model, tokzr, topn=1):
    maskId = tokzr.convert_tokens_to_ids(tokzr.mask_token)
    tokenIds = tokzr(inputSent).input_ids
    if maskId not in tokenIds:
        return 'please include ' + tokzr.mask_token + ' in your input'
    maskIndex = tokenIds.index(maskId)
    logits = model(torch.tensor([tokenIds])).logits
    return tokzr.convert_ids_to_tokens(torch.topk(logits, topn, dim=2).indices[0][maskIndex])

model = AutoModelForMaskedLM.from_pretrained('bert-base-cased')
tokzr = AutoTokenizer.from_pretrained('bert-base-cased')

getTopN('This is a [MASK] test.', model, tokzr, 5)
print(getTopN('daughter is to dad as son is to [MASK].', model, tokzr))
print(getTopN('Małopolska is to Poland as Sjælland is to [MASK].', model, tokzr))
print(getTopN('robić is to robienie as palić is to [MASK].', model, tokzr))
print(getTopN('Cracow is to Poland as Copenhagen is to [MASK].', model, tokzr))
print(getTopN('cracow is to poland as copenhagen is to [MASK].', model, tokzr))
print(getTopN('Maiopolska is to Poland as Sjælland is to [MASK].', model, tokzr)) #typo
print(getTopN('Małopolska is to Poland as Sjælland is to [MASK]:', model, tokzr))
print(getTopN('doughter is to dad as san is to [MASK].', model, tokzr))
print(getTopN('The woman occupation is [MASK].', model, tokzr))
print(getTopN('The man occupation is [MASK].', model, tokzr))
print(getTopN('The Polish lady is [MASK].', model, tokzr))
print(getTopN('The Danish man is [MASK].', model, tokzr))
print(getTopN('The young girls outfit is [MASK].', model, tokzr))
print(getTopN('The old womans outfit is [MASK].', model, tokzr))
print(getTopN('[MASK] occupation is teacher.', model, tokzr))
print(getTopN('[MASK] does not have a job', model, tokzr))
print(getTopN('[MASK] earns money', model, tokzr))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['mother']
['Denmark']
['kill']
['Denmark']
['port']
['Denmark']
['Denmark']
['mom']
['agriculture']
['agriculture']
['married']
['dead']
['identical']
['gone']
['His']
['He']
['Whoever']


## 2.2.2: bert-topic.py code, needs to be run as .py on HPC

In [None]:
"""
A basic classifier based on the transformers (https://github.com/huggingface/transformers) 
library. It loads a masked language model (by default distilbert), and adds a linear layer for
prediction. Usage with ITU HPC:

python3 bert-topic.py Project_description/en_ewt-ud-train.iob2 Project_description/en_ewt-ud-dev.iob2
"""
from typing import List, Dict
import torch
import codecs
import sys
import myutils
from transformers import AutoModel, AutoTokenizer

# set seed for consistency
torch.manual_seed(42)
# Set some constants
MLM = 'distilbert-base-cased'
BATCH_SIZE = 8
LEARNING_RATE = 0.00001
EPOCHS = 3
# We have an UNK label for robustness purposes, it makes it easier to run on
# data with other labels, or without labels.
UNK = "[UNK]"
MAX_TRAIN_SENTS=64
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

class ClassModel(torch.nn.Module):
    def __init__(self, nlabels: int, mlm: str):
        """
        Model for classification with transformers.

        The architecture of this model is simple, we just have a transformer
        based language model, and add one linear layer to converts it output
        to our prediction.
    
        Parameters
        ----------
        nlabels : int
            Vocabulary size of output space (i.e. number of labels)
        mlm : str
            Name of the transformers language model to use, can be found on:
            https://huggingface.co/models
        """
        super().__init__()

        # The transformer model to use
        self.mlm = AutoModel.from_pretrained(mlm)

        # Find the size of the output of the masked language model
        if hasattr(self.mlm.config, 'hidden_size'):
            self.mlm_out_size = self.mlm.config.hidden_size
        elif hasattr(self.mlm.config, 'dim'):
            self.mlm_out_size = self.mlm.config.dim
        else: # if not found, guess
            self.mlm_out_size = 768

        # Create prediction layer
        self.hidden_to_label = torch.nn.Linear(self.mlm_out_size, nlabels)

    def forward(self, input: torch.tensor):
        """
        Forward pass
    
        Parameters
        ----------
        input : torch.tensor
            Tensor with wordpiece indices. shape=(batch_size, max_sent_len).

        Returns
        -------
        output_scores : torch.tensor
            ?. shape=(?,?)
        """
        # Run transformer model on input
        mlm_out = self.mlm(input)

        # Keep only the last layer: shape=(batch_size, max_len, DIM_EMBEDDING)
        mlm_out = mlm_out.last_hidden_state
        # Keep only the output for the first ([CLS]) token: shape=(batch_size, DIM_EMBEDDING)
        mlm_out = mlm_out[:,:1,:].squeeze()

        # Matrix multiply to get scores for each label: shape=(?,?)
        output_scores = self.hidden_to_label(mlm_out)

        return output_scores

    def run_eval(self, text_batched: List[torch.tensor], labels_batched: List[torch.tensor]):
        """
        Run evaluation: predict and score
    
        Parameters
        ----------
        text_batched : List[torch.tensor]
            list with batches of text, containing wordpiece indices.
        labels_batched : List[torch.tensor]
            list with batches of labels (converted to ints).
        model : torch.nn.module
            The model to use for prediction.
    
        Returns
        -------
        score : float
            accuracy of model on labels_batches given feats_batches
        """
        self.eval()
        match = 0
        total = 0
        for sents, labels in zip(text_batched, labels_batched):
            output_scores = self.forward(sents)
            pred_labels = torch.argmax(output_scores, 1)
            for gold_label, pred_label in zip(labels, pred_labels):
                total += 1
                if gold_label.item() == pred_label.item():
                    match+= 1
        return(match/total)

if len(sys.argv) < 2:
    print('Please provide path to training and development data')

if __name__ == '__main__':
    print('reading data...')
    train_text, train_labels = myutils.read_data(sys.argv[1])
    train_text = train_text[:MAX_TRAIN_SENTS]
    train_labels = train_labels[:MAX_TRAIN_SENTS]
    
    id2label, label2id = myutils.labels2lookup(train_labels, UNK)
    NLABELS = len(id2label)
    print(train_labels)
    print(label2id)
    train_labels = [label2id[label] for label in train_labels]
    
    dev_text, dev_labels = myutils.read_data(sys.argv[2])
    dev_labels = [label2id[label] for label in dev_labels]
    
    print('tokenizing...')
    tokzr = AutoTokenizer.from_pretrained(MLM)
    train_tokked = myutils.tok(train_text, tokzr)
    dev_tokked = myutils.tok(dev_text, tokzr)
    PAD = tokzr.pad_token_id
    
    print('converting to batches...')
    train_text_batched, train_labels_batched = myutils.to_batch(train_tokked, train_labels, BATCH_SIZE, PAD, DEVICE)
    # Note, some data is trown away if len(text_tokked)%BATCH_SIZE!= 0
    dev_text_batched, dev_labels_batched = myutils.to_batch(dev_tokked, dev_labels, BATCH_SIZE, PAD, DEVICE)
    
    print('initializing model...')
    model = ClassModel(NLABELS, MLM)
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    
    print('training...')
    for epoch in range(EPOCHS):
        print('=====================')
        print('starting epoch ' + str(epoch))
        model.train() 
    
        # Loop over batches
        loss = 0
        for batch_idx in range(0, len(train_text_batched)):
            optimizer.zero_grad()

            output_scores = model.forward(train_text_batched[batch_idx])
            batch_loss = loss_function(output_scores, train_labels_batched[batch_idx])
            loss += batch_loss.item()
    
            batch_loss.backward()

            optimizer.step()
    
        dev_score = model.run_eval(dev_text_batched, dev_labels_batched)
        print('Loss: {:.2f}'.format(loss))
        print('Acc(dev): {:.2f}'.format(100*dev_score))
        print()

## BERT eval 