## Baseline implementation with an RNN: a mix of homemade code and solutions from assignment 4

Reference: Rob's solution to assignment 4

**Adjustable settings:**

In [None]:
# Input and output paths
training_data_path = 'Project_description/en_ewt-ud-train.iob2'
validation_data_path = 'Project_description/en_ewt-ud-dev.iob2'
testing_data_path = 'Project_description/en_ewt-ud-test-masked.iob2'

# File that the predictions for the test data are written to
output_predictions_path_and_name = 'baseline_test_pred_output.iob2'

# Model hyperparameters
batch_size = 32  # number of sentences per training batch
nr_embedding_dimensions = 100  # width of each word embedding
nr_hidden_rnn_layers = 50  # passed to nn.RNN as the hidden size (not a layer count)
learning_rate = 0.01  # step size handed to the Adam optimizer
epochs = 10  # number of passes over the training batches

# From the project description: Baseline for EWT

To get your project started, you start with implementing a baseline model. Ideally, this is going to be the main baseline that you are going to compare to in your paper. Note that this baseline should be more advanced than just predicting the majority class (O).

We will use the EWT portion of the Universal NER project, which we provide in the folder "Project_description" for convenience. You can use the train data (en_ewt-ud-train.iob2) and test data (en_ewt-ud-dev.iob2) to build your baseline, then upload your predictions on the test data (en_ewt-ud-test.iob2).

It is important to upload your predictions in same format as the training and test files, so that the span_f1.py script can be used.

Note that you do not have to implement your baseline from scratch, you can use for example the code from the RNN or BERT assignments as a starting point.

# First, import needed modules

In [None]:
import torch
from torch import nn
torch.manual_seed(42)

# All the functions are gathered below

__________________________________________________
Homemade code, inspired by Rob's CoNLL implementation, to ensure that we remembered all checks and edge cases

In [None]:
def read_iob_file(path):
    """Read an IOB2 file into a list of ``(words, tags)`` sentence pairs.

    Every non-empty line that does not start with ``#`` is split on tabs;
    column 1 is stored as the token and column 2 as its tag. An empty line or
    a ``#`` line ends the current sentence. As a side effect, the share of
    token lines whose column 4 equals ``'stephen'`` is printed.

    Args:
        path: Path of the file to read (decoded as UTF-8).

    Returns:
        A list with one ``(words, tags)`` tuple per sentence; both elements
        are lists of strings.

    Raises:
        IndexError: If a token line has fewer than three tab-separated columns.
    """
    data = []
    words = []
    tags = []
    nr_tags = 0
    nr_toks = 0

    data_file_name = path.split("/")[-1]

    # The context manager closes the file handle even if parsing fails.
    with open(path, encoding='utf-8') as iob_file:
        for line in iob_file:
            line = line.strip()

            # Start with a check to filter out empty lines and comments
            if line != "" and line[0] != "#":
                elements = line.split('\t')
                nr_toks += 1

                words.append(elements[1])
                tags.append(elements[2])

                # We are interested in how much of the data is actually tagged
                # as an entity. Column 4 only feeds this statistic, so a line
                # without it is counted as untagged instead of aborting the read.
                if len(elements) > 4 and elements[4] == 'stephen':
                    nr_tags += 1

            # An empty line or a comment ends the current sentence: store the
            # collected tokens and labels and start a new sentence.
            else:
                if words:
                    data.append((words, tags))
                words = []
                tags = []

    # The last sentence is not necessarily followed by an empty line.
    if words:
        data.append((words, tags))

    # A file without any token lines has no meaningful proportion; report 0.0
    # rather than dividing by zero.
    proportion_tagged = nr_tags / nr_toks if nr_toks else 0.0
    print(f"Proportion of {data_file_name} data with a tag: ", proportion_tagged)

    return data

________________________________________
Homemade code

In [None]:
def create_vocabs(data_list):
    """Collect the word and label vocabularies of a data set.

    Args:
        data_list: Iterable of sentence pairs whose element 0 is the list of
            tokens and element 1 the list of labels.

    Returns:
        A tuple ``(words, labels)`` of two lists. Each starts with ``'<PAD>'``
        at index 0, followed by every distinct token / label in order of
        first appearance.
    """
    # Dicts keep insertion order and give O(1) membership tests; checking
    # ``word not in some_list`` for every token is quadratic in vocabulary size.
    words = {'<PAD>': None}
    labels = {'<PAD>': None}

    # Iterate over each sentence and its labels, token by token
    for pair in data_list:
        for word, label in zip(pair[0], pair[1]):
            # setdefault only inserts keys that are not present yet
            words.setdefault(word)
            labels.setdefault(label)

    return list(words), list(labels)

________________________________________
Homemade code

In [None]:
def conv2matrix(data_list, tok_vocab, lab_vocab):
    """Convert sentences into zero-padded tensors of vocabulary indices.

    Args:
        data_list: List of ``(words, labels)`` sentence pairs.
        tok_vocab: List of tokens; a token's position is its index. Tokens
            that are not in the list are mapped to index 0.
        lab_vocab: List of labels; a label's position is its index.

    Returns:
        A tuple ``(data_matrix, label_matrix)`` of two long tensors with one
        row per sentence and one column per position of the longest sentence
        in ``data_list``. Positions past the end of a sentence hold 0.

    Raises:
        ValueError: If a label is not contained in ``lab_vocab``.
    """
    nr_sent = len(data_list)
    # Pad to the longest sentence of the data being converted (not of some
    # other data set), so no sentence can overflow the matrix. An empty
    # data set yields empty tensors.
    longest_sent = max((len(x[0]) for x in data_list), default=0)

    # Allocate as integers directly; the values are indices, never floats.
    data_matrix = torch.zeros((nr_sent, longest_sent), dtype=torch.long)
    label_matrix = torch.zeros((nr_sent, longest_sent), dtype=torch.long)

    # One-off lookup table instead of a linear list search per token.
    # setdefault keeps the first position of a token, like list.index does.
    tok_to_idx = {}
    for idx, tok in enumerate(tok_vocab):
        tok_to_idx.setdefault(tok, idx)

    # Look up the vocab index for each token and label
    for sent_nr, (sentence, labels) in enumerate(data_list):
        for tok_nr, (token, label) in enumerate(zip(sentence, labels)):
            # New words (e.g. in the dev/test data) get vocab index 0
            data_matrix[sent_nr, tok_nr] = tok_to_idx.get(token, 0)
            # The label vocabulary is built by the caller; an unknown label
            # raises ValueError here rather than being silently remapped.
            label_matrix[sent_nr, tok_nr] = lab_vocab.index(label)

    return data_matrix, label_matrix

__________________________________________________
Rob's code

In [None]:
def create_batches(batch_size, train_data_matrix, train_label_matrix):
    """Split the data and label matrices into equally sized batches.

    Rows that do not fill a complete batch are dropped from the end.

    Args:
        batch_size: Number of rows per batch.
        train_data_matrix: 2-D tensor with one row per sentence.
        train_label_matrix: 2-D tensor with one row per sentence.

    Returns:
        A tuple ``(batches_X, batches_Y)`` of 3-D views shaped
        ``(number of batches, batch_size, row width)``.
    """
    num_batches = int(len(train_data_matrix) / batch_size)
    usable_rows = batch_size * num_batches

    def _to_batches(matrix):
        # Keep only the rows that fill whole batches, then add a batch axis
        return matrix[:usable_rows].view(num_batches, batch_size, matrix.shape[1])

    return _to_batches(train_data_matrix), _to_batches(train_label_matrix)

__________________________________________________
Rob's code

In [None]:
class TaggerModel(torch.nn.Module):
    """Embedding -> RNN -> linear layer that scores every tag for every token.

    The embedding width and the RNN hidden size are read from the
    module-level settings ``nr_embedding_dimensions`` and
    ``nr_hidden_rnn_layers`` when the model is constructed.
    """

    def __init__(self, nwords, ntags):
        """Create the layers.

        Args:
            nwords: Number of entries in the embedding table.
            ntags: Number of scores produced per token.
        """
        super(TaggerModel, self).__init__()
        embedding_dim = nr_embedding_dimensions
        hidden_size = nr_hidden_rnn_layers
        self.embed = nn.Embedding(nwords, embedding_dim)
        # batch_first=True: the RNN treats dimension 0 of its input as the batch
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, ntags)

    def forward(self, input_data):
        """Return the tag scores for a tensor of word indices.

        Args:
            input_data: Tensor of vocabulary indices
                (assumed ``(batch, sequence)`` — confirm against callers).

        Returns:
            The output of the final linear layer applied to every RNN output
            step; its last dimension has ``ntags`` entries.
        """
        embedded = self.embed(input_data)
        # The final hidden state is not needed, only the per-step outputs
        rnn_output, _ = self.rnn(embedded)
        return self.fc(rnn_output)

_______________________________________
Homemade

In [None]:
def get_predictions(plain_data, likelihood_3d_tensor, lab_vocab=None):
    """Turn per-token label scores into predicted label strings.

    The score tensor has one score per possible label along its last axis.
    For every token the index of the highest score is looked up in the label
    vocabulary.

    Args:
        plain_data: List of sentence pairs; element 0 of each pair is the list
            of tokens. Sentence ``i`` must correspond to row ``i`` of the tensor.
        likelihood_3d_tensor: Tensor indexed as ``[sentence, token, label]``.
        lab_vocab: List mapping label index to label string. Defaults to the
            notebook-level ``label_vocab``.

    Returns:
        A list with one ``(tokens, labels)`` tuple per row of the tensor.
        ``labels`` holds one predicted label per token of the sentence;
        scores at padded positions past the sentence end are ignored.

    Raises:
        IndexError: If ``plain_data`` has fewer sentences than the tensor has rows.
    """
    if lab_vocab is None:
        # Backward-compatible default for callers that pass two arguments.
        lab_vocab = label_vocab

    # A single argmax over the label axis replaces one call per token.
    best_label_idx = torch.argmax(likelihood_3d_tensor, dim=2).tolist()

    predictions = []
    for i, sentence_label_idx in enumerate(best_label_idx):
        sentence = plain_data[i][0]
        # Only the first len(sentence) positions belong to real tokens.
        # NOTE(review): nothing here prevents index 0 ('<PAD>' in this
        # notebook's vocabulary) from being the highest-scoring label.
        labels = [lab_vocab[idx] for idx in sentence_label_idx[:len(sentence)]]
        predictions.append((sentence, labels))

    return predictions

___________________________________

# Main code for model creation from training data below

### Reading in training data

In [None]:
# Parse the training file into a list of (words, tags) sentence pairs
train_data = read_iob_file(training_data_path)

### Vocab and tensor creation

In [None]:
# Build the vocabularies from the training data only, then encode the
# training sentences as padded index tensors
word_vocab, label_vocab = create_vocabs(train_data)
train_data_matrix, train_label_matrix = conv2matrix(
    train_data, word_vocab, label_vocab
)

### Dividing the training data into batches
The code below is taken from Rob's solutions to assignment 4, but has been adjusted to our data

In [None]:
# Dummy tensor that is batched the same way as the real data below.
# NOTE(review): tmp_feats, num_batches and tmp_feats_batches do not appear to
# be used anywhere else in this notebook — confirm before removing.
tmp_feats = torch.zeros((200, 100))

num_batches = int(len(tmp_feats) / batch_size)

tmp_feats_batches = tmp_feats[:batch_size * num_batches].view(
    num_batches, batch_size, 100
)

# Creating the batches
word_batches, label_batches = create_batches(
    batch_size, train_data_matrix, train_label_matrix
)

### Model creation and adjusting
The code below is taken from Rob's solutions to assignment 4, but has been adjusted to our data

In [None]:
model = TaggerModel(len(word_vocab), len(label_vocab))
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# ignore_index=0 excludes targets with label index 0 from the loss;
# reduction='sum' adds up the loss of all remaining positions in a batch.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

for iteration in range(epochs):
    model.train()
    print(f"Epoch {iteration+1}\n-------------------------------")
    loss_sum = 0

    # loop over batches
    for x, y in zip(word_batches, label_batches):
        # Call the module itself instead of .forward() so that any registered
        # hooks run as well.
        predicted_values = model(x)
        # Collapse the sentence and token axes: one row of label scores per token
        predicted_values = predicted_values.view(batch_size * train_data_matrix.shape[1], -1)

        # calculate loss; the targets are flattened the same way as the predictions
        y = torch.flatten(y)
        loss = loss_function(predicted_values, y)
        loss_sum += loss.item()  # averaged over the batches below

        # update; gradients are reset before every backward pass, so no
        # separate reset at the start of the epoch is needed
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # This is the mean over batches of the summed per-batch loss
    print(f"Average loss after epoch {iteration+1}: {loss_sum/word_batches.shape[0]}\n")

# set to evaluation mode
model.eval()

### Validation of the model on the dev data, following Rob's solution to assignment 4

In [None]:
val_data = read_iob_file(validation_data_path)
val_data_matrix, val_label_matrix = conv2matrix(val_data, word_vocab, label_vocab)
# Inference only: no_grad avoids recording an autograd graph for the whole
# validation set. The module is called directly rather than via .forward().
with torch.no_grad():
    val_predictions = model(val_data_matrix)

In [None]:
labels_val = torch.argmax(val_predictions, 2)
labels_val = torch.flatten(labels_val)  # model predictions
dev_y_flat = torch.flatten(val_label_matrix)  # true labels

# Only positions whose true label index is not 0 are scored. A boolean mask
# replaces the element-by-element Python loop over the flattened tensors.
is_scored = dev_y_flat != 0
acc = (labels_val[is_scored] == dev_y_flat[is_scored]).int().tolist()

# Guard against a validation set without any scored position
accuracy = sum(acc) / len(acc) if acc else 0.0
print(f"Model accuracy on validation set: {accuracy}")

# Running our created model on the test data

### Read in data and convert it to pytorch tensor

In [None]:
# Parse the test file and encode it with the training vocabularies; the
# label matrix is discarded
test_data = read_iob_file(testing_data_path)
test_data_matrix, _ = conv2matrix(
    test_data, word_vocab, label_vocab
)

### Run the test data through the model to get 3-dimensional likelihood distribution of labels, and get predictions from that

In [None]:
# Inference only: no_grad avoids recording an autograd graph for the whole
# test set. The module is called directly rather than via .forward().
with torch.no_grad():
    test_predictions = model(test_data_matrix)
test_pred = get_predictions(test_data, test_predictions)

### Now convert to iob2-format for the output file, and save it as a new file

Assemble it all into a string of the proper format: with 5 columns of [ nr, token, label, "-", and "stephen" (if the token has a label) ]

In [None]:
# Assemble the iob2-format output: a "# text = ..." header per sentence,
# followed by one tab-separated line per token with the predicted label and
# "stephen" in the last column whenever the label is not "O".
output_chunks = []
for sentence, labels in test_pred:
    output_chunks.append("\n# text = " + " ".join(sentence) + "\n")
    for i, (token, label) in enumerate(zip(sentence, labels)):
        steph = "stephen" if label != "O" else "-"
        line = str(i + 1) + "\t" + token + "\t" + label + "\t-\t" + steph + "\n"
        output_chunks.append(line)
output_txt = "".join(output_chunks)

In [None]:
# Lastly, write the assembled string to the output file (UTF-8)
with open(output_predictions_path_and_name, mode="w", encoding="utf-8") as file:
    file.write(output_txt)