# Named Entity Recognition Task

In [None]:
# Mount Google Drive at /content/gdrive; the data archive is read from below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
from zipfile import ZipFile
from gensim.models import Word2Vec
from spacy.tokenizer import Tokenizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pprint
import spacy
import re
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

## Read Data

In [None]:
%%time 
# Location of the assignment archive on the mounted Google Drive.
path = '/content/gdrive/My Drive/COMP5046/Assignment2'
file_name = '2020-comp5046-a2.zip'

# Unpack every member of the archive into the current working directory.
archive_path = '/'.join([path, file_name])
with ZipFile(archive_path, mode='r') as archive:
    archive.extractall()

CPU times: user 5.89 ms, sys: 2.53 ms, total: 8.42 ms
Wall time: 3.1 s


In [None]:
df_train = pd.read_csv(os.getcwd() + '/train.csv') # training split (3,000 rows in the recorded run)
df_valid = pd.read_csv(os.getcwd() + '/val.csv') # validation split (700 rows in the recorded run)
df_test = pd.read_csv(os.getcwd() + '/test.csv') # test split (3,684 rows in the recorded run; NER column is empty)

In [None]:
# Print the training frame's shape, then display the first rows of the (unlabelled) test frame.
print(df_train.shape)
df_test.head()

(3000, 2)


Unnamed: 0,Sentence,NER
0,-docstart-,
1,"soccer - japan get lucky win , china in surpri...",
2,nadim ladki,
3,"al-ain , united arab emirates 1996-12-06",
4,japan began the defence of their asian cup tit...,


In [None]:
def get_sentences_and_labels(df, get_labels = True): 
    """
    Tokenise the sentences (and optionally the tag strings) of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sentence' column of space-separated tokens. When
        `get_labels` is truthy it must also contain an 'NER' column of
        space-separated tags.
    get_labels : bool
        Set to False for frames without labels (e.g. the test split, whose
        'NER' values are not strings and cannot be split).

    Returns
    -------
    (sentences, labels) : tuple of lists
        `sentences[i]` is the token list of row i. `labels[i]` is the tag
        list of row i, or `labels` is an empty list when `get_labels` is falsy.
    """
    # Split on a single space (not arbitrary whitespace) so that token
    # positions stay aligned with the space-separated tag strings.
    sentences = [sent.split(' ') for sent in df['Sentence'].tolist()]

    labels = []
    if get_labels:
        labels = [sent_labels.split(' ') for sent_labels in df['NER'].tolist()]

    return sentences, labels

In [None]:
%%time 
# Token lists (X_*) and tag lists (y_*) per split; the test split has no labels, so they are skipped.
X_train, y_train = get_sentences_and_labels(df_train)
X_valid, y_valid = get_sentences_and_labels(df_valid)
X_test, _ = get_sentences_and_labels(df_test, get_labels = False)

CPU times: user 10.3 ms, sys: 2.84 ms, total: 13.2 ms
Wall time: 13.6 ms


## Create Features

In [None]:
### get unique words and assign an index to each word 
%%time

# Sentences of all three splits, in train -> validation -> test order.
all_sentences = X_train + X_valid + X_test

# Vocabulary: each distinct token receives the next free integer id in first-seen order.
word_to_idx = {}
for tokens in all_sentences:
    for token in tokens:
        # setdefault leaves an existing id untouched; len() is evaluated before insertion
        word_to_idx.setdefault(token, len(word_to_idx))
print('Vocab size:', len(word_to_idx))

Vocab size: 13972
CPU times: user 14.8 ms, sys: 0 ns, total: 14.8 ms
Wall time: 15.2 ms


### Encode named entity tags

In [None]:
### get unique named entities and assign an index to each tag
%%time 
# Sentinel states used by the CRF; they always occupy ids 0 and 1.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_idx = {START_TAG:0, STOP_TAG:1}

# Remaining ids are handed out in first-seen order over the labelled splits.
for tag_seq in y_train + y_valid:
    for label in tag_seq:
        tag_to_idx.setdefault(label, len(tag_to_idx))

print('Named entities: ', tag_to_idx)

Named entities:  {'<START>': 0, '<STOP>': 1, 'O': 2, 'I-ORG': 3, 'I-MISC': 4, 'I-PER': 5, 'I-LOC': 6}
CPU times: user 5.47 ms, sys: 318 µs, total: 5.79 ms
Wall time: 6.44 ms


### Encode POS tags

In [None]:
### get unique pos tags and assign an index to each tag 
%%time 
# POS-tag vocabulary. Every token is tagged on its own (as a one-element
# sentence), so the tagger sees no surrounding context.
postag_to_idx = {}
for tokens in all_sentences:
    for token in tokens:
        pos_label = nltk.pos_tag([token])[0][1]
        postag_to_idx.setdefault(pos_label, len(postag_to_idx))
print('Number of POS tags: ', len(postag_to_idx))

Number of POS tags:  34
CPU times: user 7.49 s, sys: 470 ms, total: 7.96 s
Wall time: 7.97 s


### Encode dependency tags

In [None]:
# Fixed mapping from dependency-relation label to integer id (44 entries).
# NOTE(review): the list is hard-coded; a parser label missing from it would raise
# KeyError in get_dependency_info — confirm it covers every label the loaded model emits.
dep_to_idx  = {'ROOT': 0, 'nsubj': 1, 'amod': 2, 'dobj': 3, 'aux': 4, 'acl': 5, 
               'punct': 6, 'compound': 7, 'npadvmod': 8, 'nummod': 9, 'det': 10, 
               'prep': 11, 'pobj': 12, 'ccomp': 13, 'advcl': 14, 'mark': 15, 
               'nsubjpass': 16, 'auxpass': 17, 'poss': 18, 'case': 19, 'acomp': 20, 
               'neg': 21, 'cc': 22, 'conj': 23, 'agent': 24, 'advmod': 25, 'attr': 26, 
               'appos': 27, 'xcomp': 28,'expl': 29, 'pcomp': 30, 'dep': 31, 
               'relcl': 32, 'csubj': 33, 'nmod': 34, 'prt': 35, 'quantmod': 36, 
               'parataxis': 37, 'oprd': 38, 'dative': 39, 'intj': 40, 'preconj': 41, 
               'predet': 42, 'meta': 43}

In [None]:
# Load the small English spaCy pipeline and replace its tokenizer with a bare Tokenizer
# built only from the vocab (no prefix/suffix/infix rules).
# NOTE(review): presumably this keeps spaCy tokens aligned with the space-split words — confirm.
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = Tokenizer(nlp.vocab)

def get_dependency_info(sentence, dep_to_idx):
    """
    Parse one sentence and return the id of each token's dependency label.

    `sentence` is a list of words; it is joined with single spaces and run
    through the module-level `nlp` pipeline. Each token's `dep_` label is
    looked up in `dep_to_idx`; a label missing from the mapping raises KeyError.
    """
    parsed = nlp(' '.join(sentence))
    return [dep_to_idx[token.dep_] for token in parsed]

### Encode suffix

In [None]:
### get unique suffixes and assign an index to each suffix 

# Number of trailing characters treated as a word's suffix.
SUFFIX_LEN = 3

# Suffix vocabulary: ids are assigned in first-seen order; words shorter than
# SUFFIX_LEN contribute the whole word.
suffix_to_idx = {}
for tokens in all_sentences:
    for token in tokens:
        tail = token[-SUFFIX_LEN:]
        suffix_to_idx.setdefault(tail, len(suffix_to_idx))
print('Number of unique suffixes: ', len(suffix_to_idx))

Number of unique suffixes:  3494


### Collect TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
### accumulate documents
# A sentence whose first token is '-docstart-' opens a new document. The list
# starts with an empty placeholder document (pushed when the first marker is
# seen), which is sliced off at the end; ids recorded in sent_to_doc_id refer
# to positions in the final, sliced list.
# NOTE(review): sent_to_doc_id is keyed by sentence text, so a sentence that
# occurs in several documents keeps only the id of its last occurrence.
all_documents = []
doc = []
sent_to_doc_id = {}

for sent in all_sentences:
    joined = ' '.join(sent)
    if sent[0] == '-docstart-':
        all_documents.append(doc)
        doc = []
    doc.append(joined)
    sent_to_doc_id[joined] = len(all_documents) - 1

all_documents.append(doc)
all_documents = [' '.join(doc_sentences) for doc_sentences in all_documents[1:]]

In [None]:
# Sanity check: one entry per '-docstart-' delimited document.
print('Number of documents: ', len(all_documents))

Number of documents:  454


In [None]:
def white_space_tokenizer(s):
    """Split `s` on single spaces only, so consecutive spaces yield empty tokens."""
    tokens = s.split(' ')
    return tokens

# Fit TF-IDF over whole documents, tokenising with the same single-space split used for the sentences.
corpus = all_documents
vectorizer = TfidfVectorizer(tokenizer = white_space_tokenizer)
tfidf_matrix = vectorizer.fit_transform(corpus)
# Densify so individual (document, term) cells can be indexed directly later on.
tfidf_matrix = tfidf_matrix.toarray()
tfidf_matrix.shape #shape = (number of documents, vocab_size)

(454, 13972)

In [None]:
# Column order of tfidf_matrix. `get_feature_names()` was removed from newer
# scikit-learn releases in favour of `get_feature_names_out()`, so prefer the
# new accessor and fall back to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_arr = vectorizer.get_feature_names_out()
else:
    features_arr = vectorizer.get_feature_names()

# term -> column index in tfidf_matrix
features_to_idx = {n: i for i, n in enumerate(features_arr)}
# Spot check: display the column index of one term.
features_to_idx['fun']

6531

In [None]:
def get_tfidf_features(all_sentences, min_i, max_i):
    """
    Look up the TF-IDF weight of every word of sentences min_i .. max_i - 1.

    Each sentence is mapped to its document through the module-level
    `sent_to_doc_id` (keyed by the space-joined sentence text); each word is
    mapped to a column of `tfidf_matrix` through `features_to_idx`.

    Returns a list with one list of weights per sentence, aligned with the
    sentence's words.
    """
    tfidf_features = []

    for position in range(min_i, max_i):
        words = all_sentences[position]
        # row of the document this sentence belongs to
        doc_row = tfidf_matrix[sent_to_doc_id[' '.join(words)]]
        tfidf_features.append([doc_row[features_to_idx[w]] for w in words])

    return tfidf_features

In [None]:
# Report the number of sentences in each split.
print('Number of training sentences: ', len(X_train))
print('Number of validation sentences: ', len(X_valid))
print('Number of testing sentences: ', len(X_test))

Number of training sentences:  3000
Number of validation sentences:  700
Number of testing sentences:  3684


In [None]:
%%time 
# all_sentences is the concatenation train + validation + test, so the slice
# boundaries follow from the split sizes instead of being hard-coded.
n_train = len(X_train)
n_valid = len(X_valid)
n_test = len(X_test)

train_tfidf = get_tfidf_features(all_sentences, 0, n_train)
valid_tfidf = get_tfidf_features(all_sentences, n_train, n_train + n_valid)
test_tfidf = get_tfidf_features(all_sentences, n_train + n_valid, n_train + n_valid + n_test)

CPU times: user 52.1 ms, sys: 3.62 ms, total: 55.7 ms
Wall time: 60.3 ms


### Character-based embeddings

In [None]:
!pip install chars2vec -q

[K     |████████████████████████████████| 8.1MB 34.1MB/s 
[?25h  Building wheel for chars2vec (setup.py) ... [?25l[?25hdone


In [None]:
%%time
import chars2vec
# Load the pre-trained 'eng_50' chars2vec model.
c2v_model = chars2vec.load_model('eng_50')

# Keys come out in insertion order, which is also the order in which ids were
# assigned, so row i of the result belongs to the word with id i.
words = list(word_to_idx.keys())
# Create word embeddings
char_based_embeddings = c2v_model.vectorize_words(words)
print(char_based_embeddings.shape)

Using TensorFlow backend.


(13972, 50)
CPU times: user 12.6 s, sys: 2.16 s, total: 14.8 s
Wall time: 28.5 s


In [None]:
def get_char_embeddings(sentences, word_index = None, embeddings = None):
    """
    Collect the character-based embedding of every word of every sentence.

    Parameters
    ----------
    sentences : list of list of str
        Tokenised sentences.
    word_index : dict, optional
        word -> row index into `embeddings`. Defaults to the module-level
        `word_to_idx`.
    embeddings : array-like, optional
        Embedding table indexed by row. Defaults to the module-level
        `char_based_embeddings`.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per sentence; entry i is the list of
        embedding vectors of sentence i. The object array is built explicitly
        because sentences differ in length, and recent NumPy versions refuse
        to create an array from such ragged nested lists implicitly.

    Raises
    ------
    KeyError
        If a word is missing from `word_index`.
    """
    if word_index is None:
        word_index = word_to_idx
    if embeddings is None:
        embeddings = char_based_embeddings

    char_embedded_sentences = np.empty(len(sentences), dtype=object)
    for position, sent in enumerate(sentences):
        # assigning through a scalar index stores the list itself as one element
        char_embedded_sentences[position] = [embeddings[word_index[word]] for word in sent]

    return char_embedded_sentences

In [None]:
%%time 
# Per-split character-based embeddings (one entry per sentence).
train_char_embeddings = get_char_embeddings(X_train)
valid_char_embeddings = get_char_embeddings(X_valid)
test_char_embeddings = get_char_embeddings(X_test)

CPU times: user 34 ms, sys: 3.84 ms, total: 37.8 ms
Wall time: 38.6 ms


### Collect other features (pos tag, dependency tag, suffix)

In [None]:
def to_index(data, to_idx, feature = None, suffix_len = None):
    """
    Convert every item of every sequence in `data` to an integer id.

    Parameters
    ----------
    data : list of list of str
        Tokenised sentences (or tag sequences for the default mode).
    to_idx : dict
        Lookup table matching the requested `feature`.
    feature : {'postag', 'dependency', 'suffix'} or None
        'postag'     -> id of the POS tag NLTK assigns to each word in isolation.
        'dependency' -> ids returned by `get_dependency_info` for the sentence.
        'suffix'     -> id of the last `suffix_len` characters of each word.
        anything else -> id of the item itself (words or tags).
    suffix_len : int, optional
        Suffix length for the 'suffix' mode. Defaults to the module-level
        `SUFFIX_LEN`, i.e. the length used to build the suffix vocabulary, so
        the two cannot drift apart.

    Returns
    -------
    list of list of int
        Same nesting as `data`.

    Raises
    ------
    KeyError
        If a derived key is missing from `to_idx`.
    """
    if feature == 'postag':
        return [[to_idx[nltk.pos_tag([w])[0][1]] for w in sent] for sent in data]

    if feature == 'dependency':
        return [get_dependency_info(sent, to_idx) for sent in data]

    if feature == 'suffix':
        if suffix_len is None:
            suffix_len = SUFFIX_LEN
        return [[to_idx[w[-suffix_len:]] for w in sent] for sent in data]

    return [[to_idx[w] for w in sent] for sent in data]

In [None]:
%%time 
# Training split: word, POS, dependency and suffix ids per token.
train_sentence_index =  to_index(X_train, word_to_idx)
train_postag_index = to_index(X_train, postag_to_idx, feature = 'postag')
train_dep_index = to_index(X_train, dep_to_idx, feature = 'dependency')
train_suffix_index = to_index(X_train, suffix_to_idx, feature = 'suffix')

# Gold tag ids per token.
train_output_index = to_index(y_train, tag_to_idx)

CPU times: user 23.7 s, sys: 283 ms, total: 23.9 s
Wall time: 24 s


In [None]:
%%time 
# Validation split: word, POS, dependency and suffix ids per token.
valid_sentence_index =  to_index(X_valid, word_to_idx)
valid_postag_index = to_index(X_valid, postag_to_idx, feature = 'postag')
valid_dep_index = to_index(X_valid, dep_to_idx, feature = 'dependency')
valid_suffix_index = to_index(X_valid, suffix_to_idx, feature = 'suffix')

# Gold tag ids per token.
valid_output_index = to_index(y_valid, tag_to_idx)

CPU times: user 5.09 s, sys: 64.6 ms, total: 5.15 s
Wall time: 5.16 s


In [None]:
%%time 
# Test split: word, POS, dependency and suffix ids per token (no gold tags available).
test_sentence_index =  to_index(X_test, word_to_idx)
test_postag_index = to_index(X_test, postag_to_idx, feature = 'postag')
test_dep_index = to_index(X_test, dep_to_idx, feature = 'dependency')
test_suffix_index = to_index(X_test, suffix_to_idx, feature = 'suffix')

CPU times: user 27.9 s, sys: 242 ms, total: 28.1 s
Wall time: 28.1 s


### Pack all features

In [None]:
def pack_features(sentence_index, postag_index, dep_index, suffix_index, tfidf_seq, char_embeds):
    """
    Bundle the per-split feature collections into one dictionary.

    Keys, in order: 'sentence', 'pos', 'dep', 'suffix', 'tfidf', 'char embed'.
    The values are stored as given (no copies are made).
    """
    keys = ('sentence', 'pos', 'dep', 'suffix', 'tfidf', 'char embed')
    values = (sentence_index, postag_index, dep_index, suffix_index, tfidf_seq, char_embeds)
    return dict(zip(keys, values))

In [None]:
%%time 
# One feature dictionary per split; each value is indexed by sentence position.
train_input_dict = pack_features(train_sentence_index, train_postag_index, train_dep_index,
                                 train_suffix_index, train_tfidf, train_char_embeddings)
valid_input_dict = pack_features(valid_sentence_index, valid_postag_index, valid_dep_index,
                                 valid_suffix_index, valid_tfidf, valid_char_embeddings)
test_input_dict = pack_features(test_sentence_index, test_postag_index, test_dep_index,
                                 test_suffix_index, test_tfidf, test_char_embeddings)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 11.9 µs


## Create Embedding Matrix

In [None]:
import gensim.downloader as api
# Pre-trained vectors fetched through gensim's downloader; used below to seed the word-embedding layer.
word_emb_model = api.load("glove-twitter-100") 



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
### The embedding matrix will be the initial weights of the embedding layer in the network.
EMBEDDING_DIM = 100

# Row i holds the pre-trained vector of the word whose id is i. Rows start as
# zeros, so words missing from the pre-trained model keep an all-zero vector.
embedding_matrix = np.zeros((len(word_to_idx), EMBEDDING_DIM), dtype=np.float32)
for word, idx in word_to_idx.items():
    try:
        embedding_matrix[idx] = word_emb_model[word]
    except KeyError:
        # Out-of-vocabulary word: leave the zero row in place. Only the lookup
        # miss is tolerated; any other error should surface.
        pass
print('Shape of embedding matrix: ', embedding_matrix.shape) #shape = (vocab_size, embedding_dim)

Shape of embedding matrix:  (13972, 100)


## Build NER Model

In [None]:
### define some global variables 
# Sizes of the lookup tables built above.
VOCAB_SIZE = len(word_to_idx)
DEP_SIZE = len(dep_to_idx)
POS_SIZE = len(postag_to_idx)
SUFFIX_SIZE = len(suffix_to_idx)


# Width of each feature representation.
WORD_EMB_DIM = EMBEDDING_DIM
CHAR_EMB_DIM = 50
DEP_EMB_DIM = 20
POS_EMB_DIM = 20
SUFFIX_EMB_DIM = 20
TFIDF_EMB_DIM = 50

# LSTM input width: only the word, POS and dependency embeddings are concatenated.
FINAL_EMB_DIM = WORD_EMB_DIM + POS_EMB_DIM + DEP_EMB_DIM

# Number of tag ids, including the <START> and <STOP> entries of tag_to_idx.
OUTPUT_DIM = len(tag_to_idx)

# Number of stacked LSTM layers.
NUM_LAYERS = 2

In [None]:
def argmax(vector):
    """
    Return, as a Python int, the column index of the largest entry of a
    single-row 2-D tensor.
    """
    return torch.max(vector, 1)[1].item()

def log_sum_exp(vector): 
    """
    Numerically stable log(sum(exp(vector))) for a single-row 2-D tensor.

    The row maximum is subtracted before exponentiating and added back
    afterwards, which avoids overflow for large scores.
    """
    peak = vector[0, argmax(vector)]
    # broadcast the maximum to the shape of the row: (1, number of columns)
    shifted = vector - peak.view(1, -1).expand(1, vector.size()[1])

    return peak + torch.log(torch.sum(torch.exp(shifted)))

In [None]:
class BiLSTM_CRF(nn.Module):
    """
    Bidirectional LSTM tagger with optional self-attention and a CRF output layer.

    Each sentence is processed on its own (batch size 1). The LSTM input is the
    concatenation of word, POS-tag and dependency-label embeddings; suffix,
    tf-idf and character features present in the input dictionaries are not
    used by this model.

    Parameters
    ----------
    hidden_dim : int
        Total width of the BiLSTM output (each direction gets hidden_dim // 2,
        so the value must be even).
    tag_to_idx : dict
        Tag -> id mapping; must contain START_TAG and STOP_TAG.
    attn_method : str or None
        One of the ATTN_TYPE_* constants, or None to disable attention.
    """

    def __init__(self, hidden_dim, tag_to_idx, attn_method = None): 
        super(BiLSTM_CRF, self).__init__() 

        # some class variables 
        self.hidden_dim = hidden_dim
        self.tag_to_idx = tag_to_idx 
        self.attn_method = attn_method
        
        # initialise embedding layers 
        self.word_embedding = nn.Embedding(VOCAB_SIZE, WORD_EMB_DIM)
        self.word_embedding.weight.data.copy_(torch.from_numpy(embedding_matrix)) #set initial weights to glove weights
        self.pos_embedding = nn.Embedding(POS_SIZE, POS_EMB_DIM)
        self.dep_embedding = nn.Embedding(DEP_SIZE, DEP_EMB_DIM)

        # Feed embedding to LSTM layers 
        self.lstm = nn.LSTM(FINAL_EMB_DIM, hidden_dim // 2, num_layers = NUM_LAYERS,
                            bidirectional = True, dropout = 0.40)

        # Map output of lstm to tag space. With attention the projection input is
        # [context vector ; hidden state], hence twice the LSTM output width.
        self.hidden_to_tag_attn = nn.Linear(2*hidden_dim, OUTPUT_DIM)

        if attn_method is None:
            # Projection used by the no-attention branch of get_lstm_features.
            # It was referenced there but never created, so the default
            # configuration failed with AttributeError.
            self.hidden_to_tag = nn.Linear(hidden_dim, OUTPUT_DIM)
        elif attn_method == ATTN_TYPE_GENERAL:
            # Weight matrix of the 'general' attention score; it was referenced
            # in calculate_attention but never created.
            self.Whc = nn.Linear(hidden_dim, hidden_dim)

        #initialise transition matrix as part of model parameters.
        #Entry (i,j) = score transitioning to i from j. 
        self.transitions = nn.Parameter(torch.randn(OUTPUT_DIM, OUTPUT_DIM))

        #enforce some transition rules 
        self.transitions.data[tag_to_idx[START_TAG], :] = -10000 # we never transfer to start tag
        self.transitions.data[:, tag_to_idx[STOP_TAG]] = -10000 # we never transfer from stop tag 

        self.hidden = self.init_hidden() #randomly initialise hidden state 

    def init_hidden(self): 
        """Return a freshly sampled random (h0, c0) pair for the BiLSTM."""
        ### shape = (num of dir * num of layers, batch size, hidden_dim // 2)
        return (torch.randn(2 * NUM_LAYERS, 1, self.hidden_dim // 2).to(device),
                torch.randn(2 * NUM_LAYERS, 1, self.hidden_dim // 2).to(device))
        
    def init_hidden_tfidf(self): 
        """Return a random (h0, c0) pair sized for a single-layer tf-idf LSTM (no such layer is currently defined)."""
        return (torch.randn(1, 1, TFIDF_EMB_DIM).to(device),
                torch.randn(1, 1, TFIDF_EMB_DIM).to(device))

    def forward_alg(self, features): 
        """
        Forward algorithm: log of the summed exponentiated scores of all tag paths.

        `features` holds the emission scores, shape (sentence_len, OUTPUT_DIM).
        Returns a scalar tensor (the log partition value).
        """
        # initial = initial scores of transitioning to a hidden state (tags)
        initial = torch.full((1, OUTPUT_DIM), -10000.).to(device) #shape = (1, OUTPUT_DIM)
        initial[0][self.tag_to_idx[START_TAG]] = 0 #give start tag all the score 

        forward_var = initial 

        for f in features:  #for each feature corresponding to each word in the sentence
            alphas_t = [] #holds the log_sum_exps for this step 
            for next_tag in range(OUTPUT_DIM): 
                
                # emission score of next_tag, broadcast over all previous tags
                emission_score = f[next_tag].view(1,-1).expand(1, OUTPUT_DIM) # shape = (1, OUTPUT_DIM)
                # transition score = score for transitioning to next tag from i'th tag.  
                transition_score = self.transitions[next_tag].view(1, -1) #shape = (1, OUTPUT_DIM)
                next_tag_var = forward_var + transition_score + emission_score #shape = (1, OUTPUT_DIM)
                alphas_t.append(log_sum_exp(next_tag_var).view(1)) 
            
            forward_var = torch.cat(alphas_t).view(1, -1) # concatenate tensors in list: shape = (1, OUTPUT_DIM)
        
        # add stop tag and calculate log sum of exps again 
        terminal_var = forward_var + self.transitions[self.tag_to_idx[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def calculate_attention(self, encoder_out, final_hidden):
        """
        Attend over all LSTM states from the viewpoint of one state.

        encoder_out  : (1, seq_len, hidden_dim) — all LSTM outputs.
        final_hidden : (1, 1, hidden_dim)       — the state acting as query.

        Returns the context vector concatenated with the query state,
        shape (1, 2 * hidden_dim).

        Raises ValueError if `self.attn_method` is not one of the ATTN_TYPE_*
        constants.
        """
        hidden = final_hidden.squeeze(0) #shape = (1, hidden_dim)
        
        #attn_score should have shape (1, seq_len)
        if self.attn_method == ATTN_TYPE_DOT_PRODUCT: 
            attn_score = torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2)
        elif self.attn_method == ATTN_TYPE_SCALE_DOT_PRODUCT: 
            n = hidden.shape[1]
            attn_score = 1/np.sqrt(n) * torch.bmm(encoder_out, hidden.unsqueeze(2)).squeeze(2) 
        elif self.attn_method == ATTN_TYPE_GENERAL: 
            #multiply hidden state by weight matrix
            attn_score = torch.bmm(encoder_out, self.Whc(hidden).unsqueeze(2)).squeeze(2) 
        elif self.attn_method == ATTN_TYPE_CONTENT_BASED: 
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            attn_score = cos(encoder_out[0], hidden).view(1, encoder_out.shape[1])
        else:
            # Previously an unrecognised method fell through every branch and
            # failed later on an unbound local; fail with a clear message instead.
            raise ValueError("Unknown attention method: %r" % (self.attn_method,))

        soft_attn_weights = F.softmax(attn_score, dim = 1) #shape = (1, seq_len)
        attn_output = torch.bmm(encoder_out.transpose(1,2), soft_attn_weights.unsqueeze(2)).squeeze(2) #shape = (1, hidden_dim)
        #concatenate attention output with current hidden state. shape = (1, hidden_dim * 2)
        concat_out = torch.cat((attn_output, hidden), 1)

        return concat_out

    def get_lstm_features(self, input): 
        """
        Compute the emission scores of one sentence.

        `input` is a dict with 1-D long tensors under 'sentence', 'pos' and
        'dep' (one id per token). Returns a tensor of shape
        (seq_len, OUTPUT_DIM).
        """
        # gather features 
        sentence_idxs = input['sentence']
        pos_idxs = input['pos']
        dep_idxs = input['dep']

        #initialise hidden states (re-sampled randomly for every sentence)
        self.hidden = self.init_hidden() 

        # get embeddings, each reshaped to (seq_len, 1, emb_dim)
        word_embeds = self.word_embedding(sentence_idxs).view(len(sentence_idxs), 1, -1)
        pos_embeds = self.pos_embedding(pos_idxs).view(len(pos_idxs), 1, -1)
        dep_embeds = self.dep_embedding(dep_idxs).view(len(dep_idxs), 1, -1)

        # concatenate embeddings: shape = (seq_len, 1, FINAL_EMB_DIM)
        final_embeds = torch.cat([word_embeds, pos_embeds, dep_embeds], dim = 2)

        #feed through lstm layer 
        lstm_out, self.hidden = self.lstm(final_embeds, self.hidden) #lstm_out shape = (seq_len, 1, hidden_dim)

        if self.attn_method is not None: #use attention

            context_vectors = [] #stores attention outputs for each hidden state at timestep t
            encoder_states = lstm_out.permute(1, 0, 2) #shape = (1, seq_len, hidden_dim)

            for timestep in range(0, len(sentence_idxs)): 
                current_state = lstm_out[timestep,:,:].unsqueeze(0) #state at timestep t. shape = (1, 1, hidden_dim)
                attn_out = self.calculate_attention(encoder_states, current_state) #calculate attn for this hidden state
                context_vectors.append(attn_out)
            
            #stack the attention outputs of all timesteps 
            final_out = torch.cat(context_vectors, dim = 0) #shape = (seq_len, hidden_dim * 2)
            final_out = final_out.view(len(sentence_idxs), self.hidden_dim * 2)
            lstm_features = self.hidden_to_tag_attn(final_out)
        
        else: 
            final_out = lstm_out.view(len(sentence_idxs), self.hidden_dim) #reshape to (seq_len, hidden_dim)
            lstm_features = self.hidden_to_tag(final_out) #shape = (seq_len, OUTPUT_DIM)
     
        return lstm_features

    def score_sentence(self, features, tags): 
        """
        Score of the gold tag path: sum of its transition and emission scores.

        `features` are the emission scores (seq_len, OUTPUT_DIM); `tags` is a
        1-D long tensor of gold tag ids. Returns a tensor of shape (1,).
        """
        score = torch.zeros(1).to(device)
        # prepend the start tag so tags[i] is the predecessor of tags[i + 1]
        tags = torch.cat([torch.tensor([self.tag_to_idx[START_TAG]], dtype=torch.long).to(device), tags])
        
        for i, f in enumerate(features): #iterate through sentence features 
            #score = transition score + emission score 
            score += self.transitions[tags[i + 1], tags[i]] + f[tags[i + 1]]

        #consider score from transition to last tag to stop tag as well. 
        score +=  self.transitions[self.tag_to_idx[STOP_TAG], tags[-1]]
        
        return score

    def viterbi(self, features): 
        """
        Viterbi decoding: find the highest-scoring tag path.

        Returns (path_score, best_path) where best_path is a list with one
        tag id per token.
        """
        #initialise viterbi variables 
        initial_vvars = torch.full((1, OUTPUT_DIM), -10000.).to(device)
        initial_vvars[0][self.tag_to_idx[START_TAG]] = 0 

        forward_var = initial_vvars 
        backpointers = []

        for feat in features: 
            #holds the backpointers and viterbi variables for this time step
            bp_t, viterbivars_t = [], []

            for next_tag in range(OUTPUT_DIM): 

                # emission is the same for every predecessor, so it is left
                # out of the argmax and added afterwards
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bp_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))

            # Add emission scores. viterbivars_t and feat both have length OUTPUT_DIM
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1) #shape = (1, OUTPUT_DIM)
            backpointers.append(bp_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_idx[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Backtrack to decode the best path.
        best_path = [best_tag_id]
        for bp_t in reversed(backpointers):
            best_tag_id = bp_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag 
        start = best_path.pop()
        assert start == self.tag_to_idx[START_TAG]  # Sanity check
        best_path.reverse()

        return path_score, best_path


    def neg_log_likelihood(self, input, tags):
        """
        Training loss: log partition value minus the score of the gold path.

        `input` is the feature dict of one sentence, `tags` the gold tag ids
        as a 1-D long tensor.
        """
        features = self.get_lstm_features(input)
        forward_score = self.forward_alg(features)
        gold_score = self.score_sentence(features, tags)

        return forward_score - gold_score

    def forward(self, input):  
        """Predict tags for one sentence; returns (path_score, list of tag ids)."""
        # Get the emission scores from the BiLSTM
        lstm_features = self.get_lstm_features(input)

        # Find the best path, given the features.
        score, tag_sequence = self.viterbi(lstm_features)

        return score, tag_sequence

## Train Model

In [None]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The BiLSTM gives each direction HIDDEN_DIM // 2 units.
HIDDEN_DIM = 100 #has to be even number 
EPOCHS = 25

# attention types 
ATTN_TYPE_DOT_PRODUCT = "dot product"
ATTN_TYPE_SCALE_DOT_PRODUCT = "scaled dot product"
ATTN_TYPE_GENERAL = 'general'
ATTN_TYPE_CONTENT_BASED = 'cosine'

# Model with dot-product attention, trained with plain SGD plus L2 weight decay.
model = BiLSTM_CRF(HIDDEN_DIM, tag_to_idx, ATTN_TYPE_DOT_PRODUCT).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
### load trained model 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
# NOTE: this unpickles a whole module object; only load files you trust.
model = torch.load('final_model.pt', map_location=device)
model = model.to(device)
# The optimizer created earlier still points at the parameters of the model
# that was just replaced; rebind it so further training updates the loaded model.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
model

In [None]:
def unpack_features(index, input_dict): 
    """
    Build the model input for the sentence at position `index`.

    Only the 'sentence', 'pos' and 'dep' entries of `input_dict` are
    converted (to long tensors on the module-level `device`); the 'suffix',
    'tfidf' and 'char embed' entries are not forwarded to the model.
    """
    model_input = {}
    for key in ('sentence', 'pos', 'dep'):
        model_input[key] = torch.tensor(input_dict[key][index], dtype=torch.long).to(device)

    return model_input

In [None]:
def cal_acc(model, input_dict, output_index):
    """
    Run the model over every sentence and compute token-level accuracy.

    Parameters
    ----------
    model : callable
        Called with one unpacked feature dict; must return a tuple whose
        second element is the list of predicted tag ids.
    input_dict : dict
        Packed features; its 'sentence' entry determines the number of sentences.
    output_index : list of list of int
        Gold tag ids per sentence.

    Returns
    -------
    (ground_truth, predicted, accuracy)
        Flat lists of gold and predicted tag ids over all sentences, and the
        fraction of matching positions (0.0 when nothing was predicted).
    """
    correct = 0
    ground_truth, predicted = [], []

    # Pure inference: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        for i in range(0, len(input_dict['sentence'])): 

            features = unpack_features(i, input_dict)
            predictions = model(features)[1] #predictions are 2nd element of tuple 

            predicted += predictions
            ground_truth += output_index[i]

            correct += sum(1 for j in range(0, len(predictions))
                           if predictions[j] == output_index[i][j])

    # guard against an empty evaluation set
    accuracy = correct/len(predicted) if predicted else 0.0

    return ground_truth, predicted, accuracy

In [None]:
import datetime

# Per-epoch history (losses are sums over sentences, not averages).
train_losses, valid_losses = [], []
train_accs, valid_accs = [], []

for epoch in range(EPOCHS):  

    time1 = datetime.datetime.now()
    train_loss = 0

    # ---- training pass: one parameter update per sentence ----
    model.train()
    for i in range(0, len(train_sentence_index)):
    
        tags_index = train_output_index[i]

        train_input = unpack_features(i, train_input_dict)
        model.zero_grad()
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
 
        loss = model.neg_log_likelihood(train_input, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- evaluation pass: no gradients are needed, so autograd is switched off ----
    model.eval()
    with torch.no_grad():
        train_acc = cal_acc(model, train_input_dict, train_output_index)[2]
        val_acc = cal_acc(model, valid_input_dict, valid_output_index)[2]

        val_loss = 0
        for i in range(0, len(valid_sentence_index)):
            tags_index = valid_output_index[i]

            valid_input = unpack_features(i, valid_input_dict)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(valid_input, targets)
            val_loss += loss.item()

    time2 = datetime.datetime.now()

    #store losses and accuracies 
    train_losses.append(train_loss)
    valid_losses.append(val_loss)
    train_accs.append(train_acc)
    valid_accs.append(val_acc)

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))


Epoch:1, Training loss: 11549.15, train acc: 0.9212, val loss: 1589.41, val acc: 0.9222, time: 239.06s
Epoch:2, Training loss: 6368.89, train acc: 0.9376, val loss: 1394.02, val acc: 0.9285, time: 238.42s
Epoch:3, Training loss: 5013.22, train acc: 0.9520, val loss: 1188.85, val acc: 0.9412, time: 239.57s
Epoch:4, Training loss: 3971.94, train acc: 0.9544, val loss: 1256.31, val acc: 0.9423, time: 243.25s
Epoch:5, Training loss: 3314.97, train acc: 0.9623, val loss: 1192.20, val acc: 0.9504, time: 246.47s
Epoch:6, Training loss: 2724.22, train acc: 0.9685, val loss: 1169.89, val acc: 0.9512, time: 242.83s
Epoch:7, Training loss: 2322.53, train acc: 0.9732, val loss: 1176.53, val acc: 0.9508, time: 244.92s
Epoch:8, Training loss: 1918.65, train acc: 0.9749, val loss: 1266.55, val acc: 0.9513, time: 244.69s
Epoch:9, Training loss: 1621.39, train acc: 0.9798, val loss: 1216.86, val acc: 0.9533, time: 244.30s
Epoch:10, Training loss: 1367.04, train acc: 0.9786, val loss: 1329.86, val acc: 

In [None]:
### save model 
# Serialises the whole module object, not just a state dict.
# NOTE(review): the load cell reads 'final_model.pt' from the working directory,
# which is a different path from the one written here — confirm which file is intended.
torch.save(model, '/content/gdrive/My Drive/COMP5046/model.pt')

  "type " + obj.__name__ + ". It won't be checked "


## Evaluation on Validation Set

In [None]:
%%time 
### evaluating the model on the validation set. 
# cal_acc returns (flat gold tag ids, flat predicted tag ids, accuracy); the accuracy is discarded here.
y_true, y_pred, _ =  cal_acc(model, valid_input_dict, valid_output_index)

def decode_output(output_list):
    """
    Translate a sequence of tag ids back to tag strings using the inverse of
    the module-level `tag_to_idx`; an unknown id raises KeyError.
    """
    inverse = dict(zip(tag_to_idx.values(), tag_to_idx.keys()))
    return list(map(inverse.__getitem__, output_list))

# Convert tag ids back to tag strings for reporting; both lists must have equal length.
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)
print(len(y_true_decode), len(y_pred_decode))

7556 7556
CPU times: user 7.21 s, sys: 155 ms, total: 7.36 s
Wall time: 7.37 s


In [None]:
from sklearn.metrics import classification_report
# Per-tag precision / recall / F1 at token level on the validation set.
print(classification_report(y_true_decode,y_pred_decode,digits=4))

              precision    recall  f1-score   support

       I-LOC     0.9121    0.9165    0.9143       419
      I-MISC     0.8430    0.7754    0.8078       187
       I-ORG     0.8192    0.7474    0.7817       285
       I-PER     0.9742    0.9497    0.9618       875
           O     0.9814    0.9915    0.9864      5790

    accuracy                         0.9680      7556
   macro avg     0.9060    0.8761    0.8904      7556
weighted avg     0.9672    0.9680    0.9674      7556



## Kaggle Predictions

In [None]:
def get_test_predictions(model, input_dict): 
    """
    Predict a tag string for every token of every sentence in `input_dict`.

    The model is switched to evaluation mode and run without gradient
    tracking. Returns one flat list of tag strings, sentences concatenated
    in order.
    """
    model.eval()
    predictions = []

    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        for i in range(0, len(input_dict['sentence'])): 
            features = unpack_features(i, input_dict)
            pred_idxs = model(features)[1] # second element is the tag-id sequence
            predictions.extend(decode_output(pred_idxs))
    
    return predictions

In [None]:
%%time 
# Flat list of predicted tag strings for all test tokens.
predictions = get_test_predictions(model, test_input_dict)

CPU times: user 42.9 s, sys: 856 ms, total: 43.8 s
Wall time: 43.8 s


In [None]:
# One row per test token: running token id and predicted tag, written without the frame index.
kaggle_submission = pd.DataFrame.from_dict({'ID': np.arange(0, len(predictions)), 'Predicted': predictions})
kaggle_submission.to_csv('submission.csv', index = False)

In [None]:
# Display the submission frame for a visual check.
kaggle_submission 

Unnamed: 0,ID,Predicted
0,0,O
1,1,O
2,2,O
3,3,I-LOC
4,4,O
...,...,...
46661,46661,O
46662,46662,O
46663,46663,O
46664,46664,I-PER
