
##### by Anastasiia Khaburska

## Homework 5

module : **Deep Learning for NLP**


The goal of the homework is to develop a tool for Named Entity Recognition. You need to implement model **”Glove word embeddings + BiLSTM + Softmax”** for sequence labeling. Please, use the standard PyTorch example for the sequence labeling task ”Sequence models and long-short term memory networks”(https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py) as a basic code to start. Glove word embeddings can be downloaded here (http://neuroner.com/data/word_vectors/glove.6B.100d.zip).

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import pandas as pd
import numpy as np
import gensim
import pickle
import os
import pandas as pd 


### **Task 1**: 
Implement functionality to read and process NER 2003 English Shared Task data in CoNNL file format, data will be provided (10% of score).

In [2]:
def process(filename):
    with open(f'data/{filename}.txt') as file:
        size = 0
        for line in file:
            if line != '\n':
                w, _, _, _ = line.split(' ')
                if (w!='-DOCSTART-'):
                    size+=1
    
    df = pd.DataFrame(
            columns=['sentence', 'word', 'POS_tag', 'SCHUNK_tag', 'NE_tag'],
            data=np.zeros((size, 5))
        )
    df[:] = ''
    with open(f'data/{filename}.txt') as file:
        index = 0
        word = 0
        for line in file:
            if line == '\n':
                index += 1
            else:
                w, p, s, n = line.split(' ')
                if (w!='-DOCSTART-'):
                    df.at[word, 'sentence'] = int(index)
                    df.at[word, 'word'] = w.strip()
                    df.at[word, 'POS_tag'] = p.strip()
                    df.at[word, 'SCHUNK_tag'] = s.strip()               
                    df.at[word, 'NE_tag'] = n.strip()
                    word += 1
        return df

In [3]:
train = process('train')
dev = process('dev')
test = process('test')

In [4]:
print(f"Length of train: {len(train)}")
print(f"Length of  dev:   {len(dev)}")
print(f"Length of  test:  {len(test)}")

Length of train: 203621
Length of  dev:   51362
Length of  test:  46435


In [5]:
print(f"Number of sentences in train: {len(train['sentence'].unique())}")
print(f"Number of sentences in dev:   {len(dev['sentence'].unique())}")
print(f"Number of sentences in test:  {len(test['sentence'].unique())}")

Number of sentences in train: 14041
Number of sentences in dev:   3250
Number of sentences in test:  3453


In [6]:
print("Named entity tags:",sorted(train['NE_tag'].unique()))

Named entity tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


**Prepare data:**

In [7]:
def prepare_sequence(seq, to_ix):
    
    idxs = [to_ix[w] for w in seq]
    
    return torch.tensor(idxs, dtype=torch.long)

In [8]:
tag_to_ix = {'B-LOC':0, 'B-MISC':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-MISC':5, 'I-ORG':6, 'I-PER':7, 'O':8}

In [9]:
word_to_ix = {}

for word in train['word']:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
for word in dev['word']:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)
for word in test['word']:
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

In [10]:
print(word_to_ix)



In [11]:
vocab_size = len(word_to_ix)

print(vocab_size)

30289


### **Task 2**: 

Implement 3 strategies for loading the embeddings:

In [12]:
glove_embeddings= {}

with open('data/glove.6B.100d.txt', 'r') as file:
    for line in file:
        elements = line.split(' ')
        word = elements[0]
        word_embedding = np.array([float(val) for val in elements[1:]])
        glove_embeddings[word] = word_embedding

In [13]:
embedding_length=len(glove_embeddings['the'])
print("Embedding vector length: ", embedding_length)

Embedding vector length:  100


**(a):** load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

In [14]:
embeddings_matrix_a = np.zeros((vocab_size, 100))
unknown_a=0
for word, ix in word_to_ix.items():
    try:
        embeddings_matrix_a[ix, :] = glove_embeddings[word]
    except KeyError as e:
        embeddings_matrix_a[ix, :] = glove_embeddings['unknown']
        unknown_a+=1

In [15]:
print("Number of unknown: ", unknown_a )

Number of unknown:  15671


**(b).** load the embeddings for lowercased capitalization of words. If embedding for this lowercased word doesn’t exists, associate it with UNKNOWN embedding (5% of score).

In [16]:
embeddings_matrix_b = np.zeros((vocab_size, 100))
unknown_b=0
for word, ix in word_to_ix.items():
    try:
        embeddings_matrix_b[ix, :] = glove_embeddings[word.lower()]
    except KeyError as e:
        embeddings_matrix_b[ix, :] = glove_embeddings['unknown']
        unknown_b+=1        

In [17]:
print("Number of unknown: ", unknown_b)

Number of unknown:  3949


**(c).** load the embeddings for original capitalization of words. If embedding for this word doesn’t exists, try to find the embedding for lowercased version and associate it to the word with original capitalization. Otherwise, associate it with UNKNOWN embedding(20% of score).

In [18]:
embeddings_matrix_c = np.zeros((vocab_size, 100))
unknown_c=0
for word, ix in word_to_ix.items():
    if word in glove_embeddings:
        embeddings_matrix_c[ix, :] = glove_embeddings[word]
    elif word.lower() in glove_embeddings:
        embeddings_matrix_c[ix, :] = glove_embeddings[word.lower()]
    else:
        embeddings_matrix_c[ix, :] = glove_embeddings['unknown']
        unknown_c+=1

In [19]:
print("Number of unknown: ", unknown_c)

Number of unknown:  3949


### **Task 2**: 

Implement training on batches (20% of score).

In [20]:
EMBEDDING_DIM = embedding_length
HIDDEN_DIM = 50
EMBEDDING_MATRIX=embeddings_matrix_c
VOCAB_SIZE=EMBEDDING_MATRIX.shape[0]
TARGET_SIZE=len(tag_to_ix)

In [40]:
training_data = []
dev_data = []
test_data = []
train_grouped = train.groupby(['sentence']).agg(lambda x: list(x)).reset_index(drop=True)
for i in range(len(train_grouped)):
    training_data.append((train_grouped.loc[i, 'word'], train_grouped.loc[i, 'NE_tag']))
dev_grouped = dev.groupby(['sentence']).agg(lambda x: list(x)).reset_index(drop=True)
for i in range(len(dev_grouped)):
    dev_data.append((dev_grouped.loc[i, 'word'], dev_grouped.loc[i, 'NE_tag']))
test_grouped = test.groupby(['sentence']).agg(lambda x: list(x)).reset_index(drop=True)
for i in range(len(test_grouped)):
    test_data.append((test_grouped.loc[i, 'word'], test_grouped.loc[i, 'NE_tag']))

In [22]:
training_data [0]

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])

In [23]:
class LSTMTagger(nn.Module):

    def __init__(self,embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim).from_pretrained(
        torch.tensor(EMBEDDING_MATRIX, dtype=torch.float))
        self.word_embeddings.weight.requires_grad = False
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [26]:
model = LSTMTagger(EMBEDDING_DIM , HIDDEN_DIM, VOCAB_SIZE, TARGET_SIZE)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses=[]
epochs=[]

print("Start training..............:")
for epoch in range(30): 
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance        
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    losses.append(loss.item())
    epochs.append(epoch)
    print(f'Epoch {epoch} : Loss {loss.item()}')

Start training..............:
Epoch 0 : Loss 0.15103602409362793
Epoch 1 : Loss 0.14554905891418457
Epoch 2 : Loss 0.10618782043457031
Epoch 3 : Loss 0.09880781173706055
Epoch 4 : Loss 0.07561063766479492
Epoch 5 : Loss 0.06079506874084473
Epoch 6 : Loss 0.052024126052856445
Epoch 7 : Loss 0.048589468002319336
Epoch 8 : Loss 0.04235219955444336
Epoch 9 : Loss 0.03465390205383301
Epoch 10 : Loss 0.030825138092041016
Epoch 11 : Loss 0.027204275131225586
Epoch 12 : Loss 0.0226137638092041
Epoch 13 : Loss 0.021651268005371094
Epoch 14 : Loss 0.01965045928955078
Epoch 15 : Loss 0.01956796646118164
Epoch 16 : Loss 0.01625514030456543
Epoch 17 : Loss 0.015863418579101562
Epoch 18 : Loss 0.013532161712646484
Epoch 19 : Loss 0.012593746185302734
Epoch 20 : Loss 0.01140904426574707
Epoch 21 : Loss 0.009791851043701172
Epoch 22 : Loss 0.008656024932861328
Epoch 23 : Loss 0.010126829147338867
Epoch 24 : Loss 0.010091304779052734
Epoch 25 : Loss 0.010461091995239258
Epoch 26 : Loss 0.00900864601135

In [28]:
filename = 'data/model30.pkl'
output = open(filename, 'wb')
#pickle.dump(model,output)
with open(filename, 'rb') as pickle_file:
    model_loaded = pickle.load(pickle_file)

In [29]:
model

LSTMTagger(
  (word_embeddings): Embedding(30289, 100)
  (lstm): LSTM(100, 50)
  (hidden2tag): Linear(in_features=50, out_features=9, bias=True)
)

In [31]:
model_loaded

LSTMTagger(
  (word_embeddings): Embedding(30289, 100)
  (lstm): LSTM(100, 50)
  (hidden2tag): Linear(in_features=50, out_features=9, bias=True)
)

In [51]:
with torch.no_grad():
    inputs = prepare_sequence(dev_data[120][0], word_to_ix)
    tag_scores = model(inputs)
tag_scores

tensor([[-1.0831e+01, -1.2034e+01, -6.8271e+00, -1.1282e-03, -2.0422e+01,
         -1.9767e+01, -1.7008e+01, -1.0912e+01, -1.7414e+01],
        [-2.1461e+01, -1.9454e+01, -1.5933e+01, -1.5259e+01, -9.6717e+00,
         -1.2609e+01, -8.8447e+00, -1.6713e+00, -2.0852e-01],
        [-1.6192e+01, -1.4071e+01, -1.6699e+01, -1.2927e+01, -1.5037e+01,
         -1.9383e+01, -1.2311e+01, -1.1517e+01, -1.8120e-05],
        [-1.0159e+01, -6.4259e+00, -7.6228e+00, -5.6080e+00, -1.7043e+01,
         -1.8085e+01, -1.6861e+01, -1.2285e+01, -5.8374e-03],
        [-1.8133e+01, -1.8992e+01, -1.9773e+01, -2.1643e+01, -1.9085e+01,
         -1.5303e+01, -1.8362e+01, -1.7667e+01,  0.0000e+00],
        [-1.5455e+01, -1.8560e+01, -2.1529e+01, -2.2534e+01, -2.4147e+01,
         -2.6264e+01, -2.8178e+01, -3.1964e+01,  0.0000e+00],
        [-2.0627e+01, -1.4935e+01, -2.1152e+01, -2.5505e+01, -2.9034e+01,
         -2.6578e+01, -3.2355e+01, -3.3287e+01,  0.0000e+00],
        [-1.9955e+01, -1.7779e+01, -2.4771e+01, 

In [56]:
prepare_sequence(dev_data[120][1], tag_to_ix)

tensor([3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 4,
        8, 3, 7, 7, 7, 7, 8])

In [57]:
tag_scores.numpy().argmax(axis=1)



array([3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       1, 2, 8, 3, 7, 7, 8, 3, 8])