In [1]:
import codecs
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import time
import pickle
import datetime

import sys
sys.path.append('nlp_project')
from nlp_project.scripts.read_write_data import read_processed_data

import gensim.models
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                'nlp_project/models/GoogleNews-50k.bin', binary=True)

# from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import f1_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAIN_PATH = "nlp_project/data/processed/train.conll"
DEV_PATH = "nlp_project/data/processed/dev.conll"
TEST_PATH = "nlp_project/data/processed/test.conll"

PAD = "<PAD>"

In [3]:
def create_vocabulary(documents: List[List[str]], pad_token: str = None) -> Dict[str, int]:
    vocab = {pad_token: 0} if pad_token else {}
    for doc in documents:
        for token in doc:
            vocab[token] = vocab.get(token, len(vocab))

    return vocab

def reverse_dict(collection: Dict[Any, Any]) -> Dict[Any, Any]:
    reverse = {}
    for k, v in collection.items():
        reverse[v] = k
        
    return reverse

In [4]:
documents = []
doc_labels = []
for words, labels, _, _ in read_processed_data(TRAIN_PATH):
    documents.append(words)
    doc_labels.append(labels)

In [5]:
def pad_inputs(collection: List[List[int]], pad_token, max_len: int = None):
    if not max_len:
        max_len = max([len(x) for x in collection])

    to_series = [pd.Series(el) for el in collection]
    enc_matrix = (pd.concat(to_series, axis=1)
                    .reindex(range(max_len))
                    .fillna(pad_token)
                    .T)

    return enc_matrix.values.tolist()

In [6]:
max_len = max([len(x) for x in documents])

padded_documents = pad_inputs(documents, PAD, max_len=max_len)
padded_labels = pad_inputs(doc_labels, '0', max_len=max_len)

In [7]:
# first sentence padded
print(padded_documents[0])
print(len(padded_documents[0]))

# first target labels padded
print(padded_labels[0])
print(len(padded_labels[0]))

['My', 'dad', 'just', 'does', "n't", 'understand', '?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
99
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0

In [8]:
print(np.array(padded_documents).shape)
print(np.array(padded_labels).shape)

(12537, 99)
(12537, 99)


In [9]:
padded_labels = [list(map(int, sentence)) for sentence in padded_labels]
padded_labels

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0

In [10]:
@dataclass
class Batch:
    inputs: Tensor
    targets: Tensor

class DataIterator:
    
    def __init__(self, batch_size=32):
        self.batch_size = batch_size
        
    def __call__(self, inputs: Tensor, targets: Tensor) -> Batch:
        intervals = np.arange(0, len(inputs), self.batch_size)
        for start in intervals:
            end = start + self.batch_size
            batch_inputs = inputs[start: end]
            batch_targets = targets[start: end]
            
            yield Batch(batch_inputs, batch_targets)

In [11]:
torch.manual_seed(1)

EMBEDDING_DIM = 300
LSTM_HIDDEN = 50
LEARNING_RATE = 0.01
EPOCHS = 3

class BaselineLSTM(torch.nn.Module):
    def __init__(self, n_labels):
        super().__init__()
        self.n_labels = n_labels
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LSTM_HIDDEN, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(in_features=2 * LSTM_HIDDEN, out_features=n_labels)
        
    def forward(self, inputs):
        word_embeds = self._get_embeds(inputs)
        word_embeds = nn.Dropout(p=0.2)(word_embeds)
        lstm_result, _ = self.lstm(word_embeds)
        lstm_result = nn.Dropout(p=0.3)(lstm_result)
        tags = self.linear(lstm_result)
        log_probs = F.softmax(tags, dim=2)
        return log_probs
    
    def _get_embeds(self, inputs):
        embeddings = torch.Tensor()
        for sentence in inputs:
            sentence_embeds = torch.Tensor()
        
            for word in sentence:
                try:
                    embed = torch.from_numpy(GoogleEmbs.get_vector(word))
                except KeyError:
                    embed = torch.zeros(300)
                sentence_embeds = torch.cat((sentence_embeds, embed), dim=0)

            embeddings = torch.cat((embeddings, sentence_embeds), dim=0)
        return embeddings.view(len(inputs), -1, EMBEDDING_DIM)


In [12]:
TRAIN_AND_SAVE = False  # 'False' stops this block from running when running the notebook

if TRAIN_AND_SAVE:

    model = BaselineLSTM(n_labels=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_func = torch.nn.CrossEntropyLoss(reduction='sum')
    data_iterator = DataIterator()

    for epoch in range(EPOCHS):
        model.train()
        
        total_tags = 0
        matched_tags = 0
        epoch_loss = 0
        for i, batch in enumerate(data_iterator(padded_documents, padded_labels)):
            pred_tags = model.forward(inputs=batch.inputs)
            
            # probability distribution for each tag across all words
            pred_tags = pred_tags.view(-1, model.n_labels)
            
            # true label for each word
            targets = torch.tensor(batch.targets).flatten()
            
            batch_loss = loss_func(pred_tags, targets)
            epoch_loss += batch_loss.item()
            
            # optimization
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            # proportion of matched tags
            for pred_tag, true_tag in zip(pred_tags, targets):
                pred_tag_idx = torch.argmax(pred_tag)
                if pred_tag_idx == true_tag:
                    matched_tags +=1
                total_tags += 1
            
            # validation loss
            # model.eval()
            # pred_dev_tags = model(dev_input)
            
            # pred_dev_tags = pred_dev_tags.view(-1, model.n_tags)
            # true_dev_tags = dev_labels.flatten()
            
            # val_loss = loss_func(pred_dev_tags, true_dev_tags).item()
                            
        print(f"Epoch {epoch} loss: {epoch_loss:.2f},  total tags matched: {matched_tags / total_tags * 100:.2f}%")

    # Save the model
    import pickle
    mili_time = round(time.time() * 1000)
    file_name = f"nlp_project/models/model_{mili_time}.pkl"
    pickle.dump(model, open(file_name, "wb"))

    data_iterator = DataIterator()

    for i, batch in enumerate(data_iterator(padded_documents, padded_labels)):
        if i == 2: print(batch.inputs)

# All-in-one class

In [53]:
class SecondLSTM(torch.nn.Module):
    def __init__(self, EMBEDDING_DIM=300,LSTM_HIDDEN=20,max_len=100,n_labels=3,batch_size=32):
        super().__init__()
        
        self.EMBEDDING_DIM = EMBEDDING_DIM  # length of embedding vectors
        self.LSTM_HIDDEN = LSTM_HIDDEN  # number of LSTM cells
        self.max_len=max_len  # maximum input sentence length, will be padded to this size
        self.n_labels = n_labels
        self.lstm = nn.LSTM(input_size=self.EMBEDDING_DIM, hidden_size=self.LSTM_HIDDEN, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(in_features=2 * self.LSTM_HIDDEN, out_features=n_labels)
        self.batch_size = batch_size

    def forward(self, inputs):
        '''
        Implements a forward pass through the Bi-LSTM.
        inputs are a batch (list) of sentences.
        '''
        word_embeds = self._get_embeds(inputs)
        word_embeds = nn.Dropout(p=0.2)(word_embeds)
        lstm_result, _ = self.lstm(word_embeds)
        lstm_result = nn.Dropout(p=0.3)(lstm_result)
        tags = self.linear(lstm_result)
        log_probs = F.softmax(tags, dim=2)
        return log_probs
    
    def _get_embeds(self, inputs):
        embeddings = torch.Tensor()
        for sentence in inputs:
            sentence_embeds = torch.Tensor()
        
            for word in sentence:
                if GoogleEmbs.__contains__(word):
                    embed = GoogleEmbs.get_vector(word)
                    embed.setflags(write = True)
                    embed = torch.from_numpy(embed)
                else:
                    embed = torch.zeros(300)
                sentence_embeds = torch.cat((sentence_embeds, embed), dim=0)

            embeddings = torch.cat((embeddings, sentence_embeds), dim=0)
        return embeddings.view(len(inputs), -1, self.EMBEDDING_DIM)
    
    def fit(self, documents, labels, LEARNING_RATE=0.01, EPOCHS=3):

        padded_documents = pad_inputs(documents, "<PAD>")
        padded_labels = pad_inputs(labels, -100)  # padding label is -100
        padded_labels = [list(map(int, sentence)) for sentence in padded_labels]

        self.train()
        torch.manual_seed(1)
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        loss_func = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=-100)  # ignores loss for padding tokens
        data_iterator = DataIterator(batch_size=self.batch_size)

        for epoch in range(EPOCHS):
            
            total_tags = 0
            matched_tags = 0
            epoch_loss = 0

            for i, batch in enumerate(data_iterator(padded_documents, padded_labels)):
                pred_tags = self.forward(inputs=batch.inputs)
                
                # probability distribution for each tag across all words
                pred_tags = pred_tags.view(-1, self.n_labels)
                
                # true label for each word
                targets = torch.tensor(batch.targets).flatten()
                batch_loss = loss_func(pred_tags, targets)
                epoch_loss += batch_loss.item()
                
                # optimization
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                # proportion of matched tags
                for pred_tag, true_tag in zip(pred_tags, targets):
                    true_tag = true_tag.item()
                    
                    if true_tag == -100: continue  # ignore tag of padding tokens
    
                    pred_tag_idx = torch.argmax(pred_tag).item()
                    if pred_tag_idx == true_tag:
                        matched_tags +=1
                    total_tags += 1
                                
            print(f"Epoch {epoch} loss: {epoch_loss:.2f},  total tags matched: {matched_tags / total_tags * 100:.2f}%")
    
    def pad_inputs(collection: List[List[int]], pad_token):
        to_series = [pd.Series(el) for el in collection]
        enc_matrix = (pd.concat(to_series, axis=1)
                        .reindex(range(self.max_len))
                        .fillna(pad_token)
                        .T)
        return enc_matrix.values.tolist()


In [56]:
model2 = SecondLSTM(LSTM_HIDDEN=1)
model2.train()
model2.fit(documents, doc_labels, LEARNING_RATE=0.05, EPOCHS=10)

Epoch 0 loss: 131406.33,  total tags matched: 91.35%
Epoch 1 loss: 121974.84,  total tags matched: 95.46%
Epoch 2 loss: 120986.90,  total tags matched: 95.87%
Epoch 3 loss: 120685.25,  total tags matched: 96.00%
Epoch 4 loss: 120605.28,  total tags matched: 96.03%
Epoch 5 loss: 120616.40,  total tags matched: 96.02%
Epoch 6 loss: 120597.35,  total tags matched: 96.02%
Epoch 7 loss: 120528.55,  total tags matched: 96.06%
Epoch 8 loss: 120447.85,  total tags matched: 96.09%
Epoch 9 loss: 120555.53,  total tags matched: 96.03%


In [28]:
# Save the model
date = datetime.date.today().isoformat()[5:]  # date in mm_dd format
file_name = f"nlp_project/models/model_{date}.pkl"
pickle.dump(model2, open(file_name, "wb"))

In [41]:
def pad_inputs(collection: List[List[int]], pad_token):
    to_series = [pd.Series(el) for el in collection]
    enc_matrix = (pd.concat(to_series, axis=1)
                    .reindex(range(99))
                    .fillna(pad_token)
                    .T)
    return enc_matrix.values.tolist()

In [48]:
model2.eval()
test_index = 21
print(documents[test_index])
print(doc_labels[test_index])
model2.forward([documents[test_index]])

['Besides', 'from', 'Pacquiao', ',', 'what', 'else', 'is', 'The', 'Philipines', 'famous', 'for', '?']
['0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0']


tensor([[[1.0000e+00, 8.2845e-07, 9.3068e-07],
         [1.0000e+00, 1.5175e-07, 1.8562e-07],
         [1.0000e+00, 1.2283e-07, 1.5355e-07],
         [1.0000e+00, 1.2850e-07, 1.6123e-07],
         [1.0000e+00, 1.5196e-07, 1.9825e-07],
         [1.0000e+00, 1.4795e-07, 1.9324e-07],
         [1.0000e+00, 1.1860e-07, 1.4874e-07],
         [1.0000e+00, 1.1022e-07, 1.3574e-07],
         [1.0000e+00, 1.2100e-07, 1.5253e-07],
         [1.0000e+00, 1.4495e-07, 1.8816e-07],
         [1.0000e+00, 3.2540e-07, 4.5434e-07],
         [9.9997e-01, 1.2498e-05, 2.1642e-05]]], grad_fn=<SoftmaxBackward0>)

In [None]:
pad_inputs()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>