In [24]:
import codecs
import time
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import pickle

import sys
sys.path.insert(1, '/work/nlp-project/scripts')
from read_write_data import read_processed_data

import gensim.models
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                '/work/nlp-project/models/GoogleNews-50k.bin', binary=True)

from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import f1_score, accuracy_score


In [2]:
TRAIN_PATH = "nlp-project/data/processed/train.conll"
DEV_PATH = "nlp-project/data/processed/dev.conll"
TEST_PATH = "nlp-project/data/processed/test.conll"

PAD = "<PAD>"

# Data processing

## Loading the data

In [3]:
documents = []
doc_labels = []
for words, labels, _, _ in read_processed_data(TRAIN_PATH):
    documents.append(words)
    doc_labels.append(labels)

In [4]:
for doc, labels in zip(documents[:10], doc_labels[:10]):
    print("Document:")
    print(doc)
    print("\nMatching labels:")
    print(labels, '\n\n') 

Document:
['My', 'dad', 'just', 'does', "n't", 'understand', '?']

Matching labels:
['0', '0', '0', '0', '0', '0', '0'] 


Document:
['Ugh', 'my', 'dad', 'is', 'so', 'stupid', '...', 'he', 'just', 'does', "n't", 'understand', 'anything', '!']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['I', 'have', '5', 'sisters', 'and', 'so', 'including', 'my', 'mom', '...', 'he', 'is', 'the', 'only', 'guy', 'in', 'a', 'house', 'of', 'six', 'females', '.']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['Now', 'I', "'m", 'the', 'youngest', 'and', 'I', 'just', 'got', 'my', 'period', 'so', 'now', 'we', 'all', 'have', 'ours', 'and', 'he', 'thinks', 'it', "'s", 'a', 'good', 'thing', '?']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['He', 

## Creating vocabularies

In [5]:
def create_vocabulary(documents: List[List[str]], pad_token: str = None) -> Dict[str, int]:
    vocab = {pad_token: 0} if pad_token else {}
    for doc in documents:
        for token in doc:
            vocab[token] = vocab.get(token, len(vocab))

    return vocab

def reverse_dict(collection: Dict[Any, Any]) -> Dict[Any, Any]:
    reverse = {}
    for k, v in collection.items():
        reverse[v] = k
        
    return reverse

In [6]:
word2idx = create_vocabulary(documents=documents, pad_token=PAD)
idx2word = reverse_dict(collection=word2idx)
label2idx = create_vocabulary(documents=doc_labels)
idx2label = reverse_dict(collection=label2idx)

print("word2idx len:", len(word2idx))
print("idx2word len:", len(idx2word))
print("label2idx len:", len(label2idx))
print("idx2label len:", len(idx2label))

word2idx len: 19641
idx2word len: 19641
label2idx len: 2
idx2label len: 2


In [7]:
word2idx

{'<PAD>': 0,
 'My': 1,
 'dad': 2,
 'just': 3,
 'does': 4,
 "n't": 5,
 'understand': 6,
 '?': 7,
 'Ugh': 8,
 'my': 9,
 'is': 10,
 'so': 11,
 'stupid': 12,
 '...': 13,
 'he': 14,
 'anything': 15,
 '!': 16,
 'I': 17,
 'have': 18,
 '5': 19,
 'sisters': 20,
 'and': 21,
 'including': 22,
 'mom': 23,
 'the': 24,
 'only': 25,
 'guy': 26,
 'in': 27,
 'a': 28,
 'house': 29,
 'of': 30,
 'six': 31,
 'females': 32,
 '.': 33,
 'Now': 34,
 "'m": 35,
 'youngest': 36,
 'got': 37,
 'period': 38,
 'now': 39,
 'we': 40,
 'all': 41,
 'ours': 42,
 'thinks': 43,
 'it': 44,
 "'s": 45,
 'good': 46,
 'thing': 47,
 'He': 48,
 'always': 49,
 'like': 50,
 '"': 51,
 'ohh': 52,
 'you': 53,
 'must': 54,
 'be': 55,
 'happy': 56,
 'to': 57,
 'finally': 58,
 'yours': 59,
 ',': 60,
 'wish': 61,
 'had': 62,
 'mine': 63,
 'even': 64,
 'joking': 65,
 'think': 66,
 'living': 67,
 'with': 68,
 'many': 69,
 'girls': 70,
 'making': 71,
 'him': 72,
 'go': 73,
 'crazy': 74,
 'Yep': 75,
 'are': 76,
 'getting': 77,
 'dads': 78,
 '.

In [8]:
label2idx

{'0': 0, '1': 1}

In [9]:
enc_documents = [[word2idx[token] for token in doc] for doc in documents]
enc_doc_labels = [[label2idx[label] for label in labels] for labels in doc_labels]

for doc, labels in zip(enc_documents[:10], enc_doc_labels[:10]):
    print("Document:")
    print(doc)
    print("\nMatching labels:")
    print(labels, '\n\n') 

Document:
[1, 2, 3, 4, 5, 6, 7]

Matching labels:
[0, 0, 0, 0, 0, 0, 0] 


Document:
[8, 9, 2, 10, 11, 12, 13, 14, 3, 4, 5, 6, 15, 16]

Matching labels:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


Document:
[17, 18, 19, 20, 21, 11, 22, 9, 23, 13, 14, 10, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]

Matching labels:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


Document:
[34, 17, 35, 24, 36, 21, 17, 3, 37, 9, 38, 11, 39, 40, 41, 18, 42, 21, 14, 43, 44, 45, 28, 46, 47, 7]

Matching labels:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


Document:
[48, 45, 49, 50, 51, 52, 53, 54, 55, 11, 56, 57, 58, 18, 59, 60, 17, 61, 17, 62, 63, 16, 51, 21, 14, 10, 5, 64, 65, 33]

Matching labels:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


Document:
[17, 66, 3, 67, 27, 28, 29, 68, 11, 69, 70, 10, 71, 72, 73, 74, 7]

Matching labels:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


Document:
[

## Preparing input data - padding input data and using Tensors

In [10]:
def create_encoding_matrix(collection: List[List[int]], pad_token_idx: str, max_len: int = None):
    if not max_len:
        max_len = max([len(x) for x in collection])

    to_series = [pd.Series(el) for el in collection]
    enc_matrix = (pd.concat(to_series, axis=1)
                    .reindex(range(max_len))
                    .fillna(pad_token_idx)
                    .astype('int16')
                    .T)

    return enc_matrix

In [11]:
max_len = max([len(x) for x in enc_documents])

enc_document_matrix = create_encoding_matrix(enc_documents, word2idx[PAD], max_len=max_len)
enc_label_matrix = create_encoding_matrix(enc_doc_labels, label2idx['0'], max_len=max_len)

In [23]:
enc_document_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
0,1,2,3,4,5,6,7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,9,2,10,11,12,13,14,3,4,...,0,0,0,0,0,0,0,0,0,0
2,17,18,19,20,21,11,22,9,23,13,...,0,0,0,0,0,0,0,0,0,0
3,34,17,35,24,36,21,17,3,37,9,...,0,0,0,0,0,0,0,0,0,0
4,48,45,49,50,51,52,53,54,55,11,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12532,187,6680,60,8640,343,24,8134,3219,30,5286,...,0,0,0,0,0,0,0,0,0,0
12533,187,19441,15047,19631,1264,149,55,12126,236,166,...,0,0,0,0,0,0,0,0,0,0
12534,3378,24,9377,7652,111,14,422,2336,16183,60,...,0,0,0,0,0,0,0,0,0,0
12535,602,152,30,6227,4873,19635,45,19636,3650,202,...,0,0,0,0,0,0,0,0,0,0


In [13]:
enc_label_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12532,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12533,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12534,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12535,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
input_documents = torch.tensor(enc_document_matrix.to_numpy(), dtype=torch.long)
input_labels = torch.tensor(enc_label_matrix.to_numpy(), dtype=torch.long)

print(input_documents)
print(input_labels)

tensor([[    1,     2,     3,  ...,     0,     0,     0],
        [    8,     9,     2,  ...,     0,     0,     0],
        [   17,    18,    19,  ...,     0,     0,     0],
        ...,
        [ 3378,    24,  9377,  ...,     0,     0,     0],
        [  602,   152,    30,  ...,     0,     0,     0],
        [10193, 11098,    18,  ...,     0,     0,     0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


# Batch Iterator

In [15]:
@dataclass
class Batch:
    inputs: Tensor
    targets: Tensor

class DataIterator:
    
    def __init__(self, batch_size=32):
        self.batch_size = batch_size
        
    def __call__(self, inputs: Tensor, targets: Tensor) -> Batch:
        intervals = np.arange(0, len(inputs), self.batch_size)
        for start in intervals:
            end = start + self.batch_size
            batch_inputs = inputs[start: end]
            batch_targets = targets[start: end]
            
            yield Batch(batch_inputs, batch_targets)

data_iterator = DataIterator()
for batch in data_iterator(input_documents, input_labels):
    print(batch.inputs.shape)

torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([32, 99])
torch.Size([3

# Training a model

In [16]:
torch.manual_seed(1)

EMBEDDING_DIM = 300
LSTM_HIDDEN = 50
LEARNING_RATE = 0.01
EPOCHS = 20

class BaselineLSTM(torch.nn.Module):
    def __init__(self, n_words, n_labels):
        super().__init__()
        self.n_words = n_words
        self.n_labels = n_labels
        # self.embeds = nn.Embedding(n_words, EMBEDDING_DIM)
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LSTM_HIDDEN, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(in_features=2 * LSTM_HIDDEN, out_features=n_labels)
        
    def forward(self, inputs):
        word_embeds = self._get_embeds(inputs)
        word_embeds = nn.Dropout(p=0.2)(word_embeds)
        lstm_result, _ = self.lstm(word_embeds)
        lstm_result = nn.Dropout(p=0.3)(lstm_result)
        tags = self.linear(lstm_result)
        log_probs = F.softmax(tags)
        return log_probs
    
    def _get_embeds(self, inputs):  # inputs of shape (32 sentences,150 tokens)
        embeddings = torch.Tensor()
        for sentence in inputs:
            sentence_embeds = torch.Tensor()
        
            for word in sentence:
                try:
                    embed = torch.from_numpy(GoogleEmbs.get_vector(word))
                except KeyError:
                    embed = torch.zeros(300)
                sentence_embeds = torch.cat((sentence_embeds, embed), dim=0)

            embeddings = torch.cat((embeddings, sentence_embeds), dim=0)
        return embeddings.view(len(inputs), -1, EMBEDDING_DIM)


In [17]:
TRAIN_AND_SAVE = False  # 'False' stops this block from running when running the notebook

if TRAIN_AND_SAVE:

    model = BaselineLSTM(len(idx2word), len(idx2label))
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_func = torch.nn.CrossEntropyLoss(reduction='sum')

    for epoch in range(EPOCHS):
        model.train()
        
        total_tags = 0
        matched_tags = 0
        epoch_loss = 0
        for i, batch in enumerate(data_iterator(input_documents, input_labels)):
            pred_tags = model.forward(inputs=batch.inputs)
            
            # probability distribution for each tag across all words
            pred_tags = pred_tags.view(-1, model.n_labels)
            
            # true label for each word
            targets = batch.targets.flatten() 
            
            batch_loss = loss_func(pred_tags, targets)
            epoch_loss += batch_loss.item()
            
            # optimization
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            # proportion of matched tags
            for pred_tag, true_tag in zip(pred_tags, targets):
                pred_tag_idx = torch.argmax(pred_tag)
                if pred_tag_idx == true_tag:
                    matched_tags +=1
                total_tags += 1
            
            # validation loss
            # model.eval()
            # pred_dev_tags = model(dev_input)
            
            # pred_dev_tags = pred_dev_tags.view(-1, model.n_tags)
            # true_dev_tags = dev_labels.flatten()
            
            # val_loss = loss_func(pred_dev_tags, true_dev_tags).item()
                            
        print(f"Epoch {epoch} loss: {epoch_loss:.2f},  total tags matched: {matched_tags / total_tags * 100:.2f}%")

    # Save the model
    mili_time = round(time.time() * 1000)
    file_name = f"/work/nlp-project/models/model_{mili_time}.pkl"
    pickle.dump(model, open(file_name, "wb"))

# 

In [18]:
# loading model
with open('/work/nlp-project/models/model_1679946334736.pkl', 'rb') as f:
    model = pickle.load(f)

In [19]:
dev_documents = []
dev_labels = []
for words, labels, _ ,_ in read_processed_data(DEV_PATH):
    dev_documents.append(words)
    dev_labels.append(labels)

In [21]:
# enc_dev_documents = [[word2idx.get(token, '<PAD>') for token in doc] for doc in dev_documents]
# enc_dev_labels = [[label2idx.get(label, '') for label in labels] for labels in dev_labels]


# enc_dev_document_matrix = create_encoding_matrix(enc_dev_documents, word2idx[PAD], max_len=max_len)
# enc_dev_label_matrix = create_encoding_matrix(enc_dev_labels, label2idx['0'], max_len=max_len)

# dev_inputs = torch.Tensor(enc_dev_document_matrix.to_numpy(), dtype=torch.long)
# dev_labels = torch.Tensor(enc_dev_label_matrix.to_numpy(), dtype=torch.long)

In [0]:
# model.eval()
# pred_dev_tags = model(dev_inputs).view(-1, model.n_tags)
# true_dev_tags = dev_labels.flatten()
# val_loss = loss_func(pred_dev_tags, true_dev_tags).item()
# print(val_loss)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>