In [2]:
import codecs
import time
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import pickle

import sys
sys.path.append("nlp_project")
from nlp_project.scripts.read_write_data import read_processed_data

import gensim.models
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                'nlp_project/models/GoogleNews-50k.bin', binary=True)

from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import f1_score, accuracy_score


In [3]:
# Locations of the processed CoNLL files for the train / dev / test splits,
# given relative to the directory the notebook is run from.
TRAIN_PATH = "nlp_project/data/processed/train.conll"
DEV_PATH = "nlp_project/data/processed/dev.conll"
TEST_PATH = "nlp_project/data/processed/test.conll"

# Padding token; it is handed to create_vocabulary() below, which maps it to index 0.
PAD = "<PAD>"

# Data processing

## Loading the data

In [4]:
# Read the training split once, keeping only the first two fields of every
# record (the tokens and their labels); the remaining two fields are ignored.
train_pairs = [
    (words, labels)
    for words, labels, _, _ in read_processed_data(TRAIN_PATH)
]
documents = [tokens for tokens, _ in train_pairs]
doc_labels = [tags for _, tags in train_pairs]

In [5]:
# Show the first (at most) ten documents next to their label sequences.
preview_size = min(10, len(documents), len(doc_labels))
for position in range(preview_size):
    print("Document:")
    print(documents[position])
    print("\nMatching labels:")
    print(doc_labels[position], '\n\n')

Document:
['My', 'dad', 'just', 'does', "n't", 'understand', '?']

Matching labels:
['0', '0', '0', '0', '0', '0', '0'] 


Document:
['Ugh', 'my', 'dad', 'is', 'so', 'stupid', '...', 'he', 'just', 'does', "n't", 'understand', 'anything', '!']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['I', 'have', '5', 'sisters', 'and', 'so', 'including', 'my', 'mom', '...', 'he', 'is', 'the', 'only', 'guy', 'in', 'a', 'house', 'of', 'six', 'females', '.']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['Now', 'I', "'m", 'the', 'youngest', 'and', 'I', 'just', 'got', 'my', 'period', 'so', 'now', 'we', 'all', 'have', 'ours', 'and', 'he', 'thinks', 'it', "'s", 'a', 'good', 'thing', '?']

Matching labels:
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'] 


Document:
['He', 

## Creating vocabularies

In [None]:
def create_vocabulary(documents: List[List[str]], pad_token: str = None) -> Dict[str, int]:
    """Assign a consecutive integer id to every distinct token.

    Ids are handed out in order of first appearance. When ``pad_token`` is
    truthy it is registered first and therefore always receives id 0;
    a falsy ``pad_token`` (``None`` or ``""``) adds no padding entry.

    Args:
        documents: Tokenised documents (a list of token lists).
        pad_token: Optional padding token to reserve id 0 for.

    Returns:
        Mapping from token to integer id.
    """
    token_ids: Dict[str, int] = {}
    if pad_token:
        token_ids[pad_token] = 0

    for sequence in documents:
        for item in sequence:
            # setdefault keeps the id of a token that was already seen
            token_ids.setdefault(item, len(token_ids))

    return token_ids

def reverse_dict(collection: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return a new dict with the keys and values of ``collection`` swapped.

    If several keys share the same value, the key that comes last in
    iteration order is the one kept in the result.
    """
    return {value: key for key, value in collection.items()}

In [None]:
# Token vocabulary (with the padding token registered first) and its inverse.
word2idx = create_vocabulary(documents, pad_token=PAD)
idx2word = reverse_dict(word2idx)

# Label vocabulary (no padding entry) and its inverse.
label2idx = create_vocabulary(doc_labels)
idx2label = reverse_dict(label2idx)

# Report the size of each mapping.
vocab_report = (
    ("word2idx len:", word2idx),
    ("idx2word len:", idx2word),
    ("label2idx len:", label2idx),
    ("idx2label len:", idx2label),
)
for caption, mapping in vocab_report:
    print(caption, len(mapping))

In [None]:
# Echo the token -> index vocabulary; the whole dict becomes the cell output.
word2idx

In [None]:
# Echo the label -> index vocabulary as the cell output.
label2idx

In [None]:
# Replace every token / label by its vocabulary index (KeyError on anything unseen).
enc_documents = [list(map(word2idx.__getitem__, sentence)) for sentence in documents]
enc_doc_labels = [list(map(label2idx.__getitem__, tags)) for tags in doc_labels]

# Show the first (at most) ten encoded documents next to their encoded labels.
encoded_preview_size = min(10, len(enc_documents), len(enc_doc_labels))
for position in range(encoded_preview_size):
    print("Document:")
    print(enc_documents[position])
    print("\nMatching labels:")
    print(enc_doc_labels[position], '\n\n')

## Preparing input data - padding input data and using Tensors

In [None]:
def create_encoding_matrix(collection: List[List[int]], pad_token_idx: int, max_len: int = None):
    """Pad (or truncate) integer sequences into one rectangular matrix.

    Each sequence becomes one row. Rows shorter than ``max_len`` are filled on
    the right with ``pad_token_idx``; rows longer than ``max_len`` are cut off.

    The matrix is stored as int64. The previous int16 storage silently wrapped
    any index above 32767, which corrupts word ids once the vocabulary grows
    past that size. int64 also matches the ``torch.long`` tensors built from
    the result.

    Args:
        collection: Encoded sequences (lists of integer ids).
        pad_token_idx: Integer id used to fill the unused positions.
        max_len: Number of columns. When omitted (or 0) the length of the
            longest sequence in ``collection`` is used.

    Returns:
        ``pandas.DataFrame`` of shape ``(len(collection), max_len)`` and dtype
        int64, with default integer row and column labels.

    Raises:
        ValueError: If ``collection`` is empty and ``max_len`` is not given.
    """
    if not max_len:
        max_len = max(len(seq) for seq in collection)

    # Start from a matrix that is entirely padding, then overwrite the prefix
    # of every row; this avoids building and concatenating one Series per row.
    enc_matrix = np.full((len(collection), max_len), pad_token_idx, dtype=np.int64)
    for row, seq in enumerate(collection):
        kept = seq[:max_len]
        enc_matrix[row, :len(kept)] = kept

    return pd.DataFrame(enc_matrix)

In [None]:
# Every matrix is padded to the length of the longest training document.
max_len = max(map(len, enc_documents))

# Documents are padded with the PAD index, labels with the index of label '0'.
pad_word_idx = word2idx[PAD]
enc_document_matrix = create_encoding_matrix(enc_documents, pad_word_idx, max_len=max_len)
pad_label_idx = label2idx['0']
enc_label_matrix = create_encoding_matrix(enc_doc_labels, pad_label_idx, max_len=max_len)

In [None]:
# Display the padded matrix of encoded documents (one row per document).
enc_document_matrix

In [None]:
# Display the padded matrix of encoded labels (one row per document).
enc_label_matrix

In [None]:
# Convert both padded matrices to long tensors, the form the batch iterator
# and the model consume.
document_array = enc_document_matrix.to_numpy()
label_array = enc_label_matrix.to_numpy()

input_documents = torch.tensor(document_array, dtype=torch.long)
input_labels = torch.tensor(label_array, dtype=torch.long)

for prepared_tensor in (input_documents, input_labels):
    print(prepared_tensor)

# Batch Iterator

In [None]:
@dataclass
class Batch:
    """One mini-batch: a slice of the inputs and the matching slice of the targets."""
    inputs: Tensor
    targets: Tensor

class DataIterator:
    """Callable that walks two aligned sequences in consecutive fixed-size chunks."""

    def __init__(self, batch_size=32):
        # number of rows per batch; the last batch may be shorter
        self.batch_size = batch_size

    def __call__(self, inputs: Tensor, targets: Tensor) -> Batch:
        """Yield ``Batch`` objects covering ``inputs``/``targets`` front to back.

        This is a generator: it produces one ``Batch`` per chunk of
        ``batch_size`` rows, in order and without shuffling.
        """
        step = self.batch_size
        for offset in np.arange(0, len(inputs), step):
            window = slice(offset, offset + step)
            yield Batch(inputs[window], targets[window])

# Shared iterator (default batch size); quick sanity check of the batch shapes.
data_iterator = DataIterator()
batch_stream = data_iterator(input_documents, input_labels)
for batch in batch_stream:
    batch_shape = batch.inputs.shape
    print(batch_shape)

# Training a model

In [None]:
torch.manual_seed(1)  # fixed seed so that weight initialisation is repeatable

EMBEDDING_DIM = 300   # size of each word vector fed to the LSTM
LSTM_HIDDEN = 50      # hidden units per LSTM direction
LEARNING_RATE = 0.01
EPOCHS = 20

class BaselineLSTM(torch.nn.Module):
    """Bidirectional LSTM tagger on top of frozen pre-trained word vectors.

    The model receives batches of word *indices*, maps every index back to its
    token, looks the token up in the pre-trained vectors and feeds the result
    through dropout -> BiLSTM -> dropout -> linear -> log-softmax.

    Args:
        n_words: Vocabulary size (stored for reference only).
        n_labels: Number of output labels.
        idx2word: Optional ``{index: token}`` mapping used to decode the
            inputs. When omitted, the notebook-level ``idx2word`` is used.
        embeddings: Optional object exposing ``get_vector(token)`` that raises
            ``KeyError`` for unknown tokens. When omitted, the notebook-level
            ``GoogleEmbs`` is used. Note that an object passed here is stored
            on the model and is therefore included when the model is pickled.
    """

    EMBED_DROPOUT = 0.2  # dropout probability applied to the word vectors
    LSTM_DROPOUT = 0.3   # dropout probability applied to the LSTM outputs

    def __init__(self, n_words, n_labels, idx2word=None, embeddings=None):
        super().__init__()
        self.n_words = n_words
        self.n_labels = n_labels
        self._idx2word = idx2word
        self._embeddings = embeddings
        # self.embeds = nn.Embedding(n_words, EMBEDDING_DIM)
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LSTM_HIDDEN, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(in_features=2 * LSTM_HIDDEN, out_features=n_labels)

    def forward(self, inputs):
        """Return per-token log-probabilities of shape (batch, seq_len, n_labels).

        The output is a log-softmax over the *label* dimension. It can be fed
        to ``CrossEntropyLoss`` (log-softmax is idempotent) or ``NLLLoss``, and
        ``argmax`` over the last dimension gives the predicted label.
        """
        word_embeds = self._get_embeds(inputs)
        # Functional dropout honours self.training, so model.eval() really
        # switches it off (a Dropout module created here would always be active).
        word_embeds = F.dropout(word_embeds, p=self.EMBED_DROPOUT, training=self.training)
        lstm_result, _ = self.lstm(word_embeds)
        lstm_result = F.dropout(lstm_result, p=self.LSTM_DROPOUT, training=self.training)
        tags = self.linear(lstm_result)
        # dim=-1: normalise over the labels. Without an explicit dim a 3-D
        # input is normalised over dim 0, i.e. across the batch.
        log_probs = F.log_softmax(tags, dim=-1)
        return log_probs

    def _get_embeds(self, inputs):
        """Look up the pre-trained vector of every word index in ``inputs``.

        Args:
            inputs: Rectangular batch of word indices, shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch, seq_len, EMBEDDING_DIM). Positions
            whose index is unknown or whose token has no pre-trained vector
            (e.g. the padding token) are all zeros.
        """
        # getattr keeps models pickled before these attributes existed usable.
        index_to_word = getattr(self, "_idx2word", None)
        if index_to_word is None:
            index_to_word = idx2word
        vectors = getattr(self, "_embeddings", None)
        if vectors is None:
            vectors = GoogleEmbs

        # Plain Python ints are required: a 0-dim tensor is never found as a
        # dictionary key, which would turn every lookup into a zero vector.
        rows = torch.as_tensor(inputs).tolist()
        seq_len = len(rows[0]) if rows else 0

        # Fill one pre-allocated array instead of growing a tensor with
        # torch.cat for every single word.
        embeds = np.zeros((len(rows), seq_len, EMBEDDING_DIM), dtype=np.float32)
        for row_idx, sentence in enumerate(rows):
            for col_idx, word_idx in enumerate(sentence):
                token = index_to_word.get(word_idx)
                if token is None:
                    continue
                try:
                    embeds[row_idx, col_idx] = vectors.get_vector(token)
                except KeyError:
                    # out-of-vocabulary token: deliberately left as zeros
                    continue

        return torch.from_numpy(embeds)


In [None]:
TRAIN_AND_SAVE = False  # 'False' stops this block from running when running the notebook

if TRAIN_AND_SAVE:

    model = BaselineLSTM(len(idx2word), len(idx2label))
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Summed (not averaged) loss over every token position of a batch.
    loss_func = torch.nn.CrossEntropyLoss(reduction='sum')

    for epoch in range(EPOCHS):
        model.train()

        total_tags = 0
        matched_tags = 0
        epoch_loss = 0
        for batch in data_iterator(input_documents, input_labels):
            optimizer.zero_grad()

            # Call the module itself (not .forward) so that hooks are honoured,
            # then flatten to one row of label scores per token position.
            pred_tags = model(batch.inputs).view(-1, model.n_labels)

            # true label for each word
            targets = batch.targets.flatten()

            batch_loss = loss_func(pred_tags, targets)
            epoch_loss += batch_loss.item()

            # optimization
            batch_loss.backward()
            optimizer.step()

            # Number of matched tags, counted with tensor ops instead of a
            # Python loop over every token.
            # NOTE(review): padded positions are included in both the loss and
            # this accuracy figure.
            matched_tags += (pred_tags.argmax(dim=1) == targets).sum().item()
            total_tags += targets.numel()

        # TODO: evaluate on the dev set once per epoch (model.eval(), loss on
        # the encoded dev documents) to track the validation loss.

        # Avoid a ZeroDivisionError when the iterator produced no batches.
        matched_pct = matched_tags / total_tags * 100 if total_tags else 0.0
        print(f"Epoch {epoch} loss: {epoch_loss:.2f},  total tags matched: {matched_pct:.2f}%")

    # Save the model. Pickle stores the class by reference, so BaselineLSTM has
    # to be defined again before the file can be loaded in a new kernel.
    mili_time = round(time.time() * 1000)
    file_name = f"nlp_project/models/model_{mili_time}.pkl"
    with open(file_name, "wb") as model_file:  # closes the handle even on error
        pickle.dump(model, model_file)

# Loading a saved model and preparing the dev set

In [6]:
# loading model
# The file holds a whole pickled model object. Pickle stores the class by
# reference, so the cell defining BaselineLSTM has to be run in this kernel
# first; otherwise unpickling fails with
# "Can't get attribute 'BaselineLSTM' on <module '__main__'>".
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('nlp_project/models/model_1679946334736.pkl', 'rb') as f:
    model = pickle.load(f)

AttributeError: Can't get attribute 'BaselineLSTM' on <module '__main__'>

--- Logging error ---
Traceback (most recent call last):
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 461, in dispatch_queue
    await self.process_one()
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 450, in process_one
    await dispatch(*args)
TypeError: object NoneType can't be used in 'await' expression

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\logging\__init__.py", line 1029, in emit
    self.flush()
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\logging\__init__.py", line 1009, in flush
    self.stream.flush()
OSError: [Errno 22] Invalid argument
Call stack:
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\use

In [None]:
# Read the dev split, keeping only the tokens and labels of every record.
dev_pairs = [
    (words, labels)
    for words, labels, _, _ in read_processed_data(DEV_PATH)
]
dev_documents = [tokens for tokens, _ in dev_pairs]
dev_labels = [tags for _, tags in dev_pairs]

In [None]:
# enc_dev_documents = [[word2idx.get(token, word2idx[PAD]) for token in doc] for doc in dev_documents]
# enc_dev_labels = [[label2idx.get(label, '') for label in labels] for labels in dev_labels]


# enc_dev_document_matrix = create_encoding_matrix(enc_dev_documents, word2idx[PAD], max_len=max_len)
# enc_dev_label_matrix = create_encoding_matrix(enc_dev_labels, label2idx['0'], max_len=max_len)

# dev_inputs = torch.tensor(enc_dev_document_matrix.to_numpy(), dtype=torch.long)
# dev_labels = torch.tensor(enc_dev_label_matrix.to_numpy(), dtype=torch.long)

In [None]:
# model.eval()
# pred_dev_tags = model(dev_inputs).view(-1, model.n_labels)
# true_dev_tags = dev_labels.flatten()
# val_loss = loss_func(pred_dev_tags, true_dev_tags).item()
# print(val_loss)

--- Logging error ---
Traceback (most recent call last):
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 461, in dispatch_queue
    await self.process_one()
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 450, in process_one
    await dispatch(*args)
TypeError: object NoneType can't be used in 'await' expression

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\logging\__init__.py", line 1029, in emit
    self.flush()
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\logging\__init__.py", line 1009, in flush
    self.stream.flush()
OSError: [Errno 22] Invalid argument
Call stack:
  File "c:\users\krzys\appdata\local\programs\python\python37\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\use

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>