In [2]:
import codecs
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import time

import random
random.seed(0)
torch.manual_seed(0)

# from transformers import BertTokenizer, BertModel
# bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# bert_model = BertModel.from_pretrained("bert-base-cased")
import matplotlib.pyplot as plt

import sys
sys.path.insert(1, '/work/nlp-project')
from scripts.read_write_data import read_processed_data, write_baseline_pred
from scripts.evaluation_functions import f1_score, tag_accuracy
from models.classes import DataIterator, Batch

import gensim.models
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                '/work/nlp-project/models/GoogleNews-50k.bin', binary=True)

import matplotlib.pyplot as plt

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# TRAIN_PATH = "nlp-project/data/processed/train.conll"
TRAIN_PATH = "/work/nlp-project/data/processed/train_splits/labeled.conll"
DEV_PATH = "nlp-project/data/processed/dev.conll"
TEST_PATH = "nlp-project/data/processed/test.conll"

# Loading data

x_train = []
y_train = []
for words, labels, _, _ in read_processed_data(TRAIN_PATH):
    x_train.append(words)
    y_train.append(labels)

x_dev = []
y_dev = []
for words, labels, _, _ in read_processed_data(DEV_PATH):
    x_dev.append(words)
    y_dev.append(labels)

In [4]:
class BiLSTM(torch.nn.Module):
    def __init__(self,
                embedding_type = 'google',
                LSTM_HIDDEN=20,
                max_len=100,
                n_labels=3,
                batch_size=32,
                pad_token="<PAD>",
                pad_label=2
                ):
        super().__init__()
        
        self.embedding_type = embedding_type  # 'bert' (doesn't work) or 'google' for where to get embeddings from
        if embedding_type == 'google':
            self.EMBEDDING_DIM = 300  # length of embedding vectors
        elif embedding_type == 'bert':
            self.EMBEDDING_DIM = 768  # length of embedding vectors
        else:
            raise AttributeError("Must")
        self.LSTM_HIDDEN = LSTM_HIDDEN  # number of LSTM cells
        self.max_len=max_len  # maximum input sentence length, will be padded to this size
        self.n_labels = n_labels
        self.lstm = nn.LSTM(input_size=self.EMBEDDING_DIM, hidden_size=self.LSTM_HIDDEN, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(in_features=2 * self.LSTM_HIDDEN, out_features=n_labels)
        self.batch_size = batch_size
        self.pad_token = pad_token
        self.pad_label = pad_label

    def pad_inputs(self, collection: List[List[int]], padding_token):
        to_series = [pd.Series(el) for el in collection]
        enc_matrix = (pd.concat(to_series, axis=1)
                        .reindex(range(self.max_len))
                        .fillna(padding_token)
                        .T)
        collection = enc_matrix.values.tolist()
        return collection

    def forward(self, inputs):
        '''
        Implements a forward pass through the Bi-LSTM.
        inputs are a batch (list) of sentences.
        '''
        if self.embedding_type == 'bert':
            word_embeds = self._get_bert_embeds(inputs)
        elif self.embedding_type == 'google':
            word_embeds = self._get_google_embeds(inputs)

        # word_embeds = nn.Dropout(p=0.2)(word_embeds)
        lstm_result, _ = self.lstm(word_embeds)
        # lstm_result = nn.Dropout(p=0.3)(lstm_result)
        tags = self.linear(lstm_result)
        log_probs = F.softmax(tags, dim=2)
        return log_probs
    
    def _get_google_embeds(self, inputs):
        embeddings = torch.Tensor()
        for sentence in inputs:
            sentence_embeds = torch.Tensor()
            for word in sentence:
                if GoogleEmbs.__contains__(word):
                    embed = GoogleEmbs.get_vector(word)
                    embed.setflags(write = True)
                    embed = torch.from_numpy(embed)
                else:
                    embed = torch.zeros(300)  # the word is not in the model dictionary, so use zero vector
                sentence_embeds = torch.cat((sentence_embeds, embed), dim=0)
            embeddings = torch.cat((embeddings, sentence_embeds), dim=0)
        return embeddings.view(len(inputs), -1, self.EMBEDDING_DIM)

    # def _get_bert_embeds(self, inputs):
    #     embeddings = torch.Tensor().float()
    #     for sentence in inputs:

    #         input_ids = torch.Tensor([bert_tokenizer.convert_tokens_to_ids(sentence)]).long()
    #         sentence_embeds = bert_model(input_ids)[0][0].float()

    #         embeddings = torch.cat((embeddings, sentence_embeds), dim=0).float()
    #     return embeddings.view(len(inputs), -1, self.EMBEDDING_DIM).float()
    
    def fit(self, documents, labels, dev, LEARNING_RATE=0.01, EPOCHS=3):
        
    
        self.train()

        # Padding data
        padded_documents = self.pad_inputs(documents, self.pad_token)
        padded_labels = self.pad_inputs(labels, self.pad_label)
        padded_labels = [list(map(int,sentence)) for sentence in padded_labels]
        if dev:
            dev_docs, dev_labs = dev
            padded_dev_docs = self.pad_inputs(dev_docs, self.pad_token)
            padded_dev_labs = self.pad_inputs(dev_labs, self.pad_label)
            padded_dev_labs = [list(map(int,s)) for s in padded_dev_labs]

        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        loss_func = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=self.pad_label)  # ignores loss for padding token
        data_iterator = DataIterator(batch_size=self.batch_size)
    	
        # Logs for performance
        train_f1_log = []
        dev_f1_log = [],

        for epoch in range(EPOCHS):
            
            total_tags = 0
            matched_tags = 0
            epoch_loss = 0
            batch_f1s = []

            for i, batch in enumerate(data_iterator(padded_documents, padded_labels)):

                pred_tags = self.forward(inputs=batch.inputs)
                pred_tags = pred_tags.view(-1, self.n_labels) # probability distribution for each tag across all words in batch
                targets = torch.tensor(batch.targets).flatten()  # true label for each word
                batch_loss = loss_func(pred_tags, targets)
                epoch_loss += batch_loss.item()
                
                # backprop
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                batch_f1 = f1_score(targets, pred_tags, pad_label=self.pad_label, relaxed=True)  # exact match f1
                batch_f1s.append(batch_f1)

                # batch_acc = tag_accuracy(targets, pred_tags, pad_label=self.pad_label)

            train_epoch_f1 = round(np.mean(batch_f1s), 3)  # mean f1 over batches in epoch
            train_f1_log.append(train_epoch_f1)

            print(f"Epoch {epoch}, train F1:{train_f1_log[epoch]}, loss: {epoch_loss:.3f}")
            # print(f"Epoch {epoch}, F1: {train_epoch_f1:.3f}, loss: {epoch_loss:.3f}")


Training model:

In [5]:
model = BiLSTM(LSTM_HIDDEN=5)
model.fit(x_train, y_train, dev=None, LEARNING_RATE=0.1, EPOCHS=10)

Epoch 0, train F1:1.0, loss: 63776.308
Epoch 1, train F1:1.0, loss: 63508.484
Epoch 2, train F1:1.0, loss: 63508.484
Epoch 3, train F1:1.0, loss: 63508.483
Epoch 4, train F1:1.0, loss: 63508.483
Epoch 5, train F1:1.0, loss: 63508.483
Epoch 6, train F1:1.0, loss: 63508.483
Epoch 7, train F1:1.0, loss: 63508.483
Epoch 8, train F1:1.0, loss: 63508.483


KeyboardInterrupt: 

In [0]:
"""
Full training data:
lr=.1, LSTM_HIDDEN=1, 16 epochs, train F1:    0.634
lr=.1, LSTM_HIDDEN=5, 16 epochs, train F1:    0.641
lr=0.07, LSTM_HIDDEN=10, 16 epochs, train F1: 0.70
lr=.05, LSTM_HIDDEN=10, 16 epochs, train F1:  0.718 (0.73 at 38 epochs)

"""

In [0]:
train_f1 = [0.0, 0.0, 0.288, 0.593, 0.609, 0.613, 0.617, 0.618, 0.619, 0.626, 0.626, 0.632, 0.637, 0.638, 0.632, 0.641, 0.64, 0.637, 0.636, 0.64]

plt.plot(train_f1)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>