In [1]:
import codecs
import time
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import pickle

import sys
sys.path.insert(1, '/work/nlp-project')
from scripts.read_write_data import read_processed_data, write_baseline_pred
from models.classes import SecondLSTM, DataIterator, Batch

import gensim.models
# Load word2vec vectors from a binary word2vec-format file into a gensim KeyedVectors object.
# NOTE(review): `GoogleEmbs` is not referenced by any other cell visible in this
# notebook, and the recorded run of this cell was interrupted by the kernel.
# Confirm the load is still needed here (e.g. by the unpickled model) before keeping it.
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                '/work/nlp-project/models/GoogleNews-50k.bin', binary=True)

from sklearn.metrics import f1_score, accuracy_score

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [5]:
def pad_inputs(collection: List[List[int]], pad_token, max_len: int = None):
    """Pad (or truncate) every sequence in ``collection`` to a common length.

    The padding is done in plain Python so that elements keep their original
    type. (A pandas round trip would turn integer sequences into floats as
    soon as NaN placeholders appear, and would overwrite genuine ``None``/NaN
    elements with ``pad_token``.)

    Args:
        collection: Sequences to pad, e.g. token lists or label lists.
        pad_token: Value appended to sequences shorter than ``max_len``.
        max_len: Target length. When ``None``, the length of the longest
            sequence in ``collection`` is used.

    Returns:
        A list with one list per input sequence, each exactly ``max_len``
        elements long. An empty ``collection`` yields an empty list.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    sequences = [list(seq) for seq in collection]

    # Compare against None explicitly so an explicit max_len=0 is honoured.
    if max_len is None:
        max_len = max((len(seq) for seq in sequences), default=0)
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Slicing truncates long sequences; a negative repeat count yields no padding.
    return [seq[:max_len] + [pad_token] * (max_len - len(seq)) for seq in sequences]

# Evaluation on dev set

In [2]:
# Restore the trained model from its pickle file.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that come from a trusted source.
with open('/work/nlp-project/models/model_04-03.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)
# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also what the notebook displays.
model.eval()

SecondLSTM(
  (lstm): LSTM(300, 10, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=20, out_features=3, bias=True)
)

In [3]:
# Locations of the processed CoNLL splits (relative to the working directory).
TRAIN_PATH = "nlp-project/data/processed/train.conll"
DEV_PATH = "nlp-project/data/processed/dev.conll"
TEST_PATH = "nlp-project/data/processed/test.conll"

# Collect the words and labels of every dev record; the two trailing fields
# of each record are not used in this notebook.
dev_documents, dev_labels = [], []
for record in read_processed_data(DEV_PATH):
    words, labels, _, _ = record
    dev_documents.append(words)
    dev_labels.append(labels)

Padding the dev documents and labels to a fixed length:

In [6]:
# Every dev document and label sequence is brought to this fixed length.
max_len = 99
# Documents are padded with the '<PAD>' token, label sequences with -100.
dev_docs_padded = pad_inputs(collection=dev_documents, pad_token='<PAD>', max_len=max_len)
dev_labels_padded = pad_inputs(collection=dev_labels, pad_token=-100, max_len=max_len)

In [11]:
# Sanity check: run the model on the first padded dev document.
doc = dev_docs_padded[0]
print(doc)
# Call the module itself rather than .forward() so that registered hooks run,
# and disable autograd because this is inference only. The model receives a
# batch, here a list holding the single document.
with torch.no_grad():
    pred = model([doc])

['Iguazu', 'is', 'a', 'big', 'or', 'a', 'small', 'country', '?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
torch.Size([300])
torch.Size([600])
torch.Size([900])
torch.Size([1200])
torch.Size([1500])
torch.Size([1800])
torch.Size([21

In [0]:
# NOTE(review): `results` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. The cell that computes the
# dev-set predictions needs to be restored above this one.
results[0]

In [0]:
# Trim each padded prediction back to the true length of its document and
# convert the labels to strings.
pred_labels = [
    [str(label) for label in padded_pred_labels[:len(true_labels)].tolist()]
    for padded_pred_labels, true_labels in zip(results, dev_labels)
]

In [0]:
# Show predicted vs. gold labels of the first dev document, one list per line.
print(pred_labels[0], dev_labels[0], sep='\n')

In [0]:
# Write one "word<TAB>predicted label" line per token, with a blank line
# after each document.
# Mode 'w' (not 'a'): re-running the cell must overwrite the file rather than
# append a second copy of every prediction to it.
with open('nlp-project/data/predictions/bi-lstm_binary_preds.conll', encoding='utf-8', mode='w') as file:
    for document_words, document_preds in zip(dev_documents, pred_labels):
        file.writelines(f'{word}\t{lab}\n' for word, lab in zip(document_words, document_preds))
        file.write('\n')

Now writing the gold labels:

In [0]:
# Re-read the dev split into a frame and drop columns 2 and 3, keeping
# column 0 (used below as words) and column 1 (used below as labels).
df = pd.DataFrame(read_processed_data('/work/nlp-project/data/processed/dev.conll')).drop(columns=[2, 3])
words, labels = (df[column].tolist() for column in (0, 1))

In [0]:
# Write one "word<TAB>gold label" line per token, with a blank line after
# each document.
# Mode 'w' (not 'a'): re-running the cell must overwrite the file rather than
# append a second copy of the gold labels to it. The f-string also formats
# labels that are not already strings.
with open('nlp-project/data/predictions/bi-lstm_binary_gold.conll', encoding='utf-8', mode='w') as file:
    for document_words, document_labels in zip(words, labels):
        file.writelines(f'{word}\t{lab}\n' for word, lab in zip(document_words, document_labels))
        file.write('\n')

In [0]:
# Smoke test on a hand-written sentence.
sentence = ['Hello', 'Sam']
# Pad to the same fixed length as the dev documents (`max_len`) instead of
# repeating the literal 99; sentences already longer than that are left as-is.
pad_length = max(0, max_len - len(sentence))
padded_sentence = sentence + ['<PAD>'] * pad_length
# Inference only, so no autograd graph is needed. The model receives a batch,
# here a list holding the single padded sentence.
with torch.no_grad():
    model_output = model([padded_sentence])
model_output

In [0]:
# Predict labels for the single `sentence` and store them in `predicted_labels`.
padding = max(0, max_len - len(sentence))
# Build a new list: list.extend() pads in place and returns None, so its
# return value must not be assigned (and `sentence` should stay unpadded).
padded_sentence = sentence + ['<PAD>'] * padding
# Inference only; the model receives a batch (a list of token lists).
with torch.no_grad():
    model_output = model([padded_sentence])
# NOTE(review): assumes model_output is a (batch, seq_len, n_classes) score
# tensor, so argmax over the last axis gives one label id per token —
# confirm against SecondLSTM.forward.
model_prediction = model_output.argmax(dim=-1)[0]
# Drop the predictions made for padding positions.
predictions = model_prediction[:len(sentence)].tolist()
predicted_labels = np.array(predictions)

In [0]:
# Predict labels for every dev document and collect them in one flat array.
# Predictions are gathered in a Python list and converted once at the end;
# calling np.append inside the loop would copy the whole array every iteration.
flat_predictions = []
with torch.no_grad():  # inference only
    for sentence in dev_documents:
        pad_length = max(0, max_len - len(sentence))
        padded_sentence = sentence + ['<PAD>'] * pad_length
        # The model receives a batch (a list of token lists).
        model_output = model([padded_sentence])
        # NOTE(review): assumes model_output is a (batch, seq_len, n_classes)
        # score tensor, so argmax over the last axis gives one label id per
        # token — confirm against SecondLSTM.forward.
        model_prediction = model_output.argmax(dim=-1)[0]
        # Drop the predictions made for padding positions.
        predictions = model_prediction[:len(sentence)].tolist()
        flat_predictions.extend(predictions)

predicted_labels = np.array(flat_predictions)
print(predicted_labels)
    

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>