In [1]:
import os
import math
import sys
import torch

def tag_sentence(test_file, model_file, out_file):
    """Placeholder entry point of the assignment template.

    The three arguments are accepted but not used yet; the function only
    prints a completion message and returns ``None``.
    """
    # write your code here. You can add functions as well.
    # use torch library to load model_file
    print("Finished...")

if __name__ == "__main__":
    # make no changes here
    # NOTE(review): the command-line driver below is commented out in this
    # notebook copy, so the guard currently does nothing. Presumably this is
    # because sys.argv does not carry the three file paths inside a notebook
    # kernel; confirm before restoring it for the standalone script.
#     test_file = sys.argv[1]
#     model_file = sys.argv[2]
#     out_file = sys.argv[3]
#     tag_sentence(test_file, model_file, out_file)
    pass


# Loading model & dictionary

In [2]:
def load_model(model_filename, map_location=None):
    """Load the serialized tagger and its companion dictionaries object.

    Two files are read: ``<model_filename>_1.data`` (the model) and
    ``<model_filename>_2.data`` (the dictionaries).

    Args:
        model_filename: Common path prefix of the two files; a ``str`` or any
            path-like object whose ``str()`` is the prefix.
        map_location: Forwarded to ``torch.load``. Pass ``"cpu"`` to load a
            checkpoint that was saved from a GPU on a machine without CUDA.
            The default ``None`` keeps the devices stored in the files.

    Returns:
        A ``(model, dictionaries)`` tuple with the two deserialized objects.
    """
    # SECURITY: torch.load unpickles arbitrary Python objects, so only point
    # this at files you trust.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects fully pickled objects; confirm the installed version if loading
    # starts failing.
    model = torch.load(f"{model_filename}_1.data", map_location=map_location)
    dictionaries = torch.load(f"{model_filename}_2.data", map_location=map_location)
    return model, dictionaries

In [3]:
import torch.nn as nn
import torch.nn.functional as F

class LSTMTagger(nn.Module):
    """Embedding -> single-layer LSTM -> linear projection POS tagger.

    ``forward`` handles one sentence at a time: it expects an integer tensor of
    word ids shaped ``(1, seq_len)`` and returns per-token log-probabilities of
    shape ``(seq_len, tagset_size)``.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table: word id -> dense vector of size embedding_dim.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects every hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return the log-softmax tag scores for every token of ``sentence``."""
        embedded = self.word_embeddings(sentence)
        # The length of the last row is the number of tokens in the sentence.
        seq_len = len(sentence[-1])

        # nn.LSTM wants (seq_len, batch, features); the batch size is fixed to 1.
        hidden_states, _ = self.lstm(embedded.view(seq_len, 1, -1))

        tag_logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(tag_logits, dim=1)

In [4]:
import torch
import torch.utils.data


class Dataset(torch.utils.data.Dataset):
    """Line-per-sentence corpus that maps tokens to integer ids.

    In training mode every whitespace-separated token is a ``word/tag`` pair;
    in inference mode (``training=False``) every token is a bare word. The
    vocabulary and the tag list are built once, from the first file loaded
    while the vocabulary is still empty, and are reused afterwards.

    The sentence list is deliberately left out of the pickled state, so a
    restored instance keeps its vocabularies but starts with no sentences
    until ``generate_dataset`` is called again.
    """

    def __init__(self, path, to_lower=True, training=True):
        """Read ``path`` and, because the vocab starts empty, build the vocabularies.

        Args:
            path: Text file with one sentence per line.
            to_lower: Lower-case words before vocabulary insertion/lookup.
            training: Whether tokens carry a ``/tag`` suffix.
        """
        self.to_lower = to_lower
        self.training = training

        self.sentences = []
        self.vocab = []
        self.tags = []

        self.generate_dataset(path)

    def __len__(self):
        """Number of sentences currently loaded."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return ``(word_ids, tag_ids)`` for the sentence at ``index``."""
        return self.transform_sentence(self.sentences[index])

    def generate_dataset(self, path):
        """Replace the sentence list with the lines of ``path``.

        The vocabularies (plus ``vocab_size`` / ``tag_size``) are only built
        when the current vocabulary is empty, so calling this on an instance
        that already has a vocabulary just swaps the sentences.
        """
        with open(path, 'r') as input_file:
            sentences = input_file.read().split("\n")

        # A trailing newline yields one empty last entry. Drop it before the
        # vocab is built so it is not reported as an invalid word/tag pair.
        if sentences and sentences[-1] == "":
            sentences.pop()
        self.sentences = sentences

        if len(self.vocab) == 0:
            self.create_vocabs(self.sentences)
            self.vocab_size = len(self.vocab)
            self.tag_size = len(self.tags)

    def create_vocabs(self, sentences):
        """Collect the distinct words and tags of ``sentences``.

        Tokens that are not a ``word/tag`` pair are reported and skipped.
        Both lists are sorted so that ids do not depend on set iteration
        order, which varies between Python processes.
        """
        vocab_set = set()
        tag_set = set()

        for sentence in sentences:
            for token in sentence.split(" "):
                try:
                    word, tag = self.split_words_tag(token)
                except RuntimeError:
                    print("Not a valid word/tag pair: " + token)
                    continue
                vocab_set.add(word.lower() if self.to_lower else word)
                tag_set.add(tag)

        self.vocab = sorted(vocab_set)
        self.tags = sorted(tag_set)

    def transform_sentence(self, sentence):
        """Convert one raw sentence into id tensors.

        Returns:
            ``(word_ids, tag_ids)``. ``tag_ids`` is a tensor in training mode
            and an empty list otherwise. In training mode, tokens that are not
            a valid ``word/tag`` pair or whose tag is unknown are reported and
            skipped, which keeps words and tags aligned. Unknown words are
            kept and mapped to ``len(self.vocab) - 1``.
        """
        numeric_sent = []
        tags = []

        for word_tag in sentence.split(" "):
            if self.training:
                try:
                    word, tag = self.split_words_tag(word_tag)
                except RuntimeError:
                    print("Not a valid word/tag pair: " + word_tag)
                    continue
                try:
                    tag_id = self.tags.index(tag)
                except ValueError:
                    print("Tag not in the tag set: " + word_tag)
                    continue
            else:
                word = word_tag

            try:
                word_id = self.vocab.index(word.lower() if self.to_lower else word)
            except ValueError:
                print("Word not in the vocab: " + word_tag)
                # The id of an unknown word.
                # NOTE(review): this is simply the last vocabulary entry; no
                # dedicated unknown token is visible in this class. Confirm
                # the training side reserved that slot.
                word_id = len(self.vocab) - 1

            numeric_sent.append(word_id)
            if self.training:
                tags.append(tag_id)

        # An explicit dtype keeps empty sentences as integer tensors.
        word_ids = torch.tensor(numeric_sent, dtype=torch.long)
        if self.training:
            return word_ids, torch.tensor(tags, dtype=torch.long)
        return word_ids, []

    @staticmethod
    def split_words_tag(word):
        """Split ``word/tag`` at the last slash.

        Raises:
            RuntimeError: If ``word`` contains no ``/`` at all.
        """
        # Splitting at the last slash leaves slashes inside the word untouched.
        head, separator, tag = word.rpartition("/")

        if not separator:
            raise RuntimeError("Not a valid word/tag pair:" + word)

        return head, tag

    def print_sentence(self, sentence):
        """Print the vocabulary words behind a tensor of word ids."""
        print(" ".join(self.decode_sentence(sentence)))

    def decode_sentence(self, sentence):
        """Map a tensor of word ids back to vocabulary words."""
        return [self.vocab[word.item()] for word in sentence.view(-1)]

    def decode_tags(self, tag_ids):
        """Map a tensor of tag ids back to tag strings."""
        return [self.tags[tag.item()] for tag in tag_ids.view(-1)]

    def __getstate__(self):
        """Pickle everything except the sentence list."""
        state = dict(self.__dict__)
        # pop() tolerates instances that never had sentences.
        state.pop('sentences', None)
        return state

    def __setstate__(self, d):
        """Restore the pickled attributes and start with no sentences."""
        self.__dict__.update(d)
        self.__dict__.update({'sentences': []})


In [5]:
# Restore the trained tagger and its companion object from disk. load_model
# appends "_1.data" / "_2.data" to this prefix, so both files are looked up
# relative to the current working directory. The second object is used below
# as a Dataset (vocabulary, tag list and sentence-to-id conversion).
model, dataset = load_model("LSTMTagger_5")

# Digesting the new input data

In [6]:
from pathlib import Path

# Point the restored dataset at the test sentences. generate_dataset only
# rebuilds the vocabularies when the current vocab is empty, so this is
# expected to swap the sentence list only (assuming the restored vocab is
# non-empty; TODO confirm).
test_data = Path("../data/sents.test")
dataset.generate_dataset(test_data)
# Inference mode: each token is treated as a bare word (no "/tag" suffix is
# split off) and no tag ids are produced.
dataset.training = False

In [7]:
%%time
from torch.utils.data import DataLoader

gpu = torch.device("cuda")
cpu = torch.device("cpu")
model.to(cpu)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=4)

i = 0

preds = []
for x, _ in dataloader:
#     if i > 100:
#         break
#     i += 1
    predictions = model(x.to(cpu))
    _, pos_tag_ids = predictions.max(1)
    
    words = dataset.decode_sentence(x)
    tags = dataset.decode_tags(pos_tag_ids)
    word_tags = ["/".join(word_tag) for word_tag in zip(words, tags)]
    
    preds.append(" ".join(word_tags))


Word not in the vocab: forest-product
Word not in the vocab: overpaying
Word not in the vocab: A.D.
Word not in the vocab: WTD
Word not in the vocab: Correll
Word not in the vocab: Polytechnic
Word not in the vocab: Correll
Word not in the vocab: Strother
Word not in the vocab: Pamplin
Word not in the vocab: researching
Word not in the vocab: Pamplin
Word not in the vocab: enticed
Word not in the vocab: befuddled
Word not in the vocab: Phi
Word not in the vocab: Kappa
Word not in the vocab: retentive
Word not in the vocab: non-core
Word not in the vocab: recession-inspired
Word not in the vocab: 467
Word not in the vocab: reallocate
Word not in the vocab: 30,537
Word not in the vocab: reallocated
Word not in the vocab: 23,403
Word not in the vocab: Barbados
Word not in the vocab: Californian
Word not in the vocab: upsetting
Word not in the vocab: Backseat
Word not in the vocab: railings
Word not in the vocab: railings
Word not in the vocab: railing
Word not in the vocab: railing
Word n

Word not in the vocab: presumes
Word not in the vocab: Sidak
Word not in the vocab: impetuous
Word not in the vocab: droughts
Word not in the vocab: mortgaged
Word not in the vocab: reclaimed
Word not in the vocab: price-support
Word not in the vocab: hardest-hit
Word not in the vocab: disaster-assistance
Word not in the vocab: 240-page
Word not in the vocab: Dakotas
Word not in the vocab: land-idling
Word not in the vocab: cultivation
Word not in the vocab: 238,000-circulation
Word not in the vocab: one-newspaper
Word not in the vocab: 170,000
Word not in the vocab: senses
Word not in the vocab: tire-kickers
Word not in the vocab: lookee-loos
Word not in the vocab: newsstand
Word not in the vocab: balkanized
Word not in the vocab: sports-oriented
Word not in the vocab: flirted
Word not in the vocab: News-American
Word not in the vocab: Herald-American
Word not in the vocab: cornerstones
Word not in the vocab: Renaissance-style
Word not in the vocab: Simeon
Word not in the vocab: bygon

Word not in the vocab: 7.40
Word not in the vocab: 7.40
Word not in the vocab: 2017
Word not in the vocab: 2029
Word not in the vocab: 2029
Word not in the vocab: Heiwado
Word not in the vocab: Svenska
Word not in the vocab: Intecknings
Word not in the vocab: Garanti
Word not in the vocab: Svenska
Word not in the vocab: Aktiebolaget
Word not in the vocab: Handelsbanken
Word not in the vocab: Takashima
Word not in the vocab: 3.42
Word not in the vocab: Koizumi
Word not in the vocab: Sangyo
Word not in the vocab: Schweiz
Word not in the vocab: avid
Word not in the vocab: major-league
Word not in the vocab: moneymakers
Word not in the vocab: Pepperdine
Word not in the vocab: redistribute
Word not in the vocab: Baim
Word not in the vocab: mega-stadium
Word not in the vocab: spaces
Word not in the vocab: spaces
Word not in the vocab: Robbie
Word not in the vocab: city-owned
Word not in the vocab: Robbie
Word not in the vocab: Landrieu
Word not in the vocab: Superdome
Word not in the vocab: 

Word not in the vocab: Papua-New
Word not in the vocab: 36-store
Word not in the vocab: Younkers
Word not in the vocab: Younkers
Word not in the vocab: Younkers
Word not in the vocab: 313
Word not in the vocab: Younkers
Word not in the vocab: Hubbell
Word not in the vocab: Younkers
Word not in the vocab: Equus
Word not in the vocab: 13.65
Word not in the vocab: Equus
Word not in the vocab: all-cash
Word not in the vocab: Equus
Word not in the vocab: 13.65
Word not in the vocab: Equus
Word not in the vocab: Reupke
Word not in the vocab: Reupke
Word not in the vocab: several-year
Word not in the vocab: Reupke
Word not in the vocab: Reupke
Word not in the vocab: Reupke
Word not in the vocab: directorship
Word not in the vocab: eight-person
Word not in the vocab: Shepperd
Word not in the vocab: 913
Word not in the vocab: Reupke
Word not in the vocab: Judah
Word not in the vocab: Mannix
Word not in the vocab: Dunkin'
Word not in the vocab: Dunkin'
Word not in the vocab: Dunkin'
Word not in 

Word not in the vocab: Eveready
Word not in the vocab: Baking
Word not in the vocab: 55.1
Word not in the vocab: cash-and-stock
Word not in the vocab: Ravenswood
Word not in the vocab: corn-buying
Word not in the vocab: bottlenecks
Word not in the vocab: barge
Word not in the vocab: reaping
Word not in the vocab: gyrate
Word not in the vocab: scrounge
Word not in the vocab: tows
Word not in the vocab: Dunton
Word not in the vocab: Barge
Word not in the vocab: barge
Word not in the vocab: Lyle
Word not in the vocab: Waterloo
Word not in the vocab: Biedermann
Word not in the vocab: barge
Word not in the vocab: Allendale
Word not in the vocab: sidestep
Word not in the vocab: larger-than-normal
Word not in the vocab: 19.94
Word not in the vocab: 58.64
Word not in the vocab: 377.60
Word not in the vocab: 494.50
Word not in the vocab: 5.2180
Word not in the vocab: 170,262
Word not in the vocab: 226,570,380
Word not in the vocab: Disputada
Word not in the vocab: precedes
Word not in the vocab

In [8]:
def export_preds(preds, filename):
    """Write the predicted sentences to ``filename``, one per line.

    Lines are separated by a newline; no newline is written after the last
    one. An existing file is overwritten.
    """
    with open(filename, mode="w") as handle:
        joined = "\n".join(preds)
        handle.write(joined)

In [9]:
# Persist the tagged sentences collected in `preds`.
export_preds(preds, "test_output.txt")

# Shell escape: run the evaluation script on the written output and the
# reference answers; it prints the accuracy shown below this cell.
!python3 ../data/eval.py test_output.txt ../data/sents.answer

Accuracy= 0.01200848151491613
