# Build and train the model

In [6]:
import os
import math
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def train_model(train_file, model_file):
    # write your code here. You can add functions as well.
    # use torch library to save model parameters, hyperparameters, etc. to model_file
    print('Finished...')

if __name__ == "__main__":
    # make no changes here
    train_file = sys.argv[1]
    model_file = sys.argv[2]
    train_model(train_file, model_file)


Finished...


# Load and do the inference

In [None]:
import os
import math
import sys
import torch

def tag_sentence(test_file, model_file, out_file):
    # write your code here. You can add functions as well.
		# use torch library to load model_file
    print('Finished...')

if __name__ == "__main__":
    # make no changes here
    test_file = sys.argv[1]
    model_file = sys.argv[2]
    out_file = sys.argv[3]
    tag_sentence(test_file, model_file, out_file)


# The network:
1. CNN character level word embedder
1. concatenate CNN embedding with word embedding
1. bi-directional LSTM block, looking at a sentence
1. fully conncected layer? (what does linear projection mean?)

In [105]:
!pip install fastprogress
from fastprogress import progress_bar
from pathlib import Path
import numpy as np

training_data = Path("../data/sents.train")

Collecting fastprogress
  Downloading https://files.pythonhosted.org/packages/83/db/794db47024a26c75635c35f0ee5431aa8b528e895ad1ed958041290f83f7/fastprogress-0.1.21-py3-none-any.whl
Installing collected packages: fastprogress
Successfully installed fastprogress-0.1.21
[33mYou are using pip version 9.0.3, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Creating data input pipeline

In [229]:
import torch
import torch.utils.data


class Dataset(torch.utils.data.Dataset):
    """
    
    """
    def __init__(self, path, to_lower=True):
        self.to_lower = to_lower
        self.sentences = []
        self.vocab = []
        self.tags = []
        
        self.generate_dataset(path)
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        sentence_embs, tag_embs = self.transform_sentence(self.sentences[index])
        return sentence_embs, tag_embs
    
    def generate_dataset(self, path):
        with open(path, 'r') as input_file:
            self.sentences = input_file.read().split("\n")
            self.create_vocabs(self.sentences)
            
            self.vocab_size = len(self.vocab)
            self.tag_size = len(self.tags)
            
            if self.sentences[-1] == "":
                self.sentences.pop()
            
    
    def create_vocabs(self, sentences):
        vocab_set = set()
        tag_set = set()

        for sentence in sentences:
            for word in sentence.split(" "):
                try:
                    word, tag = self.split_words_tag(word)
                    vocab_set.add(word.lower() if self.to_lower else word)
                    tag_set.add(tag)
                except RuntimeError:
                    print("Not a valid word/tag pair: " + word)

        self.vocab = list(vocab_set)
        self.tags = list(tag_set)
            
    def transform_sentence(self, sentence):
        numeric_sent = []
        tags = []

        for word_tag in sentence.split(" "):
            try:
                word, tag = self.split_words_tag(word_tag)
                tag_id = self.tags.index(tag)
                word_id = self.vocab.index(word.lower() if self.to_lower else word)

            except RuntimeError:
                print("Not a valid word/tag pair: " + word_tag)
            except ValueError:
                print("Word not in the vocab: " + word_tag)
                # The id of an unknown word
                word_id = len(self.vocab)
                tag_id = len(self.tags)

            numeric_sent.append(word_id)
            tags.append(tag_id)

        return torch.tensor(numeric_sent), torch.tensor(tags)

    @staticmethod
    def split_words_tag(word):
        words_tag = word.split("/")
        
        if len(words_tag) < 2: 
            raise RuntimeError("Not a valid word/tag pair:" + word)
            
        tag = words_tag.pop()
        word = "/".join(words_tag)
        
        return word, tag
                
    


In [230]:
%%time
dataset = Dataset(training_data)

Not a valid word/tag pair: 
CPU times: user 1.56 s, sys: 28.1 ms, total: 1.59 s
Wall time: 1.58 s


## DataLoader implementation

In [243]:
from torch.utils.data import DataLoader

batch_size = 1
num_workers = 4


pos_dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

# Initial simple model implementation 

## Proposed plan:
1. Begin with word-level LSTM (check for an example in forum)
2. Make it bi-directional
3. Add character-level CNN

Try a new notebook: https://polynote.org/docs/01-installation.html

In [207]:
class PipelineTestModel(nn.Module):
    def __init__(self, vocab_size, emb_dims, output_dims):
        super(PipelineTestModel, self).__init__()
        
        self.vocab_size = vocab_size
        self.emb_dims = emb_dims
        self.output_dims = output_dims
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_dims)
        self.fc = nn.LSTM(self.emb_dims, self.output_dims)
        
    def forward(self, sentence):
        print(sentence.shape)
        emb = self.emb(sentence)
        print(emb.shape)
#         tags = F.softmax(self.fc(emb), dim=self.output_dims)
        tags = self.fc(emb.view(len(sentence), 1, -1))
        print(tags.shape)
        return tags    
    
    
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [217]:
embedding_dims = 128
hidden_dims = 64
vocab_size = dataset.vocab_size
tagset_size = dataset.tag_size

model = LSTMTagger(embedding_dims, hidden_dims, vocab_size, tagset_size)

In [232]:
pos_dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

In [245]:
gpu = torch.device("cuda")

for epoch in range(1):
    for x, y in pos_dataloader:
        x_gpu, y_gpu = x.to(gpu), y.to(gpu)

RuntimeError: Cannot initialize CUDA without ATen_cuda library. PyTorch splits its backend into two shared libraries: a CPU library and a CUDA library; this error has occurred because you are trying to use some CUDA functionality, but the CUDA library has not been loaded by the dynamic linker for some reason.  The CUDA library MUST be loaded, EVEN IF you don't directly use any symbols from the CUDA library! One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many dynamic linkers will delete dynamic library dependencies if you don't depend on any of their symbols.  You can check if this has occurred by using ldd on your binary to see if there is a dependency on *_cuda.so library.