# Word Inflection

## Data

source: [github repository](https://github.com/sigmorphon/conll2017) - datasets for the 2017 joint CoNLL-SIGMORPHON shared task on morphological reinflection. The repository contains word inflection datasets for 52 languages. This project takes only three of the languages: English, Finnish and Spanish.


In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:

import os.path

CONLL_SIGMORPHON_DATA_PATH="./data/conll2017"

### Load Data

In [3]:
import pandas as pd
import os
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def read_tsv(language, setting):
    train = pd.read_csv(f"{CONLL_SIGMORPHON_DATA_PATH}/all/task1/{language}-train-{setting}",
                        sep="\t", header=None, names=["input","output","msd"])
    dev = pd.read_csv(f"{CONLL_SIGMORPHON_DATA_PATH}/all/task1/{language}-dev",
                        sep="\t", header=None, names=["input","output","msd"])
    test = pd.read_csv(f"{CONLL_SIGMORPHON_DATA_PATH}/answers/task1/{language}-uncovered-test",
                        sep="\t", header=None, names=["input","output","msd"])
    return train, dev, test

Both take a dataset as argument and `yield` lists as output. 

1. `yield_input()` `yields` lists of the form `["<start>", "w", "a", "l", "k", "FEAT=V", "FEAT=PAST", "<end>"]`
1. `yield_output()` `yields` lists of the form `["<start>", "w", "a", "l", "k", "e", "d", "<end>"]`


In [4]:
train_df, _, _ = read_tsv("spanish", "medium")
train_df.head()

Unnamed: 0,input,output,msd
0,manducar,manducado,V.PTCP;PST;MASC;SG
1,reestrenar,reestrenaba,V;IND;PST;3;SG;IPFV
2,fragmentar,fragmentaríamos,V;COND;1;PL
3,descomponer,no descompongáis,V;NEG;IMP;2;PL
4,fusilar,fusilas,V;IND;PRS;2;SG


In [5]:
def yield_input(data):
    for ex in data:
        yield ["<start>"] + list(ex[0]) + ["FEAT=" + i for i in ex[2].split(";")] + ["<end>"]
    
def yield_output(data):
    for ex in data:
        yield ["<start>"] + list(ex[1]) + ["<end>"]

In [6]:
from collections import namedtuple
from itertools import chain 
Example = namedtuple("Example",["input", "output"])

class UDDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        self.iloc = data.iloc
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.iloc[index]   
    
def read_data(language,setting,batch_size):
    train_df, dev_df, test_df = read_tsv(language,setting)

    train = UDDataset(train_df)
    dev = UDDataset(dev_df)
    test = UDDataset(test_df)

    vocab = build_vocab_from_iterator(chain(yield_input(train), yield_output(train)),
                                      specials=["<pad>", "<unk>", "<start>", "<end>"])
    vocab.set_default_index(vocab["<unk>"])

    input_transform = lambda w: [vocab[c] for c in w]
    output_transform = lambda w: [vocab[c] for c in w]
    
    def collate_batch(batch):
        input_list, output_list, input_lens, output_lens = [], [], [], []
        for lemma, wf, in zip(yield_input(batch), 
                              yield_output(batch)):
            input_tensor = torch.tensor(input_transform(lemma), dtype=torch.long)
            output_tensor = torch.tensor(output_transform(wf), dtype=torch.long)
            input_list.append(input_tensor)
            output_list.append(output_tensor)
            input_lens.append(input_tensor.size()[0])
            output_lens.append(output_tensor.size()[0])

        return Example((pad_sequence(input_list, 
                                 batch_first=False, 
                                 padding_value=vocab["<pad>"]), torch.tensor(input_lens, dtype=torch.long)),
                       (pad_sequence(output_list, 
                                 batch_first=False, 
                                 padding_value=vocab["<pad>"]), torch.tensor(output_lens, dtype=torch.long)))

    train_iter = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    dev_iter = DataLoader(dev, batch_size=1, shuffle=False, collate_fn=collate_batch)
    test_iter = DataLoader(test, batch_size=1, shuffle=False, collate_fn=collate_batch)
    
    return train_iter, dev_iter, test_iter, vocab

Read the Spanish inflection dataset and print a training example returned by read_data along with the vocabulary

In [7]:
train_iter, dev_iter, test_iter, vocab = read_data("spanish", "medium", 1)

# Print the first training exaple
example = next(iter(train_iter))
print(example)
print(vocab.get_stoi())

Example(input=(tensor([[ 2],
        [16],
        [ 6],
        [ 8],
        [ 6],
        [ 5],
        [13],
        [ 7],
        [32],
        [ 4],
        [ 5],
        [11],
        [22],
        [21],
        [26],
        [17],
        [39],
        [ 3]]), tensor([18])), output=(tensor([[ 2],
        [16],
        [ 6],
        [ 8],
        [ 6],
        [ 5],
        [13],
        [ 7],
        [32],
        [ 4],
        [14],
        [10],
        [ 8],
        [ 3]]), tensor([14])))
{'g': 28, 'FEAT=V': 11, '<pad>': 0, '<unk>': 1, 'r': 5, 'FEAT=3': 27, '<start>': 2, '<end>': 3, 'i': 7, 'FEAT=MASC': 52, 'FEAT=FEM': 50, 'a': 4, 'FEAT=PL': 17, 'e': 6, 'FEAT=NEG': 46, 's': 8, 'n': 9, 'FEAT=POS': 42, 'o': 10, 'c': 12, 't': 13, 'm': 14, 'l': 15, 'd': 16, 'u': 18, 'FEAT=SG': 19, 'FEAT=V.PTCP': 47, 'p': 20, 'FEAT=PST': 21, 'j': 41, 'FEAT=IND': 22, 'FEAT=LGSPEC1': 43, 'b': 23, 'FEAT=SBJV': 24, 'FEAT=2': 25, 'FEAT=1': 26, 'v': 29, 'f': 30, 'FEAT=PRS': 31, 'FEAT=IPFV': 40, 'z': 32

### Basic Encoder-Decoder Model(without attention)

EX: 

`<start> s t o d g e FEAT=V FEAT=PST <end>` into a single hidden state vector which is then fed into a decoder network

Decoder then generates the output word form (`s t o d g e d`) one symbol at a time. 



In [8]:
import numpy as np

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.functional import log_softmax
from torch.optim import Adam, SGD

from random import random, seed, shuffle

# Ensure reproducible results.
seed(0)
torch.manual_seed(0)
np.random.seed(0)

import re

# Hyperparameters
EMBEDDING_DIM=50
RNN_HIDDEN_DIM=50
RNN_LAYERS=1
BATCH_SIZE=10
CHAR_DROPOUT=0.0
EPOCHS=10

# Maximum length of generated output word forms.
MAXWFLEN=40

def accuracy(sys,gold):
    assert(len(sys) == len(gold))
    return sum([1 if x==y else 0 for x,y in zip(sys,gold)])*100.0/len(gold)

#### Encoder Network


`Encoder` class which serves as a wrapper for a character embedding and a bidirectional LSTM network. 


<!-- 1. Embed input sequence in `ex.input` using the character `self.embedding`. Given an input tensor of dimension `(sequence_length,1)`, this should result in a `(sequence_length,1,EMBEDDING_DIM)` tensor.
1. Process the embedded input sequence using `self.rnn`. You should return the final hidden state returned by `self.rnn`. Note that `self.rnn` will return a hidden state of size `(2,1,RNN_HIDDEN_DIM)`.
1. Your last task is to convert the hidden state returned by `self.rnn` into a `(1,1,2*RNN_HIDDEN_DIM)` tensor and return it. You can do this using several pytorch functions, for example `torch.cat` or `torch.view`. -->

In [9]:
class Encoder(nn.Module):
        def __init__(self,alphabet):
                super(Encoder,self).__init__()
                self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
                self.rnn = nn.LSTM(EMBEDDING_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=True)

        def forward(self,ex):
            input, _ = ex.input #(sequence_len, 1)
            input = self.embedding(input) # (sequence_len, 1, embedding_dim)
            hss, (hn, cn) = self.rnn(input) # hn: (2, 1, rnn_hidden_dim)
            hs = torch.cat([hn[0], hn[1]], dim=1).unsqueeze(0) # (1,1, 2*rnn_hidden_dim)
            return hs

# An assertion to test that your implementation returns a tensor of the correct size. 
assert(Encoder(vocab.get_stoi())(example).size() == torch.Size([1,1,2*RNN_HIDDEN_DIM]))

#### Decoder Network


The `Decoder` class serves as a wrapper for a character embedding `embedding` and an LSTM network `rnn`. At each time-step `t`, the decoder `rnn` consumes an input vector which is a concatenation of an encoder hidden state and the embedding of the output character at position `t-1` in the output sequence (at position `t == 0`, this will be the embedding of the start of sequence symbol `<start>`). 



##### `Decoder.forward`

The `forward` function is used when training the decoder. It takes an encoder hidden state `encoder_hs` corresponding to an input sequence like `<start> s t o d g e FEAT=V FEAT=PST <END>` and returns a tensor corresponding to the output sequence `s t o d g e d <END>`. However, the `forward` function does not directly return the output sequence. Instead, it returns a tensor `distr` of size `(output_length,1,alphabet_size)`. For example, `distr[6,0,15]` indicates the log-probability that the output symbol at position `t` will be symbol number `15` (imagine for example that `15 == self.alphabet["d"]`).

Since our decoder is trained using teacher forcing, we will feed in the gold standard output symbol at position `t-1` when predicting the output at position `t` in `forward`. You can access the gold standard output sequence via `ex.output`.

In order to implement `forward`, you should:

1. Embed the gold standard outputword form in `ex.output` using `self.embedding`. This should give you a tensor `embedded_output` of size `(output_length - 1,1,EMBEDDING_DIM)` (note the `- 1` which is a result of clipping the final `<end>` symbol form the output sequence `<start> s t o d g e d <end>`).
1. Concatenate one copy of `encoder_hs` to each embedding vector in `embedded_output`. This should give you a tensor `decoder_input` of size `(output_length-1,1,EMBEDDING_DIM+2*RNN_HIDDEN_DIM)`.
1. Run `self.rnn` on `decoder_output`. This should give you a tensor of hidden states `decoder_hidden_states` having  dimension `(output_length-1,1,RNN_HIDDEN_DIM)`.
1. Apply `self.hidden2tag` and a `log_softmax` layer to `decoder_hidden_states`. this should give you a distribution tensor `distr` having dimension `(output_length-1,1,alphabet_size)`. 
1. Return `distr` and `output[1:]`.

##### `Decoder.generate`

The second function you need to implement is `generate`. It recursively generates an output word form given an encoder hidden state.  

It takes only one argument: an encoder hidden state of dimension `(1,1,2*RNN_HIDDEN_STATE)`. In contrast to `forward`, you don't get a gold standard output sequence as parameter since we need to use `generate` during test time. Instead, it is your task to recursively generate the output sequence starting with the sequence inital symbol `"<start>"`. 

At each time step, you should feed the current output symbol `output_char` and the encoder hidden state `encoder_hs` as input to the decoder. It is your task to maintain the internal state `decoder_state`. It is a pair `(hs,cs)`, where `hs` is the decoder hidden state and `cs` is its cell state. You need to initialize the decoder to this state and update it using the return value of `self.rnn.forward`. You should then predict the next output symbol using the updated value of `decoder_state` and `self.hidden2char` echoing the `forward` function you just implemented.

The `generate` function always produces an output of length `MAXWFLEN` but since this output may contain the `"<end>"` symbol, we can in reality generate shorter output sequences because `a b c <end> <end> ...` corresponds to the output word form `a b c`.

In order to implement `generate`, you should:

1. Embed `output_char` using `self.embedding`. This should give you a tensor `output_embedding` of dimension `(1,1,EMBEDDING_DIM)`.
1. Run `self.rnn` on the concatenation of `output_embedding` and `encoder_hs`. Note that you need to initialize the decoder to `decoder_state`. You should use the return value to update `decoder_state`.
1. Use the decoder hidden state `hs` from `decoder_state` and self.hidden2char to predict the next output character. You will probably need to use torch.argmax to find the most probable output symbol.
1. Update the value of the `output_char` variable to the current output character and add it to the result array. Note that if `output_char == torch.Tensor([[1]])`, then you need to add the integer `1` to result. 
1. After `MAXWFLEN` time-steps, return `result`.

In [10]:
x = torch.LongTensor([[vocab.get_stoi()["<start>"]]])
x.size()

torch.Size([1, 1])

In [11]:
vocab.get_stoi()["<start>"]

2

In [12]:
class Decoder(nn.Module):
    def __init__(self, alphabet):
        super(Decoder,self).__init__()
        self.alphabet = alphabet
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.rnn = nn.LSTM(EMBEDDING_DIM+2*RNN_HIDDEN_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=False)
        self.hidden2char = nn.Linear(RNN_HIDDEN_DIM, len(alphabet))
        
    def forward(self,ex,encoder_hs):
        # encoder_hs : (1,1, 2*rnn_hidden_dim)
        output, output_length = ex.output # output: (seq_len, 1)  output is the golden label
        output_em = self.embedding(output[:-1,:]) # (seq_length - 1,1,EMBEDDING_DIM)
        # concat with input rep
        encoder_hs_copy = encoder_hs.expand(output_em.size()[0], 1, 2*RNN_HIDDEN_DIM)
        emb = torch.cat([output_em, encoder_hs_copy], dim=2) # (seq_length-1, 1, embedding_dim + 2*rnn_hidden_dim)
        hss, _ = self.rnn(emb) # hss: (seq_length-1 x 1 x rnn_hidden_dim)
        dist = self.hidden2char(hss) # (seq_length-1 x 1 x len(vocab))
        dist = dist.log_softmax(dim = 2) # # (seq_length-1 x 1 x alphabet_size)
        return dist, output[1:]


        
    def generate(self,encoder_hs):
        # encoder_hs : (1,1, 2*rnn_hidden_dim)
        # We're not accumulating gradients during test time.
        with torch.no_grad():
            decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM), torch.zeros(1,1,RNN_HIDDEN_DIM))
            output_char = torch.LongTensor([[self.alphabet["<start>"]]]) # (1,1)
            result = []
            for _ in range(MAXWFLEN):
                output_embedding = self.embedding(output_char) # (1,1,EMBEDDING_DIM)
                emb = torch.cat([output_embedding, encoder_hs], dim = 2) # (1,1, embedding_dim + 2*rnn_hidden_dim)
                hss, decoder_state = self.rnn(emb, decoder_state) 
                dist = self.hidden2char(decoder_state[0]).softmax(dim = 2) #(1,1,len(alphabet))
                predicted_symbol = dist.argmax() # just a symbol
                output_char = torch.LongTensor([[predicted_symbol]]) # (1,1)
                result.append(output_char.numpy().tolist()[0][0])
            return result
            
# Assertions to test that your implementation returns objects of the correct size. 
alphabet = vocab.get_stoi()
encoder_hs = Encoder(alphabet)(example)
_, output_length = example.output
alphabet_size = len(alphabet)

assert(Decoder(alphabet)(example,encoder_hs)[0].size() == torch.Size([output_length - 1,1,alphabet_size]))
assert(len(Decoder(alphabet).generate(encoder_hs)) == MAXWFLEN)

#### Training the Model

`WordInflector` class combines the encoder and decoder networks. 

<!-- We also give you some code for training a `WordInflector`. This code will run backpropagation through time using the Adam optimizer. -->

<!-- Running 10 epochs of the training algorithm on the Spanish medium training set spanning 1,000 examples, you should come close to 10% accuracy. This is quite modest but it shows that your model is indeed training. If you are seeing 0-1% accuracy after training 10 epochs, then you have a problem somewhere in your code. -->

In [13]:
class WordInflector(nn.Module):
    def __init__(self, alphabet):
        super(WordInflector, self).__init__()
        self.alphabet = alphabet.get_stoi()
        self.integer2char = alphabet.get_itos()
        alphabet_size = len(self.alphabet)
        
        self.encoder = Encoder(self.alphabet)
        self.decoder = Decoder(self.alphabet)
    
    def get_string(self,ids):
        string = ''.join([self.integer2char[i] for i in ids])
        return re.sub("%s.*" % "<end>","",string)

    def forward(self, example):
        encoder_hs = self.encoder(example)
        return self.decoder(example,encoder_hs)
            
    def generate(self, data):
        all_results = []
        with torch.no_grad():
            for example in data:
                encoder_hs = self.encoder(example)
                output = self.decoder.generate(encoder_hs)
                all_results.append(self.get_string(output))
        return all_results
    
if __name__=="__main__":
    # Read the Spanish medium data set.
    train_iter, dev_iter, test_iter, vocab = read_data(language="spanish",
                                                       setting="medium",
                                                       batch_size=1)
    
    inflector = WordInflector(vocab)

    loss_function = nn.NLLLoss(ignore_index=vocab["<pad>"],reduction='mean')
    optimizer = Adam(inflector.parameters())
    gold_dev_words = [''.join(w.output) for w in dev_iter.dataset]

    for epoch in range(EPOCHS):
        tot_loss = 0 

        # Update parameters
        for i, batch in enumerate(train_iter):
            print("Example %u of %u" % (i+1,len(train_iter)),end="\r")
            inflector.zero_grad()
            tag_scores, tgt = inflector(batch)
            tgt = tgt.permute(1,0)
            tag_scores = tag_scores.permute(1,2,0)
            loss = loss_function(tag_scores,tgt) 
            loss.backward()
            optimizer.step()
            tot_loss += len(batch)*loss.detach().numpy()
        print()
        avg_loss = tot_loss/len(train_iter)
        print("EPOCH %u: AVG LOSS PER EX: %.5f" % (epoch+1,avg_loss))        

        # Evaluate on dev data.
        sys_dev_words = inflector.generate(dev_iter)
        print("DEV ACC: %.2f%%" % accuracy(sys_dev_words,gold_dev_words))

Example 1000 of 1000
EPOCH 1: AVG LOSS PER EX: 4.95871
DEV ACC: 0.00%
Example 1000 of 1000
EPOCH 2: AVG LOSS PER EX: 3.69327
DEV ACC: 0.00%
Example 1000 of 1000
EPOCH 3: AVG LOSS PER EX: 2.89394
DEV ACC: 0.40%
Example 1000 of 1000
EPOCH 4: AVG LOSS PER EX: 2.33738
DEV ACC: 1.20%
Example 1000 of 1000
EPOCH 5: AVG LOSS PER EX: 1.93215
DEV ACC: 2.90%
Example 1000 of 1000
EPOCH 6: AVG LOSS PER EX: 1.62633
DEV ACC: 4.00%
Example 1000 of 1000
EPOCH 7: AVG LOSS PER EX: 1.38394
DEV ACC: 5.50%
Example 1000 of 1000
EPOCH 8: AVG LOSS PER EX: 1.17550
DEV ACC: 7.50%
Example 1000 of 1000
EPOCH 9: AVG LOSS PER EX: 1.04067
DEV ACC: 8.50%
Example 1000 of 1000
EPOCH 10: AVG LOSS PER EX: 0.87147
DEV ACC: 10.10%


### Encoder-Decoder with Attention


The `forward` function for the old `Encoder` returned a single hidden state of dimension `(1,1,2*RNN_HIDDEN_DIM)`, namely the final hidden state. In contrast, you should return a hidden state for each time step, which means that `forward` should return a tensor of dimension `(input_length,1,2*RNN_HIDDEN_DIM)`. This will require a small change to your existing implementation of the `Encoder` class.

In [14]:
class Encoder(nn.Module):
        def __init__(self,alphabet):
                super(Encoder,self).__init__()
                self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
                self.rnn = nn.LSTM(EMBEDDING_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=True)

        def forward(self,ex):
            input, _ = ex.input #(sequence_len, 1)
            input = self.embedding(input) # (sequence_len, 1, embedding_dim)
            hss, (hn, cn) = self.rnn(input) # hss: (seq_len, 1, 2*rnn_hidden_dim)
            return hss


# An assertion to test that your implementation returns an object of the correct size. 
input, input_length = example.input
assert(Encoder(vocab.get_stoi())(example).size() == torch.Size([input_length,1,2*RNN_HIDDEN_DIM]))

The `Attention` class implements a version of [Bahdanau attention](https://blog.floydhub.com/attention-mechanism/). 

Its `forward` function takes two inputs: a tensor of encoder hidden states `encoder_hss` of dimension `(sequence_length, 1, 2*RNN_HIDDEN_DIM)` and a decoder hidden state `dec_state` of dimension `(1,1,RNN_HIDDEN_DIM)`. It computes a context weight for each of the encoder hidden states and the decoder hidden state using a feed-forward neural network with one hidden layer and a ReLU non-linearity. These weights are then normalized into a probability distribution $p_1, ..., p_T$ using a softmax layer. Finally, `forward` will return the weighted mean $p_1 e_1 + ... + p_T e_T$.      

In order to implement the `forward` function, you should:

1. Concatenate one copy of the decoder hidden state `decoder_hs` to each hidden state in `encoder_hss`. This will give you a tensor `conditioned` of dimension `(sequence_length,1,3*RNN_HIDDEN_DIM)`.
1. Apply the first layer of the feed-forward network `self.attention1` and `self.relu` to `conditioned`. This should give you a tensor `att1` of dimension `()`.

In [15]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention,self).__init__()

        self.linear1 = nn.Linear(3*RNN_HIDDEN_DIM,RNN_HIDDEN_DIM)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(RNN_HIDDEN_DIM,1)
    
    def forward(self,encoder_hss,decoder_hs):
        # encoder_hss : (seq_len, 1, 2*rnn_hidden_dim)
        # decoder_hs: (1,1,rnn_hidden_dim)
        decoder_hs_copy = decoder_hs.expand(encoder_hss.size()[0],-1,-1) # (seq_len, 1, rnn_hidden_dim)
        concat = torch.cat([encoder_hss,decoder_hs_copy],dim=2) # (seq_len, 1, 3*rnn_hidden_dim)
        att1 = self.linear1(concat)# (seq_len, 1, rnn_hidden_dim)
        att1 = self.relu(att1) # (seq_len, 1, rnn_hidden_dim)
        att1 = self.linear2(att1) # (seq_len, 1, 1)
        # expand att1 to encoder_hss shape
        att1 = att1.expand(-1,-1,2*RNN_HIDDEN_DIM) # (seq_len, 1, 2*rnn_hidden_dim)
        mean_att_weight = torch.sum(att1*encoder_hss, dim=0) # (1, 2*rnn_hidden_dim)
        context = mean_att_weight.unsqueeze(0) # (1, 1, 2*rnn_hidden_dim)
        return context

# An assertion to test that your implementation returns an object of the correct size. 
input, input_length = example.input
encoder_hss = Encoder(vocab.get_stoi())(example)
decoder_hs = torch.randn(1,1,RNN_HIDDEN_DIM)

assert(Attention()(encoder_hss,decoder_hs).size() == torch.Size([1,1,2*RNN_HIDDEN_DIM]))

The `Decoder` class will require some changes compared to the case without attention. The most drastic change concerns the `forward` function which will now become almost identical to the `generate` function.

##### `Decoder.forward`

The `forward` function takes a sequence of encoder hidden states given as a tensor `encoder_hss` corresponding to an input sequence like `<start> s t o d g e FEAT=V FEAT=PST <end>`. It has dimension `(sequence_length,1,2*RNN_HIDDEN_DIM)`. The `forward` function returns a tensor corresponding to the output sequence `s t o d g e d <end>`. As in the non-attentional case, the `forward` function does not directly return the output sequence. Instead, it returns a tensor `distr` of size `(output_length,1,alphabet_size)`. For example, `distr[6,0,15]` indicates the log-probability that the output symbol at position `t` will be symbol number `15` (imagine for example that `15 == self.alphabet["d"]`).

At each time-step `t`, the decoder uses `self.attention` to compute a context vector based on all of the encoder hidden states and the decoder hidden state `decoder_state` at time-step `t-1`. In contrast, to the `Decoder` without attention, we therefore need to recursively generate the output because the context vector depends on the decoder hidden state at time-step `t-1`. It is your task to maintain the internal state `decoder_state`. It is a pair `(hs,cs)`, where `hs` is the decoder hidden state and `cs` is its cell state.

Since our decoder is trained using teacher forcing, we will feed in the gold standard output symbol at position `t-1` when predicting the output at position `t` in `forward`. You can access the gold standard output sequence via `ex.output`.

In order to implement `forward`, you should:

1. Embed the gold standard output word form in `ex.output` using `self.embedding`. This should give you a tensor `embedded_output` of size `(output_length - 1,1,EMBEDDING_DIM)` (note the `- 1` which is a result of clipping the `<end>` symbol form the output sequence `<start> s t o d g e d <end>`).
1. Loop over the output sequence `<start> s t o d g e d <end>`.
1. Use `self.attention` to compute a context vector based on the decoder hidden state and all encoder hidden states. This should give you a tensor `context` of dimension `(1,1,2*RNN_HIDDEN_DIM)`.
1. Run `self.rnn` on the concatenation of `output_embedding` and `context`. Note that you need to initialize the decoder to `decoder_state`. You should use the return value to update `decoder_state`.
1. Use the decoder hidden state `hs` from `decoder_state` and self.hidden2char to predict the distribution for the next output character and append it to the `result` array.  
1. After `output_length - 1` time-steps, transform `result` into a tensor of dimension `(output_length - 1, 1, alphabet_size)` and return it (you should be able to transform the array into a tensor using `torch.cat`).

##### `Decoder.generate`

The `generate` function is very similar to the `forward` function except that we are not using teacher forcing.

It takes only one argument: a sequence of encoder hidden states of dimension `(sequence_length,1,2*RNN_HIDDEN_STATE)`. In contrast to `forward`, you don't get a gold standard output sequence as parameter since we need to use `generate` during test time. Instead, it is your task to recursively generate the output sequence starting with the sequence inital symbol `<start>`. 

At each time step, you should compute a `context` vector using `self.attention`, the current decoder state `decoder_state` and the sequence of encoder hidden states `encoder_hss`. You should then feed the current output symbol `output_char` and the context vector `context` as input to the decoder. 

It is your task to maintain the internal state `decoder_state`. It is a pair `(hs,cs)`, where `hs` is the decoder hidden state and `cs` is its cell state. You need to initialize the decoder to this state and update it using the return value of `self.rnn.forward`. You should then predict the next output symbol using the updated value of `decoder_state` and `self.hidden2char` echoing the `forward` function you just implemented.

The `generate` function always produces an output of length `MAXWFLEN` but since this output may contain the `<end>` symbol, we can in reality generate shorter output sequences because `a b c <end> <end> ...` corresponds to the output word form `a b c`.

In order to implement `generate`, you should:

1. Embed `output_char` using `self.embedding`. This should give you a tensor `output_embedding` of dimension `(1,1,EMBEDDING_DIM)`.
1. Compute a `context` vector using `self.attention`. You need to give `decoder_state` and `encoder_hss` as input to `self.attention`. This should result in `context` tensor of dimension `(1,1,2*RNN_HIDDEN_DIM)`.
1. Run `self.rnn` on the concatenation of `output_embedding` and `context`. Note that you need to initialize the decoder to `decoder_state`. You should use the return value to update `decoder_state`.
1. Use the decoder hidden state `hs` from `decoder_state` and self.hidden2char to predict the next output character. You will probably need to use torch.argmax to find the most probable output symbol.
1. Update the value of the `output_char` variable to the current output character and add it to the result array. Note that if `output_char == torch.Tensor([[1]])`, then you need to add the integer `1` to result. 
1. After `MAXWFLEN` time-steps, return `result`.

In [16]:
x = [torch.randn(2, 3),torch.randn(2, 3)] #(2,2,3)
print(x)
y = torch.cat(x,dim=0)
print(y)
print(y.size())

[tensor([[ 1.0266, -0.8604,  0.7567],
        [-0.3657, -2.0474, -0.4515]]), tensor([[ 0.2081, -0.6706,  0.8777],
        [ 0.0225,  0.3548,  0.3639]])]
tensor([[ 1.0266, -0.8604,  0.7567],
        [-0.3657, -2.0474, -0.4515],
        [ 0.2081, -0.6706,  0.8777],
        [ 0.0225,  0.3548,  0.3639]])
torch.Size([4, 3])


In [17]:
class Decoder(nn.Module):
    def __init__(self, alphabet):
        super(Decoder,self).__init__()
        self.alphabet = alphabet
        self.embedding = nn.Embedding(len(alphabet), EMBEDDING_DIM)
        self.attention = Attention()
        self.rnn = nn.LSTM(EMBEDDING_DIM+2*RNN_HIDDEN_DIM, RNN_HIDDEN_DIM, RNN_LAYERS, bidirectional=False)
        self.hidden2char = nn.Linear(RNN_HIDDEN_DIM, len(alphabet))
    
    def forward(self,ex,encoder_hss):
        output, output_length = ex.output
        embedded_output = self.embedding(output[:-1]) # clip <end> (output_len -1, 1, embedding_dim)
        results = []
        decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM,requires_grad=False), 
                         torch.zeros(1,1,RNN_HIDDEN_DIM,requires_grad=False)) # initialize zero decoder states (hs, cs)
        for i in range(output_length-1):
            context = self.attention(encoder_hss, decoder_state[0]) #(1,1,2*rnn_hidden_dim)
            output_i = embedded_output[i].unsqueeze(0) # (1,1, embedding_dim)
            decoder_input = torch.cat([output_i,context], dim=2) # (1,1, embedding_dim+2*rnn_hidden_dim)
            hss,(hs, cs) = self.rnn(decoder_input,decoder_state) # hs:(1,1, rnn_hidden_dim)
            decoder_state = (hs, cs)
            dist = self.hidden2char(hs) # hs:(1,1, len(vocab))
            results.append(dist)
        
        # results : (output_len-1, 1, 1, len(vocab))
        results = torch.cat(results, dim=0) # results : (output_len-1,1, len(vocab))
        return results.log_softmax(dim=2), output[1:] # (output_len-1,1, len(alphabet_size))

        
    def generate(self,encoder_hss):
        with torch.no_grad():
            decoder_state = (torch.zeros(1,1,RNN_HIDDEN_DIM), torch.zeros(1,1,RNN_HIDDEN_DIM))
            output_char = torch.LongTensor([[self.alphabet["<start>"]]]) # (1,1)
            result = []
            for _ in range(MAXWFLEN):
                output_embedding = self.embedding(output_char) # (1,1,EMBEDDING_DIM)
                context = self.attention(encoder_hss, decoder_state[0]) #(1,1,2*rnn_hidden_dim)
                decoder_input = torch.cat([output_embedding,context], dim=2) # (1,1, embedding_dim+2*rnn_hidden_dim)
                hss, (hs, cs) = self.rnn(decoder_input,decoder_state)  # hs:(1,1, rnn_hidden_dim)
                decoder_state = (hs, cs)
                dist = self.hidden2char(hs).softmax(dim=2) #(1,1,len(alphabet))
                predicted_symbol = dist.argmax() # just a symbol
                output_char = torch.LongTensor([[predicted_symbol]]) # (1,1)
                result.append(output_char.numpy().tolist()[0][0])
            return result
            
# Assertions to test that your implementation returns objects of the correct size. 
encoder_hs = Encoder(vocab.get_stoi())(example)
_, output_length = example.output
alphabet = vocab.get_stoi()
alphabet_size = len(alphabet)
assert(Decoder(alphabet)(example,encoder_hs)[0].size() == torch.Size([output_length - 1,1,alphabet_size]))
assert(len(Decoder(alphabet).generate(encoder_hs)) == MAXWFLEN)

#### Training the Model

provided code for a `WordInflector` class which combines the encoder and decoder networks.

Running 10 epochs of the training algorithm on the Spanish medium training set spanning 1,000 examples, you should get development accuracy > 50%. If you are seeing < 20% accuracy after training 10 epochs, then you have a problem somewhere in your code.

In [18]:
class WordInflector(nn.Module):
    def __init__(self, alphabet):
        super(WordInflector, self).__init__()
        self.c2i = alphabet.get_stoi()
        self.i2c = alphabet.get_itos()
        alphabet_size = len(self.c2i)
        
        self.encoder = Encoder(self.c2i)
        self.decoder = Decoder(self.c2i)
    
    def get_string(self,ids):
        string = ''.join([self.i2c[i] for i in ids])
        return re.sub("%s.*" % "<end>","",string)

    def forward(self, example):
        encoder_hs = self.encoder(example)
        return self.decoder(example,encoder_hs)
            
    def generate(self, data):
        all_results = []
        with torch.no_grad():
            for example in data:
                encoder_hs = self.encoder(example)
                output = self.decoder.generate(encoder_hs)
                all_results.append(self.get_string(output))
        return all_results
    
if __name__=="__main__":
    train_iter, dev_iter, test_iter, vocab = read_data(language="spanish",
                                                       setting="medium",
                                                       batch_size=1)
    
    inflector = WordInflector(vocab)

    loss_function = nn.NLLLoss(ignore_index=inflector.c2i["<pad>"],reduction='mean')
    optimizer = Adam(inflector.parameters())
    gold_dev_words = [''.join(w.output) for w in dev_iter.dataset]

    for epoch in range(EPOCHS):
        tot_loss = 0 

        # Update parameters
        for i, batch in enumerate(train_iter):
            print("Example %u of %u" % (i+1,len(train_iter)),end="\r")
            inflector.zero_grad()
            tag_scores, tgt = inflector(batch)
            tgt = tgt.permute(1,0)
            tag_scores = tag_scores.permute(1,2,0)
            loss = loss_function(tag_scores,tgt) 
            tot_loss += loss.detach().numpy()
            loss.backward()
            optimizer.step()
        print()
        avg_loss = tot_loss/len(train_iter)
        print("EPOCH %u: AVG LOSS PER EX: %.5f" % (epoch+1,avg_loss))        

        # Evaluate on dev data.
        sys_dev_words = inflector.generate(dev_iter)
        print("DEV ACC: %.2f%%" % accuracy(sys_dev_words,gold_dev_words))

Example 1000 of 1000
EPOCH 1: AVG LOSS PER EX: 2.54354
DEV ACC: 0.00%
Example 1000 of 1000
EPOCH 2: AVG LOSS PER EX: 1.65693
DEV ACC: 1.00%
Example 1000 of 1000
EPOCH 3: AVG LOSS PER EX: 1.05424
DEV ACC: 8.50%
Example 1000 of 1000
EPOCH 4: AVG LOSS PER EX: 0.69113
DEV ACC: 22.90%
Example 1000 of 1000
EPOCH 5: AVG LOSS PER EX: 0.47361
DEV ACC: 35.50%
Example 1000 of 1000
EPOCH 6: AVG LOSS PER EX: 0.33736
DEV ACC: 41.60%
Example 1000 of 1000
EPOCH 7: AVG LOSS PER EX: 0.27975
DEV ACC: 48.40%
Example 1000 of 1000
EPOCH 8: AVG LOSS PER EX: 0.21874
DEV ACC: 21.70%
Example 1000 of 1000
EPOCH 9: AVG LOSS PER EX: 0.20619
DEV ACC: 58.40%
Example 1000 of 1000
EPOCH 10: AVG LOSS PER EX: 0.15511
DEV ACC: 62.20%
