In [1]:
import torch
from torch import nn
from torch.nn import functional as F

import sys
sys.path.append('..')

from nmt.datasets import Vocab, VocabStore

#### From Character-Aware Neural Language Models : Kim et al (https://arxiv.org/pdf/1508.06615.pdf)

<img src="images/cnn-char-model.png" />

> Whereas a conventional NLM takes word embeddings as inputs, our model instead takes the output from a single-layer character-level convolutional neural network with max-over-time pooling.

Here we will implement the embedding pipeline up to and including the highway network, i.e. everything before the LSTM.  
* Character Embedding Layer
* CNN Embedding Layer with Maxpool
* Highway Network


In [2]:
## Toy corpus to work with

sentences = [
    "Human: What do we want?",
    "Computer: Natural language processing!",
    "Human: When do we want it?",
    "Computer: When do we want what?"
]

# Whitespace tokenization of each sentence; punctuation stays attached to its word.
sentences_words = [sentence.split() for sentence in sentences]

In [3]:
# Build the vocabulary from the raw sentences and their tokenized form; the log printed
# below shows that this initializes both a source and a target store.
vocab = Vocab.build(sentences, sentences_words)

Initializing source vocab
Vocab Store: Tokens [size=17],                 Characters [size=97]
Initializing target vocab
Vocab Store: Tokens [size=17],                 Characters [size=97]


In [4]:
# Character-level tensor for the whole batch. tokens=False selects the character
# vocabulary rather than the word-token one (presumably yielding padded character
# indices — confirm against VocabStore.to_tensor).
char_tensors = vocab.src.to_tensor(sentences_words, tokens=False)

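I input 4 sentences. The max number of characters per word is 21 (the max word length set in the vocab). The max sentence length is 6 words.  
Therefore, the tensor shape is (6, 4, 21), i.e. (max_sentence_length, batch_size, max_word_length).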

In [5]:
# Axes: (max_sentence_length, batch_size, max_word_length)
print(char_tensors.size())

torch.Size([6, 4, 21])


<img src="images/cnn-char-layer.png" />

```python
C = vocab.src.length(tokens=False) # 97 in this case
d = 50 # usually good enough for character embeddings
l = 21 # we set max word length in vocab
```

According to the paper, the character-level representation of word k will be of dimension d x l = [50 x 21].
  
Note Q is just the char embedding layer. 

In [6]:
# Char Embedding (Q)
# The padding row must belong to the pad character, not the unknown character: with
# padding_idx=unk_char_idx the unknown-character vector would be pinned to zeros and never
# trained, while pad positions would receive a trainable embedding.
embed = nn.Embedding(num_embeddings=vocab.src.length(tokens=False), embedding_dim=50,
                     padding_idx=vocab.src.pad_char_idx)

In [7]:
# CNN Embedding
class CharCNNEmbedding(nn.Module):
    """Character-level CNN: 1-D convolution, ReLU, then max-over-time pooling.

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of convolution filters (size of the pooled feature vector).
        kernel_size: width of the convolution window.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 5) -> None:
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size)
        # An adaptive pool with output size 1 takes the max over the whole last axis.
        self.maxpool = nn.AdaptiveMaxPool1d(1)

    def forward(self, x: torch.Tensor):
        """Map ``(N, in_channels, length)`` to ``(N, out_channels, 1)``."""
        activated = torch.relu(self.conv(x))
        return self.maxpool(activated)

In [8]:
# in_channels=50 matches the character embedding size; each of the 1024 filters
# looks at a window of 5 characters.
cnnembed = CharCNNEmbedding(in_channels=50, out_channels=1024, kernel_size=5)

<img src="images/char-embed-highway.png" />

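It's essentially a residual connection with a learned gate: the gate decides, per feature, how much of the transformed signal to use and how much of the input to carry through unchanged. Hence the name "highway".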

In [9]:
## Highway Layer
class Highway(nn.Module):
    """Highway layer: a gated mix of a ReLU projection of the input and the input itself.

    Computes ``t * relu(linear(x)) + (1 - t) * x`` with ``t = sigmoid(gate(x))``.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output. Must equal
            ``in_features`` because the untransformed input is mixed element-wise
            with the projection.

    Raises:
        ValueError: if ``in_features != out_features``.
    """

    def __init__(self, in_features: int, out_features: int) -> None:
        super(Highway, self).__init__()
        if in_features != out_features:
            # Fail at construction time: otherwise the mismatch only surfaces in
            # forward(), either as a shape error or as silent broadcasting when one
            # of the sizes is 1.
            raise ValueError(
                "Highway requires in_features == out_features, got "
                f"{in_features} and {out_features}"
            )
        self.linear = nn.Linear(in_features=in_features, out_features=out_features)
        self.gate = nn.Linear(in_features=in_features, out_features=out_features)

    def forward(self, x: torch.Tensor):
        """Apply the highway transform; the output has the same shape as ``x``."""
        z = F.relu(self.linear(x))       # transformed signal
        t = torch.sigmoid(self.gate(x))  # transform gate in (0, 1)

        # (1 - t) acts as the carry gate for the unmodified input.
        return t * z + (1 - t) * x


In [10]:
# 1024 features in and out, matching the number of CNN filters configured above.
highway = Highway(in_features=1024, out_features=1024)

Alright, let's check this out.

In [11]:
# Look up an embedding vector for every character position in the batch.
char_embed = embed(char_tensors)

This would be of shape (sentence_length, batch_size, max_word_length, char_embed_size)

In [12]:
# The embedding lookup appends the embedding dimension (50) as a new last axis.
char_embed.size()

torch.Size([6, 4, 21, 50])

Recall
> According to the paper, the character-level representation of word k will be of dimension d x l = [50 x 21].

In [13]:
# Rebuild from `char_tensors` instead of reshaping `char_embed` from itself, so the cell
# gives the same result when re-run (a second view() on the permuted tensor would fail).
# Sizes are read from the data and the embedding layer rather than hard-coded as 21 and 50.
word_length = char_tensors.size(-1)
# Merge (sentence_length, batch_size) into one axis, then put channels before length for Conv1d.
char_embed = embed(char_tensors).view(-1, word_length, embed.embedding_dim).permute(0, 2, 1)
print(char_embed.size())

torch.Size([24, 50, 21])


In [14]:
# Each of the 24 words is convolved over its 21 character positions (50 input channels).
cnn_embed_tensor = cnnembed(char_embed)

In [15]:
# One max-pooled value per filter, so the last axis has size 1.
cnn_embed_tensor.size()

torch.Size([24, 1024, 1])

To feed it to the linear layers of the highway network, we need to squeeze out the trailing singleton dimension.

In [16]:
# Squeeze the last axis rather than axis 2: the result is identical on the first run,
# and re-running the cell is a no-op instead of raising on the already 2-D tensor.
cnn_embed_tensor = cnn_embed_tensor.squeeze(dim=-1)
print(cnn_embed_tensor.size())

torch.Size([24, 1024])


In [17]:
# Gate the CNN features through the highway layer; the shape is unchanged.
highway_embed_tensor = highway(cnn_embed_tensor)

In [18]:
# Still one 1024-dimensional vector per word (24 words in total).
highway_embed_tensor.size()

torch.Size([24, 1024])

Now convert it back to (sentence_length, batch_size, embed_size)

In [19]:
## Note: when we actually feed this to the LSTM, we will pack the padded sequences
## https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch
# Take the leading dimensions from the input batch instead of hard-coding 6 and 4,
# and let view() infer the embedding size.
sentence_length, batch_size = char_tensors.shape[:2]
input_to_lstm = highway_embed_tensor.view(sentence_length, batch_size, -1)
print(input_to_lstm.size())

torch.Size([6, 4, 1024])


### Put it together as `ModelEmbeddings`

In [25]:
class ModelEmbeddings(nn.Module):
    """Character-based word embeddings: char embedding -> CNN + max-pool -> highway -> dropout.

    Args:
        embed_size: size of the word embedding produced for each word.
        char_embed_size: size of each character embedding.
        vocab_src: vocabulary store supplying the character vocabulary size and the
            pad-character index.
        cnn_kernel_size: window width of the character CNN.
        dropout_prob: dropout probability applied to the highway output.
    """

    def __init__(self, embed_size: int, char_embed_size: int, vocab_src: VocabStore, 
                 cnn_kernel_size: int = 5, dropout_prob: float = 0.3) -> None:
        super().__init__()
        self.embed_size = embed_size
        self.char_embed_size = char_embed_size

        pad_idx = vocab_src.pad_char_idx
        n_chars = vocab_src.length(tokens=False)

        self.char_embed = nn.Embedding(
            num_embeddings=n_chars,
            embedding_dim=char_embed_size,
            padding_idx=pad_idx,
        )
        self.cnn_embed = CharCNNEmbedding(
            in_channels=char_embed_size,
            out_channels=embed_size,
            kernel_size=cnn_kernel_size,
        )
        self.highway = Highway(in_features=embed_size, out_features=embed_size)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x: torch.Tensor):
        """Embed a ``(sentence_length, batch_size, word_length)`` tensor of character indices.

        Returns:
            Tensor of shape ``(sentence_length, batch_size, embed_size)``.
        """
        sentence_length, batch_size, word_length = x.size()

        # Collapse (sentence, batch) into one axis of words, then move the embedding
        # channels in front of the character positions for the 1-D convolution.
        chars = self.char_embed(x).view(-1, word_length, self.char_embed_size)
        chars = chars.transpose(1, 2)

        # Drop the singleton axis left by the pooling, giving one vector per word.
        words = self.cnn_embed(chars).squeeze(dim=2)
        words = self.dropout(self.highway(words))

        # Restore the (sentence, batch) layout.
        return words.view(sentence_length, batch_size, self.embed_size)

In [26]:
# Same sizes as the step-by-step walkthrough: 50-d characters, 1024-d words.
model_embeddings = ModelEmbeddings(embed_size=1024, char_embed_size=50, vocab_src=vocab.src)

In [27]:
# Run the whole pipeline on the same character tensor used in the walkthrough.
inp_to_lstm = model_embeddings(char_tensors)

In [28]:
# Should match the walkthrough result: (sentence_length, batch_size, embed_size).
inp_to_lstm.size()

torch.Size([6, 4, 1024])