# Sentence Embedding using LSTM


## Imports

In [1]:
import torch

from transformers import AlbertTokenizer

## Tokenizer

In [36]:
# Load the pretrained 'albert-base-v2' tokenizer.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# Pad on the left, so padding tokens come first and the real tokens sit at
# the end of each encoded sequence.
tokenizer.padding_side = 'left'

In [37]:
# Demo batch: two sentences of different lengths.
sentences = [
    'Hello, my dog is cute',
    'Hello, my cat is cute and my dog is cute',
]

In [38]:
# Encode every sentence separately, padded to max_length=128, and concatenate
# the per-sentence tensors along dim 0 to form the batch.
# NOTE(review): presumably each encode() call yields a (1, 128) tensor, giving
# a (len(sentences), 128) batch -- confirm against the tokenizer version in use.
inputs = torch.cat(
    [
        tokenizer.encode(
            sentence,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_tensors='pt',
        )
        for sentence in sentences
    ],
    dim=0,
)

# inputs

In [40]:
# inputs

## LSTM Layer

In [41]:
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

In [61]:
class LSTM(nn.Module):
    """Encode batches of token ids into fixed-size sentence embeddings.

    Pipeline: embedding lookup -> stacked (optionally bidirectional) LSTM ->
    mean over the sequence axis -> linear projection to ``hidden_dim``.

    Note:
        The mean is taken over every time step, so padded positions contribute
        to the resulting embedding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction; also the size of the
            returned sentence embedding.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Run the LSTM in both directions when True.
        dropout: Dropout probability applied between stacked LSTM layers while
            the module is in training mode. Not applied when ``num_layers`` is
            1, since there is no inter-layer connection to drop.
    """

    def __init__(self,
                 vocab_size,
                 embed_dim=256,
                 hidden_dim=128,
                 num_layers=2,
                 bidirectional=True,
                 dropout=0.2):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directs = 2 if bidirectional else 1
        self.dropout = dropout

        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # nn.LSTM only drops between layers and warns if dropout is set on
            # a single-layer network, so disable it in that case.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Both directions are concatenated on the feature axis, hence the
        # num_directs factor on the input width.
        self.linear = nn.Linear(self.num_directs * hidden_dim, hidden_dim)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return zero-filled initial ``(hidden, cell)`` states.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device to allocate the states on; ``None`` uses the
                PyTorch default device.
            dtype: Floating dtype of the states; ``None`` uses the PyTorch
                default dtype.

        Returns:
            Two separate tensors, each of shape
            ``(num_layers * num_directions, batch_size, hidden_dim)``.
        """
        shape = (self.num_layers * self.num_directs, batch_size, self.hidden_dim)
        hidden = torch.zeros(shape, device=device, dtype=dtype)
        cell = torch.zeros(shape, device=device, dtype=dtype)
        return hidden, cell

    def forward(self, sents):
        """Compute one embedding per input sequence.

        Args:
            sents: Integer token ids laid out as ``(batch, seq)``.

        Returns:
            Tensor of shape ``(batch, hidden_dim)``.
        """
        x = self.embed(sents)

        # Allocate the initial states next to the embedded input so the module
        # keeps working after .to(device) / .double() / .half().
        h_0, c_0 = self.init_hidden(x.size(0), device=x.device, dtype=x.dtype)

        # output: (batch, seq, num_directions * hidden_dim); the final
        # (h_n, c_n) states are not needed for mean pooling.
        output, _ = self.bilstm(x, (h_0, c_0))
        pooled = torch.mean(output, dim=1)
        return self.linear(pooled)

## Sentence Embedding

In [62]:
# Take the vocabulary size from the tokenizer; it is passed to LSTM below as
# the number of rows in its embedding table.
vocab_size = tokenizer.vocab_size

In [63]:
# Build the encoder; only vocab_size is supplied, every other hyperparameter
# keeps its constructor default.
lstm = LSTM(vocab_size=vocab_size)

In [64]:
# Run the encoded batch through the model to obtain the sentence embeddings.
output = lstm(inputs)

In [65]:
# (batch, hidden_dim): LSTM.forward averages over the sequence axis, so the
# result has no sequence dimension.
output.shape

torch.Size([2, 128])