In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

## Define the device

In [10]:
# Run on the GPU when a CUDA device is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Encoder

In [11]:
class Encoder(nn.Module):
    """LSTM encoder that reduces an embedded sequence to its final recurrent state.

    The module owns no embedding table: ``forward`` receives tensors that are
    already embedded, applies dropout to them, and runs them through a
    (possibly stacked) unidirectional ``nn.LSTM``. Only the final hidden and
    cell states are returned; the per-step outputs are discarded.

    Args:
        embedding_dim: size of the last dimension of the incoming embeddings
            (the LSTM input size).
        hidden_dim: number of features in the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        dropout_probability: probability used for the dropout applied to the
            input embeddings and for the dropout between stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, dropout_probability):
        super().__init__()

        # nn.LSTM applies its dropout only *between* stacked layers, so with a
        # single layer it has no effect and PyTorch emits a UserWarning when a
        # non-zero value is passed. Pass 0 in that case to keep construction quiet.
        inter_layer_dropout = dropout_probability if num_layers > 1 else 0.0
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=inter_layer_dropout)

        # Regularization on the input embeddings: during training, elements are
        # zeroed at random with probability `dropout_probability`.
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, embeddings):
        """Encode a batch of embedded sequences.

        Args:
            embeddings: already-embedded input. The LSTM is built with the
                default ``batch_first=False``, so the expected layout is
                [seq len, batch size, embedding_dim].

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            [num_layers, batch size, hidden_dim].
        """
        # dropout_embeddings = [seq len, batch size, embedding_dim]
        dropout_embeddings = self.dropout(embeddings)

        # outputs = [seq len, batch size, hidden_dim] (top layer only; unused here)
        # hidden  = [num_layers, batch size, hidden_dim]
        # cell    = [num_layers, batch size, hidden_dim]
        outputs, (hidden, cell) = self.lstm_layer(dropout_embeddings)

        return hidden, cell

## Seq2Seq