In [None]:
# torchtext is pinned to 0.6.0: the cells below import Field and BucketIterator
# from torchtext.data, which newer torchtext releases no longer provide.
!pip install torchtext==0.6.0
# NOTE(review): presumably needed by torchtext's dataset download helper — confirm.
!pip install requests

In [None]:
!pip install de_core_news_sm
!python -m spacy download de_core_news_sm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Train on the GPU when one is available and fall back to the CPU otherwise.
# Do not hard-code 'cpu' after this line: it would silently disable the detection.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the English and German spaCy pipelines; their tokenizers back the
# tokenizer_en / tokenizer_de helpers.
spacy_en, spacy_de = (
  spacy.load(model_name) for model_name in ("en_core_web_sm", "de_core_news_sm")
)

def tokenizer_en(text):
  """Return the token strings of `text` as produced by the `spacy_en` tokenizer."""
  doc = spacy_en.tokenizer(text)
  return [token.text for token in doc]

def tokenizer_de(text):
  """Return the token strings of `text` as produced by the `spacy_de` tokenizer."""
  doc = spacy_de.tokenizer(text)
  return [token.text for token in doc]

# Smoke-test both tokenizers on a short greeting.
for tokenize, greeting in ((tokenizer_en, "Hello World!"), (tokenizer_de, "Hallo Welt!")):
  print(tokenize(greeting))

['Hello', 'World', '!']
['Hallo', 'Welt', '!']


In [None]:
# One Field per language: spaCy tokenization, lower-casing, and <sos>/<eos> markers
# around every sentence.
english = Field(tokenize=tokenizer_en, lower=True, init_token='<sos>', eos_token='<eos>')
german = Field(tokenize=tokenizer_de, lower=True, init_token='<sos>', eos_token='<eos>')

# The model translates German -> English: the encoder is sized with the German
# vocabulary, while the decoder, output layer and padding index use the English one.
# Multi30k.splits pairs `exts` and `fields` positionally (first entry -> `src`,
# second -> `trg`), so German must come first. Loading them the other way round feeds
# German token ids into the English-sized decoder embedding and raises
# "IndexError: index out of range in self".
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(german, english), root='./')

In [None]:
# Report how many sentence pairs each split contains.
print(
  f'Number of training examples: {len(train_data)}',
  f'Number of validation examples: {len(validation_data)}',
  f'Number of testing examples: {len(test_data)}',
  sep='\n',
)

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1071


In [None]:
# Show the first training pair: source sentence, target sentence, then the raw dict.
first_example = vars(train_data.examples[0])
print(" ".join(first_example['src']))
print(" ".join(first_example['trg']))
print(first_example)

two young , white males are outside near many bushes .
zwei junge weiße männer sind im freien in der nähe vieler büsche .
{'src': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.'], 'trg': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']}


In [None]:
# Build both vocabularies from the training split only, with identical limits:
# at most 10000 entries and a minimum frequency of 2.
for language_field in (german, english):
  language_field.build_vocab(train_data, max_size=10000, min_freq=2)

In [None]:
# Vocabulary sizes (these counts include the special tokens).
print(
  f"Unique tokens in german vocabulary: {len(german.vocab)}",
  f"Unique tokens in english vocabulary: {len(english.vocab)}",
  sep="\n",
)

Unique tokens in german vocabulary: 7853
Unique tokens in english vocabulary: 5893


In [None]:
# Peek at the German vocabulary: the 20 most frequent tokens, then the first ten
# entries of the index -> token table.
german_vocab = german.vocab
print(german_vocab.freqs.most_common(20))
print(german_vocab.itos[:10])

[('.', 28809), ('ein', 18851), ('einem', 13711), ('in', 11895), ('eine', 9909), (',', 8938), ('und', 8925), ('mit', 8843), ('auf', 8745), ('mann', 7805), ('einer', 6765), ('der', 4990), ('frau', 4186), ('die', 3949), ('zwei', 3873), ('einen', 3479), ('im', 3107), ('an', 3062), ('von', 2363), ('sich', 2273)]
['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in', 'eine', ',']


In [None]:
class Encoder(nn.Module):
  """Embeds a sequence of token indices and runs it through a stacked LSTM.

  Args:
    input_dim: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden and cell states.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and passed to the LSTM.
  """

  def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
    self.lstm = nn.LSTM(
      input_size=embedding_dim,
      hidden_size=hidden_dim,
      num_layers=n_layers,
      dropout=dropout,
    )

  def forward(self, x):
    """Return the LSTM's final `(hidden, cell)` states for the token indices `x`."""
    token_vectors = self.dropout(self.embedding(x))
    # The per-step outputs are not needed; only the final states are handed on.
    _, final_state = self.lstm(token_vectors)
    hidden, cell = final_state
    return hidden, cell

In [None]:
class Decoder(nn.Module):
  """Single-step LSTM decoder: one token per sequence in, one row of scores out.

  Args:
    input_dim: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden and cell states.
    output_dim: number of scores produced per step by the final linear layer.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and passed to the LSTM.
  """

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
    self.lstm = nn.LSTM(
      input_size=embedding_dim,
      hidden_size=hidden_dim,
      num_layers=n_layers,
      dropout=dropout,
    )
    self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

  def forward(self, x, hidden, cell):
    """Run one decoding step.

    `x` gets a leading length-1 axis before embedding; that axis is squeezed away
    again from the scores. Returns `(predictions, hidden, cell)` with the updated
    LSTM states.
    """
    step_input = x.unsqueeze(0)
    step_vectors = self.dropout(self.embedding(step_input))
    lstm_out, (hidden, cell) = self.lstm(step_vectors, (hidden, cell))
    predictions = self.fc(lstm_out).squeeze(0)
    return predictions, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target one time step at a time.

  Args:
    encoder: module mapping the source batch to initial `(hidden, cell)` states.
    decoder: module called as `decoder(x, hidden, cell)` that returns
      `(scores, hidden, cell)`; it must expose its output layer as `fc`.
  """

  def __init__(self, encoder, decoder):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Return per-step scores of shape `(target_len, batch_size, vocab_size)`.

    Args:
      source: source token indices; axis 1 is the batch axis.
      target: target token indices; axis 0 is the time axis. `target[0]` is the
        first decoder input.
      teacher_force_ratio: probability of feeding the ground-truth token instead
        of the decoder's own best guess as the next input.

    Row 0 of the result is never written and stays zero.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Take the score width from the decoder's own output layer instead of a global
    # vocabulary, so the buffer always matches what the decoder actually emits.
    target_vocab_size = self.decoder.fc.out_features

    # Allocate on the same device as the inputs instead of relying on a global.
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    hidden, cell = self.encoder(source)
    x = target[0]
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[t] = output
      best_guess = output.argmax(1)
      # Teacher forcing: sometimes feed the true token, sometimes the prediction.
      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outputs

In [None]:
# Training hyperparameters.
epochs = 20
lr = 0.001
# One batch size per split, in the order the splits are passed to BucketIterator below.
# NOTE(review): 54 for the last split looks like a typo for 64 — confirm.
batch_sizes = [64, 64, 54]

load_model = False

# Size the model from the fields the dataset actually uses for `src` and `trg`,
# rather than naming a language here: the encoder embeds source tokens, while the
# decoder embeds and predicts target tokens. This keeps the embedding tables in step
# with the data whichever way the language pair was loaded.
source_field = train_data.fields['src']
target_field = train_data.fields['trg']
input_dim_encoder = len(source_field.vocab)
input_dim_decoder = len(target_field.vocab)
output_dim = len(target_field.vocab)
embedding_dim_encoder = 300
embedding_dim_decoder = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

# TensorBoard writer and global step counter for the training-loss curve.
writer = SummaryWriter('runs/loss_plot')
step = 0

# Bucket examples by source length and sort within each batch by the same key.
train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_sizes=batch_sizes,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device
)

encoder_model = Encoder(input_dim_encoder, embedding_dim_encoder, hidden_size, num_layers, encoder_dropout).to(device)
decoder_model = Decoder(input_dim_decoder, embedding_dim_decoder, hidden_size, output_dim, num_layers, decoder_dropout).to(device)
model = Seq2Seq(encoder_model, decoder_model).to(device)

# Padding positions in the target must not contribute to the loss, so the pad index
# is taken from the target vocabulary.
pad_idx = target_field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for `epochs` passes over the training iterator, recording the loss of every
# batch to TensorBoard through `writer` at global step `step`.
model.train()
for epoch in range(epochs):
  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    optimizer.zero_grad()
    output = model(inp_data, target)
    # Drop time step 0 (never predicted by the model) and flatten time and batch so
    # the loss sees one row of scores per target token.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    writer.add_scalar('Training loss', loss.item(), global_step=step)
    step += 1
  print(f'epoch {epoch + 1}')

IndexError: index out of range in self