<a href="https://colab.research.google.com/github/Adityahulk/NLP_with_Pytorch_complete/blob/main/Transformer_machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time


In [None]:
# Upgrade spaCy with the %pip magic so the install targets the environment of the
# running kernel (a plain `!pip` shell escape can hit a different interpreter).
# NOTE: spaCy is already imported in the first cell, so restart the runtime after
# this upgrade for the new version to be the one that is actually loaded.
%pip install --upgrade spacy

In [None]:
# Download the small English and German spaCy pipelines; these are the package
# names passed to spacy.load() when the tokenizers are created.
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download de_core_news_sm

2021-04-20 18:01:40.850136: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Collecting en-core-web-sm==3.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7MB)
[K     |████████████████████████████████| 13.7MB 238kB/s 
Installing collected packages: en-core-web-sm
  Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2021-04-20 18:01:48.669736: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Collecting de-core-news-sm==3.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core

In [None]:
# spaCy pipelines whose tokenizers split raw text for each language:
# English is the source side, German the target side of the translation task.
eng_module = spacy.load("en_core_web_sm")
ger_module = spacy.load("de_core_news_sm")

In [None]:
def tokenize_eng(text):
  """Tokenize an English string with the spaCy English pipeline.

  Args:
    text: raw sentence to split.

  Returns:
    A list with the text of every token, in order.
  """
  tokens = eng_module.tokenizer(text)
  return [token.text for token in tokens]

def tokenize_ger(text):
  """Tokenize a German string with the spaCy German pipeline.

  Args:
    text: raw sentence to split.

  Returns:
    A list with the text of every token, in order.
  """
  tokens = ger_module.tokenizer(text)
  return [token.text for token in tokens]

In [None]:
# Field for the English (source) side: tokenized with tokenize_eng, lower-cased,
# and wrapped in <sos> ... <eos> markers.
source = Field(
    tokenize=tokenize_eng,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Field for the German (target) side: same preprocessing, German tokenizer.
target = Field(
    tokenize=tokenize_ger,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [None]:
# Load the Multi30k train/validation/test splits. The extension order
# ('.en', '.de') pairs the English files with `source` and the German files
# with `target`.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.en', '.de'),
    fields=(source, target),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 571kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 168kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 166kB/s]


In [None]:
# Build both vocabularies from the training split only; min_freq=2 is the
# frequency threshold handed to each Field's vocabulary builder.
source.build_vocab(train_data, min_freq=2)
target.build_vocab(train_data, min_freq=2)

In [None]:
# Vocabulary sizes, one per line: source (English) first, then target (German).
print(len(source.vocab), len(target.vocab), sep="\n")

5893
7853


In [None]:
class Transformer(nn.Module):
  """Sequence-to-sequence translation model wrapping ``nn.Transformer``.

  Source and target tokens are embedded with separate word embeddings plus
  learned position embeddings, passed through ``nn.Transformer`` and projected
  onto the target vocabulary.

  Args:
    embedding_size: width of the token/position embeddings (``d_model``).
    src_vocab_size: number of entries in the source vocabulary.
    trg_vocab_size: number of entries in the target vocabulary.
    src_pad_idx: index of the padding token in the source vocabulary.
    num_heads: number of attention heads.
    num_encoder_layers: number of encoder layers.
    num_decoder_layers: number of decoder layers.
    forward_expansion: value handed to ``nn.Transformer`` as ``dim_feedforward``.
    dropout: dropout probability used on the embeddings and inside the transformer.
    max_len: size of the position-embedding tables (longest supported sequence).
    device: device stored on the module as ``self.device``.
  """

  def __init__(
      self,
      embedding_size,
      src_vocab_size,
      trg_vocab_size,
      src_pad_idx,
      num_heads,
      num_encoder_layers,
      num_decoder_layers,
      forward_expansion,
      dropout,
      max_len,
      device,
  ):
    super().__init__()
    self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
    self.src_position_embedding = nn.Embedding(max_len, embedding_size)
    self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
    self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
    self.device = device
    # NOTE(review): `forward_expansion` is used directly as the feed-forward
    # width (e.g. 4), not as a multiplier of `embedding_size`. Kept as-is so
    # parameter shapes stay unchanged -- confirm whether
    # `forward_expansion * embedding_size` was intended.
    self.transformer = nn.Transformer(
        d_model=embedding_size,
        nhead=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=forward_expansion,
        dropout=dropout,
    )

    self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx

  def make_src_mask(self, src):
    """Return a boolean (N, src_len) mask that is True where `src` is padding.

    `src` is (src_len, N); it is transposed because the key-padding mask is
    given batch-first.
    """
    src_mask = src.transpose(0, 1) == self.src_pad_idx
    return src_mask

  def forward(self, src, trg):
    """Compute target-vocabulary logits for every target position.

    Args:
      src: source token indices, shape (src_len, N).
      trg: target token indices, shape (trg_len, N).

    Returns:
      Tensor of shape (trg_len, N, trg_vocab_size).
    """
    src_seq_length, N = src.shape
    trg_seq_length, N = trg.shape

    # Position indices 0..len-1 repeated for every batch column, created on
    # the same device as the token tensors.
    src_positions = torch.arange(0, src_seq_length, device=src.device).unsqueeze(1).expand(src_seq_length, N)
    trg_positions = torch.arange(0, trg_seq_length, device=trg.device).unsqueeze(1).expand(trg_seq_length, N)

    embed_src = self.dropout(self.src_word_embedding(src) + self.src_position_embedding(src_positions))
    embed_trg = self.dropout(self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))

    src_padding_mask = self.make_src_mask(src)
    # Square causal mask: position i may only attend to positions <= i.
    trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(trg.device)

    # The causal mask must be passed as `tgt_mask`; `nn.Transformer.forward`
    # has no `trg_key_padding_mask` argument and raises TypeError if given one.
    out = self.transformer(
        embed_src,
        embed_trg,
        src_key_padding_mask=src_padding_mask,
        tgt_mask=trg_mask,
    )
    out = self.fc_out(out)
    return out
  


In [None]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Serialize `state` to `filename` with ``torch.save``.

    Args:
        state: object to store (the training loop passes a dict holding the
            model and optimizer state dicts).
        filename: destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore `model` and `optimizer` from an already-loaded checkpoint mapping.

    Args:
        checkpoint: mapping with the keys "state_dict" (model weights) and
            "optimizer" (optimizer state).
        model: object whose ``load_state_dict`` receives the model weights.
        optimizer: object whose ``load_state_dict`` receives the optimizer state.
    """
    print("=> Loading checkpoint")
    model_state = checkpoint["state_dict"]
    model.load_state_dict(model_state)
    optimizer_state = checkpoint["optimizer"]
    optimizer.load_state_dict(optimizer_state)

In [None]:
# ---- Runtime switches -------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False   # restore model/optimizer from "my_checkpoint.pth.tar" first
save_model = True    # write a checkpoint at the start of every epoch

# ---- Training hyperparameters -----------------------------------------------
num_epochs = 10
learning_rate = 3e-4
batch_size = 32

# ---- Model hyperparameters --------------------------------------------------
src_vocab_size = len(source.vocab)
trg_vocab_size = len(target.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100            # size of the learned position-embedding tables
forward_expansion = 4
src_pad_idx = source.vocab.stoi["<pad>"]

# Batches are sorted by source length inside each batch (sort_within_batch).
train_iterator,valid_iterator,test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x:len(x.src),
    device = device
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loss is computed over target-side tokens, so the ignored padding index
# must come from the *target* vocabulary (not the source vocabulary).
pad_idx = target.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

if load_model:
  # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
  load_checkpoint(torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer)

# Example sentence for the (currently commented-out) per-epoch translation check.
sentence = "a horse goes under a bridge next to a boat"

for epoch in range(num_epochs):

  # Accumulate plain Python floats so the autograd graph of every batch is
  # released instead of being kept alive by a running tensor sum.
  loss_per_epoch = 0.0
  if save_model:
    checkpoint = {
        "state_dict" : model.state_dict(),
        "optimizer" : optimizer.state_dict(),
    }

    save_checkpoint(checkpoint)

  # Per-epoch translation check, disabled. When re-enabling it, call
  # model.eval() before translating and model.train() afterwards.
  #translated_sentence = translate_sentence(
  #    model, sentence, target, source, device, max_length = 100
  #)

  #print(translated_sentence)

  # Training mode: dropout must be active while the weights are being updated.
  model.train()

  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    # Named `trg` rather than `target` so the module-level `target` Field is
    # not overwritten by a tensor (which breaks `len(target.vocab)` on re-run).
    trg = batch.trg.to(device)

    # Teacher forcing: the decoder sees every token except the last one ...
    output = model(inp_data, trg[:-1])

    # ... and is scored against every token except the first one.
    output = output.reshape(-1, output.shape[2])
    trg = trg[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, trg)
    loss.backward()
    loss_per_epoch += loss.item()
    # Clip the global gradient norm to 1 before the update step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)

    optimizer.step()

  print("Epoch:- ",epoch,"Loss-",loss_per_epoch)


AttributeError: ignored