<a href="https://colab.research.google.com/github/DelmiroDaladier/NLP-studies/blob/master/machine_translation(pytorch)/machine_translation_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [39]:
# Reserved token indices: start-of-sentence and end-of-sentence.
SOS, EOS = 0, 1
# Sentence-length bound used for pair filtering and as the attention width.
MAX_LEN = 10

# Source-language lookup tables. The index->word map is pre-seeded with the
# two reserved tokens; the word->index map starts empty.
input_word_to_index = dict()
input_index_to_word = {SOS: "SOS", EOS: "EOS"}

# Target-language lookup tables, seeded the same way.
target_word_to_index = dict()
target_index_to_word = {SOS: "SOS", EOS: "EOS"}

In [40]:
def unicode_to_ascii(s):
  """Remove diacritics from `s`.

  The string is decomposed with NFD normalization and every character in the
  Unicode category 'Mn' (non-spacing mark) is dropped. Characters that have no
  such decomposition are returned unchanged, even if they are not ASCII.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def normalize_string(s):
    """Lower-case, strip accents from, and tokenize-friendly clean a sentence.

    Steps, in order:
      1. lower-case and strip surrounding whitespace, then remove diacritics
         via `unicode_to_ascii`;
      2. insert a space before each '.', '!' or '?' so it becomes its own token;
      3. replace every run of characters other than ASCII letters and
         '.', '!', '?' with a single space.

    The result is not stripped again, so it can start or end with a space when
    the input starts or ends with a replaced character.
    """
    cleaned = unicode_to_ascii(s.lower().strip())
    spaced = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

In [41]:
# Load the tab-separated parallel corpus. The context manager guarantees the
# file handle is closed once the contents have been read.
with open('data/por.txt', encoding='utf-8') as corpus_file:
  lines = corpus_file.read().strip().split('\n')

# Split every line on tabs once; the first column is normalized into `en`
# and the second into `pt`. Any further columns are ignored.
_columns = [line.split('\t') for line in lines]
en = [normalize_string(cols[0]) for cols in _columns]
pt = [normalize_string(cols[1]) for cols in _columns]


In [42]:
# Keep a pair only when BOTH sentences split into fewer than MAX_LEN
# space-separated pieces. The length is measured on the un-stripped strings;
# the stored sentences are stripped.
filtered_pairs = [
    (en_sentence.strip(), pt_sentence.strip())
    for en_sentence, pt_sentence in zip(en, pt)
    if max(len(en_sentence.split(' ')), len(pt_sentence.split(' '))) < MAX_LEN
]

In [43]:
# Indices 0 and 1 are already taken by SOS/EOS, so real words start at 2.
# After the loop each counter equals the next free index for its language.
input_vocab = 2
target_vocab = 2

# The two vocabularies are independent, so both are filled in a single pass
# over the sentence pairs; each new word receives the next free index.
for source_sentence, target_sentence in filtered_pairs:
  for token in source_sentence.split(' '):
    if token in input_word_to_index:
      continue
    input_word_to_index[token] = input_vocab
    input_index_to_word[input_vocab] = token
    input_vocab += 1

  for token in target_sentence.split(' '):
    if token in target_word_to_index:
      continue
    target_word_to_index[token] = target_vocab
    target_index_to_word[target_vocab] = token
    target_vocab += 1


In [7]:
# Vocabulary sizes must include the reserved SOS/EOS entries. Word indices
# start at 2, so the largest assigned index is len(word_to_index) + 1 and
# len(word_to_index) alone is two too small. The index->word maps contain
# every assigned index (including 0 and 1), so their length is the size an
# embedding table or output layer needs to cover all indices.
input_vocab = len(input_index_to_word)
target_vocab = len(target_index_to_word)

In [58]:
def sentence_to_tensor(sentence: str, word_to_index: dict):
  """Encode a sentence as a column tensor of word indices ending with EOS.

  Args:
    sentence: text whose words are separated by single spaces.
    word_to_index: mapping from word to integer index.

  Returns:
    A `torch.long` tensor of shape (number_of_words + 1, 1) on `device`; the
    last entry is `EOS`.

  Raises:
    KeyError: if a word of `sentence` is missing from `word_to_index`.
  """
  token_ids = []
  for token in sentence.split(' '):
    token_ids.append(word_to_index[token])
  token_ids.append(EOS)
  return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1, 1)

In [9]:
class EncoderRNN(nn.Module):
  """GRU encoder that consumes one token index per call.

  Args:
    input_size: number of rows in the embedding table (vocabulary size).
    hidden_size: width of both the embedding vectors and the GRU state.
  """

  def __init__(self, input_size, hidden_size):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)

  def forward(self, input, hidden):
    """Run a single encoder step.

    Args:
      input: tensor holding one token index.
      hidden: previous GRU state of shape (1, 1, hidden_size).

    Returns:
      Tuple `(output, hidden)` as returned by the GRU for this step.
    """
    # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
    embedded = self.embedding(input).view(1, 1, -1)
    output, hidden = self.gru(embedded, hidden)
    return output, hidden

  def init_hidden(self):
    """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

    The tensor is created on the same device as the module's parameters so it
    can be passed straight to `forward`.
    """
    # `self` was missing from the signature, which made the method
    # uncallable on an instance.
    return torch.zeros(1, 1, self.hidden_size,
                       device=self.embedding.weight.device)

In [13]:
class AttnDecoderRNN(nn.Module):
  """GRU decoder with attention over a fixed-length window of encoder outputs.

  Args:
    hidden_size: width of the embeddings and of the GRU state.
    output_size: number of rows in the embedding table and width of the
      output layer.
    dropout_p: dropout probability applied to the embedded input token.
    max_length: number of attention slots; the layer sizes require
      `encoder_outputs` in `forward` to have exactly this many rows.
  """

  def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LEN):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_p = dropout_p
    self.max_length = max_length

    double_width = self.hidden_size * 2

    self.embedding = nn.Embedding(self.output_size, self.hidden_size)
    # Scores one attention weight per slot from [embedded token ; hidden state].
    self.attn = nn.Linear(double_width, self.max_length)
    # Projects [embedded token ; attention context] back to hidden_size.
    self.attn_combine = nn.Linear(double_width, self.hidden_size)
    self.dropout = nn.Dropout(self.dropout_p)
    self.gru = nn.GRU(self.hidden_size, self.hidden_size)
    self.out = nn.Linear(self.hidden_size, self.output_size)

  def forward(self, input, hidden, encoder_outputs):
    """Run a single decoder step.

    Args:
      input: tensor holding one token index.
      hidden: previous GRU state of shape (1, 1, hidden_size).
      encoder_outputs: tensor of shape (max_length, hidden_size).

    Returns:
      Tuple `(output, hidden, attn_weights)`: log-probabilities of shape
      (1, output_size), the new GRU state, and the attention weights of
      shape (1, max_length).
    """
    token_embedding = self.dropout(self.embedding(input).view(1, 1, -1))

    # Attention distribution over the encoder slots.
    attn_features = torch.cat((token_embedding[0], hidden[0]), 1)
    attn_weights = F.softmax(self.attn(attn_features), dim=1)

    # Weighted sum of the encoder outputs.
    context = torch.bmm(attn_weights.unsqueeze(0),
                        encoder_outputs.unsqueeze(0))

    merged = torch.cat((token_embedding[0], context[0]), 1)
    gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))

    gru_output, hidden = self.gru(gru_input, hidden)

    log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
    return log_probs, hidden, attn_weights

In [50]:
# Spot-check the source vocabulary: display the index assigned to the word 'go'.
input_word_to_index['go']

2