# Data Preprocessing

In [37]:
import unicodedata
from collections import Counter
from tqdm import tqdm
import numpy as np

In [24]:
num_samples = 30000

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
    sent = unicode_to_ascii(sent.lower())

    sent = re.sub(r"([?.!,¿])", r" \1", sent)

    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

    sent = re.sub(r"\s+", " ", sent)

    return sent

def load_preprocessed_data():
  encoder_input, decoder_input, decoder_target = [], [], []

  with open("fra.txt", "rt", encoding='UTF8') as lines:
    for i, line in enumerate(lines):
      # source 데이터와 target 데이터 분리
      src_line, tar_line, _ = line.strip().split('\t')

      # source 데이터 전처리
      src_line = [w for w in preprocess_sentence(src_line).split()]

      # target 데이터 전처리
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("<sos> " + tar_line).split()]
      tar_line_out = [w for w in (tar_line + " <eos>").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)
      decoder_target.append(tar_line_out)

      if i == num_samples - 1:
        break

  return encoder_input, decoder_input, decoder_target

In [19]:
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('Before Preprocessing Eng Sen :', en_sent)
print('After Preprocessing Eng Sen :',preprocess_sentence(en_sent))
print('Before Preprocessing Fra Sen :', fr_sent)
print('After Preprocessing Fra Sen :', preprocess_sentence(fr_sent))

Before Preprocessing Eng Sen : Have you had dinner?
After Preprocessing Eng Sen : have you had dinner ?
Before Preprocessing Fra Sen : Avez-vous déjà diné?
After Preprocessing Fra Sen : avez vous deja dine ?


In [21]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()
print(sents_en_in[:3])
print(sents_fra_in[:3])
print(sents_fra_out[:3])

[['go', '.'], ['go', '.'], ['go', '.']]
[['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'en', 'route', '!']]
[['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['en', 'route', '!', '<eos>']]


In [11]:
def build_vocab(sents):
  word_list = []

  for sent in sents:
      for word in sent:
        word_list.append(word)

  word_counts = Counter(word_list)
  vocab = sorted(word_counts, key=word_counts.get, reverse=True)

  word_to_index = {}
  word_to_index['<PAD>'] = 0
  word_to_index['<UNK>'] = 1

  for index, word in enumerate(vocab) :
    word_to_index[word] = index + 2

  return word_to_index

In [26]:
src_vocab = build_vocab(sents_en_in)
tar_vocab = build_vocab(sents_fra_in + sents_fra_out)

src_vocab_size = len(src_vocab)
tar_vocab_size = len(tar_vocab)
print("ENG Word Size : {:d}, FRA Word Size : {:d}".format(src_vocab_size, tar_vocab_size))

ENG Word Size : 4287, FRA Word Size : 7476


In [29]:
index_to_src = {v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}

def texts_to_sequences(sents, word_to_index):
  encoded_X_data = []
  for sent in tqdm(sents):
    index_sequences = []
    for word in sent:
      try:
          index_sequences.append(word_to_index[word])
      except KeyError:
          index_sequences.append(word_to_index['<UNK>'])
    encoded_X_data.append(index_sequences)
  return encoded_X_data

In [30]:
encoder_input = texts_to_sequences(sents_en_in, src_vocab)
decoder_input = texts_to_sequences(sents_fra_in, tar_vocab)
decoder_target = texts_to_sequences(sents_fra_out, tar_vocab)

100%|████████████████████████████████████████████████████████████████████████████████████| 30000/30000 [00:00<00:00, 1303131.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 30000/30000 [00:00<00:00, 288201.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 30000/30000 [00:00<00:00, 1152861.97it/s]


In [36]:
def pad_sequences(sentences, max_len=None):
    if max_len is None:
        max_len = max([len(sentence) for sentence in sentences])

    features = np.zeros((len(sentences), max_len), dtype=int)
    for index, sentence in enumerate(sentences):
        if len(sentence) != 0:
            features[index, :len(sentence)] = np.array(sentence)[:max_len]
    return features

In [38]:
encoder_input = pad_sequences(encoder_input)
decoder_input = pad_sequences(decoder_input)
decoder_target = pad_sequences(decoder_target)

In [39]:
encoder_input.shape, decoder_input.shape, decoder_target.shape

((30000, 7), (30000, 16), (30000, 16))

In [42]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('Random Sequence :',indices)
print(len(indices))

Random Sequence : [27798  3040 28667 ... 22736  3240  9997]
30000


In [43]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [45]:
print([index_to_src[word] for word in encoder_input[20997]])
print([index_to_tar[word] for word in decoder_input[20997]])
print([index_to_tar[word] for word in decoder_target[20997]])

['i', 'had', 'fun', 'today', '.', '<PAD>', '<PAD>']
['<sos>', 'je', 'me', 'suis', 'bien', 'amuse', 'aujourd', 'hui', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['je', 'me', 'suis', 'bien', 'amuse', 'aujourd', 'hui', '.', '<eos>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [46]:
n_of_val = int(num_samples * 0.1)

encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

# Seq2Seq Model

In [63]:
import torch
import torch.nn as nn
import torch.optim as optim

class Encoder(nn.Module):
    def __init__(self, src_vocab_size, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(src_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, cell) = self.lstm(x)
        return hidden, cell

In [64]:
class Decoder(nn.Module):
    def __init__(self, tar_vocab_size, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(tar_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, tar_vocab_size)

    def forward(self, x, hidden, cell):
        x = self.embedding(x)

        output, (hidden, cell) = self.lstm(x, (hidden, cell))

        output = self.fc(output)

        return output, hidden, cell

In [65]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        hidden, cell = self.encoder(src)

        output, _, _ = self.decoder(trg, hidden, cell)
        return output

In [66]:
embedding_dim = 256
hidden_units = 256

encoder = Encoder(src_vocab_size, embedding_dim, hidden_units)
decoder = Decoder(tar_vocab_size, embedding_dim, hidden_units)
model = Seq2Seq(encoder, decoder)

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

In [67]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4287, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(7476, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=7476, bias=True)
  )
)

In [68]:
def evaluation(model, dataloader, loss_function, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for encoder_inputs, decoder_inputs, decoder_targets in dataloader:
            encoder_inputs = encoder_inputs.to(device)
            decoder_inputs = decoder_inputs.to(device)
            decoder_targets = decoder_targets.to(device)
            
            outputs = model(encoder_inputs, decoder_inputs)

            loss = loss_function(outputs.view(-1, outputs.size(-1)), decoder_targets.view(-1))
            total_loss += loss.item()

            mask = decoder_targets != 0
            total_correct += ((outputs.argmax(dim=-1) == decoder_targets) * mask).sum().item()
            total_count += mask.sum().item()

    return total_loss / len(dataloader), total_correct / total_count

In [69]:
from torch.utils.data import DataLoader, TensorDataset

In [70]:
encoder_input_train_tensor = torch.tensor(encoder_input_train, dtype=torch.long)
decoder_input_train_tensor = torch.tensor(decoder_input_train, dtype=torch.long)
decoder_target_train_tensor = torch.tensor(decoder_target_train, dtype=torch.long)

encoder_input_test_tensor = torch.tensor(encoder_input_test, dtype=torch.long)
decoder_input_test_tensor = torch.tensor(decoder_input_test, dtype=torch.long)
decoder_target_test_tensor = torch.tensor(decoder_target_test, dtype=torch.long)

batch_size = 128

train_dataset = TensorDataset(encoder_input_train_tensor, decoder_input_train_tensor, decoder_target_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = TensorDataset(encoder_input_test_tensor, decoder_input_test_tensor, decoder_target_test_tensor)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [71]:
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4287, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(7476, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=7476, bias=True)
  )
)

In [72]:
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()

    for encoder_inputs, decoder_inputs, decoder_targets in train_dataloader:
        encoder_inputs = encoder_inputs.to(device)
        decoder_inputs = decoder_inputs.to(device)
        decoder_targets = decoder_targets.to(device)

        optimizer.zero_grad()

        outputs = model(encoder_inputs, decoder_inputs)

        loss = loss_function(outputs.view(-1, outputs.size(-1)), decoder_targets.view(-1))
        loss.backward()

        optimizer.step()

    train_loss, train_acc = evaluation(model, train_dataloader, loss_function, device)
    valid_loss, valid_acc = evaluation(model, valid_dataloader, loss_function, device)

    print(f'Epoch: {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc:.4f}')

    if valid_loss < best_val_loss:
        print(f'Validation loss improved from {best_val_loss:.4f} to {valid_loss:.4f}.')
        best_val_loss = valid_loss

Epoch: 1/30 | Train Loss: 3.0362 | Train Acc: 0.5158 | Valid Loss: 3.1349 | Valid Acc: 0.5129
Validation loss improved from inf to 3.1349.
Epoch: 2/30 | Train Loss: 2.3663 | Train Acc: 0.5938 | Valid Loss: 2.5737 | Valid Acc: 0.5842
Validation loss improved from 3.1349 to 2.5737.
Epoch: 3/30 | Train Loss: 1.9521 | Train Acc: 0.6357 | Valid Loss: 2.2783 | Valid Acc: 0.6200
Validation loss improved from 2.5737 to 2.2783.
Epoch: 4/30 | Train Loss: 1.6364 | Train Acc: 0.6727 | Valid Loss: 2.0837 | Valid Acc: 0.6400
Validation loss improved from 2.2783 to 2.0837.
Epoch: 5/30 | Train Loss: 1.3825 | Train Acc: 0.7063 | Valid Loss: 1.9399 | Valid Acc: 0.6592
Validation loss improved from 2.0837 to 1.9399.
Epoch: 6/30 | Train Loss: 1.1646 | Train Acc: 0.7444 | Valid Loss: 1.8242 | Valid Acc: 0.6723
Validation loss improved from 1.9399 to 1.8242.
Epoch: 7/30 | Train Loss: 0.9765 | Train Acc: 0.7779 | Valid Loss: 1.7301 | Valid Acc: 0.6848
Validation loss improved from 1.8242 to 1.7301.
Epoch: 8/

#### Code Source : '딥 러닝 파이토치 교과서 - 입문부터 파인튜닝까지'