# Neural Machine Translation
While it might not seem like it, one of the very first development in Artificial Intellience is Neural Machine Translation. Traditionally, machine translation is a challenging task that involves large statistical models developed using high sophisticated linguistic knowledge. In neural machine translation, deep neural networks are developed for the problem. Advancing toward using Artificial Intelligence in machine translation task, AI to look for patterns in the input language and provide the target language representations as output. 

## Data Preparation
Like many machine learning task, we need to start with the data. In this tutorial, we'll use a dataset of English to Vietnamese phrases. Think of this as learning Vietnamese or English using flashcards. The dataset can be download [here](https://www.kaggle.com/datasets/hungnm/englishvietnamese-translation) and the credit for data preprocessing can be found [here](https://www.kaggle.com/code/huhuyngun/english-to-vietnamese-with-transformer) To prepare the dataset for modeling, we'll perform the following:

1. Start by reading in the associated data and scan through it
2. Cleanup punctuation
3. Process upper and lowercase words
4. Processing special characters
5. Handle duplciate phrases in English with different translations in Vietnamese


In [4]:
!pip install underthesea --quiet
!pip install torchtext --quiet

In [19]:
# import python libraries
import re
import string
import math
import pandas as pd
import numpy as np
from typing import Iterable, List
from unicodedata import normalize
from tqdm.notebook import tqdm

# NLP libraries
from gensim.models import KeyedVectors
from underthesea import word_tokenize  # Vietnamese NLP Toolkit
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


# NN Libraries
from transformers import AutoTokenizer

In [20]:
## Parameters
DATA_DIR = './data/'

In [22]:
## Helper Functions
def load_data(file_path:str) -> List[str]:
  """
    Function to load data from a text file
    Read it line by line and return as a list of strings
    Inputs:
      - file_path {string}: path to the file to be read
    Outputs:
      - data {list}: list of strings
  """

  data = []
  with open(file_path, 'rt', encoding='utf-8') as file:
    # read file line by line
    for line in file:
      # remove leading and trailing whitespaces
      line = line.strip()
      # append to data list
      data.append(line)
    # close file
    file.close()


  return data


def to_pairs(doc1: List[str], doc2: List[str]) -> List[str]:
  """
    Function to convert join two lists of strings into a list of pairs
    Inputs:
      - doc1 {list}: list of strings
      - doc2 {list}: list of strings
    Outputs:
      - pairs {list}: list of pairs
  """
  # initialize list of pairs
  pairs = []
  for i in range(0, len(doc1)):

    # append pair of strings
    pairs.append([doc1[i], doc2[i]])

  return pairs


# clean a list of lines
def clean_pairs(lines: List[str]) -> np.array:
  """
    Function to clean a list of pairs of strings
    Inputs:
      - lines {list}: list of pairs of strings
    Outputs:
      - cleaned {list}: list of cleaned pairs of
  """

  # delcare list and prepare regex for char filtering
  # also prepare translation table for removing punctuation
  cleaned = list()
  table = str.maketrans('', '', string.punctuation)

  for pair in tqdm(lines):
    clean_pair = list()
    # for each pari, perform the following operations
    # 1. tokenize on white space
    # 2. convert to lowercase
    # 3. remove punctuation from each token 
    # 4. remove extra whitespaces
    # 5. remove tokens with numbers in them
    # 6. store as string
    for line in pair:
      line = line.split()
      line = [word.lower() for word in line]
      line = [word.translate(table) for word in line]
      line = [re.sub("\s+", " ", w) for w in line]
      line = [word for word in line if word.isalpha()]
      clean_pair.append(' '.join(line))
      cleaned.append(clean_pair)
  return np.array(cleaned)

In [23]:
# Read in the data
# From initial inspection, the data between the English and Vietnamese sentences are aligned
# So we can read them in as pairs
english_text = load_data(DATA_DIR + 'raw/en_sents.txt')
vietnamese_text = load_data(DATA_DIR + 'raw/vi_sents.txt')
print(english_text[:5]), print(vietnamese_text[:5]), len(english_text), len(vietnamese_text)

['Please put the dustpan in the broom closet', 'Be quiet for a moment.', 'Read this', 'Tom persuaded the store manager to give him back his money.', 'Friendship consists of mutual understanding']
['xin vui lòng đặt đồ hốt rác trong tủ chổi', 'im lặng một lát', 'đọc này', 'tom thuyết phục người quản lý cửa hàng trả lại tiền cho anh ta.', 'tình bạn bao gồm sự hiểu biết lẫn nhau']


(None, None, 254090, 254090)

In [24]:
# convert to pairs
sentence_pairs = to_pairs(english_text, vietnamese_text)
sentence_pairs[:5]

[['Please put the dustpan in the broom closet',
  'xin vui lòng đặt đồ hốt rác trong tủ chổi'],
 ['Be quiet for a moment.', 'im lặng một lát'],
 ['Read this', 'đọc này'],
 ['Tom persuaded the store manager to give him back his money.',
  'tom thuyết phục người quản lý cửa hàng trả lại tiền cho anh ta.'],
 ['Friendship consists of mutual understanding',
  'tình bạn bao gồm sự hiểu biết lẫn nhau']]

In [25]:
# preprocessed data pairs
cleaned_pairs = clean_pairs(sentence_pairs)
print(cleaned_pairs[:5])

# Create dataframe with the token pairs
df = pd.DataFrame(cleaned_pairs, columns=['en', 'vi'])
df.head()

  0%|          | 0/254090 [00:00<?, ?it/s]

[['please put the dustpan in the broom closet'
  'xin vui lòng đặt đồ hốt rác trong tủ chổi']
 ['please put the dustpan in the broom closet'
  'xin vui lòng đặt đồ hốt rác trong tủ chổi']
 ['be quiet for a moment' 'im lặng một lát']
 ['be quiet for a moment' 'im lặng một lát']
 ['read this' 'đọc này']]


Unnamed: 0,en,vi
0,please put the dustpan in the broom closet,xin vui lòng đặt đồ hốt rác trong tủ chổi
1,please put the dustpan in the broom closet,xin vui lòng đặt đồ hốt rác trong tủ chổi
2,be quiet for a moment,im lặng một lát
3,be quiet for a moment,im lặng một lát
4,read this,đọc này


## Tokenizer
Now that we have prepared the data, it is time to tokenize it. Tokenization is the process of breaking down a sentence into indivial word, called token, and then assign a numerical value to it. A vocabulary is also created in this process to keep tract of the word to number consersion as well as the total number of unique words in our sample

In [28]:
# Create source and target lanauge
SRC_LANG = 'en'
TGT_LANG = 'vi'

# Create word to number and number to word dictionary
# token_index is a dictionary that maps a token to its index number
# index_token is a dictionary that maps an index number to its token
token_index = {}
index_token = {}

# Declare special tokens and their index
# these special tokens are unknown, pad, bos, eos
# Make sure the tokens are in order of their indices to properly insert them in vocab
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Tokenizer for vietnamese sentence
def vi_tokenizer(text):
  return word_tokenize(text, format='text')

# instantiating the tokenizer object
token_index[SRC_LANG] = get_tokenizer('basic_english')
token_index[TGT_LANG] = get_tokenizer(vi_tokenizer)



# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:    
  for index, data_sample in tqdm(data_iter):
      yield token_index[language](data_sample[language])


# begin tokenization process
for ln in [SRC_LANG, TGT_LANG]:
  # Training data Iterator
  train_iter = df.iterrows()
  # Create torchtext's Vocab object
  index_token[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                  min_freq=1,
                                                  specials=special_symbols,
                                                  special_first=True)
  

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANG, TGT_LANG]:
  index_token[ln].set_default_index(UNK_IDX)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [36]:
# visualizing our tokenized vocab
index_token['en'].get_itos()[:10], index_token['vi'].get_itos()[:10]

(['<unk>', '<pad>', '<bos>', '<eos>', 'the', 'to', 'i', 'tom', 'you', 'a'],
 ['<unk>', '<pad>', '<bos>', '<eos>', ' ', 'n', 'h', 't', 'i', 'c'])