In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import tensorflow as tf
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils
from sklearn.model_selection import train_test_split
import torch
import unicodedata
import re
import numpy as np
import os
import time
import datetime
import LoggerYN as YN

In [None]:
def unicode_to_ascii(s):
    """Strip combining accent marks from ``s``.

    The string is decomposed with Unicode NFD normalization and every
    character whose Unicode category is 'Mn' is dropped. All other
    characters are kept as they are, so characters that are not combining
    marks (for example the inverted question mark) pass through untouched.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalize one sentence and wrap it in start/end markers.

    Steps, in order: lower-case and strip the input, remove accents via
    ``unicode_to_ascii``, apply three regex substitutions, strip surrounding
    whitespace, and finally add the '<start>' / '<end>' tokens so the model
    knows where a sentence begins and ends.
    """
    text = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # Put a space on both sides of ? . ! , ¿
        # e.g. "he is a boy." => "he is a boy ."
        (r"([?.!,¿])", r" \1 "),
        # Collapse runs of spaces and double-quote characters into one space.
        (r'[" "]+', " "),
        # Replace every run of characters other than a-z, A-Z, ? . ! , ¿
        # with a single space.
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip()

    return '<start> ' + text + ' <end>'

def create_dataset(path):
    """Read a tab-separated corpus file and clean every sentence in it.

    The file at ``path`` is read as UTF-8, stripped, and split into lines.
    Each line is split on tab characters and every resulting field is passed
    through ``preprocess_sentence`` (accent removal, cleaning, start/end
    tokens).

    Returns:
        A list with one inner list per line, holding the cleaned fields in
        file order. Callers in this file unpack each inner list as
        ``[ENGLISH, SPANISH]``.
    """
    # The context manager closes the file handle even if reading fails;
    # a bare open(...).read() leaves that to the garbage collector.
    with open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')

    return [[preprocess_sentence(field) for field in line.split('\t')]
            for line in lines]

# This class creates a word -> index mapping (e.g., "dad" -> 5) and vice versa
# (e.g., 5 -> "dad") for each language.
class LanguageIndex():
    """Vocabulary with word -> id and id -> word lookups for one language.

    ``lang`` is an iterable of phrases. Each phrase is split on single
    spaces; the distinct tokens are sorted and numbered from 1, while id 0
    is assigned to '<pad>' before the tokens are numbered. After
    construction ``vocab`` is the sorted token list, ``word2idx`` maps
    token -> id and ``idx2word`` is its inverse.
    """

    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        """Fill ``vocab``, ``word2idx`` and ``idx2word`` from ``self.lang``."""
        for sentence in self.lang:
            for token in sentence.split(' '):
                self.vocab.add(token)

        # From here on ``vocab`` is a sorted list rather than a set.
        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        self.word2idx.update(
            (token, position)
            for position, token in enumerate(self.vocab, start=1)
        )

        self.idx2word.update(
            (position, token) for token, position in self.word2idx.items()
        )
            
def max_length(tensor):
    """Return the length of the longest sequence in ``tensor``."""
    return max(map(len, tensor))


def load_dataset(path):
    """Load the corpus at ``path`` and turn it into padded id matrices.

    Sentence pairs come from ``create_dataset`` and are unpacked as
    (english, spanish). Spanish is the input language, English the target.
    Each sentence is split on single spaces, mapped to ids through a
    ``LanguageIndex`` and post-padded to the longest sentence of its
    language.

    Returns:
        (input_tensor, target_tensor, inp_lang, targ_lang,
         max_length_inp, max_length_tar)
    """
    # Cleaned [english, spanish] pairs.
    pairs = create_dataset(path)

    # One vocabulary per language.
    inp_lang = LanguageIndex(spanish for _english, spanish in pairs)
    targ_lang = LanguageIndex(english for english, _spanish in pairs)

    # Spanish sentences as lists of ids.
    input_ids = [
        [inp_lang.word2idx[token] for token in spanish.split(' ')]
        for _english, spanish in pairs
    ]

    # English sentences as lists of ids.
    target_ids = [
        [targ_lang.word2idx[token] for token in english.split(' ')]
        for english, _spanish in pairs
    ]

    # Pad every sentence up to the longest one seen in the dataset.
    max_length_inp = max_length(input_ids)
    max_length_tar = max_length(target_ids)

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    input_tensor = pad_sequences(input_ids, maxlen=max_length_inp, padding='post')
    target_tensor = pad_sequences(target_ids, maxlen=max_length_tar, padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a single-layer, batch-first LSTM.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (LSTM input size).
        enc_units: LSTM hidden size.
        batch_sz: stored and used only by ``initialize_hidden_state``.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim, hidden_size=self.enc_units, batch_first=True)

        lstm = self.LSTM
        # Weight initialisers draw random numbers; their order is kept so a
        # fixed seed yields the same parameters.
        nn.init.xavier_uniform_(lstm.weight_ih_l0)
        nn.init.orthogonal_(lstm.weight_hh_l0)

        # All biases start at zero ...
        for bias in (lstm.bias_ih_l0, lstm.bias_hh_l0):
            nn.init.constant_(bias, 0.0)
        # ... except the second ``enc_units``-sized chunk of the input bias,
        # which PyTorch's LSTM parameter layout assigns to the forget gate.
        second_chunk = slice(self.enc_units, 2 * self.enc_units)
        nn.init.constant_(lstm.bias_ih_l0[second_chunk], 1.0)

    def forward(self, x, hidden):
        """Embed ``x`` and run it through the LSTM.

        ``hidden`` is accepted but not passed to the LSTM, so the LSTM is
        called without an explicit initial state.

        Returns:
            (output, state) exactly as produced by ``nn.LSTM``.
        """
        embedded = self.embedding(x)
        outputs, final_state = self.LSTM(embedded)
        return outputs, final_state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape (batch_sz, enc_units)."""
        return torch.zeros(self.batch_sz, self.enc_units)
class Decoder(nn.Module):
    """Embedding -> single-layer batch-first LSTM -> linear vocabulary scores.

    Args:
        vocab_size: embedding rows and width of the output layer.
        embedding_dim: size of each embedding vector (LSTM input size).
        dec_units: LSTM hidden size and input width of the output layer.
        batch_sz: stored and used only by ``initialize_hidden_state``.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # Layers are created in this order so that a fixed seed yields the
        # same default parameters.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim, hidden_size=self.dec_units, batch_first=True)
        self.fc = nn.Linear(self.dec_units, vocab_size)

        lstm = self.LSTM
        nn.init.xavier_uniform_(lstm.weight_ih_l0)
        nn.init.orthogonal_(lstm.weight_hh_l0)

        # All biases start at zero ...
        for bias in (lstm.bias_ih_l0, lstm.bias_hh_l0):
            nn.init.constant_(bias, 0.0)
        # ... except the second ``dec_units``-sized chunk of the input bias,
        # which PyTorch's LSTM parameter layout assigns to the forget gate.
        second_chunk = slice(self.dec_units, 2 * self.dec_units)
        nn.init.constant_(lstm.bias_ih_l0[second_chunk], 1.0)

    def forward(self, x, hidden, enc_output):
        """Score every vocabulary entry at every position of ``x``.

        ``hidden`` is handed to the LSTM as its initial state.
        ``enc_output`` is accepted but not used.

        Returns:
            (scores, state): the linear layer applied to the LSTM output,
            and the LSTM's final state.
        """
        embedded = self.embedding(x)
        lstm_out, final_state = self.LSTM(embedded, hidden)
        scores = self.fc(lstm_out)

        return scores, final_state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape (batch_sz, dec_units)."""
        return torch.zeros(self.batch_sz, self.dec_units)
    


In [None]:
class Encap(nn.Module):
    """Bundle an encoder and a decoder; ``forward`` returns the batch loss."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, targ, hidden, BATCH_SIZE, vocab_tar_size):
        """Encode ``inp``, decode ``targ`` with teacher forcing, return loss.

        The encoder is called with ``[hidden, hidden]`` and its returned
        state seeds the decoder. The decoder reads ``targ`` without its last
        column and its predictions are scored against ``targ`` without its
        first column. ``BATCH_SIZE`` is not used here.
        """
        enc_output, enc_state = self.encoder(inp, [hidden, hidden])

        decoder_inputs = targ[:, :-1]
        predictions, _final_state = self.decoder(decoder_inputs, enc_state, enc_output)

        return loss_function(targ[:, 1:], predictions, vocab_tar_size)

In [None]:
def create_db(path_to_file):
    """Load the corpus and split it into training and validation sets.

    Uses ``load_dataset`` for vectorisation and ``train_test_split`` with
    ``test_size=0.2`` and ``random_state=42`` for the 80-20 split.

    Returns:
        (input_tensor_train, input_tensor_val, target_tensor_train,
         target_tensor_val, vocab_inp_size, vocab_tar_size,
         max_length_inp, max_length_targ), where the vocabulary sizes are
        the number of entries in each language's ``word2idx``.
    """
    (input_tensor, target_tensor, inp_lang, targ_lang,
     max_length_inp, max_length_targ) = load_dataset(path_to_file)

    # 80-20 train/validation split with a fixed seed.
    split = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = split

    return (
        input_tensor_train,
        input_tensor_val,
        target_tensor_train,
        target_tensor_val,
        len(inp_lang.word2idx),
        len(targ_lang.word2idx),
        max_length_inp,
        max_length_targ,
    )

In [None]:
def loss_function(real, pred, vocab):
    """Mean negative log-likelihood of ``real`` under the scores ``pred``.

    ``pred`` is viewed as (-1, vocab), turned into log-probabilities with a
    log-softmax over dim 1, and compared against ``real`` flattened to one
    dimension. Every position contributes to the mean; no target id is
    excluded.
    """
    flat_scores = pred.view(-1, vocab)
    log_probs = F.log_softmax(flat_scores, dim=1)
    flat_targets = real.reshape(-1)
    return F.nll_loss(log_probs, flat_targets)

In [None]:
def train(model, epoch, my_dataloader, hidden, BATCH_SIZE, vocab_tar_size, optimizer):
    """Run one pass over ``my_dataloader``, taking one optimizer step per batch.

    For each (inp, targ) batch the model's gradients are cleared, the model
    is called to obtain the loss, the loss is back-propagated and
    ``optimizer.step()`` is applied. The loss is printed for batch 0 and
    every 300th batch after it. ``epoch`` is not used inside this function.
    """
    log_every = 300
    for step, batch_pair in enumerate(my_dataloader):
        inp, targ = batch_pair

        model.zero_grad()
        batch_loss = model(inp, targ, hidden, BATCH_SIZE, vocab_tar_size)
        batch_loss.backward()
        optimizer.step()

        if step % log_every:
            continue
        print('Batch {} Loss {}'.format(step, batch_loss))

In [None]:
def test(model,val_dataloader,hidden,BATCH_SIZE,vocab_tar_size):
    t_loss = 0
    for (batch, (inp, targ)) in enumerate(my_dataloader):
        loss = model(inp,targ,hidden,BATCH_SIZE,vocab_tar_size)
        t_loss +=loss.data.numpy()
    print('Validation Perplexity :{}'.format(np.power(2,t_loss/batch)))

In [None]:
def run(BATCH_SIZE,  embedding_dim, units, epochs):
    """Download the spa-eng corpus, then train and evaluate the seq2seq model.

    Steps: fetch and extract the corpus with ``tf.keras.utils.get_file``,
    build train/validation tensors with ``create_db``, wrap them in
    ``DataLoader`` objects, create the ``Encoder``/``Decoder``/``Encap``
    model with an Adam optimizer (lr 1e-4), evaluate once, then alternate
    ``train`` and ``test`` for ``epochs`` epochs. Resource logging from
    ``LoggerYN`` brackets the whole loop and the elapsed wall time is
    printed at the end.

    Args:
        BATCH_SIZE: batch size for both data loaders and the models.
        embedding_dim: embedding width for encoder and decoder.
        units: LSTM hidden size for encoder and decoder.
        epochs: number of training epochs.
    """
    path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', extract=True)
    path_to_file = os.path.join(os.path.dirname(path_to_zip), 'spa-eng', 'spa.txt')
    torch.manual_seed(1)
    (input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val,
     vocab_inp_size, vocab_tar_size, max_length_inp, max_length_targ) = create_db(path_to_file)

    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _as_long_tensor(array):
        # One bulk conversion and one transfer per array (instead of one per
        # row); int64 is the index dtype nn.Embedding expects.
        return torch.as_tensor(np.asarray(array), dtype=torch.long).to(device)

    # Training AND validation data go to the same device as the model.
    tensor_x = _as_long_tensor(input_tensor_train)
    tensor_y = _as_long_tensor(target_tensor_train)
    val_x = _as_long_tensor(input_tensor_val)
    val_y = _as_long_tensor(target_tensor_val)

    my_dataset = utils.TensorDataset(tensor_x, tensor_y)
    my_dataloader = utils.DataLoader(my_dataset, batch_size=BATCH_SIZE)
    val_dataset = utils.TensorDataset(val_x, val_y)
    val_dataloader = utils.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Create the model.
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
    model = Encap(encoder, decoder)
    model.to(device)
    # Reached through the torch package: no bare ``cudnn`` name is imported
    # in this file.
    torch.backends.cudnn.benchmark = True
    # model.parameters() yields the encoder's parameters followed by the
    # decoder's, i.e. everything that is trained.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    memT, cpuT, gpuT = YN.StartLogger("PyTorch", "Manythings")
    start = time.time()
    try:
        hidden = encoder.initialize_hidden_state().to(device)
        test(model, val_dataloader, hidden, BATCH_SIZE, vocab_tar_size)
        for epoch in range(epochs):
            train(model, epoch, my_dataloader, hidden, BATCH_SIZE, vocab_tar_size, optimizer)
            test(model, val_dataloader, hidden, BATCH_SIZE, vocab_tar_size)
        end = time.time()
    finally:
        # NOTE(review): presumably StartLogger starts background monitors;
        # stop them even when training raises - confirm against LoggerYN.
        YN.EndLogger(memT, cpuT, gpuT)
    print(str(datetime.timedelta(seconds=end-start)))

In [None]:
# Start the benchmark: 10 epochs, batches of 128, 256-wide embeddings and
# 256 LSTM units.
run(BATCH_SIZE=128, embedding_dim=256, units=256, epochs=10)