In [1]:
import sys
import os
import pandas as pd
import string
import numpy as np
import re
import spacy
import es_core_news_sm
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from sklearn.model_selection import train_test_split
import torch
from model.Utils import GradientDescentMomentum
from model.Utils import exponentialDecaySchedule
from model.Seq2Seq_rnn import Seq2Seq_rnn

In [2]:
# Fixed seed so the train/validation split (and therefore the CSVs written from
# it) is identical on every Restart & Run All.
SPLIT_SEED = 42

# NOTE(review): sys.path[0] is assumed to be the notebook's directory — confirm
# for the environment this notebook is run in.
data_path = os.path.join(sys.path[0], 'data/spa.txt')

# read_table parses tab-separated text; because column names are supplied
# explicitly, the first line of the file is read as data rather than a header.
lines = pd.read_table(data_path, names=['eng', 'spa', 'comments'])

# Hold out 10% of the rows for validation.
train, valid = train_test_split(lines, test_size=0.1, random_state=SPLIT_SEED)

In [3]:
# Persist both splits as CSV in the working directory. to_csv keeps the
# DataFrame index, so each file gets an extra leading column.
# Note: a later cell rebinds `train` / `valid` to torchtext datasets, so this
# cell has to run before that one.
for split_frame, csv_name in ((train, 'train.csv'), (valid, 'valid.csv')):
    split_frame.to_csv(csv_name)

In [4]:
# spaCy is used for tokenization because it offers good support for languages
# other than English.
# Both fields share everything except the tokenizer language: lower-cased text
# wrapped in <sos> ... <eos> markers.
_common_field_options = dict(
    tokenize="spacy",
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)

eng_field = Field(tokenizer_language="en", **_common_field_options)
spa_field = Field(tokenizer_language="es", **_common_field_options)



In [5]:
# One (column name, Field) pair per CSV column, in file order. Columns paired
# with None are not loaded: the leading index column written by to_csv and the
# trailing 'comments' column.
tabular_data_fields = [
    ("id", None),
    ("eng", eng_field),
    ("spa", spa_field),
    ("comments", None),
]

In [6]:
# Load the two CSV files from the working directory as torchtext datasets,
# skipping the header row of each file.
# Note: this rebinds `train` / `valid`, which previously held pandas DataFrames.
train, valid = TabularDataset.splits(
    path='./',
    format='csv',
    fields=tabular_data_fields,
    skip_header=True,
    train='train.csv',
    validation='valid.csv',
)



In [7]:
# The English vocabulary is built from both the training and validation sets.
_eng_vocab_sources = (train, valid)
eng_field.build_vocab(*_eng_vocab_sources)

In [8]:
# The Spanish vocabulary is built from both the training and validation sets.
_spa_vocab_sources = (train, valid)
spa_field.build_vocab(*_spa_vocab_sources)

In [15]:
def _eng_length(example):
    """Return the length of an example's English side (passed as sort_key)."""
    return len(example.eng)


# Batches of 32 examples for both splits, keyed on English sentence length.
train_iter, val_iter = BucketIterator.splits(
    (train, valid),
    batch_size=32,
    sort_key=_eng_length,
)



In [18]:
def confirm_mapping(src, trg):
    """Translate arrays of vocabulary indices back into their tokens.

    Args:
        src: array-like of integer indices into ``eng_field.vocab.itos``.
        trg: array-like of integer indices into ``spa_field.vocab.itos``.

    Returns:
        Tuple ``(saved_src, saved_trg)`` of NumPy string arrays shaped like the
        inputs, with every index replaced by the token it stands for.
    """
    # otypes is given explicitly: without it np.vectorize has to call the
    # function on the first element to infer the output dtype and raises
    # ValueError when the input has size 0.
    to_eng_token = np.vectorize(lambda idx: eng_field.vocab.itos[idx], otypes=[str])
    to_spa_token = np.vectorize(lambda idx: spa_field.vocab.itos[idx], otypes=[str])

    # np.copy turns any array-like into a fresh ndarray, so the caller's data
    # is never touched.
    saved_src = to_eng_token(np.copy(src))
    saved_trg = to_spa_token(np.copy(trg))

    return saved_src, saved_trg

In [19]:
# Field names as plain strings, matching the 'eng' / 'spa' entries in
# tabular_data_fields.
eng, spa = "eng", "spa"

In [20]:
# Peek at a single validation batch to sanity-check tensor types and shapes.
# next(iter(...)) states the "first batch only" intent directly instead of a
# for-loop that always breaks after one pass; the None default keeps an empty
# iterator from raising StopIteration.
batch = next(iter(val_iter), None)
if batch is not None:
    print(type(batch.eng))
    saved = batch.spa.numpy()
    if isinstance(batch.eng, torch.Tensor):
        # Column 0 of each tensor, converted to NumPy for inspection.
        src2 = batch.eng[:, 0].numpy()
        trg2 = batch.spa[:, 0].numpy()
    print(batch.eng.shape)
    print(batch.spa.shape)

<class 'torch.Tensor'>
torch.Size([5, 32])
torch.Size([9, 32])


In [25]:
# The same size is used for both embedding dimensions and for the encoder and
# decoder neuron counts.
_layer_width = 1012

# <sos> / <eos> ids are looked up in the Spanish vocabulary.
_trg_stoi = spa_field.vocab.stoi

seq2seq_obj = Seq2Seq_rnn(
    eos_int=_trg_stoi['<eos>'],
    sos_int=_trg_stoi['<sos>'],
    vocab_size_src=len(eng_field.vocab),
    vocab_size_trg=len(spa_field.vocab),
    dim_embed_src=_layer_width,
    dim_embed_trg=_layer_width,
    src_map_i2c=eng_field.vocab.itos,
    trg_map_i2c=spa_field.vocab.itos,
    num_neurons_encoder=_layer_width,
    num_neurons_decoder=_layer_width,
    optim=GradientDescentMomentum,
)

In [35]:
# Field names handed to the trainer as src_name / trg_name.
eng, spa = "eng", "spa"

# NOTE(review): presumably (decay rate, decay steps) — confirm against
# model.Utils.exponentialDecaySchedule.
lr_schedule = exponentialDecaySchedule(0.96, 100)

In [34]:
# Training works, but it needs a very long time to run.
seq2seq_obj.train(
    data_loader=train_iter,
    valid_loader=val_iter,
    num_epochs=1000,
    batch_size=32,
    src_name=eng,
    trg_name=spa,
    padding_idx=spa_field.vocab.stoi['<pad>'],
    learn_rate=0.001,
    learning_schedule=lr_schedule,
    _testing=5,
)

