In [2]:
import pandas as pd
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from sentence_dataset_class import ProcessedSentences

  from .autonotebook import tqdm as notebook_tqdm


In [74]:
# Load ../data/test_data.json into a DataFrame.
df = pd.read_json('../data/test_data.json')

In [75]:
# Show the frame's (n_rows, n_columns).
df.shape

(25799, 2)

In [4]:
# Cast both text columns to pandas' 'string' dtype, one column at a time.
for column in ('input_data', 'output_data'):
    df[column] = df[column].astype('string')

In [5]:
# Check the dtype of each column after the cast.
df.dtypes

input_data     string
output_data    string
dtype: object

In [5]:
# Preview the first rows of the input/output text columns.
df.head()

Unnamed: 0,input_data,output_data
0,louise - i 'm already booked morning ( includi...,louise - i 'm already booked for friday mornin...
1,back to you additional concerns .,please get back to me if you have any addition...
2,i am the revised confirmation with the two req...,i am enclosing the revised confirmation with t...
3,i have received the and have only two minor .,i have received the email below and have only ...
4,do the pa and eta for brankrupcty swaps in eac...,do the pa and eta work for brankrupcty swaps i...


In [2]:
# torchtext's 'basic_english' tokenizer; used below as token_transform(text).
token_transform = get_tokenizer('basic_english')

In [4]:
# Inspect what kind of object get_tokenizer returned.
type(token_transform)

function

In [7]:
from typing import Iterable, Iterator, List
def yield_tokens(data_iter: Iterable) -> Iterator[List[str]]:
    """Lazily tokenize the text of every sample in ``data_iter``.

    Args:
        data_iter: Iterable of indexable samples whose element at position 1
            is the text to tokenize (e.g. the ``(index, value)`` pairs
            produced by iterating over a pandas Series' items).

    Yields:
        The token list returned by the notebook-level ``token_transform``
        for each sample, in iteration order.
    """
    # This is a generator, so the return annotation is an iterator of token
    # lists rather than a single list of strings.
    for data_sample in data_iter:
        yield token_transform(data_sample[1])

In [10]:
# Special tokens, listed in the order of their integer ids.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
# Each constant is the position of its token in special_symbols (0, 1, 2, 3).
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

In [9]:
# Sanity check: '<unk>' is at position 0 of special_symbols, matching UNK_IDX.
special_symbols.index('<unk>')

0

In [9]:
# Build one vocabulary per text column. Tokens seen fewer than twice are
# dropped, and the special symbols occupy the first ids.
# Series.items() is used instead of Series.iteritems(): the latter was
# deprecated and then removed in pandas 2.0, while items() yields the same
# (index, value) pairs that yield_tokens expects.
data_iterator_input = yield_tokens(df['input_data'].items())
input_vocab = build_vocab_from_iterator(data_iterator_input,
                                                    min_freq=2,
                                                    specials=special_symbols,
                                                    special_first=True)
# Out-of-vocabulary tokens map to the '<unk>' id.
input_vocab.set_default_index(UNK_IDX)
data_iterator_output = yield_tokens(df['output_data'].items())
output_vocab = build_vocab_from_iterator(data_iterator_output,
                                                    min_freq=2,
                                                    specials=special_symbols,
                                                    special_first=True)
output_vocab.set_default_index(UNK_IDX)

In [10]:
def tokenize_sentence(sentence):
    """Split ``sentence`` into tokens using the notebook-level ``token_transform``."""
    tokens = token_transform(sentence)
    return tokens
def vocab_transform(tokens: List[str], vocab: "torchtext.vocab.Vocab") -> List[int]:
    """Map each token to its integer id in ``vocab``.

    Args:
        tokens: Tokens to look up, in order.
        vocab: Vocabulary supporting ``vocab[token]`` lookup.

    Returns:
        One id per token, in the same order as ``tokens``.
    """
    # Use the public vocab[token] lookup rather than calling __getitem__ on
    # the wrapped vocab.vocab object directly.
    return [vocab[token] for token in tokens]
def tensor_transform(token_ids):
    """Wrap ``token_ids`` with the BOS and EOS ids and return a 1-D long tensor.

    Args:
        token_ids: Sequence of integer token ids (may be empty).

    Returns:
        ``torch.long`` tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Build a single tensor with an explicit integer dtype. torch.tensor([])
    # defaults to float32, so concatenating separately built tensors could
    # yield a non-integer result for an empty sentence.
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)

In [11]:
def sentence_processing(sentence, vocab=None):
    """Tokenize ``sentence``, map the tokens to ids and wrap them in BOS/EOS.

    Args:
        sentence: Raw text to encode.
        vocab: Vocabulary used for the token-to-id lookup. Defaults to the
            notebook-level ``input_vocab`` so existing one-argument calls
            behave as before; pass ``output_vocab`` to encode target text.

    Returns:
        The tensor produced by ``tensor_transform`` for the looked-up ids.
    """
    if vocab is None:
        # Resolved at call time so the default follows the current input_vocab.
        vocab = input_vocab
    tokens = tokenize_sentence(sentence)
    token_ids = vocab_transform(tokens, vocab)
    return tensor_transform(token_ids)

In [12]:
# Encode each column with its own vocabulary. sentence_processing(sentence)
# looks tokens up in input_vocab, so the output column is encoded through the
# underlying helpers with output_vocab instead; otherwise output_vocab would
# never be used and target ids would come from the input vocabulary.
input_sentences = [sentence_processing(sentence) for sentence in df['input_data'].values]
output_sentences = [
    tensor_transform(vocab_transform(tokenize_sentence(sentence), output_vocab))
    for sentence in df['output_data'].values
]

In [13]:
# Right-pad every encoded sentence with PAD_IDX and stack them batch-first,
# rebinding both names from lists of tensors to single 2-D tensors.
input_sentences, output_sentences = (
    pad_sequence(encoded, batch_first=True, padding_value=PAD_IDX)
    for encoded in (input_sentences, output_sentences)
)

In [14]:
# Prefer the GPU but fall back to the CPU so the notebook still runs on
# machines without CUDA.
# NOTE(review): how ProcessedSentences uses `device` is defined in
# sentence_dataset_class, which is not shown here.
dataset = ProcessedSentences(
    input_data=input_sentences,
    output_data= output_sentences,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [15]:
# Serve the dataset in shuffled mini-batches of 32 samples.
dataloader = DataLoader(dataset,batch_size=32,shuffle=True)

In [16]:
# Pull a single batch and print the shapes of its first two elements.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    print(first_batch[0].shape)
    print(first_batch[1].shape)

torch.Size([32, 345])
torch.Size([32, 390])


In [6]:
# Load ../data/train_data.json; this rebinds `df`, which an earlier cell in
# this file assigned from test_data.json.
df = pd.read_json('../data/train_data.json')

In [7]:
# Create the 'basic_english' tokenizer that is passed to the helpers below.
token_transform = get_tokenizer('basic_english')

In [8]:
from sentence_processing import build_vocab,sentence_processing

In [11]:
# One vocabulary per text column, built with the imported build_vocab helper.
input_vocab, output_vocab = (
    build_vocab(df[column], token_transform, special_symbols)
    for column in ('input_data', 'output_data')
)

In [12]:
# Encode every sentence with the vocabulary of its own column; the BOS/EOS ids
# are looked up by position in special_symbols.
input_sentences = [
    sentence_processing(
        text,
        input_vocab,
        token_transform,
        special_symbols.index('<bos>'),
        special_symbols.index('<eos>'),
    )
    for text in df['input_data'].values
]
output_sentences = [
    sentence_processing(
        text,
        output_vocab,
        token_transform,
        special_symbols.index('<bos>'),
        special_symbols.index('<eos>'),
    )
    for text in df['output_data'].values
]

In [38]:
# Pad the encoded inputs in both layouts for comparison: pad_sequence's default
# (batch_first=False) and batch-first.
input_sentences_padded = pad_sequence(
    input_sentences,
    batch_first=False,
    padding_value=special_symbols.index('<pad>'),
)
input_sentences_padded_bf = pad_sequence(
    input_sentences,
    batch_first=True,
    padding_value=special_symbols.index('<pad>'),
)

In [14]:
# Length of every encoded input sentence, in order.
lengths = list(map(len, input_sentences))

In [72]:
# Length of the longest encoded input sentence.
max(lengths)

345

In [73]:
# Number of encoded input sentences.
len(lengths)

212541

In [22]:
# Take the first two encoded sentences as a small sample for inspecting padding.
test_sent = input_sentences[:2]

In [43]:
# Pad the two-sentence sample in both layouts: pad_sequence's default
# (batch_first=False) and batch-first.
test_sent_padded = pad_sequence(
    test_sent,
    batch_first=False,
    padding_value=special_symbols.index('<pad>'),
)
test_sent_padded_bf = pad_sequence(
    test_sent,
    batch_first=True,
    padding_value=special_symbols.index('<pad>'),
)

In [34]:
# Shape of the first sample sentence before padding.
test_sent[0].shape

torch.Size([30])

In [69]:
# With the default layout, dim 0 is the sequence position and dim 1 the sentence.
test_sent_padded.size()

torch.Size([30, 2])

In [56]:
# First ten positions of both sample sentences, one sentence per column.
test_sent_padded[:10]

tensor([[   2,    2],
        [1129,  115],
        [  16,    7],
        [   9,   18],
        [  13,  997],
        [  65,  582],
        [ 226,    4],
        [2936,    3],
        [ 276,    1],
        [  32,    1]])

In [67]:
# Drop the first column (the first sentence), then show the first ten positions
# of what remains.
test_sent_padded[:,1:][:10]

tensor([[  2],
        [115],
        [  7],
        [ 18],
        [997],
        [582],
        [  4],
        [  3],
        [  1],
        [  1]])

In [45]:
# Batch-first layout of the sample: dim 0 is the sentence, dim 1 the position.
test_sent_padded_bf.shape

torch.Size([2, 30])

In [71]:
# Shape of the full padded input set in the default (batch_first=False) layout.
input_sentences_padded.shape

torch.Size([345, 212541])

In [39]:
# Shape of the full padded input set in the batch-first layout.
input_sentences_padded_bf.shape

torch.Size([212541, 345])