# Build Pytorch Datasets
+ 0, goal: convert text -> numerical values and batch
+ 1, **mapping**: `get_tokenizer`, `Vacab`
> map each vocabulary to a index
    
+ 2, **Datasets**: `Dataset`
> set up Pytorch type datasets `Dataset`

+ 3, **padding**: `pad_sequence`
> set up paddings for every batch to keep same format.

In [1]:
import os # loading file path
import pandas as pd # for lookup in annotation file
import spacy # tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # for padding batch
from torch.utils.data import DataLoader, Dataset
# from PIL import Image #load img

In [2]:
class Vocabulary_Persson:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<BOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<BOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [59]:
import io
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.utils import extract_archive

path = '/home/sharma/Desktop/DeepLearning/Testing/Datasets/multi30k-dataset/data/task1/raw/'
train_files = ('train.de.gz', 'train.en.gz')
val_files = ('val.de.gz', 'val.en.gz')
test_files = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(path + file)[0] for file in train_files]
val_filepaths = [extract_archive(path + file)[0] for file in val_files]
test_filepaths = [extract_archive(path + file)[0] for file in test_files]

de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(filepath, tokenizer, min_freq):
    counter = Counter()
#     with io.open(filepath, encoding="utf8") as f:
    with io.open(filepath) as f:
        for s_ in f:
            counter.update(tokenizer(s_))
    return Vocab(counter, specials=['<UNK>', '<PAD>', '<BOS>', '<EOS>'], min_freq=min_freq)
de_test_voc = build_vocab(test_filepaths[0], de_tokenizer, 3)
ts = de_test_voc.stoi
# print(ts['<BOS>'])
# print(de_test_voc['<BOS>'])
# print(de_test_voc['Es'])
print(de_test_voc.stoi['es'])
# print(de_test_voc.stoi['boston'])
print(de_test_voc.itos[439])

def numericalizer(vocab, tokenizer, text):
    text.lower()
    numerical_tok = tokenizer(text)
    numerical_sen = []
    for tok in numerical_tok:
        #print(vocab.stoi[tok])
        if tok not in vocab.stoi:
        #if vocab.stoi[tok] is None:
            vocab.stoi[tok] = 0
        numerical_sen.append(vocab.stoi[tok])
    return numerical_sen
print(numericalizer(de_test_voc, de_tokenizer, "boston ist ein city, Es ist die Stadt"))
    
    

439
es
[0, 72, 16, 0, 14, 0, 72, 24, 125]


In [142]:
class Vocabulary(Vocab):
    
    def __init__(self,):
        self.de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
        self.en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

    def build_vocab(filepath, tokenizer, min_freq):
        counter = Counter()
    #     with io.open(filepath, encoding="utf8") as f:
        with io.open(filepath) as f:
            for s_ in f:
                counter.update(tokenizer(s_))
        return Vocab(counter, specials=['<UNK>', '<PAD>', '<BOS>', '<EOS>'], min_freq=min_freq)
#     de_test_voc = build_vocab(test_filepaths[0], de_tokenizer, 3)
#     ts = de_test_voc.stoi
#     print(ts['<BOS>'])
#     print(de_test_voc['<BOS>'])
#     print(de_test_voc['Es'])
#     print(de_test_voc.stoi['es'])
#     print(de_test_voc.itos[439])
    print(de_test_voc.itos[5])
    print(de_test_voc['zaun'])

    def numericalizer(vocab, tokenizer, text):
        text.lower()
        numerical_tok = tokenizer(text)
        numerical_sen = []
        for tok in numerical_tok:
            print(vocab.stoi[tok])
            if vocab.stoi[tok] is None:
                vocab.stoi[tok] = 0
            numerical_sen.append(vocab.stoi[tok])
        return numerical_sen
    print(numericalizer(de_test_voc, de_tokenizer, "Es ist ein er."))

.
0
0
72
16
136
5
[0, 72, 16, 136, 5]


In [167]:
import pandas as pd
def sentence_format(vocab, tokenizer):
    sen_format = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
    return sen_format

class TextDataset(Dataset):
    def __init__(self, pathfilename, Vocabulary, tokenizer, freq_threshold=5):
        self.path = path
        self.tokenizer = tokenizer
        #self.df = pd.read_csv(pathfilename, sep='\n', header=None)
        self.txt = open(pathfilename, 'r').read().split('\n')
        #print('df.loc', self.df.iloc[1])
        #print(self.txt)
        self.vocab = Vocabulary
        
    def __len__(self):
        return len(self.txt)
#         return len(self.df)
    

    def textnumericalizer(self, text):
        numerical_tok = self.tokenizer(text.lower())
        numerical_sen = []
        for tok in numerical_tok:
            if tok not in self.vocab.stoi:
                self.vocab.stoi[tok] = 0
            numerical_sen.append(self.vocab.stoi[tok])
        return numerical_sen
    
    # get a numeralized and format sentence
    # as "es ist ein Ei" -> tensor([2, 439, 72, 16, 0, 3])
    def __getitem__(self, batch_idx):
#         sentence = self.df.iloc[batch_idx]
        sentence = self.txt[batch_idx]
#         print(sentence)
#         sen_format = lambda x: [self.vocab['<BOS>']] + [self.vocab[token] 
#                                                         for token in tokenizer(x)] + [self.vocab['<EOS>']]
#         sen_format = [self.vocab['<BOS>']] + [self.vocab.[token]
#                                               for x in sentence 
#                                               for token in self.textnumericalizer(x)]+ [self.vocab['<EOS>']]
        sen_format = [self.vocab['<BOS>']]
        sen_format += self.textnumericalizer(sentence)
#         sen_format += self.textnumericalizer(sentence.to_string(index=False))
#         for x in sentence:
#             sen_format += self.textnumericalizer(x)
#             print(x, len(sentence))
        sen_format.append(self.vocab['<EOS>'])


#         print(sen_format)
        return torch.tensor(sen_format)

test_de_dataset = TextDataset(test_filepaths[0], de_test_voc, de_tokenizer)
print(next(iter(test_de_dataset)))


tensor([  2,  16,   0,  11,   6, 151,   0,  14,  21, 110,   0,   5,   3])


In [177]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        targets = []
        for idx in batch:
            targets.append(idx)
            #targets.append(torch.tensor(idx))
            
#         imgs = [item[0].unsqueeze(0) for item in batch]
#         imgs = torch.cat(imgs, dim=0)
#         targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)

        return targets
    
pad_idx = test_de_dataset.vocab.stoi['<PAD>']
print(pad_idx)

# def get_loader(
#     root_folder,
#     annotation_file,
#     transform,
#     batch_size=32,
#     num_workers=8,
#     shuffle=True,
#     pin_memory=True,
# ):
#     dataset = FlickrDataset(root_folder, annotation_file, transform=transform)

#     pad_idx = dataset.vocab.stoi["<PAD>"]

#     loader = DataLoader(
#         dataset=dataset,
#         batch_size=batch_size,
#         num_workers=num_workers,
#         shuffle=shuffle,
#         pin_memory=pin_memory,
#         collate_fn=MyCollate(pad_idx=pad_idx),
#     )

#     return loader, dataset

1


## Notice:
+ here in `DataLoader`, the `dataset` and `collate_fn` should be corresponding. if `dataset` is batchs of text, in `collate_fn` should use `numerlizer` or `tokenizer` to translate it into index, if `dataset` is batchs of index, `collate_fn` is fine, which means `collate_fn` should always return index lists.

In [178]:

text_loader = DataLoader(dataset=test_de_dataset, 
                    batch_size=9, 
                    shuffle=True, pin_memory=True, 
                    collate_fn=MyCollate(pad_idx=pad_idx)
                   )



print(next(iter(text_loader)))

# for idx, (label, text) in enumerate(text_loader):
#     model(item)

tensor([[  2,   2,   2,   2,   2,   2,   2,   2,   2],
        [ 65,   0,  65,  16,  22, 358,  65,  16,  22],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0],
        [212,   8,  51,   8,  41, 100,   0,  11,  31],
        [ 18,   6, 187,   6,   9,   0,  24,   6,   8],
        [  0,   0,   6,   0,   6,   9,   0, 101,  13],
        [479,  63,   0,  11, 101,   6,  73,   0,   0],
        [ 26,   0,  11,   0,   0,   0,   0,  10,  50],
        [ 21,   5, 446,   0,  14,  63,  25,   0,  47],
        [  0,   3,   0,   3,  39,  13,  13,  80,   0],
        [  5,   1,   5,   1,  19,   0,   0,   0,  10],
        [  3,   1,   3,   1,  40,   0,   0,   5, 281],
        [  1,   1,   1,   1,   0,   0,   0,   3,  16],
        [  1,   1,   1,   1,  10,   5,   5,   1,   0],
        [  1,   1,   1,   1, 189,   3,   3,   1,   5],
        [  1,   1,   1,   1,   9,   1,   1,   1,   3],
        [  1,   1,   1,   1,  19,   1,   1,   1,   1],
        [  1,   1,   1,   1, 134,   1,   1,   1,   1],
        [ 

# Assemble all

In [4]:
from torchtext.utils import extract_archive
path = '/home/sharma/Desktop/DeepLearning/Testing/Datasets/multi30k-dataset/data/task1/raw/'
train_files = ('train.de.gz', 'train.en.gz')
val_files = ('val.de.gz', 'val.en.gz')
test_files = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(path + file)[0] for file in train_files]
val_filepaths = [extract_archive(path + file)[0] for file in val_files]
test_filepaths = [extract_archive(path + file)[0] for file in test_files]

In [13]:
from torch.nn.utils.rnn import pad_sequence # for padding batch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
import io # for io.open
import torch

def build_vocab(filepath, tokenizer, min_freq):
    counter = Counter()
    with io.open(filepath) as f:
        for s_ in f:
            counter.update(tokenizer(s_))
    return Vocab(counter, specials=['<UNK>', '<PAD>', '<BOS>', '<EOS>'], min_freq=min_freq)



class TextDataset(Dataset):
    def __init__(self, pathfilename, Vocabulary, tokenizer, freq_threshold=5):
        self.path = path
        self.tokenizer = tokenizer
        self.txt = open(pathfilename, 'r').read().split('\n')
        self.vocab = Vocabulary
        
    def __len__(self):
        return len(self.txt)
    

    def textnumericalizer(self, text):
        numerical_tok = self.tokenizer(text.lower())
        numerical_sen = []
        for tok in numerical_tok:
            if tok not in self.vocab.stoi:
                self.vocab.stoi[tok] = 0
            numerical_sen.append(self.vocab.stoi[tok])
        return numerical_sen
    
    # get a numeralized and format sentence
    # as "es ist ein Ei" -> tensor([2, 439, 72, 16, 0, 3])
    def __getitem__(self, batch_idx):
        sentence = self.txt[batch_idx]
        sen_format = [self.vocab['<BOS>']]
        sen_format += self.textnumericalizer(sentence)
        sen_format.append(self.vocab['<EOS>'])

        return torch.tensor(sen_format)
    
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        targets = []
        for idx in batch:
            targets.append(idx)
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)

        return targets
    
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

de_test_voc = build_vocab(test_filepaths[0], de_tokenizer, 2)
en_test_voc = build_vocab(test_filepaths[1], en_tokenizer, 2)

test_de_dataset = TextDataset(test_filepaths[0], de_test_voc, de_tokenizer)
test_en_dataset = TextDataset(test_filepaths[1], en_test_voc, en_tokenizer)

pad_idx = test_de_dataset.vocab.stoi['<PAD>']

text_en_loader = DataLoader(dataset=test_en_dataset, 
                    batch_size=9, 
                    shuffle=True, pin_memory=True, 
                    collate_fn=MyCollate(pad_idx=pad_idx)
                    )
text_de_loader = DataLoader(dataset=test_de_dataset, 
                    batch_size=9, 
                    shuffle=True, pin_memory=True, 
                    collate_fn=MyCollate(pad_idx=pad_idx)
                   )


print(next(iter(text_en_loader)))
print(next(iter(text_de_loader)))
for idx, text in enumerate(text_de_loader):
    print(text)

tensor([[  2,   2,   2,   2,   2,   2,   2,   2,   2],
        [  9,  29,   4,  25,   4,  64,   4,  25,   4],
        [ 64, 162, 311,  16,  13, 114,  24,  35, 291],
        [ 86, 123,  12,  40,  14,  16,  36,  92,   8],
        [ 21,   8,  49, 551,   4,  49,  12,   0,   4],
        [ 63,   4,  20,   0, 368,  58,   0,  21,  52],
        [ 14,  60, 612,  15,   8,  20,   4,   0, 133],
        [  4, 212,   0,   0,  28,   9,  62,   9,  12],
        [166, 730, 309,   0, 145, 527,   8,   0,   0],
        [ 62,  67,   4, 458, 465,  11,  53,  93,  58],
        [  8,   0, 336,   0,   4, 835,   6, 100, 308],
        [203,   6,   6,  15, 736,   0,   3, 645,  18],
        [145,   3,   3, 375,  15, 193,   1,   6, 138],
        [ 21,   1,   1,   6, 177,  11,   1,   3,  10],
        [ 16,   1,   1,   3,   6, 149,   1,   1,   4],
        [ 96,   1,   1,   1,   3,  16,   1,   1,  77],
        [ 68,   1,   1,   1,   1,  78,   1,   1,   6],
        [807,   1,   1,   1,   1,   8,   1,   1,   3],
        [1