В книге о NLP на Pytorch предлагается делать следующие классы для загрузки данных:

* Vocabulary: класс, который осуществляет преобразование слова в число. Сейчас этот класс будет просто оберткой над словарем. 

* Vectorizer: класс, осуществляющий преобразование текстовой строки в последовательность чисел. Также он гарантирует, что эти последовательности чисел будут иметь одну и ту же длину (сейчас это реализовано очень костыльно, в будущем нужно будет сделать гибкое изменение размера). Осуществляет часть работы "подсчет частот встречаемости слов, выкидывание редких слов и создание Vocabulary"

* Dataset: это просто геттер, который возвращает одиночные семплы. 

* Dataloader: это обертка над Dataset, которая делает батчи

In [12]:
class Vocabulary(object):
    # TODO: use DefaultDict instead of Dict
    # TODO: use '[]' operators instead of add_token and lookup_token methods
    # TODO: implement special tokens for numbers (<NUM> and maybe <YEAR>)
    
    def __init__(self, token_to_idx=None, 
                 add_unk=True, unk_token='<UNK>', 
                 add_start_end=True, start_token='<START>', end_token='<END>'):
        
        if token_to_idx is None:
            token_to_idx = dict()
            
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self._add_start_end = add_start_end
        self._start_token = start_token
        self._end_token = end_token
        
        self.unk_index = -1
        if self._add_unk:
            self.unk_index = self.add_token(unk_token)
        
        self._start_index = -1
        self._end_index = -1
        if self._add_start_end:
            self._start_index = self.add_token(start_token)
            self._end_index = self.add_token(end_token)
            
    def add_token(self, token):
        """
        Update mapping dicts basen on the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """
        Retrieve the index associated with the token
        or the UNK index if token isn't present
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        """
        Return the token associated with the index
        """
        if index not in self._idx_to_token:
            raise KeyError(f'the index {index} is not in the Vocabulary')
        else:
            return self._idx_to_token[index]

In [97]:
import numpy as np
import re
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm

def split(text):
    return re.findall(r'[\w\d]+', text.lower())

In [123]:
class Vectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, train_vocab: Vocabulary, num_samples: int):
        self.train_vocab = train_vocab
        self.num_samples = num_samples
        
    def vectorize(self, text: list):
        """
        Create a numpy vector with indices of tokens in texts
        """
        vectorized_texts = []
        
        max_text_len = 100 # TODO: don't use particular number
        
        vectorized_text = [self.train_vocab._start_index] # TODO: implement case without start and end tokens
            
        for word in split(text):
            word_idx = self.train_vocab.lookup_token(word)
            vectorized_text.append(word_idx)
        
        vectorized_text.append(self.train_vocab._end_index)
        vectorized_text.extend([0] * (max_text_len - len(vectorized_text)))
        
        return vectorized_text
        
    
    @classmethod
    def from_text_file(cls, path_to_file: Path, cutoff=1):
        """
        Instantiate the vectorizer from text file
        Words with frequency equal of less than cutoff won't be added in words dictionary        
        """
        word_count = defaultdict(int)
        num_samples = 0
        
        with path_to_file.open() as f:
            for line in tqdm(f.readlines()):
                num_samples += 1
                for word in split(line):
                    word_count[word] += 1
        
        cutted_word_dict = dict()
        
        for word in word_count.keys():
            if word_count[word] > cutoff:
                cutted_word_dict[word] = word_count[word]

        
        return cls(train_vocab=Vocabulary(cutted_word_dict), num_samples=num_samples)

In [124]:
path_to_data = Path('/Datasets/Wikipedia/data for small model/test.txt')

vectorizer = Vectorizer.from_text_file(path_to_data)

100%|██████████| 1516335/1516335 [00:12<00:00, 121108.09it/s]


In [91]:
from pathlib import Path
import pickle
import os

import linecache
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [125]:
class FastDataset(Dataset):
    def __init__(self, filename, vectorizer):
        self._filename = str(filename)
        self._vectorizer = vectorizer
        self._total_data = self.rawcount(filename)

    def __getitem__(self, idx):
        line = ' '.join(split(linecache.getline(self._filename, idx + 1)))
        line_vectorized = self._vectorizer.vectorize(line)
        
        return {'raw_text': line,
                'forward_target': np.array(line_vectorized[1:] + [0]),
                'backward_target': np.array([0] + line_vectorized[:-1])
               }
      
    def __len__(self):
        return self._total_data
    
    
    def rawcount(self, filename):
        f = open(filename, 'rb')
        lines = 0
        buf_size = 1024 * 1024
        read_f = f.raw.read

        buf = read_f(buf_size)
        while buf:
            lines += buf.count(b'\n')
            buf = read_f(buf_size)

        return lines

In [126]:
train_dataset = FastDataset(path_to_data, vectorizer)

In [128]:
%timeit train_dataset[np.random.randint(100000)]

39.1 µs ± 132 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [129]:
dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

In [130]:
for a in dataloader:
    break

In [131]:
a['raw_text']

['john lindow says that the poem may describe a mix of the destruction of the race of giants and of humans as in ragnarök but that many of the predictions of disruption on earth could also fit the volcanic activity that is so common in iceland',
 'the canadian shield also contains the mackenzie dike swarm which is the largest dike swarm known on earth',
 'from 1930 until his death in 1953 abdulaziz ruled saudi arabia as an absolute monarchy',
 'the u s sister city program began in 1956 when president dwight d eisenhower proposed a people to people citizen diplomacy initiative',
 'a further development of this approach is programmable radio output processing where the parameters of the multiband compressor automatically change between different settings according to the current programme block style or the time of day',
 'insignia is found on military hats or berets on the right and left shoulder on the uniform of all soldiers of the armed forces',
 'jackson ritter and the satellite are

In [132]:
a['forward_target']

tensor([[  15987,      21,    2245,  ...,       0,       0,       0],
        [2504847,    4524,     779,  ...,       0,       0,       0],
        [ 188454,    1490,   21188,  ...,       0,       0,       0],
        ...,
        [    386,  409126,   83114,  ...,       0,       0,       0],
        [ 157105,   21741,   18940,  ...,       0,       0,       0],
        [ 927700,    2866,     394,  ...,       0,       0,       0]])

In [133]:
a['backward_target']

tensor([[      0,  236888,   15987,  ...,       0,       0,       0],
        [      0,  236888, 2504847,  ...,       0,       0,       0],
        [      0,  236888,  188454,  ...,       0,       0,       0],
        ...,
        [      0,  236888,     386,  ...,       0,       0,       0],
        [      0,  236888,  157105,  ...,       0,       0,       0],
        [      0,  236888,  927700,  ...,       0,       0,       0]])

Вот еще пример функции, которую будет полезно встроить в процесс обучения нашей модели:
~~~
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict
~~~