In [1]:
import collections
import re
from d2l import torch as d2l

In [2]:
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():  #@save
    """将时间机器数据集加载到文本行的列表中"""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()
print(f'# 文本总行数: {len(lines)}')
print(lines[0])
print(lines[10])

# 文本总行数: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [3]:
def read_time_test():
    """将时间机器数据集加载到文本行的列表中"""
    with open('../data/timemachine.txt', 'r') as f:
        lines = f.readlines()
    return lines
lines_text = read_time_test()
print(f'# 文本总行数: {len(lines_text)}')
for line_text in lines_text:
    print(line_text)

# 文本总行数: 3221
The Time Machine, by H. G. Wells [1898]









I





The Time Traveller (for so it will be convenient to speak of him)

was expounding a recondite matter to us. His grey eyes shone and

twinkled, and his usually pale face was flushed and animated. The

fire burned brightly, and the soft radiance of the incandescent

lights in the lilies of silver caught the bubbles that flashed and

passed in our glasses. Our chairs, being his patents, embraced and

caressed us rather than submitted to be sat upon, and there was that

luxurious after-dinner atmosphere when thought roams gracefully

free of the trammels of precision. And he put it to us in this

way--marking the points with a lean forefinger--as we sat and lazily

admired his earnestness over this new paradox (as we thought it)

and his fecundity.



'You must follow me carefully. I shall have to controvert one or two

ideas that are almost universally accepted. The geometry, for

instance, they taught you at school is 

In [4]:
def tokenize(lines, token='word'):
    """将文本行拆分为单词或字符列表"""
    if token == 'word':
        return[line.split() for line in lines]
    elif token == 'char':
        return[list(line) for line in lines]
    else:
        print('错误：未知的`token`类型：', + token)

tokens = tokenize(lines_text)
for i in range(10):
    print(tokens[i])

['The', 'Time', 'Machine,', 'by', 'H.', 'G.', 'Wells', '[1898]']
[]
[]
[]
[]
['I']
[]
[]
['The', 'Time', 'Traveller', '(for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him)']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us.', 'His', 'grey', 'eyes', 'shone', 'and']


In [5]:
def count_corpus(tokens):
    """统计词频"""
    # 这里的`tokens`是一个单词列表的列表
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 将次元列表展平为一个列表
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens) 
    # 返回一个字典; Counter 为我们提供了一个字典，其中键是 tokens 中的元素，值是这些元素在 tokens 中的出现次数。


class Vocab:
    """文本词表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None: 
            reserved_tokens = []
        # 按出现的频率排序
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # 未知词的索引为0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token) 
                self.token_to_idx[token] = len(self.idx_to_token) - 1
    def __len__(self): 
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_token(self, indexes):
        if not isinstance(indexes, (list, tuple)):
            return self.idx_to_token[indexes] 
        return [self.idx_to_token[index] for index in indexes]
    
    @property
    def unk(self): 
        return 0
    @property
    def token_freqs(self): 
        return self._token_freqs

In [6]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])

[('<unk>', 0), ('the', 1), ('I', 2), ('of', 3), ('and', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('my', 9)]


In [7]:
for i in [0, 10]:
    print('词', vocab.idx_to_token[i], '的索引为', i)
    print('文本：', tokens[i])
    print('索引：', vocab[tokens[i]])

词 <unk> 的索引为 0
文本： ['The', 'Time', 'Machine,', 'by', 'H.', 'G.', 'Wells', '[1898]']
索引： [16, 27, 330, 32, 2493, 2494, 2495, 2496]
词 that 的索引为 10
文本： ['twinkled,', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated.', 'The']
索引： [2500, 4, 22, 1034, 462, 168, 7, 1469, 4, 1470, 16]


In [8]:
for i in range(0, 10):
    print('词', vocab.idx_to_token[i], '的索引为', i)
    print('文本：', tokens[i])
    print('索引：', vocab[tokens[i]])

词 <unk> 的索引为 0
文本： ['The', 'Time', 'Machine,', 'by', 'H.', 'G.', 'Wells', '[1898]']
索引： [16, 27, 330, 32, 2493, 2494, 2495, 2496]
词 the 的索引为 1
文本： []
索引： []
词 I 的索引为 2
文本： []
索引： []
词 of 的索引为 3
文本： []
索引： []
词 and 的索引为 4
文本： []
索引： []
词 a 的索引为 5
文本： ['I']
索引： [2]
词 to 的索引为 6
文本： []
索引： []
词 was 的索引为 7
文本： []
索引： []
词 in 的索引为 8
文本： ['The', 'Time', 'Traveller', '(for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him)']
索引： [16, 27, 112, 2497, 54, 12, 105, 41, 640, 6, 784, 3, 2498]
词 my 的索引为 9
文本： ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us.', 'His', 'grey', 'eyes', 'shone', 'and']
索引： [7, 1468, 5, 2499, 785, 6, 402, 365, 403, 201, 404, 4]


In [9]:
def load_corpus_time_machine(max_tokens=-1):
    """返回时光机器数据集的词表和词元索引列表"""
    lines = read_time_machine() 
    tokens = tokenize(lines, token= 'char')
    vocab = Vocab(tokens)
    # 因为时光机器数据集中每个文本行不一定是一个句子或一个段落
    # 所以将所有文本展平到一个列表
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens] 
    return vocab, corpus

vocab, corpus = load_corpus_time_machine()
len(vocab), len(corpus) 
    

(28, 170580)

In [18]:
print(list(vocab.token_to_idx.items())[0:10])
print(list(vocab.token_to_idx.values())[0:10])
print(list(vocab.token_freqs)[0:10])
print(list(vocab.idx_to_token))
print(corpus[0:10])

[('<unk>', 0), (' ', 1), ('e', 2), ('t', 3), ('a', 4), ('i', 5), ('n', 6), ('o', 7), ('s', 8), ('h', 9)]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[(' ', 29927), ('e', 17838), ('t', 13515), ('a', 11704), ('i', 10138), ('n', 9917), ('o', 9758), ('s', 8486), ('h', 8257), ('r', 7674)]
['<unk>', ' ', 'e', 't', 'a', 'i', 'n', 'o', 's', 'h', 'r', 'd', 'l', 'm', 'u', 'c', 'f', 'w', 'g', 'y', 'p', 'b', 'v', 'k', 'x', 'z', 'j', 'q']
[3, 9, 2, 1, 3, 5, 13, 2, 1, 13]
