In [18]:
import tensorflow as tf
from Dive_into_deep_learning.d2l import tensorflow as d2l

In [19]:
@d2l.add_to_class(d2l.TimeMachine)
def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
    """Build shifted-by-one (input, target) token sequences from the corpus.

    Every window of ``num_steps + 1`` consecutive corpus entries is split into
    an input row (its first ``num_steps`` entries) and a target row (its last
    ``num_steps`` entries). The rows are stored as ``self.X`` and ``self.Y``.

    Args:
        batch_size: Not used directly in this method.
        num_steps: Length of each input/target sequence.
        num_train: Not used directly in this method; ``get_dataloader`` reads
            ``self.num_train``.
        num_val: Not used directly in this method; ``get_dataloader`` reads
            ``self.num_val``.
    """
    super(d2l.TimeMachine, self).__init__()
    # NOTE(review): presumably stores the constructor arguments as attributes
    # (get_dataloader reads self.num_train / self.num_val) — confirm against d2l.
    # Keep this call ahead of the local assignments below in case the helper
    # inspects the caller's local variables.
    self.save_hyperparameters()
    corpus, self.vocab = self.build(self._download()) # corpus is the integer representation of the tokens
    # One window of num_steps + 1 entries per start position i; the range ends
    # at the last start for which a full window still fits inside the corpus.
    array = tf.constant([corpus[i: i + num_steps + 1]
                         for i in range(len(corpus) - num_steps)])
    # Drop the last column for the inputs and the first column for the targets,
    # so each row of Y is the matching row of X shifted by one position.
    self.X, self.Y = array[:, :-1], array[:, 1:]

In [20]:
@d2l.add_to_class(d2l.TimeMachine)
def get_dataloader(self, train):
    """Return a loader over the training or the validation slice of (X, Y).

    Args:
        train: If truthy, select the first ``self.num_train`` examples;
            otherwise select the ``self.num_val`` examples that follow them.
            The flag is also forwarded unchanged to ``get_tensorloader``.

    Returns:
        Whatever ``self.get_tensorloader`` returns for the selected slice.
    """
    if train:
        first, last = 0, self.num_train
    else:
        # The validation examples start right after the training examples.
        first, last = self.num_train, self.num_train + self.num_val
    return self.get_tensorloader([self.X, self.Y], train, slice(first, last))

In [21]:
# Build the dataset with minibatches of 2 sequences, each 10 tokens long.
data = d2l.TimeMachine(batch_size=2, num_steps=10)
# Show only the first training minibatch; Y is X shifted by one token.
for X, Y in data.train_dataloader():
    print('X:', X, '\nY:', Y)
    break

X: tf.Tensor(
[[14  6  0 21  9  6 19  6  0 10]
 [ 0 13  2 17 20  6  5  0 10 15]], shape=(2, 10), dtype=int32) 
Y: tf.Tensor(
[[ 6  0 21  9  6 19  6  0 10 20]
 [13  2 17 20  6  5  0 10 15 21]], shape=(2, 10), dtype=int32)


2023-08-04 13:48:09.052677: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
# Number of entries in the vocabulary; use the built-in len() instead of
# calling the __len__ dunder directly.
len(data.vocab)

28

In [23]:
import random

def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """Generate minibatches of subsequences using random sampling.

    The corpus is cut at a random offset, partitioned into non-overlapping
    subsequences of ``num_steps`` items, and the subsequences are shuffled
    before being grouped into minibatches. Subsequences that land in the same
    or in neighbouring minibatches are therefore not necessarily adjacent in
    the original sequence.

    Args:
        corpus: Sliceable sequence of tokens (the demo below passes a list of ints).
        batch_size: Number of subsequences per minibatch.
        num_steps: Number of items in each subsequence.

    Yields:
        ``(X, Y)`` pairs of ``tf.constant`` tensors. Each is built from
        ``batch_size`` subsequences of ``num_steps`` items, and ``Y`` holds the
        items one position after those in ``X``.
    """
    # Discard a random prefix of 0 .. num_steps - 1 items so that the
    # partition boundaries differ from call to call.
    corpus = corpus[random.randint(0, num_steps - 1):]
    # One item is held back because every subsequence needs a label sequence
    # that is shifted by one position.
    subseq_count = (len(corpus) - 1) // num_steps
    # Start index of every subsequence, visited in random order.
    starts = [k * num_steps for k in range(subseq_count)]
    random.shuffle(starts)

    for batch_no in range(subseq_count // batch_size):
        # Start indices of the subsequences that make up this minibatch.
        chosen = starts[batch_no * batch_size: (batch_no + 1) * batch_size]
        features = [corpus[pos: pos + num_steps] for pos in chosen]
        labels = [corpus[pos + 1: pos + 1 + num_steps] for pos in chosen]
        yield tf.constant(features), tf.constant(labels)

In [24]:
# Toy corpus 0..34: every value equals its own position, which makes the
# sampled subsequences easy to read in the printed output.
my_seq = list(range(35))
# Print every (X, Y) minibatch produced by the random-sampling iterator.
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tf.Tensor(
[[15 16 17 18 19]
 [10 11 12 13 14]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[16 17 18 19 20]
 [11 12 13 14 15]], shape=(2, 5), dtype=int32)
X:  tf.Tensor(
[[0 1 2 3 4]
 [5 6 7 8 9]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]], shape=(2, 5), dtype=int32)
X:  tf.Tensor(
[[25 26 27 28 29]
 [20 21 22 23 24]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[26 27 28 29 30]
 [21 22 23 24 25]], shape=(2, 5), dtype=int32)


In [25]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    """Generate minibatches of subsequences using sequential partitioning.

    The corpus is cut at a random offset and laid out as ``batch_size`` rows.
    Consecutive minibatches are consecutive column windows of that layout, so
    row ``r`` of one minibatch continues where row ``r`` of the previous
    minibatch stopped.

    Args:
        corpus: Sliceable sequence of tokens (the demo below passes a list of ints).
        batch_size: Number of rows (parallel subsequences) per minibatch.
        num_steps: Number of items per subsequence in each minibatch.

    Yields:
        ``(X, Y)`` pairs of tensors sliced to at most ``num_steps`` columns,
        where ``Y`` holds the items one position after those in ``X``.
    """
    # Start the partition at a random offset in 0 .. num_steps (inclusive).
    offset = random.randint(0, num_steps)
    # Hold back one trailing item for the labels, then round down to a
    # multiple of batch_size so the items reshape into batch_size equal rows.
    num_tokens = (len(corpus) - offset - 1) // batch_size * batch_size
    label_start = offset + 1
    flat_inputs = tf.constant(corpus[offset: offset + num_tokens])
    flat_labels = tf.constant(corpus[label_start: label_start + num_tokens])
    Xs, Ys = (tf.reshape(flat, (batch_size, -1))
              for flat in (flat_inputs, flat_labels))
    for batch_no in range(Xs.shape[1] // num_steps):
        # Column window covered by this minibatch.
        window = slice(batch_no * num_steps, (batch_no + 1) * num_steps)
        yield Xs[:, window], Ys[:, window]

In [26]:
# Print every (X, Y) minibatch produced by the sequential-partitioning iterator
# on the toy corpus my_seq defined in an earlier cell.
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tf.Tensor(
[[ 4  5  6  7  8]
 [19 20 21 22 23]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[ 5  6  7  8  9]
 [20 21 22 23 24]], shape=(2, 5), dtype=int32)
X:  tf.Tensor(
[[ 9 10 11 12 13]
 [24 25 26 27 28]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[10 11 12 13 14]
 [25 26 27 28 29]], shape=(2, 5), dtype=int32)
X:  tf.Tensor(
[[14 15 16 17 18]
 [29 30 31 32 33]], shape=(2, 5), dtype=int32) 
Y: tf.Tensor(
[[15 16 17 18 19]
 [30 31 32 33 34]], shape=(2, 5), dtype=int32)
