In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
# Show the logical devices TensorFlow has available in this session.
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [3]:
# Fetch the Shakespeare corpus through Keras' download helper and read the
# whole file into memory as a single string.
shakespear_url = 'https://homl.info/shakespeare'
filepath = keras.utils.get_file('shakespeare.txt', shakespear_url)
# Decode explicitly as UTF-8 so the text read here does not depend on the
# platform's default locale encoding.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

In [4]:
# Character-level tokenizer: char_level=True encodes the text one character at a time.
# Text is lower-cased by default unless lower=False is passed.
# The n character ids run from 1 to n (not 0 to n-1), which is why 1 is
# subtracted from the whole encoded sequence later on.
# NOTE(review): recent TensorFlow releases document this Tokenizer as deprecated
# in favor of keras.layers.TextVectorization — confirm against the installed version.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) 
tokenizer.fit_on_texts([shakespeare_text])

In [5]:
tokenizer.texts_to_sequences(['First']) # encode the word "First" as a sequence of character ids

[[20, 6, 9, 8, 3]]

In [6]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]]) # decode the id sequence back into text (characters come back space-separated)

['f i r s t']

In [7]:
# Encode the whole corpus as character ids and shift every id down by one so
# that ids start at 0 (the tokenizer numbers characters from 1).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1

In [8]:
# For comparison: the raw ids straight from the tokenizer, before subtracting 1.
np.array(tokenizer.texts_to_sequences([shakespeare_text]))

array([[20,  6,  9, ..., 21, 27, 11]])

In [10]:
[encoded] # the ids after subtracting 1 from each one

[array([19,  5,  8, ..., 20, 26, 10])]

In [16]:
encoded.shape # the sequence is a little over one million characters long

(1115394,)

In [24]:
# Window geometry: each window holds n_steps input characters plus one extra,
# because the target is the input shifted forward by one character (the
# windows are split into [:, :-1] inputs and [:, 1:] targets further down).
# An RNN cannot learn patterns longer than n_steps, so do not make it too small.
n_steps = 100
window_length = n_steps + 1

# Use the first 90% of the characters for training; floor division keeps
# train_size an integer so it can be used as a slice bound.
dataset_size = len(encoded)
train_size = (90 * dataset_size) // 100

# Slide a window over the training characters one position at a time
# (shift=1): the first window covers characters 0..100, the second 1..101,
# and so on. drop_remainder=True discards the trailing windows that would be
# shorter than window_length.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size]).window(
    window_length, shift=1, drop_remainder=True
)

In [25]:
# Flatten the dataset of windows: each window is itself a small dataset, so
# batching it to window_length turns it into one tensor per window.
dataset = dataset.flat_map(lambda chars: chars.batch(window_length))

In [26]:
batch_size = 32
# Shuffle the windows through a 10,000-element buffer, then group them into
# batches. The fixed seed makes the shuffle order repeatable across a
# Restart & Run All instead of changing on every run.
dataset = dataset.shuffle(10000, seed=42).batch(batch_size)
# Split every batch of windows into (inputs, targets): inputs drop the last
# character of each window, targets drop the first, so each target sequence is
# its input shifted forward by one character.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))