### Tokenization

In [1]:
import re
from tensorflow.keras.preprocessing.text import Tokenizer # 导入分词器

2023-07-15 14:08:42.717400: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [36]:
# 读取文本
with open('./data/aesop/data.txt', 'r',encoding='utf-8') as f:
    text = f.read()

start = text.find('THE FOX AND THE GRAPES')
end = text.find('ILLUSTRATIONS')
text = text[start:end]

In [39]:
# 清洗文本
seq_length = 20
start_story = '| ' * seq_length
    
text = start_story + text
text = text.lower()
text = text.replace('\n\n\n\n\n', start_story)
text = text.replace('\n', ' ')
text = re.sub('  +', '. ', text).strip()
text = text.replace('..', '.')

text = re.sub('([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~])', r' \1 ', text)
text = re.sub('\s{2,}', ' ', text)

In [40]:
len(text)

83

In [27]:
# 分词
tokenizer = Tokenizer(filters='',char_level=False) # char_level用于控制分词粒度，是否将把每一个字符作为一个词
tokenizer.fit_on_texts([text]) # 读取文本
vocab_size = len(tokenizer.word_index) + 1 # 词汇表大小
token_list = tokenizer.texts_to_sequences([text])[0] # 将文本转换为数字列表

In [32]:
# 清理后的文本
vocab_size

5026

In [41]:
# 通过tokenizer.word_index可以查看每个词对应的数字
print(tokenizer.word_index)

{'.': 1, '|': 2, 'the': 3, ',': 4, 'and': 5, 'a': 6, 'to': 7, 'of': 8, 'he': 9, 'his': 10, 'in': 11, '"': 12, 'you': 13, 'was': 14, 'him': 15, 'for': 16, 'it': 17, 'that': 18, 'with': 19, 'but': 20, 'at': 21, 'they': 22, 'as': 23, 'said': 24, 'i': 25, '-': 26, ':': 27, 'by': 28, 'on': 29, 'be': 30, 'when': 31, ';': 32, 'so': 33, 'one': 34, 'is': 35, 'had': 36, 'all': 37, 'them': 38, 'up': 39, 'not': 40, 'who': 41, 'lion': 42, 'her': 43, 'fox': 44, 'out': 45, 'your': 46, 'were': 47, 'have': 48, 'are': 49, 'me': 50, 'which': 51, 'no': 52, 'ass': 53, 'this': 54, 'their': 55, 'man': 56, 'my': 57, 'or': 58, 'if': 59, 'an': 60, 'wolf': 61, 'from': 62, 'she': 63, 'into': 64, 'do': 65, '!': 66, 'about': 67, 'time': 68, 'came': 69, '?': 70, 'there': 71, 'will': 72, 'upon': 73, 'gutenberg': 74, 'what': 75, 'then': 76, 'any': 77, 'himself': 78, 'some': 79, 'would': 80, 'day': 81, 'once': 82, 'project': 83, 'than': 84, 'other': 85, 'very': 86, 'work': 87, 'dog': 88, 'went': 89, 'could': 90, 'away'

In [42]:
token_list[:20]

[2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1]

In [43]:
len(tokenizer.word_counts)

5025

In [10]:
# 生成训练数据
import numpy as np
from tensorflow.keras.utils import to_categorical

In [44]:
def generate_sequences(token_list, step):
    X = []
    y = []
    # 生成一个0-token_list - seq_length的列表
    for i in range(0, len(token_list) - seq_length,step):
        # 根据i生成一个seq_length长度的序列
        X.append(token_list[i:i+seq_length])
        # 根据i+seq_length生成下一个字符
        y.append(token_list[i+seq_length])
    # 利用to_categorical将y转换为one-hot编码
    y = to_categorical(y, num_classes=vocab_size)

    # 获取sequences的长度
    num_seq = len(X)
    print('Number of sequences:', num_seq)
    return np.array(X), np.array(y), num_seq

In [45]:
# 生成训练数据
step = 1
X, y, num_seq = generate_sequences(token_list, step)

X.shape

Number of sequences: 106662


(106662, 20)

In [13]:
# 构建LSTM网络
from tensorflow.keras.layers import Dense, LSTM, Embedding,Input,Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [14]:
# 定义超参数
n_units = 256 # LSTM神经元数量
embedding_size = 100 # 词向量维度

In [15]:
# 构建模型
text_in = Input(shape=(None,)) # 输入的实际是to_categorical后的X
x = Embedding(vocab_size, embedding_size)(text_in)
x = LSTM(n_units)(x)
x = Dropout(0.2)(x)
text_out = Dense(vocab_size, activation='softmax')(x)

2023-07-15 14:08:59.775724: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-15 14:08:59.836973: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-15 14:08:59.837064: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-15 14:08:59.839272: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-15 14:08:59.839334: I tensorflow/compile

In [16]:
model = Model(text_in, text_out)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         502600    
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 5026)              1291682   
                                                                 
Total params: 2159850 (8.24 MB)
Trainable params: 2159850 (8.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
optimizer = Adam(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)



In [18]:
# 训练模型
history = model.fit(X, y, batch_size=32, epochs=100)

2023-07-15 14:09:04.007453: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2144332848 exceeds 10% of free system memory.
2023-07-15 14:09:06.441100: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2144332848 exceeds 10% of free system memory.


Epoch 1/100


2023-07-15 14:09:10.890086: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8801
2023-07-15 14:09:11.274027: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f0cdc041cf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-15 14:09:11.274097: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2023-07-15 14:09:11.320459: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-15 14:09:11.568956: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [46]:
# 定义temprature采样函数
def sample_with_temp(preds,temprature):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temprature # 计算log，并除以temprature
    exp_preds = np.exp(preds) # 计算指数
    preds = exp_preds / np.sum(exp_preds) # 计算概率
    probas = np.random.multinomial(1, preds, 1) # 采样
    return np.argmax(probas) # 返回采样结果

In [53]:
# 定义文本生成函数
def generate_text(seed_text,next_words,max_sequence_length,temp):
    output_text = seed_text # 输出文本初始化
    seed_text = start_story + seed_text # 添加开始

    # 遍历所有的下个词
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_length:] # 获取最后max_sequence_length个词
        token_list = np.array(token_list).reshape(1,max_sequence_length) # 转换为numpy数组，大小为：1,max_sequence_length

        probs = model.predict(token_list)[0] # 预测下一个词的概率
        y_class = sample_with_temp(probs,temp) # 采样下一个词

        if y_class == 0:
            output_word = ''
        else:
            output_word = tokenizer.index_word[y_class]
        if output_word == "|":
            break
        output_text += output_word + ' '
        seed_text += output_word + ' '
    return output_text

### Temprature = 0.2

In [57]:
# 生成文本
seed_text = "the frog and the snake ."
gen_words = 500

print (generate_text(seed_text, gen_words, seq_length, temp = 0.2))




  preds = np.log(preds) / temprature # 计算log，并除以temprature


the frog and the snake .fell . fell . deeply . in . the . height . of . a . bald . man . and . bit . him . in . his . eagerness . to . kill . it , . he . hit . upon . a . lion's . voice . just . then , . when . he . came . to . look . at . it , . he . made . a . great . respect . for . her . nest . at . his . leisure . he . sprang . upon . the . water . and . devoured . them . off . many . as . they . had . to . do . without . their . way , . till . he . begged . to . be . taken . and . said , . "i . think . you . are . or . shall . have . some . sport . at . me , . for . i . can . see . how . thin . i . am . to . work . with . the . most . fighting . christianity . of . falling . rome . or . with . the . most . heathen . traditions . hidden . in . the . hills . of . wales . but . the . word . "mappe" . or . "malory" . will . always . mean . king . arthur ; . even . though . we . find . older . and . better . origins . than . the . mabinogian ; . or . write . later . and . worse . vers

### Tempratrue = 1.0

In [58]:
generate_text(seed_text, gen_words, seq_length, temp = 1.0)



  preds = np.log(preds) / temprature # 计算log，并除以temprature




'the frog and the snake .fell . when . the . lion . set . up . a . golden . axe , . asked . him . if . he . would . be . the . wolf . and . his . dog . "your . conduct . in . hand , . all . the . birds . i . will . never . be . so . greedy : . the . beaten . of . the . earth . as . a . poor . man . as . long . as . he . could . to . be . the . woodman . was . following . his . ear . and . with . all . the . whole . thus . they . were . doing . very . much . frightened , . and . that . he . gave . out . quite . easily . again . when . a . man . hired . there . unable . to . feast . all . his . own . way , . and . then . they . shared . to . the . wolf . in . which . the . farmer . asked . it . with . this . to . be . persuaded . by . water , . and . produced . the . swan , . at . last . he . met . the . door . to . the . file . and . made . it . with . him . a . measure . of . wheat , . whereas . the . dog . was . engaged . in . getting . into . it . when . he . cried . in . this . cond