In [1]:
import numpy as np 
import pandas as pd
import os 
import sys
import time
import tensorflow as tf
from tensorflow import keras
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')
# Record the TensorFlow version this notebook is running against.
print(tf.__version__)

2.3.1


## 一、导入文本

In [2]:
# Path to the training corpus, relative to the notebook's working directory.
input_filepath='./shakespeare.txt'
# Read the whole corpus into memory. The context manager closes the file
# handle, and the explicit encoding keeps decoding independent of the
# platform's default locale.
with open(input_filepath,'r',encoding='utf-8') as corpus_file:
    text=corpus_file.read()

## 二、数据转换

'''
 数据转换要做四件事情：
    1、生成一个词汇表
    2、将char映射成为数字id
    3、将data数据转换成数字id
    4、预测字符的下一个字符
'''

In [5]:
# Build the vocabulary: every distinct character of the corpus, de-duplicated
# and then sorted.
vocab = list(set(text))
vocab.sort()

In [6]:
# Number of distinct characters in the corpus.
print(len(vocab))

65


In [7]:
# Show the sorted vocabulary itself.
print(vocab)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
# Character -> integer id lookup; each id is the character's position in the
# sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [9]:
# Convert the vocabulary into a NumPy array so an integer id can be used
# directly as an index to recover its character.
idx2char = np.array(vocab)

In [10]:
# Show the id -> character lookup array.
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [11]:
# Encode the whole corpus as an array of integer ids, one id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [12]:
# Sanity check: the first ten ids next to the first ten characters they encode.
print(text_as_int[:10])
print(text[:10])

[18 47 56 57 58  1 15 47 58 47]
First Citi


## 三、定义输入和输出

In [13]:
def split_input_target(id_text):
    """Split one sequence into an (input, target) pair shifted by one step.

    The input drops the last element and the target drops the first, so
    ``target[i]`` is the element that follows ``input[i]``.
    Example: ``abcde -> (abcd, bcde)``.
    """
    input_part = id_text[:-1]
    target_part = id_text[1:]
    return input_part, target_part

In [14]:
# Wrap the encoded corpus in a tf.data.Dataset that yields one id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [15]:
seq_length = 100
# Chunk the id stream into pieces of seq_length + 1 ids: the extra id lets each
# chunk be split into an input and a one-step-shifted target of length
# seq_length. A trailing chunk that is too short is dropped.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [16]:
# Turn every chunk into an (input, target) pair.
# NOTE: this rebinds seq_dataset from itself, so the cell is not idempotent;
# re-run the batching cell first instead of re-running this cell on its own.
seq_dataset = seq_dataset.map(split_input_target)

In [17]:
batch_size = 64
buffer_size = 10000
# Shuffle the (input, target) pairs with a buffer of buffer_size elements, then
# group them into fixed-size training batches; an incomplete final batch is
# dropped so every batch has exactly batch_size rows.
seq_dataset = seq_dataset.shuffle(buffer_size)
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)

## 四、构建模型

In [19]:
# Number of distinct characters; used for the embedding input size and the
# width of the output layer.
vocab_size = len(vocab)
# Size of each character embedding vector.
embedding_dim = 256
# Number of units in the recurrent layer.
rnn_units = 1024

In [20]:
# Model factory.
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
    """Build the character-level model: Embedding -> SimpleRNN -> Dense.

    Args:
        vocab_size: Number of distinct characters; sets the embedding input
            size and the number of output logits per step.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the recurrent layer.
        batch_size: Fixed batch dimension of the model input. It must be
            fixed because the recurrent layer is stateful.

    Returns:
        An uncompiled ``keras.models.Sequential`` whose output has shape
        ``(batch_size, sequence_length, vocab_size)`` and holds raw logits
        (the final Dense layer has no activation).
    """
    embedding = keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    # stateful=True keeps the hidden state between successive calls, so that
    # generation can feed one character at a time and still carry context
    # until reset_states() is called. Without it every call starts from a
    # zero state.
    recurrent = keras.layers.SimpleRNN(
        units=rnn_units,
        return_sequences=True,
        stateful=True)
    # One logit per vocabulary entry at every time step.
    logits = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, logits])

In [21]:
# Training model: uses the same batch size as the training dataset.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

In [24]:
# Print the per-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Pull a single batch from the dataset and run it through the model to check
# the output shape. The loop variables and example_batch_predictions stay
# bound after the loop and are reused by later cells.
for input_example_batch,target_example_batch in seq_dataset.take(1):
    example_batch_predictions=model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 65)


## 五、随机采样

In [29]:
# Draw one character id per time step for the first sequence of the example
# batch, sampling from the distribution defined by the logits, then drop the
# num_samples axis so the result is a flat vector of ids.
sample_indices = tf.squeeze(
    tf.random.categorical(
        logits=example_batch_predictions[0],
        num_samples=1),
    axis=1)

In [30]:
# Shape of the sampled id vector.
print(sample_indices.shape)

(100,)


## 六、定义模型损失函数

In [32]:
def loss(labels,logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's final layer applies no
    softmax.
    """
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

In [33]:
# Attach the Adam optimizer and the logits-based loss to the model.
model.compile(loss=loss, optimizer='adam')
# Loss of the model on the example batch pulled earlier.
example_loss = loss(labels=target_example_batch,
                    logits=example_batch_predictions)

In [35]:
# Directory that receives the weight checkpoints.
output_dir='./text_generation_checkpoints'
# makedirs(exist_ok=True) replaces the check-then-mkdir pattern: it creates any
# missing parent directories and does not fail if the directory already exists.
os.makedirs(output_dir, exist_ok=True)
# The {epoch} placeholder is filled in by the checkpoint callback, giving one
# file prefix per epoch.
checkpoint_prefix=os.path.join(output_dir,'ckpt_{epoch}')
# Save weights only, not the full model.
checkpoint_callback=keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
# Number of passes over the training dataset.
epochs=100

In [None]:
# Train on the batched dataset; the callback writes a weight checkpoint as
# training progresses.
history = model.fit(seq_dataset, callbacks=[checkpoint_callback], epochs=epochs)

Epoch 1/100
Epoch 2/100

In [None]:
# Look up the path of the most recent checkpoint recorded in output_dir.
tf.train.latest_checkpoint(output_dir)

In [None]:
# Rebuild the same architecture with a batch size of 1 for generation, then
# restore the weights from the most recent training checkpoint.
model2=build_model(
    vocab_size,
    embedding_dim,
    rnn_units,
    batch_size=1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
# The class is tf.TensorShape (capital "S"); `tf.Tensorshape` does not exist
# and raises AttributeError.
model2.build(tf.TensorShape([1,None]))

In [None]:
def generate_text(model,start_string,num_generate=1000):
    """Generate text one character at a time, seeded with ``start_string``.

    After the seed has been fed once, only the newly sampled character is fed
    back on each step. Context from earlier characters is therefore preserved
    only if the model's recurrent layers carry state between calls.

    Args:
        model: Model mapping a ``(1, sequence_length)`` batch of character ids
            to per-step logits over the vocabulary.
        start_string: Seed text; every character must be a key of ``char2idx``.
        num_generate: Number of characters to sample after the seed.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    # Encode the seed and add a leading batch axis of size 1.
    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)
    text_generated = []
    # Start from a clean recurrent state.
    model.reset_states()
    for _ in range(num_generate):
        # Drop the batch axis so the logits have one row per input step.
        predictions = tf.squeeze(model(input_eval), 0)
        # Sample from the logits and keep the draw for the last time step.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # The sampled character becomes the next single-step input.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

In [None]:
# Generate a sample from the restored model, seeded with a speaker prompt.
new_text = generate_text(model2, start_string='All:')