# 人工智能起名

目标：
我们希望用户输入一些描述的情况和姓氏的情况下，生成一个名字，方便我们在各种情况下使用。

比如输入：
- 描述 `我是一个舞蹈演员，我喜欢跳舞，听歌`
- 性别 `女`
- 姓氏 `陈`

输出：
- `陈青舞`


参考：
- 数据集名称 chinese-person-profiles，制作 @CYang
- [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)
- [tf.keras](https://www.tensorflow.org/guide/keras/sequential_model) 
- [eager execution](https://www.tensorflow.org/guide/eager)

## 基础环境安装

### 安装 datasetstore 下载数据集 chinese-person-profile

In [None]:
!pip install datasetstore

### 安装 tensorflow 和 tensorflow-addons

In [None]:
!pip install tensorflow-addons==0.11.2

In [None]:
!pip install tensorflow

## 数据加载和数据探索 (EDA)

### 读取数据

In [1]:
from datasetstore import load_dataset

dataset = load_dataset("chinese-person-profile-10k")

✅✅✅✅✅✅✅✅✅✅ 下载完成


In [2]:
dataset[0]

{'id': 1669879400,
 'nickname': 'Dear-迪丽热巴',
 'gender': '女',
 'location': '上海',
 'birthday': '双子座',
 'description': '一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒',
 'verified_reason': '嘉行传媒签约演员',
 'followers': 78346000,
 'education': '上海戏剧学院'}

In [3]:
dataset[1]

{'id': 1223178222,
 'nickname': '胡歌',
 'gender': '男',
 'location': '上海 徐汇区',
 'birthday': '0001-00-00',
 'description': '🎈                                                                                                                                🏃🏻',
 'verified_reason': '演员',
 'followers': 71645000,
 'education': '上海戏剧学院 01级'}

### 数据探索

In [4]:
len(dataset)

10827

## 名字生成任务

给定一个姓氏和对于名字的描述，生成对应的名字。

- 输入：<姓氏><mask><mask...> _ <性别> _ <描述>
- 输出：<名字>

### 构建一条样本

In [18]:
def construct_one_example(example):
    if example["nickname"]:
        nickname_len = len(example["nickname"])
        # 输入
        gender = example["gender"] if example["gender"] else ""
        desc = example["description"] if example["description"] else ""
        # head_name + ("<mask>" * (nickname_len - 1)) +
        text = "<start>" + gender + "，" + desc + "<end>"
        text = tf.strings.unicode_split(text, input_encoding="UTF-8")
        # 输出
        label = "<start>" + example["nickname"] + "<end>"
        label = tf.strings.unicode_split(label, input_encoding="UTF-8")
        return {"text": text, "label": label}

In [19]:
one_example = dataset[0]
one_example

{'id': 1669879400,
 'nickname': 'Dear-迪丽热巴',
 'gender': '女',
 'location': '上海',
 'birthday': '双子座',
 'description': '一只喜欢默默表演的小透明。工作联系jaywalk@jaywalk.com.cn 🍒',
 'verified_reason': '嘉行传媒签约演员',
 'followers': 78346000,
 'education': '上海戏剧学院'}

In [20]:
construct_one_example(one_example)

{'text': <tf.Tensor: shape=(55,), dtype=string, numpy=
 array([b'<', b's', b't', b'a', b'r', b't', b'>', b'\xe5\xa5\xb3',
        b'\xef\xbc\x8c', b'\xe4\xb8\x80', b'\xe5\x8f\xaa', b'\xe5\x96\x9c',
        b'\xe6\xac\xa2', b'\xe9\xbb\x98', b'\xe9\xbb\x98', b'\xe8\xa1\xa8',
        b'\xe6\xbc\x94', b'\xe7\x9a\x84', b'\xe5\xb0\x8f', b'\xe9\x80\x8f',
        b'\xe6\x98\x8e', b'\xe3\x80\x82', b'\xe5\xb7\xa5', b'\xe4\xbd\x9c',
        b'\xe8\x81\x94', b'\xe7\xb3\xbb', b'j', b'a', b'y', b'w', b'a',
        b'l', b'k', b'@', b'j', b'a', b'y', b'w', b'a', b'l', b'k', b'.',
        b'c', b'o', b'm', b'.', b'c', b'n', b' ', b'\xf0\x9f\x8d\x92',
        b'<', b'e', b'n', b'd', b'>'], dtype=object)>,
 'label': <tf.Tensor: shape=(21,), dtype=string, numpy=
 array([b'<', b's', b't', b'a', b'r', b't', b'>', b'D', b'e', b'a', b'r',
        b'-', b'\xe8\xbf\xaa', b'\xe4\xb8\xbd', b'\xe7\x83\xad',
        b'\xe5\xb7\xb4', b'<', b'e', b'n', b'd', b'>'], dtype=object)>}

## 数据集文本预处理

### 过滤没有名字的数据

In [21]:
filtered_dataset = dataset.filter(lambda x: x["nickname"])

Loading cached processed dataset at /tmp/tmp1o1630cj/fast-datasets/chinese-person-profile-10k/cache-d7b454f7f0225a16.arrow


In [22]:
len(filtered_dataset)

10811

### 数据集转化为目标格式文本

In [23]:
target_dataset = filtered_dataset.map(construct_one_example)

  0%|          | 0/10811 [00:00<?, ?ex/s]

In [24]:
target_dataset[10]

{'id': 3517080830,
 'nickname': 'Ice-dance柳鑫宇',
 'gender': '男',
 'location': '吉林',
 'birthday': '1994-10-16',
 'description': '花样滑冰国家队冰舞运动员 商务合作邮箱：lightmoon@vip.163.com',
 'verified_reason': '中国花样滑冰队冰舞运动员 柳鑫宇',
 'followers': 1787000,
 'education': None,
 'text': [b'<',
  b's',
  b't',
  b'a',
  b'r',
  b't',
  b'>',
  b'\xe7\x94\xb7',
  b'\xef\xbc\x8c',
  b'\xe8\x8a\xb1',
  b'\xe6\xa0\xb7',
  b'\xe6\xbb\x91',
  b'\xe5\x86\xb0',
  b'\xe5\x9b\xbd',
  b'\xe5\xae\xb6',
  b'\xe9\x98\x9f',
  b'\xe5\x86\xb0',
  b'\xe8\x88\x9e',
  b'\xe8\xbf\x90',
  b'\xe5\x8a\xa8',
  b'\xe5\x91\x98',
  b' ',
  b'\xe5\x95\x86',
  b'\xe5\x8a\xa1',
  b'\xe5\x90\x88',
  b'\xe4\xbd\x9c',
  b'\xe9\x82\xae',
  b'\xe7\xae\xb1',
  b'\xef\xbc\x9a',
  b'l',
  b'i',
  b'g',
  b'h',
  b't',
  b'm',
  b'o',
  b'o',
  b'n',
  b'@',
  b'v',
  b'i',
  b'p',
  b'.',
  b'1',
  b'6',
  b'3',
  b'.',
  b'c',
  b'o',
  b'm',
  b'<',
  b'e',
  b'n',
  b'd',
  b'>'],
 'label': [b'<',
  b's',
  b't',
  b'a',
  b'r',
  b't',
  b'>',
 

### 将文本转化为 onehot 格式

In [25]:
import tensorflow as tf

In [26]:
# 创建 tokenizer
inp_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", oov_token="<OOV>"
)
inp_lang_tokenizer.fit_on_texts(target_dataset["text"])
input_tensor = inp_lang_tokenizer.texts_to_sequences(target_dataset["text"])

In [30]:
# padding 数据
padding_inp_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    input_tensor, padding="post"
)
padding_inp_tensor, padding_inp_tensor.shape

(array([[3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        ...,
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0]], dtype=int32),
 (10811, 572))

In [31]:
# 创建 tokenizer
tar_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", oov_token="<OOV>"
)
tar_lang_tokenizer.fit_on_texts(target_dataset["label"])
output_tensor = tar_lang_tokenizer.texts_to_sequences(target_dataset["label"])

In [32]:
# padding 数据
padding_output_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    output_tensor, padding="post"
)
padding_output_tensor, padding_output_tensor.shape

(array([[3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        ...,
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0],
        [3, 9, 2, ..., 0, 0, 0]], dtype=int32),
 (10811, 37))

### 构建训练集

In [34]:
BUFFER_SIZE = 32000
BATCH_SIZE = 8
# Let's limit the #training examples for faster training
num_examples = 30000

In [35]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (padding_inp_tensor, padding_output_tensor)
)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE, drop_remainder=True
)

In [36]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([8, 572]), TensorShape([8, 37]))

## 构建模型

接下来，我们要使用 `keras.Model` 子类构建模型。 (For details see [Making new Layers and Models via subclassing](https://www.tensorflow.org/guide/keras/custom_layers_and_models)). 

这个模型有三层：

* `tf.keras.layers.Embedding`: 输入层. 一个训练好的 lookup table 能够把每个 character-ID 映射成一个 `embedding_dim` 大小;
* `tf.keras.layers.GRU`: 一个大小为 `units=rnn_units` 的 GRU 结构 (这里也可以使用 LSTM )
* `tf.keras.layers.Dense`: 输出层, 带有 `vocab_size` 大小的输出层. 它会对字典中的每一个字输出一个。这些就是模型对于每个字的 log-likelihood 。

In [37]:
vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1
vocab_tar_size = len(tar_lang_tokenizer.word_index) + 1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 256
units = 1024
steps_per_epoch = num_examples // BATCH_SIZE

In [38]:
example_input_batch.shape, example_target_batch.shape

(TensorShape([8, 572]), TensorShape([8, 37]))

In [39]:
print(example_input_batch)
print(example_target_batch)

tf.Tensor(
[[3 9 2 ... 0 0 0]
 [3 9 2 ... 0 0 0]
 [3 9 2 ... 0 0 0]
 ...
 [3 9 2 ... 0 0 0]
 [3 9 2 ... 0 0 0]
 [3 9 2 ... 0 0 0]], shape=(8, 572), dtype=int32)
tf.Tensor(
[[   3    9    2    5    8    2    4  166 1835  187 1554  137    3    6
     7   10    4    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   3    9    2    5    8    2    4   50  775   95   72    3    6    7
    10    4    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   3    9    2    5    8    2    4  967  618  543    5    7    2   12
     7   11   12    3    6    7   10    4    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   3    9    2    5    8    2    4  700  194   10    6    7   11    9
     6    3    6    7   10    4    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   3    9    2    5    8    2    4  355  826  160  140  53

In [40]:
print("最长输入, 最长输出, 输入词典大小, 输出词典大小")
max_length_input, max_length_output, vocab_inp_size, vocab_tar_size

最长输入, 最长输出, 输入词典大小, 输出词典大小


(572, 37, 4748, 3126)

### Encoder 编码器

In [41]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer="glorot_uniform",
        )

    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state=hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [
            tf.zeros((self.batch_sz, self.enc_units)),
            tf.zeros((self.batch_sz, self.enc_units)),
        ]

### 编码器的验证

In [42]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print(
    "Encoder output shape: (batch size, sequence length, units) {}".format(
        sample_output.shape
    )
)
print("Encoder h vecotr shape: (batch size, units) {}".format(sample_h.shape))
print("Encoder c vector shape: (batch size, units) {}".format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (8, 572, 1024)
Encoder h vecotr shape: (batch size, units) (8, 1024)
Encoder c vector shape: (batch size, units) (8, 1024)


### 解码器

In [43]:
class Decoder(tf.keras.Model):
    def __init__(
        self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type="luong"
    ):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(
            self.dec_units,
            None,
            self.batch_sz * [max_length_input],
            self.attention_type,
        )

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(
            self.rnn_cell, sampler=self.sampler, output_layer=self.fc
        )

    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(
            self.decoder_rnn_cell,
            self.attention_mechanism,
            attention_layer_size=self.dec_units,
        )
        return rnn_cell

    def build_attention_mechanism(
        self, dec_units, memory, memory_sequence_length, attention_type="luong"
    ):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

        if attention_type == "bahdanau":
            return tfa.seq2seq.BahdanauAttention(
                units=dec_units,
                memory=memory,
                memory_sequence_length=memory_sequence_length,
            )
        else:
            return tfa.seq2seq.LuongAttention(
                units=dec_units,
                memory=memory,
                memory_sequence_length=memory_sequence_length,
            )

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(
            batch_size=batch_sz, dtype=Dtype
        )
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(
            x,
            initial_state=initial_state,
            sequence_length=self.batch_sz * [max_length_output - 1],
        )
        return outputs

### 解码器验证

In [44]:
import tensorflow_addons as tfa

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, "luong")
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(
    BATCH_SIZE, [sample_h, sample_c], tf.float32
)

sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Decoder Outputs Shape:  (8, 36, 3126)


### 定义优化器

In [45]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="none"
    )
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real, 0))  # output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = mask * loss
    loss = tf.reduce_mean(loss)
    return loss

### 检查点(Checkpoints) 

In [47]:
import os


checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

### 定义一步的操作

In [48]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)

        dec_input = targ[:, :-1]  # Ignore <end> token
        real = targ[:, 1:]  # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(
            BATCH_SIZE, [enc_h, enc_c], tf.float32
        )
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

### 训练模型

In [50]:
import time


EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(
                "Epoch {} Batch {} Loss {:.4f}".format(
                    epoch + 1, batch, batch_loss.numpy()
                )
            )
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print("Epoch {} Loss {:.4f}".format(epoch + 1, total_loss / steps_per_epoch))
    print("Time taken for 1 epoch {} sec\n".format(time.time() - start))

Epoch 1 Batch 0 Loss 3.7444


KeyboardInterrupt: 

In [1]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
