# 自然语言处理 （NLP-Notebook）
  作者：[Alafun](https://github.com/Alafun)

  时间：2021/11/23

  描述：基于 `Seq2seq` （编码器-解码器模型） 的聊天机器人
  
  <details>
    <summary><strong>Seq2seq</strong></summary>
      <p>
        Seq2Seq（Sequence-to-sequence 的缩写）顾名思义：输入一个序列，输出另一个序列。这种结构最重要的特点在于输入序列和输出序列的长度都是可变的。🤗
      </p>
  </details>

  <img src="https://pytorch.org/tutorials/_images/seq2seq_ts.png" loading="lazy" alt="Overview" width="500" />

  
  >[一文看懂 Encoder-Decoder 和 Seq2Seq ](https://easyai.tech/ai-definition/encoder-decoder-seq2seq/)
 


## 数据准备

#### 如果你有 GoogleDrive 的账号可以先运行这段代码

目的是将数据保存到 Google Drive，方便存储。

In [None]:
# Mount Google Drive into the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# Then click the link and enter your authorization code.

Mounted at /gdrive


In [None]:
%cd /gdrive/MyDrive/
# Your Drive path may not be exactly the same as this one.

/gdrive/MyDrive/seq2seq-chatbot


#### 如果你没有谷歌账号，可以跳过上面的代码块，直接从这个 cell 开始

In [None]:
# Clone the entire repo.
# 每次运行session 只需执行一次
!git clone -l -s git://github.com/tensorlayer/seq2seq-chatbot.git seq2seq-chatbot
%cd seq2seq-chatbot
!ls
%cd seq2seq-chatbot/

Cloning into 'seq2seq-chatbot'...
remote: Enumerating objects: 238, done.[K
remote: Total 238 (delta 0), reused 0 (delta 0), pack-reused 238[K
Receiving objects: 100% (238/238), 15.00 MiB | 7.52 MiB/s, done.
Resolving deltas: 100% (112/112), done.
/content/drive/MyDrive/seq2seq-chatbot
data  main.py  README.md  requirements.txt
[Errno 2] No such file or directory: 'seq2seq-chatbot/'
/content/drive/MyDrive/seq2seq-chatbot


In [1]:
%ls

 驱动器 E 中的卷没有标签。
 卷的序列号是 6AB4-7BF4

 E:\code\PyCharm\seq2seq-chatbot 的目录

2021/11/26  12:53    <DIR>          .
2021/11/26  12:53    <DIR>          ..
2021/11/26  00:37               212 .gitignore
2021/11/26  12:43    <DIR>          .ipynb_checkpoints
2021/11/26  12:52    <DIR>          data
2021/11/26  12:53            25,705 deployment_of_chatbot.ipynb
2021/11/26  00:37             5,180 main.py
2021/11/26  01:56        55,206,497 model.npz
2021/11/26  00:37             1,267 README.md
2021/11/26  00:37                65 requirements.txt
               6 个文件     55,238,926 字节
               4 个目录  1,119,748,096 可用字节


## 安装相关库

In [2]:
# Install the packages listed in the repo's requirements.txt.
!pip install -r requirements.txt

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


## 导入相关库

In [3]:
import tensorflow as tf
import tensorlayer as tl
import numpy as np
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tqdm import tqdm
from sklearn.utils import shuffle
from data.twitter import data
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention
import os

## 定义函数

### 定义初始化函数

In [4]:
      """
      加载数据
      将语料库的数据分配给参数
      """
def initial_setup(data_corpus):
    metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format(data_corpus))
    (trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
    trainX = tl.prepro.remove_pad_sequences(trainX.tolist())
    trainY = tl.prepro.remove_pad_sequences(trainY.tolist())
    testX = tl.prepro.remove_pad_sequences(testX.tolist())
    testY = tl.prepro.remove_pad_sequences(testY.tolist())
    validX = tl.prepro.remove_pad_sequences(validX.tolist())
    validY = tl.prepro.remove_pad_sequences(validY.tolist())
    return metadata, trainX, trainY, testX, testY, validX, validY


### 入口

In [5]:
if __name__ == "__main__":
    # Corpus directory name under data/; replace it with your own corpus if desired.
    data_corpus = "twitter"

    # Data preprocessing
    metadata, trainX, trainY, testX, testY, validX, validY = initial_setup(data_corpus)

    # Parameters
    src_len = len(trainX)
    tgt_len = len(trainY)

    # Questions and answers must be paired one-to-one.
    assert src_len == tgt_len

    batch_size = 32
    n_step = src_len // batch_size  # used as the progress-bar total
    src_vocab_size = len(metadata['idx2w'])  # 8002 (0~8001)
    emb_dim = 1024

    word2idx = metadata['w2idx']   # dict: word -> index
    idx2word = metadata['idx2w']   # list: index -> word

    unk_id = word2idx['unk']   # 1
    pad_id = word2idx['_']     # 0

    # The two special tokens are appended right after the existing vocabulary.
    start_id = src_vocab_size  # 8002
    end_id = src_vocab_size + 1  # 8003

    word2idx.update({'start_id': start_id})
    word2idx.update({'end_id': end_id})
    idx2word = idx2word + ['start_id', 'end_id']

    src_vocab_size = tgt_vocab_size = src_vocab_size + 2

    num_epochs = 5    # number of training epochs (originally 50)
    vocabulary_size = src_vocab_size

    def inference(seed, top_n):
        """Generate a reply to ``seed`` with the current ``model_``.

        Args:
            seed: Input sentence; it is split on single spaces and words
                missing from ``word2idx`` are mapped to ``unk_id``.
            top_n: Forwarded unchanged to the model's ``top_n`` argument
                (presumably top-n sampling at each decoding step; confirm
                against the TensorLayer ``Seq2seq`` documentation).

        Returns:
            The list of generated words, cut off before the first
            ``'end_id'`` token.
        """
        model_.eval()
        seed_id = [word2idx.get(w, unk_id) for w in seed.split(" ")]
        sentence_id = model_(inputs=[[seed_id]], seq_length=20, start_token=start_id, top_n=top_n)
        sentence = []
        for w_id in sentence_id[0]:
            w = idx2word[w_id]
            if w == 'end_id':
                break
            sentence.append(w)
        return sentence

    decoder_seq_length = 20
    model_ = Seq2seq(
        decoder_seq_length=decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=256,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
        )

    # Resume from a previously saved checkpoint when one exists. On the very
    # first run there is no model.npz yet, so training starts from scratch
    # instead of crashing.
    if os.path.isfile('model.npz'):
        load_weights = tl.files.load_npz(name='model.npz')
        tl.files.assign_weights(load_weights, model_)

    #########################################

    optimizer = tf.optimizers.Adam(learning_rate=0.001)  # Adam optimizer
    model_.train()

    seeds = ["happy birthday have a nice day",
                 "donald trump won last nights presidential debate according to snap online polls"]
    for epoch in range(num_epochs):
        # Switch back to training mode; inference() below puts the model in eval mode.
        model_.train()

        # Shuffle questions and answers together so the pairs stay aligned.
        trainX, trainY = shuffle(trainX, trainY, random_state=0)
        total_loss, n_iter = 0, 0

        # tqdm progress bar over the mini-batches of this epoch
        for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False),
                        total=n_step, desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

            # Encoder input: padded questions.
            X = tl.prepro.pad_sequences(X)
            # Decoder target: answers with end_id appended, padded to decoder_seq_length.
            _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
            _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
            # Decoder input: answers with start_id prepended, padded to decoder_seq_length.
            _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
            _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
            # Mask derived from the padded targets, passed to the loss as input_mask.
            _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

            with tf.GradientTape() as tape:
                # Forward pass and loss are the only ops that need to be recorded.
                output = model_(inputs=[X, _decode_seqs])
                output = tf.reshape(output, [-1, vocabulary_size])
                loss = cross_entropy_seq_with_mask(logits=output, target_seqs=_target_seqs, input_mask=_target_mask)

            # Compute gradients and update the weights after leaving the tape
            # context so these ops are not themselves traced by the tape.
            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))

            total_loss += loss
            n_iter += 1

        # Print the average loss after every epoch.
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))

        # Show top_n sampled replies for each seed sentence.
        for seed in seeds:
            print("Query >", seed)
            top_n = 3
            for i in range(top_n):
                sentence = inference(seed, top_n)
                print(" >", ' '.join(sentence))

        # Checkpoint the weights at the end of every epoch.
        tl.files.save_npz(model_.all_weights, name='model.npz')

[TL] Embedding embedding_1: (8004, 1024)
[TL] RNN rnn_1: cell: GRUCell, n_units: 256
[TL] RNN rnn_2: cell: GRUCell, n_units: 256
[TL] RNN rnn_3: cell: GRUCell, n_units: 256
[TL] RNN rnn_4: cell: GRUCell, n_units: 256
[TL] RNN rnn_5: cell: GRUCell, n_units: 256
[TL] RNN rnn_6: cell: GRUCell, n_units: 256
[TL] Reshape reshape_1
[TL] Dense  dense_1: 8004 No Activation
[TL] Reshape reshape_2
[TL] Reshape reshape_3


                                                                                                                                                                                          

Epoch [1/5]: loss 5.0189
Query > happy birthday have a nice day
 > thank u  love u
 > thank god you love me too much
 > thank god   
Query > donald trump won last nights presidential debate according to snap online polls
 > i am not sure if he was born
 > i am going on the same way
 > i dont know how much
[TL] [*] Saving TL weights into model.npz


  val = np.asanyarray(val)


[TL] [*] Saved


                                                                                                                   

Epoch [2/5]: loss 4.7462
Query > happy birthday have a nice day
 > thank you 
 > thank you 
 > thank u so much i miss it too 
Query > donald trump won last nights presidential debate according to snap online polls
 >  he needs to get the truth to the american people
 >  he needs a new york and a half
 >  i am not saying this
[TL] [*] Saving TL weights into model.npz
[TL] [*] Saved


                                                                                                                   

Epoch [3/5]: loss 4.5309
Query > happy birthday have a nice day
 > thank you i miss you so well soon
 > thank god you love me 
 > thank you i appreciate it i appreciate you
Query > donald trump won last nights presidential debate according to snap online polls
 > he admitted that was the reason to read
 > he has no stamina for the other people in his campaign
 > he is a liar for the presidency of the debate
[TL] [*] Saving TL weights into model.npz
[TL] [*] Saved


                                                                                                                   

Epoch [4/5]: loss 4.3351
Query > happy birthday have a nice day
 > thanks girl miss ya too
 > thanks girl i miss it too  i appreciate you   
 > thanks brotha appreciate the     xo
Query > donald trump won last nights presidential debate according to snap online polls
 > he is the only racist he has to get his ass on his hands 
 > he is not the worst of america  he is the best
 > he was an idiot and hes a coke coke
[TL] [*] Saving TL weights into model.npz
[TL] [*] Saved


                                                                                                                   

Epoch [5/5]: loss 4.1476
Query > happy birthday have a nice day
 > thanks al love you so well deserved 
 > thanks brotha miss you so many
 > thanks brotha miss you too
Query > donald trump won last nights presidential debate according to snap online polls
 > trump is an idiot
 > trump will be a president
 > trump should not be fired for the rest of his career or the other time we will see us in
[TL] [*] Saving TL weights into model.npz
[TL] [*] Saved


In [13]:
# Open the saved weight archive directly with NumPy to inspect its contents.
# NOTE(review): allow_pickle=True unpickles stored objects; only load files you trust.
npz=np.load("model.npz",allow_pickle=True)

In [14]:
npz.files

['params']

In [16]:
npz['params']

array([array([[-0.10055935, -0.33314782,  0.00701442, ...,  0.09860136,
                0.15007073, -0.05226222],
              [ 0.07286207,  0.24003172,  0.03290554, ..., -0.08757132,
                0.18949363, -0.18126886],
              [ 0.00704551,  0.1703544 ,  0.10520507, ...,  0.0008816 ,
               -0.10193058,  0.08364528],
              ...,
              [-0.09227795,  0.4044721 , -0.0742581 , ...,  0.09593895,
               -0.148178  ,  0.05280496],
              [-0.00868996, -0.22137943, -0.17080946, ..., -0.11631478,
               -0.0565454 ,  0.04014151],
              [ 0.14837095,  0.04824803, -0.13587292, ..., -0.08375482,
                0.25499883, -0.05871639]], dtype=float32)              ,
       array([[-0.04944015, -0.12472489,  0.00115804, ...,  0.03284711,
                0.06106395, -0.04631912],
              [ 0.13042727,  0.1995454 , -0.34676737, ..., -0.02441607,
               -0.08482856,  0.3381039 ],
              [-0.05265179,  0.2061678

In [17]:
print(npz['params'].shape)

(21,)


In [18]:
# Read the saved weights from model.npz and assign them to model_.
load_weights = tl.files.load_npz(name='model.npz')
tl.files.assign_weights(load_weights, model_)

[<tf.Variable 'UnreadVariable' shape=(1024, 768) dtype=float32, numpy=
 array([[-0.10055935, -0.33314782,  0.00701442, ...,  0.09860136,
          0.15007073, -0.05226222],
        [ 0.07286207,  0.24003172,  0.03290554, ..., -0.08757132,
          0.18949363, -0.18126886],
        [ 0.00704551,  0.1703544 ,  0.10520507, ...,  0.0008816 ,
         -0.10193058,  0.08364528],
        ...,
        [-0.09227795,  0.4044721 , -0.0742581 , ...,  0.09593895,
         -0.148178  ,  0.05280496],
        [-0.00868996, -0.22137943, -0.17080946, ..., -0.11631478,
         -0.0565454 ,  0.04014151],
        [ 0.14837095,  0.04824803, -0.13587292, ..., -0.08375482,
          0.25499883, -0.05871639]], dtype=float32)>,
 <tf.Variable 'UnreadVariable' shape=(256, 768) dtype=float32, numpy=
 array([[-0.04944015, -0.12472489,  0.00115804, ...,  0.03284711,
          0.06106395, -0.04631912],
        [ 0.13042727,  0.1995454 , -0.34676737, ..., -0.02441607,
         -0.08482856,  0.3381039 ],
        [-0.

In [32]:
# Ask the model for a reply; top_n is the global left over from the training cell.
inference('are you ok',top_n)

['i', 'know', 'but', 'i', 'dont', 'know']