# Teacher forcing 

## data preprocessing

In [2]:
!nvidia-smi

Wed Mar  3 10:36:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 440.82       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:01:00.0  On |                  N/A |
| 27%   32C    P8     8W / 250W |    101MiB / 11016MiB |     13%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 38%   65C    P3    53W / 250W |      1MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='1' #用0這顆比較不會跟別人打架
# os.environ['TF_FORCE_GPU_ALLOW_GROWTH']='false'
import tensorflow as tf
from tensorflow.python.client import device_lib


print(device_lib.list_local_devices())
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.allow_soft_placement=True
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
#         print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)
else:
    print('No GPU!!')

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5809261071221456325
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 684667629247630923
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10812368487
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15658543298815003355
physical_device_desc: "device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:02:00.0, compute capability: 7.5"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 13749953680598633488
physical_device_desc: "device: XLA_GPU device"
]


In [4]:
# coding: utf-8


"""

@author: charlie
"""
import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding , Input , Dense, Flatten , Activation ,\
                        GRU, LSTM
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping 


import gc
import matplotlib.pyplot as plt

import pickle 

In [5]:
with open('post-response_100000_8words.pkl' , 'rb' ) as f:
    post, response  = pickle.load(f)


In [6]:

class Tokenizer:
    
    def __init__(self , all_train_list, seq_len = 10):
        """traininig data only
        input list: post + response
        """
        self.seq_len = seq_len 
        
        self.word_uniq = {'EOS','BOS','OTHER'}
        self.word2id = dict()
        self.id2word = dict()
        
        for line in all_train_list:
            if len(line)==0:
                continue
                
            self.word_uniq |= set(line)
        self.word_uniq = sorted(list(self.word_uniq))
        self.num_word = len(self.word_uniq)
        for i,word in enumerate(self.word_uniq):
            self.word2id[word] =i+1
        
        self.word2id['PAD']=0
        self.PAD= self.word2id['PAD']
        self.OTHER=self.word2id['OTHER']
        self.EOS = self.word2id['EOS']
        self.BOS = self.word2id['BOS']
        
        for k,v in self.word2id.items():
            self.id2word[v]=k
        
    
    def token(self,  sents):
        """ tokenize & padding 
        input/output: list of sentence
        """
        
        tokened = []
        for line in sents:
            tmp = []
            for word in line:
                tmp.append(self.word2id.get(word , self.OTHER))
            tmp.append(self.EOS)
            tmp = tmp if len(tmp)<= self.seq_len else tmp[:self.seq_len-1]+[self.EOS]
            while len(tmp) <self.seq_len:
                tmp.append(self.PAD)
            tokened.append(tmp)
        return tokened
    def detoken(self, sents, space_unused_token=False):
        """ de-tokenize 
        input/output: list of sentence
        """
        detokened = []
        for line in sents:
            tmp = []
            for token in line:
                if space_unused_token and token in {self.OTHER,
                                                    self.EOS,
                                                    self.BOS,
                                                    self.PAD }:
                    continue
                tmp.append(self.id2word.get(token , ''))
            detokened.append(tmp)
        return detokened
        

    

In [7]:
len(post), len(response)

(100000, 100000)

In [8]:
print('max len of post and response')
max([len(line)  for line in post]) , max([len(line)  for line in  response])

max len of post and response


(8, 8)

In [9]:
seq_len=10
post , response = [sent if len(sent)<=seq_len+1 else sent[:seq_len+1] for sent in post] , \
                  [sent if len(sent)<=seq_len+1 else sent[:seq_len+1] for sent in response]

post_train , post_test = post[:50000] , post[50000:]
response_train , response_test = response[:50000] , response[50000:]

tokenizer = Tokenizer(post_train+response_train , seq_len = seq_len)

In [10]:
print(f'num_word: {tokenizer.num_word}')

num_word: 49607


In [11]:
# Test 

print('reverted sentence')
for l in tokenizer.detoken(tokenizer.token(post_test[:3]),False): 
    #  include padded tokens if False
    print(l)
    
print('\noriginal sentence')
for l in  post_test[:3]:
    print(l)

reverted sentence
['谁', '说', '宅男', '战力', '不如', '鹅', '来的', '？', 'EOS', 'PAD']
['OTHER', '。', '右', '一', '为', '毛', '主席', '。', 'EOS', 'PAD']
['摄影师们', '平时', '一定', '要', '注意', '加强', '身体', '锻炼', 'EOS', 'PAD']

original sentence
['谁', '说', '宅男', '战力', '不如', '鹅', '来的', '？']
['毛家', '。', '右', '一', '为', '毛', '主席', '。']
['摄影师们', '平时', '一定', '要', '注意', '加强', '身体', '锻炼']


## Training process

In [12]:
x1 ,x2 , y = [],[],[]

for line  in tokenizer.token(post):
    x1.append(line)
    
for line in tokenizer.token(response):
    teacher = line if line[-1] != tokenizer.EOS else line[:-2]+[tokenizer.EOS, tokenizer.PAD]
    x2.append([tokenizer.BOS]+teacher[:-1])
    y.append(line)
x1,x2,y = np.array(x1) , np.array(x2) , np.array(y).reshape(-1 ,seq_len , 1)
x1_train , x1_test = x1[:50000 ] , x1[50000: ]
x2_train , x2_test = x2[:50000] , x2[50000:]
y_train ,y_test = y[:50000] , y[50000:]

In [13]:


def get_model(num_word = tokenizer.num_word ,
              seq_len = tokenizer.seq_len, 
              nunit=256 , 
              embedding_dim=128 ):

    # declare
    input_x       =Input(shape=(seq_len ,))
    input_teacher = Input(shape=(seq_len , ))
    embedding     = Embedding(num_word+1, embedding_dim)    # token start from 1 and including PAD
    encoder = LSTM(nunit,return_state=True) 
    decoder = LSTM(nunit,return_sequences = True)
    logit = Dense(num_word+1,activation='softmax')
    
    
    # inference
    emb_x       = embedding(input_x)
    emb_teacher = embedding(input_teacher)
    latent_enc ,h,c  = encoder(emb_x)
    latent_dec = decoder(emb_teacher , initial_state=[h,c])
    prob = logit(latent_dec)
    
    model = Model(inputs=[input_x, input_teacher] , outputs = prob)
    model.summary()
    
    return model

earlystop = EarlyStopping(monitor='loss',patience = 10 , verbose=2, mode='auto')
model = get_model()
model.compile(loss = 'sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])
    



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 128)      6349824     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 394240      embedding[0][0]              

In [14]:
# default adam
# learning_rate=0.001,
#                beta_1=0.9,
#                beta_2=0.999,
#                epsilon=1e-7,
#                amsgrad=False,
#                name='Adam',

# optim = tf.keras.optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)
# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer='adagrad',
#              metrics=['acc'])

In [15]:
# x1_tensor=tf.convert_to_tensor(x1[:100])
# x2_tensor=tf.convert_to_tensor(x2[:100])
# y_tensor = tf.convert_to_tensor(y[:100])

# x1_tensor=np.array(x1)
# x2_tensor=np.array(x2)
# y_tensor = np.array(y).reshape(-1 , 40 , 1)


train_size = 'all'

if train_size =='all':
    x1_part , x2_part , y_part = x1_train , x2_train , y_train
else:
    x1_part , x2_part , y_part = x1_train[:train_size] , x2_train[:train_size] , y_train[:train_size]

In [None]:
Hist=model.fit([x1_part,x2_part] , y_part,
                validation_data=([x1_test, x2_test],y_test),
#                validation_split = .2,
                epochs=100,
                batch_size = 128,
                verbose = 2)
#                   callbacks=[earlystop])

Train on 50000 samples, validate on 50000 samples
Epoch 1/100
50000/50000 - 95s - loss: 5.6802 - acc: 0.2945 - val_loss: 5.1489 - val_acc: 0.3487
Epoch 2/100
50000/50000 - 84s - loss: 4.9619 - acc: 0.3546 - val_loss: 5.0554 - val_acc: 0.3606
Epoch 3/100
50000/50000 - 74s - loss: 4.7628 - acc: 0.3713 - val_loss: 5.0026 - val_acc: 0.3691
Epoch 4/100
50000/50000 - 75s - loss: 4.6019 - acc: 0.3857 - val_loss: 4.9724 - val_acc: 0.3729
Epoch 5/100
50000/50000 - 75s - loss: 4.1895 - acc: 0.4192 - val_loss: 4.9503 - val_acc: 0.3802
Epoch 8/100
50000/50000 - 76s - loss: 4.0634 - acc: 0.4290 - val_loss: 4.9656 - val_acc: 0.3820
Epoch 9/100
50000/50000 - 76s - loss: 3.9400 - acc: 0.4379 - val_loss: 4.9776 - val_acc: 0.3819
Epoch 10/100
50000/50000 - 75s - loss: 3.8188 - acc: 0.4461 - val_loss: 5.0056 - val_acc: 0.3826
Epoch 11/100
50000/50000 - 73s - loss: 3.6997 - acc: 0.4535 - val_loss: 5.0297 - val_acc: 0.3831
Epoch 12/100
50000/50000 - 73s - loss: 3.5828 - acc: 0.4614 - val_loss: 5.0597 - val

In [None]:
del model
gc.collect()

In [None]:
!nvidia-smi

In [10]:
# from nltk.translate.bleu_score import sentence_bleu
reference = [['this', 'is', 'a', 'test'], ['this', 'is' 'test']]
candidate = ['this','is', 'test']
score = sentence_bleu(reference, candidate)
print('%2.8f'%score)

0.00000000



print('x1:\n',x1_train[:2])
print('x2:\n',x2_train[:2])

In [30]:
pred_prob = model.predict([x1_train[:10],x2_train[:10]])
pred_ids = tf.argmax(pred_prob, axis=2)

output = tokenizer.detoken(pred_ids.numpy(),False)

In [31]:
for line in output:
    print(''.join(line))

驰骋乔恩内蒙古转···只是转···只是峯岸PADPAD
正宗十一点四十五学文无所从业者转···只是WhenPAD
掛墙式转···只是转···只是盆切深乔恩寒舟妖风PAD
左眼跳财妖风妖风大世界菊花兰大排球转···只是
小半坛入校无所麦粒肿拉米苏转···只是WhenWhenWhenPAD
系外无所最恨删博啊转···只是When
攻击肥胖转···只是妖风寒舟WhenWhenWhenWhen
吃好妖风妖风乔恩乔恩WhenWhen兰大PAD
盛夫马三立拳拳之心性感呐今朝PADPADPAD
可刷老大帅老大帅心满意足之余乔恩When


In [32]:
for line in tokenizer.detoken(y_train[:10].reshape(-1 , seq_len),False):
    print(''.join(line))

王大姐，打字细心一点EOSPADPADPAD
慈善再透明，捐款都无意EOSPADPADPAD
都说喵是会飞的真不开玩笑EOSPADPAD
你就是“反动派”。罗！EOSPAD
厨房那个太搞笑了！哈哈哈EOSPADPAD
儿童节快乐！我要礼物…EOSPADPAD
乐出音了，旁边人直看我EOSPAD
少了个新浪微博呀！EOSPADPAD
下面在做，上面在看EOSPADPADPADPAD
缘分终究会到只不过早晚的事EOSPAD


In [54]:
def predict_seqs(model ,x1_test, tokenizer= tokenizer):
    x2_tmp = np.zeros(x1_test.shape)
    x2_tmp[:][0]=tokenizer.BOS
    
    for i in range(x1_test.shape[1]-1):
        
        x1_tmp = x1_test[i]
        next_tokens = np.argmax(model.predict([x1_test ,x2_tmp])[:,i])
        x2_tmp[:,i+1] = next_tokens
    
    return x2_tmp


pred_ids = predict_seqs(model , x1_train[:12])
output = tokenizer.detoken(pred_ids,False)
for line in output:
    print(''.join(line))

BOS
PAD
PAD
PAD
PAD
PAD
PAD
PAD
PAD
PAD
PAD
PAD


In [None]:
!nvidia-smi

In [56]:
# tokenizer.num_word
# pred_ids
model.predict([x1_train, x2_train]).shape

MemoryError: Unable to allocate 115. GiB for an array with shape (50000, 10, 61609) and data type float32

In [None]:


# 绘制训练 & 验证的准确率值
plt.plot(Hist.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()



In [None]:
# 绘制训练 & 验证的损失值
plt.plot(Hist.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [51]:

# keras.models.load_model
model.save('TF_len10_1.model')


In [29]:
model = keras.models.load_model('TF_len10_1.model')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
