In [1]:
import tensorflow as tf;
print(tf.__version__)
import numpy as np
import time
from tqdm import tqdm, tnrange, tqdm_notebook

1.12.0


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

print(tf.test.is_gpu_available())
print(tf.test.is_built_with_cuda())

True
True


In [3]:
word2idx = {'<stop>': 0}
idx2word = ['<stop>']
index = 1
max_seq_len = 0

def load_voc(filename):
    print("loading %s"%filename)
    global index, max_seq_len, word2idx, idx2word
    sentences = []
    num_tokens = 0
    with open(filename, 'r') as f:
        for line in f:
            stn = []
            for w in line.rstrip().split(' '):
                if w not in word2idx: 
                    word2idx[w] = index
                    index += 1
                    idx2word.append(w)
                stn.append(word2idx[w])
            num_tokens += len(stn)
            max_seq_len = max(max_seq_len, len(stn))
            sentences.append( np.array(stn, dtype=int) )
    print("#sentences {}, #tokens {}".format(len(sentences), num_tokens))
    return sentences    

In [4]:
trn_sentences = load_voc('trn-wiki.txt')
dev_sentences = load_voc('dev-wiki.txt')
tst_sentences = load_voc('tst-wiki.txt')
print('vocb size %d'%index)
print('max stn len %d'%max_seq_len)

loading trn-wiki.txt
#sentences 17556, #tokens 1800340
loading dev-wiki.txt
#sentences 1841, #tokens 188963
loading tst-wiki.txt
#sentences 2183, #tokens 212719
vocb size 27767
max stn len 641


In [62]:
vocabulary_size = index
input_size = 32
hidden_size = 32
batch_size = 32
seq_len = max_seq_len

In [63]:
tf.reset_default_graph()

wordids_placeholder = tf.placeholder(tf.int64, [batch_size, None])
word_embeddings = tf.get_variable("word_embeddings", [vocabulary_size, input_size], trainable=True)
embedded_words = tf.nn.embedding_lookup(word_embeddings, wordids_placeholder)

lstm = tf.contrib.cudnn_rnn.CudnnLSTM(2, hidden_size)
output2wordid = tf.layers.Dense(vocabulary_size)

seq_weight = tf.cast(tf.sign(wordids_placeholder[:,1:]), tf.float32)
seq_length = tf.cast(tf.reduce_sum(seq_weight, axis=1), tf.int32)
total_seq_length = tf.cast(tf.reduce_sum(seq_length, axis=0), tf.float32)

inputs = embedded_words[:, :-1, ]
outputs, state = lstm(inputs)

labels = wordids_placeholder[:,1:]
logits = tf.map_fn(lambda x: output2wordid(x), outputs)

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits) * seq_weight

# preds = tf.argmax(logits, axis=-1)
# acces = tf.cast(tf.equal(preds, labels), tf.float32)
    
total_loss = tf.reduce_sum(losses) / total_seq_length
tf.summary.scalar('loss', total_loss)

perplexity = tf.exp(total_loss)
tf.summary.scalar('perplexity', perplexity)

# accuracy = tf.reduce_mean(acces)
# tf.summary.scalar('accuracy', accuracy)

with tf.name_scope('train'):
    learning_rate = tf.placeholder(tf.float32, shape=[])
#     train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)
    opt = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = opt.compute_gradients(total_loss)
    capped_grads_and_vars = [(tf.clip_by_norm(grad, 5.0), var) for grad, var in grads_and_vars]
    train_step = opt.apply_gradients(capped_grads_and_vars)

merged_summary = tf.summary.merge_all()

In [64]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

epoches = 10

with tf.Session(config=config) as sess:
    now = time.strftime("%c")
    train_writer = tf.summary.FileWriter('./logs/'+now, sess.graph)

    sess.run(tf.global_variables_initializer())

    for epoch_idx in range(epoches):
        num_batch = len(trn_sentences) // batch_size - 1
        loss_arr = []
        weight_arr = []
        for batch_id in tnrange(num_batch, desc='training epoch %d'%epoch_idx):
            len_arr = [sent.shape[0] for sent in trn_sentences[batch_id*batch_size:(batch_id+1)*batch_size]]
            max_len = max(len_arr)
            total_len = sum(len_arr)
            
            padded = [np.pad( sent, (0,  max_len - sent.shape[0]), 'edge') for sent in trn_sentences[batch_id*batch_size:(batch_id+1)*batch_size] ]
            batch_stn = np.stack(padded, axis=0)
            summary_, _, total_loss_ = sess.run(
                [merged_summary, train_step, total_loss], 
                feed_dict = {
                    learning_rate : 0.01,
                    wordids_placeholder: batch_stn
                })              
            train_writer.add_summary(summary_, num_batch*epoch_idx+batch_id)
            loss_arr.append(total_loss_)
            weight_arr.append(total_len)
        print( np.exp(np.average(loss_arr, weights=weight_arr)) )
        
        loss_arr = []
        weight_arr = []
        # eval
        num_batch = len(dev_sentences) // batch_size - 1
        for batch_id in tnrange(num_batch, desc='validate epoch %d'%epoch_idx):
            len_arr = [sent.shape[0] for sent in trn_sentences[batch_id*batch_size:(batch_id+1)*batch_size]]
            max_len = max(len_arr)
            total_len = sum(len_arr)
            
            padded = [np.pad( sent, (0,  max_len - sent.shape[0]), 'edge') for sent in trn_sentences[batch_id*batch_size:(batch_id+1)*batch_size] ]
            batch_stn = np.stack(padded, axis=0)
            total_loss_ = sess.run(
                total_loss, 
                feed_dict = {
                    wordids_placeholder: batch_stn
                })  
            loss_arr.append(total_loss_)
            weight_arr.append(total_len)
        print( np.exp(np.average(loss_arr, weights=weight_arr)) )

            

HBox(children=(IntProgress(value=0, description='training epoch 0', max=547, style=ProgressStyle(description_w…

1683.7567255909507


HBox(children=(IntProgress(value=0, description='validate epoch 0', max=56, style=ProgressStyle(description_wi…

1166.9886299305033


HBox(children=(IntProgress(value=0, description='training epoch 1', max=547, style=ProgressStyle(description_w…

953.9654510103248


HBox(children=(IntProgress(value=0, description='validate epoch 1', max=56, style=ProgressStyle(description_wi…

797.9252775561479


HBox(children=(IntProgress(value=0, description='training epoch 2', max=547, style=ProgressStyle(description_w…

697.5038325918881


HBox(children=(IntProgress(value=0, description='validate epoch 2', max=56, style=ProgressStyle(description_wi…

654.2010072327398


HBox(children=(IntProgress(value=0, description='training epoch 3', max=547, style=ProgressStyle(description_w…

593.9921129113538


HBox(children=(IntProgress(value=0, description='validate epoch 3', max=56, style=ProgressStyle(description_wi…

586.1735562495693


HBox(children=(IntProgress(value=0, description='training epoch 4', max=547, style=ProgressStyle(description_w…

540.5446447012463


HBox(children=(IntProgress(value=0, description='validate epoch 4', max=56, style=ProgressStyle(description_wi…

548.0394358548497


HBox(children=(IntProgress(value=0, description='training epoch 5', max=547, style=ProgressStyle(description_w…

500.1542941021347


HBox(children=(IntProgress(value=0, description='validate epoch 5', max=56, style=ProgressStyle(description_wi…

520.9864541289346


HBox(children=(IntProgress(value=0, description='training epoch 6', max=547, style=ProgressStyle(description_w…

472.2925925346374


HBox(children=(IntProgress(value=0, description='validate epoch 6', max=56, style=ProgressStyle(description_wi…

498.5720407526523


HBox(children=(IntProgress(value=0, description='training epoch 7', max=547, style=ProgressStyle(description_w…

450.6047973757119


HBox(children=(IntProgress(value=0, description='validate epoch 7', max=56, style=ProgressStyle(description_wi…

494.97375416641444


HBox(children=(IntProgress(value=0, description='training epoch 8', max=547, style=ProgressStyle(description_w…

432.6919408281369


HBox(children=(IntProgress(value=0, description='validate epoch 8', max=56, style=ProgressStyle(description_wi…

483.11216665277334


HBox(children=(IntProgress(value=0, description='training epoch 9', max=547, style=ProgressStyle(description_w…

417.4465160015626


HBox(children=(IntProgress(value=0, description='validate epoch 9', max=56, style=ProgressStyle(description_wi…

472.9016711701382


In [None]:
tf.reset_default_graph()

wordids_placeholder = tf.placeholder(tf.int64, [batch_size, seq_len])
word_embeddings = tf.get_variable("word_embeddings", [vocabulary_size, input_size], trainable=True)
embedded_words = tf.nn.embedding_lookup(word_embeddings, wordids_placeholder)

lstm = tf.contrib.rnn.LSTMCell(hidden_size)
output2wordid = tf.layers.Dense(vocabulary_size)

seq_weight = tf.sign(wordids_placeholder[:,1:])
seq_length = tf.cast(tf.reduce_sum(seq_weight, axis=1), tf.int32)

outputs, state = tf.nn.dynamic_rnn(lstm, 
                                   embedded_words[:, :-1, ], 
                                   sequence_length=seq_length, 
                                   dtype=tf.float32)

num_pred = tf.squeeze(seq_length)
labels = wordids_placeholder[:,1:num_pred+1]
logits = tf.map_fn(lambda x: output2wordid(x), outputs)[:,:num_pred]


losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits)

preds = tf.argmax(logits, axis=-1)
acces = tf.cast(tf.equal(preds, labels), tf.float32)
probs = tf.log(tf.reduce_max(logits, axis=-1) / tf.reduce_sum(logits, axis=-1))


# initial_c_state = tf.get_variable("initial_c_hidden_state", [batch_size, hidden_size], initializer= tf.initializers.random_uniform, trainable=True )
# initial_m_state = tf.get_variable("initial_m_hidden_state", [batch_size, hidden_size], initializer= tf.initializers.random_uniform, trainable=True )

# embedded_word_series = tf.unstack(embedded_words, axis=1)
# # print(embedded_word_series)
# state = (initial_c_state, initial_m_state)
# losses = []
# acc = []
# for i in range(len(embedded_word_series)-1):
#     embedded_word = embedded_word_series[i]
#     output, state = lstm(embedded_word, state)
    
#     correct_pred = wordids_placeholder[:, i+1]
    
#     prob = output2wordid(output)
#     acc.append( tf.cast(tf.equal(tf.argmax(prob, axis=1), correct_pred), tf.float32) )
# #     print(prob.shape, correct_pred.shape)
#     losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prob, labels = correct_pred))
    
    
total_loss = tf.reduce_mean(losses)
tf.summary.scalar('loss', total_loss)

perplexity = tf.exp(total_loss)
tf.summary.scalar('perplexity', perplexity)

accuracy = tf.reduce_mean(acces)
tf.summary.scalar('accuracy', accuracy)

with tf.name_scope('train'):
    learning_rate = tf.placeholder(tf.float32, shape=[])
#     train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = opt.compute_gradients(total_loss)
    capped_grads_and_vars = [(tf.clip_by_norm(grad, 5.0), var) for grad, var in grads_and_vars]
    train_step = opt.apply_gradients(capped_grads_and_vars)

merged_summary = tf.summary.merge_all()

In [None]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

epoches = 10

with tf.Session(config=config) as sess:
    now = time.strftime("%c")
    train_writer = tf.summary.FileWriter('./logs/'+now, sess.graph)

    sess.run(tf.global_variables_initializer())

    for epoch_idx in range(epoches):
        num_batch = len(trn_sentences) / batch_size
        for batch_id, stn in enumerate(tqdm_notebook(trn_sentences, desc='training epoch %d'%epoch_idx)):
            stn_len = stn.shape[0]
            i = 0
            input_stn = np.expand_dims(np.pad( stn[i:i+seq_len], (0,  i+seq_len - stn_len), 'edge'), axis=0)
            summary_, _ = sess.run(
                [merged_summary, train_step], 
                feed_dict = {
                    learning_rate : 0.01,
                    wordids_placeholder: input_stn
                })              
            train_writer.add_summary(summary_, num_batch*epoch_idx+batch_id)

    
        loss_arr = []
        # eval
        num_batch = len(dev_sentences) / batch_size
        for batch_id, stn in enumerate(tqdm_notebook(dev_sentences, desc='validate epoch %d'%epoch_idx)):
            stn_len = stn.shape[0]
            i = 0
            input_stn = np.expand_dims(np.pad( stn[i:i+seq_len], (0,  i+seq_len - stn_len), 'edge'), axis=0)
            loss_ = sess.run(
                total_loss, 
                feed_dict = {wordids_placeholder: input_stn})  
            loss_arr.append(loss_)
            
        print( np.exp(np.mean(loss_arr)) )
    train_writer.close()

    # test
    f = open('jw7jb-tst-logprob.txt', 'w')
    for batch_id, stn in enumerate(tqdm_notebook(tst_sentences, desc='testing')):
        input_stn = np.expand_dims(np.pad( stn, (0,  seq_len - stn.shape[0]), 'edge'), axis=0)
        probs_ = sess.run(
            probs, 
            feed_dict = {wordids_placeholder: input_stn})  
#         f.write('<start> ')
        for wid, prob in zip(stn[1:], probs_[0]) :
            f.write( '{}\t{}\n'.format(idx2word[wid], prob) )
    f.close()
    

        


In [None]:
train_writer.close()