In [27]:
import numpy as np
import json
import tensorflow as tf
from config import Config
from utils import *

In [28]:
'''
with open("imdb_word_index.json") as file_json:
    wid_dict = json.load(file_json)

id2w = {}
for (k, v) in wid_dict.items():
    id2w[v] = k

for w in dataset["x_train"][1]:
    print(id2w[w], end=" ")
    
x_train, y_train, x_test, y_test, train_length, test_length, wid_dict, id2w = load_data()
'''

'\nwith open("imdb_word_index.json") as file_json:\n    wid_dict = json.load(file_json)\n\nid2w = {}\nfor (k, v) in wid_dict.items():\n    id2w[v] = k\n\nfor w in dataset["x_train"][1]:\n    print(id2w[w], end=" ")\n    \nx_train, y_train, x_test, y_test, train_length, test_length, wid_dict, id2w = load_data()\n'

In [29]:
config = Config(batch_size=32, 
                embedding_size=100,
                encoder_hidden_size=200,
                vocab_size=88584,
                lr=0.001, 
                epoch_num=50,
                save_per_epoch=5,
                maxlen=100)

class imdb_classifier(object):
    def __init__(self, config, session, x_train, y_train, x_test, y_test, train_length, test_lentgh):
        self.config = config
        self.embedding_size = config.embedding_size
        self.batch_size = config.batch_size
        self.encoder_hidden_size = config.encoder_hidden_size
        self.maxlen = config.maxlen
        self.vocab_size = config.vocab_size
        self.lr = config.lr
        self.sess = session
        self.epoch_num = config.epoch_num
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.train_length = train_length
        self.test_length = test_length
        
    def build(self):
        self.global_step = tf.Variable(0, name="global_step")
        
        self.encoder_input = tf.placeholder(shape=(None, None), dtype=tf.int32, name="encoder_input")
        self.encoder_input_length = tf.placeholder(shape=(None,), dtype=tf.int32, name="encoder_input_length")
        self.labels = tf.placeholder(shape=(None,), dtype=tf.int32, name="label")
        self.embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0), 
                                     dtype=tf.float32,
                                     name="embedding")
        self.encoder_input_embedded = tf.nn.embedding_lookup(self.embedding, 
                                                             self.encoder_input)
        self.encoder_fw = tf.contrib.rnn.GRUCell(self.encoder_hidden_size)
        self.encoder_bw = tf.contrib.rnn.GRUCell(self.encoder_hidden_size)
        
        # Since time_major == False, output shape should be [batch_size, max_time, ...]
        ((self.encoder_fw_output, self.encoder_bw_output), 
         (self.encoder_fw_state, self.encoder_bw_state)) = (
            tf.nn.bidirectional_dynamic_rnn(cell_fw=self.encoder_fw, 
                                            cell_bw=self.encoder_bw, 
                                            inputs=self.encoder_input_embedded,
                                            sequence_length=self.encoder_input_length,
                                            dtype=tf.float32)
        )
        
        self.encoder_output = tf.concat((self.encoder_fw_output, self.encoder_bw_output), 2)
        self.encoder_state = tf.concat((self.encoder_fw_state, self.encoder_bw_state), 1)
        
        self.output_w = tf.Variable(
            tf.truncated_normal(shape=(self.encoder_hidden_size*2, 2), stddev=0.1), name="output_w") 
        self.output_b = tf.Variable(tf.zeros(2), name="output_b")
        
        self.logits = tf.matmul(self.encoder_output[:, -1, :], self.output_w) + self.output_b
        self.prediction = tf.argmax(self.logits, 1)
        
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.labels)
        )
        
        self.train_op = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss, global_step=self.global_step)
    def train(self, config):
        self.build()
        self.sess.run(tf.global_variables_initializer())
        for epoch in range(self.epoch_num):
            for x_batch, y_batch, input_length in self.minibatches(
                self.x_train, self.y_train, self.train_length, batch_size=self.batch_size, shuffle=False):
                # pad inputs
                x_batch = self.padding_sequence(x_batch)
                feed_dict = {
                    self.encoder_input: x_batch,
                    self.encoder_input_length: input_length,
                    self.labels: y_batch
                }
                _, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
                print(loss)
    
    def minibatches(self, inputs=None, targets=None, input_len=None, batch_size=None, shuffle=False):
        assert len(inputs) == len(targets)
        #assert len(inputs) == len(inputs_length)
        if shuffle:
            indices = np.arange(len(inputs))
            np.random.shuffle(indices)
        for start_idx in range(0, len(inputs) - batch_size + 1, batch_size):
            if shuffle:
                excerpt = indices[start_idx:start_idx + batch_size]
            else:
                excerpt = slice(start_idx, start_idx + batch_size)
            yield inputs[excerpt], targets[excerpt], input_len[excerpt]
    
    def padding_sequence(self, inputs):
        batch_size = len(inputs)
        #assert self.batch_size == batch_size
        maxlen = np.max([len(i) for i in inputs])
        output = np.zeros([batch_size, maxlen], dtype=np.int8)
        for i, seq in enumerate(inputs):
            output[i, :len(seq)] = np.array(seq)
        return output

In [None]:
x_train, y_train, x_test, y_test, train_length, test_length, wid_dict, id2w = load_data()

tf.reset_default_graph()
sess = tf.Session()
test = imdb_classifier(config, sess, x_train, y_train, x_test, y_test, train_length, test_length)

In [None]:
test.train(config)

0.692642
0.689341
0.697098
0.695244
0.694308
0.696904
0.691427
0.688735
0.692373
0.688031
0.686562
