In [1]:
import cPickle
import numpy as np
import tensorflow as tf

print "loading data..."
x = cPickle.load(open("Data/imdb-train-val-test.pickle", "rb"))
revs, word2vec, word_idx_map = x[0], x[1], x[2]
print "data loaded!"

loading data...
data loaded!


In [2]:
def split_train_valid_test(revs, train_ratio=0.8, valid_ratio=0.05, test_ratio=0.15):
    n_train=int(len(revs)*0.65)
    n_valid=int(len(revs)*0.15)
    train, valid, test=[], [], []
    for i in range(len(revs)):
        if i<n_train:
            train.append(revs[i])
        elif i<n_train+n_valid:
            valid.append(revs[i])
        else:
            test.append(revs[i])
    return train, valid, test

train, valid, test=split_train_valid_test(revs)    
print("Training size {}, validation size {}, test size {}".format(len(train), \
                                                                  len(valid), len(test)))

Training size 130000, validation size 30000, test size 40000


In [3]:
import pandas as pd
max_sentence_len=np.max((pd.DataFrame(revs)['num_words']))
print("The longest sentence has {} words".format(max_sentence_len))

The longest sentence has 105 words


In [4]:
# del revs

In [5]:
def sentence2_1hot(sentence, dictionary, max_length):
    V=np.zeros(shape=max_length, dtype=np.int32)
    i=0
    for word in sentence.split():
        if i>=max_length:
            break
        V[i]=dictionary[word]
        i+=1
    return V


In [6]:
word2vec.shape

(150855, 300)

In [7]:
def embedding_batch(revs, W, word_idx_map, max_sentence_len, batch_size=50, batch_index=0):
    tail=batch_size*(batch_index+1)
    assert tail<=len(revs)
    embedded=np.zeros(shape=(batch_size, max_sentence_len, 300), dtype=np.float64)
    labels=np.zeros(shape=batch_size, dtype=np.int32)
    k=0
    for i in range(tail-batch_size, tail):
        sentence_1hot=sentence2_1hot(revs[i]['text'], word_idx_map, max_sentence_len)
        for j in range(len(sentence_1hot)):
            embedded[k][j][:]=W[sentence_1hot[j]]
        labels[k]=int(revs[i]['y']*0.25)
        k+=1
    return embedded, labels

In [8]:
#####################################################
#Define a DNN class that can be used in sklearn randomized search
#####################################################
from tensorflow.contrib.layers import batch_norm
from tensorflow.contrib.layers import fully_connected
import tensorflow as tf 
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import time
from sklearn.base import BaseEstimator, TransformerMixin
he_init = tf.contrib.layers.variance_scaling_initializer()
class My_DNN(BaseEstimator, TransformerMixin):
    def __init__(self, data, W, n_input=30*300, n_output=2, \
                 n_neurons=140, n_layers=5, learning_rate=0.01, n_epoches=1000,\
                 batch_size=200,batch_norm_momentum=0.99, dropout_rate=0.5, \
                 activation=tf.nn.elu, initializer=he_init):
        self.W=W #word2vec
        self.data=data
        self.n_input=n_input
        self.n_output=n_output
        self.n_neurons=n_neurons
        self.n_layers=n_layers
        self.learning_rate=learning_rate
        self.n_epoches=n_epoches
        self.batch_size=batch_size
        self.batch_norm_momentum=batch_norm_momentum
        self.dropout_rate=dropout_rate
        self.activation=activation
        self.initializer=initializer
        self._session = None
    
    def build_hidden_layers(self, input):
        is_training=tf.placeholder_with_default(False, shape=(), name='training')
        for layer in range(self.n_layers-1):
            if(self.dropout_rate!=None):
                input=tf.layers.dropout(input, self.dropout_rate, training=is_training)
            input=tf.layers.dense(input, self.n_neurons, activation=self.activation,\
                                          kernel_initializer=self.initializer)
            if(self.batch_norm_momentum!=None):
                input=tf.layers.batch_normalization(input, momentum=self.batch_norm_momentum, training=is_training)
        self.is_training=is_training
        return input
    
    def build_graph(self):
        X=tf.placeholder(tf.float32, shape=(None, self.n_input), name='tweets')
        y=tf.placeholder(tf.int64, shape=(None), name='sentiment')       
        output=self.build_hidden_layers(X)
        logits=tf.layers.dense(output, self.n_output, activation=tf.nn.softmax, kernel_initializer=self.initializer)
        xentropy=tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss=tf.reduce_mean(xentropy, name='loss')
        optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        training_op=optimizer.minimize(loss)
        correct=tf.nn.in_top_k(logits, y, 1)
        accuracy=tf.reduce_mean(tf.cast(correct, tf.float32))
        Y_prob = tf.nn.softmax(logits, name="Y_proba")
        init=tf.global_variables_initializer()
        saver = tf.train.Saver()
        # Make the important operations available easily through instance variables
        self._X, self._y = X, y
        self._loss, self.Y_prob = loss, Y_prob
        self._training_op, self._accuracy = training_op, accuracy
        self._init, self._saver = init, saver
        
    def fit(self, X_validation, y_validation):
        n_batches=len(self.data)//self.batch_size
        best_validation_loss=np.infty
        max_checks_no_progress=20
        check_without_process=0
        
        start=time.time()
        self.build_graph()
        self._session = tf.Session()
        with self._session.as_default() as sess:
            self._init.run()
            #revs, W, word_idx_map, max_sentence_len, batch_size=50, batch_index=1
            for epoch in range(self.n_epoches):
                for batch in range(1, n_batches):
                    X_batch, y_batch=embedding_batch(self.data, self.W, word_idx_map, 30, \
                                            batch_size=self.batch_size, batch_index=batch)
                    sess.run(self._training_op, feed_dict={self.is_training:True, self._X:X_batch.reshape(self.batch_size,self.n_input), self._y:y_batch.reshape(self.batch_size)})
                batch_training_accuracy=self._accuracy.eval(feed_dict={self.is_training:True, self._X:X_batch.reshape(self.batch_size,self.n_input), self._y:y_batch.reshape(self.batch_size)})
                validation_accuracy=self._accuracy.eval(feed_dict={self.is_training:False, self._X:X_validation.reshape(-1,self.n_input), self._y:y_validation.reshape(-1)})
                validation_loss=self._loss.eval(feed_dict={self.is_training:False, self._X:X_validation.reshape(-1,self.n_input), self._y:y_validation.reshape(-1)})
                print("Epoch:", epoch, "training:", batch_training_accuracy, "validation:", validation_accuracy, \
                      "valid_loss", validation_loss, "best_loss", best_validation_loss)
                if(validation_loss<best_validation_loss):
                    save_path = self._saver.save(sess, "./dnn_sentiment.ckpt")
                    best_validation_loss=validation_loss
                    check_without_process=0
                else:
                    check_without_process+=1
                    if check_without_process>=max_checks_no_progress:
                        print("Early stopping!")
                        break;
        end=time.time()
        print("Training completed in {} minutes".format((time.time()-start)/60))
        
    def predict_proba(self, X):
#         if not self._session:
#             raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
        #with tf.Session() as sess:
        with self._session.as_default() as sess:
        #self._init.run()
            return self.Y_prob.eval(feed_dict={self._X: X})

    def predict(self, X):
        class_indices = np.argmax(self.predict_proba(X.reshape(-1, self.n_input)), axis=1)
        return np.array(class_indices)

In [None]:
# dnn_clf=My_DNN(W=word2vec, n_neurons=100, n_layers=4, learning_rate=0.01, n_epoches=300,\
#                batch_size=64, batch_norm_momentum=0.99, dropout_rate=0.5)
# dnn_clf.fit(revs)
a=np.arange(0,200).reshape(10,20)
a.reshape(-1).shape

(200,)

In [None]:
from sklearn.model_selection import GridSearchCV
import time

X_validation, y_validation=embedding_batch(valid, word2vec, word_idx_map, 30, \
                                            batch_size=5000, batch_index=0)

del valid

params = {
    "n_neurons": [100, 150, 250],
    "batch_size": [64, 128],
    "learning_rate": [0.01, 0.001],
#     "activation": [tf.nn.relu, tf.nn.elu],
    # you could also try exploring different numbers of hidden layers, different optimizers, etc.
    "n_layers": [3,5,7,9],
#     "batch_norm_momentum": [0.99, None],
#     "dropout_rate":[0.4, 0.5, 0.6, None],
    #"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
}
start=time.time()
rnd_dnn_clf=My_DNN(data=train, W=word2vec, n_epoches=500, batch_norm_momentum=0.99, dropout_rate=0.5)

rnd_search = GridSearchCV(rnd_dnn_clf, param_grid = params, scoring='f1', verbose=2)
rnd_search.fit(X_validation, y_validation)
print("*******************************************************")
print("Complete in {} minutes".format((time.time()-start)/60))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] learning_rate=0.01, n_layers=3, n_neurons=100, batch_size=64 ....
('Epoch:', 0, 'training:', 0.734375, 'validation:', 0.63126314, 'valid_loss', 0.67676961, 'best_loss', inf)
('Epoch:', 1, 'training:', 0.65625, 'validation:', 0.66336632, 'valid_loss', 0.64863241, 'best_loss', 0.67676961)
('Epoch:', 2, 'training:', 0.71875, 'validation:', 0.67656767, 'valid_loss', 0.63635153, 'best_loss', 0.64863241)
('Epoch:', 3, 'training:', 0.6875, 'validation:', 0.67626762, 'valid_loss', 0.63609034, 'best_loss', 0.63635153)
('Epoch:', 4, 'training:', 0.734375, 'validation:', 0.69276929, 'valid_loss', 0.61935794, 'best_loss', 0.63609034)
('Epoch:', 5, 'training:', 0.78125, 'validation:', 0.66726673, 'valid_loss', 0.64603835, 'best_loss', 0.61935794)
('Epoch:', 6, 'training:', 0.78125, 'validation:', 0.6870687, 'valid_loss', 0.62603199, 'best_loss', 0.61935794)
('Epoch:', 7, 'training:', 0.828125, 'validation:', 0.66666669, 'valid_loss'

('Epoch:', 71, 'training:', 0.828125, 'validation:', 0.72757274, 'valid_loss', 0.58568579, 'best_loss', 0.57050318)
('Epoch:', 72, 'training:', 0.828125, 'validation:', 0.73537356, 'valid_loss', 0.57788932, 'best_loss', 0.57050318)
('Epoch:', 73, 'training:', 0.828125, 'validation:', 0.72427243, 'valid_loss', 0.58892149, 'best_loss', 0.57050318)
('Epoch:', 74, 'training:', 0.875, 'validation:', 0.72997302, 'valid_loss', 0.5832898, 'best_loss', 0.57050318)
('Epoch:', 75, 'training:', 0.859375, 'validation:', 0.72607261, 'valid_loss', 0.58719027, 'best_loss', 0.57050318)
('Epoch:', 76, 'training:', 0.90625, 'validation:', 0.73087311, 'valid_loss', 0.58238971, 'best_loss', 0.57050318)
('Epoch:', 77, 'training:', 0.734375, 'validation:', 0.72817284, 'valid_loss', 0.5850895, 'best_loss', 0.57050318)
('Epoch:', 78, 'training:', 0.921875, 'validation:', 0.72787279, 'valid_loss', 0.58546072, 'best_loss', 0.57050318)
('Epoch:', 79, 'training:', 0.78125, 'validation:', 0.73327333, 'valid_loss', 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 60.3min remaining:    0.0s


[CV] learning_rate=0.01, n_layers=3, n_neurons=100, batch_size=64 ....
('Epoch:', 0, 'training:', 0.640625, 'validation:', 0.65346533, 'valid_loss', 0.65339291, 'best_loss', inf)
('Epoch:', 1, 'training:', 0.78125, 'validation:', 0.64146417, 'valid_loss', 0.66925889, 'best_loss', 0.65339291)
('Epoch:', 2, 'training:', 0.71875, 'validation:', 0.65166515, 'valid_loss', 0.66142589, 'best_loss', 0.65339291)
('Epoch:', 3, 'training:', 0.734375, 'validation:', 0.66216624, 'valid_loss', 0.65095192, 'best_loss', 0.65339291)
('Epoch:', 4, 'training:', 0.6875, 'validation:', 0.68976897, 'valid_loss', 0.62278867, 'best_loss', 0.65095192)
('Epoch:', 5, 'training:', 0.71875, 'validation:', 0.69216919, 'valid_loss', 0.62146169, 'best_loss', 0.62278867)
('Epoch:', 6, 'training:', 0.71875, 'validation:', 0.68526852, 'valid_loss', 0.6275757, 'best_loss', 0.62146169)
('Epoch:', 7, 'training:', 0.78125, 'validation:', 0.68586856, 'valid_loss', 0.62710583, 'best_loss', 0.62146169)
('Epoch:', 8, 'training:

('Epoch:', 71, 'training:', 0.78125, 'validation:', 0.74797481, 'valid_loss', 0.56528801, 'best_loss', 0.55898738)
('Epoch:', 72, 'training:', 0.875, 'validation:', 0.74617463, 'valid_loss', 0.56688595, 'best_loss', 0.55898738)
('Epoch:', 73, 'training:', 0.796875, 'validation:', 0.74347436, 'valid_loss', 0.56977868, 'best_loss', 0.55898738)
('Epoch:', 74, 'training:', 0.734375, 'validation:', 0.75277525, 'valid_loss', 0.56048417, 'best_loss', 0.55898738)
('Epoch:', 75, 'training:', 0.8125, 'validation:', 0.74617463, 'valid_loss', 0.56700277, 'best_loss', 0.55898738)
('Epoch:', 76, 'training:', 0.8125, 'validation:', 0.74947494, 'valid_loss', 0.56378782, 'best_loss', 0.55898738)
('Epoch:', 77, 'training:', 0.875, 'validation:', 0.75097507, 'valid_loss', 0.56228781, 'best_loss', 0.55898738)
('Epoch:', 78, 'training:', 0.828125, 'validation:', 0.75337535, 'valid_loss', 0.55988753, 'best_loss', 0.55898738)
('Epoch:', 79, 'training:', 0.8125, 'validation:', 0.75697571, 'valid_loss', 0.5563

('Epoch:', 32, 'training:', 0.765625, 'validation:', 0.74565089, 'valid_loss', 0.56761217, 'best_loss', 0.56145895)
('Epoch:', 33, 'training:', 0.765625, 'validation:', 0.74475104, 'valid_loss', 0.56844676, 'best_loss', 0.56145895)
('Epoch:', 34, 'training:', 0.859375, 'validation:', 0.74775046, 'valid_loss', 0.56552613, 'best_loss', 0.56145895)
('Epoch:', 35, 'training:', 0.796875, 'validation:', 0.73935211, 'valid_loss', 0.57392704, 'best_loss', 0.56145895)
('Epoch:', 36, 'training:', 0.703125, 'validation:', 0.74895018, 'valid_loss', 0.56431228, 'best_loss', 0.56145895)
('Epoch:', 37, 'training:', 0.875, 'validation:', 0.74325132, 'valid_loss', 0.56982404, 'best_loss', 0.56145895)
('Epoch:', 38, 'training:', 0.8125, 'validation:', 0.74895018, 'valid_loss', 0.56431252, 'best_loss', 0.56145895)
('Epoch:', 39, 'training:', 0.75, 'validation:', 0.75074983, 'valid_loss', 0.56226897, 'best_loss', 0.56145895)
('Epoch:', 40, 'training:', 0.828125, 'validation:', 0.74715054, 'valid_loss', 0.

('Epoch:', 29, 'training:', 0.75, 'validation:', 0.70657068, 'valid_loss', 0.60668981, 'best_loss', 0.59415317)
('Epoch:', 30, 'training:', 0.671875, 'validation:', 0.71407139, 'valid_loss', 0.59919143, 'best_loss', 0.59415317)
('Epoch:', 31, 'training:', 0.828125, 'validation:', 0.70537055, 'valid_loss', 0.60789198, 'best_loss', 0.59415317)
('Epoch:', 32, 'training:', 0.734375, 'validation:', 0.70627064, 'valid_loss', 0.60699189, 'best_loss', 0.59415317)
('Epoch:', 33, 'training:', 0.765625, 'validation:', 0.70207024, 'valid_loss', 0.61119676, 'best_loss', 0.59415317)
('Epoch:', 34, 'training:', 0.796875, 'validation:', 0.71287131, 'valid_loss', 0.60043293, 'best_loss', 0.59415317)
('Epoch:', 35, 'training:', 0.78125, 'validation:', 0.70537055, 'valid_loss', 0.60789186, 'best_loss', 0.59415317)
('Epoch:', 36, 'training:', 0.796875, 'validation:', 0.71287131, 'valid_loss', 0.60039127, 'best_loss', 0.59415317)
('Epoch:', 37, 'training:', 0.75, 'validation:', 0.7050705, 'valid_loss', 0.6

('Epoch:', 30, 'training:', 0.84375, 'validation:', 0.70837086, 'valid_loss', 0.60489178, 'best_loss', 0.59517485)
('Epoch:', 31, 'training:', 0.8125, 'validation:', 0.72187221, 'valid_loss', 0.59127206, 'best_loss', 0.59517485)
('Epoch:', 32, 'training:', 0.6875, 'validation:', 0.69456947, 'valid_loss', 0.61870182, 'best_loss', 0.59127206)
('Epoch:', 33, 'training:', 0.796875, 'validation:', 0.70357037, 'valid_loss', 0.6096921, 'best_loss', 0.59127206)
('Epoch:', 34, 'training:', 0.71875, 'validation:', 0.70987099, 'valid_loss', 0.60339159, 'best_loss', 0.59127206)
('Epoch:', 35, 'training:', 0.859375, 'validation:', 0.71587157, 'valid_loss', 0.59749943, 'best_loss', 0.59127206)
('Epoch:', 36, 'training:', 0.78125, 'validation:', 0.72307229, 'valid_loss', 0.59016055, 'best_loss', 0.59127206)
('Epoch:', 37, 'training:', 0.765625, 'validation:', 0.70297033, 'valid_loss', 0.61029232, 'best_loss', 0.59016055)
('Epoch:', 38, 'training:', 0.796875, 'validation:', 0.72097212, 'valid_loss', 0