In [1]:
%run env_setup.py
import keras
import numpy as np
from importlib import reload
from lessdeep.datasets.stanford import imdb
import lessdeep as ld

Using TensorFlow backend.


In [40]:
# Re-execute the dataset module (importlib.reload), then fetch its
# word -> integer-index vocabulary.
reload(imdb)
idx = imdb.get_word_index()

# Vocabulary words ordered by ascending index value; show the first ten.
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i']

In [41]:
# Inverse vocabulary: integer index -> word.
idx2word = dict((index, word) for word, index in idx.items())

In [43]:
# Load the reviews; each split unpacks into a (sequences, labels) pair.
train_split, test_split = imdb.load_data()
x_train, labels_train = train_split
x_test, labels_test = test_split

Use pre-calculated vocabulary


In [44]:
def show_sentence(arr):
    """Decode a sequence of word indices into a space-separated string.

    Every index in ``arr`` is looked up in the notebook-level ``idx2word``
    mapping; an index that is missing from it raises ``KeyError``.
    """
    words = (idx2word[index] for index in arr)
    return ' '.join(words)

In [45]:
# Sanity check: decode the first training review back into words.
show_sentence(x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high 's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i m here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it is n't"

In [22]:
# Peek at the first ten training labels.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [46]:
# Number of distinct token ids kept; every id >= vocab_size - 1 is mapped onto
# the single id vocab_size - 1.
vocab_size = 5000

trn, tst = (
    [np.array([min(token, vocab_size - 1) for token in review]) for review in split]
    for split in (x_train, x_test)
)

In [47]:
# Token count of every training review, then its shortest / mean / longest.
trn_lens = np.array([len(review) for review in trn])
[stat() for stat in (trn_lens.min, trn_lens.mean, trn_lens.max)]

[10, 242.54411999999999, 2527]

In [48]:
# Token id used as filler: passed as `value=` to pad_sequences, and the
# embedding row at this index is re-initialised inside create_emb.
pad_value = 0

In [49]:
from keras.preprocessing.sequence import pad_sequences

# Common length every review is brought to by pad_sequences.
seq_max_len = 600

trn = pad_sequences(
    sequences=trn,
    maxlen=seq_max_len,
    value=pad_value,
)
tst = pad_sequences(
    sequences=tst,
    maxlen=seq_max_len,
    value=pad_value,
)

In [50]:
# Shapes of the padded train and test matrices, one per line.
print(trn.shape, tst.shape, sep='\n')

(25000, 600)
(25000, 600)


## Simple NN

In [12]:
# Width of each learned word-embedding vector.
word_features = 32

# Baseline classifier: embed every token, flatten the per-review grid of
# embeddings and classify it through one hidden layer.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, output_dim=word_features, input_length=seq_max_len),
    keras.layers.Flatten(),
    # Dense defaults to a linear activation. Without a non-linearity here the
    # two Dense layers collapse into a single linear map in front of the
    # sigmoid, so use ReLU like the hidden layer of the CNN models does.
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dropout(0.7),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 19200)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1920100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 2,080,201
Trainable params: 2,080,201
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the baseline for two epochs, evaluating on the test split after each
# one; ld.utils.tf_board() supplies the callback (presumably TensorBoard logging).
model.fit(
    x=trn,
    y=labels_train,
    validation_data=(tst, labels_test),
    epochs=2,
    batch_size=64,
    callbacks=[ld.utils.tf_board()],
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2a5f3dd2a58>

## CNN
Convolutional neural networks work well on sequence data.

In [28]:
# Width of each learned word-embedding vector.
word_features = 32

# 1-D convolutional classifier, assembled layer by layer.
cnn_model = keras.models.Sequential()
cnn_model.add(keras.layers.Embedding(vocab_size, output_dim=word_features, input_length=seq_max_len))
cnn_model.add(keras.layers.Dropout(0.5))
cnn_model.add(keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu'))
cnn_model.add(keras.layers.MaxPool1D())
cnn_model.add(keras.layers.Flatten())
cnn_model.add(keras.layers.Dropout(0.5))
# (A BatchNormalization layer at this position is left disabled.)
cnn_model.add(keras.layers.Dense(100, activation='relu'))
cnn_model.add(keras.layers.Dropout(0.7))
cnn_model.add(keras.layers.Dense(1, activation='sigmoid'))

cnn_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 600, 32)           160000    
_________________________________________________________________
dropout_20 (Dropout)         (None, 600, 32)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 596, 32)           5152      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 298, 32)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 9536)              0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 9536)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 100)               953700    
__________

In [29]:
# Train the CNN for eight epochs with shuffling, evaluating on the test split;
# the callback from ld.utils.tf_board is created with the tag 'cnn'.
cnn_model.fit(
    x=trn,
    y=labels_train,
    validation_data=(tst, labels_test),
    shuffle=True,
    epochs=8,
    batch_size=128,
    callbacks=[ld.utils.tf_board('cnn')],
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x2a62d3135c0>

In [16]:
# Debug aid: reload the lessdeep package and then its utils submodule, printing
# tf_board before and after so the change of function object is visible.
# Order matters: `ld.utils` is looked up only after `ld` itself was reloaded.
print(ld.utils.tf_board)
reload(ld)
reload(ld.utils)
print(ld.utils.tf_board)

<function tf_board at 0x000002A5D61BB1E0>
<function tf_board at 0x000002A5D61BB158>


## Use Pre-Trained Embeddings

In [12]:
from lessdeep.utils.word_vec import glove

# Pre-trained GloVe data: the word list and the vectors (presumably
# row-aligned with the words -- TODO confirm against lessdeep).
g_words, g_vecs = glove()

# Length of one GloVe vector, and a word -> position lookup into g_words.
g_features = len(g_vecs[0])
g_word2idx = dict((word, row) for row, word in enumerate(g_words))

In [54]:
def create_emb(vecs, word2idx, old_vocab, pad_index=None, scale=0.6, rng=None, verbose=True):
    """Build an embedding matrix for ``old_vocab`` out of pre-trained vectors.

    Row ``i`` of the result holds the pre-trained vector of ``old_vocab[i]``
    when that word is a key of ``word2idx``; otherwise the row is drawn from a
    zero-mean normal distribution. The row at ``pad_index`` is always replaced
    by a random draw afterwards.

    NOTE(review): rows follow the *position* of a word in ``old_vocab``. If the
    token ids fed to the Embedding layer are 1-based while ``old_vocab`` is a
    0-based list, every row is shifted by one -- confirm against the dataset.

    Parameters
    ----------
    vecs : indexable of equal-length vectors
        Pre-trained vectors; ``vecs[word2idx[w]]`` is the vector of word ``w``.
        The result's dtype is the type of ``vecs[0][0]``.
    word2idx : mapping
        Word -> row position inside ``vecs``.
    old_vocab : sequence of str
        Words of the target vocabulary, one per output row.
    pad_index : int, optional
        Row to overwrite with a random vector. ``None`` (the default) falls
        back to the notebook-level ``pad_value``, which is what this function
        always used before the parameter existed.
    scale : float, optional
        Standard deviation of the random vectors (default ``0.6``).
    rng : object with a ``normal(scale=..., size=...)`` method, optional
        Random source, e.g. ``np.random.RandomState(seed)``, for reproducible
        matrices. ``None`` uses the global ``np.random`` state.
    verbose : bool, optional
        Print one line per word that is missing from ``word2idx``
        (default ``True``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(old_vocab), len(vecs[0]))``.

    Raises
    ------
    IndexError
        If ``vecs`` is empty or ``pad_index`` is outside the result.
    """
    if pad_index is None:
        # Backward-compatible default: rely on the notebook global.
        pad_index = pad_value
    sampler = np.random if rng is None else rng

    feature_num = len(vecs[0])
    res_emb = np.empty((len(old_vocab), feature_num), dtype=type(vecs[0][0]))
    for i, word in enumerate(old_vocab):
        if word in word2idx:
            res_emb[i, :] = vecs[word2idx[word]]
        else:
            if verbose:
                print('[{0}]word: '.format(i) + word + ' not inside new vocabulary')
            res_emb[i, :] = sampler.normal(scale=scale, size=(feature_num,))

    # The padding token has no meaningful pre-trained vector of its own.
    res_emb[pad_index, :] = sampler.normal(scale=scale, size=(feature_num,))

    return res_emb

# Embedding matrix for the first vocab_size words of idx_arr, filled from GloVe.
g_emb = create_emb(
    vecs=g_vecs,
    word2idx=g_word2idx,
    old_vocab=idx_arr[:vocab_size],
)

In [68]:
# GloVe matrix divided by 3 before it is loaded into the frozen Embedding layer.
glove_weights = g_emb / 3

# Same CNN layout as before, but on top of fixed pre-trained embeddings and
# with 'same' padding in the convolution.
glove_cnn_model = keras.models.Sequential(layers=[
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=g_features,
        input_length=seq_max_len,
        weights=[glove_weights],
        trainable=False,
    ),
    keras.layers.Dropout(0.1),
    keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu', padding='same'),
    keras.layers.MaxPool1D(),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.5),
    # (A BatchNormalization layer at this position is left disabled.)
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dropout(0.7),
    keras.layers.Dense(1, activation='sigmoid'),
])
glove_cnn_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
glove_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 600, 50)           250000    
_________________________________________________________________
dropout_25 (Dropout)         (None, 600, 50)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 600, 32)           8032      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 300, 32)           0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 9600)              0         
_________________________________________________________________
dense_17 (Dense)             (None, 100)               960100    
__________

In [72]:
# Train the GloVe-initialised CNN for eight epochs with shuffling, evaluating
# on the test split; the ld.utils.tf_board callback is tagged 'glv_cnn'.
glove_cnn_model.fit(
    x=trn,
    y=labels_train,
    validation_data=(tst, labels_test),
    shuffle=True,
    epochs=8,
    batch_size=128,
    callbacks=[ld.utils.tf_board('glv_cnn')],
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x25640a99cf8>

TODO: Resume from the last completed epoch in both TensorBoard logging and training, instead of starting again at epoch 1 on every `fit` call.