In [1]:
%matplotlib inline
import utils ; reload(utils)
from utils import *
from __future__ import division, print_function

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5103)
Using Theano backend.


In [2]:
model_path = "../data/imdb/models/"

## Setting up of Data 

In [3]:
from keras.datasets import imdb
idx = imdb.get_word_index()

In [4]:
#sort the dict and get the first ten values
idx_arr = sorted(idx,key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [5]:
idx2word = {v:k for k,v in idx.iteritems()}

In [6]:
#download the reviews using code copied from keras.datasets
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [7]:
len(x_train)

25000

In [8]:
#print the first sentence
', '.join(map(str, x_train[0]))

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [9]:
#the first sentence with label
' '.join([idx2word[i] for i in x_train[0]])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [10]:
labels_train[:20]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [11]:
## Reduce vocab_size 
vocab_size = 5000

trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [12]:
lens = np.array(map(len,trn))
(lens.max(), lens.min(),lens.mean())

(2493, 10, 237.71364)

In [13]:
#pad with 0 
seq_len = 500
trn = sequence.pad_sequences(trn, maxlen = seq_len, value=0 )
test = sequence.pad_sequences(test, maxlen = seq_len, value=0 )

In [14]:
trn.shape

(25000, 500)

## Modeling.

single layer NN

In [15]:
#note - 32 is the feature mapping, hence 5000 * 32 = 160000 parameters 
model = Sequential([
        Embedding(vocab_size, 32, input_length = seq_len), 
        Flatten(),
        Dense(100,activation='relu'),
        Dropout(0.7),
        Dense(1,activation='sigmoid')
    ])

In [16]:
model.compile(loss = 'binary_crossentropy',optimizer=Adam(), metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 16000)         0           embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 100)           1600100     flatten_1[0][0]                  
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    
___________________________________________________________________________________________

In [17]:
model.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=2,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
2s - loss: 0.4689 - acc: 0.7459 - val_loss: 0.2957 - val_acc: 0.8736
Epoch 2/2
2s - loss: 0.2045 - acc: 0.9216 - val_loss: 0.3193 - val_acc: 0.8711


<keras.callbacks.History at 0x7fd0d11c9450>

## Conv with max pooling 

taking adv of ordered data

In [18]:
conv1 = Sequential([
        Embedding(vocab_size, 32, input_length= seq_len, dropout=0.2),
        Dropout(0.2),
        Convolution1D(64,5,border_mode='same',activation='relu'),
        Dropout(0.2),
        MaxPooling1D(),
        Flatten(),
        Dense(100,activation='relu'),
        Dropout(0.7),
        Dense(1,activation='sigmoid')
    ])

In [19]:
conv1.compile(loss = 'binary_crossentropy', optimizer= Adam(), metrics=['accuracy'])

In [20]:
conv1.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=4,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
7s - loss: 0.5242 - acc: 0.7100 - val_loss: 0.3207 - val_acc: 0.8628
Epoch 2/4
7s - loss: 0.3042 - acc: 0.8772 - val_loss: 0.2689 - val_acc: 0.8893
Epoch 3/4
7s - loss: 0.2593 - acc: 0.8979 - val_loss: 0.2561 - val_acc: 0.8948
Epoch 4/4
7s - loss: 0.2331 - acc: 0.9106 - val_loss: 0.2535 - val_acc: 0.8961


<keras.callbacks.History at 0x7fd0c723c350>

In [21]:
conv1.save_weights(model_path + 'conv1.h5')

In [22]:
conv1.load_weights(model_path + 'conv1.h5')

## Pre-trained vectors 

Replicate previous CNN, but with pre-trained word embeddings

In [23]:
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [24]:
vecs,words,wordidx = load_vectors("../data/glove/results/6B.50d")

The glove word ids and imdb word ids use different indexes. So we create a simple function that creates an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist).

In [25]:
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [26]:
emb = create_emb()

In [35]:
#pass embedding matrix to the keras embedding, and set to non trainable
model = Sequential([
        Embedding(vocab_size,50,input_length=seq_len, dropout=0.2, weights=[emb], trainable = True),
        Dropout(0.25),
        Convolution1D(64,5,border_mode='same',activation='relu'),
        Dropout(0.25),
        MaxPooling1D(),
        Flatten(),
        Dense(100,activation='relu'),
        Dropout(0.7),
        Dense(1,activation='sigmoid')
    ])

In [36]:
model.compile(loss = 'binary_crossentropy',optimizer=Adam(), metrics=['accuracy'])

In [37]:
model.optimizer.lr.eval()

CudaNdarray(0.0010000000475)

In [38]:
model.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=3,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
9s - loss: 0.4856 - acc: 0.7512 - val_loss: 0.3075 - val_acc: 0.8753
Epoch 2/3
8s - loss: 0.3042 - acc: 0.8756 - val_loss: 0.2734 - val_acc: 0.8988
Epoch 3/3
8s - loss: 0.2758 - acc: 0.8888 - val_loss: 0.2546 - val_acc: 0.8970


<keras.callbacks.History at 0x7fd0b546d7d0>

Decrease the learning rates

In [39]:
#model.layers[0].trainable = True
model.optimizer.lr = 1e-4
model.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=5,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
8s - loss: 0.2553 - acc: 0.8986 - val_loss: 0.2598 - val_acc: 0.8954
Epoch 2/5
8s - loss: 0.2372 - acc: 0.9070 - val_loss: 0.2598 - val_acc: 0.8986
Epoch 3/5
8s - loss: 0.2253 - acc: 0.9094 - val_loss: 0.2473 - val_acc: 0.9001
Epoch 4/5
8s - loss: 0.2147 - acc: 0.9142 - val_loss: 0.2493 - val_acc: 0.8982
Epoch 5/5
8s - loss: 0.2045 - acc: 0.9177 - val_loss: 0.2479 - val_acc: 0.8999


<keras.callbacks.History at 0x7fd0b546da10>

In [40]:
model.save_weights(model_path+'glove50.h5')

## Multi-size CNN

In [41]:
from keras.layers import Merge

In [42]:
graph_in = Input((vocab_size,50))
convs = []
for fsz in range(3,6):
    x = Convolution1D(64,fsz, border_mode='same',activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)

out = Merge(mode = "concat")(convs)
graph = Model(graph_in,out)

In [43]:
emb = create_emb()

In [85]:
model = Sequential([
    Embedding(vocab_size,50,input_length=seq_len , dropout=0.2, weights = [emb]),
    Dropout(0.2),
    graph,
    Dropout(0.5),
    Dense(100,activation='relu'),
    Dropout(0.7),
    Dense(1,activation='sigmoid')
])

In [86]:
model.compile(loss = 'binary_crossentropy', optimizer= Adam() ,metrics = ['accuracy'])

In [87]:
model.optimizer.lr.eval()

CudaNdarray(0.0010000000475)

In [88]:
model.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=2,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
22s - loss: 0.4003 - acc: 0.8194 - val_loss: 0.2937 - val_acc: 0.8950
Epoch 2/2
22s - loss: 0.2889 - acc: 0.8823 - val_loss: 0.2680 - val_acc: 0.9000


<keras.callbacks.History at 0x7fd09d12b790>

In [89]:
model.layers[0].trainable = False
model.optimizer.lr = 1e-5

In [90]:
model.fit(trn,labels_train, validation_data=(test,labels_test),nb_epoch=3,batch_size=64, verbose =2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
22s - loss: 0.2521 - acc: 0.8985 - val_loss: 0.2646 - val_acc: 0.8948
Epoch 2/3
22s - loss: 0.2333 - acc: 0.9064 - val_loss: 0.2523 - val_acc: 0.9018
Epoch 3/3
22s - loss: 0.2139 - acc: 0.9153 - val_loss: 0.2520 - val_acc: 0.8983


<keras.callbacks.History at 0x7fd09d12bfd0>

## LSTM

Next week lesson - running for now 

In [91]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True,
              W_regularizer=l2(1e-6), dropout=0.2),
    LSTM(100, consume_less='gpu'),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_14 (Embedding)         (None, 500, 32)       160000      embedding_input_14[0][0]         
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           53200       embedding_14[0][0]               
____________________________________________________________________________________________________
dense_27 (Dense)                 (None, 1)             101         lstm_1[0][0]                     
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
____________________________________________________________________________________________________


In [92]:
model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=5, batch_size=64, verbose = 2)


Train on 25000 samples, validate on 25000 samples
Epoch 1/5
134s - loss: 0.5605 - acc: 0.7006 - val_loss: 0.3353 - val_acc: 0.8566
Epoch 2/5
134s - loss: 0.3509 - acc: 0.8535 - val_loss: 0.3445 - val_acc: 0.8548
Epoch 3/5
134s - loss: 0.3084 - acc: 0.8752 - val_loss: 0.3107 - val_acc: 0.8735
Epoch 4/5
134s - loss: 0.2778 - acc: 0.8893 - val_loss: 0.3727 - val_acc: 0.8605
Epoch 5/5
134s - loss: 0.2677 - acc: 0.8944 - val_loss: 0.3222 - val_acc: 0.8658


<keras.callbacks.History at 0x7fd08e482710>