In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.datasets import imdb
# Load the IMDB review dataset as (sequences, labels) pairs for the train and
# test splits. Per the Keras docs, num_words=1000 keeps only the 1000 most
# frequent words; rarer words are replaced by the out-of-vocabulary id.
# NOTE(review): the later `vectorize` call uses dimension=10000 and the first
# Dense layer uses input_shape=(10000,), so most input columns stay zero with
# this vocabulary size -- confirm whether 1000 or 10000 was intended.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=1000)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [3]:
# Pool the predefined train and test splits into single arrays; the notebook
# makes its own train/test split further down.
data = np.concatenate([training_data, testing_data])
targets = np.concatenate([training_targets, testing_targets])

**Exploring the Data**

In [4]:
# Distinct label values present in the pooled targets.
print('Categories : ', np.unique(targets))
# Flatten all reviews into one long id array and count the distinct word ids.
print('Number of unique words : ', np.unique(np.hstack(data)).size)

Categories :  [0 1]
Number of unique words :  998


In [5]:
# Number of word ids in each review.
length = list(map(len, data))
print('Average review length : ', np.mean(length))
print('Standard deviation : ', round(np.std(length)))

Average review length :  234.75892
Standard deviation :  173.0


In [6]:
# Show the label attached to the first review in the pooled dataset.
print('Label : ', targets[0])

Label :  1


In [7]:
# Show the first review in its raw encoded form: a list of integer word ids.
print(data[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]


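**Retrieve the dictionary that maps word indices back to the original words so that we can read the review. The encoded indices are shifted by 3 because the lowest ids are reserved for special tokens (padding, start of sequence, unknown word); every unknown word is replaced with a “#”.**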

In [8]:
# Keras provides a word -> integer id dictionary; invert it so that ids can be
# looked up as words.
index = imdb.get_word_index()
reverse_index = {word_id: word for word, word_id in index.items()}
# Encoded ids are looked up with an offset of 3 (see `index_from` in the Keras
# `imdb.load_data` docs); ids without a dictionary entry are shown as "#".
decoded = " ".join(reverse_index.get(token - 3, "#") for token in data[0])
print(decoded)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
# this film was just brilliant casting # # story direction # really # the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same # # as myself so i loved the fact there was a real # with this film the # # throughout the film were great it was just brilliant so much that i # the film as soon as it was released for # and would recommend it to everyone to watch and the # # was amazing really # at the end it was so sad and you know what they say if you # at a film it must have been good and this definitely was also # to the two little # that played the # of # and paul they were just brilliant children are often left out of the # # i think because the stars that play them all # up are such a big # for the whole film but these children are amazing and should be # for what they have done don't you think the whole story was 

### DATA PREPARATION

In [9]:
def vectorize(sequences, dimension=10000, dtype=np.float64):
    """Multi-hot encode integer sequences into a dense 2-D array.

    Row ``i`` of the result holds a 1 in every column whose index occurs in
    ``sequences[i]`` and 0 everywhere else. An index that is repeated within
    one sequence still yields a single 1 (presence is recorded, not counts).

    Parameters
    ----------
    sequences : sized iterable of integer sequences
        One sequence of column indices per output row. Must support ``len``.
    dimension : int, default 10000
        Number of columns of the result; every index must be smaller than it.
    dtype : data-type, default numpy.float64
        Element type of the returned array. The default keeps the historical
        float64 output; a narrower type such as ``"float32"`` halves the
        memory needed for large inputs.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)`` containing 0s and 1s.

    Raises
    ------
    IndexError
        If a sequence contains an index outside the valid column range.
    """
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # Integer-array indexing sets every listed column of row i at once.
        results[i, sequence] = 1
    return results

In [10]:
# Replace the variable-length id lists with the fixed-width 0/1 matrix.
# NOTE(review): this rebinds `data`, so running the cell a second time would
# feed the already-encoded matrix back into `vectorize`; re-run the notebook
# from the top instead of re-running this cell on its own.
data = vectorize(data)
# Labels as float32.
targets = np.array(targets, dtype="float32")

In [11]:
# Hold out the first 10,000 samples for testing; everything after them is
# used for training.
test_x, train_x = data[:10000], data[10000:]
test_y, train_y = targets[:10000], targets[10000:]

# Report the resulting sizes as a sanity check on the split.
print('data size : ', len(data))
print('targets size : ', len(targets))
print('test_x size : ', len(test_x), ', test_y size : ', len(test_y))
print('train_x size : ', len(train_x), ', train_y : ', len(train_y))

data size :  50000
targets size :  50000
test_x size :  10000 , test_y size :  10000
train_x size :  40000 , train_y :  40000


**Input - Layer**

In [12]:
# Start the network with a 50-unit ReLU layer that also declares the width of
# the encoded input vectors (10000 columns).
model = Sequential([
    layers.Dense(units=50, activation='relu', input_shape=(10000,)),
])

**Hidden - Layer**

In [13]:
# Two further 50-unit ReLU layers, each preceded by dropout (rates 0.3 and
# 0.2). Dropout's other arguments are left at their defaults.
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(units=50, activation='relu'))
model.add(layers.Dropout(rate=0.2))
model.add(layers.Dense(units=50, activation='relu'))

**Output - Layer**

In [14]:
# Single sigmoid unit: one output value in (0, 1) per review.
model.add(layers.Dense(units=1, activation='sigmoid'))

In [15]:
# Print the layer-by-layer overview (output shapes and parameter counts) of
# the model assembled in the cells above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 505,201
Trainable params: 505,201
Non-trainable params: 0
_________________________________________________________________


### COMPILE MODEL
We use the “adam” optimizer. The optimizer is the algorithm that changes the weights and biases during training. We also choose binary-crossentropy as loss (because we deal with binary classification) and accuracy as our evaluation metric.

In [16]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### TRAIN MODEL

We train our model with a batch_size of 500 and for only two epochs, because the model overfits if we train it longer. The batch size defines the number of samples that are propagated through the network at once, and an epoch is one iteration over the entire training data.

In [17]:
# Train for two passes over the training split in mini-batches of 500 and
# evaluate on the held-out split after every epoch. The returned object is
# kept in `results`.
results = model.fit(
    x=train_x,
    y=train_y,
    batch_size=500,
    epochs=2,
    validation_data=(test_x, test_y),
)

Train on 40000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


In [19]:
# Persist the trained model to an HDF5 file in the current working directory.
# NOTE(review): per the Keras docs `model.save` stores the whole model
# (architecture, weights and optimizer state), not only the weights as the
# file name suggests -- confirm whether `save_weights` was intended.
model.save('imdb_analytic_weight.h5')