In [1]:
# IMDB 数据集，包含来自互联网电影数据库（IMDB）的50000条严重两极分化的评论
from keras.datasets import imdb

Using TensorFlow backend.


In [3]:
# num_words=10000 keeps only the 10,000 most frequently occurring words
# of the training data.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000
)

In [5]:
# Invert the word -> index mapping so integer indices can be turned back into words.
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Indices are shifted by 3 because 0, 1 and 2 are reserved for "padding",
# "start of sequence" and "unknown"; indices with no matching word decode to '?'.
decoded_review = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in train_data[0]
)

In [8]:
# Show the first review as raw integer indices, then as decoded text.
print(train_data[0])
print(decoded_review)

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
? this film was just brilliant casting location scenery stor

In [9]:
# 准备数据，在输入网络之前要先调整数据的维数为指定的长度
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    """
    Multi-hot encode integer sequences into a 2-D array.

    Args:
        sequences: sized iterable of integer index sequences, one per sample.
        dimension: width of every output row (default 10000).

    Returns:
        A zero-initialised NumPy array of shape (len(sequences), dimension)
        in which row i is set to 1 at every index contained in sequences[i].
    """
    n_samples = len(sequences)
    encoded = np.zeros((n_samples, dimension))
    for row, token_ids in enumerate(sequences):
        # Fancy indexing sets all listed columns of this row at once;
        # repeated indices simply write the same 1 again.
        encoded[row, token_ids] = 1
    return encoded

# Encode the training and test reviews with the default vector width.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

## 构建网络进行训练

In [14]:
from keras import models
from keras import layers

# Build the model with the Sequential class, which only supports a linear
# stack of layers -- currently the most common network architecture.
model = models.Sequential()
# input_shape=(10000,) describes a single input sample; the batch dimension
# is not part of it.
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# The same network can also be built through the functional API:
#
#     input_tensor = layers.Input(shape=(10000,))
#     x = layers.Dense(16, activation='relu')(input_tensor)
#     x = layers.Dense(16, activation='relu')(x)
#     output_tensor = layers.Dense(1, activation='sigmoid')(x)
#     model = models.Model(inputs=input_tensor, outputs=output_tensor)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [16]:
# Split the data into a validation set and a training set.
# y_train is built here from the labels returned by imdb.load_data so that
# this cell does not depend on a variable no earlier cell defines; the labels
# are converted to a float32 array.
y_train = np.asarray(train_labels).astype('float32')

# The first 10,000 samples are held out for validation; the rest are used for training.
x_val = x_train[:10000]
y_val = y_train[:10000]
partial_x_train = x_train[10000:]
partial_y_train = y_train[10000:]

(25000, 10000)
