# 二分类问题

情感分析，可应用场景：趋势预测

## 1. 导入包，并打印版本信息

In [None]:
import tensorflow as tf
import numpy as np

# Short module-level aliases for the tf.keras submodules used by the cells
# below, so later code can write e.g. `layers.Dense` instead of the full path.
imdb = tf.keras.datasets.imdb
models = tf.keras.models
layers = tf.keras.layers
activations = tf.keras.activations
optimizers = tf.keras.optimizers
losses = tf.keras.losses

# Report the versions of both libraries in use. Each label must be paired with
# its own module: the NumPy line reads np.__version__, not tf.__version__.
print('tensorflow:' + tf.__version__)
print('numpy:' + np.__version__)

## 2. 准备样本数据
TensorFlow 框架自带一些示例数据集，这里使用其中的 IMDB 影评数据集。

In [None]:
# Load the IMDB dataset shipped with tf.keras as (data, labels) pairs for the
# training and test splits. num_words=10000 restricts the vocabulary to the
# 10,000 most frequent words (see the Keras IMDB dataset documentation).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000,
)

## 3. 看一下数据形状和内容

In [None]:
# Shapes of the training inputs and labels.
print(f'train_data.shape: {train_data.shape}')
print(f'train_labels.shape: {train_labels.shape}')

# Dump the raw contents; each dump is preceded by a banner naming what follows.
for banner, value in (
    ('\n--------------------------------------------------train_data--------------------------------------------------', train_data),
    ('\n--------------------------------------------------train_data[0]--------------------------------------------------', train_data[0]),
    ('\n--------------------------------------------------train_labels--------------------------------------------------', train_labels),
):
    print(banner)
    print(value)


## 4. 看一下前10条评论的可读文本内容

In [None]:
# Vocabulary of the dataset as a dict whose items are (word, index) pairs.
word_index = imdb.get_word_index()

# Invert it to index -> word so that encoded reviews can be turned back into
# text. A dict comprehension avoids building an intermediate list of tuples.
reverse_word_index = {index: word for word, index in word_index.items()}


def show_content(content_data):
    """Decode a sequence of word indices into one space-separated string.

    Every index is shifted down by 3 before it is looked up in the
    module-level ``reverse_word_index``. The offset presumably accounts for
    the low indices the Keras IMDB loader reserves for special tokens --
    confirm against the ``imdb.load_data`` documentation. An index with no
    entry after the shift is rendered as ``'?'``.
    """
    return ' '.join(
        reverse_word_index.get(token - 3, '?') for token in content_data
    )

# Print the first ten training reviews as text, numbered from 1, each followed
# by a blank line.
for position in range(10):
    decoded_review = show_content(train_data[position])
    print(f'{position + 1}#: {decoded_review}\n')


## 5. One-Hot（multi-hot）编码函数实现
NOTE: 对于文本处理一般是两种方式`词嵌入`和`One-Hot`；本节实现的是`One-Hot`（multi-hot）编码，而不是词嵌入。`tf.keras.preprocessing.text.Tokenizer`处理类能直接用于生产

[tf.keras.preprocessing.text.Tokenizer文档传送门](https://tensorflow.google.cn/api_docs/python/tf/keras/preprocessing/text/Tokenizer)

In [None]:
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D float array.

    Args:
        sequences: A sized iterable of integer index sequences, one per sample.
        dimension: Width of each encoded row; every index must be valid for
            an axis of this length.

    Returns:
        An array of shape ``(len(sequences), dimension)`` in which row ``r``
        holds 1.0 at every column listed in ``sequences[r]`` and 0.0
        elsewhere. Repeated indices within a sequence still yield a single 1.0.
    """
    sample_count = len(sequences)
    encoded = np.zeros((sample_count, dimension))
    for row, word_ids in enumerate(sequences):
        # One indexed assignment marks all listed columns of this row.
        encoded[row, word_ids] = 1.0
    return encoded

## 6. 对train_data和test_data做One-Hot（multi-hot）编码

In [None]:
# Multi-hot encode both splits. The width is spelled out to make the link to
# the 10,000-word vocabulary requested from imdb.load_data explicit.
x_train = vectorize_sequences(train_data, dimension=10000)
x_test = vectorize_sequences(test_data, dimension=10000)

## 7. 看一下编码后的数据

In [None]:
# Show the shape of the encoded training matrix and its first encoded row.
print(f'x_train.shape: {x_train.shape}')
print(f'x_train[0]: {x_train[0]}')

## 8. 处理一下标注数据
整型转浮点型

In [None]:
# Convert the label arrays to float32 copies for use as training and
# evaluation targets.
y_train = np.asarray(train_labels).astype(np.float32)
y_test = np.asarray(test_labels).astype(np.float32)

## 9. 比对一下处理前与处理后的标注数据

In [None]:
# Print the labels before (train_labels) and after (y_train) the float32 cast
# so the two can be compared. The printed names match the variable names
# (the original output misspelled them as "tarin_labels").
print('train_labels.shape: ' + str(train_labels.shape))
print('train_labels: ' + str(train_labels))
print('y_train.shape: ' + str(y_train.shape))
print('y_train: ' + str(y_train))

## 10. 构建模型

In [None]:
# NOTE (original author's quip, quoting the film "Pegasus"): "a flurry of moves
# as fierce as a tiger -- look closely and you are still standing on the spot."
#
# Fully connected binary classifier: two 16-unit ReLU hidden layers over the
# 10000-wide encoded input, followed by a single sigmoid output unit.
model = models.Sequential([
    layers.Dense(16, activation=activations.relu, input_shape=(10000, )),
    layers.Dense(16, activation=activations.relu),
    layers.Dense(1, activation=activations.sigmoid),
])

## 11. 编译模型

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss (matching
# the single sigmoid output), and accuracy as the reported metric.
# The step size is passed as `learning_rate`; the older `lr` keyword is
# deprecated and is not accepted by newer Keras optimizer implementations.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=['accuracy'])

## 12. 看一下模型概况

In [None]:
# Print an overview of the model built and compiled above.
model.summary()

## 13. 训练模型
NOTE：从训练精度和验证精度来判断过拟合和欠拟合

In [None]:
# Train for 10 epochs in batches of 512 samples. validation_split=0.2 sets
# aside 20% of the training data for validation, so training and validation
# accuracy can be compared epoch by epoch.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=512,
    validation_split=0.2,
)

## 14. 使用测试集评估模型

In [None]:
# Evaluate the trained model on the held-out test split and print what
# evaluate() returns (presumably the loss followed by the accuracy metric
# configured in compile() -- confirm against the Keras Model.evaluate docs).
results = model.evaluate(x=x_test, y=y_test)
print(results)

## 15. 使用模型预测结果

In [None]:
# Take the first 20 test samples together with their true labels.
x_samples = x_test[:20]
y_samples = y_test[:20]

# Model outputs for those samples. The variable keeps its original spelling so
# that any other cell referring to it continues to work; only the printed
# label is corrected from "predications" to "predictions".
predications = model.predict(x_samples)

print('predictions: ')
print(predications)
print('actual: ')
print(y_samples)