# 深度学习用于文本和序列

#### 所用算法
- 循环神经网络
- 一维卷积神经网络
#### 算法应用
- 文档分类和时间序列分类，比如识别文章的主题或书的作者；
- 时间序列对比，比如估测两个文档或两支股票行情的相关程度；
- 序列到序列的学习，比如将英语翻译成法语；
- 情感分析，比如将推文或电影评论的情感划分为正面或负面；
- 时间序列预测，比如根据某地最近的天气数据来预测未来天气。

本章的示例重点讨论两个小任务：一个是 IMDB 数据集的情感分析，这个任务前面介绍过；
另一个是温度预测。但这两个任务中所使用的技术可以应用于上面列出来的所有应用

6.1 处理文本数据

    深度神经网络只能够处理向量，文本**向量化**，是指将文本转换为数值张量的过程。可用方法有：
   - 分割为单词
   - 分割为字符
   - 提取单词或者字符的n-gram，然后将n-gram转化为向量
   - 解法
   这个过程称之为**分词**，分完的单元称之为**标记**
    #### 理解
   - one-hot编码：将单词转化为向量

##### 单词级的one-hot

In [33]:
# 写出samples的单词级矩阵表示，即one-hot表示
import numpy as np

samples = ['The cat sat on the mat.','The dog ate my homework']

# 用split()方法对文本进行分词，但是这里没有对符号进行处理
# 为每一个单词指定一个索引，0并不作为索引存在。
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            # 直接加入字典：key-单词：value-索引
            token_index[word] = len(token_index) + 1
print(token_index)
max_length = 10

# np矩阵
results = np.zeros(shape = (len(samples),      # 2句话
                           max_length,         # 每句话最大长度 10
                           max(token_index.values())+1)) # 每个单词用（最大值+1 = 11）个维度表示


for i, sample in enumerate(samples):
    t = sample.split()
#     print(t)
    print(t[:10])
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)   # 得到这个单词在之前字典里的索引位置
        results[i, j, index] = 1        # 把该索引位置标记为1
        
print(results)

{'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat.': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework': 10}
['The', 'cat', 'sat', 'on', 'the', 'mat.']
['The', 'dog', 'ate', 'my', 'homework']
[[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]

 [[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0

###### 字符级的one-hot编码

In [40]:
# 自己写出samples的字符级one-hot表示
import numpy as np

samples = ['The cat sat on the mat.','The dog ate my homework']

letter = {}
for i, le in enumerate('abcdefghigklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',start = 1):
    letter[le] = i
print(letter)

lett = np.zeros(shape = (len(samples), # 两个句子
               50, # 每个句子包含50个字符，这个值是要比句子中可出现的字符数大的，否则出错
               max(letter.values())+1)) # 每个单词由50个维度表示，其中只有一个维度为1

for i, sample in enumerate(samples):
    for j, le in list(enumerate(sample))[:50]:
        index = letter.get(le)  # 字符在词典中的索引
        lett[i, j, index] = 1
print(lett)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 10, 'h': 8, 'i': 9, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52}
[[[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]]


In [41]:
# 标准的字符级
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

characters = string.printable  # 所有可打印的ASCII字符

token_index = dict(zip(range(1, len(characters) + 1), characters))

max_length = 50

results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))

for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1
print(results)

[[[ 1.  1.  1. ...,  1.  1.  1.]
  [ 1.  1.  1. ...,  1.  1.  1.]
  [ 1.  1.  1. ...,  1.  1.  1.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 1.  1.  1. ...,  1.  1.  1.]
  [ 1.  1.  1. ...,  1.  1.  1.]
  [ 1.  1.  1. ...,  1.  1.  1.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]]


In [32]:
# list特性
a = [1,2,3,4,5,6]
b = list(a)[:10]
print(b)

[1, 2, 3, 4, 5, 6]


### 用keras实现单词级的one-hot编码

In [46]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# 创建一个分词器（tokenizer），设置为只考虑前 1000 个最常见的单词
tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(samples) # 构建单词索引

# 给上面samples中的每一个单词分配一个整数作为索引，例如The:1，cat:2,...
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)

# 
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(one_hot_results)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print(word_index)

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
[[ 0.  1.  1.  1.  1.  1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  1.  1.  1.  1.]]
Found 9 unique tokens.
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}


### 疑问：从这里的输出来看，这并不像是one-hot编码？

### 6.12 使用词嵌入

In [None]:
1. 利用 Embedding 层学习词嵌入

In [None]:
from keras.layers import Embedding

# Embedding 层至少需要两个参数：标记(标记的定义前面有)的个数（这里是 1000，即最大单词索引 +1）和嵌入的维度（这里是 64）
# (samples, sequence_length)
embedding_layer = Embedding(1000, 64)

#### 加载 IMDB 数据，准备用于 Embedding 层

In [49]:
from keras.datasets import imdb
from keras.layers import preprocessing

max_features = 10000  # 常用的10000个单词
maxlen = 20 # 一个评论的最大长度，多余部分截掉

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

ImportError: cannot import name 'preprocessing'

In [48]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

history = model.fit(x_train, y_train,
epochs=10,
batch_size=32,
validation_split=0.2)

NameError: name 'maxlen' is not defined