## Keras实现文本预处理

In [1]:
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Two sample sentences; the first uses '/' where the second uses a space.
texts = ['some/thing to eat', 'some thing to drink']
text1, text2 = texts
# Show text1 with every '/' turned into a space.
print(text1.replace('/', ' '))


some thing to eat


In [4]:
# Build a vocabulary from `texts`, then encode each text as a list of word ids.
# num_words is None or an int: per the Keras docs it caps the vocabulary by
# word frequency (only the num_words - 1 most frequent words are kept when
# encoding); None applies no cap.
tokenizer = Tokenizer(num_words=None)  # None: keep every word
tokenizer.fit_on_texts(texts)
# The printed result shows 'some/thing' encoded as two ids, i.e. the '/' was
# treated as a word separator.
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)

[[1, 2, 3, 4], [1, 2, 3, 5]]


In [6]:
# Inspect the statistics collected by fit_on_texts. The trailing comments give
# the expected content; key order of the plain dicts is not significant.
print(tokenizer.word_counts)  # word -> total occurrences: OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
print(tokenizer.word_index)  # word -> integer id: {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)  # word -> number of texts containing it: {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)  # word id -> number of texts containing it: {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}

OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
{'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
{'to': 2, 'thing': 2, 'eat': 1, 'some': 2, 'drink': 1}
{3: 2, 2: 2, 4: 1, 1: 2, 5: 1}


In [12]:
from keras.preprocessing.sequence import pad_sequences

word_index = tokenizer.word_index  # word -> id mapping; not used again in this cell
# Bring every encoded text to length 10. With the arguments used here the
# zeros are inserted in front of the ids, as the printed array shows.
data = pad_sequences(sequences, maxlen=10)
print(data)

[[0 0 0 0 0 0 1 2 3 4]
 [0 0 0 0 0 0 1 2 3 5]]


In [13]:
# Self-contained example: tokenizing pre-segmented (space-separated) Chinese
# text and padding integer sequences.
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

text1 = "今天 北京 下 暴雨 了"
text2 = "我 今天 打车 回家"
texts = [text1, text2]

print(text_to_word_sequence(text1))  # split the text into words on spaces
# ['今天', '北京', '下', '暴雨', '了']

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.document_count) # number of texts that were fitted
# 2
print(tokenizer.word_counts) # word -> total occurrences (OrderedDict in first-seen order, not sorted by frequency)
# OrderedDict([('今天', 2), ('北京', 1), ('下', 1), ('暴雨', 1), ('了', 1), ('我', 1), ('打车', 1), ('回家', 1)])
print(tokenizer.word_docs) # word -> number of texts containing it (key order is not significant)
# {'今天': 2, '北京': 1, '了': 1, '下': 1, '暴雨': 1, '我': 1, '打车': 1, '回家': 1}
print(tokenizer.word_index) # word -> unique integer id, starting at 1 (most frequent word first)
# {'今天': 1, '北京': 2, '下': 3, '暴雨': 4, '了': 5, '我': 6, '打车': 7, '回家': 8}
print(tokenizer.index_docs) # word id -> number of texts containing it (key order is not significant)
# {1: 2, 2: 1, 5: 1, 3: 1, 4: 1, 6: 1, 7: 1, 8: 1}

# Pad each sequence with zeros up to length maxlen
print(pad_sequences([[1,2,3],[4,5,6]],maxlen=10,padding='pre')) # zeros go before the sequence
# [[0 0 0 0 0 0 0 1 2 3]
#  [0 0 0 0 0 0 0 4 5 6]]
print(pad_sequences([[1,2,3],[4,5,6]],maxlen=10,padding='post')) # zeros go after the sequence
# [[1 2 3 0 0 0 0 0 0 0]
#  [4 5 6 0 0 0 0 0 0 0]]

['今天', '北京', '下', '暴雨', '了']
2
OrderedDict([('今天', 2), ('北京', 1), ('下', 1), ('暴雨', 1), ('了', 1), ('我', 1), ('打车', 1), ('回家', 1)])
{'今天': 2, '北京': 1, '了': 1, '下': 1, '暴雨': 1, '我': 1, '打车': 1, '回家': 1}
{'今天': 1, '北京': 2, '下': 3, '暴雨': 4, '了': 5, '我': 6, '打车': 7, '回家': 8}
{1: 2, 2: 1, 5: 1, 3: 1, 4: 1, 6: 1, 7: 1, 8: 1}
[[0 0 0 0 0 0 0 1 2 3]
 [0 0 0 0 0 0 0 4 5 6]]
[[1 2 3 0 0 0 0 0 0 0]
 [4 5 6 0 0 0 0 0 0 0]]


[Keras文本预处理相关函数简介](https://blog.csdn.net/edogawachia/article/details/79394780)

[Keras---text.Tokenizer：文本与序列预处理](https://blog.csdn.net/lovebyz/article/details/77712003)