### 深度学习的中文分词
- 基于word2vec + 神经网络进行中文分词
    - 步骤1：使用sogou语料库建立初始的字向量。
    - 步骤2：读入有标注的训练语料库，处理成keras需要的数据格式。
    - 步骤3：根据训练数据建模，使用neural network
    - 步骤4：读入无标注的检验语料库，用neural network模型进行分词标注
    - 步骤5：检查最终的效果
- 参考资料:[中文分词资源](http://www.52nlp.cn/%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D%E5%85%A5%E9%97%A8%E4%B9%8B%E8%B5%84%E6%BA%90) 
    [中文分词标注法](http://www.52nlp.cn/the-character-based-tagging-method-of-chinese-word-segmentation) [基于word2vec的中文分词](http://blog.csdn.net/itplus/article/details/17122431)

- 步骤1：先用sogou语料库生成中文的单字向量

In [1]:
from __future__ import absolute_import
from __future__ import print_function
import os
import re
import codecs
import itertools
from tqdm import trange
from pickle import dump, load
from pprint import pprint

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt


%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2


# nltk  
import nltk
from nltk.probability import FreqDist 

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Reshape, Flatten ,Dropout
from keras.regularizers import l1,l2
from keras.layers.convolutional import Convolution2D, MaxPooling2D,MaxPooling1D

from utils import *

Using Theano backend.


In [2]:
# Load the corpus through the project helper; the result is used below as a
# mapping (its .keys() and .values() are read).
text_t = load_sogou_reduced()
# Number of entries stored under each key.
flen = list(map(len, text_t.values()))
print(list(text_t.keys()), flen)

[0, 1, 2, 3, 4, 5, 6, 7, 8] [1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990]


In [3]:
# One label per document: each key of text_t is repeated by its entry count.
labels = np.repeat(list(text_t.keys()),flen)
# Flatten the nested per-key lists into one list.
# NOTE(review): `flatten` is not defined in the visible cells - presumably it
# comes from `from utils import *`.
docs = flatten(text_t.values())

In [4]:
# One row per document: its label and its text.
df = pd.DataFrame({'label': labels, 'txt': docs})
df.head()

Unnamed: 0,label,txt
0,0,本报记者陈雪频实习记者唐翔发自上海\r\n 一家刚刚成立两年的网络支付公司，它的目标是...
1,0,证券通：百联股份未来5年有能力保持高速增长\r\n\r\n 深度报告 权威内参...
2,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....
3,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....
4,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....


In [5]:
# Remove every whitespace character from each document.
# Built as a list (not a lazy `map`): on Python 3 a map object can be consumed
# only once, so any second pass over `corpus_doc` would silently see nothing.
corpus_doc = [''.join(doc.split()) for doc in df.txt]
# Project helpers (not defined in the visible cells): turn the documents into
# token sequences, then flatten them into a single token list.
sequences = doc_to_seq(corpus_doc, left=1, right=1)
tokens_flat = flatten(sequences)

In [6]:
# Count how often each token occurs in the flattened corpus.
fdist = FreqDist(tokens_flat)
# Transpose the (token, count) pairs into two parallel tuples.
w, f = zip(*fdist.items())
freqdf = pd.DataFrame({'word': w, 'freq': f})
# Order by descending frequency, then number the rows 0..N-1 in that order so
# that `idx` is the frequency rank of each token.
freqdf = freqdf.sort_values(by='freq', ascending=False)
freqdf['idx'] = np.arange(len(freqdf))
freqdf.head()

Unnamed: 0,freq,word,idx
1325,4327816,�,0
4958,899937,，,1
2866,710057,的,2
1071,417715,。,3
241,227585,一,4


In [7]:
# Checkpoint the DataFrame to disk and read it back.
# Context managers make sure both file handles are flushed and closed; the
# bare open() calls used before left them to the garbage collector.
with open('df.pickle', 'wb') as fh:
    dump(df, fh)
with open('df.pickle', 'rb') as fh:
    df = load(fh)

In [8]:
# Two lookup tables built from the frequency table: token -> rank index and
# rank index -> token.
word_to_num = {word: idx for word, idx in zip(freqdf.word, freqdf.idx)}
num_to_word = {idx: word for idx, word in zip(freqdf.idx, freqdf.word)}
# Both numbers should agree: distinct tokens counted vs. tokens mapped.
print(len(fdist), len(word_to_num))

7116 7116


In [9]:
# word2vec
from gensim.models import word2vec
def trainW2V(corpus, epochs=50, num_features = 100,
             min_word_count=1, num_workers=8,
             context=10, sample=1e-5):
    """Train a gensim Word2Vec model on ``corpus`` and return it.

    The model is also stored in the module-level global ``w2v`` because other
    cells of this notebook read it from there.

    Args:
        corpus: iterable of token sequences. It is copied into a new list, so
            the caller's object is never reordered and one-shot iterables are
            accepted.
        epochs: number of shuffled passes over the corpus.
        num_features: passed to Word2Vec as ``size``.
        min_word_count: passed to Word2Vec as ``min_count``.
        num_workers: passed to Word2Vec as ``workers``.
        context: passed to Word2Vec as ``window``.
        sample: passed to Word2Vec as ``sample``.

    Returns:
        The trained Word2Vec model (the same object as the global ``w2v``).
    """
    global w2v
    # Shallow copy: np.random.shuffle works in place, and shuffling the
    # caller's list would be a surprising side effect.
    corpus = list(corpus)
    w2v = word2vec.Word2Vec(workers=num_workers,
                            sample=sample,
                            size=num_features,
                            min_count=min_word_count,
                            window=context)
    np.random.shuffle(corpus)
    w2v.build_vocab(corpus)
    for _ in trange(epochs):
        # Present the sequences in a fresh random order on every pass.
        np.random.shuffle(corpus)
        w2v.train(corpus)
        # Manual schedule: shrink the learning rate by 10% after each pass and
        # set min_alpha to the same value (presumably so gensim keeps the rate
        # fixed within a pass - confirm against the installed gensim version).
        w2v.alpha *= 0.9
        w2v.min_alpha = w2v.alpha
    print("Done.")
    return w2v

In [10]:
# Train the vectors on `sequences` for 5 passes; trainW2V leaves the model in
# the global `w2v`.
trainW2V(sequences, epochs=5)

100%|██████████| 5/5 [03:00<00:00, 36.56s/it]


Done.


In [11]:
# Persist the trained model twice: once through `save` and once through
# `save_word2vec_format`.
w2v.save('sogou_vectors.bin')
w2v.save_word2vec_format('sogou_vectors.txt')

In [12]:
# Quick quality probe of the vectors: query `most_similar` with two
# "positive" characters and one "negative" character, and print the neighbours.
print('A to B is C to what?')
pprint(w2v.most_similar(positive=['新', '男'], negative=['旧']))
pprint(w2v.most_similar(positive=['早', '左'], negative=['晚']))

A to B is C to what?
[('女', 0.4913490116596222),
 ('性', 0.43872058391571045),
 ('士', 0.3781505227088928),
 ('技', 0.35824400186538696),
 ('魅', 0.35579732060432434),
 ('仪', 0.349954217672348),
 ('晓', 0.34426361322402954),
 ('创', 0.34095433354377747),
 ('视', 0.3369499742984772),
 ('伴', 0.33621135354042053)]
[('右', 0.6779191493988037),
 ('螯', 0.37705448269844055),
 ('侧', 0.3653992712497711),
 ('翼', 0.3480273187160492),
 ('初', 0.34055307507514954),
 ('ㄍ', 0.33485740423202515),
 ('岁', 0.33165547251701355),
 ('始', 0.3298236131668091),
 ('偻', 0.3256300985813141),
 ('铬', 0.32533490657806396)]


In [13]:
# Embedding lookup table: entry i is the trained vector of the token whose
# index is i, so rows follow the frequency ranking stored in freqdf.
init_wv = [w2v[num_to_word[i]] for i in range(freqdf.shape[0])]

In [14]:
# '<UNK>' stands for characters not in the vocabulary and a single space is the
# padding token for both ends of a sentence; each gets its own index and an
# extra embedding row.
char_num = len(init_wv)
for offset, token in enumerate((u'<UNK>', u' ')):
    num_to_word[char_num + offset] = token
    word_to_num[token] = char_num + offset

# Row order must match the indices above: a random vector for '<UNK>' first,
# then an all-zero vector for the padding token.
init_wv.extend([np.random.randn(100,), np.zeros(100,)])

In [15]:
from pickle import dump,load
# Checkpoint the embedding rows to disk and read them back.
# Context managers close both file handles deterministically; the bare open()
# calls used before never closed them.
with open('init_wv.pickle', 'wb') as fh:
    dump(init_wv, fh)
with open('init_wv.pickle', 'rb') as fh:
    init_wv = load(fh)

- 步骤2：训练数据读取和转换

In [16]:
# Root directory of the text data sets, relative to this notebook.
DATA_ROOT = '../../../data/text'
# Input: the MSR training file. Output: where its converted copy carrying the
# four character tags S / B / M / E is written.
train_input_file = '%s/icwb2-data/training/msr_training.utf8' % DATA_ROOT
train_output_file = '%s/icwb2-data/training/msr_training.tagging.utf8' % DATA_ROOT

In [17]:
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
# Project helper called with the training file and the tagged-output path.
# NOTE(review): presumably it writes the tagged text to train_output_file -
# confirm in utils.
tag_character_BMES(train_input_file, train_output_file)

In [18]:
# Separate the characters from their tags.
# The file is UTF-8, so the encoding is given explicitly instead of relying on
# the platform default (which breaks on non-UTF-8 locales).
with open(train_output_file, encoding='utf-8') as f:
    doc = f.readlines()
# Each whitespace-separated item is read as: first character = the character
# itself, last character = its tag.
train_sentences = [''.join(w[0] for w in line.split()) for line in doc]
train_tags = [w[-1] for line in doc for w in line.split()]

In [19]:
# Takes a sequence of characters and returns lists of numeric indices;
# shown here for the first training sentence with a window of 7.
sent_to_windows(train_sentences[0], word_to_num, window=7)

array([[ 303,  303,  303,   19,   11,   52,  223],
       [ 303,  303,   19,   11,   52,  223,   82],
       [ 303,   19,   11,   52,  223,   82,   31],
       [  19,   11,   52,  223,   82,   31,  275],
       [  11,   52,  223,   82,   31,  275,    5],
       [  52,  223,   82,   31,  275,    5,    4],
       [ 223,   82,   31,  275,    5,    4,   85],
       [  82,   31,  275,    5,    4,   85,  192],
       [  31,  275,    5,    4,   85,  192,  220],
       [ 275,    5,    4,   85,  192,  220,  412],
       [   5,    4,   85,  192,  220,  412,    1],
       [   4,   85,  192,  220,  412,    1,   79],
       [  85,  192,  220,  412,    1,   79,  709],
       [ 192,  220,  412,    1,   79,  709,  107],
       [ 220,  412,    1,   79,  709,  107,  666],
       [ 412,    1,   79,  709,  107,  666,    2],
       [   1,   79,  709,  107,  666,    2,  140],
       [  79,  709,  107,  666,    2,  140,  432],
       [ 709,  107,  666,    2,  140,  432,  224],
       [ 107,  666,    2,  140,

In [20]:
# Convert every training sentence into index windows and concatenate the
# windows of all sentences into one flat list.
train_windows = list(itertools.chain.from_iterable(
    sent_to_windows(line, word_to_num, window=7) for line in train_sentences))

In [21]:
# Sanity check: print the number of windows, the number of tags, and the set of
# distinct tags (the two counts are equal in the recorded output).
print(len(train_windows), len(train_tags), set(train_tags))

4050469 4050469 {'M', 'E', 'B', 'S'}


In [22]:
# Checkpoint the training windows to disk and read them back.
# Context managers close both file handles deterministically; the bare open()
# calls used before never closed them.
with open('train_windows.pickle', 'wb') as fh:
    dump(train_windows, fh)
with open('train_windows.pickle', 'rb') as fh:
    train_windows = load(fh)

- 步骤3：
    - Data preprocessing
    - 训练模型

In [23]:
# Build the two tag dictionaries (index -> tag and tag -> index).
num_to_tag, tag_to_num = list_to_mappings(train_tags)
for mapping in (tag_to_num, num_to_tag):
    print(mapping)
# Encode the target variable as numeric indices.
train_labels = list(map(tag_to_num.__getitem__, train_tags))

nb_classes = len(tag_to_num)
print('number of classes:', nb_classes)

{'E': 1, 'B': 0, 'S': 3, 'M': 2}
{0: 'B', 1: 'E', 2: 'M', 3: 'S'}
number of classes: 4


In [24]:
# split and convert dataset
# `sklearn.cross_validation` was deprecated and later removed from
# scikit-learn; `model_selection` is its replacement. The fallback keeps old
# installations working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# 90% of the windows for training, 10% held out; fixed seed for repeatability.
train_X, val_X, train_y, val_y = train_test_split(
    np.array(train_windows), np.array(train_labels) , train_size=0.9, random_state=1)

# One-hot encode the integer labels for the softmax output.
Y_train = np_utils.to_categorical(train_y, nb_classes)
Y_val = np_utils.to_categorical(val_y, nb_classes)
print(train_X.shape, Y_train.shape, 'train sequences')
print(val_X.shape, Y_val.shape, 'test sequences')

(3645422, 7) (3645422, 4) train sequences
(405047, 7) (405047, 4) test sequences


In [25]:
# init variables and hyperparameters
# The embedding rows as a single array, wrapped in a list because it is passed
# as `weights=` to the Embedding layer.
init_weight = [np.array(init_wv)]
batch_size = 256
vocab_size = init_weight[0].shape[0] # vocabulary size (number of embedding rows)

In [26]:
# One-hidden-layer network over a 7-character window:
# embedding -> flatten -> dense -> dropout -> ReLU -> dense -> softmax (4 classes).
# NOTE(review): the original note read "input size = 700, hidden size = 100",
# but the first Dense layer below is created with 700 units, and the
# `input_dim=100` arguments do not match the flattened 7 * 100 embedding
# output. Confirm which sizes were intended before changing either.
# Training updates the network weights and the character vectors together.
print('Build model...')
model = Sequential()
# Embedding initialisation: input dimension is the vocabulary size |V|,
# output dimension is the vector size 100.
# Starting from the pretrained vectors improves accuracy (author's note).
model.add(Embedding(vocab_size, 100, weights=init_weight, input_length=7)) 
model.add(Flatten())
model.add(Dense(700, input_dim=100)) # affine layer( fully connected layer)
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(nb_classes, input_dim=100)) # affine layer( fully connected layer)
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Build model...


In [27]:
# Fit on train_X / Y_train; `validation_split=0.1` sets aside a tenth of those
# samples for validation (see the sample counts in the recorded output).
# NOTE(review): `EarlyStopping` is not imported in the visible cells -
# presumably it arrives through `from utils import *`; confirm.
# NOTE(review): `nb_epoch=1` here, yet the recorded output lists
# "Epoch 1/5" .. "Epoch 5/5" - the output may come from an earlier run.
print("Train...")
earlystop = EarlyStopping(patience=0, verbose=1)
result = model.fit(train_X, Y_train, batch_size=batch_size, nb_epoch=1, 
          validation_split=0.1,callbacks=[earlystop])

Train...
Train on 3280879 samples, validate on 364543 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
# Evaluate on the held-out split (val_X / Y_val). The recorded output is a
# two-element list; the model was compiled with a loss and metrics=['accuracy'].
# NOTE(review): `earlystop.model` is presumably the same object as `model`.
score = earlystop.model.evaluate(val_X, Y_val, batch_size=batch_size)
print('Test score:', score)

Test score: [0.15990888450063415, 0.94513229329309956]


In [29]:
# Accuracy on the held-out split is about 0.94 (see the recorded output).
classes = earlystop.model.predict_classes(val_X, batch_size=batch_size)
acc = np_utils.accuracy(classes, val_y) # predicted class ids vs. integer labels
print('Test accuracy:', acc)

Test accuracy: 0.945132293289


In [30]:
# Checkpoint the trained model to disk and read it back.
# Context managers close both file handles deterministically; the bare open()
# calls used before never closed them.
# NOTE(review): pickle is kept to preserve the existing file format; check
# whether the installed Keras version supports pickling models reliably.
with open('model.pickle', 'wb') as fh:
    dump(model, fh)
with open('model.pickle', 'rb') as fh:
    model = load(fh)

- 步骤4：用test文本进行预测，评估效果

In [31]:
# Hand-written sample text (mixed Chinese and Latin characters) for a quick
# qualitative check; the backslash joins the two source lines into one string.
temp_txt = u'国家食药监总局发布通知称，酮康唑口服制剂因存在严重肝毒性不良反应，即日起停止生产销售使用。\
今天杭州下大雨。随便给我放一首Linkin Park摇滚音乐。'
# No `window` argument here, so the helper's default is used (the recorded
# output of the next cell shows windows of width 7).
temp_windows = sent_to_windows(temp_txt, word_to_num)

In [32]:
# Infer the tags for the sample text from its windows.
temp = predict_tags(temp_windows, temp_txt, model, tag_to_num, num_to_tag)

(76, 7) 76


In [33]:
test_file = '%s/icwb2-data/testing/msr_test.utf8' % (DATA_ROOT)
# The file is UTF-8, so the encoding is given explicitly instead of relying on
# the platform default (which breaks on non-UTF-8 locales).
with open(test_file, 'r', encoding='utf-8') as f:
    test_sentences = f.readlines()

# Lines keep their trailing newline (readlines does not strip it), and the same
# unstripped lines are joined into `input_sentences`, so the text handed to the
# tagger matches what was windowed.
test_X = []
for line in test_sentences:
    test_windows = sent_to_windows(line, word_to_num)
    test_X.extend(test_windows)
input_sentences = ''.join(test_sentences)

In [34]:
# Tag the whole test set in one call, using the same helper as for the sample
# text above.
test_tags = predict_tags(test_X, input_sentences, model, tag_to_num, num_to_tag)

(184358, 7) 184358


In [35]:
# Despite its name, `test_input_file` is where the predicted tags are written;
# it is the *input* of the un-tagging step, which writes `test_output_file`.
test_input_file = '%s/icwb2-data/testing/msr_test_output.utf8' % (DATA_ROOT)
test_output_file = '%s/icwb2-data/testing/msr_test.split.tag2word.utf8' % (DATA_ROOT)
with open(test_input_file, 'wb') as f:
    # Format the predictions as text and store them UTF-8 encoded.
    f.write(('%s' % (test_tags)).encode('utf-8'))

# Project helper: convert the tagged file back into segmented text.
untag_character_BMES(test_input_file, test_output_file)

- perl脚本检验的F值为0.929

In [36]:
! ../../../data/text/icwb2-data/scripts/score ../../../data/text/icwb2-data/gold/msr_training_words.utf8 ../../../data/text/icwb2-data/gold/msr_test_gold.utf8 ../../../data/text/icwb2-data/testing/msr_test.split.tag2word.utf8 > deep.score