<a href="https://colab.research.google.com/github/CHIEH-YU/ancient_chinese_auto_pos/blob/master/hw2_105101059_ipynb_txt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chinese Word Segmentation

The following performance : recall: 0.8859610491289881 precision: 0.945555671610612 f1: 0.9147888035569928

## import module 

In [0]:
import math
import gc
import numpy as np
import time
import keras
from keras.layers import *
from keras.models import *
from keras_contrib.layers import CRF
from keras.preprocessing import sequence
from keras import backend as K
import codecs
from keras.utils import plot_model
from keras.utils import np_utils


Using TensorFlow backend.


## Prepare Training Data

In [0]:
raw_train = []
raw_test = []
with codecs.open("data/as_training.utf8", "r",encoding = 'utf-8') as fin:
    for line in fin:
        raw_train.append(line.strip().split("　"))   # It is a full white space

with codecs.open("data/as_testing_gold.utf8","r",encoding = 'utf-8') as fin:
    for line in fin:
        raw_test.append(line.strip().split("　"))   # It is a full white space

print("Number of sentences in the training data: %d" % len(raw_train))
print("Number of sentences in the test data: %d" % len(raw_test)) 

Number of sentences in the training data: 708953
Number of sentences in the test data: 14432


## Convert a list of words to a sequence of tags

In [0]:
def words_to_tags(words):
    tags = []
    for word in words:
        if len(word) == 1:
            tags.append('S')
        else:
            for i in range(len(word)):
                if i == 0:
                    tags.append('S')
                elif i == len(word) - 1:
                    tags.append('I')
                else:
                    tags.append('I')
    return tags

In [0]:
tag2idx = {"PAD":0,"S":1, "I":2}
train_X = []
train_Y = []

test_X = []
test_Y = []

x_train = []
y_train = []
x_test = []
y_test = []

for sent in raw_train:
    x_train.append(list("".join(sent)))  # Make the unsegmented sentence as a sequence of characters
    y_train.append(words_to_tags(sent))
    
for sent in raw_test:
    x_test.append(list("".join(sent)))  # Make the unsegmented sentence
    y_test.append(words_to_tags(sent))


    # to collect the words appear in training data 
word_list = []
for this_x in x_train:
    word_list.extend(this_x)

# to know the words appear in training data
# if train data is huge, collections should be considerd to include high-frequency words only
word_set = set(word_list)

# build dictionary
word2idx = {"<PAD>":0, "<UNK>":1}
idx2word = {0:"<PAD>", 1:"<UNK>"}
for word in word_set:
    word2idx.update({word:len(word2idx)})
    idx2word.update({len(idx2word):word})

In [0]:
def encode(list_string):
    encoded = []
    for word in list_string:
        try:
            encoded.append(word2idx[word])
        except:
            encoded.append(word2idx["<UNK>"])
    return encoded

def encode_tag(list_string):
    encoded = []
    for word in list_string:
        encoded.append(tag2idx[word])
    return encoded

def decode(list_idx):
    decoded = []
    for idx in list_idx:
        decoded.append(idx2word[idx])
    return decoded

def numplized_data(data, maxlen):
    output = np.zeros((len(data), maxlen)).astype(np.int32)
    for idx, this in enumerate(data):
        if len(this) <= maxlen:
            output[idx,:len(this)] = this
        else:
            output[idx] = this[:maxlen]
    return output


In [0]:
np_x_train = numplized_data(encoded_x_train, maxlen)
_np_y_train = numplized_data(encoded_y_train, maxlen)
np_y_train = np_utils.to_categorical(_np_y_train, class_label_count)


model=Bilstm_CNN_Crf(maxlen,char_value_dict_len,class_label_count)
model.summary()
#train
model.fit(np_x_train,np_y_train,batch_size=1028,epochs=2,verbose=1)
#model.load_weights('train_model.hdf5')
model.save_weights('train_model.hdf5')



encoded_x_train = []
encoded_y_train = []
for x in x_train:
    encoded_x_train.append(encode(x))
for y in y_train:
    encoded_y_train.append(encode_tag(y))


maxlen = 50
char_value_dict_len = len(word2idx)
class_label_count = 3





## Create a CRF model for word segmentation 

In [0]:
def Bilstm_CNN_Crf(maxlen,char_value_dict_len,class_label_count,is_train=False):
	word_input=Input(shape=(maxlen,),dtype='int32',name='word_input')
	if is_train:
		word_emb=Embedding(char_value_dict_len+2,output_dim=64,\
                    input_length=maxlen,weights=[embedding_weights],\
                    name='word_emb')(word_input)
	else:
		word_emb=Embedding(char_value_dict_len+2,output_dim=64,\
                    input_length=maxlen,\
                    name='word_emb')(word_input)	
	# bilstm
	bilstm=Bidirectional(LSTM(256,return_sequences=True))(word_emb)
	bilstm_d=Dropout(0.1)(bilstm)

	# cnn
	half_window_size=2
	padding_layer=ZeroPadding1D(padding=half_window_size)(word_emb)
	conv=Conv1D(nb_filter=50,filter_length=2*half_window_size+1,\
			padding='valid')(padding_layer)
	conv_d=Dropout(0.1)(conv)
	dense_conv=TimeDistributed(Dense(50))(conv_d)

	# merge
	rnn_cnn_merge=Concatenate(axis=2)([bilstm_d,dense_conv])
	dense=TimeDistributed(Dense(class_label_count))(rnn_cnn_merge)

	# crf
	crf=CRF(class_label_count,sparse_target=False)
	crf_output=crf(dense)

	# build model
	model=Model(input=[word_input],output=[crf_output])

	model.compile(loss=crf.loss_function,optimizer='adam',metrics=[crf.accuracy])

	# model.summary()

	return model



In [0]:
np_x_train = numplized_data(encoded_x_train, maxlen)
_np_y_train = numplized_data(encoded_y_train, maxlen)
np_y_train = np_utils.to_categorical(_np_y_train, class_label_count)


model=Bilstm_CNN_Crf(maxlen,char_value_dict_len,class_label_count)
model.summary()
#train
model.fit(np_x_train,np_y_train,batch_size=1028,epochs=2,verbose=1)
#model.load_weights('train_model.hdf5')
model.save_weights('train_model.hdf5')





Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 50)           0                                            
__________________________________________________________________________________________________
word_emb (Embedding)            (None, 50, 64)       391616      word_input[0][0]                 
__________________________________________________________________________________________________
zero_padding1d_1 (ZeroPadding1D (None, 54, 64)       0           word_emb[0][0]                   
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 50, 50)       16050       zero_padding1d_1[0][0]           
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/6
Epoch 2/6
Epoch 3/6


KeyboardInterrupt: 

## Evaluation

In [0]:
def compare(actual_toks, pred_toks):
    i = 0
    j = 0
    p = 0
    q = 0
    tp = 0
    fp = 0
    while i < len(actual_toks) and j < len(pred_toks):
        if p == q:
            if actual_toks[i] == pred_toks[j]:
                tp += 1
            else:
                fp += 1
            p += len(actual_toks[i])
            q += len(pred_toks[j])
            i += 1
            j += 1
        elif p < q:
            p += len(actual_toks[i])
            i += 1
        else:
            fp += 1
            q += len(pred_toks[j])
            j += 1
    return tp, fp, len(actual_toks)
    
def score(actual_sents, pred_sents):
    tp = 0
    fp = 0
    total = 0
    for actual_toks, pred_toks in zip(actual_sents, pred_sents):
        tp_, fp_, total_ = compare(actual_toks, pred_toks)
        tp += tp_
        fp += fp_
        total += total_
    recall = float(tp) / total
    precision = float(tp) / (tp + fp)
    f1 = 2.0 * recall * precision / (recall + precision)
    print('recall:',recall,'precision:',precision,'f1:',f1)
    return recall, precision, f1        

## Testing for sample sentence

In [0]:
# inference
text='法國總統馬克宏已到現場勘災'
encoded_text = encode(text)
np_text = numplized_data([encoded_text], maxlen)

_output_tag = model.predict(np_text)
output_tag = np.argmax(_output_tag,axis=2)[0]

print(text)

parsed = ""
for idx, char in enumerate(text):
    if output_tag[idx] == 1: 
        parsed += ' ' + char
    if output_tag[idx] == 2:
        parsed += char
    if output_tag[idx] == 0:
        break
print(parsed)



法國總統馬克宏已到現場勘災
 法國 總統 馬克宏 已 到 現場 勘 災


## testing for testing data

In [0]:
pred = []
actual = []
for idx,sent in enumerate(raw_test):
    parsed1=''
    encoded_text = encode(sent)
    np_text = numplized_data([encoded_text], 50)
    _output_tag = model.predict(np_text)
    output_tag = np.argmax(_output_tag,axis=2)[0]
    for idx1, char in enumerate(sent):
        if idx1 >=50:
            break
        if output_tag[idx1] == 1:  
            parsed1 += ' ' + char
        if output_tag[idx1] == 2:
            parsed1 += char
        if output_tag[idx1] == 0:
            break
    parsed1 = parsed1.split(' ')[1:]
    pred.append(parsed1)
    actual.append(sent)

print(score(actual, pred))

recall: 0.8859610491289881 precision: 0.945555671610612 f1: 0.9147888035569928
(0.8859610491289881, 0.945555671610612, 0.9147888035569928)
