In [1]:
import os
import sys
import numpy as np
import pandas as pd

import tensorflow as tf
import ckiptagger

# 讀取訓練用資料

In [2]:
file_path= os.getcwd() + '\\raw_data\\train_2.txt'

In [3]:
def loadInputFile(path):
    trainingset = list()  # store trainingset [content,content,...]
    position = list()  # store position [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # store mentions[mention] = Type
    with open(file_path, 'r', encoding='utf8') as f:
        file_text=f.read().encode('utf-8').decode('utf-8-sig')
    datas=file_text.split('\n\n--------------------\n\n')[:-1]
    for data in datas:
        data=data.split('\n')
        content=data[0]
        trainingset.append(content)
        annotations=data[1:]
        for annot in annotations[1:]:
            annot=annot.split('\t') #annot= article_id, start_pos, end_pos, entity_text, entity_type
            position.extend(annot)
            mentions[annot[3]]=annot[4]
    
    return trainingset, position, mentions

# 處理資料的label

In [4]:
def LabelFile(path):
    if (os.path.isfile("mylabel.txt")):
        os.remove("mylabel.txt")
    with open("mylabel.txt", "w", encoding='utf8') as fw:
        with open(path, 'r', encoding='utf8') as f:
            fw.write('article_id,start_position,end_position,entity_text,entity_type\n')
            label = []
            for i in range(len(trainingset)):
                f.readline()  # skip the content
                f.readline()  # first line (lable title)
                if i != 0:
                    f.readline()
                while True:
                    label_text = f.readline()
                    if '--' not in label_text:
                        if label_text != '\n':
                            label_text = ','.join(label_text.split()) + '\n'
                            fw.write(label_text)
                    else:
                        break

# 使用pretrained的模型ckip來標記詞性作為feature
[需要使用的模型](http://ckip.iis.sinica.edu.tw/data/ckiptagger/data.zip)
ckipath為解壓縮後資料夾的path

In [5]:
from ckiptagger import WS, POS, NER
ckipath = '.\\data\\data\\data'
ws = WS(ckipath)
pos = POS(ckipath)
ner = NER(ckipath)

In [6]:
 def WritePOSTagFile(article_set, file_name = 'POSTag.txt'):
    POST_tag = list()

    if (os.path.isfile(".\\processed_data\\" + file_name)):
        os.remove(".\\processed_data\\" + file_name)
    with open(".\\processed_data\\" + file_name, "w", encoding='utf8') as f:
        f.write('article_id,entity_text,POS,length\n')
        iter = 0
        for article in article_set:    
            word_s = ws([article],
                        sentence_segmentation=True,
                        segment_delimiter_set={'?', '？', '!', '！', '。', ',',   
                                            '，', ';', ':', '、'})
            word_p = pos(word_s)

            for idx_word_s in range(len(word_s[0])):        # each word in article
                for character in word_s[0][idx_word_s]:     # each single character in words
                    f.write(str(iter)+','+character+','+word_p[0][idx_word_s]+str(len(word_s[0][idx_word_s]))+'\n')
            iter = iter + 1

            if iter%10 == 0:
                print('Total complete articles:', iter)

# 將資料標記為CRF的label形式(同baseline)

In [7]:
def CRFFormatData(trainingset, position, path):
    if (os.path.isfile(path)):
        os.remove(path)
    outputfile = open(path, 'a', encoding= 'utf-8')

    # output file lines
    count = 0 # annotation counts in each content
    tagged = list()
    for article_id in range(len(trainingset)):
        trainingset_split = list(trainingset[article_id])
        while '' or ' ' in trainingset_split:
            if '' in trainingset_split:
                trainingset_split.remove('')
            else:
                trainingset_split.remove(' ')
        start_tmp = 0
        for position_idx in range(0,len(position),5):
            if int(position[position_idx]) == article_id:
                count += 1
                if count == 1:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos == 0:
                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                            
                            output_str = token[token_idx] + ' ' + label + '\n'
                            outputfile.write(output_str)

                    else:
                        token = list(trainingset[article_id][0:start_pos])
                        whole_token = trainingset[article_id][0:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            
                            output_str = token[token_idx] + ' ' + 'O' + '\n'
                            outputfile.write(output_str)

                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token[0] == '':
                                if token_idx == 1:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type
                            else:
                                if token_idx == 0:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type

                            output_str = token[token_idx] + ' ' + label + '\n'
                            outputfile.write(output_str)

                    start_tmp = end_pos
                else:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos<start_tmp:
                        continue
                    else:
                        token = list(trainingset[article_id][start_tmp:start_pos])
                        whole_token = trainingset[article_id][start_tmp:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            output_str = token[token_idx] + ' ' + 'O' + '\n'
                            outputfile.write(output_str)

                    token = list(trainingset[article_id][start_pos:end_pos])
                    whole_token = trainingset[article_id][start_pos:end_pos]
                    for token_idx in range(len(token)):
                        if len(token[token_idx].replace(' ','')) == 0:
                            continue
                        # BIO states
                        if token[0] == '':
                            if token_idx == 1:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        else:
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        
                        output_str = token[token_idx] + ' ' + label + '\n'
                        outputfile.write(output_str)
                    start_tmp = end_pos

        token = list(trainingset[article_id][start_tmp:])
        whole_token = trainingset[article_id][start_tmp:]
        for token_idx in range(len(token)):
            if len(token[token_idx].replace(' ','')) == 0:
                continue

            
            output_str = token[token_idx] + ' ' + 'O' + '\n'
            outputfile.write(output_str)

        count = 0
    
        output_str = '\n'
        outputfile.write(output_str)
        ID = trainingset[article_id]

        if article_id%10 == 0:
            print('Total complete articles:', iter)

    # close output file
    outputfile.close()

In [8]:
trainingset, position, mentions=loadInputFile(file_path)

In [9]:
data_path='data/sample.data'
CRFFormatData(trainingset, position, data_path)

Total complete articles: 0
Total complete articles: 10
Total complete articles: 20
Total complete articles: 30
Total complete articles: 40
Total complete articles: 50
Total complete articles: 60
Total complete articles: 70
Total complete articles: 80
Total complete articles: 90
Total complete articles: 100
Total complete articles: 110
Total complete articles: 120
Total complete articles: 130
Total complete articles: 140
Total complete articles: 150
Total complete articles: 160
Total complete articles: 170
Total complete articles: 180
Total complete articles: 190


In [10]:
WritePOSTagFile(trainingset, 'train2_POSTag.txt')

NameError: name 'article_id' is not defined

# trainingset示範

In [11]:
for i in range(5):
    print('{0}.'.format(i) + trainingset[i][:30] + '......' + trainingset[i][(len(trainingset[i])-20):])

0.醫師：啊回去還好嗎？民眾：欸，還是虛虛的，但。醫師：欸，真的......一百五十塊一點點車馬費。民眾：這麼好喔。
1.醫師：阿阿嬤她好像說有，前天又有在發燒喔。家屬：對阿都，有時......。醫師：好，ok。家屬：謝謝。醫師：好。
2.民眾：也有點不舒服，可是就是腰這邊有也一點點痛，我脫起來我想......：那樣這邊就可以了。醫師：那個是回診單。
3.醫師：謝謝你這樣幫忙他們這樣，那最近還好嗎？民眾：就是因為不......會拿過去，啊你先後面先稍等。醫師：ＯＫ。
4.醫師：那個，吃藥還Ok嗎？民眾：OK。醫師：沒什麼問題？民眾......。民眾：恩恩。醫師：假如方便的話也可以。


In [12]:
for i in range(5):
    print(position[ (i*5):(i*5+6) ])

['0', '69', '71', '前天', 'time', '0']
['0', '75', '77', '前天', 'time', '0']
['0', '738', '740', '85', 'med_exam', '0']
['0', '741', '744', '102', 'med_exam', '0']
['0', '809', '811', '前年', 'time', '0']


In [13]:
for i in range(5):
    print(position[(i*5+3)],'|', mentions[position[(i*5+3)]])

前天 | time
前天 | time
85 | med_exam
102 | med_exam
前年 | time


# ckip示範

In [14]:
word_s = ws([trainingset[0]],
            sentence_segmentation=True,
            segment_delimiter_set={'?', '？', '!', '！', '。', ',',   
                                   '，', ';', ':', '、'})
print(word_s[0][:10])

['醫師', '：', '啊', '回去', '還好', '嗎', '？', '民眾', '：', '欸']


## [中研院平衡與料庫詞類標記集](http://ckipsvr.iis.sinica.edu.tw/papers/category_list.pdf)

Na: 普通名詞, Nb: 專有名詞, ...

In [15]:
word_p = pos(word_s)

for i in range(10):
    print(word_s[0][i]+':\t'+word_p[0][i])

醫師:	Na
：:	COLONCATEGORY
啊:	I
回去:	VA
還好:	VH
嗎:	T
？:	QUESTIONCATEGORY
民眾:	Na
：:	COLONCATEGORY
欸:	I
