In [1]:
import os
import sys
import numpy as np
import pandas as pd

import tensorflow as tf
import ckiptagger

# 讀取訓練與開發資料

In [2]:
train_path= os.getcwd() + '\\raw_data\\train_2.txt'
dev_path = os.getcwd() + '\\raw_data\\development_2.txt'

In [3]:
def loadInputFile(train_path, dev_path):
    developeset = list()  # store developeset [content,content,...]
    trainingset = list()  # store trainingset [content,content,...]
    position = list()  # store position [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # store mentions[mention] = Type
    
    with open(train_path, 'r', encoding='utf8') as f:
        file_text=f.read().encode('utf-8').decode('utf-8-sig')
    datas=file_text.split('\n\n--------------------\n\n')[:-1]
    for data in datas:
        data=data.split('\n')
        content=data[0]
        trainingset.append(content)
        annotations=data[1:]
        for annot in annotations[1:]:
            annot=annot.split('\t') #annot= article_id, start_pos, end_pos, entity_text, entity_type
            position.extend(annot)
            mentions[annot[3]]=annot[4]
    
    with open(dev_path, 'r', encoding='utf8') as f:
    
        while(True):
            article_id = f.readline()
            if 'article_id:' not in article_id:
                break
            else:
                developeset.append(f.readline())
            f.readline()
            f.readline()
            f.readline()
            

    return developeset, trainingset, position, mentions

# 使用pretrained的模型ckip來標記詞性作為feature
[需要使用的模型](http://ckip.iis.sinica.edu.tw/data/ckiptagger/data.zip)
ckipath為解壓縮後資料夾的path

In [4]:
from ckiptagger import WS, POS, NER
ckipath = '.\\data\\data\\data'
ws = WS(ckipath)
pos = POS(ckipath)
# ner = NER(ckipath)

In [5]:
 def LoadPosTag(trainingset, developmentset):

    train_POSTarr = list()
    dev_POSTarr = list()

    # combin all article
    full_article = list()
    full_article = trainingset.copy()
    for dev_article in developmentset:
        full_article.append(dev_article)

    word_s = ws(full_article,
                sentence_segmentation=True,
                segment_delimiter_set={'?', '？', '!', '！', '。', ',',   
                                    '，', ';', ':', '、'})
    word_p = pos(word_s)
    print('CKIP-tag complete')
    
    for idx_list in range(len(word_s)):
        temp_arr = list()
        for idx_tuple in range(len(word_s[idx_list])):
            for character in word_s[idx_list][idx_tuple]:
                temp_arr.append(word_p[idx_list][idx_tuple])
        # is training set
        if idx_list < len(trainingset):
            train_POSTarr.append(temp_arr)
        else:
             dev_POSTarr.append(temp_arr)
        
        if (idx_list)%10==0 and idx_list != 0:
            print(idx_list, 'completed')

    return train_POSTarr, dev_POSTarr

In [6]:
def WritePOSTagFile(dev_set, train_set, dev_POS, train_POS,dev_file_name='dev_POSTag.txt', train_file_name='train_POSTag.txt'):

    # write dev file
    if (os.path.isfile(".\\processed_data\\" + dev_file_name)):
        os.remove(".\\processed_data\\" + dev_file_name)
    with open(".\\processed_data\\" + dev_file_name, "w", encoding='utf8') as f:
        f.write('article_id,entity_text,POS\n')
        iter = 0

        for idx_dev in range(len(dev_set)):
            for idx_text_POSTag in range(len(dev_set[idx_dev])):
                f.write(str(iter)+','+dev_set[idx_dev][idx_text_POSTag]+','+dev_POS[idx_dev][idx_text_POSTag]+'\n')
            iter = iter + 1
        
        if iter%10 == 0:
            print('Total complete development articles:', iter)

    # write train file
    if (os.path.isfile(".\\processed_data\\" + train_file_name)):
        os.remove(".\\processed_data\\" + train_file_name)
    with open(".\\processed_data\\" + train_file_name, "w", encoding='utf8') as f:
        f.write('article_id,entity_text,POS\n')
        iter = 0

        for idx_train in range(len(train_set)):
            for idx_text_POSTag in range(len(train_set[idx_train])):
                f.write(str(iter)+','+train_set[idx_train][idx_text_POSTag]+','+train_POS[idx_train][idx_text_POSTag]+'\n')
            iter = iter + 1
        
        if iter%10 == 0:
            print('Total complete train articles:', iter)

# 將資料標記為CRF的label形式(同baseline)

In [7]:
def CRFFormatData(trainingset, position, path):
    if (os.path.isfile(path)):
        os.remove(path)
    outputfile = open(path, 'a', encoding= 'utf-8')

    # output file lines
    count = 0 # annotation counts in each content
    tagged = list()
    for article_id in range(len(trainingset)):
        trainingset_split = list(trainingset[article_id])
        while '' or ' ' in trainingset_split:
            if '' in trainingset_split:
                trainingset_split.remove('')
            else:
                trainingset_split.remove(' ')
        start_tmp = 0
        for position_idx in range(0,len(position),5):
            if int(position[position_idx]) == article_id:
                count += 1
                if count == 1:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos == 0:
                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                            
                            output_str = token[token_idx] + ' ' + label + '\n'
                            outputfile.write(output_str)

                    else:
                        token = list(trainingset[article_id][0:start_pos])
                        whole_token = trainingset[article_id][0:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            
                            output_str = token[token_idx] + ' ' + 'O' + '\n'
                            outputfile.write(output_str)

                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token[0] == '':
                                if token_idx == 1:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type
                            else:
                                if token_idx == 0:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type

                            output_str = token[token_idx] + ' ' + label + '\n'
                            outputfile.write(output_str)

                    start_tmp = end_pos
                else:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos<start_tmp:
                        continue
                    else:
                        token = list(trainingset[article_id][start_tmp:start_pos])
                        whole_token = trainingset[article_id][start_tmp:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            output_str = token[token_idx] + ' ' + 'O' + '\n'
                            outputfile.write(output_str)

                    token = list(trainingset[article_id][start_pos:end_pos])
                    whole_token = trainingset[article_id][start_pos:end_pos]
                    for token_idx in range(len(token)):
                        if len(token[token_idx].replace(' ','')) == 0:
                            continue
                        # BIO states
                        if token[0] == '':
                            if token_idx == 1:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        else:
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        
                        output_str = token[token_idx] + ' ' + label + '\n'
                        outputfile.write(output_str)
                    start_tmp = end_pos

        token = list(trainingset[article_id][start_tmp:])
        whole_token = trainingset[article_id][start_tmp:]
        for token_idx in range(len(token)):
            if len(token[token_idx].replace(' ','')) == 0:
                continue

            
            output_str = token[token_idx] + ' ' + 'O' + '\n'
            outputfile.write(output_str)

        count = 0
    
        output_str = '\n'
        outputfile.write(output_str)
        ID = trainingset[article_id]

        if article_id%10 == 0:
            print('Total complete articles:', article_id)

    # close output file
    outputfile.close()

In [8]:
developset, trainingset, position, mention = loadInputFile(train_path, dev_path)
train_POS, dev_POS = LoadPosTag(trainingset, developset)

CKIP-tag complete
10 completed
20 completed
30 completed
40 completed
50 completed
60 completed
70 completed
80 completed
90 completed
100 completed
110 completed
120 completed
130 completed
140 completed
150 completed
160 completed
170 completed
180 completed
190 completed
200 completed
210 completed
220 completed
230 completed
240 completed
250 completed
260 completed


In [9]:
# length checking
print('# of tagged train article\t', len(train_POS))
print('# of tagged text in article0\t', len(train_POS[0]))
print('# of text in article0\t\t', len(trainingset[0]))
print('# of tagged development article\t', len(dev_POS))
print('# of tagged text in article0\t',len(dev_POS[0]))
print('# of text in article0\t\t',len(developset[0]))

# of tagged train article	 200
# of tagged text in article0	 4211
# of text in article0		 4211
# of tagged development article	 70
# of tagged text in article0	 3374
# of text in article0		 3374


In [10]:
WritePOSTagFile(developset, trainingset, dev_POS, train_POS, 'dev2_POSTag.txt', 'train2_POSTag.txt')

Total complete development articles: 70
Total complete train articles: 200


In [11]:
# write CRF label file at data_path
data_path='data/train2_sample.data'
CRFFormatData(trainingset, position, data_path)

Total complete articles: 0
Total complete articles: 10
Total complete articles: 20
Total complete articles: 30
Total complete articles: 40
Total complete articles: 50
Total complete articles: 60
Total complete articles: 70
Total complete articles: 80
Total complete articles: 90
Total complete articles: 100
Total complete articles: 110
Total complete articles: 120
Total complete articles: 130
Total complete articles: 140
Total complete articles: 150
Total complete articles: 160
Total complete articles: 170
Total complete articles: 180
Total complete articles: 190


In [12]:
# make sure the feature size is identical
dev_set = set()
for article_POS in dev_POS:
    for text_POS in article_POS:
        dev_set.add(text_POS)
print('POSTag type of development set\t', len(dev_set))

train_set = set()
for article_POS in train_POS:
    for text_POS in article_POS:
        train_set.add(text_POS)
print('POSTag type of tarining set\t', len(train_set))

POSTag type of development set	 56
POSTag type of tarining set	 56


In [13]:
print(dev_set)

{'VJ', 'Nc', 'Cbb', 'V_2', 'Nh', 'COLONCATEGORY', 'PAUSECATEGORY', 'QUESTIONCATEGORY', 'T', 'SHI', 'VG', 'Di', 'PARENTHESISCATEGORY', 'Na', 'DE', 'Nes', 'VD', 'VI', 'Nf', 'VK', 'WHITESPACE', 'Nep', 'VAC', 'VE', 'I', 'Dfb', 'VA', 'Neqb', 'VB', 'VC', 'Ng', 'Neu', 'Dk', 'ETCCATEGORY', 'Neqa', 'VCL', 'COMMACATEGORY', 'P', 'EXCLAMATIONCATEGORY', 'VH', 'Nd', 'Nv', 'Cba', 'Cab', 'Nb', 'Dfa', 'A', 'VL', 'PERIODCATEGORY', 'Ncd', 'VHC', 'D', 'Da', 'FW', 'VF', 'Caa'}
