In [1]:
import os
import sys
import string
import unicodedata

import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
output_path='output93iter_addAlphaDict.tsv'
model_name = 'crf93iter_AlphaDict'

In [3]:
def CRF(x_train, y_train):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=93,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
    labels = list(crf.classes_)
    
    return labels, crf

In [4]:
# load pretrained word vectors
# get a dict of tokens (key) and their pretrained word vectors (value)
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs= {}
# open pretrained word vector file
with open('.\\raw_data\\cna.cbow.cwe_p.tar_g.512d.0.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # there 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue
    
        word = tokens[0]
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec

In [5]:
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  158566  word_vector_dim:  (512,)


In [6]:
# simply load data without splitting
def Dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data=f.readlines()#.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list=list()
    idx=0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1])
            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)

    return data_list

In [7]:
# open the pos tag file
df_POSTag = pd.read_csv('.\\processed_data\\train2_POSTag.txt')
df_POSTag.head()

Unnamed: 0,article_id,entity_text,POS,LEN
0,0,醫,Na,2
1,0,師,Na,2
2,0,：,COLONCATEGORY,1
3,0,啊,I,1
4,0,回,VA,2


In [8]:
print(len(df_POSTag))
print(len(df_POSTag['POS'].unique()))

415530
86


In [9]:
# look up the POSTag txt
# encode the word
def POSTagEncode(data_list, POSTag_csv='.\\processed_data\\train2_POSTag.txt'):
    
    
    POSTag_list = list()
    LEN_list = list()

    df = pd.read_csv(POSTag_csv)
    df = df[df['entity_text'] != ' ']

    # load overall catagory
    with open('processed_data\\trian2_AND_dev2_cat.txt', 'r', encoding='utf-8') as f:
        POSTag_label = f.readline()
    POSTag_label = POSTag_label.split(',')

    for idx_list in range(len(data_list)):
        df_temp = df[df['article_id'] == idx_list]
        POS_temp = list(df_temp['POS'])
        LEN_temp = list(df_temp['LEN'])
        # print(idx_list,'\'',str(len(data_list[idx_list])-len(POS_temp)))
        POSTag_list_temp = list()
        LEN_list_temp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            word_POSTag_temp = list()
            
            # POSTag
            for POSTag_label_code in POSTag_label:
                if POS_temp[idx_tuple] == POSTag_label_code:
                    word_POSTag_temp.append(1)
                else:
                    word_POSTag_temp.append(0)
            POSTag_list_temp.append(word_POSTag_temp)

            # LEN
            LEN_list_temp.append(LEN_temp[idx_tuple])


        POSTag_list.append(POSTag_list_temp)
        LEN_list.append(LEN_list_temp)

    return POSTag_list, LEN_list

In [10]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict):
    
    alphabet_list = list(string.ascii_uppercase)
    embedding_list = list()

    # No Match Word (unknown word) Vector in Embedding
    unk_vector=np.random.rand(*(list(embedding_dict.values())[0].shape))

    for idx_list in range(len(data_list)):
        embedding_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            key = data_list[idx_list][idx_tuple][0] # token

            # alphabet and width
            if unicodedata.normalize('NFKC',key).upper() in alphabet_list:
                key = unicodedata.normalize('NFKC', key).upper()

            if key in embedding_dict:
                value = embedding_dict[key]
            else:
                embedding_dict[key] = unk_vector
                value = embedding_dict[key]
                
            embedding_list_tmp.append(value)
        embedding_list.append(embedding_list_tmp)
    
    return embedding_list

In [11]:
# input features: pretrained word vectors of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p, l):
    
    with open('processed_data\\trian2_AND_dev2_cat.txt', 'r', encoding='utf-8') as f:
        POS_unique_list = f.readline()
    POS_unique_list = POS_unique_list.split(',')
    POS_unique_list.append('Start') # 1 if no last word
    
    feature_list = list()
    
    for idx_list in range(len(embed_list)):
        feature_list_tmp = list()
        for idx_tuple in range(len(embed_list[idx_list])):
            
            feature_dict = dict()
            
            # feature of word's POSTag(56+56+1)
            feature_dict['Start'] = 0
            for idx_POS in range(len(POS_unique_list)-1): # exclude Start
                feature_dict[POS_unique_list[idx_POS]] = p[idx_list][idx_tuple][idx_POS]
                if idx_tuple != 0:
                    feature_dict['last_' + POS_unique_list[idx_POS]] = p[idx_list][idx_tuple-1][idx_POS]
                else:
                    feature_dict['Start'] = 1
                    feature_dict['last_' + POS_unique_list[idx_POS]] = 0

            # feature of word LEN(1)
            feature_dict['word_LEN'] = l[idx_list][idx_tuple]
            #-----------------
            # feature of word's vector(512)
            for idx_vec in range(len(embed_list[idx_list][idx_tuple])):
                feature_dict['dim_' + str(idx_vec+1)] = embed_list[idx_list][idx_tuple][idx_vec]

            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)
        print(idx_list+1, '\\', len(embed_list)+1, ', # of token:', len(embed_list[idx_list]))
        
    return feature_list

In [12]:
# get the labels of each tokens in train.data
# return a list of lists of labels
def Preprocess(data_list):
    label_list = list()
    for idx_list in range(len(data_list)):
        label_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            label_list_tmp.append(data_list[idx_list][idx_tuple][1])
        label_list.append(label_list_tmp)
    return label_list

In [13]:
data_list = Dataset('.\\data\\train2_sample.data')

In [14]:
p, l = POSTagEncode(data_list)
embed_list = Word2Vector(data_list, word_vecs)
X = Feature(embed_list, p, l)
y = Preprocess(data_list)

1 \ 201 , # of token: 4211
2 \ 201 , # of token: 2652
3 \ 201 , # of token: 1689
4 \ 201 , # of token: 1229
5 \ 201 , # of token: 767
6 \ 201 , # of token: 567
7 \ 201 , # of token: 3101
8 \ 201 , # of token: 1790
9 \ 201 , # of token: 1134
10 \ 201 , # of token: 2213
11 \ 201 , # of token: 710
12 \ 201 , # of token: 3503
13 \ 201 , # of token: 1933
14 \ 201 , # of token: 1229
15 \ 201 , # of token: 1340
16 \ 201 , # of token: 1645
17 \ 201 , # of token: 3635
18 \ 201 , # of token: 1204
19 \ 201 , # of token: 2051
20 \ 201 , # of token: 2603
21 \ 201 , # of token: 1282
22 \ 201 , # of token: 1809
23 \ 201 , # of token: 3463
24 \ 201 , # of token: 1633
25 \ 201 , # of token: 1702
26 \ 201 , # of token: 1597
27 \ 201 , # of token: 3369
28 \ 201 , # of token: 4294
29 \ 201 , # of token: 1588
30 \ 201 , # of token: 1495
31 \ 201 , # of token: 3755
32 \ 201 , # of token: 1142
33 \ 201 , # of token: 2970
34 \ 201 , # of token: 1760
35 \ 201 , # of token: 956
36 \ 201 , # of token: 2459
37 \ 

In [15]:
# release resources
if 'embed_list' in globals():
    del embed_list
if 'p' in globals():
    del p
if 'data_list' in globals():
    del data_list

In [16]:
import joblib

labels, crf = CRF(X, y)
joblib.dump(crf, model_name)

['crf93iter_AlphaDict']

In [17]:
def loadInputFile(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        dev_set = list()
        
        while(True):
            article_id = f.readline()
            if 'article_id:' not in article_id:
                break
            else:
                dev_set.append(f.readline())
            f.readline()
            f.readline()
            f.readline()
            
    return dev_set

In [18]:
dev_set = loadInputFile('raw_data\development_2.txt')

In [19]:
p, l = POSTagEncode(dev_set, '.\\processed_data\\dev2_POSTag.txt')
embed_list = Word2Vector(dev_set, word_vecs)
X = Feature(embed_list, p, l)

1 \ 71 , # of token: 3374
2 \ 71 , # of token: 801
3 \ 71 , # of token: 1483
4 \ 71 , # of token: 3292
5 \ 71 , # of token: 2133
6 \ 71 , # of token: 2857
7 \ 71 , # of token: 1647
8 \ 71 , # of token: 1117
9 \ 71 , # of token: 961
10 \ 71 , # of token: 761
11 \ 71 , # of token: 557
12 \ 71 , # of token: 2365
13 \ 71 , # of token: 956
14 \ 71 , # of token: 1184
15 \ 71 , # of token: 1405
16 \ 71 , # of token: 1927
17 \ 71 , # of token: 724
18 \ 71 , # of token: 3155
19 \ 71 , # of token: 656
20 \ 71 , # of token: 6066
21 \ 71 , # of token: 4666
22 \ 71 , # of token: 3495
23 \ 71 , # of token: 3934
24 \ 71 , # of token: 1874
25 \ 71 , # of token: 1048
26 \ 71 , # of token: 1271
27 \ 71 , # of token: 2593
28 \ 71 , # of token: 638
29 \ 71 , # of token: 2655
30 \ 71 , # of token: 2583
31 \ 71 , # of token: 1683
32 \ 71 , # of token: 2148
33 \ 71 , # of token: 2849
34 \ 71 , # of token: 979
35 \ 71 , # of token: 4617
36 \ 71 , # of token: 5114
37 \ 71 , # of token: 908
38 \ 71 , # of token

In [20]:
y_pred = crf.predict(X)

In [21]:
print(len(y_pred))
print(len(y_pred[0]))
print(y_pred[0][0])
print(y_pred[0][0][0])
print(len(dev_set[0]))

70
3374
O
O
3374


In [22]:
i = 0
for dev_id in range(len(y_pred)):
    for pred_id in range(len(y_pred[dev_id])):
        if y_pred[dev_id][pred_id][0] == 'B':
            i = i+1
print(i)

1098


In [23]:
output="article_id\tstart_position\tend_position\tentity_text\tentity_type\n"
for dev_id in range(len(y_pred)):
    pos=0
    start_pos=None
    end_pos=None
    entity_text=None
    entity_type=None
    for pred_id in range(len(y_pred[dev_id])):
        if y_pred[dev_id][pred_id][0]=='B':
            start_pos=pos
            entity_type=y_pred[dev_id][pred_id][2:]
        elif start_pos is not None and y_pred[dev_id][pred_id][0]=='I'and pred_id<len(y_pred[dev_id])-1 and y_pred[dev_id][pred_id+1][0]=='O':
            end_pos=pos
            entity_text=''.join([dev_set[dev_id][position] for position in range(start_pos,end_pos+1)])
            line=str(dev_id)+'\t'+str(start_pos)+'\t'+str(end_pos+1)+'\t'+entity_text+'\t'+entity_type
            output+=line+'\n'
        pos+=1     

In [24]:
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [25]:
print(output)

2247	荷蘭	location
11	2347	2350	巫醫師	name
12	214	220	3點多4點多	time
12	342	345	129	med_exam
12	349	352	129	med_exam
12	360	363	129	med_exam
12	407	412	7月26號	time
12	434	436	92	med_exam
12	855	857	85	med_exam
12	885	887	85	med_exam
13	48	52	9月2號	time
13	60	63	9．7	med_exam
13	87	89	56	med_exam
13	158	161	9．6	med_exam
13	188	191	9．6	med_exam
13	394	398	2019	med_exam
13	463	465	3月	time
13	481	484	7個月	time
13	488	491	10月	time
13	492	496	10月底	time
14	27	30	753	med_exam
14	45	47	37	med_exam
14	242	246	38.7	med_exam
14	250	253	387	med_exam
14	384	388	68.7	med_exam
14	459	462	2個月	time
14	476	479	2個月	time
14	621	624	這個月	time
14	655	657	8月	time
14	662	664	4月	time
14	671	673	4月	time
14	701	703	8月	time
14	1094	1096	今年	time
14	1136	1140	前兩個月	time
14	1168	1170	22	med_exam
14	1173	1177	5月21	time
14	1179	1184	5月21號	time
14	1385	1390	9月20號	time
14	1396	1404	9月……9月20	time
15	92	95	2.4	med_exam
15	99	102	2.4	med_exam
15	177	179	58	med_exam
15	182	185	2.4	med_exam
15	632	634	前鎮	location
15	1437	1439	10	med_exam
