In [1]:
!pip3 install sklearn_crfsuite



In [2]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [3]:
def CRF(x_train, y_train, x_test, y_test):
    """Train a CRF on the training sequences and evaluate it on the test sequences.

    Args:
        x_train: training sequences; in this notebook each sequence is the list
            of per-token feature dicts built by ``Feature``.
        y_train: label sequences aligned with ``x_train``.
        x_test: test sequences, same structure as ``x_train``.
        y_test: gold label sequences aligned with ``x_test``.

    Returns:
        A tuple ``(y_pred, y_pred_mar, f1score)``: the predicted label
        sequences, the result of ``predict_marginals`` on ``x_test``, and the
        weighted flat F1 score computed over every learned label except 'O'.

    Side effects:
        Prints the flat classification report for the non-'O' labels.
    """
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score the entity labels only. Filtering (instead of list.remove) keeps
    # this working when the training labels happen to contain no 'O' at all.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    # Sort by entity type first, then by the B/I prefix, so the B- and I- rows
    # of the same entity type are adjacent in the report.
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

    return y_pred, y_pred_mar, f1score

In [4]:
# load pretrained word vectors
# get a dict of tokens (key) and their pretrained word vectors (value)
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs = {}
# Build the path with os.path.join so the same relative location resolves on
# Windows (unchanged) as well as on POSIX systems.
word_vec_path = os.path.join('.', 'raw_data', 'cna.cbow.cwe_p.tar_g.512d.0.txt')
# open pretrained word vector file
with open(word_vec_path, encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # skip blank lines (e.g. a trailing empty line); indexing tokens[0]
        # on them would raise IndexError
        if not tokens:
            continue

        # a line with exactly 2 fields is the header: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        # `vec` intentionally stays a module-level name: a later cell prints vec.shape
        vec = np.array([float(t) for t in tokens[1:]])
        word_vecs[word] = vec

In [6]:
# `vec` is the last vector assigned by the loading loop above, so its shape
# is what gets reported here as the word vector dimension.
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  158566  word_vector_dim:  (512,)


In [19]:
# load `train.data` and separate it into a list of labeled data for each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing the training portion split from data_list
#   testdata_list: a list of lists, storing the testing portion split from data_list
#   traindata_article_id_list, testdata_article_id_list: the article ids matching traindata_list / testdata_list
from sklearn.model_selection import train_test_split
def Dataset(data_path):
    """Read a labeled token file and split its articles into train and test sets.

    Every non-blank line is split on single spaces; the first field is the
    token and the second its label. A blank line terminates the current
    article. Articles are numbered 0, 1, 2, ... in file order, and that
    number is used as the article id.

    Args:
        data_path: path of the UTF-8 encoded data file.

    Returns:
        A tuple ``(data_list, traindata_list, testdata_list,
        traindata_article_id_list, testdata_article_id_list)`` where
        ``data_list`` holds one list of ``(token, label)`` tuples per article
        and the remaining items are the train/test split of the articles and
        of their ids.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines()

    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    idx = 0
    for row in data:
        if row == '\n':
            # blank line: close the current article and register its id
            article_id_list.append(idx)
            idx += 1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_list_tmp.append((row[0], row[1]))
    if len(data_list_tmp) != 0:
        # The file did not end with a blank line. Register the id together
        # with the article; otherwise data_list and article_id_list differ in
        # length and train_test_split rejects them.
        article_id_list.append(idx)
        data_list.append(data_list_tmp)

    # here we random split data into training dataset and testing dataset
    # but you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line,
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = train_test_split(
        data_list,
        article_id_list,
        test_size=0.33,
        random_state=56,
    )

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list

In [8]:
# open the pos tag file
# os.path.join yields the same relative path on Windows and keeps the cell
# runnable on POSIX systems, where a backslash is not a path separator
df_POSTag = pd.read_csv(os.path.join('.', 'processed_data', 'train2_POSTag.txt'))
df_POSTag.head()

Unnamed: 0,article_id,entity_text,POS,length
0,0,醫,Na2,
1,0,師,Na2,
2,0,：,COLONCATEGORY1,
3,0,啊,I1,
4,0,回,VA2,


In [9]:
# report, one value per line: number of distinct tokens, number of rows,
# number of distinct POS tags
for size in (
    len(df_POSTag['entity_text'].unique()),
    len(df_POSTag),
    len(df_POSTag['POS'].unique()),
):
    print(size)

1300
53703
147


In [10]:
# remove fully duplicated rows, then collect the set of distinct tokens
df_POSTag = df_POSTag.drop_duplicates()
text_pool = set(df_POSTag['entity_text'])

In [17]:
# look up the POSTag txt
# encode the POS tag of every token as a one-hot list
def POSTagEncode(data_list, article_id_list, postag_path=None):
    """One-hot encode the POS tag of every token of every article.

    Args:
        data_list: list of articles; only the number of tokens of each article
            is used here.
        article_id_list: article id of each entry of ``data_list``; it is
            matched against the 'article_id' column of the POS tag table.
        postag_path: CSV file with the columns 'article_id', 'entity_text' and
            'POS'. Defaults to ``./processed_data/train2_POSTag.txt``.

    Returns:
        ``POSTag_list`` where ``POSTag_list[i][j]`` is a list of 0/1 values,
        one per distinct POS tag (in order of first appearance in the table
        after rows whose token is a single space are dropped), marking the tag
        of token ``j`` of article ``i``.

    Raises:
        IndexError: if an article has more tokens than POS rows.

    Side effects:
        Prints, for every article, its id and the difference between its token
        count and its POS row count.
    """
    if postag_path is None:
        postag_path = os.path.join('.', 'processed_data', 'train2_POSTag.txt')

    df = pd.read_csv(postag_path)
    df = df[df['entity_text'] != ' ']
    POSTag_label = list(df['POS'].unique())

    POSTag_list = list()
    for idx_list, article in enumerate(data_list):
        article_id = article_id_list[idx_list]
        POS_temp = list(df.loc[df['article_id'] == article_id, 'POS'])
        print(article_id, '\'', str(len(article) - len(POS_temp)))

        if len(POS_temp) < len(article):
            # fail with an explicit message instead of a bare list-index error
            raise IndexError(
                'article %s has %d tokens but only %d POS rows'
                % (article_id, len(article), len(POS_temp))
            )

        # one row per token; POS rows beyond the token count are ignored
        POSTag_list.append([
            [1 if pos == code else 0 for code in POSTag_label]
            for pos in POS_temp[:len(article)]
        ])

    return POSTag_list

In [12]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict, unk_seed=0):
    """Map every token to its pretrained vector.

    Args:
        data_list: list of articles, each a list of items whose first element
            is the token.
        embedding_dict: dict mapping token -> vector (numpy array).
        unk_seed: seed for the vector shared by all tokens missing from
            ``embedding_dict``.

    Returns:
        A list with one list of vectors per article, aligned with ``data_list``.

    Raises:
        ValueError: if ``embedding_dict`` is empty, since the vector shape
            cannot be determined.
    """
    try:
        # peek at one vector without materialising every value of the dict
        sample_vec = next(iter(embedding_dict.values()))
    except StopIteration:
        raise ValueError('embedding_dict must contain at least one vector') from None

    # No Match Word (unknown word) Vector in Embedding.
    # A dedicated seeded generator makes every call return the SAME vector, so
    # unknown tokens look identical in the train and test features, and the
    # global numpy random state is left untouched.
    unk_vector = np.random.RandomState(unk_seed).rand(*sample_vec.shape)

    return [
        [embedding_dict.get(item[0], unk_vector) for item in article]
        for article in data_list
    ]

In [13]:
# input features: pretrained word vectors and one-hot POS tags of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p, postag_path=None):
    """Build the per-token feature dicts from word vectors and POS one-hots.

    Every token gets:
      * 'Start': 1 for the first token of an article, otherwise 0;
      * '<POS>': the one-hot value of each POS tag for the token;
      * 'last_<POS>': the same values for the previous token (absent for the
        first token);
      * 'dim_<k>': the k-th component (1-based) of the token's word vector.

    Args:
        embed_list: list of articles, each a list of word vectors.
        p: one-hot POS lists as produced by ``POSTagEncode``, aligned with
            ``embed_list``.
        postag_path: CSV file with the columns 'entity_text' and 'POS' used to
            name the one-hot columns. Defaults to
            ``./processed_data/train2_POSTag.txt``.

    Returns:
        A list with one list of feature dicts per article.

    Side effects:
        Prints a progress line after each article.
    """
    if postag_path is None:
        postag_path = os.path.join('.', 'processed_data', 'train2_POSTag.txt')

    df = pd.read_csv(postag_path)
    # Apply the same filter POSTagEncode uses before taking the unique tags, so
    # column k of `p` is named after the tag it actually encodes and the number
    # of names cannot exceed the width of `p`.
    df = df[df['entity_text'] != ' ']
    POS_unique_list = list(df['POS'].unique())

    feature_list = list()
    total = len(embed_list)
    for idx_list, token_vectors in enumerate(embed_list):
        feature_list_tmp = list()
        for idx_tuple, vector in enumerate(token_vectors):
            is_first = idx_tuple == 0
            feature_dict = dict()

            # feature of word's POSTag
            feature_dict['Start'] = 1 if is_first else 0
            for idx_POS, pos_name in enumerate(POS_unique_list):
                feature_dict[pos_name] = p[idx_list][idx_tuple][idx_POS]
                if not is_first:
                    feature_dict['last_' + pos_name] = p[idx_list][idx_tuple - 1][idx_POS]

            # feature of the word vector
            for idx_vec, value in enumerate(vector):
                feature_dict['dim_' + str(idx_vec + 1)] = value

            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)
        # progress: articles done \ total articles
        print(idx_list + 1, '\\', total, ', # of token:', len(token_vectors))

    return feature_list

In [14]:
# extract the label of every token in the data
# return one list of labels per article
def Preprocess(data_list):
    """Return, for each article, the list of its tokens' labels.

    Each item of an article is expected to carry its label at index 1.
    """
    return [[item[1] for item in article] for article in data_list]

In [20]:
# os.path.join yields the same relative path on Windows and keeps the cell
# runnable on POSIX systems
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset(os.path.join('.', 'data', 'train2_sample.data'))

In [34]:
# one-hot POS tags for the train and test articles
# (POSTagEncode prints one line per article: its id and the token/POS count difference)
ptrain = POSTagEncode(traindata_list, traindata_article_id_list)
ptest  = POSTagEncode(testdata_list, testdata_article_id_list)

62 ' 0
127 ' 0
46 ' 0
116 ' 0
187 ' 0
170 ' 0
109 ' 0
99 ' 0
171 ' 0
168 ' 0
66 ' 0
47 ' 0
169 ' 0
148 ' 0
147 ' 0
79 ' 0
26 ' 0
132 ' 0
24 ' 0
51 ' 0
124 ' 0
172 ' 0
72 ' 0
198 ' 0
138 ' 0
34 ' 0
121 ' 0
195 ' 0
112 ' 0
150 ' 0
88 ' 0
193 ' 0
33 ' 0
134 ' 0
36 ' 0
133 ' 0
56 ' 0
97 ' 0
49 ' 0
189 ' 0
188 ' 0
114 ' 0
5 ' 0
163 ' 0
146 ' 0
39 ' 0
137 ' 0
83 ' 0
48 ' 0
197 ' 0
23 ' 0
104 ' 0
164 ' 0
2 ' 0
179 ' 0
86 ' 0
27 ' 0
180 ' 0
37 ' 0
184 ' 0
59 ' 0
107 ' 0
1 ' 0
95 ' 0
154 ' 0
50 ' 0
157 ' 0
54 ' 0
155 ' 0
190 ' 0
58 ' 0
52 ' 0
105 ' 0
28 ' 0
118 ' 0
136 ' 0
174 ' 0
199 ' 0
67 ' 0
173 ' 0
131 ' 0
176 ' 0
166 ' 0
177 ' 0
160 ' 0
25 ' 0
159 ' 0
14 ' 0
115 ' 0
93 ' 0
75 ' 0
145 ' 0
191 ' 0
94 ' 0
29 ' 0
8 ' 0
32 ' 0
20 ' 0
13 ' 0
16 ' 0
65 ' 0
74 ' 0
69 ' 0
19 ' 0
178 ' 0
175 ' 0
15 ' 0
196 ' 0
165 ' 0
156 ' 0
31 ' 0
10 ' 0
89 ' 0
161 ' 0
80 ' 0
110 ' 0
11 ' 0
43 ' 0
140 ' 0
152 ' 0
113 ' 0
111 ' 0
55 ' 0
90 ' 0
185 ' 0
100 ' 0
22 ' 0
87 ' 0
142 ' 0
122 ' 0
162 ' 0
192 ' 0
143 ' 0
8

In [35]:
# Load Word Embedding: map every token of the train/test articles to its word vector
trainembed_list = Word2Vector(traindata_list, word_vecs)
testembed_list = Word2Vector(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
# x_*: per-token feature dicts, y_*: per-token labels
x_train = Feature(trainembed_list, ptrain)
y_train = Preprocess(traindata_list)

# CRF - Test Data (Golden Standard)
x_test = Feature(testembed_list, ptest)
y_test = Preprocess(testdata_list)

1 \ 135 , # of token: 1375
2 \ 135 , # of token: 1259
3 \ 135 , # of token: 2099
4 \ 135 , # of token: 2861
5 \ 135 , # of token: 1827
6 \ 135 , # of token: 2345
7 \ 135 , # of token: 1791
8 \ 135 , # of token: 3335
9 \ 135 , # of token: 2675
10 \ 135 , # of token: 4396
11 \ 135 , # of token: 661
12 \ 135 , # of token: 1607
13 \ 135 , # of token: 1326
14 \ 135 , # of token: 1466
15 \ 135 , # of token: 2925
16 \ 135 , # of token: 4079
17 \ 135 , # of token: 3369
18 \ 135 , # of token: 2156
19 \ 135 , # of token: 1702
20 \ 135 , # of token: 1718
21 \ 135 , # of token: 4435
22 \ 135 , # of token: 3731
23 \ 135 , # of token: 3689
24 \ 135 , # of token: 1185
25 \ 135 , # of token: 936
26 \ 135 , # of token: 956
27 \ 135 , # of token: 2214
28 \ 135 , # of token: 1988
29 \ 135 , # of token: 7309
30 \ 135 , # of token: 575
31 \ 135 , # of token: 1077
32 \ 135 , # of token: 2829
33 \ 135 , # of token: 1760
34 \ 135 , # of token: 823
35 \ 135 , # of token: 5276
36 \ 135 , # of token: 637
37 \ 13

In [36]:
# POS one-hots: number of articles, tokens in the first article, one-hot width
print(len(ptrain))
print(len(ptrain[0]))
print(len(ptrain[0][0]))

# word vectors: number of articles, tokens in the first article, vector length
print(len(trainembed_list))
print(len(trainembed_list[0]))
print(len(trainembed_list[0][0]))

134
1375
181
134
1375
512


In [37]:
# each token's word vector has 512 dimensions (see the vector length printed above);
# its feature dict additionally holds the POS one-hot entries and the 'Start' flag
print(x_train[0][1]['dim_1'])
print(len(x_train[0][1]))
print(y_train[0][0])

1.148426
875
O


In [38]:
# train on the training split and evaluate on the test split; CRF prints the classification report
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

                  precision    recall  f1-score   support

            B-ID      0.000     0.000     0.000         3
            I-ID      0.000     0.000     0.000         7
B-clinical_event      0.000     0.000     0.000         1
I-clinical_event      0.000     0.000     0.000         3
       B-contact      0.000     0.000     0.000        29
       I-contact      0.000     0.000     0.000       107
     B-education      0.000     0.000     0.000         3
     I-education      0.000     0.000     0.000         9
        B-family      0.250     0.091     0.133        11
        I-family      0.250     0.091     0.133        11
      B-location      0.784     0.843     0.812       108
      I-location      0.762     0.767     0.765       159
      B-med_exam      0.571     0.066     0.118       182
      I-med_exam      0.604     0.078     0.138       371
         B-money      0.714     0.222     0.339        45
         I-money      0.694     0.217     0.331       115
          B-n

In [39]:
# weighted F1 over all labels except 'O', as returned by CRF
f1score

0.586899510662267

In [40]:
def _decode_bio_spans(tags):
    """Return ``[(start, end_exclusive, entity_type), ...]`` for one BIO tag sequence.

    An entity starts at a tag beginning with 'B' and extends over the
    following 'I-<same type>' tags. It is closed by any other tag or by the
    end of the sequence, so single-token entities, entities directly followed
    by another entity, and entities ending on the last token are all reported.
    An 'I' tag without an open entity of the same type is ignored.
    """
    spans = []
    start, entity_type = None, None
    for idx, tag in enumerate(tags):
        if start is not None and tag == 'I-' + entity_type:
            continue  # still inside the open entity
        if start is not None:
            spans.append((start, idx, entity_type))
            start, entity_type = None, None
        if tag.startswith('B'):
            start, entity_type = idx, tag[2:]
    if start is not None:
        # the sequence ended while an entity was still open
        spans.append((start, len(tags), entity_type))
    return spans


# Build the TSV submission text: one header row, then one row per predicted
# entity. start_position is the index of the entity's first token and
# end_position is one past its last token.
output_lines = ["article_id\tstart_position\tend_position\tentity_text\tentity_type"]
for test_id, predicted_tags in enumerate(y_pred):
    article_tokens = testdata_list[test_id]
    article_id = testdata_article_id_list[test_id]
    for start_pos, end_pos, entity_type in _decode_bio_spans(predicted_tags):
        entity_text = ''.join(article_tokens[position][0] for position in range(start_pos, end_pos))
        output_lines.append('\t'.join([str(article_id), str(start_pos), str(end_pos), entity_text, entity_type]))
output = '\n'.join(output_lines) + '\n'

In [41]:
# write the TSV text to output.tsv in the current working directory
output_path='output.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [42]:
# show the generated TSV text
print(output)

article_id	start_position	end_position	entity_text	entity_type
71	9	12	兩個月	time
71	32	37	兩個小時前	time
71	79	82	兩個月	time
71	213	216	兩個月	time
71	329	331	兩年	time
71	2805	2807	兩天	time
42	180	183	五個月	time
42	278	281	兩個月	time
42	329	332	兩個月	time
42	554	559	8月25號	time
42	901	905	兩個禮拜	time
42	941	944	五個月	time
42	950	953	五個月	time
82	4	7	這個月	time
82	191	196	這一天禮拜	time
82	197	202	今天禮拜四	time
82	217	219	禮拜	time
82	222	225	禮拜五	time
82	390	392	八點	time
82	416	419	八點一	med_exam
82	428	430	禮拜	time
82	444	446	禮拜	time
82	485	490	禮拜五晚上	time
82	523	525	八點	time
82	575	579	7月1號	time
82	584	588	7月1號	time
82	593	595	八點	time
82	854	857	這個月	time
167	16	19	這個月	time
167	147	151	下下禮拜	time
167	172	176	下下禮拜	time
167	183	188	禮拜五早上	time
167	241	245	第八個月	time
167	413	415	台中	location
167	420	422	新竹	location
167	524	526	新竹	location
167	636	640	第八個月	time
167	659	662	下禮拜	time
167	718	721	禮拜四	time
167	726	728	下午	time
167	790	794	第九個月	time
167	803	807	十月九號	time
167	826	829	前一天	time
167	844	847	前一週	time
167	852	855	下一週	time
167	88