In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
def CRF(x_train, y_train):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=105,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
    labels = list(crf.classes_)
    
    return labels, crf

In [3]:
# load pretrained word vectors
# get a dict of tokens (key) and their pretrained word vectors (value)
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs= {}
# open pretrained word vector file
with open('.\\raw_data\\cna.cbow.cwe_p.tar_g.512d.0.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # there 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue
    
        word = tokens[0]
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec

In [4]:
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  158566  word_vector_dim:  (512,)


In [5]:
# simply load data without splitting
def Dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data=f.readlines()#.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list=list()
    idx=0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1])
            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)

    return data_list

In [6]:
# open the pos tag file
df_POSTag = pd.read_csv('.\\processed_data\\train2_POSTag.txt')
df_POSTag.head()

Unnamed: 0,article_id,entity_text,POS,LEN
0,0,醫,Na,2
1,0,師,Na,2
2,0,：,COLONCATEGORY,1
3,0,啊,I,1
4,0,回,VA,2


In [7]:
print(len(df_POSTag))
print(len(df_POSTag['POS'].unique()))

415530
85


In [14]:
# look up the POSTag txt
# encode the word
def POSTagEncode(data_list, POSTag_csv='.\\processed_data\\train2_POSTag.txt'):
    
    
    POSTag_list = list()

    df = pd.read_csv(POSTag_csv)

    # load overall catagory
    with open('processed_data\\trian2_AND_dev2_cat.txt', 'r', encoding='utf-8') as f:
        POSTag_label = f.readline()
    POSTag_label = POSTag_label.split(',')

    for idx_list in range(len(data_list)):
        df_temp = df[df['article_id'] == idx_list]
        POS_temp = list(df_temp['POS'])
        # print(idx_list,'\'',str(len(data_list[idx_list])-len(POS_temp)))
        POSTag_list_temp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            word_POSTag_temp = list()
            for POSTag_label_code in POSTag_label:
                if POS_temp[idx_tuple] == POSTag_label_code:
                    word_POSTag_temp.append(1)
                else:
                    word_POSTag_temp.append(0)

            POSTag_list_temp.append(word_POSTag_temp)
        POSTag_list.append(POSTag_list_temp)

    return POSTag_list

In [9]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict):
    embedding_list = list()

    # No Match Word (unknown word) Vector in Embedding
    unk_vector=np.random.rand(*(list(embedding_dict.values())[0].shape))

    for idx_list in range(len(data_list)):
        embedding_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            key = data_list[idx_list][idx_tuple][0] # token

            if key in embedding_dict:
                value = embedding_dict[key]
            else:
                value = unk_vector
            embedding_list_tmp.append(value)
        embedding_list.append(embedding_list_tmp)
    
    return embedding_list

In [10]:
# input features: pretrained word vectors of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p):
    
    with open('processed_data\\trian2_AND_dev2_cat.txt', 'r', encoding='utf-8') as f:
        POS_unique_list = f.readline()
    POS_unique_list = POS_unique_list.split(',')
    POS_unique_list.append('Start') # 1 if no last word
    
    feature_list = list()
    
    for idx_list in range(len(embed_list)):
        feature_list_tmp = list()
        for idx_tuple in range(len(embed_list[idx_list])):
            
            feature_dict = dict()
            
            # feature of word's POSTag(56+56+1)
            feature_dict['Start'] = 0
            for idx_POS in range(len(POS_unique_list)-1): # exclude Start
                feature_dict[POS_unique_list[idx_POS]] = p[idx_list][idx_tuple][idx_POS]
                if idx_tuple != 0:
                    feature_dict['last_' + POS_unique_list[idx_POS]] = p[idx_list][idx_tuple-1][idx_POS]
                else:
                    feature_dict['Start'] = 1
                    feature_dict['last_' + POS_unique_list[idx_POS]] = 0

            #-----------------
            # feature of word's vector(512)
            for idx_vec in range(len(embed_list[idx_list][idx_tuple])):
                feature_dict['dim_' + str(idx_vec+1)] = embed_list[idx_list][idx_tuple][idx_vec]

            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)
        print(idx_list+1, '\\', len(embed_list)+1, ', # of token:', len(embed_list[idx_list]))
        
    return feature_list

In [11]:
# get the labels of each tokens in train.data
# return a list of lists of labels
def Preprocess(data_list):
    label_list = list()
    for idx_list in range(len(data_list)):
        label_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            label_list_tmp.append(data_list[idx_list][idx_tuple][1])
        label_list.append(label_list_tmp)
    return label_list

In [12]:
data_list = Dataset('.\\data\\train2_sample.data')

In [15]:
p = POSTagEncode(data_list)
embed_list = Word2Vector(data_list, word_vecs)
X = Feature(embed_list, p)
y = Preprocess(data_list)

1 \ 201 , # of token: 4211
2 \ 201 , # of token: 2652
3 \ 201 , # of token: 1689
4 \ 201 , # of token: 1229
5 \ 201 , # of token: 767
6 \ 201 , # of token: 567
7 \ 201 , # of token: 3101
8 \ 201 , # of token: 1790
9 \ 201 , # of token: 1134
10 \ 201 , # of token: 2213
11 \ 201 , # of token: 710
12 \ 201 , # of token: 3503
13 \ 201 , # of token: 1933
14 \ 201 , # of token: 1229
15 \ 201 , # of token: 1340
16 \ 201 , # of token: 1645
17 \ 201 , # of token: 3635
18 \ 201 , # of token: 1204
19 \ 201 , # of token: 2051
20 \ 201 , # of token: 2603
21 \ 201 , # of token: 1282
22 \ 201 , # of token: 1809
23 \ 201 , # of token: 3463
24 \ 201 , # of token: 1633
25 \ 201 , # of token: 1702
26 \ 201 , # of token: 1597
27 \ 201 , # of token: 3369
28 \ 201 , # of token: 4294
29 \ 201 , # of token: 1588
30 \ 201 , # of token: 1495
31 \ 201 , # of token: 3755
32 \ 201 , # of token: 1142
33 \ 201 , # of token: 2970
34 \ 201 , # of token: 1760
35 \ 201 , # of token: 956
36 \ 201 , # of token: 2459
37 \ 

In [19]:
# release resources
if 'embed_list' in globals():
    del embed_list
if 'p' in globals():
    del p
if 'data_list' in globals():
    del data_list

In [20]:
import joblib

labels, crf = CRF(X, y)
joblib.dump(crf, 'crf_105iter')

['crf_85iter']

In [21]:
def loadInputFile(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        dev_set = list()
        
        while(True):
            article_id = f.readline()
            if 'article_id:' not in article_id:
                break
            else:
                dev_set.append(f.readline())
            f.readline()
            f.readline()
            f.readline()
            
    return dev_set

In [22]:
dev_set = loadInputFile('raw_data\development_2.txt')

In [23]:
p = POSTagEncode(dev_set, '.\\processed_data\\dev2_POSTag.txt')
embed_list = Word2Vector(dev_set, word_vecs)
X = Feature(embed_list, p)

1 \ 71 , # of token: 3374
2 \ 71 , # of token: 801
3 \ 71 , # of token: 1483
4 \ 71 , # of token: 3292
5 \ 71 , # of token: 2133
6 \ 71 , # of token: 2857
7 \ 71 , # of token: 1647
8 \ 71 , # of token: 1117
9 \ 71 , # of token: 961
10 \ 71 , # of token: 761
11 \ 71 , # of token: 557
12 \ 71 , # of token: 2365
13 \ 71 , # of token: 956
14 \ 71 , # of token: 1184
15 \ 71 , # of token: 1405
16 \ 71 , # of token: 1927
17 \ 71 , # of token: 724
18 \ 71 , # of token: 3155
19 \ 71 , # of token: 656
20 \ 71 , # of token: 6066
21 \ 71 , # of token: 4666
22 \ 71 , # of token: 3495
23 \ 71 , # of token: 3934
24 \ 71 , # of token: 1874
25 \ 71 , # of token: 1048
26 \ 71 , # of token: 1271
27 \ 71 , # of token: 2593
28 \ 71 , # of token: 638
29 \ 71 , # of token: 2655
30 \ 71 , # of token: 2583
31 \ 71 , # of token: 1683
32 \ 71 , # of token: 2148
33 \ 71 , # of token: 2849
34 \ 71 , # of token: 979
35 \ 71 , # of token: 4617
36 \ 71 , # of token: 5114
37 \ 71 , # of token: 908
38 \ 71 , # of token

In [24]:
y_pred = crf.predict(X)

In [25]:
print(len(y_pred))
print(len(y_pred[0]))
print(y_pred[0][0])
print(y_pred[0][0][0])
print(len(dev_set[0]))

70
3374
O
O
3374


In [26]:
i = 0
for dev_id in range(len(y_pred)):
    for pred_id in range(len(y_pred[dev_id])):
        if y_pred[dev_id][pred_id][0] == 'B':
            i = i+1
print(i)

1143


In [27]:
output="article_id\tstart_position\tend_position\tentity_text\tentity_type\n"
for dev_id in range(len(y_pred)):
    pos=0
    start_pos=None
    end_pos=None
    entity_text=None
    entity_type=None
    for pred_id in range(len(y_pred[dev_id])):
        if y_pred[dev_id][pred_id][0]=='B':
            start_pos=pos
            entity_type=y_pred[dev_id][pred_id][2:]
        elif start_pos is not None and y_pred[dev_id][pred_id][0]=='I'and pred_id<len(y_pred[dev_id])-1 and y_pred[dev_id][pred_id+1][0]=='O':
            end_pos=pos
            entity_text=''.join([dev_set[dev_id][position] for position in range(start_pos,end_pos+1)])
            line=str(dev_id)+'\t'+str(start_pos)+'\t'+str(end_pos+1)+'\t'+entity_text+'\t'+entity_type
            output+=line+'\n'
        pos+=1     

In [28]:
output_path='output105iter.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [29]:
print(output)

time
33	48	50	早上	time
33	51	54	禮拜五	time
33	55	57	下午	time
33	836	841	……5、6	profession
33	880	885	10月7號	time
33	899	902	下禮拜	time
33	920	923	五下午	time
34	436	440	四五年前	time
34	724	727	半年前	time
34	732	735	半年前	time
34	772	775	五個月	time
34	779	782	五個月	time
34	1513	1516	那天天	time
34	1536	1538	昨天	time
34	1805	1809	一個禮拜	time
34	2476	2481	40、50	profession
34	2693	2696	6個月	time
34	2803	2810	概1800多塊	profession
34	2830	2832	半年	time
34	2874	2877	三個月	time
34	2882	2885	700	profession
34	2916	2919	400	profession
34	2946	2950	1000	profession
34	2946	3537	1000塊但是你每次回來都可以領到一整瓶藥。民眾：OK。醫師：那當然藥你可能一個月用不完你就留著，沒有關係那反正就後面慢慢吃。民眾：了解。醫師：那所以我們今天原則上就先抽血。民眾：好。醫師：好。民眾：嗯。醫師：所以你跟你伴侶是講好是不是開放性的關係？民眾：不是。醫師：就是不可以、不可以去外面偷吃。民眾：嗯不行，也不是啦就是本來就……好我、我也懶。因為他他其實對感情也比較專一啦，應該說他們，應該說當然他一開始的時候他就先、事先跟我說有這個狀況。醫師：嗯哼。民眾：那，我就覺得說這人怎麼這麼誠實。醫師：你曾經有交往過感染者嗎？民眾：沒有。醫師：這是第一次？民眾：對對對，只是因為我覺得說這人怎麼這麼誠實，就是他就一五一十的把所有事情都跟我講，然後我就想說就覺得說欸，就覺得嗯……醫師：是老實的……民眾：我覺得就是這個有很勇敢的表現。醫師：嗯哼。民眾：因為不是每個人都敢把自己這種狀況跟對方說。醫師：是。民眾：所以我會覺得就是……醫師：是。民眾：老實說我會覺得這樣是個誠實的表現，所以我就滿信任他的。醫師：他是年紀比你小？民