In [1]:
!pip3 install sklearn_crfsuite

You should consider upgrading via the 'c:\users\user\appdata\local\programs\python\python36\python.exe -m pip install --upgrade pip' command.


In [2]:
import os
import sys
import unicodedata
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [3]:
def CRF(x_train, y_train, x_test, y_test):
    """Fit a CRF on the training sequences and evaluate it on the test sequences.

    Args:
        x_train: training sequences, passed unchanged to ``crf.fit``.
        y_train: label sequences aligned with ``x_train``.
        x_test: evaluation sequences, passed to ``predict`` and
            ``predict_marginals``.
        y_test: gold label sequences aligned with ``x_test``.

    Returns:
        A tuple ``(y_pred, y_pred_mar, f1score, crf)``: the predicted label
        sequences, the predicted marginals, the weighted F1 score computed
        over every learned label except ``'O'``, and the fitted model.

    Side effects:
        Prints a classification report for the non-``'O'`` labels.
    """
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Evaluate on everything except the 'O' label. Filtering (rather than
    # list.remove) keeps this working when the training labels contain no 'O'.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    # Sort by the label text after its first character, then by that first
    # character, so B-/I- variants of the same type are listed together.
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

    return y_pred, y_pred_mar, f1score, crf

In [4]:
# Load the pretrained word vectors into `word_vecs`: token (key) -> numpy vector (value).
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs = {}
# open pretrained word vector file
with open('.\\raw_data\\cna.cbow.cwe_p.tar_g.512d.0.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # A blank (or whitespace-only) line has no token to index; skip it
        # instead of failing on tokens[0].
        if not tokens:
            continue

        # A line with exactly two fields is treated as the header line:
        # vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        # `vec` is left defined after the loop; the next cell prints its shape.
        vec = np.array([float(t) for t in tokens[1:]])
        word_vecs[word] = vec

In [5]:
# Report the number of loaded tokens and the shape of the last vector parsed
# (`vec` is the variable left over from the loading loop in the previous cell).
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  158566  word_vector_dim:  (512,)


In [6]:
# load `train.data` and separate it into one list of labeled tokens per text
# return:
#   data_list: a list of lists of tuples, storing the token and label (wrapped in a tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing the training portion split from data_list
#   testdata_list: a list of lists, storing the testing portion split from data_list
#   traindata_article_id_list, testdata_article_id_list: the article ids matching the two portions
from sklearn.model_selection import train_test_split
def Dataset(data_path):
    """Read a labeled data file and split its texts into train and test parts.

    Each non-blank line is split on single spaces and its first two fields
    are kept as a ``(token, label)`` tuple. A line consisting only of a
    newline ends the current text. Texts are numbered 0, 1, 2, ... in file
    order and that number is used as the article id.

    Args:
        data_path: path of the UTF-8 encoded data file.

    Returns:
        A tuple ``(data_list, traindata_list, testdata_list,
        traindata_article_id_list, testdata_article_id_list)`` where
        ``data_list`` holds every text and the remaining four values are the
        result of ``train_test_split`` on the texts and their article ids.
    """
    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    idx = 0
    with open(data_path, 'r', encoding='utf-8') as f:
        for row in f:
            if row == '\n':
                # blank line: close the current text and assign its id
                article_id_list.append(idx)
                idx += 1
                data_list.append(data_list_tmp)
                data_list_tmp = []
            else:
                row = row.strip('\n').split(' ')
                data_list_tmp.append((row[0], row[1]))

    if len(data_list_tmp) != 0:
        # The file did not end with a blank line. The last text needs an id
        # too, otherwise data_list and article_id_list differ in length and
        # train_test_split rejects them.
        article_id_list.append(idx)
        data_list.append(data_list_tmp)

    # here we random split data into training dataset and testing dataset
    # but you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line,
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = train_test_split(
        data_list,
        article_id_list,
        test_size=0.33,
        random_state=56)

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list

In [7]:
# Load the POS tag table (parsed as CSV) and preview its first rows.
df_POSTag = pd.read_csv('.\\processed_data\\train2_POSTag.txt')
df_POSTag.head()

Unnamed: 0,article_id,entity_text,POS,length
0,0,醫,Na2,
1,0,師,Na2,
2,0,：,COLONCATEGORY1,
3,0,啊,I1,
4,0,回,VA2,


In [8]:
# Distinct token strings, total number of rows, distinct POS tags.
# dropna=False keeps a missing value counted as one distinct entry.
print(df_POSTag['entity_text'].nunique(dropna=False))
print(df_POSTag.shape[0])
print(df_POSTag['POS'].nunique(dropna=False))

2140
415530
181


In [9]:
# Drop fully duplicated rows by rebinding the name (no in-place mutation),
# then collect the set of distinct token strings.
df_POSTag = df_POSTag.drop_duplicates()
text_pool = set(df_POSTag['entity_text'])

In [10]:
# look up the POSTag txt
# encode the word
def POSTagEncode(data_list, article_id_list):
    """One-hot encode the POS tag of every token, text by text.

    The POS table is read from the processed-data file, rows whose
    ``entity_text`` is a single space are discarded, and the distinct ``POS``
    values (in order of first appearance) define the one-hot positions.

    Args:
        data_list: list of texts; only the length of each text is used.
        article_id_list: article id for each text, aligned with ``data_list``.

    Returns:
        A list with one entry per text; each entry is a list with one 0/1
        vector per token, taken from the POS rows of that article by position.

    Side effects:
        Prints, for each text, the article id and the difference between the
        number of tokens and the number of POS rows found for that article.
    """
    pos_table = pd.read_csv('.\\processed_data\\train2_POSTag.txt')
    pos_table = pos_table[pos_table['entity_text'] != ' ']
    pos_labels = list(pos_table['POS'].unique())

    POSTag_list = []
    for position, tokens in enumerate(data_list):
        article_id = article_id_list[position]
        article_pos = list(pos_table.loc[pos_table['article_id'] == article_id, 'POS'])

        # A non-zero difference means tokens and POS rows are out of step.
        print(article_id,'\'',str(len(tokens)-len(article_pos)))

        POSTag_list.append([
            [1 if article_pos[token_idx] == label else 0 for label in pos_labels]
            for token_idx in range(len(tokens))
        ])

    return POSTag_list

In [11]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict, unk_vector=None):
    """Replace every token by its pretrained vector.

    Args:
        data_list: list of texts; each text is a list of tuples whose first
            element is the token.
        embedding_dict: mapping from token to its vector (numpy array).
        unk_vector: vector used for tokens missing from ``embedding_dict``.
            When omitted, a vector with the same shape as the stored vectors
            is generated from a fixed seed.

    Returns:
        A list of lists of vectors with the same nesting as ``data_list``.

    Raises:
        ValueError: if ``unk_vector`` is omitted and ``embedding_dict`` is
            empty, because the vector shape cannot be inferred.
    """
    if unk_vector is None:
        if not embedding_dict:
            raise ValueError(
                'embedding_dict is empty: cannot infer the shape of the unknown-word vector')
        vector_shape = next(iter(embedding_dict.values())).shape
        # Fixed seed: every call (train data, test data, a later re-run) must
        # map unknown tokens to the SAME vector, otherwise the unknown-word
        # features seen at prediction time differ from those seen in training.
        # A private RandomState leaves the global numpy RNG untouched.
        unk_vector = np.random.RandomState(0).rand(*vector_shape)

    return [
        [embedding_dict.get(item[0], unk_vector) for item in text]
        for text in data_list
    ]

In [12]:
# input features: pretrained word vectors of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p):
    """Build one feature dict per token from its word vector and POS one-hots.

    For every token the dict contains:
      * ``'Start'``: 1 for the first token of a text, otherwise 0;
      * one entry per POS tag name, holding the token's one-hot value;
      * for every token except the first, one ``'last_<POS>'`` entry per POS
        tag, holding the previous token's one-hot value;
      * ``'dim_1'`` ... ``'dim_N'``: the components of the word vector.

    Args:
        embed_list: list of texts, each a list of word vectors.
        p: POS one-hot encodings with the same nesting as ``embed_list``,
            as produced by ``POSTagEncode``.

    Returns:
        A list of lists of feature dicts with the same nesting as
        ``embed_list``.

    Side effects:
        Prints a progress line after each text.
    """
    df = pd.read_csv('.\\processed_data\\train2_POSTag.txt')
    # POSTagEncode derives its one-hot positions from the table with the
    # single-space rows removed. The tag names used here must come from the
    # same filtered table, otherwise position i of a one-hot vector can be
    # given the name of a different tag (or indexing runs past the vector).
    df = df[df['entity_text'] != ' ']
    POS_unique_list = list(df['POS'].unique())

    feature_list = list()
    total_texts = len(embed_list)

    for idx_list, text_vectors in enumerate(embed_list):
        feature_list_tmp = list()
        for idx_tuple, word_vector in enumerate(text_vectors):
            is_first_token = idx_tuple == 0

            # 'Start' marks a token that has no previous token in its text
            feature_dict = {'Start': 1 if is_first_token else 0}

            # POS features of this token and, when there is one, of the previous token
            current_pos = p[idx_list][idx_tuple]
            previous_pos = None if is_first_token else p[idx_list][idx_tuple - 1]
            for idx_POS, pos_name in enumerate(POS_unique_list):
                feature_dict[pos_name] = current_pos[idx_POS]
                if previous_pos is not None:
                    feature_dict['last_' + pos_name] = previous_pos[idx_POS]

            # word-vector features, numbered from 1
            for idx_vec, component in enumerate(word_vector):
                feature_dict['dim_' + str(idx_vec + 1)] = component

            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)
        print(idx_list + 1, '\\', total_texts, ', # of token:', len(text_vectors))

    return feature_list

In [13]:
# get the labels of each tokens in train.data
# return a list of lists of labels
def Preprocess(data_list):
    """Extract the labels from the labeled texts.

    Args:
        data_list: list of texts, each a list of tuples whose second element
            is the label.

    Returns:
        A list of label lists with the same nesting as ``data_list``.
    """
    return [[pair[1] for pair in text] for text in data_list]

In [14]:
# Parse the sample training file: all texts, plus a random train/test split
# of the texts and of their article ids.
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset('.\\data\\train2_sample.data')

In [16]:
from sklearn.model_selection import ShuffleSplit
# Five random 70/30 splits over the texts; one CRF is trained per split.
rs = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
crfs = []
fold_f1scores = []  # weighted F1 of each split, in the same order as `crfs`

# Dataset numbers the texts 0, 1, 2, ... in file order, so the article ids are
# simply the positions in data_list (derived from its length, not hard-coded).
a = list(range(len(data_list)))
p = POSTagEncode(data_list, a)
embed_list = Word2Vector(data_list, word_vecs)
X = Feature(embed_list, p)
y = Preprocess(data_list)

for train_index, test_index in rs.split(data_list):
    x_train = [X[idx] for idx in train_index]
    y_train = [y[idx] for idx in train_index]
    x_test = [X[idx] for idx in test_index]
    y_test = [y[idx] for idx in test_index]

    y_pred, y_pred_mar, f1score, crf = CRF(x_train, y_train, x_test, y_test)
    crfs.append(crf)
    fold_f1scores.append(f1score)

0 ' 0
1 ' 0
2 ' 0
3 ' 0
4 ' 0
5 ' 0
6 ' 0
7 ' 0
8 ' 0
9 ' 0
10 ' 0
11 ' 0
12 ' 0
13 ' 0
14 ' 0
15 ' 0
16 ' 0
17 ' 0
18 ' 0
19 ' 0
20 ' 0
21 ' 0
22 ' 0
23 ' 0
24 ' 0
25 ' 0
26 ' 0
27 ' 0
28 ' 0
29 ' 0
30 ' 0
31 ' 0
32 ' 0
33 ' 0
34 ' 0
35 ' 0
36 ' 0
37 ' 0
38 ' 0
39 ' 0
40 ' 0
41 ' 0
42 ' 0
43 ' 0
44 ' 0
45 ' 0
46 ' 0
47 ' 0
48 ' 0
49 ' 0
50 ' 0
51 ' 0
52 ' 0
53 ' 0
54 ' 0
55 ' 0
56 ' 0
57 ' 0
58 ' 0
59 ' 0
60 ' 0
61 ' 0
62 ' 0
63 ' 0
64 ' 0
65 ' 0
66 ' 0
67 ' 0
68 ' 0
69 ' 0
70 ' 0
71 ' 0
72 ' 0
73 ' 0
74 ' 0
75 ' 0
76 ' 0
77 ' 0
78 ' 0
79 ' 0
80 ' 0
81 ' 0
82 ' 0
83 ' 0
84 ' 0
85 ' 0
86 ' 0
87 ' 0
88 ' 0
89 ' 0
90 ' 0
91 ' 0
92 ' 0
93 ' 0
94 ' 0
95 ' 0
96 ' 0
97 ' 0
98 ' 0
99 ' 0
100 ' 0
101 ' 0
102 ' 0
103 ' 0
104 ' 0
105 ' 0
106 ' 0
107 ' 0
108 ' 0
109 ' 0
110 ' 0
111 ' 0
112 ' 0
113 ' 0
114 ' 0
115 ' 0
116 ' 0
117 ' 0
118 ' 0
119 ' 0
120 ' 0
121 ' 0
122 ' 0
123 ' 0
124 ' 0
125 ' 0
126 ' 0
127 ' 0
128 ' 0
129 ' 0
130 ' 0
131 ' 0
132 ' 0
133 ' 0
134 ' 0
135 ' 0
136 ' 0
137 ' 0
138 ' 

In [17]:
import joblib

# save the trained models, one file per entry of `crfs`
for idx, crf_model in enumerate(crfs):
    filename = '.\\saved_model\\crf1208_' + str(idx) + '.sav'
    # Create the target folder when the path has one and it is missing,
    # so the dump does not fail on a fresh checkout.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(crf_model, filename)

In [None]:
# One-hot POS encodings for the train and test portions returned by Dataset.
ptrain = POSTagEncode(traindata_list, traindata_article_id_list)
ptest  = POSTagEncode(testdata_list, testdata_article_id_list)

In [None]:
# Look up the pretrained word vector of every token in both portions
trainembed_list = Word2Vector(traindata_list, word_vecs)
testembed_list = Word2Vector(testdata_list, word_vecs)

# CRF inputs for the training portion: per-token feature dicts and labels
x_train = Feature(trainembed_list, ptrain)
y_train = Preprocess(traindata_list)

# CRF inputs for the testing portion (the held-out part of the same random split)
x_test = Feature(testembed_list, ptest)
y_test = Preprocess(testdata_list)

In [None]:
# POS encodings: number of texts, tokens in the first text, one-hot width of its first token
print(len(ptrain))
print(len(ptrain[0]))
print(len(ptrain[0][0]))

# Word vectors: number of texts, tokens in the first text, vector length of its first token
print(len(trainembed_list))
print(len(trainembed_list[0]))
print(len(trainembed_list[0][0]))

In [None]:
# Inspect the second token of the first training text: its first word-vector
# feature and its total number of features ('Start', one entry per POS tag,
# one 'last_' entry per POS tag, and one entry per word-vector dimension),
# followed by the label of the first token.
print(x_train[0][1]['dim_1'])
print(len(x_train[0][1]))
print(y_train[0][0])

In [None]:
# release resources: drop the embedding and POS intermediates when they exist
for _name in ('trainembed_list', 'testembed_list', 'ptrain', 'ptest'):
    globals().pop(_name, None)
del _name

In [None]:
# Train on the training portion and evaluate on the testing portion;
# CRF also prints the per-label classification report.
y_pred, y_pred_mar, f1score, crf = CRF(x_train, y_train, x_test, y_test)

In [None]:
# Display the weighted F1 score returned by the CRF call above.
f1score

In [None]:
# Drop the train/test feature and label lists when they exist.
for _name in ('x_train', 'y_train', 'x_test', 'y_test'):
    globals().pop(_name, None)
del _name