In [9]:
!python --version

Python 3.6.8


In [10]:
!pip3 install sklearn_crfsuite

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
def CRF(x_train, y_train, x_test, y_test):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
#     print(crf) #
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

#     print(y_pred_mar) #

    labels = list(crf.classes_)
    labels.remove('O')
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
    sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
    print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    
#     eli5.show_weights(crf, top=10, feature_re='^word\.is',
#                       horizontal_layout=False, show=['targets'])
    
    return y_pred, y_pred_mar, f1score

In [3]:
import numpy as np

In [4]:
# load pretrained word vectors
# get a dict of tokens (key) and their pretrained word vectors (value)
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs= {}
# open pretrained word vector file
with open('Dataset/cna.cbow.cwe_p.tar_g.512d.0.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # there 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue
    
        word = tokens[0]
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec

In [5]:
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  37230  word_vector_dim:  (353,)


In [6]:
# load `train.data` and separate into a list of labeled data of each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing training data_list splitted from data_list
#   testdata_list: a list of lists, storing testing data_list splitted from data_list
from sklearn.model_selection import train_test_split
def Dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data=f.readlines()#.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list=list()
    idx=0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1])
            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)
    
    # here we random split data into training dataset and testing dataset
    # but you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line, 
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list=train_test_split(data_list,
                                                                                                    article_id_list,
                                                                                                    test_size=0.33,
                                                                                                    random_state=42)
    
    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list 

In [7]:
# open the pos tag file
df_POSTag = pd.read_csv('Dataset/POSTag.txt')
df_POSTag.head()

Unnamed: 0,article_id,text_entity,POS,length
0,1,醫,Na,2
1,1,師,Na,2
2,1,：,COLONCATEGORY,1
3,1,你,Nh,1
4,1,有,D,1


In [8]:
print(len(df_POSTag['text_entity'].unique()))
print(len(df_POSTag))
print(len(df_POSTag['POS'].unique()))

1231
47040
55


In [9]:
df_POSTag.drop_duplicates(inplace=True)
df_POSTag.head()
text_pool = set(df_POSTag['text_entity'])

In [10]:
# look up the POSTag txt
# encode the word
def POSTagEncode(data_list, article_id_list):
    
    POSTag_list = list()

    df = pd.read_csv('Dataset/POSTag.txt')
    POSTag_label = list(df['POS'].unique())
    
    for idx_list in range(len(data_list)):
        df_temp = df[df['article_id']==article_id_list[idx_list]+1]

        text_temp = list(df_temp['text_entity'])
        POS_temp  = list(df_temp['POS'])
        POSTag_list_temp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            word_POSTag_temp=list()
            temp=0
            for POSTag_label_code in POSTag_label:
                if POS_temp[idx_tuple] == POSTag_label_code:
                    word_POSTag_temp.append(temp)
                    break
                else:
                    temp+=1
            POSTag_list_temp.append(word_POSTag_temp)
        POSTag_list.append(POSTag_list_temp)

    return POSTag_list

In [11]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict):
    embedding_list = list()

    # No Match Word (unknown word) Vector in Embedding
    unk_vector=np.random.rand(*(list(embedding_dict.values())[0].shape))

    for idx_list in range(len(data_list)):
        embedding_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            key = data_list[idx_list][idx_tuple][0] # token

            if key in embedding_dict:
                value = embedding_dict[key]
            else:
                value = unk_vector
            embedding_list_tmp.append(value)
        embedding_list.append(embedding_list_tmp)
    
    return embedding_list

In [32]:
# input features: pretrained word vectors of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p):
    
    df = pd.read_csv('Dataset/POSTag.txt')
    feature_list = list()

    list_temp=list(df['POS'].unique())

    # feature of w2d (original)
    for idx_list in range(len(embed_list)):
        feature_list_tmp = list()
        for idx_tuple in range(len(embed_list[idx_list])):
            
            feature_dict = dict()
            # my feature
            feature_dict["POS"] = p[idx_list][idx_tuple][0]
            #print(f"content with 0 {p[idx_list][idx_tuple][0]},content without 0 {p[idx_list][idx_tuple]}")
            #-----------------
            for idx_vec in range(len(embed_list[idx_list][idx_tuple])):
                feature_dict['dim_' + str(idx_vec+1)] = embed_list[idx_list][idx_tuple][idx_vec]
            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)
        print(idx_list+1, '\\', len(embed_list)+1, ', # of token:', len(embed_list[idx_list]))
        
    return feature_list

In [13]:
# get the labels of each tokens in train.data
# return a list of lists of labels
def Preprocess(data_list):
    label_list = list()
    for idx_list in range(len(data_list)):
        label_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            label_list_tmp.append(data_list[idx_list][idx_tuple][1])
        label_list.append(label_list_tmp)
    return label_list

In [14]:
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset('Dataset/sample.data')

In [30]:
ptrain = POSTagEncode(traindata_list, traindata_article_id_list)
ptest  = POSTagEncode(testdata_list, testdata_article_id_list)

In [33]:
# Load Word Embedding
trainembed_list = Word2Vector(traindata_list, word_vecs)
testembed_list = Word2Vector(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
x_train = Feature(trainembed_list, ptrain)
y_train = Preprocess(traindata_list)

# CRF - Test Data (Golden Standard)
x_test = Feature(testembed_list, ptest)
y_test = Preprocess(testdata_list)

1 \ 18 , # of token: 1759
2 \ 18 , # of token: 1827
3 \ 18 , # of token: 1522
4 \ 18 , # of token: 2519
5 \ 18 , # of token: 2618
6 \ 18 , # of token: 1840
7 \ 18 , # of token: 1287
8 \ 18 , # of token: 1000
9 \ 18 , # of token: 1077
10 \ 18 , # of token: 1686
11 \ 18 , # of token: 2024
12 \ 18 , # of token: 3689
13 \ 18 , # of token: 912
14 \ 18 , # of token: 1988
15 \ 18 , # of token: 1080
16 \ 18 , # of token: 1074
17 \ 18 , # of token: 2130
1 \ 10 , # of token: 2829
2 \ 10 , # of token: 1090
3 \ 10 , # of token: 2882
4 \ 10 , # of token: 2075
5 \ 10 , # of token: 1394
6 \ 10 , # of token: 992
7 \ 10 , # of token: 1185
8 \ 10 , # of token: 2731
9 \ 10 , # of token: 1830


In [34]:
print(len(ptrain))
print(len(ptrain[0]))
print(len(ptrain[0][0]))

print(len(trainembed_list))
print(len(trainembed_list[0]))
print(len(trainembed_list[0][0]))

17
1759
1
17
1759
512


In [35]:
# each word with 521 dimension
print(x_train[0][1]['dim_1'])
print(len(x_train[0][1]))
print(y_train[0][0])

0.803237
513
O


In [38]:
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

              precision    recall  f1-score   support

  B-location      0.000     0.000     0.000        15
  I-location      0.000     0.000     0.000        41
  B-med_exam      0.000     0.000     0.000        33
  I-med_exam      1.000     0.025     0.049        80
     B-money      0.444     0.333     0.381        12
     I-money      0.462     0.171     0.250        35
      B-name      0.000     0.000     0.000         7
      I-name      0.000     0.000     0.000        10
      B-time      0.618     0.423     0.503       111
      I-time      0.828     0.528     0.645       265

   micro avg      0.716     0.327     0.449       609
   macro avg      0.335     0.148     0.183       609
weighted avg      0.640     0.327     0.401       609



In [39]:
f1score

0.4006377684321067