In [None]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
def CRF(x_train, y_train):
    """Fit a ``sklearn_crfsuite.CRF`` model and report the labels it learned.

    Parameters
    ----------
    x_train
        Training feature sequences, passed unchanged to ``CRF.fit``.
    y_train
        Training label sequences, passed unchanged to ``CRF.fit``.

    Returns
    -------
    tuple
        ``(labels, crf)`` where ``labels`` is ``list(crf.classes_)`` and
        ``crf`` is the fitted model.
    """
    # Fixed training configuration used for every run of this notebook.
    hyperparams = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=150,
        all_possible_transitions=True,
    )
    model = sklearn_crfsuite.CRF(**hyperparams)
    model.fit(x_train, y_train)

    return list(model.classes_), model

In [None]:
# Load pretrained word vectors into `word_vecs`: a dict of token (key) -> numpy vector (value).
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
# Expected text layout: a header line with two integers (vocabulary_size, word_vector_dim),
# then one line per token: the token followed by its whitespace-separated vector components.
dim = 0
word_vecs = {}
# os.path.join keeps this relative path valid on non-Windows systems as well
with open(os.path.join('.', 'raw_data', 'cna.cbow.cwe_p.tar_g.512d.0.txt'), encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # skip blank lines (e.g. a trailing newline at the end of the file);
        # indexing tokens[0] on them would raise IndexError
        if not tokens:
            continue

        # a line with exactly 2 fields is treated as the header: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        # `vec` intentionally stays a module-level name: a later cell prints vec.shape
        vec = np.array([float(t) for t in tokens[1:]])
        word_vecs[word] = vec

In [None]:
# Sanity check: number of loaded tokens and the shape of `vec`, i.e. of the
# last vector parsed by the loading loop (not the `dim` value read from the header).
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

In [None]:
# simply load data without splitting
def Dataset(data_path):
    """Read a token/label file and group its rows into articles.

    Every non-blank line is split on single spaces and its first two fields
    are kept as a ``(token, label)`` tuple.  A line consisting only of a
    newline closes the current article (even when that article is empty).

    Parameters
    ----------
    data_path : str
        Path of the UTF-8 encoded data file.

    Returns
    -------
    list of list of tuple
        One list of ``(token, label)`` tuples per article, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two space-separated fields.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        rows = f.readlines()

    articles = []
    current = []
    for raw in rows:
        if raw != '\n':
            fields = raw.strip('\n').split(' ')
            current.append((fields[0], fields[1]))
            continue
        # blank separator line: the current article is complete
        articles.append(current)
        current = []

    # the file may end without a trailing separator line
    if current:
        articles.append(current)

    return articles

In [None]:
# open the pos tag file
# (the path is assembled with os.path.join so it also resolves on non-Windows systems)
df_POSTag = pd.read_csv(os.path.join('.', 'processed_data', 'train2_POSTag.txt'))
df_POSTag.head()

In [None]:
# number of rows in the POS tag table
print(len(df_POSTag))
# number of distinct values in its 'POS' column
print(len(df_POSTag['POS'].unique()))

In [None]:
# look up the POSTag txt
# one-hot encode the POS tag of every token
def POSTagEncode(data_list,
                 POSTag_csv=os.path.join('.', 'processed_data', 'train2_POSTag.txt'),
                 label_csv=os.path.join('.', 'processed_data', 'train2_POSTag.txt')):
    """One-hot encode the POS tag of every token of every article.

    Parameters
    ----------
    data_list : sequence
        One entry per article; only ``len()`` of each entry is used (number
        of tokens).  The i-th entry is matched with the CSV rows whose
        ``article_id`` equals ``i``.
    POSTag_csv : str
        CSV holding the POS tags to encode.  Must provide the columns
        ``article_id``, ``entity_text`` and ``POS``.  Rows whose
        ``entity_text`` is a single space are ignored.
    label_csv : str
        CSV whose ``POS`` column defines the one-hot vocabulary, in order of
        first appearance.  It defaults to the training file because
        ``Feature`` names the one-hot columns by position from that file's
        unfiltered ``POS`` values; deriving the vocabulary from
        ``POSTag_csv`` would mislabel (or mis-size) the vectors whenever a
        different file, such as the dev set, is encoded.

    Returns
    -------
    list of list of list of int
        ``result[article][token]`` is a 0/1 vector with one slot per
        vocabulary entry; a tag that is not in the vocabulary yields all
        zeros.

    Raises
    ------
    IndexError
        If an article has fewer POS rows than tokens.
    """
    label_df = pd.read_csv(label_csv)
    POSTag_label = list(label_df['POS'].unique())

    df = label_df if POSTag_csv == label_csv else pd.read_csv(POSTag_csv)
    df = df[df['entity_text'] != ' ']

    # Group once instead of re-filtering the whole frame for every article.
    pos_by_article = {
        article_id: list(tags)
        for article_id, tags in df.groupby('article_id', sort=False)['POS']
    }

    POSTag_list = list()
    for idx_list, tokens in enumerate(data_list):
        POS_temp = pos_by_article.get(idx_list, [])
        # diagnostic: token count minus POS-row count (0 means aligned)
        print(idx_list,'\'',str(len(tokens)-len(POS_temp)))
        if len(POS_temp) < len(tokens):
            raise IndexError(
                'article %d has %d tokens but only %d POS rows in %s'
                % (idx_list, len(tokens), len(POS_temp), POSTag_csv)
            )

        POSTag_list.append([
            [1 if POS_temp[idx_tuple] == label else 0 for label in POSTag_label]
            for idx_tuple in range(len(tokens))
        ])

    return POSTag_list

In [None]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in the given data
def Word2Vector(data_list, embedding_dict, unk_vector=None):
    """Map every token to its pretrained vector.

    Parameters
    ----------
    data_list : sequence of sequences
        One inner sequence per article; the token is element ``[0]`` of each
        item (works for ``(token, label)`` tuples and for plain strings,
        where each item is a single character).
    embedding_dict : dict
        Token -> numpy vector.
    unk_vector : numpy.ndarray, optional
        Vector used for tokens missing from ``embedding_dict``.  When
        omitted, a vector with the shape of the first embedding is drawn
        from a fixed-seed generator, so every call (training and
        prediction alike) represents unknown tokens identically.  Drawing it
        from the global, unseeded RNG on each call would give the model a
        different unknown-token vector at prediction time than it was
        trained on.

    Returns
    -------
    list of list of numpy.ndarray
        Same nesting as ``data_list``.  All unknown tokens share the same
        ``unk_vector`` object.

    Raises
    ------
    ValueError
        If ``unk_vector`` is omitted and ``embedding_dict`` is empty, since
        the vector shape cannot be inferred.
    """
    if unk_vector is None:
        if not embedding_dict:
            raise ValueError(
                'embedding_dict is empty: cannot infer the shape of the unknown-word vector'
            )
        # read one vector without materialising every value of a large dict
        sample_vector = next(iter(embedding_dict.values()))
        # No Match Word (unknown word) Vector in Embedding; fixed seed => reproducible
        unk_vector = np.random.RandomState(0).rand(*sample_vector.shape)

    return [
        [embedding_dict.get(item[0], unk_vector) for item in sequence]
        for sequence in data_list
    ]

In [None]:
# input features: pretrained word vectors and one-hot POS tags of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list, p, POSTag_csv=os.path.join('.', 'processed_data', 'train2_POSTag.txt')):
    """Build one feature dict per token from POS one-hot vectors and word vectors.

    Parameters
    ----------
    embed_list : list of list of sequence of float
        ``embed_list[article][token]`` is the word vector of that token.
    p : list of list of sequence
        ``p[article][token][k]`` is the value stored for the k-th POS label
        (k indexes the distinct ``POS`` values of ``POSTag_csv`` in order of
        first appearance).
    POSTag_csv : str
        CSV whose ``POS`` column supplies the feature names for the POS
        slots.  Defaults to the training POS file.

    Returns
    -------
    list of list of dict
        For every token a dict with, in this insertion order:
        ``'Start'`` (1 for the first token of an article, else 0); for each
        POS label the token's own value under ``<label>`` and, except for
        the first token, the previous token's value under ``last_<label>``;
        then ``dim_1`` .. ``dim_N`` holding the word-vector components.
    """
    POS_labels = list(pd.read_csv(POSTag_csv)['POS'].unique())

    feature_list = list()
    total = len(embed_list)

    for idx_list, vectors in enumerate(embed_list):
        pos_rows = p[idx_list]
        feature_list_tmp = list()

        for idx_tuple, vector in enumerate(vectors):
            is_first = idx_tuple == 0
            current_pos = pos_rows[idx_tuple]
            previous_pos = None if is_first else pos_rows[idx_tuple - 1]

            # 'Start' marks a token that has no previous token
            feature_dict = {'Start': 1 if is_first else 0}

            # POS features of this token and of the previous one
            for idx_POS, label in enumerate(POS_labels):
                feature_dict[label] = current_pos[idx_POS]
                if previous_pos is not None:
                    feature_dict['last_' + label] = previous_pos[idx_POS]

            # word-vector features, numbered from 1
            for idx_vec, value in enumerate(vector, start=1):
                feature_dict['dim_' + str(idx_vec)] = value

            feature_list_tmp.append(feature_dict)

        feature_list.append(feature_list_tmp)
        # progress: "<articles done> \ <articles total>"
        print(idx_list+1, '\\', total, ', # of token:', len(vectors))

    return feature_list

In [None]:
# collect the label of every token
# return a list (one entry per article) of lists of labels
def Preprocess(data_list):
    """Extract the label, i.e. element ``[1]``, of every token entry.

    Parameters
    ----------
    data_list : list of list of tuple
        One list of ``(token, label)`` tuples per article.

    Returns
    -------
    list of list
        The labels, nested exactly like ``data_list``.
    """
    return [[entry[1] for entry in article] for article in data_list]

In [None]:
# load the training tokens/labels; os.path.join keeps the relative path
# valid on non-Windows systems as well
data_list = Dataset(os.path.join('.', 'data', 'train2_sample.data'))

In [None]:
# Build the training inputs from `data_list`:
# p          - one-hot POS vectors per token (default POS tag file)
# embed_list - pretrained word vector per token
# X          - per-token feature dicts combining both
# y          - per-token labels
p = POSTagEncode(data_list)
embed_list = Word2Vector(data_list, word_vecs)
X = Feature(embed_list, p)
y = Preprocess(data_list)

In [None]:
# release resources: drop the intermediates that were only needed to build X and y
# (a name is removed only when it is currently defined, so re-running this cell is harmless)
for _stale_name in ('embed_list', 'p', 'data_list'):
    globals().pop(_stale_name, None)
del _stale_name

In [None]:
import joblib

# fit the CRF on the training features/labels, then persist the fitted
# model to the file 'crf_150iter' in the working directory
labels, crf = CRF(X, y)
joblib.dump(crf, 'crf_150iter')

In [None]:
def loadInputFile(file_path):
    """Read the article texts from a file made of 5-line records.

    Each record is read as: a header line containing ``'article_id:'``, the
    article text line, then three lines that are skipped.  Reading stops at
    the first header position that does not contain ``'article_id:'``
    (including end of file).

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    list of str
        The text line of every record, exactly as returned by ``readline``
        (a trailing newline, if present, is kept).
    """
    articles = []
    with open(file_path, 'r', encoding='utf8') as f:
        header = f.readline()
        while 'article_id:' in header:
            articles.append(f.readline())
            # skip the three lines that follow the text
            for _ in range(3):
                f.readline()
            header = f.readline()

    return articles

In [None]:
# load the development articles; the path is assembled with os.path.join so
# the literal contains no backslash escapes and resolves on any OS
dev_set = loadInputFile(os.path.join('raw_data', 'development_2.txt'))

In [None]:
# Build the prediction inputs for the development articles (each article is a
# plain string, so every character is one token).
# os.path.join keeps the relative POS file path valid on non-Windows systems.
p = POSTagEncode(dev_set, os.path.join('.', 'processed_data', 'dev2_POSTag.txt'))
embed_list = Word2Vector(dev_set, word_vecs)
X = Feature(embed_list, p)

In [None]:
# run the trained CRF on the feature sequences currently held in X
y_pred = crf.predict(X)

In [None]:
# Quick inspection of the predictions:
print(len(y_pred))          # number of predicted sequences
print(len(y_pred[0]))       # number of tags in the first sequence
print(y_pred[0][0])         # first tag of the first sequence
print(y_pred[0][0][0])      # first character of that tag
print(len(dev_set[0]))      # length of the first dev article, for comparison with the tag count

In [None]:
# count the predicted tags whose first character is 'B'
i = sum(1 for tags in y_pred for tag in tags if tag[0] == 'B')
print(i)

In [None]:
# Decode the predicted tags into entity spans and build the TSV text in `output`.
# An entity starts at a tag beginning with 'B', extends over the 'I' tags that
# directly follow it, and is closed as soon as the next tag is not 'I' or the
# article ends.  Closing only on "'I' followed by 'O'" would silently drop
# single-token entities, entities directly followed by another 'B', and
# entities that reach the end of an article.
output_rows = ["article_id\tstart_position\tend_position\tentity_text\tentity_type\n"]
for dev_id in range(len(y_pred)):
    tags = y_pred[dev_id]
    start_pos = None
    entity_type = None
    for pos in range(len(tags)):
        prefix = tags[pos][0]
        if prefix == 'B':
            start_pos = pos
            entity_type = tags[pos][2:]   # text after the 2-character 'B-' prefix
        elif prefix != 'I' or start_pos is None:
            # outside any entity; an 'I' without an open 'B' is ignored
            start_pos = None
            continue

        # still inside the entity when the next tag continues it
        if pos + 1 < len(tags) and tags[pos + 1][0] == 'I':
            continue

        end_pos = pos
        entity_text = ''.join(dev_set[dev_id][start_pos:end_pos + 1])
        # end_position is written as exclusive (end_pos + 1)
        line = str(dev_id)+'\t'+str(start_pos)+'\t'+str(end_pos+1)+'\t'+entity_text+'\t'+entity_type
        output_rows.append(line + '\n')
        # reset so a later stray 'I' cannot re-emit this span
        start_pos = None

output = ''.join(output_rows)

In [None]:
# write the TSV text built above to 'output2.tsv' (UTF-8) in the working directory
output_path='output2.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [None]:
# show the full TSV text that was written
print(output)