運行環境:Colab<br>
最後一段code為失敗的CRF (BiLSTM)<br>

In [94]:

import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

In [95]:
# Mount Google Drive so the notebook can read and write the homework files.
# This cell only works inside Google Colab (it imports `google.colab`).
from google.colab import drive
drive.mount('/content/drive')

# Base directory for every input/output file used below.
# It is a relative path, so it resolves against the current working directory.
folder = 'drive/MyDrive/forth_up/AI_IN_medical/hw3'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [96]:
!ls

drive  sample_data


## Preprocessing
* Change input data (ex. train.txt) into CRF model input format (ex. train.data)
    * CRF model input format (ex. train.data):
        ```
        肝 O
        功 O
        能 O
        6 B-med_exam
        8 I-med_exam
        ```

## Data pre-processing
- annot = article_id, start_pos, end_pos, entity_text, entity_type (columns)

In [97]:
# Convert the raw annotated corpus into the character-per-line CRF format:
# one "<char> <BIO-label>" line per character, with a blank line after each article.
file_path = f'{folder}/sample_data.txt'

# 'utf-8-sig' decodes UTF-8 and drops a leading BOM if the file has one.
with open(file_path, 'r', encoding='utf-8-sig') as f:
    file_text = f.read()

# Articles are separated by a dashed line; the piece after the last separator is discarded.
datas = file_text.split('\n\n--------------------\n\n')[:-1]

# Write with an explicit encoding: the file is read back later as UTF-8.
with open(f"{folder}/processed_data.txt", "w", encoding='utf-8') as f:
    for data in datas:
        lines = data.split('\n')
        content = lines[0]

        # lines[1] is the tab-separated header
        # (article_id, start_position, end_position, entity_text, entity_type);
        # the remaining non-blank lines are annotation rows.
        columns = lines[1].split('\t') if len(lines) > 1 else []
        annotations = [
            dict(zip(columns, line.split('\t')))
            for line in lines[2:]
            if line.strip()
        ]

        # Every character starts as 'O'; annotated spans are overwritten with B-/I- tags.
        tmp_label_list = np.array(['O'] * len(content), dtype=object)
        for annot in annotations:
            start = int(annot['start_position'])
            end = int(annot['end_position'])
            etype = str(annot['entity_type'])
            tmp_label_list[start] = "B-" + etype
            tmp_label_list[start+1:end] = "I-" + etype

        for char, label in zip(content, tmp_label_list):
            f.write(f"{char} {label}\n")

        # Blank line marks the end of the article.
        f.write('\n')

## NER model
### CRF (Conditional Random Field model)
* Using `sklearn-crfsuite` API

    (you may try `CRF++`, `python-crfsuite`, `pytorch-crfsuite`(neural network version))

In [98]:
!pip install sklearn-crfsuite
import sklearn_crfsuite

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [99]:
def CRF(x_train, y_train, x_test, y_test):
    """Train a linear-chain CRF and evaluate it on the given test split.

    Args:
        x_train: list of sequences, each a list of per-token feature dicts.
        y_train: list of label sequences aligned with ``x_train``.
        x_test: feature sequences to predict on.
        y_test: gold label sequences aligned with ``x_test``.

    Returns:
        (y_pred, y_pred_mar, f1score): predicted label sequences, the output of
        ``predict_marginals`` for ``x_test``, and the weighted F1 score computed
        over every learned label except 'O'.
    """
    # Doc: https://sklearn-crfsuite.readthedocs.io/en/latest/api.html#module-sklearn_crfsuite
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(x_train, y_train)

    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score entity labels only. Filtering (rather than list.remove) avoids a
    # ValueError when 'O' is not among the learned classes.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    return y_pred, y_pred_mar, f1score

## Model Input: 
* input features:
    * word vector: pretrained traditional chinese word embedding by Word2Vec-CBOW
    
    (you may try adding other features, e.g. POS tag, word_length, word_position, ...)

In [100]:
import numpy as np

In [101]:
# Load pretrained word vectors
# Get a dict of tokens (key) and their pretrained word vectors (value)
# Pre-trained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
# Pre-trained fastText word embeddings: https://fasttext.cc/docs/en/crawl-vectors.html
# cc.zh.300.vec cna.cbow.cwe_p.tar_g.512d.0.txt
# Vector dimensionality, taken from the file's header line when one is present.
dim = 0
# token -> pretrained vector (1-D numpy array of floats)
word_vecs= {}
# Open pretrained word vector file.
# Read explicitly as UTF-8 so decoding does not depend on the platform default.
with open(f'{folder}/cc.zh.300.vec', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # Skip blank lines; they would otherwise raise IndexError on tokens[0].
        if not tokens:
            continue

        # A line with exactly two fields is treated as the header:
        # vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec

In [102]:
# Report how many embeddings were loaded and the shape of one vector.
# `vec` is the variable left over from the loading loop, i.e. the last vector read.
print(f'vocabulary_size: {len(word_vecs)}')
print(f'word_vector_dim: {vec.shape}')

vocabulary_size: 2000000
word_vector_dim: (300,)


Here we split the data into a training set and a testing set.
However, we will also provide `development data` and `test data`, which are the real testing datasets.

You should upload your predictions on the `development data` and `test data` to the system, not the predictions on this split-off testing set.

In [103]:
# Load `train.data` and separate into a list of labeled data of each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing training data_list splitted from data_list
#   testdata_list: a list of lists, storing testing data_list splitted from data_list
from sklearn.model_selection import train_test_split


def make_dataset(data_path):
    """Load a CRF-format file and split its articles into train and test sets.

    The file holds one "<token> <label>" pair per line; a blank line ends an
    article.

    Args:
        data_path: path of the UTF-8 encoded CRF-format file.

    Returns:
        (data_list, traindata_list, testdata_list,
         traindata_article_id_list, testdata_article_id_list)
        where ``data_list`` is a list of articles, each a list of
        ``(token, label)`` tuples, and the id lists hold the 0-based position
        of each article within the file.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines() #.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    for row in data:
        if row == '\n':
            # Blank line: close the current article.
            article_id_list.append(len(data_list))
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            # Split on the LAST space only, so a token that is itself a space
            # (written as "  <label>") is kept intact.
            token, label = row.rstrip('\n').rsplit(' ', 1)
            data_list_tmp.append((token, label))
    if len(data_list_tmp) != 0:
        # File did not end with a blank line: keep the trailing article and
        # give it an id too, so both lists stay the same length.
        article_id_list.append(len(data_list))
        data_list.append(data_list_tmp)

    # Here we randomly split the data into a training set and a testing set.
    # But you should take `development data` or `test data` as testing data.
    # At that time, you could just delete this line,
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list=train_test_split(data_list,
                                                                                                        article_id_list,
                                                                                                        test_size=0.33,
                                                                                                        random_state=42)

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list

In [104]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data


def build_word_vectors(data_list, embedding_dict):
    """Attach a pretrained vector to every token of every article.

    Args:
        data_list: list of articles, each a list of ``(token, label)`` tuples.
        embedding_dict: mapping from token to its vector (numpy array).

    Returns:
        A list with one entry per article; each entry is a list of
        ``[token, vector]`` pairs. Tokens missing from ``embedding_dict`` all
        share one randomly generated vector of the same shape as the
        pretrained ones.

    Raises:
        ValueError: if ``embedding_dict`` is empty, since the vector shape
            cannot be determined.
    """
    if not embedding_dict:
        raise ValueError("embedding_dict is empty; cannot infer the word vector shape")

    # Read the shape from the first value without copying every value into a list.
    vector_shape = next(iter(embedding_dict.values())).shape

    # No Match Word (unknown word) Vector in Embedding.
    # Drawn once from np.random, so it differs between runs unless the RNG is seeded.
    unk_vector = np.random.rand(*vector_shape)

    embedding_list = list()
    for article in data_list:
        embedding_list.append(
            [[entry[0], embedding_dict.get(entry[0], unk_vector)] for entry in article]
        )
    return embedding_list

In [105]:
# Input features: pretrained word vectors of each token
# Return a list of feature dicts, each feature dict corresponding to each token
def make_features(embed_list, interval_list):
    """Build the per-token feature dicts fed to the CRF.

    Args:
        embed_list: list of articles, each a list of ``[token, vector]`` pairs.
        interval_list: list of integers consumed in order, one per token whose
            text starts with "B" (see the note in the body).

    Returns:
        A list with one entry per article; each entry is a list of feature
        dicts with the keys ``dim_1`` .. ``dim_N`` (vector components),
        ``Is_num``, ``Is_mark`` and ``Interval``.
    """
    # Full-width punctuation marks flagged by the "Is_mark" feature.
    punctuation_marks = {"！", "？", "：", "。", "，"}

    interval = 0
    feature_list = list()
    for article in embed_list:
        article_features = list()
        for entry in article:
            token, vector = entry[0], entry[1]

            # One feature per embedding dimension, named dim_1, dim_2, ...
            feature_dict = {'dim_' + str(i + 1): value for i, value in enumerate(vector)}

            # ASCII digit, or a "." directly after a numeric token (e.g. "10.78").
            # The length check keeps empty or multi-character tokens from raising.
            is_num = len(token) == 1 and '0' <= token <= '9'
            if not is_num and token == "." and article_features:
                # The emptiness guard avoids IndexError when "." opens an article.
                is_num = article_features[-1]['Is_num']
            feature_dict['Is_num'] = is_num

            feature_dict["Is_mark"] = token in punctuation_marks

            # NOTE(review): this tests the token TEXT for a leading "B", not the
            # token's BIO label -- confirm that this is the intended condition.
            # The bounds check prevents IndexError once interval_list is used up.
            if token[:1] == "B" and interval < len(interval_list):
                feature_dict["Interval"] = interval_list[interval]
                interval += 1
            else:
                feature_dict["Interval"] = 0

            article_features.append(feature_dict)
        feature_list.append(article_features)

    return feature_list

In [106]:
# Get the labels of each tokens in train.data
# Return a list of lists of labels
def process_labels(data_list):
    """Extract the label sequences and the length of each multi-token entity.

    Args:
        data_list: list of articles, each a list of ``(token, label)`` tuples
            with BIO labels.

    Returns:
        (label_list, interval_list): ``label_list`` holds one list of labels
        per article. ``interval_list`` holds, in reading order, the number of
        tokens since the most recent "B" label (inclusive) at the end of every
        run of "I" labels. Entities made of a single "B" token add no entry.
    """
    label_list = list()
    interval_list = list()
    # Tokens seen since the most recent "B" label; not reset between articles.
    interval = 0
    for article in data_list:
        labels = [entry[1] for entry in article]
        for pos, label in enumerate(labels):
            if label.startswith("B"):
                interval = 0
            if label.startswith("I"):
                # A run of "I" labels ends when the next label is not "I" or when
                # the article ends; the bounds check avoids IndexError on the
                # last token.
                next_is_inside = pos + 1 < len(labels) and labels[pos + 1].startswith("I")
                if not next_is_inside:
                    interval_list.append(interval + 1)
            interval += 1

        label_list.append(labels)

    return label_list, interval_list

## Training

In [107]:
# Parse the preprocessed CRF-format file and split its articles into train/test sets.
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = make_dataset(f"{folder}/processed_data.txt")

In [108]:
# Load Word Embedding
# Each call draws its own random vector for out-of-vocabulary tokens, so the
# train and test sets do not share the same unknown-word vector.
trainembed_list = build_word_vectors(traindata_list, word_vecs)
testembed_list = build_word_vectors(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
# Labels and entity lengths come from the gold annotations of the training split.
y_train, interval_list = process_labels(traindata_list)
x_train = make_features(trainembed_list, interval_list)


# CRF - Test Data (Golden Standard)
# `interval_list` is rebound here to the values computed from the test split.
# NOTE(review): these values are derived from the gold test labels and are then
# passed into the test features -- confirm this is intended.
y_test, interval_list = process_labels(testdata_list)
x_test = make_features(testembed_list, interval_list)


In [109]:
# Fit the CRF on the training split, then predict and score on the held-out split.
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

In [110]:
f1score

0.5962773144411311

cc.zh.300.vec<br>
with num     0.6012018963243285<br>
without num  0.6023806685573265<br>
add interval 0.5962773144411311

another vector <br>
with num     0.35295256099491557<br>
without num  0.36087741955476677<br>
add interval 0.374521107776724<br>

## Output data
* Change model output into `output.tsv` 
* Only accept this output format uploading to competition system

In [111]:
# Decode the predicted BIO tags into entity spans and format them as TSV rows.
# Columns: article_id, start_position (inclusive), end_position (exclusive),
# entity_text, entity_type.
output_lines = ["article_id\tstart_position\tend_position\tentity_text\tentity_type"]
for test_id, pred_labels in enumerate(y_pred):
    start_pos = None
    entity_type = None
    for pos, label in enumerate(pred_labels):
        if label.startswith('B'):
            # A "B" tag always opens a new entity.
            start_pos = pos
            entity_type = label[2:]
        elif not label.startswith('I'):
            # Any other tag (e.g. 'O') ends the open entity, so a later stray
            # "I" tag cannot reuse a stale start position.
            start_pos = None

        if start_pos is None:
            continue

        # The entity ends here when the sequence ends or the next tag is not "I".
        # This also covers single-token entities and entities followed directly
        # by another "B", and avoids indexing past the last prediction.
        is_last = pos + 1 == len(pred_labels)
        if is_last or not pred_labels[pos + 1].startswith('I'):
            end_pos = pos + 1
            entity_text = ''.join(entry[0] for entry in testdata_list[test_id][start_pos:end_pos])
            output_lines.append('\t'.join([
                str(testdata_article_id_list[test_id]),
                str(start_pos),
                str(end_pos),
                entity_text,
                entity_type,
            ]))
            start_pos = None

# Join once at the end instead of growing a string inside the loop.
output = '\n'.join(output_lines) + '\n'

In [112]:
# Save the TSV text built above to `output.tsv` inside the project folder (UTF-8).
output_path=f'{folder}/output.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [113]:
print(output)

article_id	start_position	end_position	entity_text	entity_type
8	10	12	38	med_exam
8	189	193	二十分鐘	time
8	293	295	五年	time
8	519	521	吩咐	time
8	540	544	兩個禮拜	time
8	858	862	前天下午	time
8	1354	1356	娜美	name
8	1549	1551	五天	time
8	1622	1627	五天禮拜三	time
8	1939	1941	恍惚	time
8	1992	1997	禮拜三下午	time
8	2279	2282	185	med_exam
8	2377	2380	185	med_exam
8	2387	2390	185	med_exam
8	2560	2563	兩個月	time
8	2671	2674	155	med_exam
8	2679	2682	155	med_exam
8	2696	2699	155	med_exam
16	60	66	九、十點晚上	time
16	122	124	三年	time
16	130	132	三年	time
16	247	249	三年	time
16	592	595	5個月	time
0	55	57	68	med_exam
0	66	68	68	med_exam
0	435	437	歐洲	location
0	1264	1271	10.78公分	med_exam
0	2523	2526	法馬上	time
0	2575	2578	四五天	time
0	2604	2609	3月18號	time
0	2630	2635	3月24日	time
0	2650	2654	3月24	time
0	2663	2670	禮拜二到禮拜四	time
0	2692	2697	3月31日	time
24	48	51	三個月	time
24	53	56	七公斤	med_exam
24	113	115	三年	time
24	141	143	三年	time
24	496	499	來箝制	time
24	536	539	來箝制	time
24	547	550	來箝制	time
24	1018	1023	5月28日	time
24	1196	1200	300塊	money
24	1247	125

In [114]:
# # ## fail version

# import pickle
# # from plot_keras_history import plot_history
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import multilabel_confusion_matrix
# from keras_contrib.utils import save_load_utils

# from keras import layers
# from keras import optimizers

# from keras.models import Model


# from tensorflow.python.keras.models import Input
# # from keras.models import Input

# from keras_contrib.layers import CRF
# from keras_contrib import losses
# from keras_contrib import metrics
# # preprocessing
# all_words = list(words)
# all_tags = list(tags)
# tag2index = {tag:idx + 1 for idx, tag in enumerate(all_tags)}
# tag2index["--PADDING--"] = 0
# print(tags)
# print(tag2index)
# index2tag = {idx: word for word, idx in tag2index.items()}
# print(index2tag)

# word2index = {word: idx + 2 for idx, word in enumerate(all_words)}

# word2index["--UNKNOWN_WORD--"]=0

# word2index["--PADDING--"]=1

# index2word = {idx: word for word, idx in word2index.items()}



# MAX_SENTENCE = 0
# WORD_COUNT = len(index2word)
# DENSE_EMBEDDING = 50
# LSTM_UNITS = 50
# LSTM_DROPOUT = 0 # 0.1
# DENSE_UNITS = 50 #100
# BATCH_SIZE = 30 #256
# MAX_EPOCHS = 5
# TAG_COUNT = len(tag2index)
# print(TAG_COUNT)


# def MY_CRF(x_train, y_train, x_test, y_test):   ##(512dim, tags, 512dim, tags)
#     # Doc: https://sklearn-crfsuite.readthedocs.io/en/latest/api.html#module-sklearn_crfsuite
#     # crf = sklearn_crfsuite.CRF(
#     #     algorithm='lbfgs',
#     #     c1=0.1,
#     #     c2=0.1,
#     #     max_iterations=100,
#     #     all_possible_transitions=True,
#     # )
#     # crf.fit(x_train, y_train)

    

#     # y_pred = crf.predict(x_test)
#     # y_pred_mar = crf.predict_marginals(x_test)
#     print(MAX_SENTENCE, MAX_SENTENCE * 300)

#     input_layer = layers.Input(shape=(MAX_SENTENCE*300,))

#     model = layers.Embedding(WORD_COUNT, DENSE_EMBEDDING, embeddings_initializer="uniform", input_length=MAX_SENTENCE)(input_layer)

#     model = layers.Bidirectional(layers.LSTM(LSTM_UNITS, recurrent_dropout=LSTM_DROPOUT, return_sequences=True))(model)

#     model = layers.TimeDistributed(layers.Dense(DENSE_UNITS, activation="relu"))(model)

#     crf_layer = CRF(units=TAG_COUNT)
#     output_layer = crf_layer(model)

#     ner_model = Model(input_layer, output_layer)

#     loss = losses.crf_loss
#     acc_metric = metrics.crf_accuracy
#     # acc_metric = metrics.accuracy_score
#     opt = optimizers.Adam(learning_rate=0.001)

#     ner_model.compile(optimizer=opt, loss=loss, metrics=[acc_metric])

#     ner_model.summary()
#     print("fit")
#     print(f"x_train len = {len(x_train)}")
#     print(f"x_train len[0] = {len(x_train[0])}")
#     print(f"x_train len[0][0] = {len(x_train[0][0])}")
#     history = ner_model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, validation_split=0.1, verbose=2)

#     print("fit finished")

        
#     padded_sentence = x_test + [word2index["--PADDING--"]] * (MAX_SENTENCE - len(x_test))
#     padded_sentence = [word2index.get(w, 0) for w in padded_sentence]

#     y_pred = ner_model.predict(np.array([padded_sentence]))
#     # labels = list(crf.classes_)
#     labels = list(tag2index)
#     labels.remove('O')
#     print(labels)
#     f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
#     sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results

#     return y_pred,  f1score # y_pred_mar,



## Note
* You may try `python-crfsuite` to train a neural network for NER tagging optimized by gradient descent backpropagation
    * [Documentation](https://github.com/scrapinghub/python-crfsuite)
* You may try `CRF++` tool for NER tagging by CRF model
    * [Documentation](http://taku910.github.io/crfpp/)
    * Need design feature template
    * Can only be computed on the CPU
* You may try other traditional Chinese word embeddings (e.g. fastText, BERT, ...) as input features
* You may try adding other features for the NER model, e.g. POS tag, word_length, word_position, ...
* You should upload the prediction output on the `development data` or `test data` (provided later) to the competition system. Do not upload the prediction output on the split-off testing set used in this baseline example.

-----------------------------------------------------