In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load your cloud drive
from google.colab import drive
drive.mount('/content/drive')

folder = 'drive/MyDrive/Courses/AIMAS_2022'

Mounted at /content/drive


In [None]:
!ls

drive  sample_data


## Preprocessing
* Change input data (ex. train.txt) into CRF model input format (ex. train.data)
    * CRF model input format (ex. train.data):
        ```
        肝 O
        功 O
        能 O
        6 B-med_exam
        8 I-med_exam
        ```

## Data pre-processing
- annot = article_id, start_pos, end_pos, entity_text, entity_type (columns)

In [3]:
file_path = f'{folder}/sample_data.txt'

with open(file_path, 'r', encoding='utf8') as f:
    file_text = f.read().encode('utf-8').decode('utf-8-sig')

datas = file_text.split('\n\n--------------------\n\n')[:-1]

with open(f"{folder}/processed_data.txt", "w") as f:
    for article_id, data in enumerate(datas):
        data=data.split('\n')
        content=data[0]

        annotations=data[1:]
        row = list()
        for annot in annotations[1:]:
            annot=annot.split('\t') #annot= article_id, start_pos, end_pos, entity_text, entity_type
            row.append(annot)

        df = pd.DataFrame(row, columns=data[1].split('\t'))
        position_cols = ['start_position', 'end_position']
        df[position_cols] = df[position_cols].astype('int')

        tmp_label_list = np.array(['O'] * len(content), dtype=object)
        for i in range(len(df)):
            start, end, etype = df['start_position'][i], df['end_position'][i], df['entity_type'][i]
            # print(start, end, etype)
            tmp_label_list[start] = "B-" + str(etype)
            tmp_label_list[start+1:end] = "I-" + str(etype)

        for i, row in enumerate(zip(list(content), tmp_label_list)):
            f.write(" ".join(row) + '\n')
        
        f.write('\n')

## NER model
### CRF (Conditional Random Field model)
* Using `sklearn-crfsuite` API

    (you may try `CRF++`, `python-crfsuite`, `pytorch-crfsuite`(neural network version))

In [4]:
!pip install sklearn-crfsuite
import sklearn_crfsuite

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 30.3 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [5]:
def CRF(x_train, y_train, x_test, y_test):
    # Doc: https://sklearn-crfsuite.readthedocs.io/en/latest/api.html#module-sklearn_crfsuite
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        # calibration_eta = 0.01,
        all_possible_transitions=True,
    )
    crf.fit(x_train, y_train)

    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    labels = list(crf.classes_)
    labels.remove('O')
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
    sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results

    return y_pred, y_pred_mar, f1score

## Model Input: 
* input features:
    * word vector: pretrained traditional chinese word embedding by Word2Vec-CBOW
    
    (you may try add some other features, ex. pos-tag, word_length, word_position, ...) 

In [6]:
import numpy as np

In [7]:
# Load pretrained word vectors
# Get a dict of tokens (key) and their pretrained word vectors (value)
# Pre-trained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
# Pre-trained fastText word embeddings: https://fasttext.cc/docs/en/crawl-vectors.html

dim = 0
word_vecs= {}
# Open pretrained word vector file
# with open(f'{folder}/cna.cbow.cwe_p.tar_g.512d.0.txt') as f:
with open(f'{folder}/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5_2','r',encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # there 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue
    
        word = tokens[0] 
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec

In [None]:
print(f'vocabulary_size: {len(word_vecs)}')
print(f'word_vector_dim: {vec.shape}')

vocabulary_size: 635921
word_vector_dim: (300,)


Here we split data into training dataset and testing dataset,
however, we'll provide `development data` and `test data` which is real testing dataset.

You should upload prediction on `development data` and `test data` to system, not this splitted testing dataset.

In [8]:
# Load `train.data` and separate into a list of labeled data of each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing training data_list splitted from data_list
#   testdata_list: a list of lists, storing testing data_list splitted from data_list
from sklearn.model_selection import train_test_split


def make_dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines() #.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    idx = 0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1])
            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)
    
    # Here we random split data into training dataset and testing dataset
    # But you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line, 
    # nd generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list=train_test_split(data_list,article_id_list,test_size=0.33,random_state=42)
    
    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list 

In [9]:
def make_eval_dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines() #.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    idx = 0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1])
            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)    
    
    return data_list, article_id_list

In [10]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def build_word_vectors(data_list, embedding_dict):
    embedding_list = list()

    # No Match Word (unknown word) Vector in Embedding
    unk_vector = np.random.rand(*(list(embedding_dict.values())[0].shape))

    for idx_list in range(len(data_list)):
        embedding_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            key = data_list[idx_list][idx_tuple][0] # token

            if key in embedding_dict:
                value = embedding_dict[key]
            else:
                value = unk_vector
            embedding_list_tmp.append(value)
        embedding_list.append(embedding_list_tmp)
        
    return embedding_list

In [11]:
# Input features: pretrained word vectors of each token
# Return a list of feature dicts, each feature dict corresponding to each token
def make_features(embed_list):
    feature_list = list()
    for idx_list in range(len(embed_list)):
        feature_list_tmp = list()
        for idx_tuple in range(len(embed_list[idx_list])):
            feature_dict = dict()
            for idx_vec in range(len(embed_list[idx_list][idx_tuple])):
                feature_dict['dim_' + str(idx_vec+1)] = embed_list[idx_list][idx_tuple][idx_vec]
            feature_list_tmp.append(feature_dict)
        feature_list.append(feature_list_tmp)

    return feature_list

In [12]:
# Get the labels of each tokens in train.data
# Return a list of lists of labels
def process_labels(data_list):
    label_list = list()
    for idx_list in range(len(data_list)):
        label_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            label_list_tmp.append(data_list[idx_list][idx_tuple][1])
        label_list.append(label_list_tmp)
        
    return label_list

## Training

In [40]:
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = make_dataset(f"{folder}/processed_data.txt")

In [41]:
# Load Word Embedding
trainembed_list = build_word_vectors(traindata_list, word_vecs)
testembed_list = build_word_vectors(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
x_train = make_features(trainembed_list)
y_train = process_labels(traindata_list)

# CRF - Test Data (Golden Standard)
x_test = make_features(testembed_list)
y_test = process_labels(testdata_list)

In [42]:
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

In [43]:
f1score

0.5733489453140539

# Eval

In [44]:
evaldata_list, eval_article_id_list = make_eval_dataset(f"{folder}/processed_data.txt")

In [45]:
# Load Word Embedding
trainembed_list = build_word_vectors(traindata_list, word_vecs)
evalembed_list = build_word_vectors(evaldata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
x_train = make_features(trainembed_list)
y_train = process_labels(traindata_list)

# CRF - Eval Data (Full processed_data.txt)
x_eval = make_features(evalembed_list)
y_eval = process_labels(evaldata_list)

In [46]:
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_eval, y_eval)

In [47]:
f1score

0.7194071116508134

## Output data
* Change model output into `output.tsv` 
* Only accept this output format uploading to competition system

In [48]:
output="article_id\tstart_position\tend_position\tentity_text\tentity_type\n"
for eval_id in range(len(y_pred)):
    pos=0
    start_pos=None
    end_pos=None
    entity_text=None
    entity_type=None
    for pred_id in range(len(y_pred[eval_id])):
        if y_pred[eval_id][pred_id][0]=='B':
            start_pos=pos
            entity_type=y_pred[eval_id][pred_id][2:]
        elif start_pos is not None and y_pred[eval_id][pred_id][0]=='I' and y_pred[eval_id][pred_id+1][0]=='O':
            end_pos=pos
            entity_text=''.join([evaldata_list[eval_id][position][0] for position in range(start_pos,end_pos+1)])
            line=str(eval_article_id_list[eval_id])+'\t'+str(start_pos)+'\t'+str(end_pos+1)+'\t'+entity_text+'\t'+entity_type
            output+=line+'\n'
        pos+=1

In [49]:
output_path=f'{folder}/output.csv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [50]:
print(output)

article_id	start_position	end_position	entity_text	entity_type
0	55	57	68	time
0	66	68	68	time
0	1264	1271	10.78公分	med_exam
0	1358	1362	三多路上	time
0	1736	1738	較矮	time
0	2304	2306	海洛	location
0	2575	2578	四五天	time
0	2604	2609	3月18號	time
0	2630	2635	3月24日	time
0	2650	2654	3月24	time
0	2667	2670	禮拜四	time
0	2692	2697	3月31日	time
1	235	237	中午	time
1	254	256	中午	time
1	802	805	三個月	time
1	904	908	五十二百	money
1	915	919	五十二百	money
1	988	991	兩百多	money
1	997	1000	兩百多	money
1	1021	1024	兩百多	money
1	1052	1055	五個月	time
1	1385	1387	六年	time
1	1394	1396	四年	time
1	1404	1406	四年	time
1	1414	1416	四年	time
1	2261	2263	半月	time
1	2295	2299	兩個禮拜	time
1	2334	2338	三個禮拜	time
2	543	545	53	med_exam
2	772	779	５月份５月１號	time
2	784	788	五月一號	time
2	809	813	三個禮拜	time
2	1036	1038	前年	time
2	1047	1049	前年	time
2	1060	1063	三月份	time
2	1069	1074	前年三月份	time
2	1089	1094	三、四個月	time
2	1452	1456	37.2	med_exam
2	1524	1526	35	med_exam
2	1575	1577	38	med_exam
2	1669	1671	38	med_exam
2	1759	1761	38	med_exam
2	1770	1772	35	med_exam
3	281	284	9公分	

In [52]:
!pip list -v

Package                       Version                Location                               Installer
----------------------------- ---------------------- -------------------------------------- ---------
absl-py                       1.3.0                  /usr/local/lib/python3.8/dist-packages pip
aeppl                         0.0.33                 /usr/local/lib/python3.8/dist-packages pip
aesara                        2.7.9                  /usr/local/lib/python3.8/dist-packages pip
aiohttp                       3.8.3                  /usr/local/lib/python3.8/dist-packages pip
aiosignal                     1.3.1                  /usr/local/lib/python3.8/dist-packages pip
alabaster                     0.7.12                 /usr/local/lib/python3.8/dist-packages pip
albumentations                1.2.1                  /usr/local/lib/python3.8/dist-packages pip
altair                        4.2.0                  /usr/local/lib/python3.8/dist-packages pip
appdirs                     

## Note
* You may try `python-crfsuite` to train an neural network for NER tagging optimized by gradient descent back propagation
    * [Documentation](https://github.com/scrapinghub/python-crfsuite)
* You may try `CRF++` tool for NER tagging by CRF model
    * [Documentation](http://taku910.github.io/crfpp/)
    * Need design feature template
    * Can only computed in CPU
* You may try other traditional chinese word embedding (ex. fasttext, bert, ...) for input features
* You may try add other features for NER model, ex. POS-tag, word_length, word_position, ...
* You should upload the prediction output on `development data` or `test data` provided later to the competition system. Note don't upload prediction output on the splitted testing dataset like this baseline example.

-----------------------------------------------------