## Conditional Random Fields

In [2]:
!git clone https://github.com/VinAIResearch/PhoNER_COVID19.git

Cloning into 'PhoNER_COVID19'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 44 (delta 14), reused 30 (delta 12), pack-reused 0[K
Unpacking objects: 100% (44/44), done.


In [3]:

def load_data(directory):
  sentence = []
  res = []
  with open(directory) as f:
    for lines in f:
      lines = lines.strip()
      if(lines==""):
        res.append(sentence)
        sentence = []
      else:
        word = lines.split(" ")[0]
        label = lines.split(" ")[1]
        sentence.append((word, label))
  return res


In [4]:

train_data = load_data("/content/PhoNER_COVID19/data/syllable/train_syllable.conll")
test_data = load_data("/content/PhoNER_COVID19/data/syllable/test_syllable.conll")
print(len(train_data))
print(len(test_data))


5027
3000


In [5]:
def word2features(sentence, i):
    """
    Arguments:
        sentence (list): list of words [w1, w2,...,w_n]
        i (int): index of the word
    Return:
        features (dict): dictionary of features
    """
    word = sentence[i]
    features = {
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        'is_first_capital': word[0].isupper(),
        'is_all_caps': int(word.upper() == word),
        'is_all_lower': word.lower() == word,
        'word': word,
        'word.lower()': word.lower(),
        'prefix_1': word[0],
        'prefix_2': word[:2],
        'prefix_3': word[:3],
        'prefix_4': word[:4],
        'suffix_1': word[-1],
        'suffix_2': word[-2:],
        'suffix_3': word[-3:],
        'suffix_4': word[-4:],
        'prev_word': '' if i==0 else sentence[i-1].lower(),
        'next_word': '' if i==len(sentence)-1 else sentence[i+1].lower(),
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:]
    }
    
    return features


def sent2features(sentence):
    """
    sentence is a list of words [w1, w2,...,w_n]
    """
    return [word2features(sentence, i) for i in range(len(sentence))]


def sent2labels(sentence):
    """
    sentence is a list of tuples (word, postag)
    """    
    return [postag for token, postag in sentence]

def untag(sentence):
    """
    sentence is a list of tuples (word, postag)
    """
    return [token for token, _ in sentence]


In [6]:
print(untag(train_data[2]))
print(train_data[2])
print(sent2labels(train_data[2]))

['Ngoài', 'ra', ',', 'những', 'người', 'tiếp', 'xúc', 'gián', 'tiếp', '(', 'đã', 'gặp', 'những', 'người', 'tiếp', 'xúc', 'gần', 'với', 'bệnh', 'nhân', ')', 'được', 'lập', 'danh', 'sách', 'và', 'yêu', 'cầu', 'cách', 'ly', 'y', 'tế', 'tại', 'nơi', 'ở', '.']
[('Ngoài', 'O'), ('ra', 'O'), (',', 'O'), ('những', 'O'), ('người', 'O'), ('tiếp', 'O'), ('xúc', 'O'), ('gián', 'O'), ('tiếp', 'O'), ('(', 'O'), ('đã', 'O'), ('gặp', 'O'), ('những', 'O'), ('người', 'O'), ('tiếp', 'O'), ('xúc', 'O'), ('gần', 'O'), ('với', 'O'), ('bệnh', 'O'), ('nhân', 'O'), (')', 'O'), ('được', 'O'), ('lập', 'O'), ('danh', 'O'), ('sách', 'O'), ('và', 'O'), ('yêu', 'O'), ('cầu', 'O'), ('cách', 'O'), ('ly', 'O'), ('y', 'O'), ('tế', 'O'), ('tại', 'O'), ('nơi', 'O'), ('ở', 'O'), ('.', 'O')]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:
sent2features(untag(train_data[2]))[0]

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': 0,
 'is_all_lower': False,
 'is_first': True,
 'is_first_capital': True,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'ra',
 'prefix_1': 'N',
 'prefix_2': 'Ng',
 'prefix_3': 'Ngo',
 'prefix_4': 'Ngoà',
 'prev_word': '',
 'suffix_1': 'i',
 'suffix_2': 'ài',
 'suffix_3': 'oài',
 'suffix_4': 'goài',
 'word': 'Ngoài',
 'word.lower()': 'ngoài'}

In [8]:
X_train = [sent2features(untag(s)) for s in train_data]
y_train = [sent2labels(s) for s in train_data]

print(len(X_train))
print(len(y_train))

5027
5027


In [9]:

!pip install -U 'scikit-learn<0.24'
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

!pip install -U 'sklearn-crfsuite<0.24'
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 6.7 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.23.2 which is incompatible.[0m
Successfully installed scikit-learn-0.23.2
Collecting sklearn-crfsuite<0.24
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 7.0 MB/s 
Installin

In [10]:
crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd',
    c2=3.2,
    max_iterations=100,
    all_possible_transitions=True,
    all_possible_states=True,
)
crf.fit(X_train, y_train)



CRF(algorithm='l2sgd', all_possible_states=True, all_possible_transitions=True,
    c2=3.2, keep_tempfiles=None, max_iterations=100)

In [11]:
with open('output.txt', 'w') as f:
    for i in range(len(test_data)):
        for j in range(len(test_data[i])):
          f.write(test_data[i][j][0]+' '+test_data[i][j][1]+'\n')
        f.write('\n')

In [12]:
with open('answer.txt', 'w') as f:
    for i in range(len(test_data)):
      X_test = [sent2features(untag(test_data[i]))]
      y_pred = crf.predict(X_test)
      for j in range(len(test_data[i])):
           f.write(test_data[i][j][0]+' '+y_pred[0][j]+'\n')
      f.write('\n')

In [13]:
!pip install seqeval[cpu]

Collecting seqeval[cpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 40.1 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 18.6 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 15.4 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 14.2 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.7 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=23d80a1d214d7981bd60671af9ae50e8fd073a87eba618f7a40f08b8d46b2f4b
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [14]:
from seqeval.metrics import precision_score, recall_score, f1_score

def get_tags(filepath):
    res = []
    with open(filepath, 'r') as f:
        cur_sen = []
        for line in f:
            line = line.strip()
            if line == '':
                if len(cur_sen) != 0:
                    res.append(cur_sen)
                    cur_sen = []
            else:
                word, tag = line.split()
                cur_sen.append(tag)
    if len(cur_sen) != 0:
        res.append(cur_sen)
    return res

def evaluate(test_file, output_file):
    y_true = get_tags(test_file)
    y_pred = get_tags(output_file)

    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return p, r, f1

In [15]:
evaluate('./answer.txt', './output.txt')

(0.8788240306774606, 0.9158969804618117, 0.896977603826919)

## Demo

In [17]:
sentence = "Xin mơi nghe hướng dẫn của Bộ Y tế. Vào ngày 23 tháng 4 , Bệnh nhân số 234 hiện đã mắc Covid được phát hiện tại nhà riêng ở số 64 đường Đông Tác, Đông thọ. Bệnh nhân đang được chuyển tới bệnh viện nhiệt đới trung ương để được chăm sóc"

res = sentence.split()
print(res)

X = [sent2features(res)]
y = crf.predict(X)
print(y)

['Xin', 'mơi', 'nghe', 'hướng', 'dẫn', 'của', 'Bộ', 'Y', 'tế.', 'Vào', 'ngày', '23', 'tháng', '4', ',', 'Bệnh', 'nhân', 'số', '234', 'hiện', 'đã', 'mắc', 'Covid', 'được', 'phát', 'hiện', 'tại', 'nhà', 'riêng', 'ở', 'số', '64', 'đường', 'Đông', 'Tác,', 'Đông', 'thọ.', 'Bệnh', 'nhân', 'đang', 'được', 'chuyển', 'tới', 'bệnh', 'viện', 'nhiệt', 'đới', 'trung', 'ương', 'để', 'được', 'chăm', 'sóc']
[['O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'B-DATE', 'B-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'B-PATIENT_ID', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PATIENT_ID', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
