## Hidden Markov Model

In [None]:
# Clone the PhoNER_COVID19 repository; later cells read the .conll files
# under its data/syllable directory.
!git clone https://github.com/VinAIResearch/PhoNER_COVID19.git

Cloning into 'PhoNER_COVID19'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 44 (delta 14), reused 30 (delta 12), pack-reused 0[K
Unpacking objects: 100% (44/44), done.


In [None]:
# Gather the distinct tags used in the training file. The tag is taken to be
# the last whitespace-separated field of every non-blank line.
tagset = set()
with open('/content/PhoNER_COVID19/data/syllable/train_syllable.conll') as conll_file:
    for fields in map(str.split, conll_file):
        if fields:
            tagset.add(fields[-1])

In [None]:
# Report how many distinct tags were collected, then the tags themselves
# (one line each, same text as two separate print calls).
print('# Number of tags: %d' % len(tagset), tagset, sep='\n')

# Number of tags: 21
{'B-NAME', 'I-DATE', 'I-JOB', 'B-DATE', 'I-PATIENT_ID', 'B-ORGANIZATION', 'B-JOB', 'I-AGE', 'B-LOCATION', 'B-PATIENT_ID', 'I-SYMPTOM_AND_DISEASE', 'B-TRANSPORTATION', 'B-GENDER', 'B-SYMPTOM_AND_DISEASE', 'I-ORGANIZATION', 'I-GENDER', 'I-TRANSPORTATION', 'I-NAME', 'B-AGE', 'O', 'I-LOCATION'}


In [None]:

def load_data(directory):
  """Read a CoNLL-style file into a list of tagged sentences.

  Every non-blank line is expected to hold a word followed by its label,
  separated by whitespace. Blank lines separate sentences.

  Args:
    directory (str): path of the file to read (despite the parameter name,
      this is a file path, not a directory).

  Returns:
    list[list[tuple[str, str]]]: one list of (word, label) pairs per
    sentence, in file order. Empty sentences (from repeated or leading blank
    lines) are not emitted.

  Raises:
    IndexError: if a non-blank line has fewer than two fields.
  """
  sentence = []
  res = []
  # The corpus is Vietnamese text; do not rely on the platform default encoding.
  with open(directory, encoding='utf-8') as f:
    for line in f:
      # split() (as train() does) tolerates tabs and repeated spaces, where
      # split(" ") would yield empty fields.
      fields = line.split()
      if not fields:
        if sentence:
          res.append(sentence)
          sentence = []
        continue
      sentence.append((fields[0], fields[1]))
  # Keep the last sentence even when the file does not end with a blank line.
  if sentence:
    res.append(sentence)
  return res


In [None]:
# Load the test split as a list of sentences, each a list of (word, label) pairs.
test_data = load_data("/content/PhoNER_COVID19/data/syllable/test_syllable.conll")

In [None]:
from collections import defaultdict

def train(train_file: str, model_file: str):
    """Estimate a first-order HMM by relative-frequency counting and save it.

    The training file is CoNLL-style: each non-blank line holds a word and its
    tag separated by whitespace, and blank lines separate sentences. A final
    sentence without a trailing blank line is still counted, and repeated
    blank lines do not create empty sentences.

    The model file receives one line per parameter:
        ``T <prev_tag> <tag> <prob>``   count(prev_tag, tag) / count(prev_tag)
        ``E <tag> <word> <prob>``       count(tag, word) / count(tag)
    where ``<s>`` and ``</s>`` stand for sentence start and end. Probabilities
    are written with ``repr`` precision so that rare events are not rounded
    to ``0.000000``.

    Args:
        train_file: path of the tagged training corpus.
        model_file: path the model parameters are written to.

    Raises:
        IndexError: if a non-blank training line has fewer than two fields.
    """
    emit = defaultdict(int)         # emission count C(t_i, w_i)
    transition = defaultdict(int)   # transition count C(t_{i-1}, t_i)
    context = defaultdict(int)      # C(t); '<s>' is counted once per sentence

    with open(train_file, 'r', encoding='utf-8') as f:
        previous = '<s>'
        in_sentence = False
        for line in f:
            wordtags = line.split()
            if not wordtags:
                # Close the sentence only if one is actually open, so that
                # consecutive blank lines do not add '<s> </s>' transitions.
                if in_sentence:
                    transition[previous + ' </s>'] += 1
                    previous = '<s>'
                    in_sentence = False
                continue
            if not in_sentence:
                # Count the start symbol when a sentence begins, so that
                # context['<s>'] equals the number of sentences.
                context['<s>'] += 1
                in_sentence = True
            word, tag = wordtags[0], wordtags[1]
            transition[previous + ' ' + tag] += 1
            context[tag] += 1
            emit[tag + ' ' + word] += 1
            previous = tag
        # The file may end without a blank line after the last sentence.
        if in_sentence:
            transition[previous + ' </s>'] += 1

    # Save the parameters of the model to a file.
    with open(model_file, 'w', encoding='utf-8') as fo:
        # Transition probabilities
        for key, value in transition.items():
            prev_tag = key.split(' ', 1)[0]
            fo.write('T %s %r\n' % (key, value / context[prev_tag]))

        # Emission probabilities
        for key, value in emit.items():
            tag = key.split(' ', 1)[0]
            fo.write('E %s %r\n' % (key, value / context[tag]))

    print('Finished training first-order HMM!')

In [None]:
# Fit the HMM on the training split and write its parameters to /content/HMM_model.txt.
train('/content/PhoNER_COVID19/data/syllable/train_syllable.conll', '/content/HMM_model.txt')

Finished training first-order HMM!


In [None]:
def load_model(model_file: str):
    """Load an HMM saved as ``<type> <context> <word> <prob>`` lines.

    Lines whose type is ``T`` are stored as transition probabilities; lines of
    any other type are stored as emission probabilities. Blank lines are
    skipped.

    Args:
        model_file: path of the model file.

    Returns:
        tuple: ``(transition, emission, possible_tags)``. The first two map
        ``"<context> <word>"`` to a float and yield 0 for missing keys.
        ``possible_tags`` is a dict whose keys are every context seen, in file
        order (each value is 1); it is used to enumerate the tags.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields or
            its probability cannot be parsed as a float.
    """
    transition = defaultdict(int)
    emission = defaultdict(int)
    possible_tags = {}

    with open(model_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing one) instead of
                # failing on the tuple unpacking below.
                continue
            if len(fields) != 4:
                raise ValueError(
                    'line %d of %s: expected 4 fields, got %d'
                    % (line_no, model_file, len(fields)))
            _type, context, word, prob = fields
            prob = float(prob)
            possible_tags[context] = 1  # We use this to enumerate all tags
            if _type == 'T':
                transition[context + ' ' + word] = prob
            else:
                emission[context + ' ' + word] = prob
    return transition, emission, possible_tags

In [None]:
# Read the model back from the exact path the training cell wrote, rather than
# a relative path that only resolves when the working directory is /content.
transition, emission, possible_tags = load_model('/content/HMM_model.txt')
print( list(possible_tags.keys()) )
# The parameter tables are large, so show their sizes and a short sample
# instead of dumping every entry into the cell output.
print('%d transition entries, e.g. %s' % (len(transition), list(transition.items())[:5]))
print('%d emission entries, e.g. %s' % (len(emission), list(emission.items())[:5]))

['<s>', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'B-LOCATION', 'I-LOCATION', 'B-DATE', 'B-PATIENT_ID', 'B-AGE', 'B-NAME', 'I-DATE', 'B-JOB', 'I-JOB', 'B-TRANSPORTATION', 'B-GENDER', 'I-GENDER', 'I-TRANSPORTATION', 'I-NAME', 'I-AGE', 'I-PATIENT_ID']
defaultdict(<function load_model.<locals>.<lambda> at 0x7faa12da70e0>, {'<s> O': 0.946489, 'O O': 0.843383, 'O B-ORGANIZATION': 0.007436, 'B-ORGANIZATION I-ORGANIZATION': 0.983289, 'I-ORGANIZATION I-ORGANIZATION': 0.767954, 'I-ORGANIZATION O': 0.230178, 'O </s>': 0.038345, 'O B-SYMPTOM_AND_DISEASE': 0.011014, 'B-SYMPTOM_AND_DISEASE I-SYMPTOM_AND_DISEASE': 0.762335, 'I-SYMPTOM_AND_DISEASE O': 0.482819, 'O B-LOCATION': 0.040903, 'B-LOCATION O': 0.073731, 'B-LOCATION I-LOCATION': 0.919785, 'I-LOCATION I-LOCATION': 0.596637, 'I-LOCATION O': 0.401332, 'O B-DATE': 0.019686, 'B-DATE O': 0.508042, 'O B-PATIENT_ID': 0.025004, 'B-PATIENT_ID O': 0.991358, 'O B-AGE': 0.004869, 'B-AGE O': 0.997067, 'B-PAT

In [None]:
import math


def viterbi(line, transition, emission, possible_tags, min_prob=10 ** (-10)):
    """Infer the most likely tag sequence for a tokenized sentence.

    Scores are sums of negative log-probabilities, so a lower score is better.
    Neither ``transition`` nor ``emission`` is modified. Ties are broken in
    favour of the tag that comes first when iterating ``possible_tags``.

    Args:
        line (str): a whitespace-tokenized word sequence
                    e.g., "Chiều cuối thu , trời vùng_biển Nghi_Xuân ảm_đạm ."
        transition (dict): maps "prev next" to a transition probability. Between
            two tags only pairs present as keys are followed; a missing
            "prev </s>" entry is scored with ``min_prob``.
        emission (dict): maps "tag word" to an emission probability. Missing
            entries are scored with ``min_prob``.
        possible_tags: iterable of candidate tags (for a dict, its keys).
        min_prob (float): probability substituted for any missing or
            non-positive probability, to avoid taking the log of zero.

    Returns:
        str: the predicted tags joined by single spaces, one per token
        (an empty string for an empty ``line``).

    Raises:
        KeyError: if no tag sequence can be built from the transitions
            present in ``transition``.
    """
    words = line.split()
    l = len(words)
    # A list keeps iteration order (and therefore tie-breaking) deterministic.
    tags = list(possible_tags)
    start_only = ['<s>']
    origin = (0, '<s>')

    def cost(prob):
        # Negative log-probability, with a floor so that log(0) never occurs.
        return -math.log(prob if prob > 0 else min_prob)

    # Lattice nodes are (position, tag) tuples.
    best_score = {origin: 0}
    best_edge = {origin: None}

    # Forward step
    for i, word in enumerate(words):
        prev_tags = start_only if i == 0 else tags
        for prev in prev_tags:
            if (i, prev) not in best_score:
                continue
            base = best_score[(i, prev)]
            for _next in tags:
                edge = prev + ' ' + _next
                if edge not in transition:
                    continue
                # .get() avoids inserting unseen (tag, word) pairs into a
                # defaultdict owned by the caller.
                score = base + cost(transition[edge]) + cost(emission.get(_next + ' ' + word, 0))
                node = (i + 1, _next)
                if node not in best_score or best_score[node] > score:
                    best_score[node] = score
                    best_edge[node] = (i, prev)

    # Transition into the sentence-stop symbol '</s>'; unlike the other time
    # steps, no emission probability is involved.
    end = (l + 1, '</s>')
    for prev in (start_only if l == 0 else tags):
        if (l, prev) not in best_score:
            continue
        score = best_score[(l, prev)] + cost(transition.get(prev + ' </s>', 0))
        if end not in best_score or best_score[end] > score:
            best_score[end] = score
            best_edge[end] = (l, prev)

    if end not in best_edge:
        raise KeyError('no tag sequence is reachable for this sentence')

    # Backward step: follow the back-pointers from '</s>' to '<s>'.
    path = []
    node = best_edge[end]
    while node != origin:
        path.append(node[1])
        node = best_edge[node]
    path.reverse()
    return ' '.join(path)

In [None]:
# Sanity check: tag a single sentence with the loaded model.
viterbi('Xin mơi nghe hướng dẫn của Bộ Y tế .   ', transition, emission, possible_tags)

'O O O B-JOB I-JOB O B-ORGANIZATION I-ORGANIZATION I-ORGANIZATION O'

In [None]:
# Dump the gold test annotations to output.txt: one "word tag" pair per line,
# with an empty line closing each sentence.
with open('output.txt', 'w') as gold_file:
    for gold_sentence in test_data:
        gold_file.writelines(word + ' ' + tag + '\n' for word, tag in gold_sentence)
        gold_file.write('\n')

In [None]:
# Tag every test sentence with the HMM and store the predictions in answer.txt,
# using the same "word tag" layout (blank line after each sentence).
with open('answer.txt', 'w') as pred_file:
    for X_test in test_data:
        tokens = [pair[0] for pair in X_test]
        # viterbi() expects one string; every token is followed by a space.
        sentence = ''.join(token + ' ' for token in tokens)
        y_pred = viterbi(sentence, transition, emission, possible_tags).split(' ')

        for j, token in enumerate(tokens):
            pred_file.write(token + ' ' + y_pred[j] + '\n')
        pred_file.write('\n')

In [None]:

!pip install seqeval[cpu]
from seqeval.metrics import precision_score, recall_score, f1_score

def get_tags(filepath):
    """Return the tag sequences stored in a two-column "word tag" file.

    Blank lines end the current sentence; empty sentences are never emitted,
    and a final sentence without a trailing blank line is still returned.

    Args:
        filepath: path of the file to read.

    Returns:
        list[list[str]]: one list of tags per sentence, in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    """
    sentences = []
    current = []
    with open(filepath, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                _, tag = stripped.split()
                current.append(tag)
            elif current:
                sentences.append(current)
                current = []
    if current:
        sentences.append(current)
    return sentences

def evaluate(test_file, output_file):
    """Score one tag file against another with seqeval.

    Tags read from ``test_file`` are passed to the seqeval metrics as
    ``y_true`` and tags read from ``output_file`` as ``y_pred``.

    Args:
        test_file: path of the file holding the reference tags.
        output_file: path of the file holding the predicted tags.

    Returns:
        tuple: ``(precision, recall, f1)``.
    """
    reference = get_tags(test_file)
    predicted = get_tags(output_file)
    metrics = (precision_score, recall_score, f1_score)
    return tuple(metric(reference, predicted) for metric in metrics)


Collecting seqeval[cpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 21.6 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 19.5 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 10.4 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 8.3 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.1 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=f466b7653e6e757f9b3bf230c42894824d378cca33212e74bef88485d7fbead5
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# output.txt holds the gold tags and answer.txt the predictions. evaluate()
# treats its first argument as the reference, so the gold file must come
# first; otherwise precision and recall are reported swapped.
evaluate('./output.txt', './answer.txt')

(0.735832978270132, 0.7359584079093157, 0.7358956877450146)