# POS Tagging Using CRF

## Import libraries

In [5]:
import pandas as pd       
import nltk
import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer

## Load and format data

In [6]:
def read_txt(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    return lines


def format_data(text_l):
    sent_l = []
    for sent in text_l:
        tuple_l = []
        tuple_words = sent.strip('\n').split()
        for tup in tuple_words:
            split = tup.split('/')
            if len(split) < 2:
                continue
            split_word = '/'.join(split[:-1])
            split_tag = split[-1]
            
            tuple_l.append((split_word, split_tag))
        sent_l.append(tuple_l)
    return sent_l

In [17]:
train_text_l = read_txt('../../processed_data/train.txt')
val_text_l = read_txt('../../processed_data/val.txt')
test_text_l = read_txt('../../processed_data/test.txt')

train_sent_l = format_data(train_text_l)
val_sent_l = format_data(val_text_l)
test_sent_l = format_data(test_text_l)

In [30]:
train_sent_l[2], val_sent_l[2]

([('“', 'O'),
  ('Thuỳ', 'B-PER'),
  ('Trâm', 'I-PER'),
  ('không', 'O'),
  ('định', 'O'),
  ('viết', 'O'),
  ('cho', 'O'),
  ('cả', 'O'),
  ('thế_giới', 'O'),
  ('này', 'O'),
  ('đọc', 'O'),
  (',', 'O'),
  ('nhưng', 'O'),
  ('có_lẽ', 'O'),
  ('chính', 'O'),
  ('vì_thế', 'O'),
  ('mà', 'O'),
  ('niềm', 'O'),
  ('tin', 'O'),
  ('sâu', 'O'),
  ('thẳm', 'O'),
  ('nơi', 'O'),
  ('chị', 'O'),
  ('được', 'O'),
  ('viết', 'O'),
  ('ra', 'O'),
  ('một_cách', 'O'),
  ('chân_phương', 'O'),
  (',', 'O'),
  ('rõ_ràng', 'O'),
  ('và', 'O'),
  ('tôi', 'O'),
  ('đã', 'O'),
  ('thấy', 'O'),
  ('chị', 'O'),
  ('có', 'O'),
  ('đủ', 'O'),
  ('dũng_cảm', 'O'),
  ('để', 'O'),
  ('theo_đuổi', 'O'),
  ('niềm', 'O'),
  ('tin', 'O'),
  ('ấy', 'O'),
  ('trong', 'O'),
  ('trận', 'O'),
  ('thử_thách', 'O'),
  ('cuối_cùng', 'O'),
  ('...', 'O'),
  ('”', 'O'),
  ('.', 'O')],
 [('Khi', 'O'),
  ('mở', 'O'),
  ('ra', 'O'),
  ('một', 'O'),
  ('bến', 'O'),
  ('mới', 'O'),
  ('cũng', 'O'),
  ('lắm', 'O'),
  ('điều', 'O'

## Feature define
1. The word
2. The word in lowercase
3. Length of the word
4. Prefixes and suffixes of the word of varying lengths
5. If the word is a digit
6. If the word is a punctuation mark
7. If the word is at the beginning of the sentence (BOS) or the end of the sentence (EOS) or neither
8. Features for the previous words, the following words

In [31]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.ispunctuation': (word in string.punctuation),
        'word.isdigit()': word.isdigit(),
    }
    
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word': word1,
            '-1:len(word)': len(word1),
            '-1:word.lower()': word1.lower(),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.ispunctuation': (word1 in string.punctuation),
        })
    else:
        features['SOS'] = True
    
    if i > 1:
        word2 = sent[i-2][0]
        features.update({
            '-2:word': word2,
            '-2:len(word)': len(word2),
            '-2:word.lower()': word2.lower(),
            '-2:word[:3]': word2[:3],
            '-2:word[:2]': word2[:2],
            '-2:word[-3:]': word2[-3:],
            '-2:word[-2:]': word2[-2:],
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.ispunctuation': (word2 in string.punctuation),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:len(word)': len(word1),
            '+1:word.lower()': word1.lower(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.ispunctuation': (word1 in string.punctuation),
        })
    else:
        features['EOS'] = True
    
    if i < len(sent) - 2:
        word2 = sent[i+2][0]
        features.update({
            '+2:word': word2,
            '+2:len(word)': len(word2),
            '+2:word.lower()': word2.lower(),
            '+2:word[:3]': word2[:3],
            '+2:word[:2]': word2[:2],
            '+2:word[-3:]': word2[-3:],
            '+2:word[-2:]': word2[-2:],
            '+2:word.isdigit()': word2.isdigit(),
            '+2:word.ispunctuation': (word2 in string.punctuation),
        })
    
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [word[1] for word in sent]

def sent2words(sent):
    return [word[0] for word in sent]

In [41]:
# Extracting features
X_train = [sent2features(s) for s in train_sent_l]
y_train = [sent2labels(s) for s in train_sent_l]

X_val = [sent2features(s) for s in val_sent_l]
y_val = [sent2labels(s) for s in val_sent_l]

X_test = [sent2features(s) for s in test_sent_l]
y_test = [sent2labels(s) for s in test_sent_l]

## Model and train

In [21]:
# Define model
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.25,
    c2 = 0.3,
    max_iterations = 100,
    all_possible_transitions=True
)

In [22]:
# Train model
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.25, c2=0.3, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [23]:
labels = list(crf.classes_)
labels

['O',
 'B-PER',
 'I-PER',
 'B-ORG',
 'I-ORG',
 'B-LOC',
 'B-MISC',
 'I-MISC',
 'I-LOC',
 '',
 'Os',
 'B-NP',
 'I-NP',
 'o',
 'Cc',
 'R',
 'P',
 'Ob',
 '0',
 'OR',
 'B']

In [24]:
y_pred = crf.predict(X_train)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(y_train, y_pred, average='weighted', labels=labels)))
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(y_train, y_pred)))

print(metrics.flat_classification_report(y_train, y_pred))

F1 score on the train set = 0.9995813837695372

Accuracy on the train set = 0.9995837878660737

              precision    recall  f1-score   support

                   1.00      1.00      1.00       228
           0       1.00      0.50      0.67         2
           B       1.00      1.00      1.00         1
       B-LOC       0.99      0.99      0.99      4995
      B-MISC       1.00      1.00      1.00       218
        B-NP       1.00      0.83      0.91         6
       B-ORG       1.00      0.99      0.99       942
       B-PER       1.00      1.00      1.00      6187
          Cc       1.00      0.86      0.92         7
       I-LOC       0.99      1.00      0.99      2194
      I-MISC       1.00      1.00      1.00       215
        I-NP       1.00      1.00      1.00         1
       I-ORG       1.00      1.00      1.00      1631
       I-PER       1.00      1.00      1.00      2828
           O       1.00      1.00      1.00    278446
          OR       1.00      1.00      

In [42]:
y_pred = crf.predict(X_val)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(y_val, y_pred, average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(y_val, y_pred)))

print(metrics.flat_classification_report(y_val, y_pred, labels=labels))

F1 score on the test set = 0.9777647144914523

Accuracy on the test set = 0.9789036499161097

              precision    recall  f1-score   support

           O       0.99      1.00      0.99     33696
       B-PER       0.93      0.78      0.85       669
       I-PER       0.92      0.84      0.88       394
       B-ORG       0.83      0.46      0.60       183
       I-ORG       0.74      0.61      0.67       256
       B-LOC       0.85      0.80      0.83       702
      B-MISC       0.92      0.80      0.86        30
      I-MISC       0.92      0.75      0.83        32
       I-LOC       0.83      0.70      0.76       362
                   1.00      0.96      0.98        27
          Os       0.00      0.00      0.00         1
        B-NP       0.00      0.00      0.00         0
        I-NP       0.00      0.00      0.00         0
           o       0.00      0.00      0.00         0
          Cc       0.00      0.00      0.00         0
           R       0.00      0.00      0.

In [43]:
y_pred = crf.predict(X_test)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(y_test, y_pred)))

print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

F1 score on the test set = 0.983975059119143

Accuracy on the test set = 0.9847953877458738

              precision    recall  f1-score   support

           O       0.99      1.00      0.99     33335
       B-PER       0.95      0.78      0.86       622
       I-PER       0.92      0.88      0.90       302
       B-ORG       0.81      0.65      0.72        85
       I-ORG       0.85      0.61      0.71       173
       B-LOC       0.93      0.75      0.83       533
      B-MISC       0.94      0.94      0.94        32
      I-MISC       0.94      0.94      0.94        32
       I-LOC       0.88      0.74      0.80       237
                   1.00      1.00      1.00        22
          Os       0.00      0.00      0.00         0
        B-NP       0.00      0.00      0.00         0
        I-NP       0.00      0.00      0.00         0
           o       0.00      0.00      0.00         5
          Cc       0.00      0.00      0.00         0
           R       0.00      0.00      0.0

In [16]:
y_pred

[['O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  ''],
 ['O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
